perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* be confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89
	90	#ifdef op
	91	#undef op
	92	#endif /* op */
	93
	94	#ifdef MSDOS
	95	# if defined(BUGGY_MSC6)
	96	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	97	# pragma optimize("a",off)
	98	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	99	# pragma optimize("w",on )
	100	# endif /* BUGGY_MSC6 */
	101	#endif /* MSDOS */
	102
	103	#ifndef STATIC
	104	#define STATIC static
	105	#endif
	106
	107	typedef struct RExC_state_t { /* compile-time state; members are reached as pRExC_state->member through the RExC_* macros */
	108	    U32 flags; /* are we folding, multilining? */
	109	    char *precomp; /* uncompiled string. */
	110	    REGEXP *rx_sv; /* The SV that is the regexp. */
	111	    regexp *rx; /* perl core regexp structure */
	112	    regexp_internal *rxi; /* internal data for regexp object pprivate field */
	113	    char *start; /* Start of input for compile */
	114	    char *end; /* End of input for compile */
	115	    char *parse; /* Input-scan pointer. */
	116	    I32 whilem_seen; /* number of WHILEM in this expr */
	117	    regnode *emit_start; /* Start of emitted-code area */
	118	    regnode *emit_bound; /* First regnode outside of the allocated space */
	119	    regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
	120	    I32 naughty; /* How bad is this pattern? */
	121	    I32 sawback; /* Did we see \1, ...? */
	122	    U32 seen;
	123	    I32 size; /* Code size. */
	124	    I32 npar; /* Capture buffer count, (OPEN). */
	125	    I32 cpar; /* Capture buffer count, (CLOSE). */
	126	    I32 nestroot; /* root parens we are in - used by accept */
	127	    I32 extralen;
	128	    I32 seen_zerolen;
	129	    I32 seen_evals;
	130	    regnode **open_parens; /* pointers to open parens */
	131	    regnode **close_parens; /* pointers to close parens */
	132	    regnode *opend; /* END node in program */
	133	    I32 utf8; /* whether the pattern is utf8 or not */
	134	    I32 orig_utf8; /* whether the pattern was originally in utf8 */
	135	    /* XXX use this for future optimisation of case
	136	     * where pattern must be upgraded to utf8. */
	137	    I32 uni_semantics; /* If a d charset modifier should use unicode
	138	                          rules, even if the pattern is not in
	139	                          utf8 */
	140	    HV *paren_names; /* Paren names */
	141	
	142	    regnode **recurse; /* Recurse regops */
	143	    I32 recurse_count; /* Number of recurse regops */
	144	    I32 in_lookbehind;
	145	    I32 contains_locale; /* tested by cl_anything: nonzero makes the initial class use locale rules */
	146	    I32 override_recoding;
	147	#if ADD_TO_REGEXEC
	148	    char *starttry; /* -Dr: where regtry was called. */
	149	#define RExC_starttry (pRExC_state->starttry)
	150	#endif
	151	#ifdef DEBUGGING
	152	    const char *lastparse;
	153	    I32 lastnum;
	154	    AV *paren_name_list; /* idx -> name */
	155	#define RExC_lastparse (pRExC_state->lastparse)
	156	#define RExC_lastnum (pRExC_state->lastnum)
	157	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	158	#endif
	159	} RExC_state_t;
	160
	161	#define RExC_flags (pRExC_state->flags)
	162	#define RExC_precomp (pRExC_state->precomp)
	163	#define RExC_rx_sv (pRExC_state->rx_sv)
	164	#define RExC_rx (pRExC_state->rx)
	165	#define RExC_rxi (pRExC_state->rxi)
	166	#define RExC_start (pRExC_state->start)
	167	#define RExC_end (pRExC_state->end)
	168	#define RExC_parse (pRExC_state->parse)
	169	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	170	#ifdef RE_TRACK_PATTERN_OFFSETS
	171	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	172	#endif
	173	#define RExC_emit (pRExC_state->emit)
	174	#define RExC_emit_start (pRExC_state->emit_start)
	175	#define RExC_emit_bound (pRExC_state->emit_bound)
	176	#define RExC_naughty (pRExC_state->naughty)
	177	#define RExC_sawback (pRExC_state->sawback)
	178	#define RExC_seen (pRExC_state->seen)
	179	#define RExC_size (pRExC_state->size)
	180	#define RExC_npar (pRExC_state->npar)
	181	#define RExC_nestroot (pRExC_state->nestroot)
	182	#define RExC_extralen (pRExC_state->extralen)
	183	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	184	#define RExC_seen_evals (pRExC_state->seen_evals)
	185	#define RExC_utf8 (pRExC_state->utf8)
	186	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	187	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	188	#define RExC_open_parens (pRExC_state->open_parens)
	189	#define RExC_close_parens (pRExC_state->close_parens)
	190	#define RExC_opend (pRExC_state->opend)
	191	#define RExC_paren_names (pRExC_state->paren_names)
	192	#define RExC_recurse (pRExC_state->recurse)
	193	#define RExC_recurse_count (pRExC_state->recurse_count)
	194	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	195	#define RExC_contains_locale (pRExC_state->contains_locale)
	196	#define RExC_override_recoding (pRExC_state->override_recoding)
	197
	198	/* ISMULT1 tests a character value; ISMULT2 tests the character that pointer s refers to and also accepts '{' when regcurly(s) is true. */
	199	#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
	200	#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
	201	    ((*s) == '{' && regcurly(s)))
	202
	203	#ifdef SPSTART
	204	#undef SPSTART /* dratted cpp namespace... */
	205	#endif
	206	/*
	207	* Flags to be passed up and down.
	208	*/
	209	#define WORST 0 /* Worst case. */
	210	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	211
	212	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	213	* character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
	214	#define SIMPLE 0x02
	215	#define SPSTART 0x04 /* Starts with * or +. */
	216	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	217	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	218
	219	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	220
	221	/* whether trie related optimizations are enabled */
	222	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	223	#define TRIE_STUDY_OPT
	224	#define FULL_TRIE_STUDY
	225	#define TRIE_STCLASS
	226	#endif
	227
	228
	229	/* Bit-vector helpers: paren number 'paren' is bit (paren & 7) of byte (paren >> 3) in the buffer u8str. */
	230	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	231	#define PBITVAL(paren) (1 << ((paren) & 7))
	232	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	233	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
	234	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	235
	236	/* If not already in utf8, do a longjmp back to the beginning */
	237	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	238	#define REQUIRE_UTF8 STMT_START { \
	239	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	240	} STMT_END
	241
	242	/* About scan_data_t.
	243
	244	During optimisation we recurse through the regexp program performing
	245	various inplace (keyhole style) optimisations. In addition study_chunk
	246	and scan_commit populate this data structure with information about
	247	what strings MUST appear in the pattern. We look for the longest
	248	string that must appear at a fixed location, and we look for the
	249	longest string that may appear at a floating location. So for instance
	250	in the pattern:
	251
	252	/FOO[xX]A.*B[xX]BAR/
	253
	254	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	255	strings (because they follow a .* construct). study_chunk will identify
	256	both FOO and BAR as being the longest fixed and floating strings respectively.
	257
	258	The strings can be composites, for instance
	259
	260	/(f)(o)(o)/
	261
	262	will result in a composite fixed substring 'foo'.
	263
	264	For each string some basic information is maintained:
	265
	266	- offset or min_offset
	267	This is the position the string must appear at, or not before.
	268	It also implicitly (when combined with minlenp) tells us how many
	269	characters must match before the string we are searching for.
	270	Likewise when combined with minlenp and the length of the string it
	271	tells us how many characters must appear after the string we have
	272	found.
	273
	274	- max_offset
	275	Only used for floating strings. This is the rightmost point that
	276	the string can appear at. If set to I32 max it indicates that the
	277	string can occur infinitely far to the right.
	278
	279	- minlenp
	280	A pointer to the minimum length of the pattern that the string
	281	was found inside. This is important as in the case of positive
	282	lookahead or positive lookbehind we can have multiple patterns
	283	involved. Consider
	284
	285	/(?=FOO).*F/
	286
	287	The minimum length of the pattern overall is 3, the minimum length
	288	of the lookahead part is 3, but the minimum length of the part that
	289	will actually match is 1. So 'FOO's minimum length is 3, but the
	290	minimum length for the F is 1. This is important as the minimum length
	291	is used to determine offsets in front of and behind the string being
	292	looked for. Since strings can be composites this is the length of the
	293	pattern at the time it was committed with a scan_commit. Note that
	294	the length is calculated by study_chunk, so that the minimum lengths
	295	are not known until the full pattern has been compiled, thus the
	296	pointer to the value.
	297
	298	- lookbehind
	299
	300	In the case of lookbehind the string being searched for can be
	301	offset past the start point of the final matching string.
	302	If this value was just blithely removed from the min_offset it would
	303	invalidate some of the calculations for how many chars must match
	304	before or after (as they are derived from min_offset and minlen and
	305	the length of the string being searched for).
	306	When the final pattern is compiled and the data is moved from the
	307	scan_data_t structure into the regexp structure the information
	308	about lookbehind is factored in, with the information that would
	309	have been lost precalculated in the end_shift field for the
	310	associated string.
	311
	312	The fields pos_min and pos_delta are used to store the minimum offset
	313	and the delta to the maximum offset at the current point in the pattern.
	314
	315	*/
	316
	317	typedef struct scan_data_t { /* substring and position bookkeeping updated by study_chunk and scan_commit */
	318	    /*I32 len_min; unused */
	319	    /*I32 len_delta; unused */
	320	    I32 pos_min; /* minimum offset at the current point in the pattern */
	321	    I32 pos_delta; /* delta from pos_min to the maximum offset at the current point */
	322	    SV *last_found;
	323	    I32 last_end; /* min value, <0 unless valid. */
	324	    I32 last_start_min;
	325	    I32 last_start_max;
	326	    SV **longest; /* Either &l_fixed, or &l_float. */
	327	    SV *longest_fixed; /* longest fixed string found in pattern */
	328	    I32 offset_fixed; /* offset where it starts */
	329	    I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
	330	    I32 lookbehind_fixed; /* is the position of the string modified by LB */
	331	    SV *longest_float; /* longest floating string found in pattern */
	332	    I32 offset_float_min; /* earliest point in string it can appear */
	333	    I32 offset_float_max; /* latest point in string it can appear */
	334	    I32 *minlen_float; /* pointer to the minlen relevant to the string */
	335	    I32 lookbehind_float; /* is the position of the string modified by LB */
	336	    I32 flags;
	337	    I32 whilem_c;
	338	    I32 *last_closep;
	339	    struct regnode_charclass_class *start_class;
	340	} scan_data_t;
	341
	342	/*
	343	* Forward declarations for pregcomp()'s friends.
	344	*/
	345
	346	static const scan_data_t zero_scan_data = /* template with every initializer set to 0 */
	347	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	348	/* The SF_BEFORE_* bits live in scan_data_t.flags; scan_commit copies them, shifted by SF_FIX_SHIFT_EOL or SF_FL_SHIFT_EOL, into the SF_FIX_* / SF_FL_* positions. */
	349	#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
	350	#define SF_BEFORE_SEOL 0x0001
	351	#define SF_BEFORE_MEOL 0x0002
	352	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
	353	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
	354
	355	#ifdef NO_UNARY_PLUS
	356	# define SF_FIX_SHIFT_EOL (0+2)
	357	# define SF_FL_SHIFT_EOL (0+4)
	358	#else
	359	# define SF_FIX_SHIFT_EOL (+2)
	360	# define SF_FL_SHIFT_EOL (+4)
	361	#endif
	362
	363	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	364	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	365
	366	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	367	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	368	#define SF_IS_INF 0x0040
	369	#define SF_HAS_PAR 0x0080
	370	#define SF_IN_PAR 0x0100
	371	#define SF_HAS_EVAL 0x0200
	372	#define SCF_DO_SUBSTR 0x0400
	373	#define SCF_DO_STCLASS_AND 0x0800
	374	#define SCF_DO_STCLASS_OR 0x1000
	375	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) /* mask covering both STCLASS bits */
	376	#define SCF_WHILEM_VISITED_POS 0x2000
	377
	378	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	379	#define SCF_SEEN_ACCEPT 0x8000
	380
	381	#define UTF cBOOL(RExC_utf8)
	382	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	383	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	384	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	385	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	386	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	387	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	388	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	389
	390	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	391
	392	#define OOB_UNICODE 12345678
	393	#define OOB_NAMEDCLASS -1
	394
	395	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	396	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	397
	398
	399	/* length of regex to show in messages that don't mark a position within */
	400	#define RegexLengthToShowInErrorMessages 127
	401
	402	/*
	403	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	404	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	405	* op/pragma/warn/regcomp.
	406	*/
	407	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	408	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	409
	410	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	411
	412	/*
	413	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	414	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	415	* "...".
	416	*/
	417	#define _FAIL(code) STMT_START { \
	418	const char *ellipses = ""; \
	419	IV len = RExC_end - RExC_precomp; \
	420	\
	421	if (!SIZE_ONLY) \
	422	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	423	if (len > RegexLengthToShowInErrorMessages) { \
	424	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	425	len = RegexLengthToShowInErrorMessages - 10; \
	426	ellipses = "..."; \
	427	} \
	428	code; \
	429	} STMT_END
	430
	431	#define FAIL(msg) _FAIL( \
	432	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	433	msg, (int)len, RExC_precomp, ellipses))
	434
	435	#define FAIL2(msg,arg) _FAIL( \
	436	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	437	arg, (int)len, RExC_precomp, ellipses))
	438
	439	/*
	440	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	441	*/
	442	#define Simple_vFAIL(m) STMT_START { \
	443	const IV offset = RExC_parse - RExC_precomp; \
	444	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	445	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	446	} STMT_END
	447
	448	/*
	449	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	450	*/
	451	#define vFAIL(m) STMT_START { \
	452	if (!SIZE_ONLY) \
	453	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	454	Simple_vFAIL(m); \
	455	} STMT_END
	456
	457	/*
	458	* Like Simple_vFAIL(), but accepts two arguments.
	459	*/
	460	#define Simple_vFAIL2(m,a1) STMT_START { \
	461	const IV offset = RExC_parse - RExC_precomp; \
	462	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	463	(int)offset, RExC_precomp, RExC_precomp + offset); \
	464	} STMT_END
	465
	466	/*
	467	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	468	*/
	469	#define vFAIL2(m,a1) STMT_START { \
	470	if (!SIZE_ONLY) \
	471	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	472	Simple_vFAIL2(m, a1); \
	473	} STMT_END
	474
	475
	476	/*
	477	* Like Simple_vFAIL(), but accepts three arguments.
	478	*/
	479	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	480	const IV offset = RExC_parse - RExC_precomp; \
	481	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	482	(int)offset, RExC_precomp, RExC_precomp + offset); \
	483	} STMT_END
	484
	485	/*
	486	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	487	*/
	488	#define vFAIL3(m,a1,a2) STMT_START { \
	489	if (!SIZE_ONLY) \
	490	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	491	Simple_vFAIL3(m, a1, a2); \
	492	} STMT_END
	493
	494	/*
	495	* Like Simple_vFAIL(), but accepts four arguments.
	496	*/
	497	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	498	const IV offset = RExC_parse - RExC_precomp; \
	499	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	500	(int)offset, RExC_precomp, RExC_precomp + offset); \
	501	} STMT_END
	502
	503	#define ckWARNreg(loc,m) STMT_START { \
	504	const IV offset = loc - RExC_precomp; \
	505	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	506	(int)offset, RExC_precomp, RExC_precomp + offset); \
	507	} STMT_END
	508
	509	#define ckWARNregdep(loc,m) STMT_START { \
	510	const IV offset = loc - RExC_precomp; \
	511	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	512	m REPORT_LOCATION, \
	513	(int)offset, RExC_precomp, RExC_precomp + offset); \
	514	} STMT_END
	515
	516	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	517	const IV offset = loc - RExC_precomp; \
	518	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	519	m REPORT_LOCATION, \
	520	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	521	} STMT_END
	522
	523	#define ckWARN2reg(loc, m, a1) STMT_START { \
	524	const IV offset = loc - RExC_precomp; \
	525	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	526	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	527	} STMT_END
	528
	529	#define vWARN3(loc, m, a1, a2) STMT_START { \
	530	const IV offset = loc - RExC_precomp; \
	531	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	532	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	533	} STMT_END
	534
	535	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	536	const IV offset = loc - RExC_precomp; \
	537	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	538	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	539	} STMT_END
	540
	541	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	542	const IV offset = loc - RExC_precomp; \
	543	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	544	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	545	} STMT_END
	546
	547	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	548	const IV offset = loc - RExC_precomp; \
	549	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	550	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	551	} STMT_END
	552
	553	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	554	const IV offset = loc - RExC_precomp; \
	555	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	556	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	557	} STMT_END
	558
	559
	560	/* Allow for side effects in s */
	561	#define REGC(c,s) STMT_START { \
	562	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	563	} STMT_END
	564
	565	/* Macros for recording node offsets. 20001227 mjd@plover.com
	566	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	567	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	568	* Element 0 holds the number n.
	569	* Position is 1 indexed.
	570	*/
	571	#ifndef RE_TRACK_PATTERN_OFFSETS
	572	#define Set_Node_Offset_To_R(node,byte)
	573	#define Set_Node_Offset(node,byte)
	574	#define Set_Cur_Node_Offset
	575	#define Set_Node_Length_To_R(node,len)
	576	#define Set_Node_Length(node,len)
	577	#define Set_Node_Cur_Length(node)
	578	#define Node_Offset(n)
	579	#define Node_Length(n)
	580	#define Set_Node_Offset_Length(node,offset,len)
	581	#define ProgLen(ri) ri->u.proglen
	582	#define SetProgLen(ri,x) ri->u.proglen = x
	583	#else
	584	#define ProgLen(ri) ri->u.offsets[0]
	585	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	586	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	587	if (! SIZE_ONLY) { \
	588	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	589	__LINE__, (int)(node), (int)(byte))); \
	590	if((node) < 0) { \
	591	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	592	} else { \
	593	RExC_offsets[2*(node)-1] = (byte); \
	594	} \
	595	} \
	596	} STMT_END
	597
	598	#define Set_Node_Offset(node,byte) \
	599	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	600	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	601
	602	#define Set_Node_Length_To_R(node,len) STMT_START { \
	603	if (! SIZE_ONLY) { \
	604	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	605	__LINE__, (int)(node), (int)(len))); \
	606	if((node) < 0) { \
	607	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	608	} else { \
	609	RExC_offsets[2*(node)] = (len); \
	610	} \
	611	} \
	612	} STMT_END
	613
	614	#define Set_Node_Length(node,len) \
	615	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	616	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	617	#define Set_Node_Cur_Length(node) \
	618	Set_Node_Length(node, RExC_parse - parse_start)
	619
	620	/* Get offsets and lengths */
	621	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	622	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	623
	624	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	625	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	626	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	627	} STMT_END
	628	#endif
	629
	630	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	631	#define EXPERIMENTAL_INPLACESCAN
	632	#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
	633
	634	#define DEBUG_STUDYDATA(str,data,depth) \
	635	DEBUG_OPTIMISE_MORE_r(if(data){ \
	636	PerlIO_printf(Perl_debug_log, \
	637	"%*s" str "Pos:%"IVdf"/%"IVdf \
	638	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	639	(int)(depth)*2, "", \
	640	(IV)((data)->pos_min), \
	641	(IV)((data)->pos_delta), \
	642	(UV)((data)->flags), \
	643	(IV)((data)->whilem_c), \
	644	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	645	is_inf ? "INF " : "" \
	646	); \
	647	if ((data)->last_found) \
	648	PerlIO_printf(Perl_debug_log, \
	649	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	650	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	651	SvPVX_const((data)->last_found), \
	652	(IV)((data)->last_end), \
	653	(IV)((data)->last_start_min), \
	654	(IV)((data)->last_start_max), \
	655	((data)->longest && \
	656	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	657	SvPVX_const((data)->longest_fixed), \
	658	(IV)((data)->offset_fixed), \
	659	((data)->longest && \
	660	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	661	SvPVX_const((data)->longest_float), \
	662	(IV)((data)->offset_float_min), \
	663	(IV)((data)->offset_float_max) \
	664	); \
	665	PerlIO_printf(Perl_debug_log,"\n"); \
	666	});
	667
	668	static void clear_re(pTHX_ void *r);
	669
	670	/* Mark that we cannot extend a found fixed substring at this point.
	671	 * Update the longest found anchored substring and the longest found
	672	 * floating substrings if needed, then empty data->last_found.
	673	 * minlenp is stored as the chosen string's minlen pointer; a true is_inf pins offset_float_max to I32_MAX. */
	674	STATIC void
	675	S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
	676	{
	677	    const STRLEN l = CHR_SVLEN(data->last_found);
	678	    const STRLEN old_l = CHR_SVLEN(*data->longest);
	679	    GET_RE_DEBUG_FLAGS_DECL;
	680	
	681	    PERL_ARGS_ASSERT_SCAN_COMMIT;
	682	    /* Replace the recorded string when the new one is longer, or equally long and flagged SF_BEFORE_EOL. */
	683	    if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
	684	        SvSetMagicSV(*data->longest, data->last_found);
	685	        if (*data->longest == data->longest_fixed) {
	686	            data->offset_fixed = l ? data->last_start_min : data->pos_min;
	687	            if (data->flags & SF_BEFORE_EOL)
	688	                data->flags
	689	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	690	            else
	691	                data->flags &= ~SF_FIX_BEFORE_EOL;
	692	            data->minlen_fixed=minlenp;
	693	            data->lookbehind_fixed=0;
	694	        }
	695	        else { /* *data->longest == data->longest_float */
	696	            data->offset_float_min = l ? data->last_start_min : data->pos_min;
	697	            data->offset_float_max = (l
	698	                                      ? data->last_start_max
	699	                                      : data->pos_min + data->pos_delta);
	700	            if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
	701	                data->offset_float_max = I32_MAX;
	702	            if (data->flags & SF_BEFORE_EOL)
	703	                data->flags
	704	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	705	            else
	706	                data->flags &= ~SF_FL_BEFORE_EOL;
	707	            data->minlen_float=minlenp;
	708	            data->lookbehind_float=0;
	709	        }
	710	    }
	711	    SvCUR_set(data->last_found, 0); /* start collecting a fresh string */
	712	    {
	713	        SV * const sv = data->last_found;
	714	        if (SvUTF8(sv) && SvMAGICAL(sv)) {
	715	            MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	716	            if (mg)
	717	                mg->mg_len = 0; /* keep the length held in the utf8 magic in step with the truncation */
	718	        }
	719	    }
	720	    data->last_end = -1; /* <0 means "not valid" for last_end */
	721	    data->flags &= ~SF_BEFORE_EOL;
	722	    DEBUG_STUDYDATA("commit: ",data,0);
	723	}
	724
	725	/* Can match anything (initialization): sets every bitmap bit of 'cl' and overwrites cl->flags with the match-all flag set. */
	726	STATIC void
	727	S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	728	{
	729	    PERL_ARGS_ASSERT_CL_ANYTHING;
	730	
	731	    ANYOF_BITMAP_SETALL(cl);
	732	    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
	733	                |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
	734	
	735	    /* If any portion of the regex is to operate under locale rules,
	736	     * initialization includes it. The reason this isn't done for all regexes
	737	     * is that the optimizer was written under the assumption that locale was
	738	     * all-or-nothing. Given the complexity and lack of documentation in the
	739	     * optimizer, and that there are inadequate test cases for locale, so many
	740	     * parts of it may not work properly, it is safest to avoid locale unless
	741	     * necessary. */
	742	    if (RExC_contains_locale) {
	743	        ANYOF_CLASS_SETALL(cl); /* /l uses class */
	744	        cl->flags |= ANYOF_LOCALE;
	745	    }
	746	    else {
	747	        ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	748	    }
	749	}
	750
	751	/* Returns 1 when 'cl' is found to match anything, 0 otherwise. */
	752	STATIC int
	753	S_cl_is_anything(const struct regnode_charclass_class *cl)
	754	{
	755	    int pair = 0;
	756	
	757	    PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	758	    /* Classes are examined two at a time (pair, pair + 1); presumably a class and its complement -- TODO confirm. */
	759	    while (pair <= ANYOF_MAX) {
	760	        if (ANYOF_CLASS_TEST(cl, pair) && ANYOF_CLASS_TEST(cl, pair + 1))
	761	            return 1;
	762	        pair += 2;
	763	    }
	764	    if (!(cl->flags & ANYOF_UNICODE_ALL) || !ANYOF_BITMAP_TESTALLSET((const void*)cl))
	765	        return 0; /* needs both ANYOF_UNICODE_ALL and a completely set bitmap */
	766	    return 1;
	767	}
	768
	769	/* Initialize 'cl': zero it, type it as ANYOF, make it match anything via cl_anything(), and set its ARG to ANYOF_NONBITMAP_EMPTY. */
	770	STATIC void
	771	S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	772	{
	773	    PERL_ARGS_ASSERT_CL_INIT;
	774	
	775	    Zero(cl, 1, struct regnode_charclass_class);
	776	    cl->type = ANYOF;
	777	    cl_anything(pRExC_state, cl);
	778	    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	779	}
	780
	781	/* These two functions currently do the exact same thing */
	782	#define cl_init_zero S_cl_init
	783
	784	/* 'AND' a given class with another one. Can create false positives. 'cl'
	785	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	786	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	787	STATIC void
	788	S_cl_and(struct regnode_charclass_class *cl,
	789	const struct regnode_charclass_class *and_with)
	790	{
	791	PERL_ARGS_ASSERT_CL_AND;
	792
	793	assert(and_with->type == ANYOF);
	794
	795	/* I (khw) am not sure all these restrictions are necessary XXX */
	796	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	797	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	798	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	799	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	800	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	801	int i;
	802
	803	if (and_with->flags & ANYOF_INVERT)
	804	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	805	cl->bitmap[i] &= ~and_with->bitmap[i];
	806	else
	807	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	808	cl->bitmap[i] &= and_with->bitmap[i];
	809	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	810
	811	if (and_with->flags & ANYOF_INVERT) {
	812
	813	/* Here, the and'ed node is inverted. Get the AND of the flags that
	814	* aren't affected by the inversion. Those that are affected are
	815	* handled individually below */
	816	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	817	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	818	cl->flags \|= affected_flags;
	819
	820	/* We currently don't know how to deal with things that aren't in the
	821	* bitmap, but we know that the intersection is no greater than what
	822	* is already in cl, so let there be false positives that get sorted
	823	* out after the synthetic start class succeeds, and the node is
	824	* matched for real. */
	825
	826	/* The inversion of these two flags indicate that the resulting
	827	* intersection doesn't have them */
	828	if (and_with->flags & ANYOF_UNICODE_ALL) {
	829	cl->flags &= ~ANYOF_UNICODE_ALL;
	830	}
	831	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	832	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	833	}
	834	}
	835	else { /* and'd node is not inverted */
	836	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	837
	838	if (! ANYOF_NONBITMAP(and_with)) {
	839
	840	/* Here 'and_with' doesn't match anything outside the bitmap
	841	* (except possibly ANYOF_UNICODE_ALL), which means the
	842	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	843	* which case we don't know what the intersection is, but it's no
	844	* greater than what cl already has, so can just leave it alone,
	845	* with possible false positives */
	846	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	847	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	848	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	849	}
	850	}
	851	else if (! ANYOF_NONBITMAP(cl)) {
	852
	853	/* Here, 'and_with' does match something outside the bitmap, and cl
	854	* doesn't have a list of things to match outside the bitmap. If
	855	* cl can match all code points above 255, the intersection will
	856	* be those above-255 code points that 'and_with' matches. If cl
	857	* can't match all Unicode code points, it means that it can't
	858	* match anything outside the bitmap (since the 'if' that got us
	859	* into this block tested for that), so we leave the bitmap empty.
	860	*/
	861	if (cl->flags & ANYOF_UNICODE_ALL) {
	862	ARG_SET(cl, ARG(and_with));
	863
	864	/* and_with's ARG may match things that don't require UTF8.
	865	* And now cl's will too, in spite of this being an 'and'. See
	866	* the comments below about the kludge */
	867	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	868	}
	869	}
	870	else {
	871	/* Here, both 'and_with' and cl match something outside the
	872	* bitmap. Currently we do not do the intersection, so just match
	873	* whatever cl had at the beginning. */
	874	}
	875
	876
	877	/* Take the intersection of the two sets of flags. However, the
	878	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	879	* kludge around the fact that this flag is not treated like the others
	880	* which are initialized in cl_anything(). The way the optimizer works
	881	* is that the synthetic start class (SSC) is initialized to match
	882	* anything, and then the first time a real node is encountered, its
	883	* values are AND'd with the SSC's with the result being the values of
	884	* the real node. However, there are paths through the optimizer where
	885	* the AND never gets called, so those initialized bits are set
	886	* inappropriately, which is not usually a big deal, as they just cause
	887	* false positives in the SSC, which will just mean a probably
	888	* imperceptible slow down in execution. However this bit has a
	889	* higher false positive consequence in that it can cause utf8.pm,
	890	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	891	* bigger slowdown and also causes significant extra memory to be used.
	892	* In order to prevent this, the code now takes a different tack. The
	893	* bit isn't set unless some part of the regular expression needs it,
	894	* but once set it won't get cleared. This means that these extra
	895	* modules won't get loaded unless there was some path through the
	896	* pattern that would have required them anyway, and so any false
	897	* positives that occur by not ANDing them out when they could be
	898	* aren't as severe as they would be if we treated this bit like all
	899	* the others */
	900	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	901	& ANYOF_NONBITMAP_NON_UTF8;
	902	cl->flags &= and_with->flags;
	903	cl->flags \|= outside_bitmap_but_not_utf8;
	904	}
	905	}
	906
	907	/* 'OR' a given class with another one. Can create false positives. 'cl'
	908	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	909	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	910	STATIC void
	911	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	912	{
	913	PERL_ARGS_ASSERT_CL_OR;
	914
	915	if (or_with->flags & ANYOF_INVERT) {
	916
	917	/* Here, the or'd node is to be inverted. This means we take the
	918	* complement of everything not in the bitmap, but currently we don't
	919	* know what that is, so give up and match anything */
	920	if (ANYOF_NONBITMAP(or_with)) {
	921	cl_anything(pRExC_state, cl);
	922	}
	923	/* We do not use
	924	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	925	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	926	* which is wasteful if CL2 is small, but we ignore CL2:
	927	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	928	* XXXX Can we handle case-fold? Unclear:
	929	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	930	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	931	*/
	932	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	933	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	934	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	935	int i;
	936
	937	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	938	cl->bitmap[i] \|= ~or_with->bitmap[i];
	939	} /* XXXX: logic is complicated otherwise */
	940	else {
	941	cl_anything(pRExC_state, cl);
	942	}
	943
	944	/* And, we can just take the union of the flags that aren't affected
	945	* by the inversion */
	946	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	947
	948	/* For the remaining flags:
	949	ANYOF_UNICODE_ALL and inverted means to not match anything above
	950	255, which means that the union with cl should just be
	951	what cl has in it, so can ignore this flag
	952	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	953	is 127-255 to match them, but then invert that, so the
	954	union with cl should just be what cl has in it, so can
	955	ignore this flag
	956	*/
	957	} else { /* 'or_with' is not inverted */
	958	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	959	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	960	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	961	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	962	int i;
	963
	964	/* OR char bitmap and class bitmap separately */
	965	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	966	cl->bitmap[i] \|= or_with->bitmap[i];
	967	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	968	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	969	cl->classflags[i] \|= or_with->classflags[i];
	970	cl->flags \|= ANYOF_CLASS;
	971	}
	972	}
	973	else { /* XXXX: logic is complicated, leave it along for a moment. */
	974	cl_anything(pRExC_state, cl);
	975	}
	976
	977	if (ANYOF_NONBITMAP(or_with)) {
	978
	979	/* Use the added node's outside-the-bit-map match if there isn't a
	980	* conflict. If there is a conflict (both nodes match something
	981	* outside the bitmap, but what they match outside is not the same
	982	* pointer, and hence not easily compared until XXX we extend
	983	* inversion lists this far), give up and allow the start class to
	984	* match everything outside the bitmap. If that stuff is all above
	985	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	986	if (! ANYOF_NONBITMAP(cl)) {
	987	ARG_SET(cl, ARG(or_with));
	988	}
	989	else if (ARG(cl) != ARG(or_with)) {
	990
	991	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	992	cl_anything(pRExC_state, cl);
	993	}
	994	else {
	995	cl->flags \|= ANYOF_UNICODE_ALL;
	996	}
	997	}
	998	}
	999
	1000	/* Take the union */
	1001	cl->flags \|= or_with->flags;
	1002	}
	1003	}
	1004
	/* Accessors for the per-state transition list used while a trie is being
	 * built in "list" form.  All of them expect a variable named 'trie' to be
	 * in scope at the point of use.
	 *
	 * Element 0 of a state's list is a header: its 'forid' field holds the
	 * index of the next free slot (CUR) and its 'newstate' field holds the
	 * allocated length (LEN).  Real entries start at index 1, hence the "- 1"
	 * in TRIE_LIST_USED, which yields 0 for a state that has no list yet. */
	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	/* Uses its own argument throughout, rather than silently relying on the
	 * caller having a variable called 'state'. */
	#define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1009
	1010
	1011	#ifdef DEBUGGING
	1012	/*
	1013	dump_trie(trie,widecharmap,revcharmap)
	1014	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1015	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1016
	1017	These routines dump out a trie in a somewhat readable format.
	1018	The _interim_ variants are used for debugging the interim
	1019	tables that are used to generate the final compressed
	1020	representation which is what dump_trie expects.
	1021
	1022	Part of the reason for their existence is to provide a form
	1023	of documentation as to how the different representations function.
	1024
	1025	*/
	1026
	1027	/*
	1028	Dumps the final compressed table form of the trie to Perl_debug_log.
	1029	Used for debugging make_trie().
	1030	*/
	1031
	1032	STATIC void
	1033	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1034	AV *revcharmap, U32 depth)
	1035	{
	1036	U32 state;
	1037	SV *sv=sv_newmortal();
	1038	int colwidth= widecharmap ? 6 : 4;
	1039	U16 word;
	1040	GET_RE_DEBUG_FLAGS_DECL;
	1041
	1042	PERL_ARGS_ASSERT_DUMP_TRIE;
	1043
	1044	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1045	(int)depth * 2 + 2,"",
	1046	"Match","Base","Ofs" );
	1047
	1048	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1049	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1050	if ( tmp ) {
	1051	PerlIO_printf( Perl_debug_log, "%*s",
	1052	colwidth,
	1053	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1054	PL_colors[0], PL_colors[1],
	1055	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1056	PERL_PV_ESCAPE_FIRSTCHAR
	1057	)
	1058	);
	1059	}
	1060	}
	1061	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1062	(int)depth * 2 + 2,"");
	1063
	1064	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1065	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1066	PerlIO_printf( Perl_debug_log, "\n");
	1067
	1068	for( state = 1 ; state < trie->statecount ; state++ ) {
	1069	const U32 base = trie->states[ state ].trans.base;
	1070
	1071	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1072
	1073	if ( trie->states[ state ].wordnum ) {
	1074	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1075	} else {
	1076	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1077	}
	1078
	1079	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1080
	1081	if ( base ) {
	1082	U32 ofs = 0;
	1083
	1084	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1085	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1086	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1087	ofs++;
	1088
	1089	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1090
	1091	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1092	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1093	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1094	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1095	{
	1096	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1097	colwidth,
	1098	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1099	} else {
	1100	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1101	}
	1102	}
	1103
	1104	PerlIO_printf( Perl_debug_log, "]");
	1105
	1106	}
	1107	PerlIO_printf( Perl_debug_log, "\n" );
	1108	}
	1109	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1110	for (word=1; word <= trie->wordcount; word++) {
	1111	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1112	(int)word, (int)(trie->wordinfo[word].prev),
	1113	(int)(trie->wordinfo[word].len));
	1114	}
	1115	PerlIO_printf(Perl_debug_log, "\n" );
	1116	}
	1117	/*
	1118	Dumps a fully constructed but uncompressed trie in list form.
	1119	List tries normally only are used for construction when the number of
	1120	possible chars (trie->uniquecharcount) is very high.
	1121	Used for debugging make_trie().
	1122	*/
	1123	STATIC void
	1124	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1125	HV widecharmap, AV revcharmap, U32 next_alloc,
	1126	U32 depth)
	1127	{
	1128	U32 state;
	1129	SV *sv=sv_newmortal();
	1130	int colwidth= widecharmap ? 6 : 4;
	1131	GET_RE_DEBUG_FLAGS_DECL;
	1132
	1133	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1134
	1135	/* print out the table precompression. */
	1136	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1137	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1138	"------:-----+-----------------\n" );
	1139
	1140	for( state=1 ; state < next_alloc ; state ++ ) {
	1141	U16 charid;
	1142
	1143	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1144	(int)depth * 2 + 2,"", (UV)state );
	1145	if ( ! trie->states[ state ].wordnum ) {
	1146	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1147	} else {
	1148	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1149	trie->states[ state ].wordnum
	1150	);
	1151	}
	1152	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1153	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1154	if ( tmp ) {
	1155	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1156	colwidth,
	1157	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1158	PL_colors[0], PL_colors[1],
	1159	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1160	PERL_PV_ESCAPE_FIRSTCHAR
	1161	) ,
	1162	TRIE_LIST_ITEM(state,charid).forid,
	1163	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1164	);
	1165	if (!(charid % 10))
	1166	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1167	(int)((depth * 2) + 14), "");
	1168	}
	1169	}
	1170	PerlIO_printf( Perl_debug_log, "\n");
	1171	}
	1172	}
	1173
	1174	/*
	1175	Dumps a fully constructed but uncompressed trie in table form.
	1176	This is the normal DFA style state transition table, with a few
	1177	twists to facilitate compression later.
	1178	Used for debugging make_trie().
	1179	*/
	1180	STATIC void
	1181	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1182	HV widecharmap, AV revcharmap, U32 next_alloc,
	1183	U32 depth)
	1184	{
	1185	U32 state;
	1186	U16 charid;
	1187	SV *sv=sv_newmortal();
	1188	int colwidth= widecharmap ? 6 : 4;
	1189	GET_RE_DEBUG_FLAGS_DECL;
	1190
	1191	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1192
	1193	/*
	1194	print out the table precompression so that we can do a visual check
	1195	that they are identical.
	1196	*/
	1197
	1198	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1199
	1200	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1201	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1202	if ( tmp ) {
	1203	PerlIO_printf( Perl_debug_log, "%*s",
	1204	colwidth,
	1205	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1206	PL_colors[0], PL_colors[1],
	1207	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1208	PERL_PV_ESCAPE_FIRSTCHAR
	1209	)
	1210	);
	1211	}
	1212	}
	1213
	1214	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1215
	1216	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1217	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1218	}
	1219
	1220	PerlIO_printf( Perl_debug_log, "\n" );
	1221
	1222	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1223
	1224	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1225	(int)depth * 2 + 2,"",
	1226	(UV)TRIE_NODENUM( state ) );
	1227
	1228	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1229	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1230	if (v)
	1231	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1232	else
	1233	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1234	}
	1235	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1236	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1237	} else {
	1238	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1239	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1240	}
	1241	}
	1242	}
	1243
	1244	#endif
	1245
	1246
	1247	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1248	startbranch: the first branch in the whole branch sequence
	1249	first : start branch of sequence of branch-exact nodes.
	1250	May be the same as startbranch
	1251	last : Thing following the last branch.
	1252	May be the same as tail.
	1253	tail : item following the branch sequence
	1254	count : words in the sequence
	1255	flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
	1256	depth : indent depth
	1257
	1258	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1259
	1260	A trie is an N'ary tree where the branches are determined by digital
	1261	decomposition of the key. IE, at the root node you look up the 1st character and
	1262	follow that branch repeat until you find the end of the branches. Nodes can be
	1263	marked as "accepting" meaning they represent a complete word. Eg:
	1264
	1265	/he|she|his|hers/
	1266
	1267	would convert into the following structure. Numbers represent states, letters
	1268	following numbers represent valid transitions on the letter from that state, if
	1269	the number is in square brackets it represents an accepting state, otherwise it
	1270	will be in parenthesis.
	1271
	1272	      +-h->+-e->[3]-+-r->(8)-+-s->[9]
	1273	      |    |
	1274	      |   (2)
	1275	      |    |
	1276	     (1)   +-i->(6)-+-s->[7]
	1277	      |
	1278	      +-s->(3)-+-h->(4)-+-e->[5]
	1279
	1280	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1281
	1282	This shows that when matching against the string 'hers' we will begin at state 1
	1283	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1284	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1285	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1286	single traverse. We store a mapping from accepting to state to which word was
	1287	matched, and then when we have multiple possibilities we try to complete the
	1288	rest of the regex in the order in which they occurred in the alternation.
	1289
	1290	The only prior NFA like behaviour that would be changed by the TRIE support is
	1291	the silent ignoring of duplicate alternations which are of the form:
	1292
	1293	/ (DUPE|DUPE) X? (?{ ... }) Y /x
	1294
	1295	Thus EVAL blocks following a trie may be called a different number of times with
	1296	and without the optimisation. With the optimisations dupes will be silently
	1297	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1298	the following demonstrates:
	1299
	1300	'words'=~/(word|word|word)(?{ print $1 })[xyz]/
	1301
	1302	which prints out 'word' three times, but
	1303
	1304	'words'=~/(word|word|word)(?{ print $1 })S/
	1305
	1306	which doesn't print it out at all. This is due to other optimisations kicking in.
	1307
	1308	Example of what happens on a structural level:
	1309
	1310	The regexp /(ac|ad|ab)+/ will produce the following debug output:
	1311
	1312	1: CURLYM[1] {1,32767}(18)
	1313	5: BRANCH(8)
	1314	6: EXACT <ac>(16)
	1315	8: BRANCH(11)
	1316	9: EXACT <ad>(16)
	1317	11: BRANCH(14)
	1318	12: EXACT <ab>(16)
	1319	16: SUCCEED(0)
	1320	17: NOTHING(18)
	1321	18: END(0)
	1322
	1323	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1324	and should turn into:
	1325
	1326	1: CURLYM[1] {1,32767}(18)
	1327	5: TRIE(16)
	1328	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1329	<ac>
	1330	<ad>
	1331	<ab>
	1332	16: SUCCEED(0)
	1333	17: NOTHING(18)
	1334	18: END(0)
	1335
	1336	Cases where tail != last would be like /(?:foo|bar)baz/:
	1337
	1338	1: BRANCH(4)
	1339	2: EXACT <foo>(8)
	1340	4: BRANCH(7)
	1341	5: EXACT <bar>(8)
	1342	7: TAIL(8)
	1343	8: EXACT <baz>(10)
	1344	10: END(0)
	1345
	1346	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1347	and would end up looking like:
	1348
	1349	1: TRIE(8)
	1350	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1351	<foo>
	1352	<bar>
	1353	7: TAIL(8)
	1354	8: EXACT <baz>(10)
	1355	10: END(0)
	1356
	1357	d = uvuni_to_utf8_flags(d, uv, 0);
	1358
	1359	is the recommended Unicode-aware way of saying
	1360
	1361	*(d++) = uv;
	1362	*/
	1363
	/* Append to 'revcharmap' a new SV holding the character whose code point
	 * is in 'uvc'.  Both names, and UTF, are taken from the scope where the
	 * macro is expanded.
	 *
	 * Under UTF the code point is written out with uvuni_to_utf8() and the SV
	 * is flagged as UTF-8.  The buffer is sized with UTF8_MAXBYTES and the
	 * code point is passed unmasked, so that code points above 0xFF are
	 * stored intact rather than being truncated to their low byte.  Otherwise
	 * the low byte of 'uvc' is stored as a one-character string. */
	#define TRIE_STORE_REVCHAR                                                  \
	    STMT_START {                                                            \
	        if (UTF) {                                                          \
	            SV *zlopp = newSV(UTF8_MAXBYTES);                               \
	            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);       \
	            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc); \
	            SvCUR_set(zlopp, kapow - flrbbbbb);                             \
	            SvPOK_on(zlopp);                                                \
	            SvUTF8_on(zlopp);                                               \
	            av_push(revcharmap, zlopp);                                     \
	        } else {                                                            \
	            char ooooff = (char)uvc;                                        \
	            av_push(revcharmap, newSVpvn(&ooooff, 1));                      \
	        }                                                                   \
	    } STMT_END
	1379
	/* Read the next character of the word being scanned into 'uvc' and set
	 * 'len' to the amount the caller should advance 'uc' by.  Also bumps
	 * 'wordlen'.  Every variable named here (wordlen, folder, foldlen, scan,
	 * len, uc, uvc, foldbuf, uniflags) and UTF come from the scope where the
	 * macro is expanded.
	 *
	 *  - Not UTF: the byte at 'uc' is the character and 'len' is 1.
	 *  - UTF without folding: decode one character at 'uc'.
	 *  - UTF with folding: if 'foldlen' is still positive, the next character
	 *    is decoded from 'scan' and 'len' is set to 0 so 'uc' does not move.
	 *    Otherwise the character at 'uc' is folded into 'foldbuf', the first
	 *    folded character is returned, and 'scan'/'foldlen' are left
	 *    describing whatever remains in 'foldbuf'. */
	#define TRIE_READ_CHAR STMT_START {                                           \
	    wordlen++;                                                                \
	    if ( UTF ) {                                                              \
	        if ( folder ) {                                                       \
	            if ( foldlen > 0 ) {                                              \
	               uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
	               foldlen -= len;                                                \
	               scan += len;                                                   \
	               len = 0;                                                       \
	            } else {                                                          \
	                len = UTF8SKIP(uc);                                           \
	                uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
	                foldlen -= UNISKIP( uvc );                                    \
	                scan = foldbuf + UNISKIP( uvc );                              \
	            }                                                                 \
	        } else {                                                              \
	            uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
	        }                                                                     \
	    } else {                                                                  \
	        uvc = (U32)*uc;                                                       \
	        len = 1;                                                              \
	    }                                                                         \
	} STMT_END
	1403
	1404
	1405
	/* Append the transition (fid -> ns) to the list of 'state', doubling the
	 * allocated length first if the list is full.  Expects 'trie' to be in
	 * scope and the list to have been created with TRIE_LIST_NEW. */
	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                           \
	    if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {                \
	        U32 ging = TRIE_LIST_LEN( state ) *= 2;                             \
	        Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	    }                                                                       \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;            \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;          \
	    TRIE_LIST_CUR( state )++;                                               \
	} STMT_END

	/* Allocate a zeroed, four-element transition list for 'state'.  Element 0
	 * is the header, so the first free slot (CUR) starts at 1 and the
	 * recorded length (LEN) is 4. */
	#define TRIE_LIST_NEW(state) STMT_START {                       \
	    Newxz( trie->states[ state ].trans.list,                    \
	        4, reg_trie_trans_le );                                 \
	    TRIE_LIST_CUR( state ) = 1;                                 \
	    TRIE_LIST_LEN( state ) = 4;                                 \
	} STMT_END
	1422
	/* Record that the word just scanned ends in 'state'.  Besides its
	 * argument the macro uses these names from the scope where it is
	 * expanded: trie, noper, cur, tail, convert, curword, wordlen,
	 * word_count, jumper, nextbranch and (under DEBUG_r) trie_words.
	 *
	 * It increments 'curword', fills in that word's wordinfo entry, and, when
	 * the node following 'noper' comes before 'tail', records a jump offset
	 * for the word (allocating trie->jump on first use).  If 'state' already
	 * has a word number the new word is a duplicate and is linked into that
	 * word's 'prev' chain; otherwise 'state' is marked as accepting the new
	 * word. */
	#define TRIE_HANDLE_WORD(state) STMT_START {                                    \
	    U16 dupe= trie->states[ state ].wordnum;                                    \
	    regnode * const noper_next = regnext( noper );                              \
	                                                                                \
	    DEBUG_r({                                                                   \
	        /* store the word for dumping */                                        \
	        SV* tmp;                                                                \
	        if (OP(noper) != NOTHING)                                               \
	            tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);            \
	        else                                                                    \
	            tmp = newSVpvn_utf8( "", 0, UTF );                                  \
	        av_push( trie_words, tmp );                                             \
	    });                                                                         \
	                                                                                \
	    curword++;                                                                  \
	    trie->wordinfo[curword].prev   = 0;                                         \
	    trie->wordinfo[curword].len    = wordlen;                                   \
	    trie->wordinfo[curword].accept = state;                                     \
	                                                                                \
	    if ( noper_next < tail ) {                                                  \
	        if (!trie->jump)                                                        \
	            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	        trie->jump[curword] = (U16)(noper_next - convert);                      \
	        if (!jumper)                                                            \
	            jumper = noper_next;                                                \
	        if (!nextbranch)                                                        \
	            nextbranch= regnext(cur);                                           \
	    }                                                                           \
	                                                                                \
	    if ( dupe ) {                                                               \
	        /* It's a dupe. Pre-insert into the wordinfo[].prev   */                \
	        /* chain, so that when the bits of chain are later    */                \
	        /* linked together, the dups appear in the chain      */                \
	        trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev;               \
	        trie->wordinfo[dupe].prev = curword;                                    \
	    } else {                                                                    \
	        /* we haven't inserted this word yet.                */                 \
	        trie->states[ state ].wordnum = curword;                                \
	    }                                                                           \
	} STMT_END
	1463
	1464
	/* Look up, in the compressed transition table, the state reached from
	 * 'state' on character id 'charid', where 'base' is that state's base
	 * offset and 'ucharcount' the number of unique characters.  'trie' and
	 * 'ubound' are taken from the scope where the macro is expanded.
	 *
	 * The slot is used only if it lies inside the table, is marked as
	 * belonging to 'state' and holds a non-zero target.  Otherwise the result
	 * is 'special' when 'state' is 1 and 0 for any other state.  Arguments
	 * may be evaluated more than once. */
	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)              \
	     ( ( (base) + (charid) >= (ucharcount)                                  \
	         && (base) + (charid) < ubound                                      \
	         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
	         && trie->trans[ (base) - (ucharcount) + (charid) ].next )          \
	           ? trie->trans[ (base) - (ucharcount) + (charid) ].next           \
	           : ( (state)==1 ? (special) : 0 )                                 \
	      )

	/* Distinct bit values describing what kind of trie was made */
	#define MADE_TRIE       1
	#define MADE_JUMP_TRIE  2
	#define MADE_EXACT_TRIE 4
	1477
	1478	STATIC I32
	1479	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1480	{
	1481	dVAR;
	1482	/* first pass, loop through and scan words */
	1483	reg_trie_data *trie;
	1484	HV *widecharmap = NULL;
	1485	AV *revcharmap = newAV();
	1486	regnode *cur;
	1487	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1488	STRLEN len = 0;
	1489	UV uvc = 0;
	1490	U16 curword = 0;
	1491	U32 next_alloc = 0;
	1492	regnode *jumper = NULL;
	1493	regnode *nextbranch = NULL;
	1494	regnode *convert = NULL;
	1495	U32 prev_states; / temp array mapping each state to previous one */
	1496	/* we just use folder as a flag in utf8 */
	1497	const U8 * folder = NULL;
	1498
	1499	#ifdef DEBUGGING
	1500	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1501	AV *trie_words = NULL;
	1502	/* along with revcharmap, this only used during construction but both are
	1503	* useful during debugging so we store them in the struct when debugging.
	1504	*/
	1505	#else
	1506	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1507	STRLEN trie_charcount=0;
	1508	#endif
	1509	SV *re_trie_maxbuff;
	1510	GET_RE_DEBUG_FLAGS_DECL;
	1511
	1512	PERL_ARGS_ASSERT_MAKE_TRIE;
	1513	#ifndef DEBUGGING
	1514	PERL_UNUSED_ARG(depth);
	1515	#endif
	1516
	1517	switch (flags) {
	1518	case EXACTFA:
	1519	case EXACTFU: folder = PL_fold_latin1; break;
	1520	case EXACTF: folder = PL_fold; break;
	1521	case EXACTFL: folder = PL_fold_locale; break;
	1522	}
	1523
	1524	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1525	trie->refcount = 1;
	1526	trie->startstate = 1;
	1527	trie->wordcount = word_count;
	1528	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1529	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1530	if (!(UTF && folder))
	1531	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1532	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1533	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1534
	1535	DEBUG_r({
	1536	trie_words = newAV();
	1537	});
	1538
	1539	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1540	if (!SvIOK(re_trie_maxbuff)) {
	1541	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1542	}
	1543	DEBUG_OPTIMISE_r({
	1544	PerlIO_printf( Perl_debug_log,
	1545	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1546	(int)depth * 2 + 2, "",
	1547	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1548	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1549	(int)depth);
	1550	});
	1551
	1552	/* Find the node we are going to overwrite */
	1553	if ( first == startbranch && OP( last ) != BRANCH ) {
	1554	/* whole branch chain */
	1555	convert = first;
	1556	} else {
	1557	/* branch sub-chain */
	1558	convert = NEXTOPER( first );
	1559	}
	1560
	1561	/* -- First loop and Setup --
	1562
	1563	We first traverse the branches and scan each word to determine if it
	1564	contains widechars, and how many unique chars there are, this is
	1565	important as we have to build a table with at least as many columns as we
	1566	have unique chars.
	1567
	1568	We use an array of integers to represent the character codes 0..255
	1569	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1570	native representation of the character value as the key and IV's for the
	1571	coded index.
	1572
	1573	TODO If we keep track of how many times each character is used we can
	1574	remap the columns so that the table compression later on is more
	1575	efficient in terms of memory by ensuring the most common value is in the
	1576	middle and the least common are on the outside. IMO this would be better
	1577	than a most to least common mapping as theres a decent chance the most
	1578	common letter will share a node with the least common, meaning the node
	1579	will not be compressible. With a middle is most common approach the worst
	1580	case is when we have the least common nodes twice.
	1581
	1582	*/
	1583
	1584	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1585	regnode * const noper = NEXTOPER( cur );
	1586	const U8 uc = (U8)STRING( noper );
	1587	const U8 * const e = uc + STR_LEN( noper );
	1588	STRLEN foldlen = 0;
	1589	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1590	const U8 scan = (U8)NULL;
	1591	U32 wordlen = 0; /* required init */
	1592	STRLEN chars = 0;
	1593	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1594
	1595	if (OP(noper) == NOTHING) {
	1596	trie->minlen= 0;
	1597	continue;
	1598	}
	1599	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1600	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1601	regardless of encoding */
	1602
	1603	for ( ; uc < e ; uc += len ) {
	1604	TRIE_CHARCOUNT(trie)++;
	1605	TRIE_READ_CHAR;
	1606	chars++;
	1607	if ( uvc < 256 ) {
	1608	if ( !trie->charmap[ uvc ] ) {
	1609	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1610	if ( folder )
	1611	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1612	TRIE_STORE_REVCHAR;
	1613	}
	1614	if ( set_bit ) {
	1615	/* store the codepoint in the bitmap, and its folded
	1616	* equivalent. */
	1617	TRIE_BITMAP_SET(trie,uvc);
	1618
	1619	/* store the folded codepoint */
	1620	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1621
	1622	if ( !UTF ) {
	1623	/* store first byte of utf8 representation of
	1624	variant codepoints */
	1625	if (! UNI_IS_INVARIANT(uvc)) {
	1626	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1627	}
	1628	}
	1629	set_bit = 0; /* We've done our bit :-) */
	1630	}
	1631	} else {
	1632	SV** svpp;
	1633	if ( !widecharmap )
	1634	widecharmap = newHV();
	1635
	1636	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1637
	1638	if ( !svpp )
	1639	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1640
	1641	if ( !SvTRUE( *svpp ) ) {
	1642	sv_setiv( *svpp, ++trie->uniquecharcount );
	1643	TRIE_STORE_REVCHAR;
	1644	}
	1645	}
	1646	}
	1647	if( cur == first ) {
	1648	trie->minlen=chars;
	1649	trie->maxlen=chars;
	1650	} else if (chars < trie->minlen) {
	1651	trie->minlen=chars;
	1652	} else if (chars > trie->maxlen) {
	1653	trie->maxlen=chars;
	1654	}
	1655
	1656	} /* end first pass */
	1657	DEBUG_TRIE_COMPILE_r(
	1658	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1659	(int)depth * 2 + 2,"",
	1660	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1661	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1662	(int)trie->minlen, (int)trie->maxlen )
	1663	);
	1664
	1665	/*
	1666	We now know what we are dealing with in terms of unique chars and
	1667	string sizes so we can calculate how much memory a naive
	1668	representation using a flat table will take. If it's over a reasonable
	1669	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1670	conservative but potentially much slower representation using an array
	1671	of lists.
	1672
	1673	At the end we convert both representations into the same compressed
	1674	form that will be used in regexec.c for matching with. The latter
	1675	is a form that cannot be used to construct with but has memory
	1676	properties similar to the list form and access properties similar
	1677	to the table form making it both suitable for fast searches and
	1678	small enough that its feasable to store for the duration of a program.
	1679
	1680	See the comment in the code where the compressed table is produced
	1681	inplace from the flat tabe representation for an explanation of how
	1682	the compression works.
	1683
	1684	*/
	1685
	1686
	1687	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1688	prev_states[1] = 0;
	1689
	1690	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1691	/*
	1692	Second Pass -- Array Of Lists Representation
	1693
	1694	Each state will be represented by a list of charid:state records
	1695	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1696	points of the allocated array. (See defines above).
	1697
	1698	We build the initial structure using the lists, and then convert
	1699	it into the compressed table form which allows faster lookups
	1700	(but cant be modified once converted).
	1701	*/
	1702
	1703	STRLEN transcount = 1;
	1704
	1705	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1706	"%*sCompiling trie using list compiler\n",
	1707	(int)depth * 2 + 2, ""));
	1708
	1709	trie->states = (reg_trie_state *)
	1710	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1711	sizeof(reg_trie_state) );
	1712	TRIE_LIST_NEW(1);
	1713	next_alloc = 2;
	1714
	1715	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1716
	1717	regnode * const noper = NEXTOPER( cur );
	1718	U8 uc = (U8)STRING( noper );
	1719	const U8 * const e = uc + STR_LEN( noper );
	1720	U32 state = 1; /* required init */
	1721	U16 charid = 0; /* sanity init */
	1722	U8 scan = (U8)NULL; /* sanity init */
	1723	STRLEN foldlen = 0; /* required init */
	1724	U32 wordlen = 0; /* required init */
	1725	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1726
	1727	if (OP(noper) != NOTHING) {
	1728	for ( ; uc < e ; uc += len ) {
	1729
	1730	TRIE_READ_CHAR;
	1731
	1732	if ( uvc < 256 ) {
	1733	charid = trie->charmap[ uvc ];
	1734	} else {
	1735	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1736	if ( !svpp ) {
	1737	charid = 0;
	1738	} else {
	1739	charid=(U16)SvIV( *svpp );
	1740	}
	1741	}
	1742	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1743	if ( charid ) {
	1744
	1745	U16 check;
	1746	U32 newstate = 0;
	1747
	1748	charid--;
	1749	if ( !trie->states[ state ].trans.list ) {
	1750	TRIE_LIST_NEW( state );
	1751	}
	1752	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1753	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1754	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1755	break;
	1756	}
	1757	}
	1758	if ( ! newstate ) {
	1759	newstate = next_alloc++;
	1760	prev_states[newstate] = state;
	1761	TRIE_LIST_PUSH( state, charid, newstate );
	1762	transcount++;
	1763	}
	1764	state = newstate;
	1765	} else {
	1766	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1767	}
	1768	}
	1769	}
	1770	TRIE_HANDLE_WORD(state);
	1771
	1772	} /* end second pass */
	1773
	1774	/* next alloc is the NEXT state to be allocated */
	1775	trie->statecount = next_alloc;
	1776	trie->states = (reg_trie_state *)
	1777	PerlMemShared_realloc( trie->states,
	1778	next_alloc
	1779	* sizeof(reg_trie_state) );
	1780
	1781	/* and now dump it out before we compress it */
	1782	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1783	revcharmap, next_alloc,
	1784	depth+1)
	1785	);
	1786
	1787	trie->trans = (reg_trie_trans *)
	1788	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1789	{
	1790	U32 state;
	1791	U32 tp = 0;
	1792	U32 zp = 0;
	1793
	1794
	1795	for( state=1 ; state < next_alloc ; state ++ ) {
	1796	U32 base=0;
	1797
	1798	/*
	1799	DEBUG_TRIE_COMPILE_MORE_r(
	1800	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1801	);
	1802	*/
	1803
	1804	if (trie->states[state].trans.list) {
	1805	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1806	U16 maxid=minid;
	1807	U16 idx;
	1808
	1809	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1810	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1811	if ( forid < minid ) {
	1812	minid=forid;
	1813	} else if ( forid > maxid ) {
	1814	maxid=forid;
	1815	}
	1816	}
	1817	if ( transcount < tp + maxid - minid + 1) {
	1818	transcount *= 2;
	1819	trie->trans = (reg_trie_trans *)
	1820	PerlMemShared_realloc( trie->trans,
	1821	transcount
	1822	* sizeof(reg_trie_trans) );
	1823	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1824	}
	1825	base = trie->uniquecharcount + tp - minid;
	1826	if ( maxid == minid ) {
	1827	U32 set = 0;
	1828	for ( ; zp < tp ; zp++ ) {
	1829	if ( ! trie->trans[ zp ].next ) {
	1830	base = trie->uniquecharcount + zp - minid;
	1831	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1832	trie->trans[ zp ].check = state;
	1833	set = 1;
	1834	break;
	1835	}
	1836	}
	1837	if ( !set ) {
	1838	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1839	trie->trans[ tp ].check = state;
	1840	tp++;
	1841	zp = tp;
	1842	}
	1843	} else {
	1844	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1845	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1846	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1847	trie->trans[ tid ].check = state;
	1848	}
	1849	tp += ( maxid - minid + 1 );
	1850	}
	1851	Safefree(trie->states[ state ].trans.list);
	1852	}
	1853	/*
	1854	DEBUG_TRIE_COMPILE_MORE_r(
	1855	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1856	);
	1857	*/
	1858	trie->states[ state ].trans.base=base;
	1859	}
	1860	trie->lasttrans = tp + 1;
	1861	}
	1862	} else {
	1863	/*
	1864	Second Pass -- Flat Table Representation.
	1865
	1866	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1867	We know that we will need Charcount+1 trans at most to store the data
	1868	(one row per char at worst case) So we preallocate both structures
	1869	assuming worst case.
	1870
	1871	We then construct the trie using only the .next slots of the entry
	1872	structs.
	1873
	1874	We use the .check field of the first entry of the node temporarily to
	1875	make compression both faster and easier by keeping track of how many non
	1876	zero fields are in the node.
	1877
	1878	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1879	transition.
	1880
	1881	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1882	number representing the first entry of the node, and state as a
	1883	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1884	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1885	are 2 entrys per node. eg:
	1886
	1887	A B A B
	1888	1. 2 4 1. 3 7
	1889	2. 0 3 3. 0 5
	1890	3. 0 0 5. 0 0
	1891	4. 0 0 7. 0 0
	1892
	1893	The table is internally in the right hand, idx form. However as we also
	1894	have to deal with the states array which is indexed by nodenum we have to
	1895	use TRIE_NODENUM() to convert.
	1896
	1897	*/
	1898	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1899	"%*sCompiling trie using table compiler\n",
	1900	(int)depth * 2 + 2, ""));
	1901
	1902	trie->trans = (reg_trie_trans *)
	1903	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1904	* trie->uniquecharcount + 1,
	1905	sizeof(reg_trie_trans) );
	1906	trie->states = (reg_trie_state *)
	1907	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1908	sizeof(reg_trie_state) );
	1909	next_alloc = trie->uniquecharcount + 1;
	1910
	1911
	1912	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1913
	1914	regnode * const noper = NEXTOPER( cur );
	1915	const U8 uc = (U8)STRING( noper );
	1916	const U8 * const e = uc + STR_LEN( noper );
	1917
	1918	U32 state = 1; /* required init */
	1919
	1920	U16 charid = 0; /* sanity init */
	1921	U32 accept_state = 0; /* sanity init */
	1922	U8 scan = (U8)NULL; /* sanity init */
	1923
	1924	STRLEN foldlen = 0; /* required init */
	1925	U32 wordlen = 0; /* required init */
	1926	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1927
	1928	if ( OP(noper) != NOTHING ) {
	1929	for ( ; uc < e ; uc += len ) {
	1930
	1931	TRIE_READ_CHAR;
	1932
	1933	if ( uvc < 256 ) {
	1934	charid = trie->charmap[ uvc ];
	1935	} else {
	1936	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1937	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1938	}
	1939	if ( charid ) {
	1940	charid--;
	1941	if ( !trie->trans[ state + charid ].next ) {
	1942	trie->trans[ state + charid ].next = next_alloc;
	1943	trie->trans[ state ].check++;
	1944	prev_states[TRIE_NODENUM(next_alloc)]
	1945	= TRIE_NODENUM(state);
	1946	next_alloc += trie->uniquecharcount;
	1947	}
	1948	state = trie->trans[ state + charid ].next;
	1949	} else {
	1950	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1951	}
	1952	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1953	}
	1954	}
	1955	accept_state = TRIE_NODENUM( state );
	1956	TRIE_HANDLE_WORD(accept_state);
	1957
	1958	} /* end second pass */
	1959
	1960	/* and now dump it out before we compress it */
	1961	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1962	revcharmap,
	1963	next_alloc, depth+1));
	1964
	1965	{
	1966	/*
	1967	* Inplace compress the table.*
	1968
	1969	For sparse data sets the table constructed by the trie algorithm will
	1970	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1971	(Note that leaf nodes will not contain any transitions.)
	1972
	1973	This algorithm compresses the tables by eliminating most such
	1974	transitions, at the cost of a modest bit of extra work during lookup:
	1975
	1976	- Each states[] entry contains a .base field which indicates the
	1977	index in the state[] array wheres its transition data is stored.
	1978
	1979	- If .base is 0 there are no valid transitions from that node.
	1980
	1981	- If .base is nonzero then charid is added to it to find an entry in
	1982	the trans array.
	1983
	1984	-If trans[states[state].base+charid].check!=state then the
	1985	transition is taken to be a 0/Fail transition. Thus if there are fail
	1986	transitions at the front of the node then the .base offset will point
	1987	somewhere inside the previous nodes data (or maybe even into a node
	1988	even earlier), but the .check field determines if the transition is
	1989	valid.
	1990
	1991	XXX - wrong maybe?
	1992	The following process inplace converts the table to the compressed
	1993	table: We first do not compress the root node 1,and mark all its
	1994	.check pointers as 1 and set its .base pointer as 1 as well. This
	1995	allows us to do a DFA construction from the compressed table later,
	1996	and ensures that any .base pointers we calculate later are greater
	1997	than 0.
	1998
	1999	- We set 'pos' to indicate the first entry of the second node.
	2000
	2001	- We then iterate over the columns of the node, finding the first and
	2002	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2003	and set the .check pointers accordingly, and advance pos
	2004	appropriately and repeat for the next node. Note that when we copy
	2005	the next pointers we have to convert them from the original
	2006	NODEIDX form to NODENUM form as the former is not valid post
	2007	compression.
	2008
	2009	- If a node has no transitions used we mark its base as 0 and do not
	2010	advance the pos pointer.
	2011
	2012	- If a node only has one transition we use a second pointer into the
	2013	structure to fill in allocated fail transitions from other states.
	2014	This pointer is independent of the main pointer and scans forward
	2015	looking for null transitions that are allocated to a state. When it
	2016	finds one it writes the single transition into the "hole". If the
	2017	pointer doesnt find one the single transition is appended as normal.
	2018
	2019	- Once compressed we can Renew/realloc the structures to release the
	2020	excess space.
	2021
	2022	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2023	specifically Fig 3.47 and the associated pseudocode.
	2024
	2025	demq
	2026	*/
	2027	const U32 laststate = TRIE_NODENUM( next_alloc );
	2028	U32 state, charid;
	2029	U32 pos = 0, zp=0;
	2030	trie->statecount = laststate;
	2031
	2032	for ( state = 1 ; state < laststate ; state++ ) {
	2033	U8 flag = 0;
	2034	const U32 stateidx = TRIE_NODEIDX( state );
	2035	const U32 o_used = trie->trans[ stateidx ].check;
	2036	U32 used = trie->trans[ stateidx ].check;
	2037	trie->trans[ stateidx ].check = 0;
	2038
	2039	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2040	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2041	if ( trie->trans[ stateidx + charid ].next ) {
	2042	if (o_used == 1) {
	2043	for ( ; zp < pos ; zp++ ) {
	2044	if ( ! trie->trans[ zp ].next ) {
	2045	break;
	2046	}
	2047	}
	2048	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2049	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2050	trie->trans[ zp ].check = state;
	2051	if ( ++zp > pos ) pos = zp;
	2052	break;
	2053	}
	2054	used--;
	2055	}
	2056	if ( !flag ) {
	2057	flag = 1;
	2058	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2059	}
	2060	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2061	trie->trans[ pos ].check = state;
	2062	pos++;
	2063	}
	2064	}
	2065	}
	2066	trie->lasttrans = pos + 1;
	2067	trie->states = (reg_trie_state *)
	2068	PerlMemShared_realloc( trie->states, laststate
	2069	* sizeof(reg_trie_state) );
	2070	DEBUG_TRIE_COMPILE_MORE_r(
	2071	PerlIO_printf( Perl_debug_log,
	2072	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2073	(int)depth * 2 + 2,"",
	2074	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2075	(IV)next_alloc,
	2076	(IV)pos,
	2077	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2078	);
	2079
	2080	} /* end table compress */
	2081	}
	2082	DEBUG_TRIE_COMPILE_MORE_r(
	2083	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2084	(int)depth * 2 + 2, "",
	2085	(UV)trie->statecount,
	2086	(UV)trie->lasttrans)
	2087	);
	2088	/* resize the trans array to remove unused space */
	2089	trie->trans = (reg_trie_trans *)
	2090	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2091	* sizeof(reg_trie_trans) );
	2092
	2093	{ /* Modify the program and insert the new TRIE node */
	2094	U8 nodetype =(U8)(flags & 0xFF);
	2095	char *str=NULL;
	2096
	2097	#ifdef DEBUGGING
	2098	regnode *optimize = NULL;
	2099	#ifdef RE_TRACK_PATTERN_OFFSETS
	2100
	2101	U32 mjd_offset = 0;
	2102	U32 mjd_nodelen = 0;
	2103	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2104	#endif /* DEBUGGING */
	2105	/*
	2106	This means we convert either the first branch or the first Exact,
	2107	depending on whether the thing following (in 'last') is a branch
	2108	or not and whether first is the startbranch (ie is it a sub part of
	2109	the alternation or is it the whole thing.)
	2110	Assuming its a sub part we convert the EXACT otherwise we convert
	2111	the whole branch sequence, including the first.
	2112	*/
	2113	/* Find the node we are going to overwrite */
	2114	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2115	/* branch sub-chain */
	2116	NEXT_OFF( first ) = (U16)(last - first);
	2117	#ifdef RE_TRACK_PATTERN_OFFSETS
	2118	DEBUG_r({
	2119	mjd_offset= Node_Offset((convert));
	2120	mjd_nodelen= Node_Length((convert));
	2121	});
	2122	#endif
	2123	/* whole branch chain */
	2124	}
	2125	#ifdef RE_TRACK_PATTERN_OFFSETS
	2126	else {
	2127	DEBUG_r({
	2128	const regnode *nop = NEXTOPER( convert );
	2129	mjd_offset= Node_Offset((nop));
	2130	mjd_nodelen= Node_Length((nop));
	2131	});
	2132	}
	2133	DEBUG_OPTIMISE_r(
	2134	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2135	(int)depth * 2 + 2, "",
	2136	(UV)mjd_offset, (UV)mjd_nodelen)
	2137	);
	2138	#endif
	2139	/* But first we check to see if there is a common prefix we can
	2140	split out as an EXACT and put in front of the TRIE node. */
	2141	trie->startstate= 1;
	2142	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2143	U32 state;
	2144	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2145	U32 ofs = 0;
	2146	I32 idx = -1;
	2147	U32 count = 0;
	2148	const U32 base = trie->states[ state ].trans.base;
	2149
	2150	if ( trie->states[state].wordnum )
	2151	count = 1;
	2152
	2153	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2154	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2155	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2156	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2157	{
	2158	if ( ++count > 1 ) {
	2159	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2160	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2161	if ( state == 1 ) break;
	2162	if ( count == 2 ) {
	2163	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2164	DEBUG_OPTIMISE_r(
	2165	PerlIO_printf(Perl_debug_log,
	2166	"%*sNew Start State=%"UVuf" Class: [",
	2167	(int)depth * 2 + 2, "",
	2168	(UV)state));
	2169	if (idx >= 0) {
	2170	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2171	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2172
	2173	TRIE_BITMAP_SET(trie,*ch);
	2174	if ( folder )
	2175	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2176	DEBUG_OPTIMISE_r(
	2177	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2178	);
	2179	}
	2180	}
	2181	TRIE_BITMAP_SET(trie,*ch);
	2182	if ( folder )
	2183	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2184	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2185	}
	2186	idx = ofs;
	2187	}
	2188	}
	2189	if ( count == 1 ) {
	2190	SV **tmp = av_fetch( revcharmap, idx, 0);
	2191	STRLEN len;
	2192	char ch = SvPV( tmp, len );
	2193	DEBUG_OPTIMISE_r({
	2194	SV *sv=sv_newmortal();
	2195	PerlIO_printf( Perl_debug_log,
	2196	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2197	(int)depth * 2 + 2, "",
	2198	(UV)state, (UV)idx,
	2199	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2200	PL_colors[0], PL_colors[1],
	2201	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2202	PERL_PV_ESCAPE_FIRSTCHAR
	2203	)
	2204	);
	2205	});
	2206	if ( state==1 ) {
	2207	OP( convert ) = nodetype;
	2208	str=STRING(convert);
	2209	STR_LEN(convert)=0;
	2210	}
	2211	STR_LEN(convert) += len;
	2212	while (len--)
	2213	str++ = ch++;
	2214	} else {
	2215	#ifdef DEBUGGING
	2216	if (state>1)
	2217	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2218	#endif
	2219	break;
	2220	}
	2221	}
	2222	trie->prefixlen = (state-1);
	2223	if (str) {
	2224	regnode *n = convert+NODE_SZ_STR(convert);
	2225	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2226	trie->startstate = state;
	2227	trie->minlen -= (state - 1);
	2228	trie->maxlen -= (state - 1);
	2229	#ifdef DEBUGGING
	2230	/* At least the UNICOS C compiler choked on this
	2231	* being argument to DEBUG_r(), so let's just have
	2232	* it right here. */
	2233	if (
	2234	#ifdef PERL_EXT_RE_BUILD
	2235	1
	2236	#else
	2237	DEBUG_r_TEST
	2238	#endif
	2239	) {
	2240	regnode *fix = convert;
	2241	U32 word = trie->wordcount;
	2242	mjd_nodelen++;
	2243	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2244	while( ++fix < n ) {
	2245	Set_Node_Offset_Length(fix, 0, 0);
	2246	}
	2247	while (word--) {
	2248	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2249	if (tmp) {
	2250	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2251	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2252	else
	2253	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2254	}
	2255	}
	2256	}
	2257	#endif
	2258	if (trie->maxlen) {
	2259	convert = n;
	2260	} else {
	2261	NEXT_OFF(convert) = (U16)(tail - convert);
	2262	DEBUG_r(optimize= n);
	2263	}
	2264	}
	2265	}
	2266	if (!jumper)
	2267	jumper = last;
	2268	if ( trie->maxlen ) {
	2269	NEXT_OFF( convert ) = (U16)(tail - convert);
	2270	ARG_SET( convert, data_slot );
	2271	/* Store the offset to the first unabsorbed branch in
	2272	jump[0], which is otherwise unused by the jump logic.
	2273	We use this when dumping a trie and during optimisation. */
	2274	if (trie->jump)
	2275	trie->jump[0] = (U16)(nextbranch - convert);
	2276
	2277	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2278	* and there is a bitmap
	2279	* and the first "jump target" node we found leaves enough room
	2280	* then convert the TRIE node into a TRIEC node, with the bitmap
	2281	* embedded inline in the opcode - this is hypothetically faster.
	2282	*/
	2283	if ( !trie->states[trie->startstate].wordnum
	2284	&& trie->bitmap
	2285	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2286	{
	2287	OP( convert ) = TRIEC;
	2288	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2289	PerlMemShared_free(trie->bitmap);
	2290	trie->bitmap= NULL;
	2291	} else
	2292	OP( convert ) = TRIE;
	2293
	2294	/* store the type in the flags */
	2295	convert->flags = nodetype;
	2296	DEBUG_r({
	2297	optimize = convert
	2298	+ NODE_STEP_REGNODE
	2299	+ regarglen[ OP( convert ) ];
	2300	});
	2301	/* XXX We really should free up the resource in trie now,
	2302	as we won't use them - (which resources?) dmq */
	2303	}
	2304	/* needed for dumping*/
	2305	DEBUG_r(if (optimize) {
	2306	regnode *opt = convert;
	2307
	2308	while ( ++opt < optimize) {
	2309	Set_Node_Offset_Length(opt,0,0);
	2310	}
	2311	/*
	2312	Try to clean up some of the debris left after the
	2313	optimisation.
	2314	*/
	2315	while( optimize < jumper ) {
	2316	mjd_nodelen += Node_Length((optimize));
	2317	OP( optimize ) = OPTIMIZED;
	2318	Set_Node_Offset_Length(optimize,0,0);
	2319	optimize++;
	2320	}
	2321	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2322	});
	2323	} /* end node insert */
	2324
	2325	/* Finish populating the prev field of the wordinfo array. Walk back
	2326	* from each accept state until we find another accept state, and if
	2327	* so, point the first word's .prev field at the second word. If the
	2328	* second already has a .prev field set, stop now. This will be the
	2329	* case either if we've already processed that word's accept state,
	2330	* or that state had multiple words, and the overspill words were
	2331	* already linked up earlier.
	2332	*/
	2333	{
	2334	U16 word;
	2335	U32 state;
	2336	U16 prev;
	2337
	2338	for (word=1; word <= trie->wordcount; word++) {
	2339	prev = 0;
	2340	if (trie->wordinfo[word].prev)
	2341	continue;
	2342	state = trie->wordinfo[word].accept;
	2343	while (state) {
	2344	state = prev_states[state];
	2345	if (!state)
	2346	break;
	2347	prev = trie->states[state].wordnum;
	2348	if (prev)
	2349	break;
	2350	}
	2351	trie->wordinfo[word].prev = prev;
	2352	}
	2353	Safefree(prev_states);
	2354	}
	2355
	2356
	2357	/* and now dump out the compressed format */
	2358	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2359
	2360	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2361	#ifdef DEBUGGING
	2362	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2363	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2364	#else
	2365	SvREFCNT_dec(revcharmap);
	2366	#endif
	2367	return trie->jump
	2368	? MADE_JUMP_TRIE
	2369	: trie->startstate>1
	2370	? MADE_EXACT_TRIE
	2371	: MADE_TRIE;
	2372	}
	2373
	2374	STATIC void
	2375	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2376	{
	2377	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2378
	2379	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2380	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2381	ISBN 0-201-10088-6
	2382
	2383	We find the fail state for each state in the trie, this state is the longest proper
	2384	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2385	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2386	the DFA not to have to restart after its tried and failed a word at a given point, it
	2387	simply continues as though it had been matching the other word in the first place.
	2388	Consider
	2389	'abcdgu'=~/abcdefg\|cdgu/
	2390	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2391	fail, which would bring us to the state representing 'd' in the second word where we would
	2392	try 'g' and succeed, proceeding to match 'cdgu'.
	2393	*/
	2394	/* add a fail transition */
	2395	const U32 trie_offset = ARG(source);
	2396	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2397	U32 *q;
	2398	const U32 ucharcount = trie->uniquecharcount;
	2399	const U32 numstates = trie->statecount;
	2400	const U32 ubound = trie->lasttrans + ucharcount;
	2401	U32 q_read = 0;
	2402	U32 q_write = 0;
	2403	U32 charid;
	2404	U32 base = trie->states[ 1 ].trans.base;
	2405	U32 *fail;
	2406	reg_ac_data *aho;
	2407	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2408	GET_RE_DEBUG_FLAGS_DECL;
	2409
	2410	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2411	#ifndef DEBUGGING
	2412	PERL_UNUSED_ARG(depth);
	2413	#endif
	2414
	2415
	2416	ARG_SET( stclass, data_slot );
	2417	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2418	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2419	aho->trie=trie_offset;
	2420	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2421	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2422	Newxz( q, numstates, U32);
	2423	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2424	aho->refcount = 1;
	2425	fail = aho->fail;
	2426	/* initialize fail[0..1] to be 1 so that we always have
	2427	a valid final fail state */
	2428	fail[ 0 ] = fail[ 1 ] = 1;
	2429
	2430	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2431	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2432	if ( newstate ) {
	2433	q[ q_write ] = newstate;
	2434	/* set to point at the root */
	2435	fail[ q[ q_write++ ] ]=1;
	2436	}
	2437	}
	2438	while ( q_read < q_write) {
	2439	const U32 cur = q[ q_read++ % numstates ];
	2440	base = trie->states[ cur ].trans.base;
	2441
	2442	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2443	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2444	if (ch_state) {
	2445	U32 fail_state = cur;
	2446	U32 fail_base;
	2447	do {
	2448	fail_state = fail[ fail_state ];
	2449	fail_base = aho->states[ fail_state ].trans.base;
	2450	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2451
	2452	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2453	fail[ ch_state ] = fail_state;
	2454	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2455	{
	2456	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2457	}
	2458	q[ q_write++ % numstates] = ch_state;
	2459	}
	2460	}
	2461	}
	2462	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2463	when we fail in state 1, this allows us to use the
	2464	charclass scan to find a valid start char. This is based on the principle
	2465	that theres a good chance the string being searched contains lots of stuff
	2466	that cant be a start char.
	2467	*/
	2468	fail[ 0 ] = fail[ 1 ] = 0;
	2469	DEBUG_TRIE_COMPILE_r({
	2470	PerlIO_printf(Perl_debug_log,
	2471	"%*sStclass Failtable (%"UVuf" states): 0",
	2472	(int)(depth * 2), "", (UV)numstates
	2473	);
	2474	for( q_read=1; q_read<numstates; q_read++ ) {
	2475	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2476	}
	2477	PerlIO_printf(Perl_debug_log, "\n");
	2478	});
	2479	Safefree(q);
	2480	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2481	}
	2482
	2483
	/*
	 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	 * These need to be revisited when a newer toolchain becomes available.
	 *
	 * SPARC64_GCC_WORKAROUND is (re)defined to 1 only when building for
	 * sparc64 with a GNU compiler older than 2.96; otherwise it is left
	 * untouched.
	 */
	#if defined(__sparc64__) && defined(__GNUC__)
	#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	#       undef  SPARC64_GCC_WORKAROUND
	#       define SPARC64_GCC_WORKAROUND 1
	#   endif
	#endif
	2494
	/*
	 * DEBUG_PEEP(str, scan, depth)
	 *
	 * Debugging aid, active only inside DEBUG_OPTIMISE_r().  When scan is
	 * non-NULL it prints one line to Perl_debug_log: depth*2 columns of
	 * indentation, the label str, the node number of scan, the regprop()
	 * description of scan, and the node number of regnext(scan) (0 if there
	 * is no next node).
	 *
	 * str must be a string literal (it is pasted into the format string).
	 * scan is evaluated several times, depth is not parenthesised in the
	 * expansion, and the expansion already ends in ';'.
	 */
	#define DEBUG_PEEP(str,scan,depth) \
	    DEBUG_OPTIMISE_r({if (scan){ \
	       SV * const mysv=sv_newmortal(); \
	       regnode *Next = regnext(scan); \
	       regprop(RExC_rx, mysv, scan); \
	       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	       (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	           Next ? (REG_NODE_NUM(Next)) : 0 ); \
	   }});
	2504
	2505
	2506
	2507
	2508
	/*
	 * JOIN_EXACT(scan, min, flags)
	 *
	 * Calls join_exact() on scan when its opcode is of the EXACT kind.
	 * Relies on pRExC_state and depth being in scope at the expansion site;
	 * the val argument is always passed as NULL and the call is made with
	 * depth+1.
	 *
	 * The expansion is a bare `if` without braces or a trailing ';', so it
	 * must be used as a complete statement and not directly before an `else`.
	 */
	#define JOIN_EXACT(scan,min,flags) \
	    if (PL_regkind[OP(scan)] == EXACT) \
	        join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
	2512
	2513	STATIC U32
	2514	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, I32 min, U32 flags,regnode val, U32 depth) {
	2515	/* Merge several consecutive EXACTish nodes into one. */
	2516	regnode *n = regnext(scan);
	2517	U32 stringok = 1;
	2518	regnode *next = scan + NODE_SZ_STR(scan);
	2519	U32 merged = 0;
	2520	U32 stopnow = 0;
	2521	#ifdef DEBUGGING
	2522	regnode *stop = scan;
	2523	GET_RE_DEBUG_FLAGS_DECL;
	2524	#else
	2525	PERL_UNUSED_ARG(depth);
	2526	#endif
	2527
	2528	PERL_ARGS_ASSERT_JOIN_EXACT;
	2529	#ifndef EXPERIMENTAL_INPLACESCAN
	2530	PERL_UNUSED_ARG(flags);
	2531	PERL_UNUSED_ARG(val);
	2532	#endif
	2533	DEBUG_PEEP("join",scan,depth);
	2534
	2535	/* Skip NOTHING, merge EXACT. /
	2536	while (n &&
	2537	( PL_regkind[OP(n)] == NOTHING \|\|
	2538	(stringok && (OP(n) == OP(scan))))
	2539	&& NEXT_OFF(n)
	2540	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
	2541
	2542	if (OP(n) == TAIL \|\| n > next)
	2543	stringok = 0;
	2544	if (PL_regkind[OP(n)] == NOTHING) {
	2545	DEBUG_PEEP("skip:",n,depth);
	2546	NEXT_OFF(scan) += NEXT_OFF(n);
	2547	next = n + NODE_STEP_REGNODE;
	2548	#ifdef DEBUGGING
	2549	if (stringok)
	2550	stop = n;
	2551	#endif
	2552	n = regnext(n);
	2553	}
	2554	else if (stringok) {
	2555	const unsigned int oldl = STR_LEN(scan);
	2556	regnode * const nnext = regnext(n);
	2557
	2558	DEBUG_PEEP("merg",n,depth);
	2559
	2560	merged++;
	2561	if (oldl + STR_LEN(n) > U8_MAX)
	2562	break;
	2563	NEXT_OFF(scan) += NEXT_OFF(n);
	2564	STR_LEN(scan) += STR_LEN(n);
	2565	next = n + NODE_SZ_STR(n);
	2566	/* Now we can overwrite n : /
	2567	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2568	#ifdef DEBUGGING
	2569	stop = next - 1;
	2570	#endif
	2571	n = nnext;
	2572	if (stopnow) break;
	2573	}
	2574
	2575	#ifdef EXPERIMENTAL_INPLACESCAN
	2576	if (flags && !NEXT_OFF(n)) {
	2577	DEBUG_PEEP("atch", val, depth);
	2578	if (reg_off_by_arg[OP(n)]) {
	2579	ARG_SET(n, val - n);
	2580	}
	2581	else {
	2582	NEXT_OFF(n) = val - n;
	2583	}
	2584	stopnow = 1;
	2585	}
	2586	#endif
	2587	}
	2588	#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
	2589	#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
	2590	#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
	2591	#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
	2592
	2593	if (UTF
	2594	&& ( OP(scan) == EXACTF \|\| OP(scan) == EXACTFU \|\| OP(scan) == EXACTFA)
	2595	&& ( STR_LEN(scan) >= 6 ) )
	2596	{
	2597	/*
	2598	Two problematic code points in Unicode casefolding of EXACT nodes:
	2599
	2600	U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2601	U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2602
	2603	which casefold to
	2604
	2605	Unicode UTF-8
	2606
	2607	U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2608	U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2609
	2610	This means that in case-insensitive matching (or "loose matching",
	2611	as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
	2612	length of the above casefolded versions) can match a target string
	2613	of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
	2614	This would rather mess up the minimum length computation.
	2615
	2616	What we'll do is to look for the tail four bytes, and then peek
	2617	at the preceding two bytes to see whether we need to decrease
	2618	the minimum length by four (six minus two).
	2619
	2620	Thanks to the design of UTF-8, there cannot be false matches:
	2621	A sequence of valid UTF-8 bytes cannot be a subsequence of
	2622	another valid sequence of UTF-8 bytes.
	2623
	2624	*/
	2625	char * const s0 = STRING(scan), s, t;
	2626	char * const s1 = s0 + STR_LEN(scan) - 1;
	2627	char * const s2 = s1 - 4;
	2628	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2629	const char t0[] = "\xaf\x49\xaf\x42";
	2630	#else
	2631	const char t0[] = "\xcc\x88\xcc\x81";
	2632	#endif
	2633	const char * const t1 = t0 + 3;
	2634
	2635	for (s = s0 + 2;
	2636	s < s2 && (t = ninstr(s, s1, t0, t1));
	2637	s = t + 4) {
	2638	#ifdef EBCDIC
	2639	if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) \|\|
	2640	((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
	2641	#else
	2642	if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) \|\|
	2643	((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
	2644	#endif
	2645	*min -= 4;
	2646	}
	2647	}
	2648
	2649	#ifdef DEBUGGING
	2650	/* Allow dumping but overwriting the collection of skipped
	2651	* ops and/or strings with fake optimized ops */
	2652	n = scan + NODE_SZ_STR(scan);
	2653	while (n <= stop) {
	2654	OP(n) = OPTIMIZED;
	2655	FLAGS(n) = 0;
	2656	NEXT_OFF(n) = 0;
	2657	n++;
	2658	}
	2659	#endif
	2660	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2661	return stopnow;
	2662	}
	2663
	2664	/* REx optimizer. Converts nodes into quicker variants "in place".
	2665	Finds fixed substrings. */
	2666
	2667	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2668	to the position after last scanned or to NULL. */
	2669
	/* Allocate the scratch character class used when study_chunk switches its
	 * start-class accumulation from AND mode to OR mode (the callers copy the
	 * old data->start_class into it right after this macro).
	 *
	 * Relies on a variable named `and_withp` in the enclosing scope and asserts
	 * that it has not been allocated yet.  The buffer is handed to SAVEFREEPV,
	 * so presumably it is released by the savestack rather than by the caller
	 * -- confirm against the Perl API documentation.
	 *
	 * Wrapped in do { } while (0) so the three statements behave as a single
	 * statement even under an unbraced if/else; invoke as `INIT_AND_WITHP;`. */
	#define INIT_AND_WITHP \
	    do { \
	        assert(!and_withp); \
	        Newx(and_withp, 1, struct regnode_charclass_class); \
	        SAVEFREEPV(and_withp); \
	    } while (0)
	2674
	2675	/* this is a chain of data about sub patterns we are processing that
	2676	need to be handled separately/specially in study_chunk. Its so
	2677	we can simulate recursion without losing state. */
	2678	struct scan_frame;
	2679	typedef struct scan_frame {
	2680	regnode last; / last node to process in this frame */
	2681	regnode next; / next node to process when last is reached */
	2682	struct scan_frame prev; /previous frame*/
	2683	I32 stop; /* what stopparen do we use */
	2684	} scan_frame;
	2685
	2686
	/* Shorthand for scan_commit() that supplies the fourth argument itself:
	 * it picks up the variable `is_inf` from whatever scope the macro is
	 * expanded in, so it can only be used where such a variable is visible. */
	#define SCAN_COMMIT(state, sdata, minlen_p) \
	    scan_commit((state), (sdata), (minlen_p), is_inf)
	2688
	/* Emits the two switch cases `nAmE` and `N<nAmE>` for a class whose
	 * membership test is the macro/function is_<nAmE>_cp().
	 *
	 * Each case walks the code points 0..255 and edits the bitmap of
	 * data->start_class:
	 *   - with SCF_DO_STCLASS_AND set in `flags`, bits that cannot match are
	 *     cleared (non-members for nAmE, members for N<nAmE>);
	 *   - otherwise, bits that can match are set (members for nAmE,
	 *     non-members for N<nAmE>).
	 *
	 * Uses `value`, `flags` and `data` from the expansion site; `value` is
	 * left at 256 afterwards.  The expansion ends in a bare `break`, so write
	 * `CASE_SYNST_FNC(X);` inside the switch. */
	#define CASE_SYNST_FNC(nAmE) \
	case nAmE: \
	    for (value = 0; value < 256; value++) { \
	        if (flags & SCF_DO_STCLASS_AND) { \
	            if (!is_ ## nAmE ## _cp(value)) { \
	                ANYOF_BITMAP_CLEAR(data->start_class, value); \
	            } \
	        } \
	        else if (is_ ## nAmE ## _cp(value)) { \
	            ANYOF_BITMAP_SET(data->start_class, value); \
	        } \
	    } \
	    break; \
	case N ## nAmE: \
	    for (value = 0; value < 256; value++) { \
	        if (flags & SCF_DO_STCLASS_AND) { \
	            if (is_ ## nAmE ## _cp(value)) { \
	                ANYOF_BITMAP_CLEAR(data->start_class, value); \
	            } \
	        } \
	        else if (!is_ ## nAmE ## _cp(value)) { \
	            ANYOF_BITMAP_SET(data->start_class, value); \
	        } \
	    } \
	    break
	2714
	2715
	2716
	2717	STATIC I32
	2718	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2719	I32 minlenp, I32 deltap,
	2720	regnode *last,
	2721	scan_data_t *data,
	2722	I32 stopparen,
	2723	U8* recursed,
	2724	struct regnode_charclass_class *and_withp,
	2725	U32 flags, U32 depth)
	2726	/* scanp: Start here (read-write). */
	2727	/* deltap: Write maxlen-minlen here. */
	2728	/* last: Stop before this one. */
	2729	/* data: string data about the pattern */
	2730	/* stopparen: treat close N as END */
	2731	/* recursed: which subroutines have we recursed into */
	2732	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2733	{
	2734	dVAR;
	2735	I32 min = 0, pars = 0, code;
	2736	regnode scan = scanp, *next;
	2737	I32 delta = 0;
	2738	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2739	int is_inf_internal = 0; /* The studied chunk is infinite */
	2740	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2741	scan_data_t data_fake;
	2742	SV *re_trie_maxbuff = NULL;
	2743	regnode *first_non_open = scan;
	2744	I32 stopmin = I32_MAX;
	2745	scan_frame *frame = NULL;
	2746	GET_RE_DEBUG_FLAGS_DECL;
	2747
	2748	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2749
	2750	#ifdef DEBUGGING
	2751	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2752	#endif
	2753
	2754	if ( depth == 0 ) {
	2755	while (first_non_open && OP(first_non_open) == OPEN)
	2756	first_non_open=regnext(first_non_open);
	2757	}
	2758
	2759
	2760	fake_study_recurse:
	2761	while ( scan && OP(scan) != END && scan < last ){
	2762	/* Peephole optimizer: */
	2763	DEBUG_STUDYDATA("Peep:", data,depth);
	2764	DEBUG_PEEP("Peep",scan,depth);
	2765	JOIN_EXACT(scan,&min,0);
	2766
	2767	/* Follow the next-chain of the current node and optimize
	2768	away all the NOTHINGs from it. */
	2769	if (OP(scan) != CURLYX) {
	2770	const int max = (reg_off_by_arg[OP(scan)]
	2771	? I32_MAX
	2772	/* I32 may be smaller than U16 on CRAYs! */
	2773	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	2774	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	2775	int noff;
	2776	regnode *n = scan;
	2777
	2778	/* Skip NOTHING and LONGJMP. */
	2779	while ((n = regnext(n))
	2780	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	2781	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	2782	&& off + noff < max)
	2783	off += noff;
	2784	if (reg_off_by_arg[OP(scan)])
	2785	ARG(scan) = off;
	2786	else
	2787	NEXT_OFF(scan) = off;
	2788	}
	2789
	2790
	2791
	2792	/* The principal pseudo-switch. Cannot be a switch, since we
	2793	look into several different things. */
	2794	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	2795	\|\| OP(scan) == IFTHEN) {
	2796	next = regnext(scan);
	2797	code = OP(scan);
	2798	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	2799
	2800	if (OP(next) == code \|\| code == IFTHEN) {
	2801	/* NOTE - There is similar code to this block below for handling
	2802	TRIE nodes on a re-study. If you change stuff here check there
	2803	too. */
	2804	I32 max1 = 0, min1 = I32_MAX, num = 0;
	2805	struct regnode_charclass_class accum;
	2806	regnode * const startbranch=scan;
	2807
	2808	if (flags & SCF_DO_SUBSTR)
	2809	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	2810	if (flags & SCF_DO_STCLASS)
	2811	cl_init_zero(pRExC_state, &accum);
	2812
	2813	while (OP(scan) == code) {
	2814	I32 deltanext, minnext, f = 0, fake;
	2815	struct regnode_charclass_class this_class;
	2816
	2817	num++;
	2818	data_fake.flags = 0;
	2819	if (data) {
	2820	data_fake.whilem_c = data->whilem_c;
	2821	data_fake.last_closep = data->last_closep;
	2822	}
	2823	else
	2824	data_fake.last_closep = &fake;
	2825
	2826	data_fake.pos_delta = delta;
	2827	next = regnext(scan);
	2828	scan = NEXTOPER(scan);
	2829	if (code != BRANCH)
	2830	scan = NEXTOPER(scan);
	2831	if (flags & SCF_DO_STCLASS) {
	2832	cl_init(pRExC_state, &this_class);
	2833	data_fake.start_class = &this_class;
	2834	f = SCF_DO_STCLASS_AND;
	2835	}
	2836	if (flags & SCF_WHILEM_VISITED_POS)
	2837	f \|= SCF_WHILEM_VISITED_POS;
	2838
	2839	/* we suppose the run is continuous, last=next...*/
	2840	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	2841	next, &data_fake,
	2842	stopparen, recursed, NULL, f,depth+1);
	2843	if (min1 > minnext)
	2844	min1 = minnext;
	2845	if (max1 < minnext + deltanext)
	2846	max1 = minnext + deltanext;
	2847	if (deltanext == I32_MAX)
	2848	is_inf = is_inf_internal = 1;
	2849	scan = next;
	2850	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	2851	pars++;
	2852	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	2853	if ( stopmin > minnext)
	2854	stopmin = min + min1;
	2855	flags &= ~SCF_DO_SUBSTR;
	2856	if (data)
	2857	data->flags \|= SCF_SEEN_ACCEPT;
	2858	}
	2859	if (data) {
	2860	if (data_fake.flags & SF_HAS_EVAL)
	2861	data->flags \|= SF_HAS_EVAL;
	2862	data->whilem_c = data_fake.whilem_c;
	2863	}
	2864	if (flags & SCF_DO_STCLASS)
	2865	cl_or(pRExC_state, &accum, &this_class);
	2866	}
	2867	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	2868	min1 = 0;
	2869	if (flags & SCF_DO_SUBSTR) {
	2870	data->pos_min += min1;
	2871	data->pos_delta += max1 - min1;
	2872	if (max1 != min1 \|\| is_inf)
	2873	data->longest = &(data->longest_float);
	2874	}
	2875	min += min1;
	2876	delta += max1 - min1;
	2877	if (flags & SCF_DO_STCLASS_OR) {
	2878	cl_or(pRExC_state, data->start_class, &accum);
	2879	if (min1) {
	2880	cl_and(data->start_class, and_withp);
	2881	flags &= ~SCF_DO_STCLASS;
	2882	}
	2883	}
	2884	else if (flags & SCF_DO_STCLASS_AND) {
	2885	if (min1) {
	2886	cl_and(data->start_class, &accum);
	2887	flags &= ~SCF_DO_STCLASS;
	2888	}
	2889	else {
	2890	/* Switch to OR mode: cache the old value of
	2891	* data->start_class */
	2892	INIT_AND_WITHP;
	2893	StructCopy(data->start_class, and_withp,
	2894	struct regnode_charclass_class);
	2895	flags &= ~SCF_DO_STCLASS_AND;
	2896	StructCopy(&accum, data->start_class,
	2897	struct regnode_charclass_class);
	2898	flags \|= SCF_DO_STCLASS_OR;
	2899	data->start_class->flags \|= ANYOF_EOS;
	2900	}
	2901	}
	2902
	2903	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	2904	/* demq.
	2905
	2906	Assuming this was/is a branch we are dealing with: 'scan' now
	2907	points at the item that follows the branch sequence, whatever
	2908	it is. We now start at the beginning of the sequence and look
	2909	for subsequences of
	2910
	2911	BRANCH->EXACT=>x1
	2912	BRANCH->EXACT=>x2
	2913	tail
	2914
	2915	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	2916
	2917	If we can find such a subsequence we need to turn the first
	2918	element into a trie and then add the subsequent branch exact
	2919	strings to the trie.
	2920
	2921	We have two cases
	2922
	2923	1. patterns where the whole set of branches can be converted.
	2924
	2925	2. patterns where only a subset can be converted.
	2926
	2927	In case 1 we can replace the whole set with a single regop
	2928	for the trie. In case 2 we need to keep the start and end
	2929	branches so
	2930
	2931	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	2932	becomes BRANCH TRIE; BRANCH X;
	2933
	2934	There is an additional case, that being where there is a
	2935	common prefix, which gets split out into an EXACT like node
	2936	preceding the TRIE node.
	2937
	2938	If x(1..n)==tail then we can do a simple trie, if not we make
	2939	a "jump" trie, such that when we match the appropriate word
	2940	we "jump" to the appropriate tail node. Essentially we turn
	2941	a nested if into a case structure of sorts.
	2942
	2943	*/
	2944
	2945	int made=0;
	2946	if (!re_trie_maxbuff) {
	2947	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	2948	if (!SvIOK(re_trie_maxbuff))
	2949	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	2950	}
	2951	if ( SvIV(re_trie_maxbuff)>=0 ) {
	2952	regnode *cur;
	2953	regnode first = (regnode )NULL;
	2954	regnode last = (regnode )NULL;
	2955	regnode *tail = scan;
	2956	U8 optype = 0;
	2957	U32 count=0;
	2958
	2959	#ifdef DEBUGGING
	2960	SV * const mysv = sv_newmortal(); /* for dumping */
	2961	#endif
	2962	/* var tail is used because there may be a TAIL
	2963	regop in the way. Ie, the exacts will point to the
	2964	thing following the TAIL, but the last branch will
	2965	point at the TAIL. So we advance tail. If we
	2966	have nested (?:) we may have to move through several
	2967	tails.
	2968	*/
	2969
	2970	while ( OP( tail ) == TAIL ) {
	2971	/* this is the TAIL generated by (?:) */
	2972	tail = regnext( tail );
	2973	}
	2974
	2975
	2976	DEBUG_OPTIMISE_r({
	2977	regprop(RExC_rx, mysv, tail );
	2978	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	2979	(int)depth * 2 + 2, "",
	2980	"Looking for TRIE'able sequences. Tail node is: ",
	2981	SvPV_nolen_const( mysv )
	2982	);
	2983	});
	2984
	2985	/*
	2986
	2987	step through the branches, cur represents each
	2988	branch, noper is the first thing to be matched
	2989	as part of that branch and noper_next is the
	2990	regnext() of that node. if noper is an EXACT
	2991	and noper_next is the same as scan (our current
	2992	position in the regex) then the EXACT branch is
	2993	a possible optimization target. Once we have
	2994	two or more consecutive such branches we can
	2995	create a trie of the EXACT's contents and stich
	2996	it in place. If the sequence represents all of
	2997	the branches we eliminate the whole thing and
	2998	replace it with a single TRIE. If it is a
	2999	subsequence then we need to stitch it in. This
	3000	means the first branch has to remain, and needs
	3001	to be repointed at the item on the branch chain
	3002	following the last branch optimized. This could
	3003	be either a BRANCH, in which case the
	3004	subsequence is internal, or it could be the
	3005	item following the branch sequence in which
	3006	case the subsequence is at the end.
	3007
	3008	*/
	3009
	3010	/* dont use tail as the end marker for this traverse */
	3011	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3012	regnode * const noper = NEXTOPER( cur );
	3013	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3014	regnode * const noper_next = regnext( noper );
	3015	#endif
	3016
	3017	DEBUG_OPTIMISE_r({
	3018	regprop(RExC_rx, mysv, cur);
	3019	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3020	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3021
	3022	regprop(RExC_rx, mysv, noper);
	3023	PerlIO_printf( Perl_debug_log, " -> %s",
	3024	SvPV_nolen_const(mysv));
	3025
	3026	if ( noper_next ) {
	3027	regprop(RExC_rx, mysv, noper_next );
	3028	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3029	SvPV_nolen_const(mysv));
	3030	}
	3031	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	3032	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	3033	});
	3034	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	3035	: PL_regkind[ OP( noper ) ] == EXACT )
	3036	\|\| OP(noper) == NOTHING )
	3037	#ifdef NOJUMPTRIE
	3038	&& noper_next == tail
	3039	#endif
	3040	&& count < U16_MAX)
	3041	{
	3042	count++;
	3043	if ( !first \|\| optype == NOTHING ) {
	3044	if (!first) first = cur;
	3045	optype = OP( noper );
	3046	} else {
	3047	last = cur;
	3048	}
	3049	} else {
	3050	/*
	3051	Currently the trie logic handles case insensitive matching properly only
	3052	when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
	3053	semantics).
	3054
	3055	If/when this is fixed the following define can be swapped
	3056	in below to fully enable trie logic.
	3057
	3058	#define TRIE_TYPE_IS_SAFE 1
	3059
	3060	*/
	3061	#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) \|\| optype==EXACT)
	3062
	3063	if ( last && TRIE_TYPE_IS_SAFE ) {
	3064	make_trie( pRExC_state,
	3065	startbranch, first, cur, tail, count,
	3066	optype, depth+1 );
	3067	}
	3068	if ( PL_regkind[ OP( noper ) ] == EXACT
	3069	#ifdef NOJUMPTRIE
	3070	&& noper_next == tail
	3071	#endif
	3072	){
	3073	count = 1;
	3074	first = cur;
	3075	optype = OP( noper );
	3076	} else {
	3077	count = 0;
	3078	first = NULL;
	3079	optype = 0;
	3080	}
	3081	last = NULL;
	3082	}
	3083	}
	3084	DEBUG_OPTIMISE_r({
	3085	regprop(RExC_rx, mysv, cur);
	3086	PerlIO_printf( Perl_debug_log,
	3087	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3088	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3089
	3090	});
	3091
	3092	if ( last && TRIE_TYPE_IS_SAFE ) {
	3093	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	3094	#ifdef TRIE_STUDY_OPT
	3095	if ( ((made == MADE_EXACT_TRIE &&
	3096	startbranch == first)
	3097	\|\| ( first_non_open == first )) &&
	3098	depth==0 ) {
	3099	flags \|= SCF_TRIE_RESTUDY;
	3100	if ( startbranch == first
	3101	&& scan == tail )
	3102	{
	3103	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3104	}
	3105	}
	3106	#endif
	3107	}
	3108	}
	3109
	3110	} /* do trie */
	3111
	3112	}
	3113	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3114	scan = NEXTOPER(NEXTOPER(scan));
	3115	} else /* single branch is optimized. */
	3116	scan = NEXTOPER(scan);
	3117	continue;
	3118	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3119	scan_frame *newframe = NULL;
	3120	I32 paren;
	3121	regnode *start;
	3122	regnode *end;
	3123
	3124	if (OP(scan) != SUSPEND) {
	3125	/* set the pointer */
	3126	if (OP(scan) == GOSUB) {
	3127	paren = ARG(scan);
	3128	RExC_recurse[ARG2L(scan)] = scan;
	3129	start = RExC_open_parens[paren-1];
	3130	end = RExC_close_parens[paren-1];
	3131	} else {
	3132	paren = 0;
	3133	start = RExC_rxi->program + 1;
	3134	end = RExC_opend;
	3135	}
	3136	if (!recursed) {
	3137	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3138	SAVEFREEPV(recursed);
	3139	}
	3140	if (!PAREN_TEST(recursed,paren+1)) {
	3141	PAREN_SET(recursed,paren+1);
	3142	Newx(newframe,1,scan_frame);
	3143	} else {
	3144	if (flags & SCF_DO_SUBSTR) {
	3145	SCAN_COMMIT(pRExC_state,data,minlenp);
	3146	data->longest = &(data->longest_float);
	3147	}
	3148	is_inf = is_inf_internal = 1;
	3149	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3150	cl_anything(pRExC_state, data->start_class);
	3151	flags &= ~SCF_DO_STCLASS;
	3152	}
	3153	} else {
	3154	Newx(newframe,1,scan_frame);
	3155	paren = stopparen;
	3156	start = scan+2;
	3157	end = regnext(scan);
	3158	}
	3159	if (newframe) {
	3160	assert(start);
	3161	assert(end);
	3162	SAVEFREEPV(newframe);
	3163	newframe->next = regnext(scan);
	3164	newframe->last = last;
	3165	newframe->stop = stopparen;
	3166	newframe->prev = frame;
	3167
	3168	frame = newframe;
	3169	scan = start;
	3170	stopparen = paren;
	3171	last = end;
	3172
	3173	continue;
	3174	}
	3175	}
	3176	else if (OP(scan) == EXACT) {
	3177	I32 l = STR_LEN(scan);
	3178	UV uc;
	3179	if (UTF) {
	3180	const U8 * const s = (U8*)STRING(scan);
	3181	l = utf8_length(s, s + l);
	3182	uc = utf8_to_uvchr(s, NULL);
	3183	} else {
	3184	uc = ((U8)STRING(scan));
	3185	}
	3186	min += l;
	3187	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3188	/* The code below prefers earlier match for fixed
	3189	offset, later match for variable offset. */
	3190	if (data->last_end == -1) { /* Update the start info. */
	3191	data->last_start_min = data->pos_min;
	3192	data->last_start_max = is_inf
	3193	? I32_MAX : data->pos_min + data->pos_delta;
	3194	}
	3195	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3196	if (UTF)
	3197	SvUTF8_on(data->last_found);
	3198	{
	3199	SV * const sv = data->last_found;
	3200	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3201	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3202	if (mg && mg->mg_len >= 0)
	3203	mg->mg_len += utf8_length((U8*)STRING(scan),
	3204	(U8*)STRING(scan)+STR_LEN(scan));
	3205	}
	3206	data->last_end = data->pos_min + l;
	3207	data->pos_min += l; /* As in the first entry. */
	3208	data->flags &= ~SF_BEFORE_EOL;
	3209	}
	3210	if (flags & SCF_DO_STCLASS_AND) {
	3211	/* Check whether it is compatible with what we know already! */
	3212	int compat = 1;
	3213
	3214
	3215	/* If compatible, we or it in below. It is compatible if is
	3216	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3217	* it's for a locale. Even if there isn't unicode semantics
	3218	* here, at runtime there may be because of matching against a
	3219	* utf8 string, so accept a possible false positive for
	3220	* latin1-range folds */
	3221	if (uc >= 0x100 \|\|
	3222	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3223	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3224	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3225	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3226	)
	3227	{
	3228	compat = 0;
	3229	}
	3230	ANYOF_CLASS_ZERO(data->start_class);
	3231	ANYOF_BITMAP_ZERO(data->start_class);
	3232	if (compat)
	3233	ANYOF_BITMAP_SET(data->start_class, uc);
	3234	else if (uc >= 0x100) {
	3235	int i;
	3236
	3237	/* Some Unicode code points fold to the Latin1 range; as
	3238	* XXX temporary code, instead of figuring out if this is
	3239	* one, just assume it is and set all the start class bits
	3240	* that could be some such above 255 code point's fold
	3241	* which will generate fals positives. As the code
	3242	* elsewhere that does compute the fold settles down, it
	3243	* can be extracted out and re-used here */
	3244	for (i = 0; i < 256; i++){
	3245	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3246	ANYOF_BITMAP_SET(data->start_class, i);
	3247	}
	3248	}
	3249	}
	3250	data->start_class->flags &= ~ANYOF_EOS;
	3251	if (uc < 0x100)
	3252	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3253	}
	3254	else if (flags & SCF_DO_STCLASS_OR) {
	3255	/* false positive possible if the class is case-folded */
	3256	if (uc < 0x100)
	3257	ANYOF_BITMAP_SET(data->start_class, uc);
	3258	else
	3259	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3260	data->start_class->flags &= ~ANYOF_EOS;
	3261	cl_and(data->start_class, and_withp);
	3262	}
	3263	flags &= ~SCF_DO_STCLASS;
	3264	}
	3265	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3266	I32 l = STR_LEN(scan);
	3267	UV uc = ((U8)STRING(scan));
	3268
	3269	/* Search for fixed substrings supports EXACT only. */
	3270	if (flags & SCF_DO_SUBSTR) {
	3271	assert(data);
	3272	SCAN_COMMIT(pRExC_state, data, minlenp);
	3273	}
	3274	if (UTF) {
	3275	const U8 * const s = (U8 *)STRING(scan);
	3276	l = utf8_length(s, s + l);
	3277	uc = utf8_to_uvchr(s, NULL);
	3278	}
	3279	min += l;
	3280	if (flags & SCF_DO_SUBSTR)
	3281	data->pos_min += l;
	3282	if (flags & SCF_DO_STCLASS_AND) {
	3283	/* Check whether it is compatible with what we know already! */
	3284	int compat = 1;
	3285	if (uc >= 0x100 \|\|
	3286	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3287	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3288	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3289	{
	3290	compat = 0;
	3291	}
	3292	ANYOF_CLASS_ZERO(data->start_class);
	3293	ANYOF_BITMAP_ZERO(data->start_class);
	3294	if (compat) {
	3295	ANYOF_BITMAP_SET(data->start_class, uc);
	3296	data->start_class->flags &= ~ANYOF_EOS;
	3297	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3298	if (OP(scan) == EXACTFL) {
	3299	/* XXX This set is probably no longer necessary, and
	3300	* probably wrong as LOCALE now is on in the initial
	3301	* state */
	3302	data->start_class->flags \|= ANYOF_LOCALE;
	3303	}
	3304	else {
	3305
	3306	/* Also set the other member of the fold pair. In case
	3307	* that unicode semantics is called for at runtime, use
	3308	* the full latin1 fold. (Can't do this for locale,
	3309	* because not known until runtime */
	3310	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3311	}
	3312	}
	3313	else if (uc >= 0x100) {
	3314	int i;
	3315	for (i = 0; i < 256; i++){
	3316	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3317	ANYOF_BITMAP_SET(data->start_class, i);
	3318	}
	3319	}
	3320	}
	3321	}
	3322	else if (flags & SCF_DO_STCLASS_OR) {
	3323	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3324	/* false positive possible if the class is case-folded.
	3325	Assume that the locale settings are the same... */
	3326	if (uc < 0x100) {
	3327	ANYOF_BITMAP_SET(data->start_class, uc);
	3328	if (OP(scan) != EXACTFL) {
	3329
	3330	/* And set the other member of the fold pair, but
	3331	* can't do that in locale because not known until
	3332	* run-time */
	3333	ANYOF_BITMAP_SET(data->start_class,
	3334	PL_fold_latin1[uc]);
	3335	}
	3336	}
	3337	data->start_class->flags &= ~ANYOF_EOS;
	3338	}
	3339	cl_and(data->start_class, and_withp);
	3340	}
	3341	flags &= ~SCF_DO_STCLASS;
	3342	}
	3343	else if (REGNODE_VARIES(OP(scan))) {
	3344	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3345	I32 f = flags, pos_before = 0;
	3346	regnode * const oscan = scan;
	3347	struct regnode_charclass_class this_class;
	3348	struct regnode_charclass_class *oclass = NULL;
	3349	I32 next_is_eval = 0;
	3350
	3351	switch (PL_regkind[OP(scan)]) {
	3352	case WHILEM: /* End of (?:...)* . */
	3353	scan = NEXTOPER(scan);
	3354	goto finish;
	3355	case PLUS:
	3356	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3357	next = NEXTOPER(scan);
	3358	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3359	mincount = 1;
	3360	maxcount = REG_INFTY;
	3361	next = regnext(scan);
	3362	scan = NEXTOPER(scan);
	3363	goto do_curly;
	3364	}
	3365	}
	3366	if (flags & SCF_DO_SUBSTR)
	3367	data->pos_min++;
	3368	min++;
	3369	/* Fall through. */
	3370	case STAR:
	3371	if (flags & SCF_DO_STCLASS) {
	3372	mincount = 0;
	3373	maxcount = REG_INFTY;
	3374	next = regnext(scan);
	3375	scan = NEXTOPER(scan);
	3376	goto do_curly;
	3377	}
	3378	is_inf = is_inf_internal = 1;
	3379	scan = regnext(scan);
	3380	if (flags & SCF_DO_SUBSTR) {
	3381	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3382	data->longest = &(data->longest_float);
	3383	}
	3384	goto optimize_curly_tail;
	3385	case CURLY:
	3386	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3387	&& (scan->flags == stopparen))
	3388	{
	3389	mincount = 1;
	3390	maxcount = 1;
	3391	} else {
	3392	mincount = ARG1(scan);
	3393	maxcount = ARG2(scan);
	3394	}
	3395	next = regnext(scan);
	3396	if (OP(scan) == CURLYX) {
	3397	I32 lp = (data ? *(data->last_closep) : 0);
	3398	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3399	}
	3400	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3401	next_is_eval = (OP(scan) == EVAL);
	3402	do_curly:
	3403	if (flags & SCF_DO_SUBSTR) {
	3404	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3405	pos_before = data->pos_min;
	3406	}
	3407	if (data) {
	3408	fl = data->flags;
	3409	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3410	if (is_inf)
	3411	data->flags \|= SF_IS_INF;
	3412	}
	3413	if (flags & SCF_DO_STCLASS) {
	3414	cl_init(pRExC_state, &this_class);
	3415	oclass = data->start_class;
	3416	data->start_class = &this_class;
	3417	f \|= SCF_DO_STCLASS_AND;
	3418	f &= ~SCF_DO_STCLASS_OR;
	3419	}
	3420	/* Exclude from super-linear cache processing any {n,m}
	3421	regops for which the combination of input pos and regex
	3422	pos is not enough information to determine if a match
	3423	will be possible.
	3424
	3425	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3426	regex pos at the \s*, the prospects for a match depend not
	3427	only on the input position but also on how many (bar\s*)
	3428	repeats into the {4,8} we are. */
	3429	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3430	f &= ~SCF_WHILEM_VISITED_POS;
	3431
	3432	/* This will finish on WHILEM, setting scan, or on NULL: */
	3433	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3434	last, data, stopparen, recursed, NULL,
	3435	(mincount == 0
	3436	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3437
	3438	if (flags & SCF_DO_STCLASS)
	3439	data->start_class = oclass;
	3440	if (mincount == 0 \|\| minnext == 0) {
	3441	if (flags & SCF_DO_STCLASS_OR) {
	3442	cl_or(pRExC_state, data->start_class, &this_class);
	3443	}
	3444	else if (flags & SCF_DO_STCLASS_AND) {
	3445	/* Switch to OR mode: cache the old value of
	3446	* data->start_class */
	3447	INIT_AND_WITHP;
	3448	StructCopy(data->start_class, and_withp,
	3449	struct regnode_charclass_class);
	3450	flags &= ~SCF_DO_STCLASS_AND;
	3451	StructCopy(&this_class, data->start_class,
	3452	struct regnode_charclass_class);
	3453	flags \|= SCF_DO_STCLASS_OR;
	3454	data->start_class->flags \|= ANYOF_EOS;
	3455	}
	3456	} else { /* Non-zero len */
	3457	if (flags & SCF_DO_STCLASS_OR) {
	3458	cl_or(pRExC_state, data->start_class, &this_class);
	3459	cl_and(data->start_class, and_withp);
	3460	}
	3461	else if (flags & SCF_DO_STCLASS_AND)
	3462	cl_and(data->start_class, &this_class);
	3463	flags &= ~SCF_DO_STCLASS;
	3464	}
	3465	if (!scan) /* It was not CURLYX, but CURLY. */
	3466	scan = next;
	3467	if ( /* ? quantifier ok, except for (?{ ... }) */
	3468	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3469	&& (minnext == 0) && (deltanext == 0)
	3470	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3471	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3472	{
	3473	ckWARNreg(RExC_parse,
	3474	"Quantifier unexpected on zero-length expression");
	3475	}
	3476
	3477	min += minnext * mincount;
	3478	is_inf_internal \|= ((maxcount == REG_INFTY
	3479	&& (minnext + deltanext) > 0)
	3480	\|\| deltanext == I32_MAX);
	3481	is_inf \|= is_inf_internal;
	3482	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3483
	3484	/* Try powerful optimization CURLYX => CURLYN. */
	3485	if ( OP(oscan) == CURLYX && data
	3486	&& data->flags & SF_IN_PAR
	3487	&& !(data->flags & SF_HAS_EVAL)
	3488	&& !deltanext && minnext == 1 ) {
	3489	/* Try to optimize to CURLYN. */
	3490	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3491	regnode * const nxt1 = nxt;
	3492	#ifdef DEBUGGING
	3493	regnode *nxt2;
	3494	#endif
	3495
	3496	/* Skip open. */
	3497	nxt = regnext(nxt);
	3498	if (!REGNODE_SIMPLE(OP(nxt))
	3499	&& !(PL_regkind[OP(nxt)] == EXACT
	3500	&& STR_LEN(nxt) == 1))
	3501	goto nogo;
	3502	#ifdef DEBUGGING
	3503	nxt2 = nxt;
	3504	#endif
	3505	nxt = regnext(nxt);
	3506	if (OP(nxt) != CLOSE)
	3507	goto nogo;
	3508	if (RExC_open_parens) {
	3509	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3510	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3511	}
	3512	/* Now we know that nxt2 is the only contents: */
	3513	oscan->flags = (U8)ARG(nxt);
	3514	OP(oscan) = CURLYN;
	3515	OP(nxt1) = NOTHING; /* was OPEN. */
	3516
	3517	#ifdef DEBUGGING
	3518	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3519	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3520	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3521	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3522	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3523	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3524	#endif
	3525	}
	3526	nogo:
	3527
	3528	/* Try optimization CURLYX => CURLYM. */
	3529	if ( OP(oscan) == CURLYX && data
	3530	&& !(data->flags & SF_HAS_PAR)
	3531	&& !(data->flags & SF_HAS_EVAL)
	3532	&& !deltanext /* atom is fixed width */
	3533	&& minnext != 0 /* CURLYM can't handle zero width */
	3534	) {
	3535	/* XXXX How to optimize if data == 0? */
	3536	/* Optimize to a simpler form. */
	3537	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3538	regnode *nxt2;
	3539
	3540	OP(oscan) = CURLYM;
	3541	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3542	&& (OP(nxt2) != WHILEM))
	3543	nxt = nxt2;
	3544	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3545	/* Need to optimize away parenths. */
	3546	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3547	/* Set the parenth number. */
	3548	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3549
	3550	oscan->flags = (U8)ARG(nxt);
	3551	if (RExC_open_parens) {
	3552	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3553	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3554	}
	3555	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3556	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3557
	3558	#ifdef DEBUGGING
	3559	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3560	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3561	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3562	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3563	#endif
	3564	#if 0
	3565	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3566	regnode *nnxt = regnext(nxt1);
	3567	if (nnxt == nxt) {
	3568	if (reg_off_by_arg[OP(nxt1)])
	3569	ARG_SET(nxt1, nxt2 - nxt1);
	3570	else if (nxt2 - nxt1 < U16_MAX)
	3571	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3572	else
	3573	OP(nxt) = NOTHING; /* Cannot beautify */
	3574	}
	3575	nxt1 = nnxt;
	3576	}
	3577	#endif
	3578	/* Optimize again: */
	3579	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3580	NULL, stopparen, recursed, NULL, 0,depth+1);
	3581	}
	3582	else
	3583	oscan->flags = 0;
	3584	}
	3585	else if ((OP(oscan) == CURLYX)
	3586	&& (flags & SCF_WHILEM_VISITED_POS)
	3587	/* See the comment on a similar expression above.
	3588	However, this time it's not a subexpression
	3589	we care about, but the expression itself. */
	3590	&& (maxcount == REG_INFTY)
	3591	&& data && ++data->whilem_c < 16) {
	3592	/* This stays as CURLYX, we can put the count/of pair. */
	3593	/* Find WHILEM (as in regexec.c) */
	3594	regnode *nxt = oscan + NEXT_OFF(oscan);
	3595
	3596	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3597	nxt += ARG(nxt);
	3598	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3599	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3600	}
	3601	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3602	pars++;
	3603	if (flags & SCF_DO_SUBSTR) {
	3604	SV *last_str = NULL;
	3605	int counted = mincount != 0;
	3606
	3607	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3608	#if defined(SPARC64_GCC_WORKAROUND)
	3609	I32 b = 0;
	3610	STRLEN l = 0;
	3611	const char *s = NULL;
	3612	I32 old = 0;
	3613
	3614	if (pos_before >= data->last_start_min)
	3615	b = pos_before;
	3616	else
	3617	b = data->last_start_min;
	3618
	3619	l = 0;
	3620	s = SvPV_const(data->last_found, l);
	3621	old = b - data->last_start_min;
	3622
	3623	#else
	3624	I32 b = pos_before >= data->last_start_min
	3625	? pos_before : data->last_start_min;
	3626	STRLEN l;
	3627	const char * const s = SvPV_const(data->last_found, l);
	3628	I32 old = b - data->last_start_min;
	3629	#endif
	3630
	3631	if (UTF)
	3632	old = utf8_hop((U8)s, old) - (U8)s;
	3633	l -= old;
	3634	/* Get the added string: */
	3635	last_str = newSVpvn_utf8(s + old, l, UTF);
	3636	if (deltanext == 0 && pos_before == b) {
	3637	/* What was added is a constant string */
	3638	if (mincount > 1) {
	3639	SvGROW(last_str, (mincount * l) + 1);
	3640	repeatcpy(SvPVX(last_str) + l,
	3641	SvPVX_const(last_str), l, mincount - 1);
	3642	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3643	/* Add additional parts. */
	3644	SvCUR_set(data->last_found,
	3645	SvCUR(data->last_found) - l);
	3646	sv_catsv(data->last_found, last_str);
	3647	{
	3648	SV * sv = data->last_found;
	3649	MAGIC *mg =
	3650	SvUTF8(sv) && SvMAGICAL(sv) ?
	3651	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3652	if (mg && mg->mg_len >= 0)
	3653	mg->mg_len += CHR_SVLEN(last_str) - l;
	3654	}
	3655	data->last_end += l * (mincount - 1);
	3656	}
	3657	} else {
	3658	/* start offset must point into the last copy */
	3659	data->last_start_min += minnext * (mincount - 1);
	3660	data->last_start_max += is_inf ? I32_MAX
	3661	: (maxcount - 1) * (minnext + data->pos_delta);
	3662	}
	3663	}
	3664	/* It is counted once already... */
	3665	data->pos_min += minnext * (mincount - counted);
	3666	data->pos_delta += - counted * deltanext +
	3667	(minnext + deltanext) * maxcount - minnext * mincount;
	3668	if (mincount != maxcount) {
	3669	/* Cannot extend fixed substrings found inside
	3670	the group. */
	3671	SCAN_COMMIT(pRExC_state,data,minlenp);
	3672	if (mincount && last_str) {
	3673	SV * const sv = data->last_found;
	3674	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3675	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3676
	3677	if (mg)
	3678	mg->mg_len = -1;
	3679	sv_setsv(sv, last_str);
	3680	data->last_end = data->pos_min;
	3681	data->last_start_min =
	3682	data->pos_min - CHR_SVLEN(last_str);
	3683	data->last_start_max = is_inf
	3684	? I32_MAX
	3685	: data->pos_min + data->pos_delta
	3686	- CHR_SVLEN(last_str);
	3687	}
	3688	data->longest = &(data->longest_float);
	3689	}
	3690	SvREFCNT_dec(last_str);
	3691	}
	3692	if (data && (fl & SF_HAS_EVAL))
	3693	data->flags \|= SF_HAS_EVAL;
	3694	optimize_curly_tail:
	3695	if (OP(oscan) != CURLYX) {
	3696	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3697	&& NEXT_OFF(next))
	3698	NEXT_OFF(oscan) += NEXT_OFF(next);
	3699	}
	3700	continue;
	3701	default: /* REF, ANYOFV, and CLUMP only? */
	3702	if (flags & SCF_DO_SUBSTR) {
	3703	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3704	data->longest = &(data->longest_float);
	3705	}
	3706	is_inf = is_inf_internal = 1;
	3707	if (flags & SCF_DO_STCLASS_OR)
	3708	cl_anything(pRExC_state, data->start_class);
	3709	flags &= ~SCF_DO_STCLASS;
	3710	break;
	3711	}
	3712	}
	3713	else if (OP(scan) == LNBREAK) {
	3714	if (flags & SCF_DO_STCLASS) {
	3715	int value = 0;
	3716	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3717	if (flags & SCF_DO_STCLASS_AND) {
	3718	for (value = 0; value < 256; value++)
	3719	if (!is_VERTWS_cp(value))
	3720	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3721	}
	3722	else {
	3723	for (value = 0; value < 256; value++)
	3724	if (is_VERTWS_cp(value))
	3725	ANYOF_BITMAP_SET(data->start_class, value);
	3726	}
	3727	if (flags & SCF_DO_STCLASS_OR)
	3728	cl_and(data->start_class, and_withp);
	3729	flags &= ~SCF_DO_STCLASS;
	3730	}
	3731	min += 1;
	3732	delta += 1;
	3733	if (flags & SCF_DO_SUBSTR) {
	3734	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3735	data->pos_min += 1;
	3736	data->pos_delta += 1;
	3737	data->longest = &(data->longest_float);
	3738	}
	3739	}
	3740	else if (OP(scan) == FOLDCHAR) {
	3741	int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
	3742	flags &= ~SCF_DO_STCLASS;
	3743	min += 1;
	3744	delta += d;
	3745	if (flags & SCF_DO_SUBSTR) {
	3746	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3747	data->pos_min += 1;
	3748	data->pos_delta += d;
	3749	data->longest = &(data->longest_float);
	3750	}
	3751	}
	3752	else if (REGNODE_SIMPLE(OP(scan))) {
	3753	int value = 0;
	3754
	3755	if (flags & SCF_DO_SUBSTR) {
	3756	SCAN_COMMIT(pRExC_state,data,minlenp);
	3757	data->pos_min++;
	3758	}
	3759	min++;
	3760	if (flags & SCF_DO_STCLASS) {
	3761	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3762
	3763	/* Some of the logic below assumes that switching
	3764	locale on will only add false positives. */
	3765	switch (PL_regkind[OP(scan)]) {
	3766	case SANY:
	3767	default:
	3768	do_default:
	3769	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	3770	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3771	cl_anything(pRExC_state, data->start_class);
	3772	break;
	3773	case REG_ANY:
	3774	if (OP(scan) == SANY)
	3775	goto do_default;
	3776	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	3777	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	3778	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	3779	cl_anything(pRExC_state, data->start_class);
	3780	}
	3781	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	3782	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	3783	break;
	3784	case ANYOF:
	3785	if (flags & SCF_DO_STCLASS_AND)
	3786	cl_and(data->start_class,
	3787	(struct regnode_charclass_class*)scan);
	3788	else
	3789	cl_or(pRExC_state, data->start_class,
	3790	(struct regnode_charclass_class*)scan);
	3791	break;
	3792	case ALNUM:
	3793	if (flags & SCF_DO_STCLASS_AND) {
	3794	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3795	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	3796	if (OP(scan) == ALNUMU) {
	3797	for (value = 0; value < 256; value++) {
	3798	if (!isWORDCHAR_L1(value)) {
	3799	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3800	}
	3801	}
	3802	} else {
	3803	for (value = 0; value < 256; value++) {
	3804	if (!isALNUM(value)) {
	3805	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3806	}
	3807	}
	3808	}
	3809	}
	3810	}
	3811	else {
	3812	if (data->start_class->flags & ANYOF_LOCALE)
	3813	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	3814
	3815	/* Even if under locale, set the bits for non-locale
	3816	* in case it isn't a true locale-node. This will
	3817	* create false positives if it truly is locale */
	3818	if (OP(scan) == ALNUMU) {
	3819	for (value = 0; value < 256; value++) {
	3820	if (isWORDCHAR_L1(value)) {
	3821	ANYOF_BITMAP_SET(data->start_class, value);
	3822	}
	3823	}
	3824	} else {
	3825	for (value = 0; value < 256; value++) {
	3826	if (isALNUM(value)) {
	3827	ANYOF_BITMAP_SET(data->start_class, value);
	3828	}
	3829	}
	3830	}
	3831	}
	3832	break;
	3833	case NALNUM:
	3834	if (flags & SCF_DO_STCLASS_AND) {
	3835	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3836	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	3837	if (OP(scan) == NALNUMU) {
	3838	for (value = 0; value < 256; value++) {
	3839	if (isWORDCHAR_L1(value)) {
	3840	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3841	}
	3842	}
	3843	} else {
	3844	for (value = 0; value < 256; value++) {
	3845	if (isALNUM(value)) {
	3846	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3847	}
	3848	}
	3849	}
	3850	}
	3851	}
	3852	else {
	3853	if (data->start_class->flags & ANYOF_LOCALE)
	3854	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	3855
	3856	/* Even if under locale, set the bits for non-locale in
	3857	* case it isn't a true locale-node. This will create
	3858	* false positives if it truly is locale */
	3859	if (OP(scan) == NALNUMU) {
	3860	for (value = 0; value < 256; value++) {
	3861	if (! isWORDCHAR_L1(value)) {
	3862	ANYOF_BITMAP_SET(data->start_class, value);
	3863	}
	3864	}
	3865	} else {
	3866	for (value = 0; value < 256; value++) {
	3867	if (! isALNUM(value)) {
	3868	ANYOF_BITMAP_SET(data->start_class, value);
	3869	}
	3870	}
	3871	}
	3872	}
	3873	break;
	3874	case SPACE:
	3875	if (flags & SCF_DO_STCLASS_AND) {
	3876	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3877	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	3878	if (OP(scan) == SPACEU) {
	3879	for (value = 0; value < 256; value++) {
	3880	if (!isSPACE_L1(value)) {
	3881	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3882	}
	3883	}
	3884	} else {
	3885	for (value = 0; value < 256; value++) {
	3886	if (!isSPACE(value)) {
	3887	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3888	}
	3889	}
	3890	}
	3891	}
	3892	}
	3893	else {
	3894	if (data->start_class->flags & ANYOF_LOCALE) {
	3895	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	3896	}
	3897	if (OP(scan) == SPACEU) {
	3898	for (value = 0; value < 256; value++) {
	3899	if (isSPACE_L1(value)) {
	3900	ANYOF_BITMAP_SET(data->start_class, value);
	3901	}
	3902	}
	3903	} else {
	3904	for (value = 0; value < 256; value++) {
	3905	if (isSPACE(value)) {
	3906	ANYOF_BITMAP_SET(data->start_class, value);
	3907	}
	3908	}
	3909	}
	3910	}
	3911	break;
	3912	case NSPACE:
	3913	if (flags & SCF_DO_STCLASS_AND) {
	3914	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3915	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	3916	if (OP(scan) == NSPACEU) {
	3917	for (value = 0; value < 256; value++) {
	3918	if (isSPACE_L1(value)) {
	3919	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3920	}
	3921	}
	3922	} else {
	3923	for (value = 0; value < 256; value++) {
	3924	if (isSPACE(value)) {
	3925	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3926	}
	3927	}
	3928	}
	3929	}
	3930	}
	3931	else {
	3932	if (data->start_class->flags & ANYOF_LOCALE)
	3933	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	3934	if (OP(scan) == NSPACEU) {
	3935	for (value = 0; value < 256; value++) {
	3936	if (!isSPACE_L1(value)) {
	3937	ANYOF_BITMAP_SET(data->start_class, value);
	3938	}
	3939	}
	3940	}
	3941	else {
	3942	for (value = 0; value < 256; value++) {
	3943	if (!isSPACE(value)) {
	3944	ANYOF_BITMAP_SET(data->start_class, value);
	3945	}
	3946	}
	3947	}
	3948	}
	3949	break;
	3950	case DIGIT:
	3951	if (flags & SCF_DO_STCLASS_AND) {
	3952	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3953	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	3954	for (value = 0; value < 256; value++)
	3955	if (!isDIGIT(value))
	3956	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3957	}
	3958	}
	3959	else {
	3960	if (data->start_class->flags & ANYOF_LOCALE)
	3961	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	3962	for (value = 0; value < 256; value++)
	3963	if (isDIGIT(value))
	3964	ANYOF_BITMAP_SET(data->start_class, value);
	3965	}
	3966	break;
	3967	case NDIGIT:
	3968	if (flags & SCF_DO_STCLASS_AND) {
	3969	if (!(data->start_class->flags & ANYOF_LOCALE))
	3970	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	3971	for (value = 0; value < 256; value++)
	3972	if (isDIGIT(value))
	3973	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3974	}
	3975	else {
	3976	if (data->start_class->flags & ANYOF_LOCALE)
	3977	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	3978	for (value = 0; value < 256; value++)
	3979	if (!isDIGIT(value))
	3980	ANYOF_BITMAP_SET(data->start_class, value);
	3981	}
	3982	break;
	3983	CASE_SYNST_FNC(VERTWS);
	3984	CASE_SYNST_FNC(HORIZWS);
	3985
	3986	}
	3987	if (flags & SCF_DO_STCLASS_OR)
	3988	cl_and(data->start_class, and_withp);
	3989	flags &= ~SCF_DO_STCLASS;
	3990	}
	3991	}
	3992	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	3993	data->flags \|= (OP(scan) == MEOL
	3994	? SF_BEFORE_MEOL
	3995	: SF_BEFORE_SEOL);
	3996	}
	3997	else if ( PL_regkind[OP(scan)] == BRANCHJ
	3998	/* Lookbehind, or need to calculate parens/evals/stclass: */
	3999	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4000	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4001	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4002	\|\| OP(scan) == UNLESSM )
	4003	{
	4004	/* Negative Lookahead/lookbehind
	4005	In this case we can't do fixed string optimisation.
	4006	*/
	4007
	4008	I32 deltanext, minnext, fake = 0;
	4009	regnode *nscan;
	4010	struct regnode_charclass_class intrnl;
	4011	int f = 0;
	4012
	4013	data_fake.flags = 0;
	4014	if (data) {
	4015	data_fake.whilem_c = data->whilem_c;
	4016	data_fake.last_closep = data->last_closep;
	4017	}
	4018	else
	4019	data_fake.last_closep = &fake;
	4020	data_fake.pos_delta = delta;
	4021	if ( flags & SCF_DO_STCLASS && !scan->flags
	4022	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4023	cl_init(pRExC_state, &intrnl);
	4024	data_fake.start_class = &intrnl;
	4025	f \|= SCF_DO_STCLASS_AND;
	4026	}
	4027	if (flags & SCF_WHILEM_VISITED_POS)
	4028	f \|= SCF_WHILEM_VISITED_POS;
	4029	next = regnext(scan);
	4030	nscan = NEXTOPER(NEXTOPER(scan));
	4031	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4032	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4033	if (scan->flags) {
	4034	if (deltanext) {
	4035	FAIL("Variable length lookbehind not implemented");
	4036	}
	4037	else if (minnext > (I32)U8_MAX) {
	4038	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4039	}
	4040	scan->flags = (U8)minnext;
	4041	}
	4042	if (data) {
	4043	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4044	pars++;
	4045	if (data_fake.flags & SF_HAS_EVAL)
	4046	data->flags \|= SF_HAS_EVAL;
	4047	data->whilem_c = data_fake.whilem_c;
	4048	}
	4049	if (f & SCF_DO_STCLASS_AND) {
	4050	if (flags & SCF_DO_STCLASS_OR) {
	4051	/* OR before, AND after: ideally we would recurse with
	4052	* data_fake to get the AND applied by study of the
	4053	* remainder of the pattern, and then derecurse;
	4054	* * HACK * for now just treat as "no information".
	4055	* See [perl #56690].
	4056	*/
	4057	cl_init(pRExC_state, data->start_class);
	4058	} else {
	4059	/* AND before and after: combine and continue */
	4060	const int was = (data->start_class->flags & ANYOF_EOS);
	4061
	4062	cl_and(data->start_class, &intrnl);
	4063	if (was)
	4064	data->start_class->flags \|= ANYOF_EOS;
	4065	}
	4066	}
	4067	}
	4068	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4069	else {
	4070	/* Positive Lookahead/lookbehind
	4071	In this case we can do fixed string optimisation,
	4072	but we must be careful about it. Note in the case of
	4073	lookbehind the positions will be offset by the minimum
	4074	length of the pattern, something we won't know about
	4075	until after the recurse.
	4076	*/
	4077	I32 deltanext, fake = 0;
	4078	regnode *nscan;
	4079	struct regnode_charclass_class intrnl;
	4080	int f = 0;
	4081	/* We use SAVEFREEPV so that when the full compile
	4082	is finished perl will clean up the allocated
	4083	minlens when it's all done. This way we don't
	4084	have to worry about freeing them when we know
	4085	they wont be used, which would be a pain.
	4086	*/
	4087	I32 *minnextp;
	4088	Newx( minnextp, 1, I32 );
	4089	SAVEFREEPV(minnextp);
	4090
	4091	if (data) {
	4092	StructCopy(data, &data_fake, scan_data_t);
	4093	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4094	f \|= SCF_DO_SUBSTR;
	4095	if (scan->flags)
	4096	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4097	data_fake.last_found=newSVsv(data->last_found);
	4098	}
	4099	}
	4100	else
	4101	data_fake.last_closep = &fake;
	4102	data_fake.flags = 0;
	4103	data_fake.pos_delta = delta;
	4104	if (is_inf)
	4105	data_fake.flags \|= SF_IS_INF;
	4106	if ( flags & SCF_DO_STCLASS && !scan->flags
	4107	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4108	cl_init(pRExC_state, &intrnl);
	4109	data_fake.start_class = &intrnl;
	4110	f \|= SCF_DO_STCLASS_AND;
	4111	}
	4112	if (flags & SCF_WHILEM_VISITED_POS)
	4113	f \|= SCF_WHILEM_VISITED_POS;
	4114	next = regnext(scan);
	4115	nscan = NEXTOPER(NEXTOPER(scan));
	4116
	4117	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4118	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4119	if (scan->flags) {
	4120	if (deltanext) {
	4121	FAIL("Variable length lookbehind not implemented");
	4122	}
	4123	else if (*minnextp > (I32)U8_MAX) {
	4124	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4125	}
	4126	scan->flags = (U8)*minnextp;
	4127	}
	4128
	4129	*minnextp += min;
	4130
	4131	if (f & SCF_DO_STCLASS_AND) {
	4132	const int was = (data->start_class->flags & ANYOF_EOS);
	4133
	4134	cl_and(data->start_class, &intrnl);
	4135	if (was)
	4136	data->start_class->flags \|= ANYOF_EOS;
	4137	}
	4138	if (data) {
	4139	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4140	pars++;
	4141	if (data_fake.flags & SF_HAS_EVAL)
	4142	data->flags \|= SF_HAS_EVAL;
	4143	data->whilem_c = data_fake.whilem_c;
	4144	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4145	if (RExC_rx->minlen<*minnextp)
	4146	RExC_rx->minlen=*minnextp;
	4147	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4148	SvREFCNT_dec(data_fake.last_found);
	4149
	4150	if ( data_fake.minlen_fixed != minlenp )
	4151	{
	4152	data->offset_fixed= data_fake.offset_fixed;
	4153	data->minlen_fixed= data_fake.minlen_fixed;
	4154	data->lookbehind_fixed+= scan->flags;
	4155	}
	4156	if ( data_fake.minlen_float != minlenp )
	4157	{
	4158	data->minlen_float= data_fake.minlen_float;
	4159	data->offset_float_min=data_fake.offset_float_min;
	4160	data->offset_float_max=data_fake.offset_float_max;
	4161	data->lookbehind_float+= scan->flags;
	4162	}
	4163	}
	4164	}
	4165
	4166
	4167	}
	4168	#endif
	4169	}
	4170	else if (OP(scan) == OPEN) {
	4171	if (stopparen != (I32)ARG(scan))
	4172	pars++;
	4173	}
	4174	else if (OP(scan) == CLOSE) {
	4175	if (stopparen == (I32)ARG(scan)) {
	4176	break;
	4177	}
	4178	if ((I32)ARG(scan) == is_par) {
	4179	next = regnext(scan);
	4180
	4181	if ( next && (OP(next) != WHILEM) && next < last)
	4182	is_par = 0; /* Disable optimization */
	4183	}
	4184	if (data)
	4185	*(data->last_closep) = ARG(scan);
	4186	}
	4187	else if (OP(scan) == EVAL) {
	4188	if (data)
	4189	data->flags \|= SF_HAS_EVAL;
	4190	}
	4191	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4192	if (flags & SCF_DO_SUBSTR) {
	4193	SCAN_COMMIT(pRExC_state,data,minlenp);
	4194	flags &= ~SCF_DO_SUBSTR;
	4195	}
	4196	if (data && OP(scan)==ACCEPT) {
	4197	data->flags \|= SCF_SEEN_ACCEPT;
	4198	if (stopmin > min)
	4199	stopmin = min;
	4200	}
	4201	}
	4202	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4203	{
	4204	if (flags & SCF_DO_SUBSTR) {
	4205	SCAN_COMMIT(pRExC_state,data,minlenp);
	4206	data->longest = &(data->longest_float);
	4207	}
	4208	is_inf = is_inf_internal = 1;
	4209	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4210	cl_anything(pRExC_state, data->start_class);
	4211	flags &= ~SCF_DO_STCLASS;
	4212	}
	4213	else if (OP(scan) == GPOS) {
	4214	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4215	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4216	{
	4217	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4218	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4219	if (RExC_rx->gofs < (U32)min)
	4220	RExC_rx->gofs = min;
	4221	} else {
	4222	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4223	RExC_rx->gofs = 0;
	4224	}
	4225	}
	4226	#ifdef TRIE_STUDY_OPT
	4227	#ifdef FULL_TRIE_STUDY
	4228	else if (PL_regkind[OP(scan)] == TRIE) {
	4229	/* NOTE - There is similar code to this block above for handling
	4230	BRANCH nodes on the initial study. If you change stuff here
	4231	check there too. */
	4232	regnode *trie_node= scan;
	4233	regnode *tail= regnext(scan);
	4234	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4235	I32 max1 = 0, min1 = I32_MAX;
	4236	struct regnode_charclass_class accum;
	4237
	4238	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4239	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4240	if (flags & SCF_DO_STCLASS)
	4241	cl_init_zero(pRExC_state, &accum);
	4242
	4243	if (!trie->jump) {
	4244	min1= trie->minlen;
	4245	max1= trie->maxlen;
	4246	} else {
	4247	const regnode *nextbranch= NULL;
	4248	U32 word;
	4249
	4250	for ( word=1 ; word <= trie->wordcount ; word++)
	4251	{
	4252	I32 deltanext=0, minnext=0, f = 0, fake;
	4253	struct regnode_charclass_class this_class;
	4254
	4255	data_fake.flags = 0;
	4256	if (data) {
	4257	data_fake.whilem_c = data->whilem_c;
	4258	data_fake.last_closep = data->last_closep;
	4259	}
	4260	else
	4261	data_fake.last_closep = &fake;
	4262	data_fake.pos_delta = delta;
	4263	if (flags & SCF_DO_STCLASS) {
	4264	cl_init(pRExC_state, &this_class);
	4265	data_fake.start_class = &this_class;
	4266	f = SCF_DO_STCLASS_AND;
	4267	}
	4268	if (flags & SCF_WHILEM_VISITED_POS)
	4269	f \|= SCF_WHILEM_VISITED_POS;
	4270
	4271	if (trie->jump[word]) {
	4272	if (!nextbranch)
	4273	nextbranch = trie_node + trie->jump[0];
	4274	scan= trie_node + trie->jump[word];
	4275	/* We go from the jump point to the branch that follows
	4276	it. Note this means we need the vestigal unused branches
	4277	even though they arent otherwise used.
	4278	*/
	4279	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4280	&deltanext, (regnode *)nextbranch, &data_fake,
	4281	stopparen, recursed, NULL, f,depth+1);
	4282	}
	4283	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4284	nextbranch= regnext((regnode*)nextbranch);
	4285
	4286	if (min1 > (I32)(minnext + trie->minlen))
	4287	min1 = minnext + trie->minlen;
	4288	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4289	max1 = minnext + deltanext + trie->maxlen;
	4290	if (deltanext == I32_MAX)
	4291	is_inf = is_inf_internal = 1;
	4292
	4293	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4294	pars++;
	4295	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4296	if ( stopmin > min + min1)
	4297	stopmin = min + min1;
	4298	flags &= ~SCF_DO_SUBSTR;
	4299	if (data)
	4300	data->flags \|= SCF_SEEN_ACCEPT;
	4301	}
	4302	if (data) {
	4303	if (data_fake.flags & SF_HAS_EVAL)
	4304	data->flags \|= SF_HAS_EVAL;
	4305	data->whilem_c = data_fake.whilem_c;
	4306	}
	4307	if (flags & SCF_DO_STCLASS)
	4308	cl_or(pRExC_state, &accum, &this_class);
	4309	}
	4310	}
	4311	if (flags & SCF_DO_SUBSTR) {
	4312	data->pos_min += min1;
	4313	data->pos_delta += max1 - min1;
	4314	if (max1 != min1 \|\| is_inf)
	4315	data->longest = &(data->longest_float);
	4316	}
	4317	min += min1;
	4318	delta += max1 - min1;
	4319	if (flags & SCF_DO_STCLASS_OR) {
	4320	cl_or(pRExC_state, data->start_class, &accum);
	4321	if (min1) {
	4322	cl_and(data->start_class, and_withp);
	4323	flags &= ~SCF_DO_STCLASS;
	4324	}
	4325	}
	4326	else if (flags & SCF_DO_STCLASS_AND) {
	4327	if (min1) {
	4328	cl_and(data->start_class, &accum);
	4329	flags &= ~SCF_DO_STCLASS;
	4330	}
	4331	else {
	4332	/* Switch to OR mode: cache the old value of
	4333	* data->start_class */
	4334	INIT_AND_WITHP;
	4335	StructCopy(data->start_class, and_withp,
	4336	struct regnode_charclass_class);
	4337	flags &= ~SCF_DO_STCLASS_AND;
	4338	StructCopy(&accum, data->start_class,
	4339	struct regnode_charclass_class);
	4340	flags \|= SCF_DO_STCLASS_OR;
	4341	data->start_class->flags \|= ANYOF_EOS;
	4342	}
	4343	}
	4344	scan= tail;
	4345	continue;
	4346	}
	4347	#else
	4348	else if (PL_regkind[OP(scan)] == TRIE) {
	4349	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4350	U8*bang=NULL;
	4351
	4352	min += trie->minlen;
	4353	delta += (trie->maxlen - trie->minlen);
	4354	flags &= ~SCF_DO_STCLASS; /* xxx */
	4355	if (flags & SCF_DO_SUBSTR) {
	4356	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4357	data->pos_min += trie->minlen;
	4358	data->pos_delta += (trie->maxlen - trie->minlen);
	4359	if (trie->maxlen != trie->minlen)
	4360	data->longest = &(data->longest_float);
	4361	}
	4362	if (trie->jump) /* no more substrings -- for now /grr*/
	4363	flags &= ~SCF_DO_SUBSTR;
	4364	}
	4365	#endif /* old or new */
	4366	#endif /* TRIE_STUDY_OPT */
	4367
	4368	/* Else: zero-length, ignore. */
	4369	scan = regnext(scan);
	4370	}
	4371	if (frame) {
	4372	last = frame->last;
	4373	scan = frame->next;
	4374	stopparen = frame->stop;
	4375	frame = frame->prev;
	4376	goto fake_study_recurse;
	4377	}
	4378
	4379	finish:
	4380	assert(!frame);
	4381	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4382
	4383	*scanp = scan;
	4384	*deltap = is_inf_internal ? I32_MAX : delta;
	4385	if (flags & SCF_DO_SUBSTR && is_inf)
	4386	data->pos_delta = I32_MAX - data->pos_min;
	4387	if (is_par > (I32)U8_MAX)
	4388	is_par = 0;
	4389	if (is_par && pars==1 && data) {
	4390	data->flags \|= SF_IN_PAR;
	4391	data->flags &= ~SF_HAS_PAR;
	4392	}
	4393	else if (pars && data) {
	4394	data->flags \|= SF_HAS_PAR;
	4395	data->flags &= ~SF_IN_PAR;
	4396	}
	4397	if (flags & SCF_DO_STCLASS_OR)
	4398	cl_and(data->start_class, and_withp);
	4399	if (flags & SCF_TRIE_RESTUDY)
	4400	data->flags \|= SCF_TRIE_RESTUDY;
	4401
	4402	DEBUG_STUDYDATA("post-fin:",data,depth);
	4403
	4404	return min < stopmin ? min : stopmin;
	4405	}
	4406
	4407	STATIC U32
	4408	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4409	{
	4410	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4411
	4412	PERL_ARGS_ASSERT_ADD_DATA;
	4413
	4414	Renewc(RExC_rxi->data,
	4415	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4416	char, struct reg_data);
	4417	if(count)
	4418	Renew(RExC_rxi->data->what, count + n, U8);
	4419	else
	4420	Newx(RExC_rxi->data->what, n, U8);
	4421	RExC_rxi->data->count = count + n;
	4422	Copy(s, RExC_rxi->data->what + count, n, U8);
	4423	return count;
	4424	}
	4425
	4426	/XXX: todo make this not included in a non debugging perl /
	4427	#ifndef PERL_IN_XSUB_RE
	4428	void
	4429	Perl_reginitcolors(pTHX)
	4430	{
	4431	dVAR;
	4432	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4433	if (s) {
	4434	char *t = savepv(s);
	4435	int i = 0;
	4436	PL_colors[0] = t;
	4437	while (++i < 6) {
	4438	t = strchr(t, '\t');
	4439	if (t) {
	4440	*t = '\0';
	4441	PL_colors[i] = ++t;
	4442	}
	4443	else
	4444	PL_colors[i] = t = (char *)"";
	4445	}
	4446	} else {
	4447	int i = 0;
	4448	while (i < 6)
	4449	PL_colors[i++] = (char *)"";
	4450	}
	4451	PL_colorset = 1;
	4452	}
	4453	#endif
	4454
	4455
	/*
	 * CHECK_RESTUDY_GOTO - jump back to the `reStudy` label when a restudy
	 * has been requested and none has been performed yet.
	 *
	 * With TRIE_STUDY_OPT defined, the macro tests SCF_TRIE_RESTUDY in
	 * data.flags; only if that bit is set does it test and post-increment
	 * `restudied`, and it jumps only when `restudied` was still zero.  The use
	 * site must therefore have `data`, `restudied` and the `reStudy` label in
	 * scope.  Without TRIE_STUDY_OPT the macro expands to nothing.
	 */
	#ifndef TRIE_STUDY_OPT
	#define CHECK_RESTUDY_GOTO
	#else
	#define CHECK_RESTUDY_GOTO \
	    if ((data.flags & SCF_TRIE_RESTUDY) && ! restudied++) goto reStudy
	#endif
	4465
	4466	/*
	4467	- pregcomp - compile a regular expression into internal code
	4468	*
	4469	* We can't allocate space until we know how big the compiled form will be,
	4470	* but we can't compile it (and thus know how big it is) until we've got a
	4471	* place to put the code. So we cheat: we compile it twice, once with code
	4472	* generation turned off and size counting turned on, and once "for real".
	4473	* This also means that we don't allocate space until we are sure that the
	4474	* thing really will compile successfully, and we never have to move the
	4475	* code and thus invalidate pointers into it. (Note that it has to be in
	4476	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4477	*
	4478	* Beware that the optimization-preparation code in here knows about some
	4479	* of the structure of the compiled regexp. [I'll say.]
	4480	*/
	4481
	4482
	4483
	/*
	 * RE_ENGINE_PTR - address of the regexp_engine structure for this build.
	 *
	 * When PERL_IN_XSUB_RE is defined it is &my_reg_engine, which is only
	 * declared (extern) here; otherwise it is &PL_core_reg_engine.
	 * NOTE(review): PERL_IN_XSUB_RE presumably marks the ext/re copy of this
	 * file -- confirm against re_top.h.
	 */
	#ifdef PERL_IN_XSUB_RE
	extern const struct regexp_engine my_reg_engine;
	#define RE_ENGINE_PTR &my_reg_engine
	#else
	#define RE_ENGINE_PTR &PL_core_reg_engine
	#endif
	4490
	4491	#ifndef PERL_IN_XSUB_RE
	4492	REGEXP *
	4493	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4494	{
	4495	dVAR;
	4496	HV * const table = GvHV(PL_hintgv);
	4497
	4498	PERL_ARGS_ASSERT_PREGCOMP;
	4499
	4500	/* Dispatch a request to compile a regexp to correct
	4501	regexp engine. */
	4502	if (table) {
	4503	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4504	GET_RE_DEBUG_FLAGS_DECL;
	4505	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4506	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4507	DEBUG_COMPILE_r({
	4508	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4509	SvIV(*ptr));
	4510	});
	4511	return CALLREGCOMP_ENG(eng, pattern, flags);
	4512	}
	4513	}
	4514	return Perl_re_compile(aTHX_ pattern, flags);
	4515	}
	4516	#endif
	4517
	4518	REGEXP *
	4519	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4520	{
	4521	dVAR;
	4522	REGEXP *rx;
	4523	struct regexp *r;
	4524	register regexp_internal *ri;
	4525	STRLEN plen;
	4526	char* VOL exp;
	4527	char* xend;
	4528	regnode *scan;
	4529	I32 flags;
	4530	I32 minlen = 0;
	4531	U32 pm_flags;
	4532
	4533	/* these are all flags - maybe they should be turned
	4534	* into a single int with different bit masks */
	4535	I32 sawlookahead = 0;
	4536	I32 sawplus = 0;
	4537	I32 sawopen = 0;
	4538	bool used_setjump = FALSE;
	4539	regex_charset initial_charset = get_regex_charset(orig_pm_flags);
	4540
	4541	U8 jump_ret = 0;
	4542	dJMPENV;
	4543	scan_data_t data;
	4544	RExC_state_t RExC_state;
	4545	RExC_state_t * const pRExC_state = &RExC_state;
	4546	#ifdef TRIE_STUDY_OPT
	4547	int restudied;
	4548	RExC_state_t copyRExC_state;
	4549	#endif
	4550	GET_RE_DEBUG_FLAGS_DECL;
	4551
	4552	PERL_ARGS_ASSERT_RE_COMPILE;
	4553
	4554	DEBUG_r(if (!PL_colorset) reginitcolors());
	4555
	4556	exp = SvPV(pattern, plen);
	4557
	4558	if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
	4559	RExC_utf8 = RExC_orig_utf8 = 0;
	4560	}
	4561	else {
	4562	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4563	}
	4564	RExC_uni_semantics = 0;
	4565	RExC_contains_locale = 0;
	4566
	4567	/**************** LONG JUMP TARGET HERE*********************/
	4568	/* Longjmp back to here if have to switch in midstream to utf8 */
	4569	if (! RExC_orig_utf8) {
	4570	JMPENV_PUSH(jump_ret);
	4571	used_setjump = TRUE;
	4572	}
	4573
	4574	if (jump_ret == 0) { /* First time through */
	4575	xend = exp + plen;
	4576
	4577	DEBUG_COMPILE_r({
	4578	SV *dsv= sv_newmortal();
	4579	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4580	dsv, exp, plen, 60);
	4581	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4582	PL_colors[4],PL_colors[5],s);
	4583	});
	4584	}
	4585	else { /* longjumped back */
	4586	STRLEN len = plen;
	4587
	4588	/* If the cause for the longjmp was other than changing to utf8, pop
	4589	* our own setjmp, and longjmp to the correct handler */
	4590	if (jump_ret != UTF8_LONGJMP) {
	4591	JMPENV_POP;
	4592	JMPENV_JUMP(jump_ret);
	4593	}
	4594
	4595	GET_RE_DEBUG_FLAGS;
	4596
	4597	/* It's possible to write a regexp in ascii that represents Unicode
	4598	codepoints outside of the byte range, such as via \x{100}. If we
	4599	detect such a sequence we have to convert the entire pattern to utf8
	4600	and then recompile, as our sizing calculation will have been based
	4601	on 1 byte == 1 character, but we will need to use utf8 to encode
	4602	at least some part of the pattern, and therefore must convert the whole
	4603	thing.
	4604	-- dmq */
	4605	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4606	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4607	exp = (char*)Perl_bytes_to_utf8(aTHX_
	4608	(U8*)SvPV_nomg(pattern, plen),
	4609	&len);
	4610	xend = exp + len;
	4611	RExC_orig_utf8 = RExC_utf8 = 1;
	4612	SAVEFREEPV(exp);
	4613	}
	4614
	4615	#ifdef TRIE_STUDY_OPT
	4616	restudied = 0;
	4617	#endif
	4618
	4619	pm_flags = orig_pm_flags;
	4620
	4621	if (initial_charset == REGEX_LOCALE_CHARSET) {
	4622	RExC_contains_locale = 1;
	4623	}
	4624	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	4625
	4626	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4627	* 'depends' charset specified, as it means unicode when utf8 */
	4628	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4629	}
	4630
	4631	RExC_precomp = exp;
	4632	RExC_flags = pm_flags;
	4633	RExC_sawback = 0;
	4634
	4635	RExC_seen = 0;
	4636	RExC_in_lookbehind = 0;
	4637	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4638	RExC_seen_evals = 0;
	4639	RExC_extralen = 0;
	4640	RExC_override_recoding = 0;
	4641
	4642	/* First pass: determine size, legality. */
	4643	RExC_parse = exp;
	4644	RExC_start = exp;
	4645	RExC_end = xend;
	4646	RExC_naughty = 0;
	4647	RExC_npar = 1;
	4648	RExC_nestroot = 0;
	4649	RExC_size = 0L;
	4650	RExC_emit = &PL_regdummy;
	4651	RExC_whilem_seen = 0;
	4652	RExC_open_parens = NULL;
	4653	RExC_close_parens = NULL;
	4654	RExC_opend = NULL;
	4655	RExC_paren_names = NULL;
	4656	#ifdef DEBUGGING
	4657	RExC_paren_name_list = NULL;
	4658	#endif
	4659	RExC_recurse = NULL;
	4660	RExC_recurse_count = 0;
	4661
	4662	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4663	* Clever compilers notice this and complain. --jhi */
	4664	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4665	#endif
	4666	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
	4667	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4668	RExC_precomp = NULL;
	4669	return(NULL);
	4670	}
	4671
	4672	/* Here, finished first pass. Get rid of any added setjmp */
	4673	if (used_setjump) {
	4674	JMPENV_POP;
	4675	}
	4676
	4677	DEBUG_PARSE_r({
	4678	PerlIO_printf(Perl_debug_log,
	4679	"Required size %"IVdf" nodes\n"
	4680	"Starting second pass (creation)\n",
	4681	(IV)RExC_size);
	4682	RExC_lastnum=0;
	4683	RExC_lastparse=NULL;
	4684	});
	4685
	4686	/* The first pass could have found things that force Unicode semantics */
	4687	if ((RExC_utf8 \|\| RExC_uni_semantics)
	4688	&& get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
	4689	{
	4690	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4691	}
	4692
	4693	/* Small enough for pointer-storage convention?
	4694	If extralen==0, this means that we will not need long jumps. */
	4695	if (RExC_size >= 0x10000L && RExC_extralen)
	4696	RExC_size += RExC_extralen;
	4697	else
	4698	RExC_extralen = 0;
	4699	if (RExC_whilem_seen > 15)
	4700	RExC_whilem_seen = 15;
	4701
	4702	/* Allocate space and zero-initialize. Note, the two step process
	4703	of zeroing when in debug mode, thus anything assigned has to
	4704	happen after that */
	4705	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	4706	r = (struct regexp*)SvANY(rx);
	4707	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	4708	char, regexp_internal);
	4709	if ( r == NULL \|\| ri == NULL )
	4710	FAIL("Regexp out of space");
	4711	#ifdef DEBUGGING
	4712	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	4713	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	4714	#else
	4715	/* bulk initialize base fields with 0. */
	4716	Zero(ri, sizeof(regexp_internal), char);
	4717	#endif
	4718
	4719	/* non-zero initialization begins here */
	4720	RXi_SET( r, ri );
	4721	r->engine= RE_ENGINE_PTR;
	4722	r->extflags = pm_flags;
	4723	{
	4724	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	4725	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	4726
	4727	/* The caret is output if there are any defaults: if not all the STD
	4728	* flags are set, or if no character set specifier is needed */
	4729	bool has_default =
	4730	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	4731	\|\| ! has_charset);
	4732	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	4733	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	4734	>> RXf_PMf_STD_PMMOD_SHIFT);
	4735	const char fptr = STD_PAT_MODS; /"msix"*/
	4736	char *p;
	4737	/* Allocate for the worst case, which is all the std flags are turned
	4738	* on. If more precision is desired, we could do a population count of
	4739	* the flags set. This could be done with a small lookup table, or by
	4740	* shifting, masking and adding, or even, when available, assembly
	4741	* language for a machine-language population count.
	4742	* We never output a minus, as all those are defaults, so are
	4743	* covered by the caret */
	4744	const STRLEN wraplen = plen + has_p + has_runon
	4745	+ has_default /* If needs a caret */
	4746
	4747	/* If needs a character set specifier */
	4748	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	4749	+ (sizeof(STD_PAT_MODS) - 1)
	4750	+ (sizeof("(?:)") - 1);
	4751
	4752	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	4753	SvPOK_on(rx);
	4754	SvFLAGS(rx) \|= SvUTF8(pattern);
	4755	p++='('; p++='?';
	4756
	4757	/* If a default, cover it using the caret */
	4758	if (has_default) {
	4759	*p++= DEFAULT_PAT_MOD;
	4760	}
	4761	if (has_charset) {
	4762	STRLEN len;
	4763	const char* const name = get_regex_charset_name(r->extflags, &len);
	4764	Copy(name, p, len, char);
	4765	p += len;
	4766	}
	4767	if (has_p)
	4768	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	4769	{
	4770	char ch;
	4771	while((ch = *fptr++)) {
	4772	if(reganch & 1)
	4773	*p++ = ch;
	4774	reganch >>= 1;
	4775	}
	4776	}
	4777
	4778	*p++ = ':';
	4779	Copy(RExC_precomp, p, plen, char);
	4780	assert ((RX_WRAPPED(rx) - p) < 16);
	4781	r->pre_prefix = p - RX_WRAPPED(rx);
	4782	p += plen;
	4783	if (has_runon)
	4784	*p++ = '\n';
	4785	*p++ = ')';
	4786	*p = 0;
	4787	SvCUR_set(rx, p - SvPVX_const(rx));
	4788	}
	4789
	4790	r->intflags = 0;
	4791	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	4792
	4793	if (RExC_seen & REG_SEEN_RECURSE) {
	4794	Newxz(RExC_open_parens, RExC_npar,regnode *);
	4795	SAVEFREEPV(RExC_open_parens);
	4796	Newxz(RExC_close_parens,RExC_npar,regnode *);
	4797	SAVEFREEPV(RExC_close_parens);
	4798	}
	4799
	4800	/* Useful during FAIL. */
	4801	#ifdef RE_TRACK_PATTERN_OFFSETS
	4802	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	4803	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	4804	"%s %"UVuf" bytes for offset annotations.\n",
	4805	ri->u.offsets ? "Got" : "Couldn't get",
	4806	(UV)((2RExC_size+1) sizeof(U32))));
	4807	#endif
	4808	SetProgLen(ri,RExC_size);
	4809	RExC_rx_sv = rx;
	4810	RExC_rx = r;
	4811	RExC_rxi = ri;
	4812
	4813	/* Second pass: emit code. */
	4814	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	4815	RExC_parse = exp;
	4816	RExC_end = xend;
	4817	RExC_naughty = 0;
	4818	RExC_npar = 1;
	4819	RExC_emit_start = ri->program;
	4820	RExC_emit = ri->program;
	4821	RExC_emit_bound = ri->program + RExC_size + 1;
	4822
	4823	/* Store the count of eval-groups for security checks: */
	4824	RExC_rx->seen_evals = RExC_seen_evals;
	4825	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	4826	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4827	ReREFCNT_dec(rx);
	4828	return(NULL);
	4829	}
	4830	/* XXXX To minimize changes to RE engine we always allocate
	4831	3-units-long substrs field. */
	4832	Newx(r->substrs, 1, struct reg_substr_data);
	4833	if (RExC_recurse_count) {
	4834	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	4835	SAVEFREEPV(RExC_recurse);
	4836	}
	4837
	4838	reStudy:
	4839	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	4840	Zero(r->substrs, 1, struct reg_substr_data);
	4841
	4842	#ifdef TRIE_STUDY_OPT
	4843	if (!restudied) {
	4844	StructCopy(&zero_scan_data, &data, scan_data_t);
	4845	copyRExC_state = RExC_state;
	4846	} else {
	4847	U32 seen=RExC_seen;
	4848	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	4849
	4850	RExC_state = copyRExC_state;
	4851	if (seen & REG_TOP_LEVEL_BRANCHES)
	4852	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	4853	else
	4854	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	4855	if (data.last_found) {
	4856	SvREFCNT_dec(data.longest_fixed);
	4857	SvREFCNT_dec(data.longest_float);
	4858	SvREFCNT_dec(data.last_found);
	4859	}
	4860	StructCopy(&zero_scan_data, &data, scan_data_t);
	4861	}
	4862	#else
	4863	StructCopy(&zero_scan_data, &data, scan_data_t);
	4864	#endif
	4865
	4866	/* Dig out information for optimizations. */
	4867	r->extflags = RExC_flags; /* was pm_op */
	4868	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	4869
	4870	if (UTF)
	4871	SvUTF8_on(rx); /* Unicode in it? */
	4872	ri->regstclass = NULL;
	4873	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	4874	r->intflags \|= PREGf_NAUGHTY;
	4875	scan = ri->program + 1; /* First BRANCH. */
	4876
	4877	/* testing for BRANCH here tells us whether there is "must appear"
	4878	data in the pattern. If there is then we can use it for optimisations */
	4879	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	4880	I32 fake;
	4881	STRLEN longest_float_length, longest_fixed_length;
	4882	struct regnode_charclass_class ch_class; /* pointed to by data */
	4883	int stclass_flag;
	4884	I32 last_close = 0; /* pointed to by data */
	4885	regnode *first= scan;
	4886	regnode *first_next= regnext(first);
	4887	/*
	4888	* Skip introductions and multiplicators >= 1
	4889	* so that we can extract the 'meat' of the pattern that must
	4890	* match in the large if() sequence following.
	4891	* NOTE that EXACT is NOT covered here, as it is normally
	4892	* picked up by the optimiser separately.
	4893	*
	4894	* This is unfortunate as the optimiser isnt handling lookahead
	4895	* properly currently.
	4896	*
	4897	*/
	4898	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	4899	/* An OR of one alternative - should not happen now. */
	4900	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	4901	/* for now we can't handle lookbehind IFMATCH*/
	4902	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	4903	(OP(first) == PLUS) \|\|
	4904	(OP(first) == MINMOD) \|\|
	4905	/* An {n,m} with n>0 */
	4906	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	4907	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	4908	{
	4909	/*
	4910	* the only op that could be a regnode is PLUS, all the rest
	4911	* will be regnode_1 or regnode_2.
	4912	*
	4913	*/
	4914	if (OP(first) == PLUS)
	4915	sawplus = 1;
	4916	else
	4917	first += regarglen[OP(first)];
	4918
	4919	first = NEXTOPER(first);
	4920	first_next= regnext(first);
	4921	}
	4922
	4923	/* Starting-point info. */
	4924	again:
	4925	DEBUG_PEEP("first:",first,0);
	4926	/* Ignore EXACT as we deal with it later. */
	4927	if (PL_regkind[OP(first)] == EXACT) {
	4928	if (OP(first) == EXACT)
	4929	NOOP; /* Empty, get anchored substr later. */
	4930	else
	4931	ri->regstclass = first;
	4932	}
	4933	#ifdef TRIE_STCLASS
	4934	else if (PL_regkind[OP(first)] == TRIE &&
	4935	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	4936	{
	4937	regnode *trie_op;
	4938	/* this can happen only on restudy */
	4939	if ( OP(first) == TRIE ) {
	4940	struct regnode_1 trieop = (struct regnode_1 )
	4941	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	4942	StructCopy(first,trieop,struct regnode_1);
	4943	trie_op=(regnode *)trieop;
	4944	} else {
	4945	struct regnode_charclass trieop = (struct regnode_charclass )
	4946	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	4947	StructCopy(first,trieop,struct regnode_charclass);
	4948	trie_op=(regnode *)trieop;
	4949	}
	4950	OP(trie_op)+=2;
	4951	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	4952	ri->regstclass = trie_op;
	4953	}
	4954	#endif
	4955	else if (REGNODE_SIMPLE(OP(first)))
	4956	ri->regstclass = first;
	4957	else if (PL_regkind[OP(first)] == BOUND \|\|
	4958	PL_regkind[OP(first)] == NBOUND)
	4959	ri->regstclass = first;
	4960	else if (PL_regkind[OP(first)] == BOL) {
	4961	r->extflags \|= (OP(first) == MBOL
	4962	? RXf_ANCH_MBOL
	4963	: (OP(first) == SBOL
	4964	? RXf_ANCH_SBOL
	4965	: RXf_ANCH_BOL));
	4966	first = NEXTOPER(first);
	4967	goto again;
	4968	}
	4969	else if (OP(first) == GPOS) {
	4970	r->extflags \|= RXf_ANCH_GPOS;
	4971	first = NEXTOPER(first);
	4972	goto again;
	4973	}
	4974	else if ((!sawopen \|\| !RExC_sawback) &&
	4975	(OP(first) == STAR &&
	4976	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	4977	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	4978	{
	4979	/* turn .* into ^.* with an implied $=1 /
	4980	const int type =
	4981	(OP(NEXTOPER(first)) == REG_ANY)
	4982	? RXf_ANCH_MBOL
	4983	: RXf_ANCH_SBOL;
	4984	r->extflags \|= type;
	4985	r->intflags \|= PREGf_IMPLICIT;
	4986	first = NEXTOPER(first);
	4987	goto again;
	4988	}
	4989	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	4990	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	4991	/* x+ must match at the 1st pos of run of x's */
	4992	r->intflags \|= PREGf_SKIP;
	4993
	4994	/* Scan is after the zeroth branch, first is atomic matcher. */
	4995	#ifdef TRIE_STUDY_OPT
	4996	DEBUG_PARSE_r(
	4997	if (!restudied)
	4998	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4999	(IV)(first - scan + 1))
	5000	);
	5001	#else
	5002	DEBUG_PARSE_r(
	5003	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5004	(IV)(first - scan + 1))
	5005	);
	5006	#endif
	5007
	5008
	5009	/*
	5010	* If there's something expensive in the r.e., find the
	5011	* longest literal string that must appear and make it the
	5012	* regmust. Resolve ties in favor of later strings, since
	5013	* the regstart check works with the beginning of the r.e.
	5014	* and avoiding duplication strengthens checking. Not a
	5015	* strong reason, but sufficient in the absence of others.
	5016	* [Now we resolve ties in favor of the earlier string if
	5017	* it happens that c_offset_min has been invalidated, since the
	5018	* earlier string may buy us something the later one won't.]
	5019	*/
	5020
	5021	data.longest_fixed = newSVpvs("");
	5022	data.longest_float = newSVpvs("");
	5023	data.last_found = newSVpvs("");
	5024	data.longest = &(data.longest_fixed);
	5025	first = scan;
	5026	if (!ri->regstclass) {
	5027	cl_init(pRExC_state, &ch_class);
	5028	data.start_class = &ch_class;
	5029	stclass_flag = SCF_DO_STCLASS_AND;
	5030	} else /* XXXX Check for BOUND? */
	5031	stclass_flag = 0;
	5032	data.last_closep = &last_close;
	5033
	5034	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	5035	&data, -1, NULL, NULL,
	5036	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	5037
	5038
	5039	CHECK_RESTUDY_GOTO;
	5040
	5041
	5042	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	5043	&& data.last_start_min == 0 && data.last_end > 0
	5044	&& !RExC_seen_zerolen
	5045	&& !(RExC_seen & REG_SEEN_VERBARG)
	5046	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	5047	r->extflags \|= RXf_CHECK_ALL;
	5048	scan_commit(pRExC_state, &data,&minlen,0);
	5049	SvREFCNT_dec(data.last_found);
	5050
	5051	/* Note that code very similar to this but for anchored string
	5052	follows immediately below, changes may need to be made to both.
	5053	Be careful.
	5054	*/
	5055	longest_float_length = CHR_SVLEN(data.longest_float);
	5056	if (longest_float_length
	5057	\|\| (data.flags & SF_FL_BEFORE_EOL
	5058	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5059	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5060	{
	5061	I32 t,ml;
	5062
	5063	if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	5064	&& data.offset_fixed == data.offset_float_min
	5065	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
	5066	goto remove_float; /* As in (a)+. */
	5067
	5068	/* copy the information about the longest float from the reg_scan_data
	5069	over to the program. */
	5070	if (SvUTF8(data.longest_float)) {
	5071	r->float_utf8 = data.longest_float;
	5072	r->float_substr = NULL;
	5073	} else {
	5074	r->float_substr = data.longest_float;
	5075	r->float_utf8 = NULL;
	5076	}
	5077	/* float_end_shift is how many chars that must be matched that
	5078	follow this item. We calculate it ahead of time as once the
	5079	lookbehind offset is added in we lose the ability to correctly
	5080	calculate it.*/
	5081	ml = data.minlen_float ? *(data.minlen_float)
	5082	: (I32)longest_float_length;
	5083	r->float_end_shift = ml - data.offset_float_min
	5084	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	5085	+ data.lookbehind_float;
	5086	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	5087	r->float_max_offset = data.offset_float_max;
	5088	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	5089	r->float_max_offset -= data.lookbehind_float;
	5090
	5091	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	5092	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5093	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5094	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	5095	}
	5096	else {
	5097	remove_float:
	5098	r->float_substr = r->float_utf8 = NULL;
	5099	SvREFCNT_dec(data.longest_float);
	5100	longest_float_length = 0;
	5101	}
	5102
	5103	/* Note that code very similar to this but for floating string
	5104	is immediately above, changes may need to be made to both.
	5105	Be careful.
	5106	*/
	5107	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	5108	if (longest_fixed_length
	5109	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	5110	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5111	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5112	{
	5113	I32 t,ml;
	5114
	5115	/* copy the information about the longest fixed
	5116	from the reg_scan_data over to the program. */
	5117	if (SvUTF8(data.longest_fixed)) {
	5118	r->anchored_utf8 = data.longest_fixed;
	5119	r->anchored_substr = NULL;
	5120	} else {
	5121	r->anchored_substr = data.longest_fixed;
	5122	r->anchored_utf8 = NULL;
	5123	}
	5124	/* fixed_end_shift is how many chars that must be matched that
	5125	follow this item. We calculate it ahead of time as once the
	5126	lookbehind offset is added in we lose the ability to correctly
	5127	calculate it.*/
	5128	ml = data.minlen_fixed ? *(data.minlen_fixed)
	5129	: (I32)longest_fixed_length;
	5130	r->anchored_end_shift = ml - data.offset_fixed
	5131	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	5132	+ data.lookbehind_fixed;
	5133	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	5134
	5135	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	5136	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5137	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5138	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	5139	}
	5140	else {
	5141	r->anchored_substr = r->anchored_utf8 = NULL;
	5142	SvREFCNT_dec(data.longest_fixed);
	5143	longest_fixed_length = 0;
	5144	}
	5145	if (ri->regstclass
	5146	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	5147	ri->regstclass = NULL;
	5148
	5149	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	5150	&& stclass_flag
	5151	&& !(data.start_class->flags & ANYOF_EOS)
	5152	&& !cl_is_anything(data.start_class))
	5153	{
	5154	const U32 n = add_data(pRExC_state, 1, "f");
	5155	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5156
	5157	Newx(RExC_rxi->data->data[n], 1,
	5158	struct regnode_charclass_class);
	5159	StructCopy(data.start_class,
	5160	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5161	struct regnode_charclass_class);
	5162	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5163	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5164	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	5165	regprop(r, sv, (regnode*)data.start_class);
	5166	PerlIO_printf(Perl_debug_log,
	5167	"synthetic stclass \"%s\".\n",
	5168	SvPVX_const(sv));});
	5169	}
	5170
	5171	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	5172	if (longest_fixed_length > longest_float_length) {
	5173	r->check_end_shift = r->anchored_end_shift;
	5174	r->check_substr = r->anchored_substr;
	5175	r->check_utf8 = r->anchored_utf8;
	5176	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	5177	if (r->extflags & RXf_ANCH_SINGLE)
	5178	r->extflags \|= RXf_NOSCAN;
	5179	}
	5180	else {
	5181	r->check_end_shift = r->float_end_shift;
	5182	r->check_substr = r->float_substr;
	5183	r->check_utf8 = r->float_utf8;
	5184	r->check_offset_min = r->float_min_offset;
	5185	r->check_offset_max = r->float_max_offset;
	5186	}
	5187	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5188	This should be changed ASAP! */
	5189	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5190	r->extflags \|= RXf_USE_INTUIT;
	5191	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5192	r->extflags \|= RXf_INTUIT_TAIL;
	5193	}
	5194	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5195	if ( (STRLEN)minlen < longest_float_length )
	5196	minlen= longest_float_length;
	5197	if ( (STRLEN)minlen < longest_fixed_length )
	5198	minlen= longest_fixed_length;
	5199	*/
	5200	}
	5201	else {
	5202	/* Several toplevels. Best we can is to set minlen. */
	5203	I32 fake;
	5204	struct regnode_charclass_class ch_class;
	5205	I32 last_close = 0;
	5206
	5207	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5208
	5209	scan = ri->program + 1;
	5210	cl_init(pRExC_state, &ch_class);
	5211	data.start_class = &ch_class;
	5212	data.last_closep = &last_close;
	5213
	5214
	5215	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5216	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5217
	5218	CHECK_RESTUDY_GOTO;
	5219
	5220	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5221	= r->float_substr = r->float_utf8 = NULL;
	5222
	5223	if (!(data.start_class->flags & ANYOF_EOS)
	5224	&& !cl_is_anything(data.start_class))
	5225	{
	5226	const U32 n = add_data(pRExC_state, 1, "f");
	5227	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5228
	5229	Newx(RExC_rxi->data->data[n], 1,
	5230	struct regnode_charclass_class);
	5231	StructCopy(data.start_class,
	5232	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5233	struct regnode_charclass_class);
	5234	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5235	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5236	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5237	regprop(r, sv, (regnode*)data.start_class);
	5238	PerlIO_printf(Perl_debug_log,
	5239	"synthetic stclass \"%s\".\n",
	5240	SvPVX_const(sv));});
	5241	}
	5242	}
	5243
	5244	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5245	the "real" pattern. */
	5246	DEBUG_OPTIMISE_r({
	5247	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5248	(IV)minlen, (IV)r->minlen);
	5249	});
	5250	r->minlenret = minlen;
	5251	if (r->minlen < minlen)
	5252	r->minlen = minlen;
	5253
	5254	if (RExC_seen & REG_SEEN_GPOS)
	5255	r->extflags \|= RXf_GPOS_SEEN;
	5256	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5257	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5258	if (RExC_seen & REG_SEEN_EVAL)
	5259	r->extflags \|= RXf_EVAL_SEEN;
	5260	if (RExC_seen & REG_SEEN_CANY)
	5261	r->extflags \|= RXf_CANY_SEEN;
	5262	if (RExC_seen & REG_SEEN_VERBARG)
	5263	r->intflags \|= PREGf_VERBARG_SEEN;
	5264	if (RExC_seen & REG_SEEN_CUTGROUP)
	5265	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5266	if (RExC_paren_names)
	5267	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5268	else
	5269	RXp_PAREN_NAMES(r) = NULL;
	5270
	5271	#ifdef STUPID_PATTERN_CHECKS
	5272	if (RX_PRELEN(rx) == 0)
	5273	r->extflags \|= RXf_NULL;
	5274	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5275	/* XXX: this should happen BEFORE we compile */
	5276	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5277	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5278	r->extflags \|= RXf_WHITE;
	5279	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5280	r->extflags \|= RXf_START_ONLY;
	5281	#else
	5282	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5283	/* XXX: this should happen BEFORE we compile */
	5284	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5285	else {
	5286	regnode *first = ri->program + 1;
	5287	U8 fop = OP(first);
	5288
	5289	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	5290	r->extflags \|= RXf_NULL;
	5291	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	5292	r->extflags \|= RXf_START_ONLY;
	5293	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	5294	&& OP(regnext(first)) == END)
	5295	r->extflags \|= RXf_WHITE;
	5296	}
	5297	#endif
	5298	#ifdef DEBUGGING
	5299	if (RExC_paren_names) {
	5300	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5301	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5302	} else
	5303	#endif
	5304	ri->name_list_idx = 0;
	5305
	5306	if (RExC_recurse_count) {
	5307	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5308	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5309	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5310	}
	5311	}
	5312	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5313	/* assume we don't need to swap parens around before we match */
	5314
	5315	DEBUG_DUMP_r({
	5316	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5317	regdump(r);
	5318	});
	5319	#ifdef RE_TRACK_PATTERN_OFFSETS
	5320	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5321	const U32 len = ri->u.offsets[0];
	5322	U32 i;
	5323	GET_RE_DEBUG_FLAGS_DECL;
	5324	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5325	for (i = 1; i <= len; i++) {
	5326	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5327	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5328	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5329	}
	5330	PerlIO_printf(Perl_debug_log, "\n");
	5331	});
	5332	#endif
	5333	return rx;
	5334	}
	5335
	5336	#undef RE_ENGINE_PTR
	5337
	5338
	5339	SV*
	5340	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5341	                    const U32 flags)
	5342	{
		    /* Dispatch a named-capture-buffer request according to the RXapif_*
		     * bits set in 'flags':
		     *   FETCH                  -> reg_named_buff_fetch(rx, key, flags)
		     *   STORE / DELETE / CLEAR -> Perl_croak_no_modify()
		     *   EXISTS                 -> &PL_sv_yes or &PL_sv_no
		     *   REGNAMES               -> reg_named_buff_all(rx, flags)
		     *   SCALAR / REGNAMES_COUNT-> reg_named_buff_scalar(rx, flags)
		     * The tests are made in that order, so the first matching bit wins.
		     * Any other value of 'flags' croaks with a panic message.
		     * 'value' is not used by this implementation. */
	5343	    PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5344
	5345	    PERL_UNUSED_ARG(value);
	5346
	5347	    if (flags & RXapif_FETCH) {
	5348	        return reg_named_buff_fetch(rx, key, flags);
	5349	    } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
	5350	        Perl_croak_no_modify(aTHX);
	5351	        return NULL;
	5352	    } else if (flags & RXapif_EXISTS) {
	5353	        return reg_named_buff_exists(rx, key, flags)
	5354	            ? &PL_sv_yes
	5355	            : &PL_sv_no;
	5356	    } else if (flags & RXapif_REGNAMES) {
	5357	        return reg_named_buff_all(rx, flags);
	5358	    } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
	5359	        return reg_named_buff_scalar(rx, flags);
	5360	    } else {
	5361	        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5362	        return NULL;
	5363	    }
	5364	}
	5365
	5366	SV*
	5367	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5368	                         const U32 flags)
	5369	{
		    /* Iterator entry point for the named capture buffers of 'rx'.
		     * RXapif_FIRSTKEY is tested first and forwards to
		     * reg_named_buff_firstkey(); otherwise RXapif_NEXTKEY forwards to
		     * reg_named_buff_nextkey().  With neither bit set the call croaks.
		     * 'lastkey' is not consulted. */
	5370	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5371	    PERL_UNUSED_ARG(lastkey);
	5372
	5373	    if (flags & RXapif_FIRSTKEY) {
	5374	        return reg_named_buff_firstkey(rx, flags);
	5375	    }
	5376	    if (!(flags & RXapif_NEXTKEY)) {
	5377	        /* Neither iteration flag was supplied. */
	5378	        Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5379	        return NULL;
	5380	    }
		    return reg_named_buff_nextkey(rx, flags);
	5381	}
	5382
	5383	SV*
	5384	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5385	                          const U32 flags)
	5386	{
		    /* Look up the capture group(s) registered under the name 'namesv'.
		     *
		     * Without RXapif_ALL: returns a new SV holding the contents of the
		     * first group of that name whose start and end offsets are both set,
		     * or NULL if there is none.
		     * With RXapif_ALL: returns a new reference to an array with one
		     * element per group of that name (a copy of undef for groups that
		     * did not take part in the match).
		     * Returns NULL when the regexp has no named groups or the name is
		     * not present. */
	5388	    SV *ret;
	5389	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5390
	5391	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5392
	5396	    if (rx && RXp_PAREN_NAMES(rx)) {
	5397	        HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5398	        if (he_str) {
	5399	            IV i;
	5400	            SV* sv_dat=HeVAL(he_str);
	5401	            I32 *nums=(I32*)SvPVX(sv_dat);
		            /* Allocate the result array only once the name is known to
		             * exist, so the NULL return paths cannot leak it. */
		            AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
	5402	            for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5403	                if ((I32)(rx->nparens) >= nums[i]
	5404	                    && rx->offs[nums[i]].start != -1
	5405	                    && rx->offs[nums[i]].end != -1)
	5406	                {
	5407	                    ret = newSVpvs("");
	5408	                    CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5409	                    if (!retarray)
	5410	                        return ret;
		                    av_push(retarray, ret);
	5411	                } else if (retarray) {
		                    /* Only materialise the undef placeholder when it
		                     * will actually be stored. */
	5412	                    ret = newSVsv(&PL_sv_undef);
	5415	                    av_push(retarray, ret);
	5413	                }
	5416	            }
	5417	            if (retarray)
	5418	                return newRV_noinc(MUTABLE_SV(retarray));
	5419	        }
	5420	    }
	5421	    return NULL;
	5422	}
	5423
	5424	bool
	5425	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5426	                           const U32 flags)
	5427	{
		    /* Report whether the named capture 'key' exists.
		     * With RXapif_ALL the answer is simply whether the name is a key of
		     * the regexp's paren-names hash.  Otherwise the name is fetched via
		     * CALLREG_NAMED_BUFF_FETCH and the result is TRUE only if that
		     * returned an SV (which is released here).
		     * Returns FALSE when the regexp has no named groups. */
	5428	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5429
	5430	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5431
	5432	    if (rx && RXp_PAREN_NAMES(rx)) {
	5433	        if (flags & RXapif_ALL) {
	5434	            return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5435	        } else {
	5436	            SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5437	            if (sv) {
	5438	                SvREFCNT_dec(sv);
	5439	                return TRUE;
	5440	            } else {
	5441	                return FALSE;
	5442	            }
	5443	        }
	5444	    } else {
	5445	        return FALSE;
	5446	    }
	5447	}
	5448
	5449	SV*
	5450	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5451	{
		    /* Start iterating over the capture names of 'r': reset the iterator
		     * of the paren-names hash, then hand over to
		     * CALLREG_NAMED_BUFF_NEXTKEY with RXapif_FIRSTKEY cleared from
		     * 'flags'.  Returns NULL when the regexp has no named groups. */
	5452	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5453
	5454	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5455
	5456	    if ( rx && RXp_PAREN_NAMES(rx) ) {
	5457	        (void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5458
	5459	        return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5460	    } else {
		        /* This function returns a pointer, so use NULL rather than the
		         * boolean constant FALSE. */
	5461	        return NULL;
	5462	    }
	5463	}
	5464
	5465	SV*
	5466	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5467	{
		    /* Advance the iterator of the paren-names hash and return a new SV
		     * holding the next capture name.  A name is returned only if one of
		     * its group numbers is <= rx->lastparen and has both offsets set,
		     * unless RXapif_ALL is given, in which case every name is returned.
		     * Returns NULL when the hash is exhausted or the regexp has no
		     * named groups.  hv_iterinit() is not called here; the iteration
		     * continues from the hash's current iterator position. */
	5468	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5469	    GET_RE_DEBUG_FLAGS_DECL;
	5470
	5471	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5472
	5473	    if (rx && RXp_PAREN_NAMES(rx)) {
	5474	        HV *hv = RXp_PAREN_NAMES(rx);
	5475	        HE *temphe;
	5476	        while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5477	            IV i;
	5478	            IV parno = 0;
	5479	            SV* sv_dat = HeVAL(temphe);
		            /* The PV buffer is read as an array of I32 group numbers;
		             * SvIVX gives the number of entries. */
	5480	            I32 *nums = (I32*)SvPVX(sv_dat);
	5481	            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5482	                if ((I32)(rx->lastparen) >= nums[i] &&
	5483	                    rx->offs[nums[i]].start != -1 &&
	5484	                    rx->offs[nums[i]].end != -1)
	5485	                {
	5486	                    parno = nums[i];
	5487	                    break;
	5488	                }
	5489	            }
	5490	            if (parno || flags & RXapif_ALL) {
	5491	                return newSVhek(HeKEY_hek(temphe));
	5492	            }
	5493	        }
	5494	    }
	5495	    return NULL;
	5496	}
	5497
	5498	SV*
	5499	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5500	{
		    /* Return the number of capture names as a new IV.
		     * With RXapif_ALL or RXapif_REGNAMES_COUNT this is the total key
		     * count of the paren-names hash.  With RXapif_ONE the list of names
		     * is built through CALLREG_NAMED_BUFF_ALL and its element count is
		     * returned.  Any other flag combination croaks.
		     * Returns &PL_sv_undef when the regexp has no named groups. */
	5501	    SV *ret;
	5502	    AV *av;
	5503	    I32 length;
	5504	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5505
	5506	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5507
	5508	    if (rx && RXp_PAREN_NAMES(rx)) {
	5509	        if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
	5510	            return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5511	        } else if (flags & RXapif_ONE) {
	5512	            ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
	5513	            av = MUTABLE_AV(SvRV(ret));
		            /* av_len() is the highest index, hence the +1 below; read
		             * it before the reference is released. */
	5514	            length = av_len(av);
	5515	            SvREFCNT_dec(ret);
	5516	            return newSViv(length + 1);
	5517	        } else {
	5518	            Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5519	            return NULL;
	5520	        }
	5521	    }
	5522	    return &PL_sv_undef;
	5523	}
	5524
	5525	SV*
	5526	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5527	{
		    /* Build and return a new reference to an array of capture names.
		     * A name is included only if one of its group numbers is
		     * <= rx->lastparen and has both offsets set, unless RXapif_ALL is
		     * given, in which case every name is included.  The array is empty
		     * when the regexp has no named groups. */
	5528	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5529	    AV *av = newAV();
	5530
	5531	    PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5532
	5533	    if (rx && RXp_PAREN_NAMES(rx)) {
	5534	        HV *hv= RXp_PAREN_NAMES(rx);
	5535	        HE *temphe;
	5536	        (void)hv_iterinit(hv);
	5537	        while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5538	            IV i;
	5539	            IV parno = 0;
	5540	            SV* sv_dat = HeVAL(temphe);
		            /* The PV buffer is read as an array of I32 group numbers;
		             * SvIVX gives the number of entries. */
	5541	            I32 *nums = (I32*)SvPVX(sv_dat);
	5542	            for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5543	                if ((I32)(rx->lastparen) >= nums[i] &&
	5544	                    rx->offs[nums[i]].start != -1 &&
	5545	                    rx->offs[nums[i]].end != -1)
	5546	                {
	5547	                    parno = nums[i];
	5548	                    break;
	5549	                }
	5550	            }
	5551	            if (parno || flags & RXapif_ALL) {
	5552	                av_push(av, newSVhek(HeKEY_hek(temphe)));
	5553	            }
	5554	        }
	5555	    }
	5556
	5557	    return newRV_noinc(MUTABLE_SV(av));
	5558	}
	5559
	5560	void
	5561	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5562	                             SV * const sv)
	5563	{
		    /* Store into 'sv' the text of capture buffer 'paren' from the last
		     * match recorded in 'r'.
		     *   RX_BUFF_IDX_PREMATCH  -> text before the match ($`)
		     *   RX_BUFF_IDX_POSTMATCH -> text after the match  ($')
		     *   0 .. rx->nparens      -> $&, $1, ...
		     * 'sv' is set to undef when there is no saved subject string, when
		     * the requested buffer has no offsets, or when the computed length
		     * is negative.  The UTF-8 flag and taintedness of 'sv' are then set
		     * from the state of the regexp. */
	5564	    struct regexp *const rx = (struct regexp *)SvANY(r);
	5565	    char *s = NULL;
	5566	    I32 i = 0;
	5567	    I32 s1, t1;
	5568
	5569	    PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5570
	5571	    if (!rx->subbeg) {
	5572	        sv_setsv(sv,&PL_sv_undef);
	5573	        return;
	5574	    }
	5575	    else
	5576	    if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5577	        /* $` */
	5578	        i = rx->offs[0].start;
	5579	        s = rx->subbeg;
	5580	    }
	5581	    else
	5582	    if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5583	        /* $' */
	5584	        s = rx->subbeg + rx->offs[0].end;
	5585	        i = rx->sublen - rx->offs[0].end;
	5586	    }
	5587	    else
	5588	    if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5589	        (s1 = rx->offs[paren].start) != -1 &&
	5590	        (t1 = rx->offs[paren].end) != -1)
	5591	    {
	5592	        /* $& $1 ... */
	5593	        i = t1 - s1;
	5594	        s = rx->subbeg + s1;
	5595	    } else {
	5596	        sv_setsv(sv,&PL_sv_undef);
	5597	        return;
	5598	    }
		    /* The selected span must lie inside the saved subject string. */
	5599	    assert(rx->sublen >= (s - rx->subbeg) + i );
	5600	    if (i >= 0) {
		        /* Copy the text with tainting switched off, then restore the
		         * previous PL_tainted value. */
	5601	        const int oldtainted = PL_tainted;
	5602	        TAINT_NOT;
	5603	        sv_setpvn(sv, s, i);
	5604	        PL_tainted = oldtainted;
		        /* When RXf_CANY_SEEN is set, the UTF-8 flag is turned on only
		         * if the span is empty or passes is_utf8_string(). */
	5605	        if ( (rx->extflags & RXf_CANY_SEEN)
	5606	            ? (RXp_MATCH_UTF8(rx)
	5607	                && (!i || is_utf8_string((U8*)s, i)))
	5608	            : (RXp_MATCH_UTF8(rx)) )
	5609	        {
	5610	            SvUTF8_on(sv);
	5611	        }
	5612	        else
	5613	            SvUTF8_off(sv);
	5614	        if (PL_tainting) {
	5615	            if (RXp_MATCH_TAINTED(rx)) {
	5616	                if (SvTYPE(sv) >= SVt_PVMG) {
		                    /* Detach the first magic entry, call SvTAINT() on
		                     * the remaining chain, then re-link the detached
		                     * entry in front of whatever chain exists
		                     * afterwards. */
	5617	                    MAGIC* const mg = SvMAGIC(sv);
	5618	                    MAGIC* mgt;
	5619	                    PL_tainted = 1;
	5620	                    SvMAGIC_set(sv, mg->mg_moremagic);
	5621	                    SvTAINT(sv);
	5622	                    if ((mgt = SvMAGIC(sv))) {
	5623	                        mg->mg_moremagic = mgt;
	5624	                        SvMAGIC_set(sv, mg);
	5625	                    }
	5626	                } else {
	5627	                    PL_tainted = 1;
	5628	                    SvTAINT(sv);
	5629	                }
	5630	            } else
	5631	                SvTAINTED_off(sv);
	5632	        }
	5633	    } else {
	5634	        sv_setsv(sv,&PL_sv_undef);
	5635	        return;
	5636	    }
	5637	}
	5638
	5639	void
	5640	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5641	SV const * const value)
	5642	{
	5643	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5644
	5645	PERL_UNUSED_ARG(rx);
	5646	PERL_UNUSED_ARG(paren);
	5647	PERL_UNUSED_ARG(value);
	5648
	5649	if (!PL_localizing)
	5650	Perl_croak_no_modify(aTHX);
	5651	}
	5652
	5653	I32
	5654	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5655	const I32 paren)
	5656	{
	5657	struct regexp const rx = (struct regexp )SvANY(r);
	5658	I32 i;
	5659	I32 s1, t1;
	5660
	5661	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5662
	5663	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5664	switch (paren) {
	5665	/* $` / ${^PREMATCH} */
	5666	case RX_BUFF_IDX_PREMATCH:
	5667	if (rx->offs[0].start != -1) {
	5668	i = rx->offs[0].start;
	5669	if (i > 0) {
	5670	s1 = 0;
	5671	t1 = i;
	5672	goto getlen;
	5673	}
	5674	}
	5675	return 0;
	5676	/* $' / ${^POSTMATCH} */
	5677	case RX_BUFF_IDX_POSTMATCH:
	5678	if (rx->offs[0].end != -1) {
	5679	i = rx->sublen - rx->offs[0].end;
	5680	if (i > 0) {
	5681	s1 = rx->offs[0].end;
	5682	t1 = rx->sublen;
	5683	goto getlen;
	5684	}
	5685	}
	5686	return 0;
	5687	/* $& / ${^MATCH}, $1, $2, ... */
	5688	default:
	5689	if (paren <= (I32)rx->nparens &&
	5690	(s1 = rx->offs[paren].start) != -1 &&
	5691	(t1 = rx->offs[paren].end) != -1)
	5692	{
	5693	i = t1 - s1;
	5694	goto getlen;
	5695	} else {
	5696	if (ckWARN(WARN_UNINITIALIZED))
	5697	report_uninit((const SV *)sv);
	5698	return 0;
	5699	}
	5700	}
	5701	getlen:
	5702	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	5703	const char * const s = rx->subbeg + s1;
	5704	const U8 *ep;
	5705	STRLEN el;
	5706
	5707	i = t1 - s1;
	5708	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	5709	i = el;
	5710	}
	5711	return i;
	5712	}
	5713
	5714	SV*
	5715	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	5716	{
	5717	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	5718	PERL_UNUSED_ARG(rx);
	5719	if (0)
	5720	return NULL;
	5721	else
	5722	return newSVpvs("Regexp");
	5723	}
	5724
	5725	/* Scans the name of a named buffer from the pattern.
	5726	* If flags is REG_RSN_RETURN_NULL returns null.
	5727	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	5728	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	5729	* to the parsed name as looked up in the RExC_paren_names hash.
	5730	* If there is an error throws a vFAIL().. type exception.
	5731	*/
	5732
	5733	#define REG_RSN_RETURN_NULL 0
	5734	#define REG_RSN_RETURN_NAME 1
	5735	#define REG_RSN_RETURN_DATA 2
	5736
	5737	STATIC SV*
	5738	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	5739	{
	5740	char *name_start = RExC_parse;
	5741
	5742	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	5743
	5744	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	5745	/* skip IDFIRST by using do...while */
	5746	if (UTF)
	5747	do {
	5748	RExC_parse += UTF8SKIP(RExC_parse);
	5749	} while (isALNUM_utf8((U8*)RExC_parse));
	5750	else
	5751	do {
	5752	RExC_parse++;
	5753	} while (isALNUM(*RExC_parse));
	5754	}
	5755
	5756	if ( flags ) {
	5757	SV* sv_name
	5758	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	5759	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	5760	if ( flags == REG_RSN_RETURN_NAME)
	5761	return sv_name;
	5762	else if (flags==REG_RSN_RETURN_DATA) {
	5763	HE *he_str = NULL;
	5764	SV *sv_dat = NULL;
	5765	if ( ! sv_name ) /* should not happen*/
	5766	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	5767	if (RExC_paren_names)
	5768	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	5769	if ( he_str )
	5770	sv_dat = HeVAL(he_str);
	5771	if ( ! sv_dat )
	5772	vFAIL("Reference to nonexistent named group");
	5773	return sv_dat;
	5774	}
	5775	else {
	5776	Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
	5777	}
	5778	/* NOT REACHED */
	5779	}
	5780	return NULL;
	5781	}
	5782
	5783	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	5784	int rem=(int)(RExC_end - RExC_parse); \
	5785	int cut; \
	5786	int num; \
	5787	int iscut=0; \
	5788	if (rem>10) { \
	5789	rem=10; \
	5790	iscut=1; \
	5791	} \
	5792	cut=10-rem; \
	5793	if (RExC_lastparse!=RExC_parse) \
	5794	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	5795	rem, RExC_parse, \
	5796	cut + 4, \
	5797	iscut ? "..." : "<" \
	5798	); \
	5799	else \
	5800	PerlIO_printf(Perl_debug_log,"%16s",""); \
	5801	\
	5802	if (SIZE_ONLY) \
	5803	num = RExC_size + 1; \
	5804	else \
	5805	num=REG_NODE_NUM(RExC_emit); \
	5806	if (RExC_lastnum!=num) \
	5807	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	5808	else \
	5809	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	5810	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	5811	(int)((depth*2)), "", \
	5812	(funcname) \
	5813	); \
	5814	RExC_lastnum=num; \
	5815	RExC_lastparse=RExC_parse; \
	5816	})
	5817
	5818
	5819
	5820	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	5821	DEBUG_PARSE_MSG((funcname)); \
	5822	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	5823	})
	5824	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	5825	DEBUG_PARSE_MSG((funcname)); \
	5826	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	5827	})
	5828
	5829	/* This section of code defines the inversion list object and its methods. The
	5830	* interfaces are highly subject to change, so as much as possible is static to
	5831	* this file. An inversion list is here implemented as a malloc'd C UV array
	5832	* with some added info that is placed as UVs at the beginning in a header
	5833	* portion. An inversion list for Unicode is an array of code points, sorted
	5834	* by ordinal number. The zeroth element is the first code point in the list.
	5835	* The 1th element is the first element beyond that not in the list. In other
	5836	* words, the first range is
	5837	* invlist[0]..(invlist[1]-1)
	5838	* The other ranges follow. Thus every element that is divisible by two marks
	5839	* the beginning of a range that is in the list, and every element not
	5840	* divisible by two marks the beginning of a range not in the list. A single
	5841	* element inversion list that contains the single code point N generally
	5842	* consists of two elements
	5843	* invlist[0] == N
	5844	* invlist[1] == N+1
	5845	* (The exception is when N is the highest representable value on the
	5846	* machine, in which case the list containing just it would be a single
	5847	* element, itself. By extension, if the last range in the list extends to
	5848	* infinity, then the first element of that range will be in the inversion list
	5849	* at a position that is divisible by two, and is the final element in the
	5850	* list.)
	5851	* Taking the complement (inverting) an inversion list is quite simple, if the
	5852	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	5853	* This implementation reserves an element at the beginning of each inversion list
	5854	* to contain 0 when the list contains 0, and contains 1 otherwise. The actual
	5855	* beginning of the list is either that element if 0, or the next one if 1.
	5856	*
	5857	* More about inversion lists can be found in "Unicode Demystified"
	5858	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	5859	* More will be coming when functionality is added later.
	5860	*
	5861	* The inversion list data structure is currently implemented as an SV pointing
	5862	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	5863	* array of UV whose memory management is automatically handled by the existing
	5864	* facilities for SV's.
	5865	*
	5866	* Some of the methods should always be private to the implementation, and some
	5867	* should eventually be made public */
	5868
	5869	#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
	5870	#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
	5871
	5872	#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */
	5873	/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
	5874	* contains the code point U+00000, and begins here. If 1, the inversion list
	5875	* doesn't contain U+0000, and it begins at the next UV in the array.
	5876	* Inverting an inversion list consists of adding or removing the 0 at the
	5877	* beginning of it. By reserving a space for that 0, inversion can be made
	5878	* very fast */
	5879
	5880	#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
	5881
	5882	/* Internally things are UVs */
	5883	#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
	5884	#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
	5885
	5886	#define INVLIST_INITIAL_LEN 10
	5887
	5888	PERL_STATIC_INLINE UV*
	5889	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	5890	{
	5891	/* Returns a pointer to the first element in the inversion list's array.
	5892	* This is called upon initialization of an inversion list. Where the
	5893	* array begins depends on whether the list has the code point U+0000
	5894	* in it or not. The other parameter tells it whether the code that
	5895	* follows this call is about to put a 0 in the inversion list or not.
	5896	* The first element is either the element with 0, if 0, or the next one,
	5897	* if 1 */
	5898
	5899	UV* zero = get_invlist_zero_addr(invlist);
	5900
	5901	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	5902
	5903	/* Must be empty */
	5904	assert(! *get_invlist_len_addr(invlist));
	5905
	5906	/* 1^1 = 0; 1^0 = 1 */
	5907	*zero = 1 ^ will_have_0;
	5908	return zero + *zero;
	5909	}
	5910
	5911	PERL_STATIC_INLINE UV*
	5912	S_invlist_array(pTHX_ SV* const invlist)
	5913	{
	5914	/* Returns the pointer to the inversion list's array. Every time the
	5915	* length changes, this needs to be called in case malloc or realloc moved
	5916	* it */
	5917
	5918	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	5919
	5920	/* Must not be empty */
	5921	assert(*get_invlist_len_addr(invlist));
	5922	assert(*get_invlist_zero_addr(invlist) == 0
	5923	\|\| *get_invlist_zero_addr(invlist) == 1);
	5924
	5925	/* The array begins either at the element reserved for zero if the
	5926	* list contains 0 (that element will be set to 0), or otherwise the next
	5927	* element (in which case the reserved element will be set to 1). */
	5928	return (UV *) (get_invlist_zero_addr(invlist)
	5929	+ *get_invlist_zero_addr(invlist));
	5930	}
	5931
	5932	PERL_STATIC_INLINE UV*
	5933	S_get_invlist_len_addr(pTHX_ SV* invlist)
	5934	{
	5935	/* Return the address of the UV that contains the current number
	5936	* of used elements in the inversion list */
	5937
	5938	PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
	5939
	5940	return (UV ) (SvPVX(invlist) + (INVLIST_LEN_OFFSET sizeof (UV)));
	5941	}
	5942
	5943	PERL_STATIC_INLINE UV
	5944	S_invlist_len(pTHX_ SV* const invlist)
	5945	{
	5946	/* Returns the current number of elements in the inversion list's array */
	5947
	5948	PERL_ARGS_ASSERT_INVLIST_LEN;
	5949
	5950	return *get_invlist_len_addr(invlist);
	5951	}
	5952
	5953	PERL_STATIC_INLINE void
	5954	S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
	5955	{
	5956	/* Sets the current number of elements stored in the inversion list */
	5957
	5958	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	5959
	5960	*get_invlist_len_addr(invlist) = len;
	5961
	5962	assert(len <= SvLEN(invlist));
	5963
	5964	SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
	5965	/* If the list contains U+0000, that element is part of the header,
	5966	* and should not be counted as part of the array. It will contain
	5967	* 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
	5968	* subtract:
	5969	* SvCUR_set(invlist,
	5970	* TO_INTERNAL_SIZE(len
	5971	* - (*get_invlist_zero_addr(inv_list) ^ 1)));
	5972	* But, this is only valid if len is not 0. The consequences of not doing
	5973	* this is that the memory allocation code may think that 1 more UV is
	5974	* being used than actually is, and so might do an unnecessary grow. That
	5975	* seems worth not bothering to make this the precise amount.
	5976	*
	5977	* Note that when inverting, SvCUR shouldn't change */
	5978	}
	5979
	5980	PERL_STATIC_INLINE UV
	5981	S_invlist_max(pTHX_ SV* const invlist)
	5982	{
	5983	/* Returns the maximum number of elements storable in the inversion list's
	5984	* array, without having to realloc() */
	5985
	5986	PERL_ARGS_ASSERT_INVLIST_MAX;
	5987
	5988	return FROM_INTERNAL_SIZE(SvLEN(invlist));
	5989	}
	5990
	5991	PERL_STATIC_INLINE UV*
	5992	S_get_invlist_zero_addr(pTHX_ SV* invlist)
	5993	{
	5994	/* Return the address of the UV that is reserved to hold 0 if the inversion
	5995	* list contains 0. This has to be the last element of the heading, as the
	5996	* list proper starts with either it if 0, or the next element if not.
	5997	* (But we force it to contain either 0 or 1) */
	5998
	5999	PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
	6000
	6001	return (UV ) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET sizeof (UV)));
	6002	}
	6003
	6004	#ifndef PERL_IN_XSUB_RE
	6005	SV*
	6006	Perl__new_invlist(pTHX_ IV initial_size)
	6007	{
	6008
	6009	/* Return a pointer to a newly constructed inversion list, with enough
	6010	* space to store 'initial_size' elements. If that number is negative, a
	6011	* system default is used instead */
	6012
	6013	SV* new_list;
	6014
	6015	if (initial_size < 0) {
	6016	initial_size = INVLIST_INITIAL_LEN;
	6017	}
	6018
	6019	/* Allocate the initial space */
	6020	new_list = newSV(TO_INTERNAL_SIZE(initial_size));
	6021	invlist_set_len(new_list, 0);
	6022
	6023	/* Force iterinit() to be used to get iteration to work */
	6024	*get_invlist_iter_addr(new_list) = UV_MAX;
	6025
	6026	/* This should force a segfault if a method doesn't initialize this
	6027	* properly */
	6028	*get_invlist_zero_addr(new_list) = UV_MAX;
	6029
	6030	return new_list;
	6031	}
	6032	#endif
	6033
	6034	STATIC void
	6035	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	6036	{
	6037	/* Grow the maximum size of an inversion list */
	6038
	6039	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	6040
	6041	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
	6042	}
	6043
	6044	PERL_STATIC_INLINE void
	6045	S_invlist_trim(pTHX_ SV* const invlist)
	6046	{
	6047	PERL_ARGS_ASSERT_INVLIST_TRIM;
	6048
	6049	/* Change the length of the inversion list to how many entries it currently
	6050	* has */
	6051
	6052	SvPV_shrink_to_cur((SV *) invlist);
	6053	}
	6054
	6055	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	6056	* etc */
	6057
	6058	#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
	6059	#define PREV_ELEMENT_IN_INVLIST_SET(i) (! ELEMENT_IN_INVLIST_SET(i))
	6060
	6061	#ifndef PERL_IN_XSUB_RE
	6062	void
	6063	Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	6064	{
	6065	/* Subject to change or removal. Append the range from 'start' to 'end' at
	6066	* the end of the inversion list. The range must be above any existing
	6067	* ones. */
	6068
	6069	UV* array;
	6070	UV max = invlist_max(invlist);
	6071	UV len = invlist_len(invlist);
	6072
	6073	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	6074
	6075	if (len == 0) { /* Empty lists must be initialized */
	6076	array = _invlist_array_init(invlist, start == 0);
	6077	}
	6078	else {
	6079	/* Here, the existing list is non-empty. The current max entry in the
	6080	* list is generally the first value not in the set, except when the
	6081	* set extends to the end of permissible values, in which case it is
	6082	* the first entry in that final set, and so this call is an attempt to
	6083	* append out-of-order */
	6084
	6085	UV final_element = len - 1;
	6086	array = invlist_array(invlist);
	6087	if (array[final_element] > start
	6088	\|\| ELEMENT_IN_INVLIST_SET(final_element))
	6089	{
	6090	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
	6091	}
	6092
	6093	/* Here, it is a legal append. If the new range begins with the first
	6094	* value not in the set, it is extending the set, so the new first
	6095	* value not in the set is one greater than the newly extended range.
	6096	* */
	6097	if (array[final_element] == start) {
	6098	if (end != UV_MAX) {
	6099	array[final_element] = end + 1;
	6100	}
	6101	else {
	6102	/* But if the end is the maximum representable on the machine,
	6103	* just let the range that this would extend have no end */
	6104	invlist_set_len(invlist, len - 1);
	6105	}
	6106	return;
	6107	}
	6108	}
	6109
	6110	/* Here the new range doesn't extend any existing set. Add it */
	6111
	6112	len += 2; /* Includes an element each for the start and end of range */
	6113
	6114	/* If overflows the existing space, extend, which may cause the array to be
	6115	* moved */
	6116	if (max < len) {
	6117	invlist_extend(invlist, len);
	6118	invlist_set_len(invlist, len); /* Have to set len here to avoid assert
	6119	failure in invlist_array() */
	6120	array = invlist_array(invlist);
	6121	}
	6122	else {
	6123	invlist_set_len(invlist, len);
	6124	}
	6125
	6126	/* The next item on the list starts the range, the one after that is
	6127	* one past the new range. */
	6128	array[len - 2] = start;
	6129	if (end != UV_MAX) {
	6130	array[len - 1] = end + 1;
	6131	}
	6132	else {
	6133	/* But if the end is the maximum representable on the machine, just let
	6134	* the range have no end */
	6135	invlist_set_len(invlist, len - 1);
	6136	}
	6137	}
	6138
	6139	void
	6140	Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
	6141	{
	6142	/* Take the union of two inversion lists and point 'result' to it. If
	6143	* 'result' on input points to one of the two lists, the reference count to
	6144	* that list will be decremented.
	6145	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6146	* Richard Gillam, published by Addison-Wesley, and explained at some
	6147	* length there. The preface says to incorporate its examples into your
	6148	* code at your own risk.
	6149	*
	6150	* The algorithm is like a merge sort.
	6151	*
	6152	* XXX A potential performance improvement is to keep track as we go along
	6153	* if only one of the inputs contributes to the result, meaning the other
	6154	* is a subset of that one. In that case, we can skip the final copy and
	6155	* return the larger of the input lists, but then outside code might need
	6156	* to keep track of whether to free the input list or not */
	6157
	6158	UV* array_a; /* a's array */
	6159	UV* array_b;
	6160	UV len_a; /* length of a's array */
	6161	UV len_b;
	6162
	6163	SV* u; /* the resulting union */
	6164	UV* array_u;
	6165	UV len_u;
	6166
	6167	UV i_a = 0; /* current index into a's array */
	6168	UV i_b = 0;
	6169	UV i_u = 0;
	6170
	6171	/* running count, as explained in the algorithm source book; items are
	6172	* stopped accumulating and are output when the count changes to/from 0.
	6173	* The count is incremented when we start a range that's in the set, and
	6174	* decremented when we start a range that's not in the set. So its range
	6175	* is 0 to 2. Only when the count is zero is something not in the set.
	6176	*/
	6177	UV count = 0;
	6178
	6179	PERL_ARGS_ASSERT__INVLIST_UNION;
	6180
	6181	/* If either one is empty, the union is the other one */
	6182	len_a = invlist_len(a);
	6183	if (len_a == 0) {
	6184	if (output == &a) {
	6185	SvREFCNT_dec(a);
	6186	}
	6187	else if (output != &b) {
	6188	*output = invlist_clone(b);
	6189	}
	6190	/* else output already = b; /
	6191	return;
	6192	}
	6193	else if ((len_b = invlist_len(b)) == 0) {
	6194	if (output == &b) {
	6195	SvREFCNT_dec(b);
	6196	}
	6197	else if (output != &a) {
	6198	*output = invlist_clone(a);
	6199	}
	6200	/* else output already = a; /
	6201	return;
	6202	}
	6203
	6204	/* Here both lists exist and are non-empty */
	6205	array_a = invlist_array(a);
	6206	array_b = invlist_array(b);
	6207
	6208	/* Size the union for the worst case: that the sets are completely
	6209	* disjoint */
	6210	u = _new_invlist(len_a + len_b);
	6211
	6212	/* Will contain U+0000 if either component does */
	6213	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	6214	\|\| (len_b > 0 && array_b[0] == 0));
	6215
	6216	/* Go through each list item by item, stopping when exhausted one of
	6217	* them */
	6218	while (i_a < len_a && i_b < len_b) {
	6219	UV cp; /* The element to potentially add to the union's array */
	6220	bool cp_in_set; /* is it in the the input list's set or not */
	6221
	6222	/* We need to take one or the other of the two inputs for the union.
	6223	* Since we are merging two sorted lists, we take the smaller of the
	6224	* next items. In case of a tie, we take the one that is in its set
	6225	* first. If we took one not in the set first, it would decrement the
	6226	* count, possibly to 0 which would cause it to be output as ending the
	6227	* range, and the next time through we would take the same number, and
	6228	* output it again as beginning the next range. By doing it the
	6229	* opposite way, there is no possibility that the count will be
	6230	* momentarily decremented to 0, and thus the two adjoining ranges will
	6231	* be seamlessly merged. (In a tie and both are in the set or both not
	6232	* in the set, it doesn't matter which we take first.) */
	6233	if (array_a[i_a] < array_b[i_b]
	6234	\|\| (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
	6235	{
	6236	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6237	cp= array_a[i_a++];
	6238	}
	6239	else {
	6240	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6241	cp= array_b[i_b++];
	6242	}
	6243
	6244	/* Here, have chosen which of the two inputs to look at. Only output
	6245	* if the running count changes to/from 0, which marks the
	6246	* beginning/end of a range in that's in the set */
	6247	if (cp_in_set) {
	6248	if (count == 0) {
	6249	array_u[i_u++] = cp;
	6250	}
	6251	count++;
	6252	}
	6253	else {
	6254	count--;
	6255	if (count == 0) {
	6256	array_u[i_u++] = cp;
	6257	}
	6258	}
	6259	}
	6260
	6261	/* Here, we are finished going through at least one of the lists, which
	6262	* means there is something remaining in at most one. We check if the list
	6263	* that hasn't been exhausted is positioned such that we are in the middle
	6264	* of a range in its set or not. (i_a and i_b point to the element beyond
	6265	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	6266	* is potentially more to output.
	6267	* There are four cases:
	6268	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	6269	* in the union is entirely from the non-exhausted set.
	6270	* 2) Both were in their sets, count is 2. Nothing further should
	6271	* be output, as everything that remains will be in the exhausted
	6272	* list's set, hence in the union; decrementing to 1 but not 0 insures
	6273	* that
	6274	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	6275	* Nothing further should be output because the union includes
	6276	* everything from the exhausted set. Not decrementing ensures that.
	6277	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	6278	* decrementing to 0 insures that we look at the remainder of the
	6279	* non-exhausted set */
	6280	if ((i_a != len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
	6281	\|\| (i_b != len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
	6282	{
	6283	count--;
	6284	}
	6285
	6286	/* The final length is what we've output so far, plus what else is about to
	6287	* be output. (If 'count' is non-zero, then the input list we exhausted
	6288	* has everything remaining up to the machine's limit in its set, and hence
	6289	* in the union, so there will be no further output. */
	6290	len_u = i_u;
	6291	if (count == 0) {
	6292	/* At most one of the subexpressions will be non-zero */
	6293	len_u += (len_a - i_a) + (len_b - i_b);
	6294	}
	6295
	6296	/* Set result to final length, which can change the pointer to array_u, so
	6297	* re-find it */
	6298	if (len_u != invlist_len(u)) {
	6299	invlist_set_len(u, len_u);
	6300	invlist_trim(u);
	6301	array_u = invlist_array(u);
	6302	}
	6303
	6304	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	6305	* the other) ended with everything above it not in its set. That means
	6306	* that the remaining part of the union is precisely the same as the
	6307	* non-exhausted list, so can just copy it unchanged. (If both list were
	6308	* exhausted at the same time, then the operations below will be both 0.)
	6309	*/
	6310	if (count == 0) {
	6311	IV copy_count; /* At most one will have a non-zero copy count */
	6312	if ((copy_count = len_a - i_a) > 0) {
	6313	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	6314	}
	6315	else if ((copy_count = len_b - i_b) > 0) {
	6316	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	6317	}
	6318	}
	6319
	6320	/* We may be removing a reference to one of the inputs */
	6321	if (&a == output \|\| &b == output) {
	6322	SvREFCNT_dec(*output);
	6323	}
	6324
	6325	*output = u;
	6326	return;
	6327	}
	6328
	6329	void
	6330	Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
	6331	{
	6332	/* Take the intersection of two inversion lists and point 'i' to it. If
	6333	* 'i' on input points to one of the two lists, the reference count to that
	6334	* list will be decremented.
	6335	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6336	* Richard Gillam, published by Addison-Wesley, and explained at some
	6337	* length there. The preface says to incorporate its examples into your
	6338	* code at your own risk. In fact, it had bugs
	6339	*
	6340	* The algorithm is like a merge sort, and is essentially the same as the
	6341	* union above
	6342	*/
	6343
	6344	UV* array_a; /* a's array */
	6345	UV* array_b;
	6346	UV len_a; /* length of a's array */
	6347	UV len_b;
	6348
	6349	SV* r; /* the resulting intersection */
	6350	UV* array_r;
	6351	UV len_r;
	6352
	6353	UV i_a = 0; /* current index into a's array */
	6354	UV i_b = 0;
	6355	UV i_r = 0;
	6356
	6357	/* running count, as explained in the algorithm source book; items are
	6358	* stopped accumulating and are output when the count changes to/from 2.
	6359	* The count is incremented when we start a range that's in the set, and
	6360	* decremented when we start a range that's not in the set. So its range
	6361	* is 0 to 2. Only when the count is 2 is something in the intersection.
	6362	*/
	6363	UV count = 0;
	6364
	6365	PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
	6366
	6367	/* If either one is empty, the intersection is null */
	6368	len_a = invlist_len(a);
	6369	if ((len_a == 0) \|\| ((len_b = invlist_len(b)) == 0)) {
	6370	*i = _new_invlist(0);
	6371
	6372	/* If the result is the same as one of the inputs, the input is being
	6373	* overwritten */
	6374	if (i == &a) {
	6375	SvREFCNT_dec(a);
	6376	}
	6377	else if (i == &b) {
	6378	SvREFCNT_dec(b);
	6379	}
	6380	return;
	6381	}
	6382
	6383	/* Here both lists exist and are non-empty */
	6384	array_a = invlist_array(a);
	6385	array_b = invlist_array(b);
	6386
	6387	/* Size the intersection for the worst case: that the intersection ends up
	6388	* fragmenting everything to be completely disjoint */
	6389	r= _new_invlist(len_a + len_b);
	6390
	6391	/* Will contain U+0000 iff both components do */
	6392	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	6393	&& len_b > 0 && array_b[0] == 0);
	6394
	6395	/* Go through each list item by item, stopping when exhausted one of
	6396	* them */
	6397	while (i_a < len_a && i_b < len_b) {
	6398	UV cp; /* The element to potentially add to the intersection's
	6399	array */
	6400	bool cp_in_set; /* Is it in the input list's set or not */
	6401
	6402	/* We need to take one or the other of the two inputs for the
	6403	* intersection. Since we are merging two sorted lists, we take the
	6404	* smaller of the next items. In case of a tie, we take the one that
	6405	* is not in its set first (a difference from the union algorithm). If
	6406	* we took one in the set first, it would increment the count, possibly
	6407	* to 2 which would cause it to be output as starting a range in the
	6408	* intersection, and the next time through we would take that same
	6409	* number, and output it again as ending the set. By doing it the
	6410	* opposite of this, there is no possibility that the count will be
	6411	* momentarily incremented to 2. (In a tie and both are in the set or
	6412	* both not in the set, it doesn't matter which we take first.) */
	6413	if (array_a[i_a] < array_b[i_b]
	6414	\|\| (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
	6415	{
	6416	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6417	cp= array_a[i_a++];
	6418	}
	6419	else {
	6420	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6421	cp= array_b[i_b++];
	6422	}
	6423
	6424	/* Here, have chosen which of the two inputs to look at. Only output
	6425	* if the running count changes to/from 2, which marks the
	6426	* beginning/end of a range that's in the intersection */
	6427	if (cp_in_set) {
	6428	count++;
	6429	if (count == 2) {
	6430	array_r[i_r++] = cp;
	6431	}
	6432	}
	6433	else {
	6434	if (count == 2) {
	6435	array_r[i_r++] = cp;
	6436	}
	6437	count--;
	6438	}
	6439	}
	6440
	6441	/* Here, we are finished going through at least one of the lists, which
	6442	* means there is something remaining in at most one. We check if the list
	6443	* that has been exhausted is positioned such that we are in the middle
	6444	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	6445	* the ones we care about.) There are four cases:
	6446	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	6447	* nothing left in the intersection.
	6448	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	6449	* above 2. What should be output is exactly that which is in the
	6450	* non-exhausted set, as everything it has is also in the intersection
	6451	* set, and everything it doesn't have can't be in the intersection
	6452	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	6453	* gets incremented to 2. Like the previous case, the intersection is
	6454	* everything that remains in the non-exhausted set.
	6455	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	6456	* remains 1. And the intersection has nothing more. */
	6457	if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
	6458	\|\| (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
	6459	{
	6460	count++;
	6461	}
	6462
	6463	/* The final length is what we've output so far plus what else is in the
	6464	* intersection. At most one of the subexpressions below will be non-zero */
	6465	len_r = i_r;
	6466	if (count >= 2) {
	6467	len_r += (len_a - i_a) + (len_b - i_b);
	6468	}
	6469
	6470	/* Set result to final length, which can change the pointer to array_r, so
	6471	* re-find it */
	6472	if (len_r != invlist_len(r)) {
	6473	invlist_set_len(r, len_r);
	6474	invlist_trim(r);
	6475	array_r = invlist_array(r);
	6476	}
	6477
	6478	/* Finish outputting any remaining */
	6479	if (count >= 2) { /* At most one will have a non-zero copy count */
	6480	IV copy_count;
	6481	if ((copy_count = len_a - i_a) > 0) {
	6482	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	6483	}
	6484	else if ((copy_count = len_b - i_b) > 0) {
	6485	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	6486	}
	6487	}
	6488
	6489	/* We may be removing a reference to one of the inputs */
	6490	if (&a == i \|\| &b == i) {
	6491	SvREFCNT_dec(*i);
	6492	}
	6493
	6494	*i = r;
	6495	return;
	6496	}
	6497
	6498	#endif
	6499
	6500	STATIC SV*
	6501	S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	6502	{
	6503	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	6504	* set. A pointer to the inversion list is returned. This may actually be
	6505	* a new list, in which case the passed in one has been destroyed. The
	6506	* passed in inversion list can be NULL, in which case a new one is created
	6507	* with just the one range in it */
	6508
	6509	SV* range_invlist;
	6510	UV len;
	6511
	6512	if (invlist == NULL) {
	6513	invlist = _new_invlist(2);
	6514	len = 0;
	6515	}
	6516	else {
	6517	len = invlist_len(invlist);
	6518	}
	6519
	6520	/* If comes after the final entry, can just append it to the end */
	6521	if (len == 0
	6522	\|\| start >= invlist_array(invlist)
	6523	[invlist_len(invlist) - 1])
	6524	{
	6525	_append_range_to_invlist(invlist, start, end);
	6526	return invlist;
	6527	}
	6528
	6529	/* Here, can't just append things, create and return a new inversion list
	6530	* which is the union of this range and the existing inversion list */
	6531	range_invlist = _new_invlist(2);
	6532	_append_range_to_invlist(range_invlist, start, end);
	6533
	6534	_invlist_union(invlist, range_invlist, &invlist);
	6535
	6536	/* The temporary can be freed */
	6537	SvREFCNT_dec(range_invlist);
	6538
	6539	return invlist;
	6540	}
	6541
	6542	PERL_STATIC_INLINE SV*
	6543	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	6544	return add_range_to_invlist(invlist, cp, cp);
	6545	}
	6546
	6547	#ifndef PERL_IN_XSUB_RE
	6548	void
	6549	Perl__invlist_invert(pTHX_ SV* const invlist)
	6550	{
	6551	/* Complement the input inversion list. This adds a 0 if the list didn't
	6552	* have a zero; removes it otherwise. As described above, the data
	6553	* structure is set up so that this is very efficient */
	6554
	6555	UV* len_pos = get_invlist_len_addr(invlist);
	6556
	6557	PERL_ARGS_ASSERT__INVLIST_INVERT;
	6558
	6559	/* The inverse of matching nothing is matching everything */
	6560	if (*len_pos == 0) {
	6561	_append_range_to_invlist(invlist, 0, UV_MAX);
	6562	return;
	6563	}
	6564
	6565	/* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
	6566	* zero element was a 0, so it is being removed, so the length decrements
	6567	* by 1; and vice-versa. SvCUR is unaffected */
	6568	if (*get_invlist_zero_addr(invlist) ^= 1) {
	6569	(*len_pos)--;
	6570	}
	6571	else {
	6572	(*len_pos)++;
	6573	}
	6574	}
	6575
	6576	void
	6577	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	6578	{
	6579	/* Complement the input inversion list (which must be a Unicode property,
	6580	* all of which don't match above the Unicode maximum code point.) And
	6581	* Perl has chosen to not have the inversion match above that either. This
	6582	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	6583	*/
	6584
	6585	UV len;
	6586	UV* array;
	6587
	6588	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	6589
	6590	_invlist_invert(invlist);
	6591
	6592	len = invlist_len(invlist);
	6593
	6594	if (len != 0) { /* If empty do nothing */
	6595	array = invlist_array(invlist);
	6596	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	6597	/* Add 0x110000. First, grow if necessary */
	6598	len++;
	6599	if (invlist_max(invlist) < len) {
	6600	invlist_extend(invlist, len);
	6601	array = invlist_array(invlist);
	6602	}
	6603	invlist_set_len(invlist, len);
	6604	array[len - 1] = PERL_UNICODE_MAX + 1;
	6605	}
	6606	else { /* Remove the 0x110000 */
	6607	invlist_set_len(invlist, len - 1);
	6608	}
	6609	}
	6610
	6611	return;
	6612	}
	6613	#endif
	6614
	6615	PERL_STATIC_INLINE SV*
	6616	S_invlist_clone(pTHX_ SV* const invlist)
	6617	{
	6618
	6619	/* Return a new inversion list that is a copy of the input one, which is
	6620	* unchanged */
	6621
	6622	SV* new_invlist = _new_invlist(SvCUR(invlist));
	6623
	6624	PERL_ARGS_ASSERT_INVLIST_CLONE;
	6625
	6626	Copy(SvPVX(invlist), SvPVX(new_invlist), SvCUR(invlist), char);
	6627	return new_invlist;
	6628	}
	6629
	6630	#ifndef PERL_IN_XSUB_RE
	6631	void
	6632	Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
	6633	{
	6634	/* Point result to an inversion list which consists of all elements in 'a'
	6635	* that aren't also in 'b' */
	6636
	6637	PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
	6638
	6639	/* Subtracting nothing retains the original */
	6640	if (invlist_len(b) == 0) {
	6641
	6642	/* If the result is not to be the same variable as the original, create
	6643	* a copy */
	6644	if (result != &a) {
	6645	*result = invlist_clone(a);
	6646	}
	6647	} else {
	6648	SV *b_copy = invlist_clone(b);
	6649	_invlist_invert(b_copy); /* Everything not in 'b' */
	6650	_invlist_intersection(a, b_copy, result); /* Everything in 'a' not in
	6651	'b' */
	6652	SvREFCNT_dec(b_copy);
	6653	}
	6654
	6655	if (result == &b) {
	6656	SvREFCNT_dec(b);
	6657	}
	6658
	6659	return;
	6660	}
	6661	#endif
	6662
	6663	PERL_STATIC_INLINE UV*
	6664	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	6665	{
	6666	/* Return the address of the UV that contains the current iteration
	6667	* position */
	6668
	6669	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	6670
	6671	return (UV ) (SvPVX(invlist) + (INVLIST_ITER_OFFSET sizeof (UV)));
	6672	}
	6673
	6674	PERL_STATIC_INLINE void
	6675	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	6676	{
	6677	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	6678
	6679	*get_invlist_iter_addr(invlist) = 0;
	6680	}
	6681
	6682	STATIC bool
	6683	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	6684	{
	6685	UV* pos = get_invlist_iter_addr(invlist);
	6686	UV len = invlist_len(invlist);
	6687	UV *array;
	6688
	6689	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	6690
	6691	if (*pos >= len) {
	6692	pos = UV_MAX; / Force iternit() to be required next time */
	6693	return FALSE;
	6694	}
	6695
	6696	array = invlist_array(invlist);
	6697
	6698	start = array[(pos)++];
	6699
	6700	if (*pos >= len) {
	6701	*end = UV_MAX;
	6702	}
	6703	else {
	6704	end = array[(pos)++] - 1;
	6705	}
	6706
	6707	return TRUE;
	6708	}
	6709
	6710	#if 0
	6711	void
	6712	S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
	6713	{
	6714	/* Dumps out the ranges in an inversion list. The string 'header'
	6715	* if present is output on a line before the first range */
	6716
	6717	UV start, end;
	6718
	6719	if (header && strlen(header)) {
	6720	PerlIO_printf(Perl_debug_log, "%s\n", header);
	6721	}
	6722	invlist_iterinit(invlist);
	6723	while (invlist_iternext(invlist, &start, &end)) {
	6724	if (end == UV_MAX) {
	6725	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
	6726	}
	6727	else {
	6728	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
	6729	}
	6730	}
	6731	}
	6732	#endif
	6733
	6734	#undef HEADER_LENGTH
	6735	#undef INVLIST_INITIAL_LENGTH
	6736	#undef TO_INTERNAL_SIZE
	6737	#undef FROM_INTERNAL_SIZE
	6738	#undef INVLIST_LEN_OFFSET
	6739	#undef INVLIST_ZERO_OFFSET
	6740	#undef INVLIST_ITER_OFFSET
	6741
	6742	/* End of inversion list object */
	6743
	6744	/*
	6745	- reg - regular expression, i.e. main body or parenthesized thing
	6746	*
	6747	* Caller must absorb opening parenthesis.
	6748	*
	6749	* Combining parenthesis handling with the base level of regular expression
	6750	* is a trifle forced, but the need to tie the tails of the branches to what
	6751	* follows makes it hard to avoid.
	6752	*/
	6753	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	6754	#ifdef DEBUGGING
	6755	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	6756	#else
	6757	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	6758	#endif
	6759
	6760	STATIC regnode *
	6761	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	6762	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	6763	{
	6764	dVAR;
	6765	register regnode ret; / Will be the head of the group. */
	6766	register regnode *br;
	6767	register regnode *lastbr;
	6768	register regnode *ender = NULL;
	6769	register I32 parno = 0;
	6770	I32 flags;
	6771	U32 oregflags = RExC_flags;
	6772	bool have_branch = 0;
	6773	bool is_open = 0;
	6774	I32 freeze_paren = 0;
	6775	I32 after_freeze = 0;
	6776
	6777	/* for (?g), (?gc), and (?o) warnings; warning
	6778	about (?c) will warn about (?g) -- japhy */
	6779
	6780	#define WASTED_O 0x01
	6781	#define WASTED_G 0x02
	6782	#define WASTED_C 0x04
	6783	#define WASTED_GC (0x02\|0x04)
	6784	I32 wastedflags = 0x00;
	6785
	6786	char * parse_start = RExC_parse; /* MJD */
	6787	char * const oregcomp_parse = RExC_parse;
	6788
	6789	GET_RE_DEBUG_FLAGS_DECL;
	6790
	6791	PERL_ARGS_ASSERT_REG;
	6792	DEBUG_PARSE("reg ");
	6793
	6794	flagp = 0; / Tentatively. */
	6795
	6796
	6797	/* Make an OPEN node, if parenthesized. */
	6798	if (paren) {
	6799	if ( RExC_parse == '') { /* (VERB:ARG) /
	6800	char *start_verb = RExC_parse;
	6801	STRLEN verb_len = 0;
	6802	char *start_arg = NULL;
	6803	unsigned char op = 0;
	6804	int argok = 1;
	6805	int internal_argval = 0; /* internal_argval is only useful if !argok */
	6806	while ( RExC_parse && RExC_parse != ')' ) {
	6807	if ( *RExC_parse == ':' ) {
	6808	start_arg = RExC_parse + 1;
	6809	break;
	6810	}
	6811	RExC_parse++;
	6812	}
	6813	++start_verb;
	6814	verb_len = RExC_parse - start_verb;
	6815	if ( start_arg ) {
	6816	RExC_parse++;
	6817	while ( RExC_parse && RExC_parse != ')' )
	6818	RExC_parse++;
	6819	if ( *RExC_parse != ')' )
	6820	vFAIL("Unterminated verb pattern argument");
	6821	if ( RExC_parse == start_arg )
	6822	start_arg = NULL;
	6823	} else {
	6824	if ( *RExC_parse != ')' )
	6825	vFAIL("Unterminated verb pattern");
	6826	}
	6827
	6828	switch ( *start_verb ) {
	6829	case 'A': /* (ACCEPT) /
	6830	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	6831	op = ACCEPT;
	6832	internal_argval = RExC_nestroot;
	6833	}
	6834	break;
	6835	case 'C': /* (COMMIT) /
	6836	if ( memEQs(start_verb,verb_len,"COMMIT") )
	6837	op = COMMIT;
	6838	break;
	6839	case 'F': /* (FAIL) /
	6840	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	6841	op = OPFAIL;
	6842	argok = 0;
	6843	}
	6844	break;
	6845	case ':': /* (:NAME) /
	6846	case 'M': /* (MARK:NAME) /
	6847	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	6848	op = MARKPOINT;
	6849	argok = -1;
	6850	}
	6851	break;
	6852	case 'P': /* (PRUNE) /
	6853	if ( memEQs(start_verb,verb_len,"PRUNE") )
	6854	op = PRUNE;
	6855	break;
	6856	case 'S': /* (SKIP) /
	6857	if ( memEQs(start_verb,verb_len,"SKIP") )
	6858	op = SKIP;
	6859	break;
	6860	case 'T': /* (THEN) /
	6861	/* [19:06] <TimToady> :: is then */
	6862	if ( memEQs(start_verb,verb_len,"THEN") ) {
	6863	op = CUTGROUP;
	6864	RExC_seen \|= REG_SEEN_CUTGROUP;
	6865	}
	6866	break;
	6867	}
	6868	if ( ! op ) {
	6869	RExC_parse++;
	6870	vFAIL3("Unknown verb pattern '%.*s'",
	6871	verb_len, start_verb);
	6872	}
	6873	if ( argok ) {
	6874	if ( start_arg && internal_argval ) {
	6875	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6876	verb_len, start_verb);
	6877	} else if ( argok < 0 && !start_arg ) {
	6878	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	6879	verb_len, start_verb);
	6880	} else {
	6881	ret = reganode(pRExC_state, op, internal_argval);
	6882	if ( ! internal_argval && ! SIZE_ONLY ) {
	6883	if (start_arg) {
	6884	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	6885	ARG(ret) = add_data( pRExC_state, 1, "S" );
	6886	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	6887	ret->flags = 0;
	6888	} else {
	6889	ret->flags = 1;
	6890	}
	6891	}
	6892	}
	6893	if (!internal_argval)
	6894	RExC_seen \|= REG_SEEN_VERBARG;
	6895	} else if ( start_arg ) {
	6896	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6897	verb_len, start_verb);
	6898	} else {
	6899	ret = reg_node(pRExC_state, op);
	6900	}
	6901	nextchar(pRExC_state);
	6902	return ret;
	6903	} else
	6904	if (RExC_parse == '?') { / (?...) */
	6905	bool is_logical = 0;
	6906	const char * const seqstart = RExC_parse;
	6907	bool has_use_defaults = FALSE;
	6908
	6909	RExC_parse++;
	6910	paren = *RExC_parse++;
	6911	ret = NULL; /* For look-ahead/behind. */
	6912	switch (paren) {
	6913
	6914	case 'P': /* (?P...) variants for those used to PCRE/Python */
	6915	paren = *RExC_parse++;
	6916	if ( paren == '<') /* (?P<...>) named capture */
	6917	goto named_capture;
	6918	else if (paren == '>') { /* (?P>name) named recursion */
	6919	goto named_recursion;
	6920	}
	6921	else if (paren == '=') { /* (?P=...) named backref */
	6922	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	6923	you change this make sure you change that */
	6924	char* name_start = RExC_parse;
	6925	U32 num = 0;
	6926	SV *sv_dat = reg_scan_name(pRExC_state,
	6927	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6928	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	6929	vFAIL2("Sequence %.3s... not terminated",parse_start);
	6930
	6931	if (!SIZE_ONLY) {
	6932	num = add_data( pRExC_state, 1, "S" );
	6933	RExC_rxi->data->data[num]=(void*)sv_dat;
	6934	SvREFCNT_inc_simple_void(sv_dat);
	6935	}
	6936	RExC_sawback = 1;
	6937	ret = reganode(pRExC_state,
	6938	((! FOLD)
	6939	? NREF
	6940	: (MORE_ASCII_RESTRICTED)
	6941	? NREFFA
	6942	: (AT_LEAST_UNI_SEMANTICS)
	6943	? NREFFU
	6944	: (LOC)
	6945	? NREFFL
	6946	: NREFF),
	6947	num);
	6948	*flagp \|= HASWIDTH;
	6949
	6950	Set_Node_Offset(ret, parse_start+1);
	6951	Set_Node_Cur_Length(ret); /* MJD */
	6952
	6953	nextchar(pRExC_state);
	6954	return ret;
	6955	}
	6956	RExC_parse++;
	6957	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6958	/NOTREACHED/
	6959	case '<': /* (?<...) */
	6960	if (*RExC_parse == '!')
	6961	paren = ',';
	6962	else if (*RExC_parse != '=')
	6963	named_capture:
	6964	{ /* (?<...>) */
	6965	char *name_start;
	6966	SV *svname;
	6967	paren= '>';
	6968	case '\'': /* (?'...') */
	6969	name_start= RExC_parse;
	6970	svname = reg_scan_name(pRExC_state,
	6971	SIZE_ONLY ? /* reverse test from the others */
	6972	REG_RSN_RETURN_NAME :
	6973	REG_RSN_RETURN_NULL);
	6974	if (RExC_parse == name_start) {
	6975	RExC_parse++;
	6976	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6977	/NOTREACHED/
	6978	}
	6979	if (*RExC_parse != paren)
	6980	vFAIL2("Sequence (?%c... not terminated",
	6981	paren=='>' ? '<' : paren);
	6982	if (SIZE_ONLY) {
	6983	HE *he_str;
	6984	SV *sv_dat = NULL;
	6985	if (!svname) /* shouldn't happen */
	6986	Perl_croak(aTHX_
	6987	"panic: reg_scan_name returned NULL");
	6988	if (!RExC_paren_names) {
	6989	RExC_paren_names= newHV();
	6990	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	6991	#ifdef DEBUGGING
	6992	RExC_paren_name_list= newAV();
	6993	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	6994	#endif
	6995	}
	6996	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	6997	if ( he_str )
	6998	sv_dat = HeVAL(he_str);
	6999	if ( ! sv_dat ) {
	7000	/* croak baby croak */
	7001	Perl_croak(aTHX_
	7002	"panic: paren_name hash element allocation failed");
	7003	} else if ( SvPOK(sv_dat) ) {
	7004	/* (?\|...) can mean we have dupes so scan to check
	7005	its already been stored. Maybe a flag indicating
	7006	we are inside such a construct would be useful,
	7007	but the arrays are likely to be quite small, so
	7008	for now we punt -- dmq */
	7009	IV count = SvIV(sv_dat);
	7010	I32 pv = (I32)SvPVX(sv_dat);
	7011	IV i;
	7012	for ( i = 0 ; i < count ; i++ ) {
	7013	if ( pv[i] == RExC_npar ) {
	7014	count = 0;
	7015	break;
	7016	}
	7017	}
	7018	if ( count ) {
	7019	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	7020	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	7021	pv[count] = RExC_npar;
	7022	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	7023	}
	7024	} else {
	7025	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	7026	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	7027	SvIOK_on(sv_dat);
	7028	SvIV_set(sv_dat, 1);
	7029	}
	7030	#ifdef DEBUGGING
	7031	/* Yes this does cause a memory leak in debugging Perls */
	7032	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	7033	SvREFCNT_dec(svname);
	7034	#endif
	7035
	7036	/sv_dump(sv_dat);/
	7037	}
	7038	nextchar(pRExC_state);
	7039	paren = 1;
	7040	goto capturing_parens;
	7041	}
	7042	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	7043	RExC_in_lookbehind++;
	7044	RExC_parse++;
	7045	case '=': /* (?=...) */
	7046	RExC_seen_zerolen++;
	7047	break;
	7048	case '!': /* (?!...) */
	7049	RExC_seen_zerolen++;
	7050	if (*RExC_parse == ')') {
	7051	ret=reg_node(pRExC_state, OPFAIL);
	7052	nextchar(pRExC_state);
	7053	return ret;
	7054	}
	7055	break;
	7056	case '\|': /* (?\|...) */
	7057	/* branch reset, behave like a (?:...) except that
	7058	buffers in alternations share the same numbers */
	7059	paren = ':';
	7060	after_freeze = freeze_paren = RExC_npar;
	7061	break;
	7062	case ':': /* (?:...) */
	7063	case '>': /* (?>...) */
	7064	break;
	7065	case '$': /* (?$...) */
	7066	case '@': /* (?@...) */
	7067	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	7068	break;
	7069	case '#': /* (?#...) */
	7070	while (RExC_parse && RExC_parse != ')')
	7071	RExC_parse++;
	7072	if (*RExC_parse != ')')
	7073	FAIL("Sequence (?#... not terminated");
	7074	nextchar(pRExC_state);
	7075	*flagp = TRYAGAIN;
	7076	return NULL;
	7077	case '0' : /* (?0) */
	7078	case 'R' : /* (?R) */
	7079	if (*RExC_parse != ')')
	7080	FAIL("Sequence (?R) not terminated");
	7081	ret = reg_node(pRExC_state, GOSTART);
	7082	*flagp \|= POSTPONED;
	7083	nextchar(pRExC_state);
	7084	return ret;
	7085	/notreached/
	7086	{ /* named and numeric backreferences */
	7087	I32 num;
	7088	case '&': /* (?&NAME) */
	7089	parse_start = RExC_parse - 1;
	7090	named_recursion:
	7091	{
	7092	SV *sv_dat = reg_scan_name(pRExC_state,
	7093	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7094	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7095	}
	7096	goto gen_recurse_regop;
	7097	/* NOT REACHED */
	7098	case '+':
	7099	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7100	RExC_parse++;
	7101	vFAIL("Illegal pattern");
	7102	}
	7103	goto parse_recursion;
	7104	/* NOT REACHED*/
	7105	case '-': /* (?-1) */
	7106	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	7107	RExC_parse--; /* rewind to let it be handled later */
	7108	goto parse_flags;
	7109	}
	7110	/FALLTHROUGH /
	7111	case '1': case '2': case '3': case '4': /* (?1) */
	7112	case '5': case '6': case '7': case '8': case '9':
	7113	RExC_parse--;
	7114	parse_recursion:
	7115	num = atoi(RExC_parse);
	7116	parse_start = RExC_parse - 1; /* MJD */
	7117	if (*RExC_parse == '-')
	7118	RExC_parse++;
	7119	while (isDIGIT(*RExC_parse))
	7120	RExC_parse++;
	7121	if (*RExC_parse!=')')
	7122	vFAIL("Expecting close bracket");
	7123
	7124	gen_recurse_regop:
	7125	if ( paren == '-' ) {
	7126	/*
	7127	Diagram of capture buffer numbering.
	7128	Top line is the normal capture buffer numbers
	7129	Bottom line is the negative indexing as from
	7130	the X (the (?-2))
	7131
	7132	+ 1 2 3 4 5 X 6 7
	7133	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	7134	- 5 4 3 2 1 X x x
	7135
	7136	*/
	7137	num = RExC_npar + num;
	7138	if (num < 1) {
	7139	RExC_parse++;
	7140	vFAIL("Reference to nonexistent group");
	7141	}
	7142	} else if ( paren == '+' ) {
	7143	num = RExC_npar + num - 1;
	7144	}
	7145
	7146	ret = reganode(pRExC_state, GOSUB, num);
	7147	if (!SIZE_ONLY) {
	7148	if (num > (I32)RExC_rx->nparens) {
	7149	RExC_parse++;
	7150	vFAIL("Reference to nonexistent group");
	7151	}
	7152	ARG2L_SET( ret, RExC_recurse_count++);
	7153	RExC_emit++;
	7154	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7155	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	7156	} else {
	7157	RExC_size++;
	7158	}
	7159	RExC_seen \|= REG_SEEN_RECURSE;
	7160	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	7161	Set_Node_Offset(ret, parse_start); /* MJD */
	7162
	7163	*flagp \|= POSTPONED;
	7164	nextchar(pRExC_state);
	7165	return ret;
	7166	} /* named and numeric backreferences */
	7167	/* NOT REACHED */
	7168
	7169	case '?': /* (??...) */
	7170	is_logical = 1;
	7171	if (*RExC_parse != '{') {
	7172	RExC_parse++;
	7173	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7174	/NOTREACHED/
	7175	}
	7176	*flagp \|= POSTPONED;
	7177	paren = *RExC_parse++;
	7178	/* FALL THROUGH */
	7179	case '{': /* (?{...}) */
	7180	{
	7181	I32 count = 1;
	7182	U32 n = 0;
	7183	char c;
	7184	char *s = RExC_parse;
	7185
	7186	RExC_seen_zerolen++;
	7187	RExC_seen \|= REG_SEEN_EVAL;
	7188	while (count && (c = *RExC_parse)) {
	7189	if (c == '\\') {
	7190	if (RExC_parse[1])
	7191	RExC_parse++;
	7192	}
	7193	else if (c == '{')
	7194	count++;
	7195	else if (c == '}')
	7196	count--;
	7197	RExC_parse++;
	7198	}
	7199	if (*RExC_parse != ')') {
	7200	RExC_parse = s;
	7201	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	7202	}
	7203	if (!SIZE_ONLY) {
	7204	PAD *pad;
	7205	OP_4tree sop, rop;
	7206	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	7207
	7208	ENTER;
	7209	Perl_save_re_context(aTHX);
	7210	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	7211	sop->op_private \|= OPpREFCOUNTED;
	7212	/* re_dup will OpREFCNT_inc */
	7213	OpREFCNT_set(sop, 1);
	7214	LEAVE;
	7215
	7216	n = add_data(pRExC_state, 3, "nop");
	7217	RExC_rxi->data->data[n] = (void*)rop;
	7218	RExC_rxi->data->data[n+1] = (void*)sop;
	7219	RExC_rxi->data->data[n+2] = (void*)pad;
	7220	SvREFCNT_dec(sv);
	7221	}
	7222	else { /* First pass */
	7223	if (PL_reginterp_cnt < ++RExC_seen_evals
	7224	&& IN_PERL_RUNTIME)
	7225	/* No compiled RE interpolated, has runtime
	7226	components ===> unsafe. */
	7227	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	7228	if (PL_tainting && PL_tainted)
	7229	FAIL("Eval-group in insecure regular expression");
	7230	#if PERL_VERSION > 8
	7231	if (IN_PERL_COMPILETIME)
	7232	PL_cv_has_eval = 1;
	7233	#endif
	7234	}
	7235
	7236	nextchar(pRExC_state);
	7237	if (is_logical) {
	7238	ret = reg_node(pRExC_state, LOGICAL);
	7239	if (!SIZE_ONLY)
	7240	ret->flags = 2;
	7241	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	7242	/* deal with the length of this later - MJD */
	7243	return ret;
	7244	}
	7245	ret = reganode(pRExC_state, EVAL, n);
	7246	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	7247	Set_Node_Offset(ret, parse_start);
	7248	return ret;
	7249	}
	7250	case '(': /* (?(?{...})...) and (?(?=...)...) */
	7251	{
	7252	int is_define= 0;
	7253	if (RExC_parse[0] == '?') { /* (?(?...)) */
	7254	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	7255	\|\| RExC_parse[1] == '<'
	7256	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	7257	I32 flag;
	7258
	7259	ret = reg_node(pRExC_state, LOGICAL);
	7260	if (!SIZE_ONLY)
	7261	ret->flags = 1;
	7262	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	7263	goto insert_if;
	7264	}
	7265	}
	7266	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	7267	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	7268	{
	7269	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	7270	char *name_start= RExC_parse++;
	7271	U32 num = 0;
	7272	SV *sv_dat=reg_scan_name(pRExC_state,
	7273	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7274	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	7275	vFAIL2("Sequence (?(%c... not terminated",
	7276	(ch == '>' ? '<' : ch));
	7277	RExC_parse++;
	7278	if (!SIZE_ONLY) {
	7279	num = add_data( pRExC_state, 1, "S" );
	7280	RExC_rxi->data->data[num]=(void*)sv_dat;
	7281	SvREFCNT_inc_simple_void(sv_dat);
	7282	}
	7283	ret = reganode(pRExC_state,NGROUPP,num);
	7284	goto insert_if_check_paren;
	7285	}
	7286	else if (RExC_parse[0] == 'D' &&
	7287	RExC_parse[1] == 'E' &&
	7288	RExC_parse[2] == 'F' &&
	7289	RExC_parse[3] == 'I' &&
	7290	RExC_parse[4] == 'N' &&
	7291	RExC_parse[5] == 'E')
	7292	{
	7293	ret = reganode(pRExC_state,DEFINEP,0);
	7294	RExC_parse +=6 ;
	7295	is_define = 1;
	7296	goto insert_if_check_paren;
	7297	}
	7298	else if (RExC_parse[0] == 'R') {
	7299	RExC_parse++;
	7300	parno = 0;
	7301	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7302	parno = atoi(RExC_parse++);
	7303	while (isDIGIT(*RExC_parse))
	7304	RExC_parse++;
	7305	} else if (RExC_parse[0] == '&') {
	7306	SV *sv_dat;
	7307	RExC_parse++;
	7308	sv_dat = reg_scan_name(pRExC_state,
	7309	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	7310	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	7311	}
	7312	ret = reganode(pRExC_state,INSUBP,parno);
	7313	goto insert_if_check_paren;
	7314	}
	7315	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	7316	/* (?(1)...) */
	7317	char c;
	7318	parno = atoi(RExC_parse++);
	7319
	7320	while (isDIGIT(*RExC_parse))
	7321	RExC_parse++;
	7322	ret = reganode(pRExC_state, GROUPP, parno);
	7323
	7324	insert_if_check_paren:
	7325	if ((c = *nextchar(pRExC_state)) != ')')
	7326	vFAIL("Switch condition not recognized");
	7327	insert_if:
	7328	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	7329	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7330	if (br == NULL)
	7331	br = reganode(pRExC_state, LONGJMP, 0);
	7332	else
	7333	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	7334	c = *nextchar(pRExC_state);
	7335	if (flags&HASWIDTH)
	7336	*flagp \|= HASWIDTH;
	7337	if (c == '\|') {
	7338	if (is_define)
	7339	vFAIL("(?(DEFINE)....) does not allow branches");
	7340	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	7341	regbranch(pRExC_state, &flags, 1,depth+1);
	7342	REGTAIL(pRExC_state, ret, lastbr);
	7343	if (flags&HASWIDTH)
	7344	*flagp \|= HASWIDTH;
	7345	c = *nextchar(pRExC_state);
	7346	}
	7347	else
	7348	lastbr = NULL;
	7349	if (c != ')')
	7350	vFAIL("Switch (?(condition)... contains too many branches");
	7351	ender = reg_node(pRExC_state, TAIL);
	7352	REGTAIL(pRExC_state, br, ender);
	7353	if (lastbr) {
	7354	REGTAIL(pRExC_state, lastbr, ender);
	7355	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	7356	}
	7357	else
	7358	REGTAIL(pRExC_state, ret, ender);
	7359	RExC_size++; /* XXX WHY do we need this?!!
	7360	For large programs it seems to be required
	7361	but I can't figure out why. -- dmq*/
	7362	return ret;
	7363	}
	7364	else {
	7365	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	7366	}
	7367	}
	7368	case 0:
	7369	RExC_parse--; /* for vFAIL to print correctly */
	7370	vFAIL("Sequence (? incomplete");
	7371	break;
	7372	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	7373	that follow */
	7374	has_use_defaults = TRUE;
	7375	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	7376	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	7377	? REGEX_UNICODE_CHARSET
	7378	: REGEX_DEPENDS_CHARSET);
	7379	goto parse_flags;
	7380	default:
	7381	--RExC_parse;
	7382	parse_flags: /* (?i) */
	7383	{
	7384	U32 posflags = 0, negflags = 0;
	7385	U32 *flagsp = &posflags;
	7386	char has_charset_modifier = '\0';
	7387	regex_charset cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7388	? REGEX_UNICODE_CHARSET
	7389	: REGEX_DEPENDS_CHARSET;
	7390
	7391	while (*RExC_parse) {
	7392	/* && strchr("iogcmsx", RExC_parse) /
	7393	/* (?g), (?gc) and (?o) are useless here
	7394	and must be globally applied -- japhy */
	7395	switch (*RExC_parse) {
	7396	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	7397	case LOCALE_PAT_MOD:
	7398	if (has_charset_modifier) {
	7399	goto excess_modifier;
	7400	}
	7401	else if (flagsp == &negflags) {
	7402	goto neg_modifier;
	7403	}
	7404	cs = REGEX_LOCALE_CHARSET;
	7405	has_charset_modifier = LOCALE_PAT_MOD;
	7406	RExC_contains_locale = 1;
	7407	break;
	7408	case UNICODE_PAT_MOD:
	7409	if (has_charset_modifier) {
	7410	goto excess_modifier;
	7411	}
	7412	else if (flagsp == &negflags) {
	7413	goto neg_modifier;
	7414	}
	7415	cs = REGEX_UNICODE_CHARSET;
	7416	has_charset_modifier = UNICODE_PAT_MOD;
	7417	break;
	7418	case ASCII_RESTRICT_PAT_MOD:
	7419	if (flagsp == &negflags) {
	7420	goto neg_modifier;
	7421	}
	7422	if (has_charset_modifier) {
	7423	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	7424	goto excess_modifier;
	7425	}
	7426	/* Doubled modifier implies more restricted */
	7427	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	7428	}
	7429	else {
	7430	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	7431	}
	7432	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	7433	break;
	7434	case DEPENDS_PAT_MOD:
	7435	if (has_use_defaults) {
	7436	goto fail_modifiers;
	7437	}
	7438	else if (flagsp == &negflags) {
	7439	goto neg_modifier;
	7440	}
	7441	else if (has_charset_modifier) {
	7442	goto excess_modifier;
	7443	}
	7444
	7445	/* The dual charset means unicode semantics if the
	7446	* pattern (or target, not known until runtime) are
	7447	* utf8, or something in the pattern indicates unicode
	7448	* semantics */
	7449	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7450	? REGEX_UNICODE_CHARSET
	7451	: REGEX_DEPENDS_CHARSET;
	7452	has_charset_modifier = DEPENDS_PAT_MOD;
	7453	break;
	7454	excess_modifier:
	7455	RExC_parse++;
	7456	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	7457	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	7458	}
	7459	else if (has_charset_modifier == *(RExC_parse - 1)) {
	7460	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	7461	}
	7462	else {
	7463	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	7464	}
	7465	/NOTREACHED/
	7466	neg_modifier:
	7467	RExC_parse++;
	7468	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	7469	/NOTREACHED/
	7470	case ONCE_PAT_MOD: /* 'o' */
	7471	case GLOBAL_PAT_MOD: /* 'g' */
	7472	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7473	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	7474	if (! (wastedflags & wflagbit) ) {
	7475	wastedflags \|= wflagbit;
	7476	vWARN5(
	7477	RExC_parse + 1,
	7478	"Useless (%s%c) - %suse /%c modifier",
	7479	flagsp == &negflags ? "?-" : "?",
	7480	*RExC_parse,
	7481	flagsp == &negflags ? "don't " : "",
	7482	*RExC_parse
	7483	);
	7484	}
	7485	}
	7486	break;
	7487
	7488	case CONTINUE_PAT_MOD: /* 'c' */
	7489	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7490	if (! (wastedflags & WASTED_C) ) {
	7491	wastedflags \|= WASTED_GC;
	7492	vWARN3(
	7493	RExC_parse + 1,
	7494	"Useless (%sc) - %suse /gc modifier",
	7495	flagsp == &negflags ? "?-" : "?",
	7496	flagsp == &negflags ? "don't " : ""
	7497	);
	7498	}
	7499	}
	7500	break;
	7501	case KEEPCOPY_PAT_MOD: /* 'p' */
	7502	if (flagsp == &negflags) {
	7503	if (SIZE_ONLY)
	7504	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	7505	} else {
	7506	*flagsp \|= RXf_PMf_KEEPCOPY;
	7507	}
	7508	break;
	7509	case '-':
	7510	/* A flag is a default iff it is following a minus, so
	7511	* if there is a minus, it means will be trying to
	7512	* re-specify a default which is an error */
	7513	if (has_use_defaults \|\| flagsp == &negflags) {
	7514	fail_modifiers:
	7515	RExC_parse++;
	7516	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7517	/NOTREACHED/
	7518	}
	7519	flagsp = &negflags;
	7520	wastedflags = 0; /* reset so (?g-c) warns twice */
	7521	break;
	7522	case ':':
	7523	paren = ':';
	7524	/FALLTHROUGH/
	7525	case ')':
	7526	RExC_flags \|= posflags;
	7527	RExC_flags &= ~negflags;
	7528	set_regex_charset(&RExC_flags, cs);
	7529	if (paren != ':') {
	7530	oregflags \|= posflags;
	7531	oregflags &= ~negflags;
	7532	set_regex_charset(&oregflags, cs);
	7533	}
	7534	nextchar(pRExC_state);
	7535	if (paren != ':') {
	7536	*flagp = TRYAGAIN;
	7537	return NULL;
	7538	} else {
	7539	ret = NULL;
	7540	goto parse_rest;
	7541	}
	7542	/NOTREACHED/
	7543	default:
	7544	RExC_parse++;
	7545	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7546	/NOTREACHED/
	7547	}
	7548	++RExC_parse;
	7549	}
	7550	}} /* one for the default block, one for the switch */
	7551	}
	7552	else { /* (...) */
	7553	capturing_parens:
	7554	parno = RExC_npar;
	7555	RExC_npar++;
	7556
	7557	ret = reganode(pRExC_state, OPEN, parno);
	7558	if (!SIZE_ONLY ){
	7559	if (!RExC_nestroot)
	7560	RExC_nestroot = parno;
	7561	if (RExC_seen & REG_SEEN_RECURSE
	7562	&& !RExC_open_parens[parno-1])
	7563	{
	7564	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7565	"Setting open paren #%"IVdf" to %d\n",
	7566	(IV)parno, REG_NODE_NUM(ret)));
	7567	RExC_open_parens[parno-1]= ret;
	7568	}
	7569	}
	7570	Set_Node_Length(ret, 1); /* MJD */
	7571	Set_Node_Offset(ret, RExC_parse); /* MJD */
	7572	is_open = 1;
	7573	}
	7574	}
	7575	else /* ! paren */
	7576	ret = NULL;
	7577
	7578	parse_rest:
	7579	/* Pick up the branches, linking them together. */
	7580	parse_start = RExC_parse; /* MJD */
	7581	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7582
	7583	/* branch_len = (paren != 0); */
	7584
	7585	if (br == NULL)
	7586	return(NULL);
	7587	if (*RExC_parse == '\|') {
	7588	if (!SIZE_ONLY && RExC_extralen) {
	7589	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	7590	}
	7591	else { /* MJD */
	7592	reginsert(pRExC_state, BRANCH, br, depth+1);
	7593	Set_Node_Length(br, paren != 0);
	7594	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	7595	}
	7596	have_branch = 1;
	7597	if (SIZE_ONLY)
	7598	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	7599	}
	7600	else if (paren == ':') {
	7601	*flagp \|= flags&SIMPLE;
	7602	}
	7603	if (is_open) { /* Starts with OPEN. */
	7604	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	7605	}
	7606	else if (paren != '?') /* Not Conditional */
	7607	ret = br;
	7608	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7609	lastbr = br;
	7610	while (*RExC_parse == '\|') {
	7611	if (!SIZE_ONLY && RExC_extralen) {
	7612	ender = reganode(pRExC_state, LONGJMP,0);
	7613	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	7614	}
	7615	if (SIZE_ONLY)
	7616	RExC_extralen += 2; /* Account for LONGJMP. */
	7617	nextchar(pRExC_state);
	7618	if (freeze_paren) {
	7619	if (RExC_npar > after_freeze)
	7620	after_freeze = RExC_npar;
	7621	RExC_npar = freeze_paren;
	7622	}
	7623	br = regbranch(pRExC_state, &flags, 0, depth+1);
	7624
	7625	if (br == NULL)
	7626	return(NULL);
	7627	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	7628	lastbr = br;
	7629	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7630	}
	7631
	7632	if (have_branch \|\| paren != ':') {
	7633	/* Make a closing node, and hook it on the end. */
	7634	switch (paren) {
	7635	case ':':
	7636	ender = reg_node(pRExC_state, TAIL);
	7637	break;
	7638	case 1:
	7639	ender = reganode(pRExC_state, CLOSE, parno);
	7640	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	7641	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7642	"Setting close paren #%"IVdf" to %d\n",
	7643	(IV)parno, REG_NODE_NUM(ender)));
	7644	RExC_close_parens[parno-1]= ender;
	7645	if (RExC_nestroot == parno)
	7646	RExC_nestroot = 0;
	7647	}
	7648	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	7649	Set_Node_Length(ender,1); /* MJD */
	7650	break;
	7651	case '<':
	7652	case ',':
	7653	case '=':
	7654	case '!':
	7655	*flagp &= ~HASWIDTH;
	7656	/* FALL THROUGH */
	7657	case '>':
	7658	ender = reg_node(pRExC_state, SUCCEED);
	7659	break;
	7660	case 0:
	7661	ender = reg_node(pRExC_state, END);
	7662	if (!SIZE_ONLY) {
	7663	assert(!RExC_opend); /* there can only be one! */
	7664	RExC_opend = ender;
	7665	}
	7666	break;
	7667	}
	7668	REGTAIL(pRExC_state, lastbr, ender);
	7669
	7670	if (have_branch && !SIZE_ONLY) {
	7671	if (depth==1)
	7672	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	7673
	7674	/* Hook the tails of the branches to the closing node. */
	7675	for (br = ret; br; br = regnext(br)) {
	7676	const U8 op = PL_regkind[OP(br)];
	7677	if (op == BRANCH) {
	7678	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	7679	}
	7680	else if (op == BRANCHJ) {
	7681	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	7682	}
	7683	}
	7684	}
	7685	}
	7686
	7687	{
	7688	const char *p;
	7689	static const char parens[] = "=!<,>";
	7690
	7691	if (paren && (p = strchr(parens, paren))) {
	7692	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	7693	int flag = (p - parens) > 1;
	7694
	7695	if (paren == '>')
	7696	node = SUSPEND, flag = 0;
	7697	reginsert(pRExC_state, node,ret, depth+1);
	7698	Set_Node_Cur_Length(ret);
	7699	Set_Node_Offset(ret, parse_start + 1);
	7700	ret->flags = flag;
	7701	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	7702	}
	7703	}
	7704
	7705	/* Check for proper termination. */
	7706	if (paren) {
	7707	RExC_flags = oregflags;
	7708	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	7709	RExC_parse = oregcomp_parse;
	7710	vFAIL("Unmatched (");
	7711	}
	7712	}
	7713	else if (!paren && RExC_parse < RExC_end) {
	7714	if (*RExC_parse == ')') {
	7715	RExC_parse++;
	7716	vFAIL("Unmatched )");
	7717	}
	7718	else
	7719	FAIL("Junk on end of regexp"); /* "Can't happen". */
	7720	/* NOTREACHED */
	7721	}
	7722
	7723	if (RExC_in_lookbehind) {
	7724	RExC_in_lookbehind--;
	7725	}
	7726	if (after_freeze > RExC_npar)
	7727	RExC_npar = after_freeze;
	7728	return(ret);
	7729	}
	7730
	7731	/*
	7732	- regbranch - one alternative of an | operator
	7733	*
	7734	* Implements the concatenation operator.
	7735	*/
	7736	STATIC regnode *
	7737	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	7738	{
	7739	dVAR;
	7740	register regnode *ret;
	7741	register regnode *chain = NULL;
	7742	register regnode *latest;
	7743	I32 flags = 0, c = 0;
	7744	GET_RE_DEBUG_FLAGS_DECL;
	7745
	7746	PERL_ARGS_ASSERT_REGBRANCH;
	7747
	7748	DEBUG_PARSE("brnc");
	7749
	7750	if (first)
	7751	ret = NULL;
	7752	else {
	7753	if (!SIZE_ONLY && RExC_extralen)
	7754	ret = reganode(pRExC_state, BRANCHJ,0);
	7755	else {
	7756	ret = reg_node(pRExC_state, BRANCH);
	7757	Set_Node_Length(ret, 1);
	7758	}
	7759	}
	7760
	7761	if (!first && SIZE_ONLY)
	7762	RExC_extralen += 1; /* BRANCHJ */
	7763
	7764	flagp = WORST; / Tentatively. */
	7765
	7766	RExC_parse--;
	7767	nextchar(pRExC_state);
	7768	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	7769	flags &= ~TRYAGAIN;
	7770	latest = regpiece(pRExC_state, &flags,depth+1);
	7771	if (latest == NULL) {
	7772	if (flags & TRYAGAIN)
	7773	continue;
	7774	return(NULL);
	7775	}
	7776	else if (ret == NULL)
	7777	ret = latest;
	7778	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	7779	if (chain == NULL) /* First piece. */
	7780	*flagp \|= flags&SPSTART;
	7781	else {
	7782	RExC_naughty++;
	7783	REGTAIL(pRExC_state, chain, latest);
	7784	}
	7785	chain = latest;
	7786	c++;
	7787	}
	7788	if (chain == NULL) { /* Loop ran zero times. */
	7789	chain = reg_node(pRExC_state, NOTHING);
	7790	if (ret == NULL)
	7791	ret = chain;
	7792	}
	7793	if (c == 1) {
	7794	*flagp \|= flags&SIMPLE;
	7795	}
	7796
	7797	return ret;
	7798	}
	7799
	7800	/*
	7801	- regpiece - something followed by possible [*+?]
	7802	*
	7803	* Note that the branching code sequences used for ? and the general cases
	7804	* of * and + are somewhat optimized: they use the same NOTHING node as
	7805	* both the endmarker for their branch list and the body of the last branch.
	7806	* It might seem that this node could be dispensed with entirely, but the
	7807	* endmarker role is not redundant.
	7808	*/
	7809	STATIC regnode *
	7810	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	7811	{
	7812	dVAR;
	7813	register regnode *ret;
	7814	register char op;
	7815	register char *next;
	7816	I32 flags;
	7817	const char * const origparse = RExC_parse;
	7818	I32 min;
	7819	I32 max = REG_INFTY;
	7820	#ifdef RE_TRACK_PATTERN_OFFSETS
	7821	char *parse_start;
	7822	#endif
	7823	const char *maxpos = NULL;
	7824	GET_RE_DEBUG_FLAGS_DECL;
	7825
	7826	PERL_ARGS_ASSERT_REGPIECE;
	7827
	7828	DEBUG_PARSE("piec");
	7829
	7830	ret = regatom(pRExC_state, &flags,depth+1);
	7831	if (ret == NULL) {
	7832	if (flags & TRYAGAIN)
	7833	*flagp \|= TRYAGAIN;
	7834	return(NULL);
	7835	}
	7836
	7837	op = *RExC_parse;
	7838
	7839	if (op == '{' && regcurly(RExC_parse)) {
	7840	maxpos = NULL;
	7841	#ifdef RE_TRACK_PATTERN_OFFSETS
	7842	parse_start = RExC_parse; /* MJD */
	7843	#endif
	7844	next = RExC_parse + 1;
	7845	while (isDIGIT(next) \|\| next == ',') {
	7846	if (*next == ',') {
	7847	if (maxpos)
	7848	break;
	7849	else
	7850	maxpos = next;
	7851	}
	7852	next++;
	7853	}
	7854	if (next == '}') { / got one */
	7855	if (!maxpos)
	7856	maxpos = next;
	7857	RExC_parse++;
	7858	min = atoi(RExC_parse);
	7859	if (*maxpos == ',')
	7860	maxpos++;
	7861	else
	7862	maxpos = RExC_parse;
	7863	max = atoi(maxpos);
	7864	if (!max && *maxpos != '0')
	7865	max = REG_INFTY; /* meaning "infinity" */
	7866	else if (max >= REG_INFTY)
	7867	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	7868	RExC_parse = next;
	7869	nextchar(pRExC_state);
	7870
	7871	do_curly:
	7872	if ((flags&SIMPLE)) {
	7873	RExC_naughty += 2 + RExC_naughty / 2;
	7874	reginsert(pRExC_state, CURLY, ret, depth+1);
	7875	Set_Node_Offset(ret, parse_start+1); /* MJD */
	7876	Set_Node_Cur_Length(ret);
	7877	}
	7878	else {
	7879	regnode * const w = reg_node(pRExC_state, WHILEM);
	7880
	7881	w->flags = 0;
	7882	REGTAIL(pRExC_state, ret, w);
	7883	if (!SIZE_ONLY && RExC_extralen) {
	7884	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	7885	reginsert(pRExC_state, NOTHING,ret, depth+1);
	7886	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	7887	}
	7888	reginsert(pRExC_state, CURLYX,ret, depth+1);
	7889	/* MJD hk */
	7890	Set_Node_Offset(ret, parse_start+1);
	7891	Set_Node_Length(ret,
	7892	op == '{' ? (RExC_parse - parse_start) : 1);
	7893
	7894	if (!SIZE_ONLY && RExC_extralen)
	7895	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	7896	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	7897	if (SIZE_ONLY)
	7898	RExC_whilem_seen++, RExC_extralen += 3;
	7899	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	7900	}
	7901	ret->flags = 0;
	7902
	7903	if (min > 0)
	7904	*flagp = WORST;
	7905	if (max > 0)
	7906	*flagp \|= HASWIDTH;
	7907	if (max < min)
	7908	vFAIL("Can't do {n,m} with n > m");
	7909	if (!SIZE_ONLY) {
	7910	ARG1_SET(ret, (U16)min);
	7911	ARG2_SET(ret, (U16)max);
	7912	}
	7913
	7914	goto nest_check;
	7915	}
	7916	}
	7917
	7918	if (!ISMULT1(op)) {
	7919	*flagp = flags;
	7920	return(ret);
	7921	}
	7922
	7923	#if 0 /* Now runtime fix should be reliable. */
	7924
	7925	/* if this is reinstated, don't forget to put this back into perldiag:
	7926
	7927	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	7928
	7929	(F) The part of the regexp subject to either the * or + quantifier
	7930	could match an empty string. The {#} shows in the regular
	7931	expression about where the problem was discovered.
	7932
	7933	*/
	7934
	7935	if (!(flags&HASWIDTH) && op != '?')
	7936	vFAIL("Regexp *+ operand could be empty");
	7937	#endif
	7938
	7939	#ifdef RE_TRACK_PATTERN_OFFSETS
	7940	parse_start = RExC_parse;
	7941	#endif
	7942	nextchar(pRExC_state);
	7943
	7944	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	7945
	7946	if (op == '*' && (flags&SIMPLE)) {
	7947	reginsert(pRExC_state, STAR, ret, depth+1);
	7948	ret->flags = 0;
	7949	RExC_naughty += 4;
	7950	}
	7951	else if (op == '*') {
	7952	min = 0;
	7953	goto do_curly;
	7954	}
	7955	else if (op == '+' && (flags&SIMPLE)) {
	7956	reginsert(pRExC_state, PLUS, ret, depth+1);
	7957	ret->flags = 0;
	7958	RExC_naughty += 3;
	7959	}
	7960	else if (op == '+') {
	7961	min = 1;
	7962	goto do_curly;
	7963	}
	7964	else if (op == '?') {
	7965	min = 0; max = 1;
	7966	goto do_curly;
	7967	}
	7968	nest_check:
	7969	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	7970	ckWARN3reg(RExC_parse,
	7971	"%.*s matches null string many times",
	7972	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	7973	origparse);
	7974	}
	7975
	7976	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	7977	nextchar(pRExC_state);
	7978	reginsert(pRExC_state, MINMOD, ret, depth+1);
	7979	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	7980	}
	7981	#ifndef REG_ALLOW_MINMOD_SUSPEND
	7982	else
	7983	#endif
	7984	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	7985	regnode *ender;
	7986	nextchar(pRExC_state);
	7987	ender = reg_node(pRExC_state, SUCCEED);
	7988	REGTAIL(pRExC_state, ret, ender);
	7989	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	7990	ret->flags = 0;
	7991	ender = reg_node(pRExC_state, TAIL);
	7992	REGTAIL(pRExC_state, ret, ender);
	7993	/ret= ender;/
	7994	}
	7995
	7996	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	7997	RExC_parse++;
	7998	vFAIL("Nested quantifiers");
	7999	}
	8000
	8001	return(ret);
	8002	}
	8003
	8004
	8005	/* reg_namedseq(pRExC_state,UVp, UV depth)
	8006
	8007	This is expected to be called by a parser routine that has
	8008	recognized '\N' and needs to handle the rest. RExC_parse is
	8009	expected to point at the first char following the N at the time
	8010	of the call.
	8011
	8012	The \N may be inside (indicated by valuep not being NULL) or outside a
	8013	character class.
	8014
	8015	\N may begin either a named sequence, or if outside a character class, mean
	8016	to match a non-newline. For non single-quoted regexes, the tokenizer has
	8017	attempted to decide which, and in the case of a named sequence converted it
	8018	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	8019	where c1... are the characters in the sequence. For single-quoted regexes,
	8020	the tokenizer passes the \N sequence through unchanged; this code will not
	8021	attempt to determine this nor expand those. The net effect is that if the
	8022	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	8023	signals that this \N occurrence means to match a non-newline.
	8024
	8025	Only the \N{U+...} form should occur in a character class, for the same
	8026	reason that '.' inside a character class means to just match a period: it
	8027	just doesn't make sense.
	8028
	8029	If valuep is non-null then it is assumed that we are parsing inside
	8030	of a charclass definition and the first codepoint in the resolved
	8031	string is returned via *valuep and the routine will return NULL.
	8032	In this mode if a multichar string is returned from the charnames
	8033	handler, a warning will be issued, and only the first char in the
	8034	sequence will be examined. If the string returned is zero length
	8035	then the value of *valuep is undefined and NON-NULL will
	8036	be returned to indicate failure. (This will NOT be a valid pointer
	8037	to a regnode.)
	8038
	8039	If valuep is null then it is assumed that we are parsing normal text and a
	8040	new EXACT node is inserted into the program containing the resolved string,
	8041	and a pointer to the new node is returned. But if the string is zero length
	8042	a NOTHING node is emitted instead.
	8043
	8044	On success RExC_parse is set to the char following the endbrace.
	8045	Parsing failures will generate a fatal error via vFAIL(...)
	8046	*/
	8047	STATIC regnode *
	8048	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp, U32 depth)
	8049	{
	8050	char * endbrace; /* '}' following the name */
	8051	regnode *ret = NULL;
	8052	char* p;
	8053
	8054	GET_RE_DEBUG_FLAGS_DECL;
	8055
	8056	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	8057
	8058	GET_RE_DEBUG_FLAGS;
	8059
	8060	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	8061	* modifier. The other meaning does not */
	8062	p = (RExC_flags & RXf_PMf_EXTENDED)
	8063	? regwhite( pRExC_state, RExC_parse )
	8064	: RExC_parse;
	8065
	8066	/* Disambiguate between \N meaning a named character versus \N meaning
	8067	* [^\n]. The former is assumed when it can't be the latter. */
	8068	if (*p != '{' \|\| regcurly(p)) {
	8069	RExC_parse = p;
	8070	if (valuep) {
	8071	/* no bare \N in a charclass */
	8072	vFAIL("\\N in a character class must be a named character: \\N{...}");
	8073	}
	8074	nextchar(pRExC_state);
	8075	ret = reg_node(pRExC_state, REG_ANY);
	8076	*flagp \|= HASWIDTH\|SIMPLE;
	8077	RExC_naughty++;
	8078	RExC_parse--;
	8079	Set_Node_Length(ret, 1); /* MJD */
	8080	return ret;
	8081	}
	8082
	8083	/* Here, we have decided it should be a named sequence */
	8084
	8085	/* The test above made sure that the next real character is a '{', but
	8086	* under the /x modifier, it could be separated by space (or a comment and
	8087	* \n) and this is not allowed (for consistency with \x{...} and the
	8088	* tokenizer handling of \N{NAME}). */
	8089	if (*RExC_parse != '{') {
	8090	vFAIL("Missing braces on \\N{}");
	8091	}
	8092
	8093	RExC_parse++; /* Skip past the '{' */
	8094
	8095	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	8096	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	8097	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	8098	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	8099	{
	8100	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	8101	vFAIL("\\N{NAME} must be resolved by the lexer");
	8102	}
	8103
	8104	if (endbrace == RExC_parse) { /* empty: \N{} */
	8105	if (! valuep) {
	8106	RExC_parse = endbrace + 1;
	8107	return reg_node(pRExC_state,NOTHING);
	8108	}
	8109
	8110	if (SIZE_ONLY) {
	8111	ckWARNreg(RExC_parse,
	8112	"Ignoring zero length \\N{} in character class"
	8113	);
	8114	RExC_parse = endbrace + 1;
	8115	}
	8116	*valuep = 0;
	8117	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	8118	}
	8119
	8120	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	8121	RExC_parse += 2; /* Skip past the 'U+' */
	8122
	8123	if (valuep) { /* In a bracketed char class */
	8124	/* We only pay attention to the first char of
	8125	multichar strings being returned. I kinda wonder
	8126	if this makes sense as it does change the behaviour
	8127	from earlier versions, OTOH that behaviour was broken
	8128	as well. XXX Solution is to recharacterize as
	8129	[rest-of-class]\|multi1\|multi2... */
	8130
	8131	STRLEN length_of_hex;
	8132	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8133	\| PERL_SCAN_DISALLOW_PREFIX
	8134	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	8135
	8136	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8137	if (endchar < endbrace) {
	8138	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	8139	}
	8140
	8141	length_of_hex = (STRLEN)(endchar - RExC_parse);
	8142	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	8143
	8144	/* The tokenizer should have guaranteed validity, but it's possible to
	8145	* bypass it by using single quoting, so check */
	8146	if (length_of_hex == 0
	8147	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	8148	{
	8149	RExC_parse += length_of_hex; /* Includes all the valid */
	8150	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	8151	? UTF8SKIP(RExC_parse)
	8152	: 1;
	8153	/* Guard against malformed utf8 */
	8154	if (RExC_parse >= endchar) RExC_parse = endchar;
	8155	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8156	}
	8157
	8158	RExC_parse = endbrace + 1;
	8159	if (endchar == endbrace) return NULL;
	8160
	8161	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	8162	}
	8163	else { /* Not a char class */
	8164
	8165	/* What is done here is to convert this to a sub-pattern of the form
	8166	* (?:\x{char1}\x{char2}...)
	8167	* and then call reg recursively. That way, it retains its atomicness,
	8168	* while not having to worry about special handling that some code
	8169	* points may have. toke.c has converted the original Unicode values
	8170	* to native, so that we can just pass on the hex values unchanged. We
	8171	* do have to set a flag to keep recoding from happening in the
	8172	* recursion */
	8173
	8174	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	8175	STRLEN len;
	8176	char endchar; / Points to '.' or '}' ending cur char in the input
	8177	stream */
	8178	char *orig_end = RExC_end;
	8179
	8180	while (RExC_parse < endbrace) {
	8181
	8182	/* Code points are separated by dots. If none, there is only one
	8183	* code point, and is terminated by the brace */
	8184	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	8185
	8186	/* Convert to notation the rest of the code understands */
	8187	sv_catpv(substitute_parse, "\\x{");
	8188	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	8189	sv_catpv(substitute_parse, "}");
	8190
	8191	/* Point to the beginning of the next character in the sequence. */
	8192	RExC_parse = endchar + 1;
	8193	}
	8194	sv_catpv(substitute_parse, ")");
	8195
	8196	RExC_parse = SvPV(substitute_parse, len);
	8197
	8198	/* Don't allow empty number */
	8199	if (len < 8) {
	8200	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	8201	}
	8202	RExC_end = RExC_parse + len;
	8203
	8204	/* The values are Unicode, and therefore not subject to recoding */
	8205	RExC_override_recoding = 1;
	8206
	8207	ret = reg(pRExC_state, 1, flagp, depth+1);
	8208
	8209	RExC_parse = endbrace;
	8210	RExC_end = orig_end;
	8211	RExC_override_recoding = 0;
	8212
	8213	nextchar(pRExC_state);
	8214	}
	8215
	8216	return ret;
	8217	}
	8218
	8219
	8220	/*
	8221	* reg_recode
	8222	*
	8223	* It returns the code point in utf8 for the value in *encp.
	8224	* value: a code value in the source encoding
	8225	* encp: a pointer to an Encode object
	8226	*
	8227	* If the result from Encode is not a single character,
	8228	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	8229	*/
	8230	STATIC UV
	8231	S_reg_recode(pTHX_ const char value, SV **encp)
	8232	{
	8233	STRLEN numlen = 1;
	8234	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	8235	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	8236	const STRLEN newlen = SvCUR(sv);
	8237	UV uv = UNICODE_REPLACEMENT;
	8238
	8239	PERL_ARGS_ASSERT_REG_RECODE;
	8240
	8241	if (newlen)
	8242	uv = SvUTF8(sv)
	8243	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	8244	: (U8)s;
	8245
	8246	if (!newlen \|\| numlen != newlen) {
	8247	uv = UNICODE_REPLACEMENT;
	8248	*encp = NULL;
	8249	}
	8250	return uv;
	8251	}
	8252
	8253
	8254	/*
	8255	- regatom - the lowest level
	8256
	8257	Try to identify anything special at the start of the pattern. If there
	8258	is, then handle it as required. This may involve generating a single regop,
	8259	such as for an assertion; or it may involve recursing, such as to
	8260	handle a () structure.
	8261
	8262	If the string doesn't start with something special then we gobble up
	8263	as much literal text as we can.
	8264
	8265	Once we have been able to handle whatever type of thing started the
	8266	sequence, we return.
	8267
	8268	Note: we have to be careful with escapes, as they can be both literal
	8269	and special, and in the case of \10 and friends can be either, depending
	8270	on context. Specifically there are two separate switches for handling
	8271	escape sequences, with the one for handling literal escapes requiring
	8272	a dummy entry for all of the special escapes that are actually handled
	8273	by the other.
	8274	*/
	8275
	8276	STATIC regnode *
	8277	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8278	{
	8279	dVAR;
	8280	register regnode *ret = NULL;
	8281	I32 flags;
	8282	char *parse_start = RExC_parse;
	8283	U8 op;
	8284	GET_RE_DEBUG_FLAGS_DECL;
	8285	DEBUG_PARSE("atom");
	8286	flagp = WORST; / Tentatively. */
	8287
	8288	PERL_ARGS_ASSERT_REGATOM;
	8289
	8290	tryagain:
	8291	switch ((U8)*RExC_parse) {
	8292	case '^':
	8293	RExC_seen_zerolen++;
	8294	nextchar(pRExC_state);
	8295	if (RExC_flags & RXf_PMf_MULTILINE)
	8296	ret = reg_node(pRExC_state, MBOL);
	8297	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8298	ret = reg_node(pRExC_state, SBOL);
	8299	else
	8300	ret = reg_node(pRExC_state, BOL);
	8301	Set_Node_Length(ret, 1); /* MJD */
	8302	break;
	8303	case '$':
	8304	nextchar(pRExC_state);
	8305	if (*RExC_parse)
	8306	RExC_seen_zerolen++;
	8307	if (RExC_flags & RXf_PMf_MULTILINE)
	8308	ret = reg_node(pRExC_state, MEOL);
	8309	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8310	ret = reg_node(pRExC_state, SEOL);
	8311	else
	8312	ret = reg_node(pRExC_state, EOL);
	8313	Set_Node_Length(ret, 1); /* MJD */
	8314	break;
	8315	case '.':
	8316	nextchar(pRExC_state);
	8317	if (RExC_flags & RXf_PMf_SINGLELINE)
	8318	ret = reg_node(pRExC_state, SANY);
	8319	else
	8320	ret = reg_node(pRExC_state, REG_ANY);
	8321	*flagp \|= HASWIDTH\|SIMPLE;
	8322	RExC_naughty++;
	8323	Set_Node_Length(ret, 1); /* MJD */
	8324	break;
	8325	case '[':
	8326	{
	8327	char * const oregcomp_parse = ++RExC_parse;
	8328	ret = regclass(pRExC_state,depth+1);
	8329	if (*RExC_parse != ']') {
	8330	RExC_parse = oregcomp_parse;
	8331	vFAIL("Unmatched [");
	8332	}
	8333	nextchar(pRExC_state);
	8334	*flagp \|= HASWIDTH\|SIMPLE;
	8335	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	8336	break;
	8337	}
	8338	case '(':
	8339	nextchar(pRExC_state);
	8340	ret = reg(pRExC_state, 1, &flags,depth+1);
	8341	if (ret == NULL) {
	8342	if (flags & TRYAGAIN) {
	8343	if (RExC_parse == RExC_end) {
	8344	/* Make parent create an empty node if needed. */
	8345	*flagp \|= TRYAGAIN;
	8346	return(NULL);
	8347	}
	8348	goto tryagain;
	8349	}
	8350	return(NULL);
	8351	}
	8352	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	8353	break;
	8354	case '\|':
	8355	case ')':
	8356	if (flags & TRYAGAIN) {
	8357	*flagp \|= TRYAGAIN;
	8358	return NULL;
	8359	}
	8360	vFAIL("Internal urp");
	8361	/* Supposed to be caught earlier. */
	8362	break;
	8363	case '{':
	8364	if (!regcurly(RExC_parse)) {
	8365	RExC_parse++;
	8366	goto defchar;
	8367	}
	8368	/* FALL THROUGH */
	8369	case '?':
	8370	case '+':
	8371	case '*':
	8372	RExC_parse++;
	8373	vFAIL("Quantifier follows nothing");
	8374	break;
	8375	case '\\':
	8376	/* Special Escapes
	8377
	8378	This switch handles escape sequences that resolve to some kind
	8379	of special regop and not to literal text. Escape sequnces that
	8380	resolve to literal text are handled below in the switch marked
	8381	"Literal Escapes".
	8382
	8383	Every entry in this switch must have a corresponding entry
	8384	in the literal escape switch. However, the opposite is not
	8385	required, as the default for this switch is to jump to the
	8386	literal text handling code.
	8387	*/
	8388	switch ((U8)*++RExC_parse) {
	8389	/* Special Escapes */
	8390	case 'A':
	8391	RExC_seen_zerolen++;
	8392	ret = reg_node(pRExC_state, SBOL);
	8393	*flagp \|= SIMPLE;
	8394	goto finish_meta_pat;
	8395	case 'G':
	8396	ret = reg_node(pRExC_state, GPOS);
	8397	RExC_seen \|= REG_SEEN_GPOS;
	8398	*flagp \|= SIMPLE;
	8399	goto finish_meta_pat;
	8400	case 'K':
	8401	RExC_seen_zerolen++;
	8402	ret = reg_node(pRExC_state, KEEPS);
	8403	*flagp \|= SIMPLE;
	8404	/* XXX:dmq : disabling in-place substitution seems to
	8405	* be necessary here to avoid cases of memory corruption, as
	8406	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	8407	*/
	8408	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8409	goto finish_meta_pat;
	8410	case 'Z':
	8411	ret = reg_node(pRExC_state, SEOL);
	8412	*flagp \|= SIMPLE;
	8413	RExC_seen_zerolen++; /* Do not optimize RE away */
	8414	goto finish_meta_pat;
	8415	case 'z':
	8416	ret = reg_node(pRExC_state, EOS);
	8417	*flagp \|= SIMPLE;
	8418	RExC_seen_zerolen++; /* Do not optimize RE away */
	8419	goto finish_meta_pat;
	8420	case 'C':
	8421	ret = reg_node(pRExC_state, CANY);
	8422	RExC_seen \|= REG_SEEN_CANY;
	8423	*flagp \|= HASWIDTH\|SIMPLE;
	8424	goto finish_meta_pat;
	8425	case 'X':
	8426	ret = reg_node(pRExC_state, CLUMP);
	8427	*flagp \|= HASWIDTH;
	8428	goto finish_meta_pat;
	8429	case 'w':
	8430	switch (get_regex_charset(RExC_flags)) {
	8431	case REGEX_LOCALE_CHARSET:
	8432	op = ALNUML;
	8433	break;
	8434	case REGEX_UNICODE_CHARSET:
	8435	op = ALNUMU;
	8436	break;
	8437	case REGEX_ASCII_RESTRICTED_CHARSET:
	8438	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8439	op = ALNUMA;
	8440	break;
	8441	case REGEX_DEPENDS_CHARSET:
	8442	op = ALNUM;
	8443	break;
	8444	default:
	8445	goto bad_charset;
	8446	}
	8447	ret = reg_node(pRExC_state, op);
	8448	*flagp \|= HASWIDTH\|SIMPLE;
	8449	goto finish_meta_pat;
	8450	case 'W':
	8451	switch (get_regex_charset(RExC_flags)) {
	8452	case REGEX_LOCALE_CHARSET:
	8453	op = NALNUML;
	8454	break;
	8455	case REGEX_UNICODE_CHARSET:
	8456	op = NALNUMU;
	8457	break;
	8458	case REGEX_ASCII_RESTRICTED_CHARSET:
	8459	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8460	op = NALNUMA;
	8461	break;
	8462	case REGEX_DEPENDS_CHARSET:
	8463	op = NALNUM;
	8464	break;
	8465	default:
	8466	goto bad_charset;
	8467	}
	8468	ret = reg_node(pRExC_state, op);
	8469	*flagp \|= HASWIDTH\|SIMPLE;
	8470	goto finish_meta_pat;
	8471	case 'b':
	8472	RExC_seen_zerolen++;
	8473	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8474	switch (get_regex_charset(RExC_flags)) {
	8475	case REGEX_LOCALE_CHARSET:
	8476	op = BOUNDL;
	8477	break;
	8478	case REGEX_UNICODE_CHARSET:
	8479	op = BOUNDU;
	8480	break;
	8481	case REGEX_ASCII_RESTRICTED_CHARSET:
	8482	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8483	op = BOUNDA;
	8484	break;
	8485	case REGEX_DEPENDS_CHARSET:
	8486	op = BOUND;
	8487	break;
	8488	default:
	8489	goto bad_charset;
	8490	}
	8491	ret = reg_node(pRExC_state, op);
	8492	FLAGS(ret) = get_regex_charset(RExC_flags);
	8493	*flagp \|= SIMPLE;
	8494	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8495	ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
	8496	}
	8497	goto finish_meta_pat;
	8498	case 'B':
	8499	RExC_seen_zerolen++;
	8500	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8501	switch (get_regex_charset(RExC_flags)) {
	8502	case REGEX_LOCALE_CHARSET:
	8503	op = NBOUNDL;
	8504	break;
	8505	case REGEX_UNICODE_CHARSET:
	8506	op = NBOUNDU;
	8507	break;
	8508	case REGEX_ASCII_RESTRICTED_CHARSET:
	8509	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8510	op = NBOUNDA;
	8511	break;
	8512	case REGEX_DEPENDS_CHARSET:
	8513	op = NBOUND;
	8514	break;
	8515	default:
	8516	goto bad_charset;
	8517	}
	8518	ret = reg_node(pRExC_state, op);
	8519	FLAGS(ret) = get_regex_charset(RExC_flags);
	8520	*flagp \|= SIMPLE;
	8521	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8522	ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
	8523	}
	8524	goto finish_meta_pat;
	8525	case 's':
	8526	switch (get_regex_charset(RExC_flags)) {
	8527	case REGEX_LOCALE_CHARSET:
	8528	op = SPACEL;
	8529	break;
	8530	case REGEX_UNICODE_CHARSET:
	8531	op = SPACEU;
	8532	break;
	8533	case REGEX_ASCII_RESTRICTED_CHARSET:
	8534	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8535	op = SPACEA;
	8536	break;
	8537	case REGEX_DEPENDS_CHARSET:
	8538	op = SPACE;
	8539	break;
	8540	default:
	8541	goto bad_charset;
	8542	}
	8543	ret = reg_node(pRExC_state, op);
	8544	*flagp \|= HASWIDTH\|SIMPLE;
	8545	goto finish_meta_pat;
	8546	case 'S':
	8547	switch (get_regex_charset(RExC_flags)) {
	8548	case REGEX_LOCALE_CHARSET:
	8549	op = NSPACEL;
	8550	break;
	8551	case REGEX_UNICODE_CHARSET:
	8552	op = NSPACEU;
	8553	break;
	8554	case REGEX_ASCII_RESTRICTED_CHARSET:
	8555	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8556	op = NSPACEA;
	8557	break;
	8558	case REGEX_DEPENDS_CHARSET:
	8559	op = NSPACE;
	8560	break;
	8561	default:
	8562	goto bad_charset;
	8563	}
	8564	ret = reg_node(pRExC_state, op);
	8565	*flagp \|= HASWIDTH\|SIMPLE;
	8566	goto finish_meta_pat;
	8567	case 'd':
	8568	switch (get_regex_charset(RExC_flags)) {
	8569	case REGEX_LOCALE_CHARSET:
	8570	op = DIGITL;
	8571	break;
	8572	case REGEX_ASCII_RESTRICTED_CHARSET:
	8573	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8574	op = DIGITA;
	8575	break;
	8576	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8577	case REGEX_UNICODE_CHARSET:
	8578	op = DIGIT;
	8579	break;
	8580	default:
	8581	goto bad_charset;
	8582	}
	8583	ret = reg_node(pRExC_state, op);
	8584	*flagp \|= HASWIDTH\|SIMPLE;
	8585	goto finish_meta_pat;
	8586	case 'D':
	8587	switch (get_regex_charset(RExC_flags)) {
	8588	case REGEX_LOCALE_CHARSET:
	8589	op = NDIGITL;
	8590	break;
	8591	case REGEX_ASCII_RESTRICTED_CHARSET:
	8592	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8593	op = NDIGITA;
	8594	break;
	8595	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8596	case REGEX_UNICODE_CHARSET:
	8597	op = NDIGIT;
	8598	break;
	8599	default:
	8600	goto bad_charset;
	8601	}
	8602	ret = reg_node(pRExC_state, op);
	8603	*flagp \|= HASWIDTH\|SIMPLE;
	8604	goto finish_meta_pat;
	8605	case 'R':
	8606	ret = reg_node(pRExC_state, LNBREAK);
	8607	*flagp \|= HASWIDTH\|SIMPLE;
	8608	goto finish_meta_pat;
	8609	case 'h':
	8610	ret = reg_node(pRExC_state, HORIZWS);
	8611	*flagp \|= HASWIDTH\|SIMPLE;
	8612	goto finish_meta_pat;
	8613	case 'H':
	8614	ret = reg_node(pRExC_state, NHORIZWS);
	8615	*flagp \|= HASWIDTH\|SIMPLE;
	8616	goto finish_meta_pat;
	8617	case 'v':
	8618	ret = reg_node(pRExC_state, VERTWS);
	8619	*flagp \|= HASWIDTH\|SIMPLE;
	8620	goto finish_meta_pat;
	8621	case 'V':
	8622	ret = reg_node(pRExC_state, NVERTWS);
	8623	*flagp \|= HASWIDTH\|SIMPLE;
	8624	finish_meta_pat:
	8625	nextchar(pRExC_state);
	8626	Set_Node_Length(ret, 2); /* MJD */
	8627	break;
	8628	case 'p':
	8629	case 'P':
	8630	{
	8631	char* const oldregxend = RExC_end;
	8632	#ifdef DEBUGGING
	8633	char* parse_start = RExC_parse - 2;
	8634	#endif
	8635
	8636	if (RExC_parse[1] == '{') {
	8637	/* a lovely hack--pretend we saw [\pX] instead */
	8638	RExC_end = strchr(RExC_parse, '}');
	8639	if (!RExC_end) {
	8640	const U8 c = (U8)*RExC_parse;
	8641	RExC_parse += 2;
	8642	RExC_end = oldregxend;
	8643	vFAIL2("Missing right brace on \\%c{}", c);
	8644	}
	8645	RExC_end++;
	8646	}
	8647	else {
	8648	RExC_end = RExC_parse + 2;
	8649	if (RExC_end > oldregxend)
	8650	RExC_end = oldregxend;
	8651	}
	8652	RExC_parse--;
	8653
	8654	ret = regclass(pRExC_state,depth+1);
	8655
	8656	RExC_end = oldregxend;
	8657	RExC_parse--;
	8658
	8659	Set_Node_Offset(ret, parse_start + 2);
	8660	Set_Node_Cur_Length(ret);
	8661	nextchar(pRExC_state);
	8662	*flagp \|= HASWIDTH\|SIMPLE;
	8663	}
	8664	break;
	8665	case 'N':
	8666	/* Handle \N and \N{NAME} here and not below because it can be
	8667	multicharacter. join_exact() will join them up later on.
	8668	Also this makes sure that things like /\N{BLAH}+/ and
	8669	\N{BLAH} being multi char Just Happen. dmq*/
	8670	++RExC_parse;
	8671	ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
	8672	break;
	8673	case 'k': /* Handle \k<NAME> and \k'NAME' */
	8674	parse_named_seq:
	8675	{
	8676	char ch= RExC_parse[1];
	8677	if (ch != '<' && ch != '\'' && ch != '{') {
	8678	RExC_parse++;
	8679	vFAIL2("Sequence %.2s... not terminated",parse_start);
	8680	} else {
	8681	/* this pretty much dupes the code for (?P=...) in reg(), if
	8682	you change this make sure you change that */
	8683	char* name_start = (RExC_parse += 2);
	8684	U32 num = 0;
	8685	SV *sv_dat = reg_scan_name(pRExC_state,
	8686	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8687	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	8688	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	8689	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8690
	8691	if (!SIZE_ONLY) {
	8692	num = add_data( pRExC_state, 1, "S" );
	8693	RExC_rxi->data->data[num]=(void*)sv_dat;
	8694	SvREFCNT_inc_simple_void(sv_dat);
	8695	}
	8696
	8697	RExC_sawback = 1;
	8698	ret = reganode(pRExC_state,
	8699	((! FOLD)
	8700	? NREF
	8701	: (MORE_ASCII_RESTRICTED)
	8702	? NREFFA
	8703	: (AT_LEAST_UNI_SEMANTICS)
	8704	? NREFFU
	8705	: (LOC)
	8706	? NREFFL
	8707	: NREFF),
	8708	num);
	8709	*flagp \|= HASWIDTH;
	8710
	8711	/* override incorrect value set in reganode MJD */
	8712	Set_Node_Offset(ret, parse_start+1);
	8713	Set_Node_Cur_Length(ret); /* MJD */
	8714	nextchar(pRExC_state);
	8715
	8716	}
	8717	break;
	8718	}
	8719	case 'g':
	8720	case '1': case '2': case '3': case '4':
	8721	case '5': case '6': case '7': case '8': case '9':
	8722	{
	8723	I32 num;
	8724	bool isg = *RExC_parse == 'g';
	8725	bool isrel = 0;
	8726	bool hasbrace = 0;
	8727	if (isg) {
	8728	RExC_parse++;
	8729	if (*RExC_parse == '{') {
	8730	RExC_parse++;
	8731	hasbrace = 1;
	8732	}
	8733	if (*RExC_parse == '-') {
	8734	RExC_parse++;
	8735	isrel = 1;
	8736	}
	8737	if (hasbrace && !isDIGIT(*RExC_parse)) {
	8738	if (isrel) RExC_parse--;
	8739	RExC_parse -= 2;
	8740	goto parse_named_seq;
	8741	} }
	8742	num = atoi(RExC_parse);
	8743	if (isg && num == 0)
	8744	vFAIL("Reference to invalid group 0");
	8745	if (isrel) {
	8746	num = RExC_npar - num;
	8747	if (num < 1)
	8748	vFAIL("Reference to nonexistent or unclosed group");
	8749	}
	8750	if (!isg && num > 9 && num >= RExC_npar)
	8751	goto defchar;
	8752	else {
	8753	char * const parse_start = RExC_parse - 1; /* MJD */
	8754	while (isDIGIT(*RExC_parse))
	8755	RExC_parse++;
	8756	if (parse_start == RExC_parse - 1)
	8757	vFAIL("Unterminated \\g... pattern");
	8758	if (hasbrace) {
	8759	if (*RExC_parse != '}')
	8760	vFAIL("Unterminated \\g{...} pattern");
	8761	RExC_parse++;
	8762	}
	8763	if (!SIZE_ONLY) {
	8764	if (num > (I32)RExC_rx->nparens)
	8765	vFAIL("Reference to nonexistent group");
	8766	}
	8767	RExC_sawback = 1;
	8768	ret = reganode(pRExC_state,
	8769	((! FOLD)
	8770	? REF
	8771	: (MORE_ASCII_RESTRICTED)
	8772	? REFFA
	8773	: (AT_LEAST_UNI_SEMANTICS)
	8774	? REFFU
	8775	: (LOC)
	8776	? REFFL
	8777	: REFF),
	8778	num);
	8779	*flagp \|= HASWIDTH;
	8780
	8781	/* override incorrect value set in reganode MJD */
	8782	Set_Node_Offset(ret, parse_start+1);
	8783	Set_Node_Cur_Length(ret); /* MJD */
	8784	RExC_parse--;
	8785	nextchar(pRExC_state);
	8786	}
	8787	}
	8788	break;
	8789	case '\0':
	8790	if (RExC_parse >= RExC_end)
	8791	FAIL("Trailing \\");
	8792	/* FALL THROUGH */
	8793	default:
	8794	/* Do not generate "unrecognized" warnings here, we fall
	8795	back into the quick-grab loop below */
	8796	parse_start--;
	8797	goto defchar;
	8798	}
	8799	break;
	8800
	8801	case '#':
	8802	if (RExC_flags & RXf_PMf_EXTENDED) {
	8803	if ( reg_skipcomment( pRExC_state ) )
	8804	goto tryagain;
	8805	}
	8806	/* FALL THROUGH */
	8807
	8808	default:
	8809
	8810	parse_start = RExC_parse - 1;
	8811
	8812	RExC_parse++;
	8813
	8814	defchar: {
	8815	typedef enum {
	8816	generic_char = 0,
	8817	char_s,
	8818	upsilon_1,
	8819	upsilon_2,
	8820	iota_1,
	8821	iota_2,
	8822	} char_state;
	8823	char_state latest_char_state = generic_char;
	8824	register STRLEN len;
	8825	register UV ender;
	8826	register char *p;
	8827	char *s;
	8828	STRLEN foldlen;
	8829	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	8830	regnode * orig_emit;
	8831
	8832	ender = 0;
	8833	orig_emit = RExC_emit; /* Save the original output node position in
	8834	case we need to output a different node
	8835	type */
	8836	ret = reg_node(pRExC_state,
	8837	(U8) ((! FOLD) ? EXACT
	8838	: (LOC)
	8839	? EXACTFL
	8840	: (MORE_ASCII_RESTRICTED)
	8841	? EXACTFA
	8842	: (AT_LEAST_UNI_SEMANTICS)
	8843	? EXACTFU
	8844	: EXACTF)
	8845	);
	8846	s = STRING(ret);
	8847	for (len = 0, p = RExC_parse - 1;
	8848	len < 127 && p < RExC_end;
	8849	len++)
	8850	{
	8851	char * const oldp = p;
	8852
	8853	if (RExC_flags & RXf_PMf_EXTENDED)
	8854	p = regwhite( pRExC_state, p );
	8855	switch ((U8)*p) {
	8856	case '^':
	8857	case '$':
	8858	case '.':
	8859	case '[':
	8860	case '(':
	8861	case ')':
	8862	case '\|':
	8863	goto loopdone;
	8864	case '\\':
	8865	/* Literal Escapes Switch
	8866
	8867	This switch is meant to handle escape sequences that
	8868	resolve to a literal character.
	8869
	8870	Every escape sequence that represents something
	8871	else, like an assertion or a char class, is handled
	8872	in the switch marked 'Special Escapes' above in this
	8873	routine, but also has an entry here as anything that
	8874	isn't explicitly mentioned here will be treated as
	8875	an unescaped equivalent literal.
	8876	*/
	8877
	8878	switch ((U8)*++p) {
	8879	/* These are all the special escapes. */
	8880	case 'A': /* Start assertion */
	8881	case 'b': case 'B': /* Word-boundary assertion*/
	8882	case 'C': /* Single char !DANGEROUS! */
	8883	case 'd': case 'D': /* digit class */
	8884	case 'g': case 'G': /* generic-backref, pos assertion */
	8885	case 'h': case 'H': /* HORIZWS */
	8886	case 'k': case 'K': /* named backref, keep marker */
	8887	case 'N': /* named char sequence */
	8888	case 'p': case 'P': /* Unicode property */
	8889	case 'R': /* LNBREAK */
	8890	case 's': case 'S': /* space class */
	8891	case 'v': case 'V': /* VERTWS */
	8892	case 'w': case 'W': /* word class */
	8893	case 'X': /* eXtended Unicode "combining character sequence" */
	8894	case 'z': case 'Z': /* End of line/string assertion */
	8895	--p;
	8896	goto loopdone;
	8897
	8898	/* Anything after here is an escape that resolves to a
	8899	literal. (Except digits, which may or may not)
	8900	*/
	8901	case 'n':
	8902	ender = '\n';
	8903	p++;
	8904	break;
	8905	case 'r':
	8906	ender = '\r';
	8907	p++;
	8908	break;
	8909	case 't':
	8910	ender = '\t';
	8911	p++;
	8912	break;
	8913	case 'f':
	8914	ender = '\f';
	8915	p++;
	8916	break;
	8917	case 'e':
	8918	ender = ASCII_TO_NATIVE('\033');
	8919	p++;
	8920	break;
	8921	case 'a':
	8922	ender = ASCII_TO_NATIVE('\007');
	8923	p++;
	8924	break;
	8925	case 'o':
	8926	{
	8927	STRLEN brace_len = len;
	8928	UV result;
	8929	const char* error_msg;
	8930
	8931	bool valid = grok_bslash_o(p,
	8932	&result,
	8933	&brace_len,
	8934	&error_msg,
	8935	1);
	8936	p += brace_len;
	8937	if (! valid) {
	8938	RExC_parse = p; /* going to die anyway; point
	8939	to exact spot of failure */
	8940	vFAIL(error_msg);
	8941	}
	8942	else
	8943	{
	8944	ender = result;
	8945	}
	8946	if (PL_encoding && ender < 0x100) {
	8947	goto recode_encoding;
	8948	}
	8949	if (ender > 0xff) {
	8950	REQUIRE_UTF8;
	8951	}
	8952	break;
	8953	}
	8954	case 'x':
	8955	if (*++p == '{') {
	8956	char* const e = strchr(p, '}');
	8957
	8958	if (!e) {
	8959	RExC_parse = p + 1;
	8960	vFAIL("Missing right brace on \\x{}");
	8961	}
	8962	else {
	8963	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8964	\| PERL_SCAN_DISALLOW_PREFIX;
	8965	STRLEN numlen = e - p - 1;
	8966	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	8967	if (ender > 0xff)
	8968	REQUIRE_UTF8;
	8969	p = e + 1;
	8970	}
	8971	}
	8972	else {
	8973	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	8974	STRLEN numlen = 2;
	8975	ender = grok_hex(p, &numlen, &flags, NULL);
	8976	p += numlen;
	8977	}
	8978	if (PL_encoding && ender < 0x100)
	8979	goto recode_encoding;
	8980	break;
	8981	case 'c':
	8982	p++;
	8983	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	8984	break;
	8985	case '0': case '1': case '2': case '3':case '4':
	8986	case '5': case '6': case '7': case '8':case '9':
	8987	if (*p == '0' \|\|
	8988	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	8989	{
	8990	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	8991	STRLEN numlen = 3;
	8992	ender = grok_oct(p, &numlen, &flags, NULL);
	8993	if (ender > 0xff) {
	8994	REQUIRE_UTF8;
	8995	}
	8996	p += numlen;
	8997	}
	8998	else {
	8999	--p;
	9000	goto loopdone;
	9001	}
	9002	if (PL_encoding && ender < 0x100)
	9003	goto recode_encoding;
	9004	break;
	9005	recode_encoding:
	9006	if (! RExC_override_recoding) {
	9007	SV* enc = PL_encoding;
	9008	ender = reg_recode((const char)(U8)ender, &enc);
	9009	if (!enc && SIZE_ONLY)
	9010	ckWARNreg(p, "Invalid escape in the specified encoding");
	9011	REQUIRE_UTF8;
	9012	}
	9013	break;
	9014	case '\0':
	9015	if (p >= RExC_end)
	9016	FAIL("Trailing \\");
	9017	/* FALL THROUGH */
	9018	default:
	9019	if (!SIZE_ONLY&& isALPHA(*p)) {
	9020	/* Include any { following the alpha to emphasize
	9021	* that it could be part of an escape at some point
	9022	* in the future */
	9023	int len = (*(p + 1) == '{') ? 2 : 1;
	9024	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	9025	}
	9026	goto normal_default;
	9027	}
	9028	break;
	9029	default:
	9030	normal_default:
	9031	if (UTF8_IS_START(*p) && UTF) {
	9032	STRLEN numlen;
	9033	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	9034	&numlen, UTF8_ALLOW_DEFAULT);
	9035	p += numlen;
	9036	}
	9037	else
	9038	ender = (U8) *p++;
	9039	break;
	9040	} /* End of switch on the literal */
	9041
	9042	/* Certain characters are problematic because their folded
	9043	* length is so different from their original length that it
	9044	* isn't handleable by the optimizer. They are therefore not
	9045	* placed in an EXACTish node; and are here handled specially.
	9046	* (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
	9047	* putting it in a special node keeps regexec from having to
	9048	* deal with a non-utf8 multi-char fold */
	9049	if (FOLD
	9050	&& (ender > 255 \|\| (! MORE_ASCII_RESTRICTED && ! LOC)))
	9051	{
	9052	/* We look for either side of the fold. For example \xDF
	9053	* folds to 'ss'. We look for both the single character
	9054	* \xDF and the sequence 'ss'. When we find something that
	9055	* could be one of those, we stop and flush whatever we
	9056	* have output so far into the EXACTish node that was being
	9057	* built. Then restore the input pointer to what it was.
	9058	* regatom will return that EXACT node, and will be called
	9059	* again, positioned so the first character is the one in
	9060	* question, which we return in a different node type.
	9061	* The multi-char folds are a sequence, so the occurrence
	9062	* of the first character in that sequence doesn't
	9063	* necessarily mean that what follows is the rest of the
	9064	* sequence. We keep track of that with a state machine,
	9065	* with the state being set to the latest character
	9066	* processed before the current one. Most characters will
	9067	* set the state to 0, but if one occurs that is part of a
	9068	* potential tricky fold sequence, the state is set to that
	9069	* character, and the next loop iteration sees if the state
	9070	* should progress towards the final folded-from character,
	9071	* or if it was a false alarm. If it turns out to be a
	9072	* false alarm, the character(s) will be output in a new
	9073	* EXACTish node, and join_exact() will later combine them.
	9074	* In the case of the 'ss' sequence, which is more common
	9075	* and more easily checked, some look-ahead is done to
	9076	* save time by ruling-out some false alarms */
	9077	switch (ender) {
	9078	default:
	9079	latest_char_state = generic_char;
	9080	break;
	9081	case 's':
	9082	case 'S':
	9083	case 0x17F: /* LATIN SMALL LETTER LONG S */
	9084	if (AT_LEAST_UNI_SEMANTICS) {
	9085	if (latest_char_state == char_s) { /* 'ss' */
	9086	ender = LATIN_SMALL_LETTER_SHARP_S;
	9087	goto do_tricky;
	9088	}
	9089	else if (p < RExC_end) {
	9090
	9091	/* Look-ahead at the next character. If it
	9092	* is also an s, we handle as a sharp s
	9093	* tricky regnode. */
	9094	if (p == 's' \|\| p == 'S') {
	9095
	9096	/* But first flush anything in the
	9097	* EXACTish buffer */
	9098	if (len != 0) {
	9099	p = oldp;
	9100	goto loopdone;
	9101	}
	9102	p++; /* Account for swallowing this
	9103	's' up */
	9104	ender = LATIN_SMALL_LETTER_SHARP_S;
	9105	goto do_tricky;
	9106	}
	9107	/* Here, the next character is not a
	9108	* literal 's', but still could
	9109	* evaluate to one if part of a \o{},
	9110	* \x or \OCTAL-DIGIT. The minimum
	9111	* length required for that is 4, eg
	9112	* \x53 or \123 */
	9113	else if (*p == '\\'
	9114	&& p < RExC_end - 4
	9115	&& (isDIGIT(*(p + 1))
	9116	\|\| *(p + 1) == 'x'
	9117	\|\| *(p + 1) == 'o' ))
	9118	{
	9119
	9120	/* Here, it could be an 's', too much
	9121	* bother to figure it out here. Flush
	9122	* the buffer if any; when come back
	9123	* here, set the state so know that the
	9124	* previous char was an 's' */
	9125	if (len != 0) {
	9126	latest_char_state = generic_char;
	9127	p = oldp;
	9128	goto loopdone;
	9129	}
	9130	latest_char_state = char_s;
	9131	break;
	9132	}
	9133	}
	9134	}
	9135
	9136	/* Here, can't be an 'ss' sequence, or at least not
	9137	* one that could fold to/from the sharp ss */
	9138	latest_char_state = generic_char;
	9139	break;
	9140	case 0x03C5: /* First char in upsilon series */
	9141	case 0x03A5: /* Also capital UPSILON, which folds to
	9142	03C5, and hence exhibits the same
	9143	problem */
	9144	if (p < RExC_end - 4) { /* Need >= 4 bytes left */
	9145	latest_char_state = upsilon_1;
	9146	if (len != 0) {
	9147	p = oldp;
	9148	goto loopdone;
	9149	}
	9150	}
	9151	else {
	9152	latest_char_state = generic_char;
	9153	}
	9154	break;
	9155	case 0x03B9: /* First char in iota series */
	9156	case 0x0399: /* Also capital IOTA */
	9157	case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
	9158	case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
	9159	to 3B9 */
	9160	if (p < RExC_end - 4) {
	9161	latest_char_state = iota_1;
	9162	if (len != 0) {
	9163	p = oldp;
	9164	goto loopdone;
	9165	}
	9166	}
	9167	else {
	9168	latest_char_state = generic_char;
	9169	}
	9170	break;
	9171	case 0x0308:
	9172	if (latest_char_state == upsilon_1) {
	9173	latest_char_state = upsilon_2;
	9174	}
	9175	else if (latest_char_state == iota_1) {
	9176	latest_char_state = iota_2;
	9177	}
	9178	else {
	9179	latest_char_state = generic_char;
	9180	}
	9181	break;
	9182	case 0x301:
	9183	if (latest_char_state == upsilon_2) {
	9184	ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
	9185	goto do_tricky;
	9186	}
	9187	else if (latest_char_state == iota_2) {
	9188	ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
	9189	goto do_tricky;
	9190	}
	9191	latest_char_state = generic_char;
	9192	break;
	9193
	9194	/* These are the tricky fold characters. Flush any
	9195	* buffer first. (When adding to this list, also should
	9196	* add them to fold_grind.t to make sure get tested) */
	9197	case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
	9198	case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
	9199	case LATIN_SMALL_LETTER_SHARP_S:
	9200	case LATIN_CAPITAL_LETTER_SHARP_S:
	9201	case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
	9202	case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
	9203	if (len != 0) {
	9204	p = oldp;
	9205	goto loopdone;
	9206	}
	9207	/* FALL THROUGH */
	9208	do_tricky: {
	9209	char* const oldregxend = RExC_end;
	9210	U8 tmpbuf[UTF8_MAXBYTES+1];
	9211
	9212	/* Here, we know we need to generate a special
	9213	* regnode, and 'ender' contains the tricky
	9214	* character. What's done is to pretend it's in a
	9215	* [bracketed] class, and let the code that deals
	9216	* with those handle it, as that code has all the
	9217	* intelligence necessary. First save the current
	9218	* parse state, get rid of the already allocated
	9219	* but empty EXACT node that the ANYOFV node will
	9220	* replace, and point the parse to a buffer which
	9221	* we fill with the character we want the regclass
	9222	* code to think is being parsed */
	9223	RExC_emit = orig_emit;
	9224	RExC_parse = (char *) tmpbuf;
	9225	if (UTF) {
	9226	U8 *d = uvchr_to_utf8(tmpbuf, ender);
	9227	*d = '\0';
	9228	RExC_end = (char *) d;
	9229	}
	9230	else { /* ender above 255 already excluded */
	9231	tmpbuf[0] = (U8) ender;
	9232	tmpbuf[1] = '\0';
	9233	RExC_end = RExC_parse + 1;
	9234	}
	9235
	9236	ret = regclass(pRExC_state,depth+1);
	9237
	9238	/* Here, have parsed the buffer. Reset the parse to
	9239	* the actual input, and return */
	9240	RExC_end = oldregxend;
	9241	RExC_parse = p - 1;
	9242
	9243	Set_Node_Offset(ret, RExC_parse);
	9244	Set_Node_Cur_Length(ret);
	9245	nextchar(pRExC_state);
	9246	*flagp \|= HASWIDTH\|SIMPLE;
	9247	return ret;
	9248	}
	9249	}
	9250	}
	9251
	9252	if ( RExC_flags & RXf_PMf_EXTENDED)
	9253	p = regwhite( pRExC_state, p );
	9254	if (UTF && FOLD) {
	9255	/* Prime the casefolded buffer. Locale rules, which apply
	9256	* only to code points < 256, aren't known until execution,
	9257	* so for them, just output the original character using
	9258	* utf8 */
	9259	if (LOC && ender < 256) {
	9260	if (UNI_IS_INVARIANT(ender)) {
	9261	*tmpbuf = (U8) ender;
	9262	foldlen = 1;
	9263	} else {
	9264	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	9265	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	9266	foldlen = 2;
	9267	}
	9268	}
	9269	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	9270	*/
	9271	ender = toLOWER(ender);
	9272	*tmpbuf = (U8) ender;
	9273	foldlen = 1;
	9274	}
	9275	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	9276
	9277	/* Locale and /aa require more selectivity about the
	9278	* fold, so are handled below. Otherwise, here, just
	9279	* use the fold */
	9280	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	9281	}
	9282	else {
	9283	/* Under locale rules or /aa we are not to mix,
	9284	* respectively, ords < 256 or ASCII with non-. So
	9285	* reject folds that mix them, using only the
	9286	* non-folded code point. So do the fold to a
	9287	* temporary, and inspect each character in it. */
	9288	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	9289	U8* s = trialbuf;
	9290	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	9291	U8* e = s + foldlen;
	9292	bool fold_ok = TRUE;
	9293
	9294	while (s < e) {
	9295	if (isASCII(*s)
	9296	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	9297	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	9298	{
	9299	fold_ok = FALSE;
	9300	break;
	9301	}
	9302	s += UTF8SKIP(s);
	9303	}
	9304	if (fold_ok) {
	9305	Copy(trialbuf, tmpbuf, foldlen, U8);
	9306	ender = tmpender;
	9307	}
	9308	else {
	9309	uvuni_to_utf8(tmpbuf, ender);
	9310	foldlen = UNISKIP(ender);
	9311	}
	9312	}
	9313	}
	9314	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	9315	if (len)
	9316	p = oldp;
	9317	else if (UTF) {
	9318	if (FOLD) {
	9319	/* Emit all the Unicode characters. */
	9320	STRLEN numlen;
	9321	for (foldbuf = tmpbuf;
	9322	foldlen;
	9323	foldlen -= numlen) {
	9324	ender = utf8_to_uvchr(foldbuf, &numlen);
	9325	if (numlen > 0) {
	9326	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9327	s += unilen;
	9328	len += unilen;
	9329	/* In EBCDIC the numlen
	9330	* and unilen can differ. */
	9331	foldbuf += numlen;
	9332	if (numlen >= foldlen)
	9333	break;
	9334	}
	9335	else
	9336	break; /* "Can't happen." */
	9337	}
	9338	}
	9339	else {
	9340	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9341	if (unilen > 0) {
	9342	s += unilen;
	9343	len += unilen;
	9344	}
	9345	}
	9346	}
	9347	else {
	9348	len++;
	9349	REGC((char)ender, s++);
	9350	}
	9351	break;
	9352	}
	9353	if (UTF) {
	9354	if (FOLD) {
	9355	/* Emit all the Unicode characters. */
	9356	STRLEN numlen;
	9357	for (foldbuf = tmpbuf;
	9358	foldlen;
	9359	foldlen -= numlen) {
	9360	ender = utf8_to_uvchr(foldbuf, &numlen);
	9361	if (numlen > 0) {
	9362	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9363	len += unilen;
	9364	s += unilen;
	9365	/* In EBCDIC the numlen
	9366	* and unilen can differ. */
	9367	foldbuf += numlen;
	9368	if (numlen >= foldlen)
	9369	break;
	9370	}
	9371	else
	9372	break;
	9373	}
	9374	}
	9375	else {
	9376	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9377	if (unilen > 0) {
	9378	s += unilen;
	9379	len += unilen;
	9380	}
	9381	}
	9382	len--;
	9383	}
	9384	else {
	9385	REGC((char)ender, s++);
	9386	}
	9387	}
	9388	loopdone: /* Jumped to when encounters something that shouldn't be in
	9389	the node */
	9390	RExC_parse = p - 1;
	9391	Set_Node_Cur_Length(ret); /* MJD */
	9392	nextchar(pRExC_state);
	9393	{
	9394	/* len is STRLEN which is unsigned, need to copy to signed */
	9395	IV iv = len;
	9396	if (iv < 0)
	9397	vFAIL("Internal disaster");
	9398	}
	9399	if (len > 0)
	9400	*flagp \|= HASWIDTH;
	9401	if (len == 1 && UNI_IS_INVARIANT(ender))
	9402	*flagp \|= SIMPLE;
	9403
	9404	if (SIZE_ONLY)
	9405	RExC_size += STR_SZ(len);
	9406	else {
	9407	STR_LEN(ret) = len;
	9408	RExC_emit += STR_SZ(len);
	9409	}
	9410	}
	9411	break;
	9412	}
	9413
	9414	return(ret);
	9415
	9416	/* Jumped to when an unrecognized character set is encountered */
	9417	bad_charset:
	9418	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	9419	return(NULL);
	9420	}
	9421
	9422	STATIC char *
	9423	S_regwhite( RExC_state_t pRExC_state, char p )
	9424	{
	9425	const char *e = RExC_end;
	9426
	9427	PERL_ARGS_ASSERT_REGWHITE;
	9428
	9429	while (p < e) {
	9430	if (isSPACE(*p))
	9431	++p;
	9432	else if (*p == '#') {
	9433	bool ended = 0;
	9434	do {
	9435	if (*p++ == '\n') {
	9436	ended = 1;
	9437	break;
	9438	}
	9439	} while (p < e);
	9440	if (!ended)
	9441	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	9442	}
	9443	else
	9444	break;
	9445	}
	9446	return p;
	9447	}
	9448
	9449	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	9450	Character classes ([:foo:]) can also be negated ([:^foo:]).
	9451	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	9452	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	9453	but trigger failures because they are currently unimplemented. */
	9454
	9455	#define POSIXCC_DONE(c) ((c) == ':')
	9456	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	9457	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	9458
	9459	STATIC I32
	9460	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	9461	{
	9462	dVAR;
	9463	I32 namedclass = OOB_NAMEDCLASS;
	9464
	9465	PERL_ARGS_ASSERT_REGPPOSIXCC;
	9466
	9467	if (value == '[' && RExC_parse + 1 < RExC_end &&
	9468	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	9469	POSIXCC(UCHARAT(RExC_parse))) {
	9470	const char c = UCHARAT(RExC_parse);
	9471	char* const s = RExC_parse++;
	9472
	9473	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	9474	RExC_parse++;
	9475	if (RExC_parse == RExC_end)
	9476	/* Grandfather lone [:, [=, [. */
	9477	RExC_parse = s;
	9478	else {
	9479	const char* const t = RExC_parse++; /* skip over the c */
	9480	assert(*t == c);
	9481
	9482	if (UCHARAT(RExC_parse) == ']') {
	9483	const char *posixcc = s + 1;
	9484	RExC_parse++; /* skip over the ending ] */
	9485
	9486	if (*s == ':') {
	9487	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	9488	const I32 skip = t - posixcc;
	9489
	9490	/* Initially switch on the length of the name. */
	9491	switch (skip) {
	9492	case 4:
	9493	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	9494	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	9495	break;
	9496	case 5:
	9497	/* Names all of length 5. */
	9498	/* alnum alpha ascii blank cntrl digit graph lower
	9499	print punct space upper */
	9500	/* Offset 4 gives the best switch position. */
	9501	switch (posixcc[4]) {
	9502	case 'a':
	9503	if (memEQ(posixcc, "alph", 4)) /* alpha */
	9504	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	9505	break;
	9506	case 'e':
	9507	if (memEQ(posixcc, "spac", 4)) /* space */
	9508	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	9509	break;
	9510	case 'h':
	9511	if (memEQ(posixcc, "grap", 4)) /* graph */
	9512	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	9513	break;
	9514	case 'i':
	9515	if (memEQ(posixcc, "asci", 4)) /* ascii */
	9516	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	9517	break;
	9518	case 'k':
	9519	if (memEQ(posixcc, "blan", 4)) /* blank */
	9520	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	9521	break;
	9522	case 'l':
	9523	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	9524	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	9525	break;
	9526	case 'm':
	9527	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	9528	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	9529	break;
	9530	case 'r':
	9531	if (memEQ(posixcc, "lowe", 4)) /* lower */
	9532	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	9533	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	9534	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	9535	break;
	9536	case 't':
	9537	if (memEQ(posixcc, "digi", 4)) /* digit */
	9538	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	9539	else if (memEQ(posixcc, "prin", 4)) /* print */
	9540	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	9541	else if (memEQ(posixcc, "punc", 4)) /* punct */
	9542	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	9543	break;
	9544	}
	9545	break;
	9546	case 6:
	9547	if (memEQ(posixcc, "xdigit", 6))
	9548	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	9549	break;
	9550	}
	9551
	9552	if (namedclass == OOB_NAMEDCLASS)
	9553	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	9554	t - s - 1, s + 1);
	9555	assert (posixcc[skip] == ':');
	9556	assert (posixcc[skip+1] == ']');
	9557	} else if (!SIZE_ONLY) {
	9558	/* [[=foo=]] and [[.foo.]] are still future. */
	9559
	9560	/* adjust RExC_parse so the warning shows after
	9561	the class closes */
	9562	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	9563	RExC_parse++;
	9564	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9565	}
	9566	} else {
	9567	/* Maternal grandfather:
	9568	* "[:" ending in ":" but not in ":]" */
	9569	RExC_parse = s;
	9570	}
	9571	}
	9572	}
	9573
	9574	return namedclass;
	9575	}
	9576
	9577	STATIC void
	9578	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	9579	{
	9580	dVAR;
	9581
	9582	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	9583
	9584	if (POSIXCC(UCHARAT(RExC_parse))) {
	9585	const char *s = RExC_parse;
	9586	const char c = *s++;
	9587
	9588	while (isALNUM(*s))
	9589	s++;
	9590	if (s && c == s && s[1] == ']') {
	9591	ckWARN3reg(s+2,
	9592	"POSIX syntax [%c %c] belongs inside character classes",
	9593	c, c);
	9594
	9595	/* [[=foo=]] and [[.foo.]] are still future. */
	9596	if (POSIXCC_NOTYET(c)) {
	9597	/* adjust RExC_parse so the error shows after
	9598	the class closes */
	9599	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	9600	NOOP;
	9601	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9602	}
	9603	}
	9604	}
	9605	}
	9606
	9607	/* No locale test, and always Unicode semantics, no ignore-case differences */
	9608	#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
	9609	ANYOF_##NAME: \
	9610	for (value = 0; value < 256; value++) \
	9611	if (TEST) \
	9612	stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9613	yesno = '+'; \
	9614	what = WORD; \
	9615	break; \
	9616	case ANYOF_N##NAME: \
	9617	for (value = 0; value < 256; value++) \
	9618	if (!TEST) \
	9619	stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9620	yesno = '!'; \
	9621	what = WORD; \
	9622	break
	9623
	/* Like the macro above, but what matches differs depending on whether or not
	 * we are in uni-8-bit semantics, so two tests are passed in: TEST_8 is applied
	 * to all 256 bitmap code points under UNI_SEMANTICS, and TEST_7 is applied
	 * only to the first 128 otherwise.  There aren't any cases where the label is
	 * different from the name, so no need for that parameter.
	 *
	 * The expansion starts with a bare 'ANYOF_##NAME:' label, so it is meant to
	 * be written directly after the 'case' keyword.  It supplies both the
	 * ANYOF_<NAME> case and the complementary ANYOF_N<NAME> case, and ends with
	 * an unterminated 'break' so that the user supplies the final semicolon.  It
	 * relies on these names being in scope where it is expanded: pRExC_state,
	 * ret, value, stored, yesno, what, l1_fold_invlist and unicode_alternate.
	 *
	 * Under LOC nothing is put in the bitmap; only the class bit is set with
	 * ANYOF_CLASS_SET.
	 *
	 * Sets 'yesno' to '+' (or '!' for the complement) and 'what' to WORD, which
	 * is the property name for non-bitmap code points; but uses FOLD_WORD
	 * instead if /i has been selected, to allow a different property name */
	#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \
	ANYOF_##NAME: \
	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
	    else if (UNI_SEMANTICS) { \
	        for (value = 0; value < 256; value++) { \
	            if (TEST_8(value)) stored += \
	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    else { \
	        for (value = 0; value < 128; value++) { \
	            if (TEST_7(UNI_TO_NATIVE(value))) stored += \
	                set_regclass_bit(pRExC_state, ret, \
	                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    yesno = '+'; \
	    if (FOLD) { \
	        what = FOLD_WORD; \
	    } \
	    else { \
	        what = WORD; \
	    } \
	    break; \
	case ANYOF_N##NAME: \
	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
	    else if (UNI_SEMANTICS) { \
	        for (value = 0; value < 256; value++) { \
	            if (! TEST_8(value)) stored += \
	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    else { \
	        for (value = 0; value < 128; value++) { \
	            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
	                pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	        } \
	        if (AT_LEAST_ASCII_RESTRICTED) { \
	            /* Under the ASCII-restricted rules every code point above \
	             * ASCII matches the complemented class, so set them all */ \
	            for (value = 128; value < 256; value++) { \
	                stored += set_regclass_bit( \
	                    pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	            } \
	            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
	        } \
	        else { \
	            /* For a non-utf8 target string with DEPENDS semantics, all above \
	             * ASCII Latin1 code points match the complement of any of the \
	             * classes.  But in utf8, they have their Unicode semantics, so \
	             * can't just set them in the bitmap, or else regexec.c will think \
	             * they matched when they shouldn't. */ \
	            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
	        } \
	    } \
	    yesno = '!'; \
	    if (FOLD) { \
	        what = FOLD_WORD; \
	    } \
	    else { \
	        what = WORD; \
	    } \
	    break
	9692
	9693	STATIC U8
	9694	S_set_regclass_bit_fold(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	9695	{
	9696
	9697	/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	9698	* Locale folding is done at run-time, so this function should not be
	9699	* called for nodes that are for locales.
	9700	*
	9701	* This function sets the bit corresponding to the fold of the input
	9702	* 'value', if not already set. The fold of 'f' is 'F', and the fold of
	9703	* 'F' is 'f'.
	9704	*
	9705	* It also knows about the characters that are in the bitmap that have
	9706	* folds that are matchable only outside it, and sets the appropriate lists
	9707	* and flags.
	9708	*
	9709	* It returns the number of bits that actually changed from 0 to 1 */
	9710
	9711	U8 stored = 0;
	9712	U8 fold;
	9713
	9714	PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
	9715
	9716	fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	9717	: PL_fold[value];
	9718
	9719	/* It assumes the bit for 'value' has already been set */
	9720	if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	9721	ANYOF_BITMAP_SET(node, fold);
	9722	stored++;
	9723	}
	9724	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) \|\| ! MORE_ASCII_RESTRICTED)) {
	9725	/* Certain Latin1 characters have matches outside the bitmap. To get
	9726	* here, 'value' is one of those characters. None of these matches is
	9727	* valid for ASCII characters under /aa, which have been excluded by
	9728	* the 'if' above. The matches fall into three categories:
	9729	* 1) They are singly folded-to or -from an above 255 character, as
	9730	* LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
	9731	* WITH DIAERESIS;
	9732	* 2) They are part of a multi-char fold with another character in the
	9733	* bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
	9734	* 3) They are part of a multi-char fold with a character not in the
	9735	* bitmap, such as various ligatures.
	9736	* We aren't dealing fully with multi-char folds, except we do deal
	9737	* with the pattern containing a character that has a multi-char fold
	9738	* (not so much the inverse).
	9739	* For types 1) and 3), the matches only happen when the target string
	9740	* is utf8; that's not true for 2), and we set a flag for it.
	9741	*
	9742	* The code below adds to the passed in inversion list the single fold
	9743	* closures for 'value'. The values are hard-coded here so that an
	9744	* innocent-looking character class, like /[ks]/i won't have to go out
	9745	* to disk to find the possible matches. XXX It would be better to
	9746	* generate these via regen, in case a new version of the Unicode
	9747	* standard adds new mappings, though that is not really likely. */
	9748	switch (value) {
	9749	case 'k':
	9750	case 'K':
	9751	/* KELVIN SIGN */
	9752	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212A);
	9753	break;
	9754	case 's':
	9755	case 'S':
	9756	/* LATIN SMALL LETTER LONG S */
	9757	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x017F);
	9758	break;
	9759	case MICRO_SIGN:
	9760	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9761	GREEK_SMALL_LETTER_MU);
	9762	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9763	GREEK_CAPITAL_LETTER_MU);
	9764	break;
	9765	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	9766	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	9767	/* ANGSTROM SIGN */
	9768	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212B);
	9769	if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
	9770	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9771	PL_fold_latin1[value]);
	9772	}
	9773	break;
	9774	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	9775	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9776	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	9777	break;
	9778	case LATIN_SMALL_LETTER_SHARP_S:
	9779	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9780	LATIN_CAPITAL_LETTER_SHARP_S);
	9781
	9782	/* Under /a, /d, and /u, this can match the two chars "ss" */
	9783	if (! MORE_ASCII_RESTRICTED) {
	9784	add_alternate(alternate_ptr, (U8 *) "ss", 2);
	9785
	9786	/* And under /u or /a, it can match even if the target is
	9787	* not utf8 */
	9788	if (AT_LEAST_UNI_SEMANTICS) {
	9789	ANYOF_FLAGS(node) \|= ANYOF_NONBITMAP_NON_UTF8;
	9790	}
	9791	}
	9792	break;
	9793	case 'F': case 'f':
	9794	case 'I': case 'i':
	9795	case 'L': case 'l':
	9796	case 'T': case 't':
	9797	case 'A': case 'a':
	9798	case 'H': case 'h':
	9799	case 'J': case 'j':
	9800	case 'N': case 'n':
	9801	case 'W': case 'w':
	9802	case 'Y': case 'y':
	9803	/* These all are targets of multi-character folds from code
	9804	* points that require UTF8 to express, so they can't match
	9805	* unless the target string is in UTF-8, so no action here is
	9806	* necessary, as regexec.c properly handles the general case
	9807	* for UTF-8 matching */
	9808	break;
	9809	default:
	9810	/* Use deprecated warning to increase the chances of this
	9811	* being output */
	9812	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
	9813	break;
	9814	}
	9815	}
	9816	else if (DEPENDS_SEMANTICS
	9817	&& ! isASCII(value)
	9818	&& PL_fold_latin1[value] != value)
	9819	{
	9820	/* Under DEPENDS rules, non-ASCII Latin1 characters match their
	9821	* folds only when the target string is in UTF-8. We add the fold
	9822	* here to the list of things to match outside the bitmap, which
	9823	* won't be looked at unless it is UTF8 (or else if something else
	9824	* says to look even if not utf8, but those things better not happen
	9825	* under DEPENDS semantics. */
	9826	invlist_ptr = add_cp_to_invlist(invlist_ptr, PL_fold_latin1[value]);
	9827	}
	9828
	9829	return stored;
	9830	}
	9831
	9832
	9833	PERL_STATIC_INLINE U8
	9834	S_set_regclass_bit(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, SV invlist_ptr, AV alternate_ptr)
	9835	{
	9836	/* This inline function sets a bit in the bitmap if not already set, and if
	9837	* appropriate, its fold, returning the number of bits that actually
	9838	* changed from 0 to 1 */
	9839
	9840	U8 stored;
	9841
	9842	PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
	9843
	9844	if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	9845	return 0;
	9846	}
	9847
	9848	ANYOF_BITMAP_SET(node, value);
	9849	stored = 1;
	9850
	9851	if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	9852	stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
	9853	}
	9854
	9855	return stored;
	9856	}
	9857
	9858	STATIC void
	9859	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	9860	{
	9861	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	9862	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	9863	* the multi-character folds of characters in the node */
	9864	SV *sv;
	9865
	9866	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	9867
	9868	if (! *alternate_ptr) {
	9869	*alternate_ptr = newAV();
	9870	}
	9871	sv = newSVpvn_utf8((char*)string, len, TRUE);
	9872	av_push(*alternate_ptr, sv);
	9873	return;
	9874	}
	9875
	9876	/*
	9877	parse a class specification and produce either an ANYOF node that
	9878	matches the pattern or perhaps will be optimized into an EXACTish node
	9879	instead. The node contains a bit map for the first 256 characters, with the
	9880	corresponding bit set if that character is in the list. For characters
	9881	above 255, a range list is used */
	9882
	9883	STATIC regnode *
	9884	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	9885	{
	9886	dVAR;
	9887	register UV nextvalue;
	9888	register IV prevvalue = OOB_UNICODE;
	9889	register IV range = 0;
	9890	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	9891	register regnode *ret;
	9892	STRLEN numlen;
	9893	IV namedclass;
	9894	char *rangebegin = NULL;
	9895	bool need_class = 0;
	9896	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	9897	SV *listsv = NULL;
	9898	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	9899	than just initialized. */
	9900	UV n;
	9901
	9902	/* code points this node matches that can't be stored in the bitmap */
	9903	SV* nonbitmap = NULL;
	9904
	9905	/* The items that are to match that aren't stored in the bitmap, but are a
	9906	* result of things that are stored there. This is the fold closure of
	9907	* such a character, either because it has DEPENDS semantics and shouldn't
	9908	* be matched unless the target string is utf8, or is a code point that is
	9909	* too large for the bit map, as for example, the fold of the MICRO SIGN is
	9910	* above 255. This all is solely for performance reasons. By having this
	9911	* code know the outside-the-bitmap folds that the bitmapped characters are
	9912	* involved with, we don't have to go out to disk to find the list of
	9913	* matches, unless the character class includes code points that aren't
	9914	* storable in the bit map. That means that a character class with an 's'
	9915	* in it, for example, doesn't need to go out to disk to find everything
	9916	* that matches. A 2nd list is used so that the 'nonbitmap' list is kept
	9917	* empty unless there is something whose fold we don't know about, and will
	9918	* have to go out to the disk to find. */
	9919	SV* l1_fold_invlist = NULL;
	9920
	9921	/* List of multi-character folds that are matched by this node */
	9922	AV* unicode_alternate = NULL;
	9923	#ifdef EBCDIC
	9924	UV literal_endpoint = 0;
	9925	#endif
	9926	UV stored = 0; /* how many chars stored in the bitmap */
	9927
	9928	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	9929	case we need to change the emitted regop to an EXACT. */
	9930	const char * orig_parse = RExC_parse;
	9931	GET_RE_DEBUG_FLAGS_DECL;
	9932
	9933	PERL_ARGS_ASSERT_REGCLASS;
	9934	#ifndef DEBUGGING
	9935	PERL_UNUSED_ARG(depth);
	9936	#endif
	9937
	9938	DEBUG_PARSE("clas");
	9939
	9940	/* Assume we are going to generate an ANYOF node. */
	9941	ret = reganode(pRExC_state, ANYOF, 0);
	9942
	9943
	9944	if (!SIZE_ONLY) {
	9945	ANYOF_FLAGS(ret) = 0;
	9946	}
	9947
	9948	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	9949	RExC_naughty++;
	9950	RExC_parse++;
	9951	if (!SIZE_ONLY)
	9952	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	9953
	9954	/* We have decided to not allow multi-char folds in inverted character
	9955	* classes, due to the confusion that can happen, especially with
	9956	* classes that are designed for a non-Unicode world: You have the
	9957	* peculiar case that:
	9958	"s s" =~ /^[^\xDF]+$/i => Y
	9959	"ss" =~ /^[^\xDF]+$/i => N
	9960	*
	9961	* See [perl #89750] */
	9962	allow_full_fold = FALSE;
	9963	}
	9964
	9965	if (SIZE_ONLY) {
	9966	RExC_size += ANYOF_SKIP;
	9967	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	9968	}
	9969	else {
	9970	RExC_emit += ANYOF_SKIP;
	9971	if (LOC) {
	9972	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	9973	}
	9974	ANYOF_BITMAP_ZERO(ret);
	9975	listsv = newSVpvs("# comment\n");
	9976	initial_listsv_len = SvCUR(listsv);
	9977	}
	9978
	9979	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	9980
	9981	if (!SIZE_ONLY && POSIXCC(nextvalue))
	9982	checkposixcc(pRExC_state);
	9983
	9984	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	9985	if (UCHARAT(RExC_parse) == ']')
	9986	goto charclassloop;
	9987
	9988	parseit:
	9989	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	9990
	9991	charclassloop:
	9992
	9993	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	9994
	9995	if (!range)
	9996	rangebegin = RExC_parse;
	9997	if (UTF) {
	9998	value = utf8n_to_uvchr((U8*)RExC_parse,
	9999	RExC_end - RExC_parse,
	10000	&numlen, UTF8_ALLOW_DEFAULT);
	10001	RExC_parse += numlen;
	10002	}
	10003	else
	10004	value = UCHARAT(RExC_parse++);
	10005
	10006	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	10007	if (value == '[' && POSIXCC(nextvalue))
	10008	namedclass = regpposixcc(pRExC_state, value);
	10009	else if (value == '\\') {
	10010	if (UTF) {
	10011	value = utf8n_to_uvchr((U8*)RExC_parse,
	10012	RExC_end - RExC_parse,
	10013	&numlen, UTF8_ALLOW_DEFAULT);
	10014	RExC_parse += numlen;
	10015	}
	10016	else
	10017	value = UCHARAT(RExC_parse++);
	10018	/* Some compilers cannot handle switching on 64-bit integer
	10019	* values, therefore value cannot be an UV. Yes, this will
	10020	* be a problem later if we want switch on Unicode.
	10021	* A similar issue a little bit later when switching on
	10022	* namedclass. --jhi */
	10023	switch ((I32)value) {
	10024	case 'w': namedclass = ANYOF_ALNUM; break;
	10025	case 'W': namedclass = ANYOF_NALNUM; break;
	10026	case 's': namedclass = ANYOF_SPACE; break;
	10027	case 'S': namedclass = ANYOF_NSPACE; break;
	10028	case 'd': namedclass = ANYOF_DIGIT; break;
	10029	case 'D': namedclass = ANYOF_NDIGIT; break;
	10030	case 'v': namedclass = ANYOF_VERTWS; break;
	10031	case 'V': namedclass = ANYOF_NVERTWS; break;
	10032	case 'h': namedclass = ANYOF_HORIZWS; break;
	10033	case 'H': namedclass = ANYOF_NHORIZWS; break;
	10034	case 'N': /* Handle \N{NAME} in class */
	10035	{
	10036	/* We only pay attention to the first char of
	10037	multichar strings being returned. I kinda wonder
	10038	if this makes sense as it does change the behaviour
	10039	from earlier versions, OTOH that behaviour was broken
	10040	as well. */
	10041	UV v; /* value is register so we cant & it /grrr */
	10042	if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
	10043	goto parseit;
	10044	}
	10045	value= v;
	10046	}
	10047	break;
	10048	case 'p':
	10049	case 'P':
	10050	{
	10051	char *e;
	10052	if (RExC_parse >= RExC_end)
	10053	vFAIL2("Empty \\%c{}", (U8)value);
	10054	if (*RExC_parse == '{') {
	10055	const U8 c = (U8)value;
	10056	e = strchr(RExC_parse++, '}');
	10057	if (!e)
	10058	vFAIL2("Missing right brace on \\%c{}", c);
	10059	while (isSPACE(UCHARAT(RExC_parse)))
	10060	RExC_parse++;
	10061	if (e == RExC_parse)
	10062	vFAIL2("Empty \\%c{}", c);
	10063	n = e - RExC_parse;
	10064	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	10065	n--;
	10066	}
	10067	else {
	10068	e = RExC_parse;
	10069	n = 1;
	10070	}
	10071	if (!SIZE_ONLY) {
	10072	if (UCHARAT(RExC_parse) == '^') {
	10073	RExC_parse++;
	10074	n--;
	10075	value = value == 'p' ? 'P' : 'p'; /* toggle */
	10076	while (isSPACE(UCHARAT(RExC_parse))) {
	10077	RExC_parse++;
	10078	n--;
	10079	}
	10080	}
	10081
	10082	/* Add the property name to the list. If /i matching, give
	10083	* a different name which consists of the normal name
	10084	* sandwiched between two underscores and '_i'. The design
	10085	* is discussed in the commit message for this. */
	10086	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
	10087	(value=='p' ? '+' : '!'),
	10088	(FOLD) ? "__" : "",
	10089	(int)n,
	10090	RExC_parse,
	10091	(FOLD) ? "_i" : ""
	10092	);
	10093	}
	10094	RExC_parse = e + 1;
	10095
	10096	/* The \p could match something in the Latin1 range, hence
	10097	* something that isn't utf8 */
	10098	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	10099	namedclass = ANYOF_MAX; /* no official name, but it's named */
	10100
	10101	/* \p means they want Unicode semantics */
	10102	RExC_uni_semantics = 1;
	10103	}
	10104	break;
	10105	case 'n': value = '\n'; break;
	10106	case 'r': value = '\r'; break;
	10107	case 't': value = '\t'; break;
	10108	case 'f': value = '\f'; break;
	10109	case 'b': value = '\b'; break;
	10110	case 'e': value = ASCII_TO_NATIVE('\033');break;
	10111	case 'a': value = ASCII_TO_NATIVE('\007');break;
	10112	case 'o':
	10113	RExC_parse--; /* function expects to be pointed at the 'o' */
	10114	{
	10115	const char* error_msg;
	10116	bool valid = grok_bslash_o(RExC_parse,
	10117	&value,
	10118	&numlen,
	10119	&error_msg,
	10120	SIZE_ONLY);
	10121	RExC_parse += numlen;
	10122	if (! valid) {
	10123	vFAIL(error_msg);
	10124	}
	10125	}
	10126	if (PL_encoding && value < 0x100) {
	10127	goto recode_encoding;
	10128	}
	10129	break;
	10130	case 'x':
	10131	if (*RExC_parse == '{') {
	10132	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	10133	\| PERL_SCAN_DISALLOW_PREFIX;
	10134	char * const e = strchr(RExC_parse++, '}');
	10135	if (!e)
	10136	vFAIL("Missing right brace on \\x{}");
	10137
	10138	numlen = e - RExC_parse;
	10139	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10140	RExC_parse = e + 1;
	10141	}
	10142	else {
	10143	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	10144	numlen = 2;
	10145	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	10146	RExC_parse += numlen;
	10147	}
	10148	if (PL_encoding && value < 0x100)
	10149	goto recode_encoding;
	10150	break;
	10151	case 'c':
	10152	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	10153	break;
	10154	case '0': case '1': case '2': case '3': case '4':
	10155	case '5': case '6': case '7':
	10156	{
	10157	/* Take 1-3 octal digits */
	10158	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	10159	numlen = 3;
	10160	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	10161	RExC_parse += numlen;
	10162	if (PL_encoding && value < 0x100)
	10163	goto recode_encoding;
	10164	break;
	10165	}
	10166	recode_encoding:
	10167	if (! RExC_override_recoding) {
	10168	SV* enc = PL_encoding;
	10169	value = reg_recode((const char)(U8)value, &enc);
	10170	if (!enc && SIZE_ONLY)
	10171	ckWARNreg(RExC_parse,
	10172	"Invalid escape in the specified encoding");
	10173	break;
	10174	}
	10175	default:
	10176	/* Allow \_ to not give an error */
	10177	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	10178	ckWARN2reg(RExC_parse,
	10179	"Unrecognized escape \\%c in character class passed through",
	10180	(int)value);
	10181	}
	10182	break;
	10183	}
	10184	} /* end of \blah */
	10185	#ifdef EBCDIC
	10186	else
	10187	literal_endpoint++;
	10188	#endif
	10189
	10190	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	10191
	10192	/* What matches in a locale is not known until runtime, so need to
	10193	* (one time per class) allocate extra space to pass to regexec.
	10194	* The space will contain a bit for each named class that is to be
	10195	* matched against. This isn't needed for \p{} and pseudo-classes,
	10196	* as they are not affected by locale, and hence are dealt with
	10197	* separately */
	10198	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	10199	need_class = 1;
	10200	if (SIZE_ONLY) {
	10201	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10202	}
	10203	else {
	10204	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	10205	ANYOF_CLASS_ZERO(ret);
	10206	}
	10207	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	10208	}
	10209
	10210	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	10211	* literal, as is the character that began the false range, i.e.
	10212	* the 'a' in the examples */
	10213	if (range) {
	10214	if (!SIZE_ONLY) {
	10215	const int w =
	10216	RExC_parse >= rangebegin ?
	10217	RExC_parse - rangebegin : 0;
	10218	ckWARN4reg(RExC_parse,
	10219	"False [] range \"%.s\"",
	10220	w, w, rangebegin);
	10221
	10222	stored +=
	10223	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	10224	if (prevvalue < 256) {
	10225	stored +=
	10226	set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
	10227	}
	10228	else {
	10229	nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
	10230	}
	10231	}
	10232
	10233	range = 0; /* this was not a true range */
	10234	}
	10235
	10236
	10237
	10238	if (!SIZE_ONLY) {
	10239	const char *what = NULL;
	10240	char yesno = 0;
	10241
	10242	/* Possible truncation here but in some 64-bit environments
	10243	* the compiler gets heartburn about switch on 64-bit values.
	10244	* A similar issue a little earlier when switching on value.
	10245	* --jhi */
	10246	switch ((I32)namedclass) {
	10247
	10248	case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
	10249	case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
	10250	case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
	10251	case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
	10252	case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
	10253	case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
	10254	case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
	10255	case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
	10256	case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
	10257	case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
	10258	/* \s, \w match all unicode if utf8. */
	10259	case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
	10260	case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
	10261	case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
	10262	case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
	10263	case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
	10264	case ANYOF_ASCII:
	10265	if (LOC)
	10266	ANYOF_CLASS_SET(ret, ANYOF_ASCII);
	10267	else {
	10268	for (value = 0; value < 128; value++)
	10269	stored +=
	10270	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	10271	}
	10272	yesno = '+';
	10273	what = NULL; /* Doesn't match outside ascii, so
	10274	don't want to add +utf8:: */
	10275	break;
	10276	case ANYOF_NASCII:
	10277	if (LOC)
	10278	ANYOF_CLASS_SET(ret, ANYOF_NASCII);
	10279	else {
	10280	for (value = 128; value < 256; value++)
	10281	stored +=
	10282	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	10283	}
	10284	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	10285	yesno = '!';
	10286	what = "ASCII";
	10287	break;
	10288	case ANYOF_DIGIT:
	10289	if (LOC)
	10290	ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
	10291	else {
	10292	/* consecutive digits assumed */
	10293	for (value = '0'; value <= '9'; value++)
	10294	stored +=
	10295	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10296	}
	10297	yesno = '+';
	10298	what = "Digit";
	10299	break;
	10300	case ANYOF_NDIGIT:
	10301	if (LOC)
	10302	ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
	10303	else {
	10304	/* consecutive digits assumed */
	10305	for (value = 0; value < '0'; value++)
	10306	stored +=
	10307	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10308	for (value = '9' + 1; value < 256; value++)
	10309	stored +=
	10310	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	10311	}
	10312	yesno = '!';
	10313	what = "Digit";
	10314	if (AT_LEAST_ASCII_RESTRICTED ) {
	10315	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	10316	}
	10317	break;
	10318	case ANYOF_MAX:
	10319	/* this is to handle \p and \P */
	10320	break;
	10321	default:
	10322	vFAIL("Invalid [::] class");
	10323	break;
	10324	}
	10325	if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
	10326	/* Strings such as "+utf8::isWord\n" */
	10327	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
	10328	}
	10329
	10330	continue;
	10331	}
	10332	} /* end of namedclass \blah */
	10333
	10334	if (range) {
	10335	if (prevvalue > (IV)value) /* b-a */ {
	10336	const int w = RExC_parse - rangebegin;
	10337	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	10338	range = 0; /* not a valid range */
	10339	}
	10340	}
	10341	else {
	10342	prevvalue = value; /* save the beginning of the range */
	10343	if (RExC_parse+1 < RExC_end
	10344	&& *RExC_parse == '-'
	10345	&& RExC_parse[1] != ']')
	10346	{
	10347	RExC_parse++;
	10348
	10349	/* a bad range like \w-, [:word:]- ? */
	10350	if (namedclass > OOB_NAMEDCLASS) {
	10351	if (ckWARN(WARN_REGEXP)) {
	10352	const int w =
	10353	RExC_parse >= rangebegin ?
	10354	RExC_parse - rangebegin : 0;
	10355	vWARN4(RExC_parse,
	10356	"False [] range \"%.s\"",
	10357	w, w, rangebegin);
	10358	}
	10359	if (!SIZE_ONLY)
	10360	stored +=
	10361	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	10362	} else
	10363	range = 1; /* yeah, it's a range! */
	10364	continue; /* but do it the next time */
	10365	}
	10366	}
	10367
	10368	/* non-Latin1 code point implies unicode semantics. Must be set in
	10369	* pass1 so is there for the whole of pass 2 */
	10370	if (value > 255) {
	10371	RExC_uni_semantics = 1;
	10372	}
	10373
	10374	/* now is the next time */
	10375	if (!SIZE_ONLY) {
	10376	if (prevvalue < 256) {
	10377	const IV ceilvalue = value < 256 ? value : 255;
	10378	IV i;
	10379	#ifdef EBCDIC
	10380	/* In EBCDIC [\x89-\x91] should include
	10381	* the \x8e but [i-j] should not. */
	10382	if (literal_endpoint == 2 &&
	10383	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	10384	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	10385	{
	10386	if (isLOWER(prevvalue)) {
	10387	for (i = prevvalue; i <= ceilvalue; i++)
	10388	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10389	stored +=
	10390	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10391	}
	10392	} else {
	10393	for (i = prevvalue; i <= ceilvalue; i++)
	10394	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10395	stored +=
	10396	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10397	}
	10398	}
	10399	}
	10400	else
	10401	#endif
	10402	for (i = prevvalue; i <= ceilvalue; i++) {
	10403	stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10404	}
	10405	}
	10406	if (value > 255) {
	10407	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	10408	const UV natvalue = NATIVE_TO_UNI(value);
	10409	nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
	10410	}
	10411	#ifdef EBCDIC
	10412	literal_endpoint = 0;
	10413	#endif
	10414	}
	10415
	10416	range = 0; /* this range (if it was one) is done now */
	10417	}
	10418
	10419
	10420
	10421	if (SIZE_ONLY)
	10422	return ret;
	10423	/**** !SIZE_ONLY AFTER HERE *******/
	10424
	10425	/* If folding and there are code points above 255, we calculate all
	10426	* characters that could fold to or from the ones already on the list */
	10427	if (FOLD && nonbitmap) {
	10428	UV start, end; /* End points of code point ranges */
	10429
	10430	SV* fold_intersection;
	10431
	10432	/* This is a list of all the characters that participate in folds
	10433	* (except marks, etc in multi-char folds */
	10434	if (! PL_utf8_foldable) {
	10435	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	10436	PL_utf8_foldable = _swash_to_invlist(swash);
	10437	}
	10438
	10439	/* This is a hash that for a particular fold gives all characters
	10440	* that are involved in it */
	10441	if (! PL_utf8_foldclosures) {
	10442
	10443	/* If we were unable to find any folds, then we likely won't be
	10444	* able to find the closures. So just create an empty list.
	10445	* Folding will effectively be restricted to the non-Unicode rules
	10446	* hard-coded into Perl. (This case happens legitimately during
	10447	* compilation of Perl itself before the Unicode tables are
	10448	* generated) */
	10449	if (invlist_len(PL_utf8_foldable) == 0) {
	10450	PL_utf8_foldclosures = newHV();
	10451	} else {
	10452	/* If the folds haven't been read in, call a fold function
	10453	* to force that */
	10454	if (! PL_utf8_tofold) {
	10455	U8 dummy[UTF8_MAXBYTES+1];
	10456	STRLEN dummy_len;
	10457
	10458	/* This particular string is above \xff in both UTF-8 and
	10459	* UTFEBCDIC */
	10460	to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
	10461	assert(PL_utf8_tofold); /* Verify that worked */
	10462	}
	10463	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	10464	}
	10465	}
	10466
	10467	/* Only the characters in this class that participate in folds need
	10468	* be checked. Get the intersection of this class and all the
	10469	* possible characters that are foldable. This can quickly narrow
	10470	* down a large class */
	10471	_invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
	10472
	10473	/* Now look at the foldable characters in this class individually */
	10474	invlist_iterinit(fold_intersection);
	10475	while (invlist_iternext(fold_intersection, &start, &end)) {
	10476	UV j;
	10477
	10478	/* Look at every character in the range */
	10479	for (j = start; j <= end; j++) {
	10480
	10481	/* Get its fold */
	10482	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10483	STRLEN foldlen;
	10484	const UV f =
	10485	_to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
	10486
	10487	if (foldlen > (STRLEN)UNISKIP(f)) {
	10488
	10489	/* Any multicharacter foldings (disallowed in
	10490	* lookbehind patterns) require the following
	10491	* transform: [ABCDEF] -> (?:[ABCabcDEFd]\|pq\|rst) where
	10492	* E folds into "pq" and F folds into "rst", all other
	10493	* characters fold to single characters. We save away
	10494	* these multicharacter foldings, to be later saved as
	10495	* part of the additional "s" data. */
	10496	if (! RExC_in_lookbehind) {
	10497	U8* loc = foldbuf;
	10498	U8* e = foldbuf + foldlen;
	10499
	10500	/* If any of the folded characters of this are in
	10501	* the Latin1 range, tell the regex engine that
	10502	* this can match a non-utf8 target string. The
	10503	* only multi-byte fold whose source is in the
	10504	* Latin1 range (U+00DF) applies only when the
	10505	* target string is utf8, or under unicode rules */
	10506	if (j > 255 \|\| AT_LEAST_UNI_SEMANTICS) {
	10507	while (loc < e) {
	10508
	10509	/* Can't mix ascii with non- under /aa */
	10510	if (MORE_ASCII_RESTRICTED
	10511	&& (isASCII(*loc) != isASCII(j)))
	10512	{
	10513	goto end_multi_fold;
	10514	}
	10515	if (UTF8_IS_INVARIANT(*loc)
	10516	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	10517	{
	10518	/* Can't mix above and below 256 under
	10519	* LOC */
	10520	if (LOC) {
	10521	goto end_multi_fold;
	10522	}
	10523	ANYOF_FLAGS(ret)
	10524	\|= ANYOF_NONBITMAP_NON_UTF8;
	10525	break;
	10526	}
	10527	loc += UTF8SKIP(loc);
	10528	}
	10529	}
	10530
	10531	add_alternate(&unicode_alternate, foldbuf, foldlen);
	10532	end_multi_fold: ;
	10533	}
	10534
	10535	/* This is special-cased, as it is the only letter which
	10536	* has both a multi-fold and single-fold in Latin1. All
	10537	* the other chars that have single and multi-folds are
	10538	* always in utf8, and the utf8 folding algorithm catches
	10539	* them */
	10540	if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
	10541	stored += set_regclass_bit(pRExC_state,
	10542	ret,
	10543	LATIN_SMALL_LETTER_SHARP_S,
	10544	&l1_fold_invlist, &unicode_alternate);
	10545	}
	10546	}
	10547	else {
	10548	/* Single character fold. Add everything in its fold
	10549	* closure to the list that this node should match */
	10550	SV** listp;
	10551
	10552	/* The fold closures data structure is a hash with the
	10553	* keys being every character that is folded to, like
	10554	* 'k', and the values each an array of everything that
	10555	* folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	10556	if ((listp = hv_fetch(PL_utf8_foldclosures,
	10557	(char *) foldbuf, foldlen, FALSE)))
	10558	{
	10559	AV* list = (AV) listp;
	10560	IV k;
	10561	for (k = 0; k <= av_len(list); k++) {
	10562	SV** c_p = av_fetch(list, k, FALSE);
	10563	UV c;
	10564	if (c_p == NULL) {
	10565	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	10566	}
	10567	c = SvUV(*c_p);
	10568
	10569	/* /aa doesn't allow folds between ASCII and
	10570	* non-; /l doesn't allow them between above
	10571	* and below 256 */
	10572	if ((MORE_ASCII_RESTRICTED
	10573	&& (isASCII(c) != isASCII(j)))
	10574	\|\| (LOC && ((c < 256) != (j < 256))))
	10575	{
	10576	continue;
	10577	}
	10578
	10579	if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
	10580	stored += set_regclass_bit(pRExC_state,
	10581	ret,
	10582	(U8) c,
	10583	&l1_fold_invlist, &unicode_alternate);
	10584	}
	10585	/* It may be that the code point is already
	10586	* in this range or already in the bitmap,
	10587	* in which case we need do nothing */
	10588	else if ((c < start \|\| c > end)
	10589	&& (c > 255
	10590	\|\| ! ANYOF_BITMAP_TEST(ret, c)))
	10591	{
	10592	nonbitmap = add_cp_to_invlist(nonbitmap, c);
	10593	}
	10594	}
	10595	}
	10596	}
	10597	}
	10598	}
	10599	SvREFCNT_dec(fold_intersection);
	10600	}
	10601
	10602	/* Combine the two lists into one. */
	10603	if (l1_fold_invlist) {
	10604	if (nonbitmap) {
	10605	_invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
	10606	SvREFCNT_dec(l1_fold_invlist);
	10607	}
	10608	else {
	10609	nonbitmap = l1_fold_invlist;
	10610	}
	10611	}
	10612
	10613	/* Here, we have calculated what code points should be in the character
	10614	* class. Now we can see about various optimizations. Fold calculation
	10615	* needs to take place before inversion. Otherwise /[^k]/i would invert to
	10616	* include K, which under /i would match k. */
	10617
	10618	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	10619	* set the FOLD flag yet, so this does optimize those. It doesn't
	10620	* optimize locale. Doing so perhaps could be done as long as there is
	10621	* nothing like \w in it; some thought also would have to be given to the
	10622	* interaction with above 0x100 chars */
	10623	if (! LOC
	10624	&& (ANYOF_FLAGS(ret) & ANYOF_INVERT)
	10625	&& ! unicode_alternate
	10626	/* In case of /d, there are some things that should match only when in
	10627	* not in the bitmap, i.e., they require UTF8 to match. These are
	10628	* listed in nonbitmap. */
	10629	&& (! nonbitmap
	10630	\|\| ! DEPENDS_SEMANTICS
	10631	\|\| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	10632	&& SvCUR(listsv) == initial_listsv_len)
	10633	{
	10634	if (! nonbitmap) {
	10635	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
	10636	ANYOF_BITMAP(ret)[value] ^= 0xFF;
	10637	/* The inversion means that everything above 255 is matched */
	10638	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	10639	}
	10640	else {
	10641	/* Here, also has things outside the bitmap. Go through each bit
	10642	* individually and add it to the list to get rid of from those
	10643	* things not in the bitmap */
	10644	SV *remove_list = _new_invlist(2);
	10645	_invlist_invert(nonbitmap);
	10646	for (value = 0; value < 256; ++value) {
	10647	if (ANYOF_BITMAP_TEST(ret, value)) {
	10648	ANYOF_BITMAP_CLEAR(ret, value);
	10649	remove_list = add_cp_to_invlist(remove_list, value);
	10650	}
	10651	else {
	10652	ANYOF_BITMAP_SET(ret, value);
	10653	}
	10654	}
	10655	_invlist_subtract(nonbitmap, remove_list, &nonbitmap);
	10656	SvREFCNT_dec(remove_list);
	10657	}
	10658
	10659	stored = 256 - stored;
	10660
	10661	/* Clear the invert flag since have just done it here */
	10662	ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
	10663	}
	10664
	10665	/* Folding in the bitmap is taken care of above, but not for locale (for
	10666	* which we have to wait to see what folding is in effect at runtime), and
	10667	* for things not in the bitmap. Set run-time fold flag for these */
	10668	if (FOLD && (LOC \|\| nonbitmap \|\| unicode_alternate)) {
	10669	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	10670	}
	10671
	10672	/* A single character class can be "optimized" into an EXACTish node.
	10673	* Note that since we don't currently count how many characters there are
	10674	* outside the bitmap, we are XXX missing optimization possibilities for
	10675	* them. This optimization can't happen unless this is a truly single
	10676	* character class, which means that it can't be an inversion into a
	10677	* many-character class, and there must be no possibility of there being
	10678	* things outside the bitmap. 'stored' (only) for locales doesn't include
	10679	* \w, etc, so have to make a special test that they aren't present
	10680	*
	10681	* Similarly A 2-character class of the very special form like [bB] can be
	10682	* optimized into an EXACTFish node, but only for non-locales, and for
	10683	* characters which only have the two folds; so things like 'fF' and 'Ii'
	10684	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	10685	* FI'. */
	10686	if (! nonbitmap
	10687	&& ! unicode_alternate
	10688	&& SvCUR(listsv) == initial_listsv_len
	10689	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	10690	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10691	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	10692	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10693	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	10694	/* If the latest code point has a fold whose
	10695	* bit is set, it must be the only other one */
	10696	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	10697	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	10698	{
	10699	/* Note that the information needed to decide to do this optimization
	10700	* is not currently available until the 2nd pass, and that the actually
	10701	* used EXACTish node takes less space than the calculated ANYOF node,
	10702	* and hence the amount of space calculated in the first pass is larger
	10703	* than actually used, so this optimization doesn't gain us any space.
	10704	* But an EXACT node is faster than an ANYOF node, and can be combined
	10705	* with any adjacent EXACT nodes later by the optimizer for further
	10706	* gains. The speed of executing an EXACTF is similar to an ANYOF
	10707	* node, so the optimization advantage comes from the ability to join
	10708	* it to adjacent EXACT nodes */
	10709
	10710	const char * cur_parse= RExC_parse;
	10711	U8 op;
	10712	RExC_emit = (regnode *)orig_emit;
	10713	RExC_parse = (char *)orig_parse;
	10714
	10715	if (stored == 1) {
	10716
	10717	/* A locale node with one point can be folded; all the other cases
	10718	* with folding will have two points, since we calculate them above
	10719	*/
	10720	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	10721	op = EXACTFL;
	10722	}
	10723	else {
	10724	op = EXACT;
	10725	}
	10726	}
	10727	else { /* else 2 chars in the bit map: the folds of each other */
	10728
	10729	/* Use the folded value, which for the cases where we get here,
	10730	* is just the lower case of the current one (which may resolve to
	10731	* itself, or to the other one */
	10732	value = toLOWER_LATIN1(value);
	10733	if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	10734
	10735	/* To join adjacent nodes, they must be the exact EXACTish
	10736	* type. Try to use the most likely type, by using EXACTFU if
	10737	* the regex calls for them, or is required because the
	10738	* character is non-ASCII */
	10739	op = EXACTFU;
	10740	}
	10741	else { /* Otherwise, more likely to be EXACTF type */
	10742	op = EXACTF;
	10743	}
	10744	}
	10745
	10746	ret = reg_node(pRExC_state, op);
	10747	RExC_parse = (char *)cur_parse;
	10748	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	10749	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	10750	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	10751	STR_LEN(ret)= 2;
	10752	RExC_emit += STR_SZ(2);
	10753	}
	10754	else {
	10755	*STRING(ret)= (char)value;
	10756	STR_LEN(ret)= 1;
	10757	RExC_emit += STR_SZ(1);
	10758	}
	10759	SvREFCNT_dec(listsv);
	10760	return ret;
	10761	}
	10762
	10763	if (nonbitmap) {
	10764	UV start, end;
	10765	invlist_iterinit(nonbitmap);
	10766	while (invlist_iternext(nonbitmap, &start, &end)) {
	10767	if (start == end) {
	10768	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
	10769	}
	10770	else {
	10771	/* The \t sets the whole range */
	10772	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
	10773	/* XXX EBCDIC */
	10774	start, end);
	10775	}
	10776	}
	10777	SvREFCNT_dec(nonbitmap);
	10778	}
	10779
	10780	if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
	10781	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	10782	SvREFCNT_dec(listsv);
	10783	SvREFCNT_dec(unicode_alternate);
	10784	}
	10785	else {
	10786
	10787	AV * const av = newAV();
	10788	SV *rv;
	10789	/* The 0th element stores the character class description
	10790	* in its textual form: used later (regexec.c:Perl_regclass_swash())
	10791	* to initialize the appropriate swash (which gets stored in
	10792	* the 1st element), and also useful for dumping the regnode.
	10793	* The 2nd element stores the multicharacter foldings,
	10794	* used later (regexec.c:S_reginclass()). */
	10795	av_store(av, 0, listsv);
	10796	av_store(av, 1, NULL);
	10797
	10798	/* Store any computed multi-char folds only if we are allowing
	10799	* them */
	10800	if (allow_full_fold) {
	10801	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	10802	if (unicode_alternate) { /* This node is variable length */
	10803	OP(ret) = ANYOFV;
	10804	}
	10805	}
	10806	else {
	10807	av_store(av, 2, NULL);
	10808	}
	10809	rv = newRV_noinc(MUTABLE_SV(av));
	10810	n = add_data(pRExC_state, 1, "s");
	10811	RExC_rxi->data->data[n] = (void*)rv;
	10812	ARG_SET(ret, n);
	10813	}
	10814	return ret;
	10815	}
	10816	#undef _C_C_T_
	10817
	10818
	10819	/* reg_skipcomment()
	10820
	10821	Absorbs an /x style # comments from the input stream.
	10822	Returns true if there is more text remaining in the stream.
	10823	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	10824	terminates the pattern without including a newline.
	10825
	10826	Note its the callers responsibility to ensure that we are
	10827	actually in /x mode
	10828
	10829	*/
	10830
	10831	STATIC bool
	10832	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	10833	{
	10834	bool ended = 0;
	10835
	10836	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	10837
	10838	while (RExC_parse < RExC_end)
	10839	if (*RExC_parse++ == '\n') {
	10840	ended = 1;
	10841	break;
	10842	}
	10843	if (!ended) {
	10844	/* we ran off the end of the pattern without ending
	10845	the comment, so we have to add an \n when wrapping */
	10846	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	10847	return 0;
	10848	} else
	10849	return 1;
	10850	}
	10851
	10852	/* nextchar()
	10853
	10854	Advances the parse position, and optionally absorbs
	10855	"whitespace" from the inputstream.
	10856
	10857	Without /x "whitespace" means (?#...) style comments only,
	10858	with /x this means (?#...) and # comments and whitespace proper.
	10859
	10860	Returns the RExC_parse point from BEFORE the scan occurs.
	10861
	10862	This is the /x friendly way of saying RExC_parse++.
	10863	*/
	10864
	10865	STATIC char*
	10866	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	10867	{
	10868	char* const retval = RExC_parse++;
	10869
	10870	PERL_ARGS_ASSERT_NEXTCHAR;
	10871
	10872	for (;;) {
	10873	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
	10874	RExC_parse[2] == '#') {
	10875	while (*RExC_parse != ')') {
	10876	if (RExC_parse == RExC_end)
	10877	FAIL("Sequence (?#... not terminated");
	10878	RExC_parse++;
	10879	}
	10880	RExC_parse++;
	10881	continue;
	10882	}
	10883	if (RExC_flags & RXf_PMf_EXTENDED) {
	10884	if (isSPACE(*RExC_parse)) {
	10885	RExC_parse++;
	10886	continue;
	10887	}
	10888	else if (*RExC_parse == '#') {
	10889	if ( reg_skipcomment( pRExC_state ) )
	10890	continue;
	10891	}
	10892	}
	10893	return retval;
	10894	}
	10895	}
	10896
	10897	/*
	10898	- reg_node - emit a node
	10899	*/
	10900	STATIC regnode * /* Location. */
	10901	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	10902	{
	10903	dVAR;
	10904	register regnode *ptr;
	10905	regnode * const ret = RExC_emit;
	10906	GET_RE_DEBUG_FLAGS_DECL;
	10907
	10908	PERL_ARGS_ASSERT_REG_NODE;
	10909
	10910	if (SIZE_ONLY) {
	10911	SIZE_ALIGN(RExC_size);
	10912	RExC_size += 1;
	10913	return(ret);
	10914	}
	10915	if (RExC_emit >= RExC_emit_bound)
	10916	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10917
	10918	NODE_ALIGN_FILL(ret);
	10919	ptr = ret;
	10920	FILL_ADVANCE_NODE(ptr, op);
	10921	#ifdef RE_TRACK_PATTERN_OFFSETS
	10922	if (RExC_offsets) { /* MJD */
	10923	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	10924	"reg_node", __LINE__,
	10925	PL_reg_name[op],
	10926	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	10927	? "Overwriting end of array!\n" : "OK",
	10928	(UV)(RExC_emit - RExC_emit_start),
	10929	(UV)(RExC_parse - RExC_start),
	10930	(UV)RExC_offsets[0]));
	10931	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	10932	}
	10933	#endif
	10934	RExC_emit = ptr;
	10935	return(ret);
	10936	}
	10937
	10938	/*
	10939	- reganode - emit a node with an argument
	10940	*/
	10941	STATIC regnode * /* Location. */
	10942	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	10943	{
	10944	dVAR;
	10945	register regnode *ptr;
	10946	regnode * const ret = RExC_emit;
	10947	GET_RE_DEBUG_FLAGS_DECL;
	10948
	10949	PERL_ARGS_ASSERT_REGANODE;
	10950
	10951	if (SIZE_ONLY) {
	10952	SIZE_ALIGN(RExC_size);
	10953	RExC_size += 2;
	10954	/*
	10955	We can't do this:
	10956
	10957	assert(2==regarglen[op]+1);
	10958
	10959	Anything larger than this has to allocate the extra amount.
	10960	If we changed this to be:
	10961
	10962	RExC_size += (1 + regarglen[op]);
	10963
	10964	then it wouldn't matter. Its not clear what side effect
	10965	might come from that so its not done so far.
	10966	-- dmq
	10967	*/
	10968	return(ret);
	10969	}
	10970	if (RExC_emit >= RExC_emit_bound)
	10971	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10972
	10973	NODE_ALIGN_FILL(ret);
	10974	ptr = ret;
	10975	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	10976	#ifdef RE_TRACK_PATTERN_OFFSETS
	10977	if (RExC_offsets) { /* MJD */
	10978	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	10979	"reganode",
	10980	__LINE__,
	10981	PL_reg_name[op],
	10982	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	10983	"Overwriting end of array!\n" : "OK",
	10984	(UV)(RExC_emit - RExC_emit_start),
	10985	(UV)(RExC_parse - RExC_start),
	10986	(UV)RExC_offsets[0]));
	10987	Set_Cur_Node_Offset;
	10988	}
	10989	#endif
	10990	RExC_emit = ptr;
	10991	return(ret);
	10992	}
	10993
	10994	/*
	10995	- reguni - emit (if appropriate) a Unicode character
	10996	*/
	10997	STATIC STRLEN
	10998	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	10999	{
	11000	dVAR;
	11001
	11002	PERL_ARGS_ASSERT_REGUNI;
	11003
	11004	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	11005	}
	11006
	11007	/*
	11008	- reginsert - insert an operator in front of already-emitted operand
	11009	*
	11010	* Means relocating the operand.
	11011	*/
	11012	STATIC void
	11013	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	11014	{
	11015	dVAR;
	11016	register regnode *src;
	11017	register regnode *dst;
	11018	register regnode *place;
	11019	const int offset = regarglen[(U8)op];
	11020	const int size = NODE_STEP_REGNODE + offset;
	11021	GET_RE_DEBUG_FLAGS_DECL;
	11022
	11023	PERL_ARGS_ASSERT_REGINSERT;
	11024	PERL_UNUSED_ARG(depth);
	11025	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	11026	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	11027	if (SIZE_ONLY) {
	11028	RExC_size += size;
	11029	return;
	11030	}
	11031
	11032	src = RExC_emit;
	11033	RExC_emit += size;
	11034	dst = RExC_emit;
	11035	if (RExC_open_parens) {
	11036	int paren;
	11037	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	11038	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	11039	if ( RExC_open_parens[paren] >= opnd ) {
	11040	/DEBUG_PARSE_FMT("open"," - %d",size);/
	11041	RExC_open_parens[paren] += size;
	11042	} else {
	11043	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	11044	}
	11045	if ( RExC_close_parens[paren] >= opnd ) {
	11046	/DEBUG_PARSE_FMT("close"," - %d",size);/
	11047	RExC_close_parens[paren] += size;
	11048	} else {
	11049	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	11050	}
	11051	}
	11052	}
	11053
	11054	while (src > opnd) {
	11055	StructCopy(--src, --dst, regnode);
	11056	#ifdef RE_TRACK_PATTERN_OFFSETS
	11057	if (RExC_offsets) { /* MJD 20010112 */
	11058	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	11059	"reg_insert",
	11060	__LINE__,
	11061	PL_reg_name[op],
	11062	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	11063	? "Overwriting end of array!\n" : "OK",
	11064	(UV)(src - RExC_emit_start),
	11065	(UV)(dst - RExC_emit_start),
	11066	(UV)RExC_offsets[0]));
	11067	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	11068	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	11069	}
	11070	#endif
	11071	}
	11072
	11073
	11074	place = opnd; /* Op node, where operand used to be. */
	11075	#ifdef RE_TRACK_PATTERN_OFFSETS
	11076	if (RExC_offsets) { /* MJD */
	11077	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	11078	"reginsert",
	11079	__LINE__,
	11080	PL_reg_name[op],
	11081	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	11082	? "Overwriting end of array!\n" : "OK",
	11083	(UV)(place - RExC_emit_start),
	11084	(UV)(RExC_parse - RExC_start),
	11085	(UV)RExC_offsets[0]));
	11086	Set_Node_Offset(place, RExC_parse);
	11087	Set_Node_Length(place, 1);
	11088	}
	11089	#endif
	11090	src = NEXTOPER(place);
	11091	FILL_ADVANCE_NODE(place, op);
	11092	Zero(src, offset, regnode);
	11093	}
	11094
	11095	/*
	11096	- regtail - set the next-pointer at the end of a node chain of p to val.
	11097	- SEE ALSO: regtail_study
	11098	*/
	11099	/* TODO: All three parms should be const */
	11100	STATIC void
	11101	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	11102	{
	11103	dVAR;
	11104	register regnode *scan;
	11105	GET_RE_DEBUG_FLAGS_DECL;
	11106
	11107	PERL_ARGS_ASSERT_REGTAIL;
	11108	#ifndef DEBUGGING
	11109	PERL_UNUSED_ARG(depth);
	11110	#endif
	11111
	11112	if (SIZE_ONLY)
	11113	return;
	11114
	11115	/* Find last node. */
	11116	scan = p;
	11117	for (;;) {
	11118	regnode * const temp = regnext(scan);
	11119	DEBUG_PARSE_r({
	11120	SV * const mysv=sv_newmortal();
	11121	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	11122	regprop(RExC_rx, mysv, scan);
	11123	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	11124	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	11125	(temp == NULL ? "->" : ""),
	11126	(temp == NULL ? PL_reg_name[OP(val)] : "")
	11127	);
	11128	});
	11129	if (temp == NULL)
	11130	break;
	11131	scan = temp;
	11132	}
	11133
	11134	if (reg_off_by_arg[OP(scan)]) {
	11135	ARG_SET(scan, val - scan);
	11136	}
	11137	else {
	11138	NEXT_OFF(scan) = val - scan;
	11139	}
	11140	}
	11141
	11142	#ifdef DEBUGGING
	11143	/*
	11144	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	11145	- Look for optimizable sequences at the same time.
	11146	- currently only looks for EXACT chains.
	11147
	11148	This is experimental code. The idea is to use this routine to perform
	11149	in place optimizations on branches and groups as they are constructed,
	11150	with the long term intention of removing optimization from study_chunk so
	11151	that it is purely analytical.
	11152
	11153	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	11154	to control which is which.
	11155
	11156	*/
	11157	/* TODO: All four parms should be const */
	11158
	11159	STATIC U8
	11160	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	11161	{
	11162	dVAR;
	11163	register regnode *scan;
	11164	U8 exact = PSEUDO;
	11165	#ifdef EXPERIMENTAL_INPLACESCAN
	11166	I32 min = 0;
	11167	#endif
	11168	GET_RE_DEBUG_FLAGS_DECL;
	11169
	11170	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	11171
	11172
	11173	if (SIZE_ONLY)
	11174	return exact;
	11175
	11176	/* Find last node. */
	11177
	11178	scan = p;
	11179	for (;;) {
	11180	regnode * const temp = regnext(scan);
	11181	#ifdef EXPERIMENTAL_INPLACESCAN
	11182	if (PL_regkind[OP(scan)] == EXACT)
	11183	if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
	11184	return EXACT;
	11185	#endif
	11186	if ( exact ) {
	11187	switch (OP(scan)) {
	11188	case EXACT:
	11189	case EXACTF:
	11190	case EXACTFA:
	11191	case EXACTFU:
	11192	case EXACTFL:
	11193	if( exact == PSEUDO )
	11194	exact= OP(scan);
	11195	else if ( exact != OP(scan) )
	11196	exact= 0;
	11197	case NOTHING:
	11198	break;
	11199	default:
	11200	exact= 0;
	11201	}
	11202	}
	11203	DEBUG_PARSE_r({
	11204	SV * const mysv=sv_newmortal();
	11205	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	11206	regprop(RExC_rx, mysv, scan);
	11207	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	11208	SvPV_nolen_const(mysv),
	11209	REG_NODE_NUM(scan),
	11210	PL_reg_name[exact]);
	11211	});
	11212	if (temp == NULL)
	11213	break;
	11214	scan = temp;
	11215	}
	11216	DEBUG_PARSE_r({
	11217	SV * const mysv_val=sv_newmortal();
	11218	DEBUG_PARSE_MSG("");
	11219	regprop(RExC_rx, mysv_val, val);
	11220	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	11221	SvPV_nolen_const(mysv_val),
	11222	(IV)REG_NODE_NUM(val),
	11223	(IV)(val - scan)
	11224	);
	11225	});
	11226	if (reg_off_by_arg[OP(scan)]) {
	11227	ARG_SET(scan, val - scan);
	11228	}
	11229	else {
	11230	NEXT_OFF(scan) = val - scan;
	11231	}
	11232
	11233	return exact;
	11234	}
	11235	#endif
	11236
	11237	/*
	11238	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	11239	*/
	11240	#ifdef DEBUGGING
	11241	static void
	11242	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	11243	{
	11244	int bit;
	11245	int set=0;
	11246	regex_charset cs;
	11247
	11248	for (bit=0; bit<32; bit++) {
	11249	if (flags & (1<<bit)) {
	11250	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	11251	continue;
	11252	}
	11253	if (!set++ && lead)
	11254	PerlIO_printf(Perl_debug_log, "%s",lead);
	11255	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	11256	}
	11257	}
	11258	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	11259	if (!set++ && lead) {
	11260	PerlIO_printf(Perl_debug_log, "%s",lead);
	11261	}
	11262	switch (cs) {
	11263	case REGEX_UNICODE_CHARSET:
	11264	PerlIO_printf(Perl_debug_log, "UNICODE");
	11265	break;
	11266	case REGEX_LOCALE_CHARSET:
	11267	PerlIO_printf(Perl_debug_log, "LOCALE");
	11268	break;
	11269	case REGEX_ASCII_RESTRICTED_CHARSET:
	11270	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	11271	break;
	11272	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	11273	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	11274	break;
	11275	default:
	11276	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	11277	break;
	11278	}
	11279	}
	11280	if (lead) {
	11281	if (set)
	11282	PerlIO_printf(Perl_debug_log, "\n");
	11283	else
	11284	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	11285	}
	11286	}
	11287	#endif
	11288
	11289	void
	11290	Perl_regdump(pTHX_ const regexp *r)
	11291	{
	11292	#ifdef DEBUGGING
	11293	dVAR;
	11294	SV * const sv = sv_newmortal();
	11295	SV *dsv= sv_newmortal();
	11296	RXi_GET_DECL(r,ri);
	11297	GET_RE_DEBUG_FLAGS_DECL;
	11298
	11299	PERL_ARGS_ASSERT_REGDUMP;
	11300
	11301	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	11302
	11303	/* Header fields of interest. */
	11304	if (r->anchored_substr) {
	11305	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	11306	RE_SV_DUMPLEN(r->anchored_substr), 30);
	11307	PerlIO_printf(Perl_debug_log,
	11308	"anchored %s%s at %"IVdf" ",
	11309	s, RE_SV_TAIL(r->anchored_substr),
	11310	(IV)r->anchored_offset);
	11311	} else if (r->anchored_utf8) {
	11312	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	11313	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	11314	PerlIO_printf(Perl_debug_log,
	11315	"anchored utf8 %s%s at %"IVdf" ",
	11316	s, RE_SV_TAIL(r->anchored_utf8),
	11317	(IV)r->anchored_offset);
	11318	}
	11319	if (r->float_substr) {
	11320	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	11321	RE_SV_DUMPLEN(r->float_substr), 30);
	11322	PerlIO_printf(Perl_debug_log,
	11323	"floating %s%s at %"IVdf"..%"UVuf" ",
	11324	s, RE_SV_TAIL(r->float_substr),
	11325	(IV)r->float_min_offset, (UV)r->float_max_offset);
	11326	} else if (r->float_utf8) {
	11327	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	11328	RE_SV_DUMPLEN(r->float_utf8), 30);
	11329	PerlIO_printf(Perl_debug_log,
	11330	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	11331	s, RE_SV_TAIL(r->float_utf8),
	11332	(IV)r->float_min_offset, (UV)r->float_max_offset);
	11333	}
	11334	if (r->check_substr \|\| r->check_utf8)
	11335	PerlIO_printf(Perl_debug_log,
	11336	(const char *)
	11337	(r->check_substr == r->float_substr
	11338	&& r->check_utf8 == r->float_utf8
	11339	? "(checking floating" : "(checking anchored"));
	11340	if (r->extflags & RXf_NOSCAN)
	11341	PerlIO_printf(Perl_debug_log, " noscan");
	11342	if (r->extflags & RXf_CHECK_ALL)
	11343	PerlIO_printf(Perl_debug_log, " isall");
	11344	if (r->check_substr \|\| r->check_utf8)
	11345	PerlIO_printf(Perl_debug_log, ") ");
	11346
	11347	if (ri->regstclass) {
	11348	regprop(r, sv, ri->regstclass);
	11349	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	11350	}
	11351	if (r->extflags & RXf_ANCH) {
	11352	PerlIO_printf(Perl_debug_log, "anchored");
	11353	if (r->extflags & RXf_ANCH_BOL)
	11354	PerlIO_printf(Perl_debug_log, "(BOL)");
	11355	if (r->extflags & RXf_ANCH_MBOL)
	11356	PerlIO_printf(Perl_debug_log, "(MBOL)");
	11357	if (r->extflags & RXf_ANCH_SBOL)
	11358	PerlIO_printf(Perl_debug_log, "(SBOL)");
	11359	if (r->extflags & RXf_ANCH_GPOS)
	11360	PerlIO_printf(Perl_debug_log, "(GPOS)");
	11361	PerlIO_putc(Perl_debug_log, ' ');
	11362	}
	11363	if (r->extflags & RXf_GPOS_SEEN)
	11364	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	11365	if (r->intflags & PREGf_SKIP)
	11366	PerlIO_printf(Perl_debug_log, "plus ");
	11367	if (r->intflags & PREGf_IMPLICIT)
	11368	PerlIO_printf(Perl_debug_log, "implicit ");
	11369	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	11370	if (r->extflags & RXf_EVAL_SEEN)
	11371	PerlIO_printf(Perl_debug_log, "with eval ");
	11372	PerlIO_printf(Perl_debug_log, "\n");
	11373	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	11374	#else
	11375	PERL_ARGS_ASSERT_REGDUMP;
	11376	PERL_UNUSED_CONTEXT;
	11377	PERL_UNUSED_ARG(r);
	11378	#endif /* DEBUGGING */
	11379	}
	11380
	11381	/*
	11382	- regprop - printable representation of opcode
	11383	*/
	11384	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	11385	STMT_START { \
	11386	if (do_sep) { \
	11387	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	11388	if (flags & ANYOF_INVERT) \
	11389	/make sure the invert info is in each / \
	11390	sv_catpvs(sv, "^"); \
	11391	do_sep = 0; \
	11392	} \
	11393	} STMT_END
	11394
	11395	void
	11396	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	11397	{
	11398	#ifdef DEBUGGING
	11399	dVAR;
	11400	register int k;
	11401	RXi_GET_DECL(prog,progi);
	11402	GET_RE_DEBUG_FLAGS_DECL;
	11403
	11404	PERL_ARGS_ASSERT_REGPROP;
	11405
	11406	sv_setpvs(sv, "");
	11407
	11408	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	11409	/* It would be nice to FAIL() here, but this may be called from
	11410	regexec.c, and it would be hard to supply pRExC_state. */
	11411	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	11412	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	11413
	11414	k = PL_regkind[OP(o)];
	11415
	11416	if (k == EXACT) {
	11417	sv_catpvs(sv, " ");
	11418	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	11419	* is a crude hack but it may be the best for now since
	11420	* we have no flag "this EXACTish node was UTF-8"
	11421	* --jhi */
	11422	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	11423	PERL_PV_ESCAPE_UNI_DETECT \|
	11424	PERL_PV_ESCAPE_NONASCII \|
	11425	PERL_PV_PRETTY_ELLIPSES \|
	11426	PERL_PV_PRETTY_LTGT \|
	11427	PERL_PV_PRETTY_NOCLEAR
	11428	);
	11429	} else if (k == TRIE) {
	11430	/* print the details of the trie in dumpuntil instead, as
	11431	* progi->data isn't available here */
	11432	const char op = OP(o);
	11433	const U32 n = ARG(o);
	11434	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	11435	(reg_ac_data *)progi->data->data[n] :
	11436	NULL;
	11437	const reg_trie_data * const trie
	11438	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	11439
	11440	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	11441	DEBUG_TRIE_COMPILE_r(
	11442	Perl_sv_catpvf(aTHX_ sv,
	11443	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	11444	(UV)trie->startstate,
	11445	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	11446	(UV)trie->wordcount,
	11447	(UV)trie->minlen,
	11448	(UV)trie->maxlen,
	11449	(UV)TRIE_CHARCOUNT(trie),
	11450	(UV)trie->uniquecharcount
	11451	)
	11452	);
	11453	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	11454	int i;
	11455	int rangestart = -1;
	11456	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	11457	sv_catpvs(sv, "[");
	11458	for (i = 0; i <= 256; i++) {
	11459	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	11460	if (rangestart == -1)
	11461	rangestart = i;
	11462	} else if (rangestart != -1) {
	11463	if (i <= rangestart + 3)
	11464	for (; rangestart < i; rangestart++)
	11465	put_byte(sv, rangestart);
	11466	else {
	11467	put_byte(sv, rangestart);
	11468	sv_catpvs(sv, "-");
	11469	put_byte(sv, i - 1);
	11470	}
	11471	rangestart = -1;
	11472	}
	11473	}
	11474	sv_catpvs(sv, "]");
	11475	}
	11476
	11477	} else if (k == CURLY) {
	11478	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	11479	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	11480	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	11481	}
	11482	else if (k == WHILEM && o->flags) /* Ordinal/of */
	11483	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	11484	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	11485	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	11486	if ( RXp_PAREN_NAMES(prog) ) {
	11487	if ( k != REF \|\| (OP(o) < NREF)) {
	11488	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	11489	SV **name= av_fetch(list, ARG(o), 0 );
	11490	if (name)
	11491	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11492	}
	11493	else {
	11494	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	11495	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	11496	I32 nums=(I32)SvPVX(sv_dat);
	11497	SV **name= av_fetch(list, nums[0], 0 );
	11498	I32 n;
	11499	if (name) {
	11500	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	11501	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	11502	(n ? "," : ""), (IV)nums[n]);
	11503	}
	11504	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11505	}
	11506	}
	11507	}
	11508	} else if (k == GOSUB)
	11509	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	11510	else if (k == VERB) {
	11511	if (!o->flags)
	11512	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	11513	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	11514	} else if (k == LOGICAL)
	11515	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	11516	else if (k == FOLDCHAR)
	11517	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
	11518	else if (k == ANYOF) {
	11519	int i, rangestart = -1;
	11520	const U8 flags = ANYOF_FLAGS(o);
	11521	int do_sep = 0;
	11522
	11523	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	11524	static const char * const anyofs[] = {
	11525	"\\w",
	11526	"\\W",
	11527	"\\s",
	11528	"\\S",
	11529	"\\d",
	11530	"\\D",
	11531	"[:alnum:]",
	11532	"[:^alnum:]",
	11533	"[:alpha:]",
	11534	"[:^alpha:]",
	11535	"[:ascii:]",
	11536	"[:^ascii:]",
	11537	"[:cntrl:]",
	11538	"[:^cntrl:]",
	11539	"[:graph:]",
	11540	"[:^graph:]",
	11541	"[:lower:]",
	11542	"[:^lower:]",
	11543	"[:print:]",
	11544	"[:^print:]",
	11545	"[:punct:]",
	11546	"[:^punct:]",
	11547	"[:upper:]",
	11548	"[:^upper:]",
	11549	"[:xdigit:]",
	11550	"[:^xdigit:]",
	11551	"[:space:]",
	11552	"[:^space:]",
	11553	"[:blank:]",
	11554	"[:^blank:]"
	11555	};
	11556
	11557	if (flags & ANYOF_LOCALE)
	11558	sv_catpvs(sv, "{loc}");
	11559	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	11560	sv_catpvs(sv, "{i}");
	11561	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	11562	if (flags & ANYOF_INVERT)
	11563	sv_catpvs(sv, "^");
	11564
	11565	/* output what the standard cp 0-255 bitmap matches */
	11566	for (i = 0; i <= 256; i++) {
	11567	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	11568	if (rangestart == -1)
	11569	rangestart = i;
	11570	} else if (rangestart != -1) {
	11571	if (i <= rangestart + 3)
	11572	for (; rangestart < i; rangestart++)
	11573	put_byte(sv, rangestart);
	11574	else {
	11575	put_byte(sv, rangestart);
	11576	sv_catpvs(sv, "-");
	11577	put_byte(sv, i - 1);
	11578	}
	11579	do_sep = 1;
	11580	rangestart = -1;
	11581	}
	11582	}
	11583
	11584	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11585	/* output any special charclass tests (used entirely under use locale) */
	11586	if (ANYOF_CLASS_TEST_ANY_SET(o))
	11587	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	11588	if (ANYOF_CLASS_TEST(o,i)) {
	11589	sv_catpv(sv, anyofs[i]);
	11590	do_sep = 1;
	11591	}
	11592
	11593	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11594
	11595	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	11596	sv_catpvs(sv, "{non-utf8-latin1-all}");
	11597	}
	11598
	11599	/* output information about the unicode matching */
	11600	if (flags & ANYOF_UNICODE_ALL)
	11601	sv_catpvs(sv, "{unicode_all}");
	11602	else if (ANYOF_NONBITMAP(o))
	11603	sv_catpvs(sv, "{unicode}");
	11604	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	11605	sv_catpvs(sv, "{outside bitmap}");
	11606
	11607	if (ANYOF_NONBITMAP(o)) {
	11608	SV *lv;
	11609	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	11610
	11611	if (lv) {
	11612	if (sw) {
	11613	U8 s[UTF8_MAXBYTES_CASE+1];
	11614
	11615	for (i = 0; i <= 256; i++) { /* just the first 256 */
	11616	uvchr_to_utf8(s, i);
	11617
	11618	if (i < 256 && swash_fetch(sw, s, TRUE)) {
	11619	if (rangestart == -1)
	11620	rangestart = i;
	11621	} else if (rangestart != -1) {
	11622	if (i <= rangestart + 3)
	11623	for (; rangestart < i; rangestart++) {
	11624	const U8 * const e = uvchr_to_utf8(s,rangestart);
	11625	U8 *p;
	11626	for(p = s; p < e; p++)
	11627	put_byte(sv, *p);
	11628	}
	11629	else {
	11630	const U8 *e = uvchr_to_utf8(s,rangestart);
	11631	U8 *p;
	11632	for (p = s; p < e; p++)
	11633	put_byte(sv, *p);
	11634	sv_catpvs(sv, "-");
	11635	e = uvchr_to_utf8(s, i-1);
	11636	for (p = s; p < e; p++)
	11637	put_byte(sv, *p);
	11638	}
	11639	rangestart = -1;
	11640	}
	11641	}
	11642
	11643	sv_catpvs(sv, "..."); /* et cetera */
	11644	}
	11645
	11646	{
	11647	char *s = savesvpv(lv);
	11648	char * const origs = s;
	11649
	11650	while (s && s != '\n')
	11651	s++;
	11652
	11653	if (*s == '\n') {
	11654	const char * const t = ++s;
	11655
	11656	while (*s) {
	11657	if (*s == '\n')
	11658	*s = ' ';
	11659	s++;
	11660	}
	11661	if (s[-1] == ' ')
	11662	s[-1] = 0;
	11663
	11664	sv_catpv(sv, t);
	11665	}
	11666
	11667	Safefree(origs);
	11668	}
	11669	}
	11670	}
	11671
	11672	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	11673	}
	11674	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	11675	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	11676	#else
	11677	PERL_UNUSED_CONTEXT;
	11678	PERL_UNUSED_ARG(sv);
	11679	PERL_UNUSED_ARG(o);
	11680	PERL_UNUSED_ARG(prog);
	11681	#endif /* DEBUGGING */
	11682	}
	11683
	11684	SV *
	11685	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	11686	{ /* Assume that RE_INTUIT is set */
	11687	dVAR;
	11688	struct regexp const prog = (struct regexp )SvANY(r);
	11689	GET_RE_DEBUG_FLAGS_DECL;
	11690
	11691	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	11692	PERL_UNUSED_CONTEXT;
	11693
	11694	DEBUG_COMPILE_r(
	11695	{
	11696	const char * const s = SvPV_nolen_const(prog->check_substr
	11697	? prog->check_substr : prog->check_utf8);
	11698
	11699	if (!PL_colorset) reginitcolors();
	11700	PerlIO_printf(Perl_debug_log,
	11701	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	11702	PL_colors[4],
	11703	prog->check_substr ? "" : "utf8 ",
	11704	PL_colors[5],PL_colors[0],
	11705	s,
	11706	PL_colors[1],
	11707	(strlen(s) > 60 ? "..." : ""));
	11708	} );
	11709
	11710	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	11711	}
	11712
	11713	/*
	11714	pregfree()
	11715
	11716	handles refcounting and freeing the perl core regexp structure. When
	11717	it is necessary to actually free the structure the first thing it
	11718	does is call the 'free' method of the regexp_engine associated to
	11719	the regexp, allowing the handling of the void *pprivate; member
	11720	first. (This routine is not overridable by extensions, which is why
	11721	the extensions free is called first.)
	11722
	11723	See regdupe and regdupe_internal if you change anything here.
	11724	*/
	11725	#ifndef PERL_IN_XSUB_RE
		/* Perl_pregfree - drop one reference to the regexp r.  The whole body is
		 * a single SvREFCNT_dec() call on r; nothing else is done here. */
	11726	void
	11727	Perl_pregfree(pTHX_ REGEXP *r)
	11728	{
	11729	SvREFCNT_dec(r);
	11730	}
	11731
	11732	void
	11733	Perl_pregfree2(pTHX_ REGEXP *rx)
	11734	{
	11735	dVAR;
	11736	struct regexp const r = (struct regexp )SvANY(rx);
	11737	GET_RE_DEBUG_FLAGS_DECL;
	11738
	11739	PERL_ARGS_ASSERT_PREGFREE2;
	11740
	11741	if (r->mother_re) {
	11742	ReREFCNT_dec(r->mother_re);
	11743	} else {
	11744	CALLREGFREE_PVT(rx); /* free the private data */
	11745	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	11746	}
	11747	if (r->substrs) {
	11748	SvREFCNT_dec(r->anchored_substr);
	11749	SvREFCNT_dec(r->anchored_utf8);
	11750	SvREFCNT_dec(r->float_substr);
	11751	SvREFCNT_dec(r->float_utf8);
	11752	Safefree(r->substrs);
	11753	}
	11754	RX_MATCH_COPY_FREE(rx);
	11755	#ifdef PERL_OLD_COPY_ON_WRITE
	11756	SvREFCNT_dec(r->saved_copy);
	11757	#endif
	11758	Safefree(r->offs);
	11759	}
	11760
	11761	/* reg_temp_copy()
	11762
	11763	This is a hacky workaround to the structural issue of match results
	11764	being stored in the regexp structure which is in turn stored in
	11765	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	11766	could be PL_curpm in multiple contexts, and could require multiple
	11767	result sets being associated with the pattern simultaneously, such
	11768	as when doing a recursive match with (??{$qr})
	11769
	11770	The solution is to make a lightweight copy of the regexp structure
	11771	when a qr// is returned from the code executed by (??{$qr}). This
	11772	lightweight copy doesn't actually own any of its data except for
	11773	the startp/endp and the actual regexp structure itself.
	11774
	11775	*/
	11776
	11777
	11778	REGEXP *
	11779	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	11780	{
	11781	struct regexp *ret;
	11782	struct regexp const r = (struct regexp )SvANY(rx);
	11783	register const I32 npar = r->nparens+1;
	11784
	11785	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	11786
	11787	if (!ret_x)
	11788	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	11789	ret = (struct regexp *)SvANY(ret_x);
	11790
	11791	(void)ReREFCNT_inc(rx);
	11792	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	11793	by pointing directly at the buffer, but flagging that the allocated
	11794	space in the copy is zero. As we've just done a struct copy, it's now
	11795	a case of zero-ing that, rather than copying the current length. */
	11796	SvPV_set(ret_x, RX_WRAPPED(rx));
	11797	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	11798	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	11799	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	11800	SvLEN_set(ret_x, 0);
	11801	SvSTASH_set(ret_x, NULL);
	11802	SvMAGIC_set(ret_x, NULL);
	11803	Newx(ret->offs, npar, regexp_paren_pair);
	11804	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11805	if (r->substrs) {
	11806	Newx(ret->substrs, 1, struct reg_substr_data);
	11807	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	11808
	11809	SvREFCNT_inc_void(ret->anchored_substr);
	11810	SvREFCNT_inc_void(ret->anchored_utf8);
	11811	SvREFCNT_inc_void(ret->float_substr);
	11812	SvREFCNT_inc_void(ret->float_utf8);
	11813
	11814	/* check_substr and check_utf8, if non-NULL, point to either their
	11815	anchored or float namesakes, and don't hold a second reference. */
	11816	}
	11817	RX_MATCH_COPIED_off(ret_x);
	11818	#ifdef PERL_OLD_COPY_ON_WRITE
	11819	ret->saved_copy = NULL;
	11820	#endif
	11821	ret->mother_re = rx;
	11822
	11823	return ret_x;
	11824	}
	11825	#endif
	11826
	11827	/* regfree_internal()
	11828
	11829	Free the private data in a regexp. This is overloadable by
	11830	extensions. Perl takes care of the regexp structure in pregfree(),
	11831	this covers the *pprivate pointer which technically perl doesn't
	11832	know about, however of course we have to handle the
	11833	regexp_internal structure when no extension is in use.
	11834
	11835	Note this is called before freeing anything in the regexp
	11836	structure.
	11837	*/
	11838
	11839	void
	11840	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	11841	{
	11842	dVAR;
	11843	struct regexp const r = (struct regexp )SvANY(rx);
	11844	RXi_GET_DECL(r,ri);
	11845	GET_RE_DEBUG_FLAGS_DECL;
	11846
	11847	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	11848
	11849	DEBUG_COMPILE_r({
	11850	if (!PL_colorset)
	11851	reginitcolors();
	11852	{
	11853	SV *dsv= sv_newmortal();
	11854	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	11855	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	11856	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	11857	PL_colors[4],PL_colors[5],s);
	11858	}
	11859	});
	11860	#ifdef RE_TRACK_PATTERN_OFFSETS
	11861	if (ri->u.offsets)
	11862	Safefree(ri->u.offsets); /* 20010421 MJD */
	11863	#endif
	11864	if (ri->data) {
	11865	int n = ri->data->count;
	11866	PAD* new_comppad = NULL;
	11867	PAD* old_comppad;
	11868	PADOFFSET refcnt;
	11869
	11870	while (--n >= 0) {
	11871	/* If you add a ->what type here, update the comment in regcomp.h */
	11872	switch (ri->data->what[n]) {
	11873	case 'a':
	11874	case 's':
	11875	case 'S':
	11876	case 'u':
	11877	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	11878	break;
	11879	case 'f':
	11880	Safefree(ri->data->data[n]);
	11881	break;
	11882	case 'p':
	11883	new_comppad = MUTABLE_AV(ri->data->data[n]);
	11884	break;
	11885	case 'o':
	11886	if (new_comppad == NULL)
	11887	Perl_croak(aTHX_ "panic: pregfree comppad");
	11888	PAD_SAVE_LOCAL(old_comppad,
	11889	/* Watch out for global destruction's random ordering. */
	11890	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	11891	);
	11892	OP_REFCNT_LOCK;
	11893	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	11894	OP_REFCNT_UNLOCK;
	11895	if (!refcnt)
	11896	op_free((OP_4tree*)ri->data->data[n]);
	11897
	11898	PAD_RESTORE_LOCAL(old_comppad);
	11899	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	11900	new_comppad = NULL;
	11901	break;
	11902	case 'n':
	11903	break;
	11904	case 'T':
	11905	{ /* Aho Corasick add-on structure for a trie node.
	11906	Used in stclass optimization only */
	11907	U32 refcount;
	11908	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	11909	OP_REFCNT_LOCK;
	11910	refcount = --aho->refcount;
	11911	OP_REFCNT_UNLOCK;
	11912	if ( !refcount ) {
	11913	PerlMemShared_free(aho->states);
	11914	PerlMemShared_free(aho->fail);
	11915	/* do this last!!!! */
	11916	PerlMemShared_free(ri->data->data[n]);
	11917	PerlMemShared_free(ri->regstclass);
	11918	}
	11919	}
	11920	break;
	11921	case 't':
	11922	{
	11923	/* trie structure. */
	11924	U32 refcount;
	11925	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	11926	OP_REFCNT_LOCK;
	11927	refcount = --trie->refcount;
	11928	OP_REFCNT_UNLOCK;
	11929	if ( !refcount ) {
	11930	PerlMemShared_free(trie->charmap);
	11931	PerlMemShared_free(trie->states);
	11932	PerlMemShared_free(trie->trans);
	11933	if (trie->bitmap)
	11934	PerlMemShared_free(trie->bitmap);
	11935	if (trie->jump)
	11936	PerlMemShared_free(trie->jump);
	11937	PerlMemShared_free(trie->wordinfo);
	11938	/* do this last!!!! */
	11939	PerlMemShared_free(ri->data->data[n]);
	11940	}
	11941	}
	11942	break;
	11943	default:
	11944	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	11945	}
	11946	}
	11947	Safefree(ri->data->what);
	11948	Safefree(ri->data);
	11949	}
	11950
	11951	Safefree(ri);
	11952	}
	11953
	/* Duplication helpers.  The first two wrap sv_dup_inc() for AVs and HVs;
	 * SAVEPVN is a NULL-tolerant savepvn().  Arguments are parenthesized so
	 * that the cast applies to the whole argument expression. */
	#define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
	#define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
	#define SAVEPVN(p,n)	((p) ? savepvn((p),(n)) : NULL)
	11957
	11958	/*
	11959	re_dup - duplicate a regexp.
	11960
	11961	This routine is expected to clone a given regexp structure. It is only
	11962	compiled under USE_ITHREADS.
	11963
	11964	After all of the core data stored in struct regexp is duplicated
	11965	the regexp_engine.dupe method is used to copy any private data
	11966	stored in the *pprivate pointer. This allows extensions to handle
	11967	any duplication it needs to do.
	11968
	11969	See pregfree() and regfree_internal() if you change anything here.
	11970	*/
	11971	#if defined(USE_ITHREADS)
	11972	#ifndef PERL_IN_XSUB_RE
	11973	void
	11974	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	11975	{
	11976	dVAR;
	11977	I32 npar;
	11978	const struct regexp r = (const struct regexp )SvANY(sstr);
	11979	struct regexp ret = (struct regexp )SvANY(dstr);
	11980
	11981	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	11982
	11983	npar = r->nparens+1;
	11984	Newx(ret->offs, npar, regexp_paren_pair);
	11985	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11986	if(ret->swap) {
	11987	/* no need to copy these */
	11988	Newx(ret->swap, npar, regexp_paren_pair);
	11989	}
	11990
	11991	if (ret->substrs) {
	11992	/* Do it this way to avoid reading from *r after the StructCopy().
	11993	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	11994	cache, it doesn't matter. */
	11995	const bool anchored = r->check_substr
	11996	? r->check_substr == r->anchored_substr
	11997	: r->check_utf8 == r->anchored_utf8;
	11998	Newx(ret->substrs, 1, struct reg_substr_data);
	11999	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	12000
	12001	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	12002	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	12003	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	12004	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	12005
	12006	/* check_substr and check_utf8, if non-NULL, point to either their
	12007	anchored or float namesakes, and don't hold a second reference. */
	12008
	12009	if (ret->check_substr) {
	12010	if (anchored) {
	12011	assert(r->check_utf8 == r->anchored_utf8);
	12012	ret->check_substr = ret->anchored_substr;
	12013	ret->check_utf8 = ret->anchored_utf8;
	12014	} else {
	12015	assert(r->check_substr == r->float_substr);
	12016	assert(r->check_utf8 == r->float_utf8);
	12017	ret->check_substr = ret->float_substr;
	12018	ret->check_utf8 = ret->float_utf8;
	12019	}
	12020	} else if (ret->check_utf8) {
	12021	if (anchored) {
	12022	ret->check_utf8 = ret->anchored_utf8;
	12023	} else {
	12024	ret->check_utf8 = ret->float_utf8;
	12025	}
	12026	}
	12027	}
	12028
	12029	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	12030
	12031	if (ret->pprivate)
	12032	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	12033
	12034	if (RX_MATCH_COPIED(dstr))
	12035	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	12036	else
	12037	ret->subbeg = NULL;
	12038	#ifdef PERL_OLD_COPY_ON_WRITE
	12039	ret->saved_copy = NULL;
	12040	#endif
	12041
	12042	if (ret->mother_re) {
	12043	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	12044	/* Our storage points directly to our mother regexp, but that's
	12045	1: a buffer in a different thread
	12046	2: something we no longer hold a reference on
	12047	so we need to copy it locally. */
	12048	/* Note we need to sue SvCUR() on our mother_re, because it, in
	12049	turn, may well be pointing to its own mother_re. */
	12050	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	12051	SvCUR(ret->mother_re)+1));
	12052	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	12053	}
	12054	ret->mother_re = NULL;
	12055	}
	12056	ret->gofs = 0;
	12057	}
	12058	#endif /* PERL_IN_XSUB_RE */
	12059
	12060	/*
	12061	regdupe_internal()
	12062
	12063	This is the internal complement to regdupe() which is used to copy
	12064	the structure pointed to by the *pprivate pointer in the regexp.
	12065	This is the core version of the extension overridable cloning hook.
	12066	The regexp structure being duplicated will be copied by perl prior
	12067	to this and will be provided as the regexp *r argument, however
	12068	with the /old/ structures pprivate pointer value. Thus this routine
	12069	may override any copying normally done by perl.
	12070
	12071	It returns a pointer to the new regexp_internal structure.
	12072	*/
	12073
	12074	void *
	12075	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	12076	{
	12077	dVAR;
	12078	struct regexp const r = (struct regexp )SvANY(rx);
	12079	regexp_internal *reti;
	12080	int len;
	12081	RXi_GET_DECL(r,ri);
	12082
	12083	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	12084
	12085	len = ProgLen(ri);
	12086
	12087	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	12088	Copy(ri->program, reti->program, len+1, regnode);
	12089
	12090
	12091	reti->regstclass = NULL;
	12092
	12093	if (ri->data) {
	12094	struct reg_data *d;
	12095	const int count = ri->data->count;
	12096	int i;
	12097
	12098	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	12099	char, struct reg_data);
	12100	Newx(d->what, count, U8);
	12101
	12102	d->count = count;
	12103	for (i = 0; i < count; i++) {
	12104	d->what[i] = ri->data->what[i];
	12105	switch (d->what[i]) {
	12106	/* legal options are one of: sSfpontTua
	12107	see also regcomp.h and pregfree() */
	12108	case 'a': /* actually an AV, but the dup function is identical. */
	12109	case 's':
	12110	case 'S':
	12111	case 'p': /* actually an AV, but the dup function is identical. */
	12112	case 'u': /* actually an HV, but the dup function is identical. */
	12113	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	12114	break;
	12115	case 'f':
	12116	/* This is cheating. */
	12117	Newx(d->data[i], 1, struct regnode_charclass_class);
	12118	StructCopy(ri->data->data[i], d->data[i],
	12119	struct regnode_charclass_class);
	12120	reti->regstclass = (regnode*)d->data[i];
	12121	break;
	12122	case 'o':
	12123	/* Compiled op trees are readonly and in shared memory,
	12124	and can thus be shared without duplication. */
	12125	OP_REFCNT_LOCK;
	12126	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	12127	OP_REFCNT_UNLOCK;
	12128	break;
	12129	case 'T':
	12130	/* Trie stclasses are readonly and can thus be shared
	12131	* without duplication. We free the stclass in pregfree
	12132	* when the corresponding reg_ac_data struct is freed.
	12133	*/
	12134	reti->regstclass= ri->regstclass;
	12135	/* Fall through */
	12136	case 't':
	12137	OP_REFCNT_LOCK;
	12138	((reg_trie_data*)ri->data->data[i])->refcount++;
	12139	OP_REFCNT_UNLOCK;
	12140	/* Fall through */
	12141	case 'n':
	12142	d->data[i] = ri->data->data[i];
	12143	break;
	12144	default:
	12145	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	12146	}
	12147	}
	12148
	12149	reti->data = d;
	12150	}
	12151	else
	12152	reti->data = NULL;
	12153
	12154	reti->name_list_idx = ri->name_list_idx;
	12155
	12156	#ifdef RE_TRACK_PATTERN_OFFSETS
	12157	if (ri->u.offsets) {
	12158	Newx(reti->u.offsets, 2*len+1, U32);
	12159	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	12160	}
	12161	#else
	12162	SetProgLen(reti,len);
	12163	#endif
	12164
	12165	return (void*)reti;
	12166	}
	12167
	12168	#endif /* USE_ITHREADS */
	12169
	12170	#ifndef PERL_IN_XSUB_RE
	12171
	12172	/*
	12173	- regnext - dig the "next" pointer out of a node
	12174	*/
	12175	regnode *
	12176	Perl_regnext(pTHX_ register regnode *p)
	12177	{
	12178	dVAR;
	12179	register I32 offset;
	12180
	12181	if (!p)
	12182	return(NULL);
	12183
	12184	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	12185	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	12186	}
	12187
	12188	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	12189	if (offset == 0)
	12190	return(NULL);
	12191
	12192	return(p+offset);
	12193	}
	12194	#endif
	12195
	12196	STATIC void
	12197	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	12198	{
	12199	va_list args;
	12200	STRLEN l1 = strlen(pat1);
	12201	STRLEN l2 = strlen(pat2);
	12202	char buf[512];
	12203	SV *msv;
	12204	const char *message;
	12205
	12206	PERL_ARGS_ASSERT_RE_CROAK2;
	12207
	12208	if (l1 > 510)
	12209	l1 = 510;
	12210	if (l1 + l2 > 510)
	12211	l2 = 510 - l1;
	12212	Copy(pat1, buf, l1 , char);
	12213	Copy(pat2, buf + l1, l2 , char);
	12214	buf[l1 + l2] = '\n';
	12215	buf[l1 + l2 + 1] = '\0';
	12216	#ifdef I_STDARG
	12217	/* ANSI variant takes additional second argument */
	12218	va_start(args, pat2);
	12219	#else
	12220	va_start(args);
	12221	#endif
	12222	msv = vmess(buf, &args);
	12223	va_end(args);
	12224	message = SvPV_const(msv,l1);
	12225	if (l1 > 512)
	12226	l1 = 512;
	12227	Copy(message, buf, l1 , char);
	12228	buf[l1-1] = '\0'; /* Overwrite \n */
	12229	Perl_croak(aTHX_ "%s", buf);
	12230	}
	12231
	12232	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	12233
	12234	#ifndef PERL_IN_XSUB_RE
	12235	void
	12236	Perl_save_re_context(pTHX)
	12237	{
	12238	dVAR;
	12239
	12240	struct re_save_state *state;
	12241
	12242	SAVEVPTR(PL_curcop);
	12243	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	12244
	12245	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	12246	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	12247	SSPUSHUV(SAVEt_RE_STATE);
	12248
	12249	Copy(&PL_reg_state, state, 1, struct re_save_state);
	12250
	12251	PL_reg_start_tmp = 0;
	12252	PL_reg_start_tmpl = 0;
	12253	PL_reg_oldsaved = NULL;
	12254	PL_reg_oldsavedlen = 0;
	12255	PL_reg_maxiter = 0;
	12256	PL_reg_leftiter = 0;
	12257	PL_reg_poscache = NULL;
	12258	PL_reg_poscache_size = 0;
	12259	#ifdef PERL_OLD_COPY_ON_WRITE
	12260	PL_nrs = NULL;
	12261	#endif
	12262
	12263	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	12264	if (PL_curpm) {
	12265	const REGEXP * const rx = PM_GETRE(PL_curpm);
	12266	if (rx) {
	12267	U32 i;
	12268	for (i = 1; i <= RX_NPARENS(rx); i++) {
	12269	char digits[TYPE_CHARS(long)];
	12270	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	12271	GV const const gvp
	12272	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	12273
	12274	if (gvp) {
	12275	GV * const gv = *gvp;
	12276	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	12277	save_scalar(gv);
	12278	}
	12279	}
	12280	}
	12281	}
	12282	}
	12283	#endif
	12284
		/* clear_re - drop one reference on the regexp passed as an untyped
		 * pointer.  r is cast to REGEXP * and handed to ReREFCNT_dec(). */
	12285	static void
	12286	clear_re(pTHX_ void *r)
	12287	{
	12288	dVAR;
	12289	ReREFCNT_dec((REGEXP *)r);
	12290	}
	12291
	12292	#ifdef DEBUGGING
	12293
	12294	STATIC void
	12295	S_put_byte(pTHX_ SV *sv, int c)
	12296	{
	12297	PERL_ARGS_ASSERT_PUT_BYTE;
	12298
	12299	/* Our definition of isPRINT() ignores locales, so only bytes that are
	12300	not part of UTF-8 are considered printable. I assume that the same
	12301	holds for UTF-EBCDIC.
	12302	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	12303	which Wikipedia says:
	12304
	12305	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	12306	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	12307	identical, to the ASCII delete (DEL) or rubout control character.
	12308	) So the old condition can be simplified to !isPRINT(c) */
	12309	if (!isPRINT(c)) {
	12310	if (c < 256) {
	12311	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	12312	}
	12313	else {
	12314	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	12315	}
	12316	}
	12317	else {
	12318	const char string = c;
	12319	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	12320	sv_catpvs(sv, "\\");
	12321	sv_catpvn(sv, &string, 1);
	12322	}
	12323	}
	12324
	12325
	/* CLEAR_OPTSTART: if a run of OPTIMIZED nodes is open (optstart is set),
	 * print how many nodes it covered under DEBUG_OPTIMISE_r and close it.
	 * The test sits inside the STMT_START/STMT_END wrapper so the macro
	 * expands to exactly one statement and cannot capture a following
	 * "else". */
	#define CLEAR_OPTSTART \
	    STMT_START { \
	        if (optstart) { \
	            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	            optstart=NULL; \
	        } \
	    } STMT_END

	/* DUMPUNTIL: close any open optimized run, then dump the nodes from b up
	 * to e one level deeper and continue from wherever that dump stopped.
	 * Wrapped as a single statement; callers supply the trailing ';'. */
	#define DUMPUNTIL(b,e) \
	    STMT_START { \
	        CLEAR_OPTSTART; \
	        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
	    } STMT_END
	12333
	12334	STATIC const regnode *
	12335	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	12336	const regnode last, const regnode plast,
	12337	SV* sv, I32 indent, U32 depth)
	12338	{
	12339	dVAR;
	12340	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	12341	register const regnode *next;
	12342	const regnode *optstart= NULL;
	12343
	12344	RXi_GET_DECL(r,ri);
	12345	GET_RE_DEBUG_FLAGS_DECL;
	12346
	12347	PERL_ARGS_ASSERT_DUMPUNTIL;
	12348
	12349	#ifdef DEBUG_DUMPUNTIL
	12350	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	12351	last ? last-start : 0,plast ? plast-start : 0);
	12352	#endif
	12353
	12354	if (plast && plast < last)
	12355	last= plast;
	12356
	12357	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	12358	/* While that wasn't END last time... */
	12359	NODE_ALIGN(node);
	12360	op = OP(node);
	12361	if (op == CLOSE \|\| op == WHILEM)
	12362	indent--;
	12363	next = regnext((regnode *)node);
	12364
	12365	/* Where, what. */
	12366	if (OP(node) == OPTIMIZED) {
	12367	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	12368	optstart = node;
	12369	else
	12370	goto after_print;
	12371	} else
	12372	CLEAR_OPTSTART;
	12373
	12374	regprop(r, sv, node);
	12375	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	12376	(int)(2*indent + 1), "", SvPVX_const(sv));
	12377
	12378	if (OP(node) != OPTIMIZED) {
	12379	if (next == NULL) /* Next ptr. */
	12380	PerlIO_printf(Perl_debug_log, " (0)");
	12381	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	12382	PerlIO_printf(Perl_debug_log, " (FAIL)");
	12383	else
	12384	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	12385	(void)PerlIO_putc(Perl_debug_log, '\n');
	12386	}
	12387
	12388	after_print:
	12389	if (PL_regkind[(U8)op] == BRANCHJ) {
	12390	assert(next);
	12391	{
	12392	register const regnode *nnode = (OP(next) == LONGJMP
	12393	? regnext((regnode *)next)
	12394	: next);
	12395	if (last && nnode > last)
	12396	nnode = last;
	12397	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	12398	}
	12399	}
	12400	else if (PL_regkind[(U8)op] == BRANCH) {
	12401	assert(next);
	12402	DUMPUNTIL(NEXTOPER(node), next);
	12403	}
	12404	else if ( PL_regkind[(U8)op] == TRIE ) {
	12405	const regnode *this_trie = node;
	12406	const char op = OP(node);
	12407	const U32 n = ARG(node);
	12408	const reg_ac_data * const ac = op>=AHOCORASICK ?
	12409	(reg_ac_data *)ri->data->data[n] :
	12410	NULL;
	12411	const reg_trie_data * const trie =
	12412	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	12413	#ifdef DEBUGGING
	12414	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	12415	#endif
	12416	const regnode *nextbranch= NULL;
	12417	I32 word_idx;
	12418	sv_setpvs(sv, "");
	12419	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	12420	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	12421
	12422	PerlIO_printf(Perl_debug_log, "%*s%s ",
	12423	(int)(2*(indent+3)), "",
	12424	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	12425	PL_colors[0], PL_colors[1],
	12426	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	12427	PERL_PV_PRETTY_ELLIPSES \|
	12428	PERL_PV_PRETTY_LTGT
	12429	)
	12430	: "???"
	12431	);
	12432	if (trie->jump) {
	12433	U16 dist= trie->jump[word_idx+1];
	12434	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	12435	(UV)((dist ? this_trie + dist : next) - start));
	12436	if (dist) {
	12437	if (!nextbranch)
	12438	nextbranch= this_trie + trie->jump[0];
	12439	DUMPUNTIL(this_trie + dist, nextbranch);
	12440	}
	12441	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	12442	nextbranch= regnext((regnode *)nextbranch);
	12443	} else {
	12444	PerlIO_printf(Perl_debug_log, "\n");
	12445	}
	12446	}
	12447	if (last && next > last)
	12448	node= last;
	12449	else
	12450	node= next;
	12451	}
	12452	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	12453	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	12454	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	12455	}
	12456	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	12457	assert(next);
	12458	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	12459	}
	12460	else if ( op == PLUS \|\| op == STAR) {
	12461	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	12462	}
	12463	else if (PL_regkind[(U8)op] == ANYOF) {
	12464	/* arglen 1 + class block */
	12465	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	12466	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	12467	node = NEXTOPER(node);
	12468	}
	12469	else if (PL_regkind[(U8)op] == EXACT) {
	12470	/* Literal string, where present. */
	12471	node += NODE_SZ_STR(node) - 1;
	12472	node = NEXTOPER(node);
	12473	}
	12474	else {
	12475	node = NEXTOPER(node);
	12476	node += regarglen[(U8)op];
	12477	}
	12478	if (op == CURLYX \|\| op == OPEN)
	12479	indent++;
	12480	}
	12481	CLEAR_OPTSTART;
	12482	#ifdef DEBUG_DUMPUNTIL
	12483	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	12484	#endif
	12485	return node;
	12486	}
	12487
	12488	#endif /* DEBUGGING */
	12489
	12490	/*
	12491	* Local variables:
	12492	* c-indentation-style: bsd
	12493	* c-basic-offset: 4
	12494	* indent-tabs-mode: t
	12495	* End:
	12496	*
	12497	* ex: set ts=8 sts=4 sw=4 noet:
	12498	*/