perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89	#ifndef PERL_IN_XSUB_RE
	90	# include "charclass_invlists.h"
	91	#endif
	92
	93	#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
	94
	95	#ifdef op
	96	#undef op
	97	#endif /* op */
	98
	99	#ifdef MSDOS
	100	# if defined(BUGGY_MSC6)
	101	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	102	# pragma optimize("a",off)
	103	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	104	# pragma optimize("w",on )
	105	# endif /* BUGGY_MSC6 */
	106	#endif /* MSDOS */
	107
	108	#ifndef STATIC
	109	#define STATIC static
	110	#endif
	111
	112
	113	typedef struct RExC_state_t {
	114	U32 flags; /* RXf_* are we folding, multilining? */
	115	U32 pm_flags; /* PMf_* stuff from the calling PMOP */
	116	char precomp; / uncompiled string. */
	117	REGEXP rx_sv; / The SV that is the regexp. */
	118	regexp rx; / perl core regexp structure */
	119	regexp_internal rxi; / internal data for regexp object pprivate field */
	120	char start; / Start of input for compile */
	121	char end; / End of input for compile */
	122	char parse; / Input-scan pointer. */
	123	I32 whilem_seen; /* number of WHILEM in this expr */
	124	regnode emit_start; / Start of emitted-code area */
	125	regnode emit_bound; / First regnode outside of the allocated space */
	126	regnode emit; / Code-emit pointer; &regdummy = don't = compiling */
	127	I32 naughty; /* How bad is this pattern? */
	128	I32 sawback; /* Did we see \1, ...? */
	129	U32 seen;
	130	I32 size; /* Code size. */
	131	I32 npar; /* Capture buffer count, (OPEN). */
	132	I32 cpar; /* Capture buffer count, (CLOSE). */
	133	I32 nestroot; /* root parens we are in - used by accept */
	134	I32 extralen;
	135	I32 seen_zerolen;
	136	regnode *open_parens; / pointers to open parens */
	137	regnode *close_parens; / pointers to close parens */
	138	regnode opend; / END node in program */
	139	I32 utf8; /* whether the pattern is utf8 or not */
	140	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	141	/* XXX use this for future optimisation of case
	142	* where pattern must be upgraded to utf8. */
	143	I32 uni_semantics; /* If a d charset modifier should use unicode
	144	rules, even if the pattern is not in
	145	utf8 */
	146	HV paren_names; / Paren names */
	147
	148	regnode *recurse; / Recurse regops */
	149	I32 recurse_count; /* Number of recurse regops */
	150	I32 in_lookbehind;
	151	I32 contains_locale;
	152	I32 override_recoding;
	153	struct reg_code_block code_blocks; / positions of literal (?{})
	154	within pattern */
	155	int num_code_blocks; /* size of code_blocks[] */
	156	int code_index; /* next code_blocks[] slot */
	157	#if ADD_TO_REGEXEC
	158	char starttry; / -Dr: where regtry was called. */
	159	#define RExC_starttry (pRExC_state->starttry)
	160	#endif
	161	SV runtime_code_qr; / qr with the runtime code blocks */
	162	#ifdef DEBUGGING
	163	const char *lastparse;
	164	I32 lastnum;
	165	AV paren_name_list; / idx -> name */
	166	#define RExC_lastparse (pRExC_state->lastparse)
	167	#define RExC_lastnum (pRExC_state->lastnum)
	168	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	169	#endif
	170	} RExC_state_t;
	171
	172	#define RExC_flags (pRExC_state->flags)
	173	#define RExC_pm_flags (pRExC_state->pm_flags)
	174	#define RExC_precomp (pRExC_state->precomp)
	175	#define RExC_rx_sv (pRExC_state->rx_sv)
	176	#define RExC_rx (pRExC_state->rx)
	177	#define RExC_rxi (pRExC_state->rxi)
	178	#define RExC_start (pRExC_state->start)
	179	#define RExC_end (pRExC_state->end)
	180	#define RExC_parse (pRExC_state->parse)
	181	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	182	#ifdef RE_TRACK_PATTERN_OFFSETS
	183	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	184	#endif
	185	#define RExC_emit (pRExC_state->emit)
	186	#define RExC_emit_start (pRExC_state->emit_start)
	187	#define RExC_emit_bound (pRExC_state->emit_bound)
	188	#define RExC_naughty (pRExC_state->naughty)
	189	#define RExC_sawback (pRExC_state->sawback)
	190	#define RExC_seen (pRExC_state->seen)
	191	#define RExC_size (pRExC_state->size)
	192	#define RExC_npar (pRExC_state->npar)
	193	#define RExC_nestroot (pRExC_state->nestroot)
	194	#define RExC_extralen (pRExC_state->extralen)
	195	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	196	#define RExC_utf8 (pRExC_state->utf8)
	197	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	198	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	199	#define RExC_open_parens (pRExC_state->open_parens)
	200	#define RExC_close_parens (pRExC_state->close_parens)
	201	#define RExC_opend (pRExC_state->opend)
	202	#define RExC_paren_names (pRExC_state->paren_names)
	203	#define RExC_recurse (pRExC_state->recurse)
	204	#define RExC_recurse_count (pRExC_state->recurse_count)
	205	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	206	#define RExC_contains_locale (pRExC_state->contains_locale)
	207	#define RExC_override_recoding (pRExC_state->override_recoding)
	208
	209
	210	#define ISMULT1(c) ((c) == '*' \|\| (c) == '+' \|\| (c) == '?')
	211	#define ISMULT2(s) ((s) == '' \|\| (s) == '+' \|\| (s) == '?' \|\| \
	212	((*s) == '{' && regcurly(s)))
	213
	214	#ifdef SPSTART
	215	#undef SPSTART /* dratted cpp namespace... */
	216	#endif
	217	/*
	218	* Flags to be passed up and down.
	219	*/
	220	#define WORST 0 /* Worst case. */
	221	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	222
	223	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	224	* character, and if utf8, must be invariant. Note that this is not the same
	225	* thing as REGNODE_SIMPLE */
	226	#define SIMPLE 0x02
	227	#define SPSTART 0x04 /* Starts with * or +. */
	228	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	229	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	230
	231	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	232
	233	/* whether trie related optimizations are enabled */
	234	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	235	#define TRIE_STUDY_OPT
	236	#define FULL_TRIE_STUDY
	237	#define TRIE_STCLASS
	238	#endif
	239
	240
	241
	242	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	243	#define PBITVAL(paren) (1 << ((paren) & 7))
	244	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	245	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) \|= PBITVAL(paren)
	246	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	247
	248	/* If not already in utf8, do a longjmp back to the beginning */
	249	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	250	#define REQUIRE_UTF8 STMT_START { \
	251	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	252	} STMT_END
	253
	254	/* About scan_data_t.
	255
	256	During optimisation we recurse through the regexp program performing
	257	various inplace (keyhole style) optimisations. In addition study_chunk
	258	and scan_commit populate this data structure with information about
	259	what strings MUST appear in the pattern. We look for the longest
	260	string that must appear at a fixed location, and we look for the
	261	longest string that may appear at a floating location. So for instance
	262	in the pattern:
	263
	264	/FOO[xX]A.*B[xX]BAR/
	265
	266	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	267	strings (because they follow a .* construct). study_chunk will identify
	268	both FOO and BAR as being the longest fixed and floating strings respectively.
	269
	270	The strings can be composites, for instance
	271
	272	/(f)(o)(o)/
	273
	274	will result in a composite fixed substring 'foo'.
	275
	276	For each string some basic information is maintained:
	277
	278	- offset or min_offset
	279	This is the position the string must appear at, or not before.
	280	It also implicitly (when combined with minlenp) tells us how many
	281	characters must match before the string we are searching for.
	282	Likewise when combined with minlenp and the length of the string it
	283	tells us how many characters must appear after the string we have
	284	found.
	285
	286	- max_offset
	287	Only used for floating strings. This is the rightmost point that
	288	the string can appear at. If set to I32 max it indicates that the
	289	string can occur infinitely far to the right.
	290
	291	- minlenp
	292	A pointer to the minimum length of the pattern that the string
	293	was found inside. This is important as in the case of positive
	294	lookahead or positive lookbehind we can have multiple patterns
	295	involved. Consider
	296
	297	/(?=FOO).*F/
	298
	299	The minimum length of the pattern overall is 3, the minimum length
	300	of the lookahead part is 3, but the minimum length of the part that
	301	will actually match is 1. So 'FOO's minimum length is 3, but the
	302	minimum length for the F is 1. This is important as the minimum length
	303	is used to determine offsets in front of and behind the string being
	304	looked for. Since strings can be composites this is the length of the
	305	pattern at the time it was committed with a scan_commit. Note that
	306	the length is calculated by study_chunk, so that the minimum lengths
	307	are not known until the full pattern has been compiled, thus the
	308	pointer to the value.
	309
	310	- lookbehind
	311
	312	In the case of lookbehind the string being searched for can be
	313	offset past the start point of the final matching string.
	314	If this value was just blithely removed from the min_offset it would
	315	invalidate some of the calculations for how many chars must match
	316	before or after (as they are derived from min_offset and minlen and
	317	the length of the string being searched for).
	318	When the final pattern is compiled and the data is moved from the
	319	scan_data_t structure into the regexp structure the information
	320	about lookbehind is factored in, with the information that would
	321	have been lost precalculated in the end_shift field for the
	322	associated string.
	323
	324	The fields pos_min and pos_delta are used to store the minimum offset
	325	and the delta to the maximum offset at the current point in the pattern.
	326
	327	*/
	328
	329	typedef struct scan_data_t {
	330	/I32 len_min; unused /
	331	/I32 len_delta; unused /
	332	I32 pos_min;
	333	I32 pos_delta;
	334	SV *last_found;
	335	I32 last_end; /* min value, <0 unless valid. */
	336	I32 last_start_min;
	337	I32 last_start_max;
	338	SV *longest; / Either &l_fixed, or &l_float. */
	339	SV longest_fixed; / longest fixed string found in pattern */
	340	I32 offset_fixed; /* offset where it starts */
	341	I32 minlen_fixed; / pointer to the minlen relevant to the string */
	342	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	343	SV longest_float; / longest floating string found in pattern */
	344	I32 offset_float_min; /* earliest point in string it can appear */
	345	I32 offset_float_max; /* latest point in string it can appear */
	346	I32 minlen_float; / pointer to the minlen relevant to the string */
	347	I32 lookbehind_float; /* is the position of the string modified by LB */
	348	I32 flags;
	349	I32 whilem_c;
	350	I32 *last_closep;
	351	struct regnode_charclass_class *start_class;
	352	} scan_data_t;
	353
	354	/*
	355	* Forward declarations for pregcomp()'s friends.
	356	*/
	357
	358	static const scan_data_t zero_scan_data =
	359	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	360
	361	#define SF_BEFORE_EOL (SF_BEFORE_SEOL\|SF_BEFORE_MEOL)
	362	#define SF_BEFORE_SEOL 0x0001
	363	#define SF_BEFORE_MEOL 0x0002
	364	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL\|SF_FIX_BEFORE_MEOL)
	365	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL\|SF_FL_BEFORE_MEOL)
	366
	367	#ifdef NO_UNARY_PLUS
	368	# define SF_FIX_SHIFT_EOL (0+2)
	369	# define SF_FL_SHIFT_EOL (0+4)
	370	#else
	371	# define SF_FIX_SHIFT_EOL (+2)
	372	# define SF_FL_SHIFT_EOL (+4)
	373	#endif
	374
	375	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	376	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	377
	378	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	379	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	380	#define SF_IS_INF 0x0040
	381	#define SF_HAS_PAR 0x0080
	382	#define SF_IN_PAR 0x0100
	383	#define SF_HAS_EVAL 0x0200
	384	#define SCF_DO_SUBSTR 0x0400
	385	#define SCF_DO_STCLASS_AND 0x0800
	386	#define SCF_DO_STCLASS_OR 0x1000
	387	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND\|SCF_DO_STCLASS_OR)
	388	#define SCF_WHILEM_VISITED_POS 0x2000
	389
	390	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	391	#define SCF_SEEN_ACCEPT 0x8000
	392
	393	#define UTF cBOOL(RExC_utf8)
	394
	395	/* The enums for all these are ordered so things work out correctly */
	396	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	397	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	398	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	399	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	400	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	401	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	402	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	403
	404	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	405
	406	#define OOB_UNICODE 12345678
	407	#define OOB_NAMEDCLASS -1
	408
	409	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	410	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	411
	412
	413	/* length of regex to show in messages that don't mark a position within */
	414	#define RegexLengthToShowInErrorMessages 127
	415
	416	/*
	417	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	418	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	419	* op/pragma/warn/regcomp.
	420	*/
	421	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	422	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	423
	424	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	425
	426	/*
	427	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	428	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	429	* "...".
	430	*/
	431	#define _FAIL(code) STMT_START { \
	432	const char *ellipses = ""; \
	433	IV len = RExC_end - RExC_precomp; \
	434	\
	435	if (!SIZE_ONLY) \
	436	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	437	if (len > RegexLengthToShowInErrorMessages) { \
	438	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	439	len = RegexLengthToShowInErrorMessages - 10; \
	440	ellipses = "..."; \
	441	} \
	442	code; \
	443	} STMT_END
	444
	445	#define FAIL(msg) _FAIL( \
	446	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	447	msg, (int)len, RExC_precomp, ellipses))
	448
	449	#define FAIL2(msg,arg) _FAIL( \
	450	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	451	arg, (int)len, RExC_precomp, ellipses))
	452
	453	/*
	454	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	455	*/
	456	#define Simple_vFAIL(m) STMT_START { \
	457	const IV offset = RExC_parse - RExC_precomp; \
	458	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	459	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	460	} STMT_END
	461
	462	/*
	463	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	464	*/
	465	#define vFAIL(m) STMT_START { \
	466	if (!SIZE_ONLY) \
	467	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	468	Simple_vFAIL(m); \
	469	} STMT_END
	470
	471	/*
	472	* Like Simple_vFAIL(), but accepts two arguments.
	473	*/
	474	#define Simple_vFAIL2(m,a1) STMT_START { \
	475	const IV offset = RExC_parse - RExC_precomp; \
	476	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	477	(int)offset, RExC_precomp, RExC_precomp + offset); \
	478	} STMT_END
	479
	480	/*
	481	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	482	*/
	483	#define vFAIL2(m,a1) STMT_START { \
	484	if (!SIZE_ONLY) \
	485	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	486	Simple_vFAIL2(m, a1); \
	487	} STMT_END
	488
	489
	490	/*
	491	* Like Simple_vFAIL(), but accepts three arguments.
	492	*/
	493	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	494	const IV offset = RExC_parse - RExC_precomp; \
	495	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	496	(int)offset, RExC_precomp, RExC_precomp + offset); \
	497	} STMT_END
	498
	499	/*
	500	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	501	*/
	502	#define vFAIL3(m,a1,a2) STMT_START { \
	503	if (!SIZE_ONLY) \
	504	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	505	Simple_vFAIL3(m, a1, a2); \
	506	} STMT_END
	507
	508	/*
	509	* Like Simple_vFAIL(), but accepts four arguments.
	510	*/
	511	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	512	const IV offset = RExC_parse - RExC_precomp; \
	513	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	514	(int)offset, RExC_precomp, RExC_precomp + offset); \
	515	} STMT_END
	516
	517	#define ckWARNreg(loc,m) STMT_START { \
	518	const IV offset = loc - RExC_precomp; \
	519	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	520	(int)offset, RExC_precomp, RExC_precomp + offset); \
	521	} STMT_END
	522
	523	#define ckWARNregdep(loc,m) STMT_START { \
	524	const IV offset = loc - RExC_precomp; \
	525	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	526	m REPORT_LOCATION, \
	527	(int)offset, RExC_precomp, RExC_precomp + offset); \
	528	} STMT_END
	529
	530	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	531	const IV offset = loc - RExC_precomp; \
	532	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	533	m REPORT_LOCATION, \
	534	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	535	} STMT_END
	536
	537	#define ckWARN2reg(loc, m, a1) STMT_START { \
	538	const IV offset = loc - RExC_precomp; \
	539	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	540	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	541	} STMT_END
	542
	543	#define vWARN3(loc, m, a1, a2) STMT_START { \
	544	const IV offset = loc - RExC_precomp; \
	545	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	546	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	547	} STMT_END
	548
	549	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	550	const IV offset = loc - RExC_precomp; \
	551	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	552	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	553	} STMT_END
	554
	555	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	556	const IV offset = loc - RExC_precomp; \
	557	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	558	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	559	} STMT_END
	560
	561	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	562	const IV offset = loc - RExC_precomp; \
	563	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	564	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	565	} STMT_END
	566
	567	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	568	const IV offset = loc - RExC_precomp; \
	569	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	570	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	571	} STMT_END
	572
	573
	574	/* Allow for side effects in s */
	575	#define REGC(c,s) STMT_START { \
	576	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	577	} STMT_END
	578
	579	/* Macros for recording node offsets. 20001227 mjd@plover.com
	580	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	581	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	582	* Element 0 holds the number n.
	583	* Position is 1 indexed.
	584	*/
	585	#ifndef RE_TRACK_PATTERN_OFFSETS
	586	#define Set_Node_Offset_To_R(node,byte)
	587	#define Set_Node_Offset(node,byte)
	588	#define Set_Cur_Node_Offset
	589	#define Set_Node_Length_To_R(node,len)
	590	#define Set_Node_Length(node,len)
	591	#define Set_Node_Cur_Length(node)
	592	#define Node_Offset(n)
	593	#define Node_Length(n)
	594	#define Set_Node_Offset_Length(node,offset,len)
	595	#define ProgLen(ri) ri->u.proglen
	596	#define SetProgLen(ri,x) ri->u.proglen = x
	597	#else
	598	#define ProgLen(ri) ri->u.offsets[0]
	599	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	600	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	601	if (! SIZE_ONLY) { \
	602	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	603	__LINE__, (int)(node), (int)(byte))); \
	604	if((node) < 0) { \
	605	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	606	} else { \
	607	RExC_offsets[2*(node)-1] = (byte); \
	608	} \
	609	} \
	610	} STMT_END
	611
	612	#define Set_Node_Offset(node,byte) \
	613	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	614	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	615
	616	#define Set_Node_Length_To_R(node,len) STMT_START { \
	617	if (! SIZE_ONLY) { \
	618	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	619	__LINE__, (int)(node), (int)(len))); \
	620	if((node) < 0) { \
	621	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	622	} else { \
	623	RExC_offsets[2*(node)] = (len); \
	624	} \
	625	} \
	626	} STMT_END
	627
	628	#define Set_Node_Length(node,len) \
	629	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	630	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	631	#define Set_Node_Cur_Length(node) \
	632	Set_Node_Length(node, RExC_parse - parse_start)
	633
	634	/* Get offsets and lengths */
	635	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	636	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	637
	638	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	639	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	640	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	641	} STMT_END
	642	#endif
	643
	644	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	645	#define EXPERIMENTAL_INPLACESCAN
	646	#endif /PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS/
	647
	648	#define DEBUG_STUDYDATA(str,data,depth) \
	649	DEBUG_OPTIMISE_MORE_r(if(data){ \
	650	PerlIO_printf(Perl_debug_log, \
	651	"%*s" str "Pos:%"IVdf"/%"IVdf \
	652	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	653	(int)(depth)*2, "", \
	654	(IV)((data)->pos_min), \
	655	(IV)((data)->pos_delta), \
	656	(UV)((data)->flags), \
	657	(IV)((data)->whilem_c), \
	658	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	659	is_inf ? "INF " : "" \
	660	); \
	661	if ((data)->last_found) \
	662	PerlIO_printf(Perl_debug_log, \
	663	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	664	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	665	SvPVX_const((data)->last_found), \
	666	(IV)((data)->last_end), \
	667	(IV)((data)->last_start_min), \
	668	(IV)((data)->last_start_max), \
	669	((data)->longest && \
	670	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	671	SvPVX_const((data)->longest_fixed), \
	672	(IV)((data)->offset_fixed), \
	673	((data)->longest && \
	674	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	675	SvPVX_const((data)->longest_float), \
	676	(IV)((data)->offset_float_min), \
	677	(IV)((data)->offset_float_max) \
	678	); \
	679	PerlIO_printf(Perl_debug_log,"\n"); \
	680	});
	681
	682	static void clear_re(pTHX_ void *r);
	683
	684	/* Mark that we cannot extend a found fixed substring at this point.
	685	Update the longest found anchored substring and the longest found
	686	floating substrings if needed. */
	687
	688	STATIC void
	689	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data, I32 *minlenp, int is_inf)
	690	{
	691	const STRLEN l = CHR_SVLEN(data->last_found);
	692	const STRLEN old_l = CHR_SVLEN(*data->longest);
	693	GET_RE_DEBUG_FLAGS_DECL;
	694
	695	PERL_ARGS_ASSERT_SCAN_COMMIT;
	696
	697	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	698	SvSetMagicSV(*data->longest, data->last_found);
	699	if (*data->longest == data->longest_fixed) {
	700	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	701	if (data->flags & SF_BEFORE_EOL)
	702	data->flags
	703	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	704	else
	705	data->flags &= ~SF_FIX_BEFORE_EOL;
	706	data->minlen_fixed=minlenp;
	707	data->lookbehind_fixed=0;
	708	}
	709	else { /* data->longest == data->longest_float /
	710	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	711	data->offset_float_max = (l
	712	? data->last_start_max
	713	: data->pos_min + data->pos_delta);
	714	if (is_inf \|\| (U32)data->offset_float_max > (U32)I32_MAX)
	715	data->offset_float_max = I32_MAX;
	716	if (data->flags & SF_BEFORE_EOL)
	717	data->flags
	718	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	719	else
	720	data->flags &= ~SF_FL_BEFORE_EOL;
	721	data->minlen_float=minlenp;
	722	data->lookbehind_float=0;
	723	}
	724	}
	725	SvCUR_set(data->last_found, 0);
	726	{
	727	SV * const sv = data->last_found;
	728	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	729	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	730	if (mg)
	731	mg->mg_len = 0;
	732	}
	733	}
	734	data->last_end = -1;
	735	data->flags &= ~SF_BEFORE_EOL;
	736	DEBUG_STUDYDATA("commit: ",data,0);
	737	}
	738
	739	/* Can match anything (initialization) */
	740	STATIC void
	741	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	742	{
	743	PERL_ARGS_ASSERT_CL_ANYTHING;
	744
	745	ANYOF_BITMAP_SETALL(cl);
	746	cl->flags = ANYOF_CLASS\|ANYOF_EOS\|ANYOF_UNICODE_ALL
	747	\|ANYOF_LOC_NONBITMAP_FOLD\|ANYOF_NON_UTF8_LATIN1_ALL;
	748
	749	/* If any portion of the regex is to operate under locale rules,
	750	* initialization includes it. The reason this isn't done for all regexes
	751	* is that the optimizer was written under the assumption that locale was
	752	* all-or-nothing. Given the complexity and lack of documentation in the
	753	* optimizer, and that there are inadequate test cases for locale, so many
	754	* parts of it may not work properly, it is safest to avoid locale unless
	755	* necessary. */
	756	if (RExC_contains_locale) {
	757	ANYOF_CLASS_SETALL(cl); /* /l uses class */
	758	cl->flags \|= ANYOF_LOCALE;
	759	}
	760	else {
	761	ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	762	}
	763	}
	764
	765	/* Can match anything (initialization) */
	766	STATIC int
	767	S_cl_is_anything(const struct regnode_charclass_class *cl)
	768	{
	769	int value;
	770
	771	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	772
	773	for (value = 0; value <= ANYOF_MAX; value += 2)
	774	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	775	return 1;
	776	if (!(cl->flags & ANYOF_UNICODE_ALL))
	777	return 0;
	778	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	779	return 0;
	780	return 1;
	781	}
	782
	783	/* Can match anything (initialization) */
	784	STATIC void
	785	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	786	{
	787	PERL_ARGS_ASSERT_CL_INIT;
	788
	789	Zero(cl, 1, struct regnode_charclass_class);
	790	cl->type = ANYOF;
	791	cl_anything(pRExC_state, cl);
	792	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	793	}
	794
	795	/* These two functions currently do the exact same thing */
	796	#define cl_init_zero S_cl_init
	797
	798	/* 'AND' a given class with another one. Can create false positives. 'cl'
	799	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	800	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	801	STATIC void
	802	S_cl_and(struct regnode_charclass_class *cl,
	803	const struct regnode_charclass_class *and_with)
	804	{
	805	PERL_ARGS_ASSERT_CL_AND;
	806
	807	assert(and_with->type == ANYOF);
	808
	809	/* I (khw) am not sure all these restrictions are necessary XXX */
	810	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	811	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	812	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	813	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	814	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	815	int i;
	816
	817	if (and_with->flags & ANYOF_INVERT)
	818	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	819	cl->bitmap[i] &= ~and_with->bitmap[i];
	820	else
	821	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	822	cl->bitmap[i] &= and_with->bitmap[i];
	823	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	824
	825	if (and_with->flags & ANYOF_INVERT) {
	826
	827	/* Here, the and'ed node is inverted. Get the AND of the flags that
	828	* aren't affected by the inversion. Those that are affected are
	829	* handled individually below */
	830	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	831	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	832	cl->flags \|= affected_flags;
	833
	834	/* We currently don't know how to deal with things that aren't in the
	835	* bitmap, but we know that the intersection is no greater than what
	836	* is already in cl, so let there be false positives that get sorted
	837	* out after the synthetic start class succeeds, and the node is
	838	* matched for real. */
	839
	840	/* The inversion of these two flags indicate that the resulting
	841	* intersection doesn't have them */
	842	if (and_with->flags & ANYOF_UNICODE_ALL) {
	843	cl->flags &= ~ANYOF_UNICODE_ALL;
	844	}
	845	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	846	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	847	}
	848	}
	849	else { /* and'd node is not inverted */
	850	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	851
	852	if (! ANYOF_NONBITMAP(and_with)) {
	853
	854	/* Here 'and_with' doesn't match anything outside the bitmap
	855	* (except possibly ANYOF_UNICODE_ALL), which means the
	856	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	857	* which case we don't know what the intersection is, but it's no
	858	* greater than what cl already has, so can just leave it alone,
	859	* with possible false positives */
	860	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	861	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	862	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	863	}
	864	}
	865	else if (! ANYOF_NONBITMAP(cl)) {
	866
	867	/* Here, 'and_with' does match something outside the bitmap, and cl
	868	* doesn't have a list of things to match outside the bitmap. If
	869	* cl can match all code points above 255, the intersection will
	870	* be those above-255 code points that 'and_with' matches. If cl
	871	* can't match all Unicode code points, it means that it can't
	872	* match anything outside the bitmap (since the 'if' that got us
	873	* into this block tested for that), so we leave the bitmap empty.
	874	*/
	875	if (cl->flags & ANYOF_UNICODE_ALL) {
	876	ARG_SET(cl, ARG(and_with));
	877
	878	/* and_with's ARG may match things that don't require UTF8.
	879	* And now cl's will too, in spite of this being an 'and'. See
	880	* the comments below about the kludge */
	881	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	882	}
	883	}
	884	else {
	885	/* Here, both 'and_with' and cl match something outside the
	886	* bitmap. Currently we do not do the intersection, so just match
	887	* whatever cl had at the beginning. */
	888	}
	889
	890
	891	/* Take the intersection of the two sets of flags. However, the
	892	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	893	* kludge around the fact that this flag is not treated like the others
	894	* which are initialized in cl_anything(). The way the optimizer works
	895	* is that the synthetic start class (SSC) is initialized to match
	896	* anything, and then the first time a real node is encountered, its
	897	* values are AND'd with the SSC's with the result being the values of
	898	* the real node. However, there are paths through the optimizer where
	899	* the AND never gets called, so those initialized bits are set
	900	* inappropriately, which is not usually a big deal, as they just cause
	901	* false positives in the SSC, which will just mean a probably
	902	* imperceptible slow down in execution. However this bit has a
	903	* higher false positive consequence in that it can cause utf8.pm,
	904	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	905	* bigger slowdown and also causes significant extra memory to be used.
	906	* In order to prevent this, the code now takes a different tack. The
	907	* bit isn't set unless some part of the regular expression needs it,
	908	* but once set it won't get cleared. This means that these extra
	909	* modules won't get loaded unless there was some path through the
	910	* pattern that would have required them anyway, and so any false
	911	* positives that occur by not ANDing them out when they could be
	912	* aren't as severe as they would be if we treated this bit like all
	913	* the others */
	914	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	915	& ANYOF_NONBITMAP_NON_UTF8;
	916	cl->flags &= and_with->flags;
	917	cl->flags \|= outside_bitmap_but_not_utf8;
	918	}
	919	}
	920
	921	/* 'OR' a given class with another one. Can create false positives. 'cl'
	922	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	923	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	924	STATIC void
	925	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	926	{
	927	PERL_ARGS_ASSERT_CL_OR;
	928
	929	if (or_with->flags & ANYOF_INVERT) {
	930
	931	/* Here, the or'd node is to be inverted. This means we take the
	932	* complement of everything not in the bitmap, but currently we don't
	933	* know what that is, so give up and match anything */
	934	if (ANYOF_NONBITMAP(or_with)) {
	935	cl_anything(pRExC_state, cl);
	936	}
	937	/* We do not use
	938	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	939	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	940	* which is wasteful if CL2 is small, but we ignore CL2:
	941	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	942	* XXXX Can we handle case-fold? Unclear:
	943	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	944	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	945	*/
	946	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	947	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	948	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	949	int i;
	950
	951	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	952	cl->bitmap[i] \|= ~or_with->bitmap[i];
	953	} /* XXXX: logic is complicated otherwise */
	954	else {
	955	cl_anything(pRExC_state, cl);
	956	}
	957
	958	/* And, we can just take the union of the flags that aren't affected
	959	* by the inversion */
	960	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	961
	962	/* For the remaining flags:
	963	ANYOF_UNICODE_ALL and inverted means to not match anything above
	964	255, which means that the union with cl should just be
	965	what cl has in it, so can ignore this flag
	966	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	967	is 127-255 to match them, but then invert that, so the
	968	union with cl should just be what cl has in it, so can
	969	ignore this flag
	970	*/
	971	} else { /* 'or_with' is not inverted */
	972	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	973	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	974	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	975	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	976	int i;
	977
	978	/* OR char bitmap and class bitmap separately */
	979	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	980	cl->bitmap[i] \|= or_with->bitmap[i];
	981	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	982	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	983	cl->classflags[i] \|= or_with->classflags[i];
	984	cl->flags \|= ANYOF_CLASS;
	985	}
	986	}
	987	else { /* XXXX: logic is complicated, leave it along for a moment. */
	988	cl_anything(pRExC_state, cl);
	989	}
	990
	991	if (ANYOF_NONBITMAP(or_with)) {
	992
	993	/* Use the added node's outside-the-bit-map match if there isn't a
	994	* conflict. If there is a conflict (both nodes match something
	995	* outside the bitmap, but what they match outside is not the same
	996	* pointer, and hence not easily compared until XXX we extend
	997	* inversion lists this far), give up and allow the start class to
	998	* match everything outside the bitmap. If that stuff is all above
	999	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	1000	if (! ANYOF_NONBITMAP(cl)) {
	1001	ARG_SET(cl, ARG(or_with));
	1002	}
	1003	else if (ARG(cl) != ARG(or_with)) {
	1004
	1005	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	1006	cl_anything(pRExC_state, cl);
	1007	}
	1008	else {
	1009	cl->flags \|= ANYOF_UNICODE_ALL;
	1010	}
	1011	}
	1012	}
	1013
	1014	/* Take the union */
	1015	cl->flags \|= or_with->flags;
	1016	}
	1017	}
	1018
	1019	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	1020	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	1021	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	1022	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1023
	1024
	1025	#ifdef DEBUGGING
	1026	/*
	1027	dump_trie(trie,widecharmap,revcharmap)
	1028	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1029	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1030
	1031	These routines dump out a trie in a somewhat readable format.
	1032	The _interim_ variants are used for debugging the interim
	1033	tables that are used to generate the final compressed
	1034	representation which is what dump_trie expects.
	1035
	1036	Part of the reason for their existence is to provide a form
	1037	of documentation as to how the different representations function.
	1038
	1039	*/
	1040
	1041	/*
	1042	Dumps the final compressed table form of the trie to Perl_debug_log.
	1043	Used for debugging make_trie().
	1044	*/
	1045
	1046	STATIC void
	1047	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1048	AV *revcharmap, U32 depth)
	1049	{
	1050	U32 state;
	1051	SV *sv=sv_newmortal();
	1052	int colwidth= widecharmap ? 6 : 4;
	1053	U16 word;
	1054	GET_RE_DEBUG_FLAGS_DECL;
	1055
	1056	PERL_ARGS_ASSERT_DUMP_TRIE;
	1057
	1058	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1059	(int)depth * 2 + 2,"",
	1060	"Match","Base","Ofs" );
	1061
	1062	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1063	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1064	if ( tmp ) {
	1065	PerlIO_printf( Perl_debug_log, "%*s",
	1066	colwidth,
	1067	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1068	PL_colors[0], PL_colors[1],
	1069	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1070	PERL_PV_ESCAPE_FIRSTCHAR
	1071	)
	1072	);
	1073	}
	1074	}
	1075	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1076	(int)depth * 2 + 2,"");
	1077
	1078	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1079	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1080	PerlIO_printf( Perl_debug_log, "\n");
	1081
	1082	for( state = 1 ; state < trie->statecount ; state++ ) {
	1083	const U32 base = trie->states[ state ].trans.base;
	1084
	1085	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1086
	1087	if ( trie->states[ state ].wordnum ) {
	1088	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1089	} else {
	1090	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1091	}
	1092
	1093	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1094
	1095	if ( base ) {
	1096	U32 ofs = 0;
	1097
	1098	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1099	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1100	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1101	ofs++;
	1102
	1103	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1104
	1105	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1106	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1107	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1108	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1109	{
	1110	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1111	colwidth,
	1112	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1113	} else {
	1114	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1115	}
	1116	}
	1117
	1118	PerlIO_printf( Perl_debug_log, "]");
	1119
	1120	}
	1121	PerlIO_printf( Perl_debug_log, "\n" );
	1122	}
	1123	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1124	for (word=1; word <= trie->wordcount; word++) {
	1125	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1126	(int)word, (int)(trie->wordinfo[word].prev),
	1127	(int)(trie->wordinfo[word].len));
	1128	}
	1129	PerlIO_printf(Perl_debug_log, "\n" );
	1130	}
	1131	/*
	1132	Dumps a fully constructed but uncompressed trie in list form.
	1133	List tries normally only are used for construction when the number of
	1134	possible chars (trie->uniquecharcount) is very high.
	1135	Used for debugging make_trie().
	1136	*/
	1137	STATIC void
	1138	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1139	HV widecharmap, AV revcharmap, U32 next_alloc,
	1140	U32 depth)
	1141	{
	1142	U32 state;
	1143	SV *sv=sv_newmortal();
	1144	int colwidth= widecharmap ? 6 : 4;
	1145	GET_RE_DEBUG_FLAGS_DECL;
	1146
	1147	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1148
	1149	/* print out the table precompression. */
	1150	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1151	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1152	"------:-----+-----------------\n" );
	1153
	1154	for( state=1 ; state < next_alloc ; state ++ ) {
	1155	U16 charid;
	1156
	1157	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1158	(int)depth * 2 + 2,"", (UV)state );
	1159	if ( ! trie->states[ state ].wordnum ) {
	1160	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1161	} else {
	1162	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1163	trie->states[ state ].wordnum
	1164	);
	1165	}
	1166	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1167	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1168	if ( tmp ) {
	1169	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1170	colwidth,
	1171	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1172	PL_colors[0], PL_colors[1],
	1173	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1174	PERL_PV_ESCAPE_FIRSTCHAR
	1175	) ,
	1176	TRIE_LIST_ITEM(state,charid).forid,
	1177	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1178	);
	1179	if (!(charid % 10))
	1180	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1181	(int)((depth * 2) + 14), "");
	1182	}
	1183	}
	1184	PerlIO_printf( Perl_debug_log, "\n");
	1185	}
	1186	}
	1187
	1188	/*
	1189	Dumps a fully constructed but uncompressed trie in table form.
	1190	This is the normal DFA style state transition table, with a few
	1191	twists to facilitate compression later.
	1192	Used for debugging make_trie().
	1193	*/
	1194	STATIC void
	1195	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1196	HV widecharmap, AV revcharmap, U32 next_alloc,
	1197	U32 depth)
	1198	{
	1199	U32 state;
	1200	U16 charid;
	1201	SV *sv=sv_newmortal();
	1202	int colwidth= widecharmap ? 6 : 4;
	1203	GET_RE_DEBUG_FLAGS_DECL;
	1204
	1205	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1206
	1207	/*
	1208	print out the table precompression so that we can do a visual check
	1209	that they are identical.
	1210	*/
	1211
	1212	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1213
	1214	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1215	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1216	if ( tmp ) {
	1217	PerlIO_printf( Perl_debug_log, "%*s",
	1218	colwidth,
	1219	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1220	PL_colors[0], PL_colors[1],
	1221	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1222	PERL_PV_ESCAPE_FIRSTCHAR
	1223	)
	1224	);
	1225	}
	1226	}
	1227
	1228	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1229
	1230	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1231	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1232	}
	1233
	1234	PerlIO_printf( Perl_debug_log, "\n" );
	1235
	1236	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1237
	1238	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1239	(int)depth * 2 + 2,"",
	1240	(UV)TRIE_NODENUM( state ) );
	1241
	1242	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1243	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1244	if (v)
	1245	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1246	else
	1247	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1248	}
	1249	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1250	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1251	} else {
	1252	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1253	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1254	}
	1255	}
	1256	}
	1257
	1258	#endif
	1259
	1260
	1261	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1262	startbranch: the first branch in the whole branch sequence
	1263	first : start branch of sequence of branch-exact nodes.
	1264	May be the same as startbranch
	1265	last : Thing following the last branch.
	1266	May be the same as tail.
	1267	tail : item following the branch sequence
	1268	count : words in the sequence
	1269	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1270	depth : indent depth
	1271
	1272	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1273
	1274	A trie is an N'ary tree where the branches are determined by digital
	1275	decomposition of the key. IE, at the root node you look up the 1st character and
	1276	follow that branch repeat until you find the end of the branches. Nodes can be
	1277	marked as "accepting" meaning they represent a complete word. Eg:
	1278
	1279	/he\|she\|his\|hers/
	1280
	1281	would convert into the following structure. Numbers represent states, letters
	1282	following numbers represent valid transitions on the letter from that state, if
	1283	the number is in square brackets it represents an accepting state, otherwise it
	1284	will be in parenthesis.
	1285
	1286	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	1287	\| \|
	1288	\| (2)
	1289	\| \|
	1290	(1) +-i->(6)-+-s->[7]
	1291	\|
	1292	+-s->(3)-+-h->(4)-+-e->[5]
	1293
	1294	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1295
	1296	This shows that when matching against the string 'hers' we will begin at state 1
	1297	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1298	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1299	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1300	single traverse. We store a mapping from accepting to state to which word was
	1301	matched, and then when we have multiple possibilities we try to complete the
	1302	rest of the regex in the order in which they occured in the alternation.
	1303
	1304	The only prior NFA like behaviour that would be changed by the TRIE support is
	1305	the silent ignoring of duplicate alternations which are of the form:
	1306
	1307	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1308
	1309	Thus EVAL blocks following a trie may be called a different number of times with
	1310	and without the optimisation. With the optimisations dupes will be silently
	1311	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1312	the following demonstrates:
	1313
	1314	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1315
	1316	which prints out 'word' three times, but
	1317
	1318	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1319
	1320	which doesnt print it out at all. This is due to other optimisations kicking in.
	1321
	1322	Example of what happens on a structural level:
	1323
	1324	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1325
	1326	1: CURLYM[1] {1,32767}(18)
	1327	5: BRANCH(8)
	1328	6: EXACT <ac>(16)
	1329	8: BRANCH(11)
	1330	9: EXACT <ad>(16)
	1331	11: BRANCH(14)
	1332	12: EXACT <ab>(16)
	1333	16: SUCCEED(0)
	1334	17: NOTHING(18)
	1335	18: END(0)
	1336
	1337	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1338	and should turn into:
	1339
	1340	1: CURLYM[1] {1,32767}(18)
	1341	5: TRIE(16)
	1342	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1343	<ac>
	1344	<ad>
	1345	<ab>
	1346	16: SUCCEED(0)
	1347	17: NOTHING(18)
	1348	18: END(0)
	1349
	1350	Cases where tail != last would be like /(?foo\|bar)baz/:
	1351
	1352	1: BRANCH(4)
	1353	2: EXACT <foo>(8)
	1354	4: BRANCH(7)
	1355	5: EXACT <bar>(8)
	1356	7: TAIL(8)
	1357	8: EXACT <baz>(10)
	1358	10: END(0)
	1359
	1360	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1361	and would end up looking like:
	1362
	1363	1: TRIE(8)
	1364	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1365	<foo>
	1366	<bar>
	1367	7: TAIL(8)
	1368	8: EXACT <baz>(10)
	1369	10: END(0)
	1370
	1371	d = uvuni_to_utf8_flags(d, uv, 0);
	1372
	1373	is the recommended Unicode-aware way of saying
	1374
	1375	*(d++) = uv;
	1376	*/
	1377
	1378	#define TRIE_STORE_REVCHAR(val) \
	1379	STMT_START { \
	1380	if (UTF) { \
	1381	SV zlopp = newSV(7); / XXX: optimize me */ \
	1382	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1383	unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
	1384	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1385	SvPOK_on(zlopp); \
	1386	SvUTF8_on(zlopp); \
	1387	av_push(revcharmap, zlopp); \
	1388	} else { \
	1389	char ooooff = (char)val; \
	1390	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1391	} \
	1392	} STMT_END
	1393
	1394	#define TRIE_READ_CHAR STMT_START { \
	1395	wordlen++; \
	1396	if ( UTF ) { \
	1397	/* if it is UTF then it is either already folded, or does not need folding */ \
	1398	uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
	1399	} \
	1400	else if (folder == PL_fold_latin1) { \
	1401	/* if we use this folder we have to obey unicode rules on latin-1 data */ \
	1402	if ( foldlen > 0 ) { \
	1403	uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
	1404	foldlen -= len; \
	1405	scan += len; \
	1406	len = 0; \
	1407	} else { \
	1408	len = 1; \
	1409	uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
	1410	skiplen = UNISKIP(uvc); \
	1411	foldlen -= skiplen; \
	1412	scan = foldbuf + skiplen; \
	1413	} \
	1414	} else { \
	1415	/* raw data, will be folded later if needed */ \
	1416	uvc = (U32)*uc; \
	1417	len = 1; \
	1418	} \
	1419	} STMT_END
	1420
	1421
	1422
	1423	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1424	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1425	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1426	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1427	} \
	1428	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1429	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1430	TRIE_LIST_CUR( state )++; \
	1431	} STMT_END
	1432
	1433	#define TRIE_LIST_NEW(state) STMT_START { \
	1434	Newxz( trie->states[ state ].trans.list, \
	1435	4, reg_trie_trans_le ); \
	1436	TRIE_LIST_CUR( state ) = 1; \
	1437	TRIE_LIST_LEN( state ) = 4; \
	1438	} STMT_END
	1439
	1440	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1441	U16 dupe= trie->states[ state ].wordnum; \
	1442	regnode * const noper_next = regnext( noper ); \
	1443	\
	1444	DEBUG_r({ \
	1445	/* store the word for dumping */ \
	1446	SV* tmp; \
	1447	if (OP(noper) != NOTHING) \
	1448	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1449	else \
	1450	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1451	av_push( trie_words, tmp ); \
	1452	}); \
	1453	\
	1454	curword++; \
	1455	trie->wordinfo[curword].prev = 0; \
	1456	trie->wordinfo[curword].len = wordlen; \
	1457	trie->wordinfo[curword].accept = state; \
	1458	\
	1459	if ( noper_next < tail ) { \
	1460	if (!trie->jump) \
	1461	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	1462	trie->jump[curword] = (U16)(noper_next - convert); \
	1463	if (!jumper) \
	1464	jumper = noper_next; \
	1465	if (!nextbranch) \
	1466	nextbranch= regnext(cur); \
	1467	} \
	1468	\
	1469	if ( dupe ) { \
	1470	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1471	/* chain, so that when the bits of chain are later */\
	1472	/* linked together, the dups appear in the chain */\
	1473	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1474	trie->wordinfo[dupe].prev = curword; \
	1475	} else { \
	1476	/* we haven't inserted this word yet. */ \
	1477	trie->states[ state ].wordnum = curword; \
	1478	} \
	1479	} STMT_END
	1480
	1481
	1482	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1483	( ( base + charid >= ucharcount \
	1484	&& base + charid < ubound \
	1485	&& state == trie->trans[ base - ucharcount + charid ].check \
	1486	&& trie->trans[ base - ucharcount + charid ].next ) \
	1487	? trie->trans[ base - ucharcount + charid ].next \
	1488	: ( state==1 ? special : 0 ) \
	1489	)
	1490
	1491	#define MADE_TRIE 1
	1492	#define MADE_JUMP_TRIE 2
	1493	#define MADE_EXACT_TRIE 4
	1494
	1495	STATIC I32
	1496	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1497	{
	1498	dVAR;
	1499	/* first pass, loop through and scan words */
	1500	reg_trie_data *trie;
	1501	HV *widecharmap = NULL;
	1502	AV *revcharmap = newAV();
	1503	regnode *cur;
	1504	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1505	STRLEN len = 0;
	1506	UV uvc = 0;
	1507	U16 curword = 0;
	1508	U32 next_alloc = 0;
	1509	regnode *jumper = NULL;
	1510	regnode *nextbranch = NULL;
	1511	regnode *convert = NULL;
	1512	U32 prev_states; / temp array mapping each state to previous one */
	1513	/* we just use folder as a flag in utf8 */
	1514	const U8 * folder = NULL;
	1515
	1516	#ifdef DEBUGGING
	1517	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1518	AV *trie_words = NULL;
	1519	/* along with revcharmap, this only used during construction but both are
	1520	* useful during debugging so we store them in the struct when debugging.
	1521	*/
	1522	#else
	1523	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1524	STRLEN trie_charcount=0;
	1525	#endif
	1526	SV *re_trie_maxbuff;
	1527	GET_RE_DEBUG_FLAGS_DECL;
	1528
	1529	PERL_ARGS_ASSERT_MAKE_TRIE;
	1530	#ifndef DEBUGGING
	1531	PERL_UNUSED_ARG(depth);
	1532	#endif
	1533
	1534	switch (flags) {
	1535	case EXACT: break;
	1536	case EXACTFA:
	1537	case EXACTFU_SS:
	1538	case EXACTFU_TRICKYFOLD:
	1539	case EXACTFU: folder = PL_fold_latin1; break;
	1540	case EXACTF: folder = PL_fold; break;
	1541	case EXACTFL: folder = PL_fold_locale; break;
	1542	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
	1543	}
	1544
	1545	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1546	trie->refcount = 1;
	1547	trie->startstate = 1;
	1548	trie->wordcount = word_count;
	1549	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1550	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1551	if (flags == EXACT)
	1552	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1553	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1554	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1555
	1556	DEBUG_r({
	1557	trie_words = newAV();
	1558	});
	1559
	1560	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1561	if (!SvIOK(re_trie_maxbuff)) {
	1562	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1563	}
	1564	DEBUG_TRIE_COMPILE_r({
	1565	PerlIO_printf( Perl_debug_log,
	1566	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1567	(int)depth * 2 + 2, "",
	1568	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1569	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1570	(int)depth);
	1571	});
	1572
	1573	/* Find the node we are going to overwrite */
	1574	if ( first == startbranch && OP( last ) != BRANCH ) {
	1575	/* whole branch chain */
	1576	convert = first;
	1577	} else {
	1578	/* branch sub-chain */
	1579	convert = NEXTOPER( first );
	1580	}
	1581
	1582	/* -- First loop and Setup --
	1583
	1584	We first traverse the branches and scan each word to determine if it
	1585	contains widechars, and how many unique chars there are, this is
	1586	important as we have to build a table with at least as many columns as we
	1587	have unique chars.
	1588
	1589	We use an array of integers to represent the character codes 0..255
	1590	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1591	native representation of the character value as the key and IV's for the
	1592	coded index.
	1593
	1594	TODO If we keep track of how many times each character is used we can
	1595	remap the columns so that the table compression later on is more
	1596	efficient in terms of memory by ensuring the most common value is in the
	1597	middle and the least common are on the outside. IMO this would be better
	1598	than a most to least common mapping as theres a decent chance the most
	1599	common letter will share a node with the least common, meaning the node
	1600	will not be compressible. With a middle is most common approach the worst
	1601	case is when we have the least common nodes twice.
	1602
	1603	*/
	1604
	1605	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1606	regnode *noper = NEXTOPER( cur );
	1607	const U8 uc = (U8)STRING( noper );
	1608	const U8 *e = uc + STR_LEN( noper );
	1609	STRLEN foldlen = 0;
	1610	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1611	STRLEN skiplen = 0;
	1612	const U8 scan = (U8)NULL;
	1613	U32 wordlen = 0; /* required init */
	1614	STRLEN chars = 0;
	1615	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1616
	1617	if (OP(noper) == NOTHING) {
	1618	regnode *noper_next= regnext(noper);
	1619	if (noper_next != tail && OP(noper_next) == flags) {
	1620	noper = noper_next;
	1621	uc= (U8*)STRING(noper);
	1622	e= uc + STR_LEN(noper);
	1623	trie->minlen= STR_LEN(noper);
	1624	} else {
	1625	trie->minlen= 0;
	1626	continue;
	1627	}
	1628	}
	1629
	1630	if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
	1631	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1632	regardless of encoding */
	1633	if (OP( noper ) == EXACTFU_SS) {
	1634	/* false positives are ok, so just set this */
	1635	TRIE_BITMAP_SET(trie,0xDF);
	1636	}
	1637	}
	1638	for ( ; uc < e ; uc += len ) {
	1639	TRIE_CHARCOUNT(trie)++;
	1640	TRIE_READ_CHAR;
	1641	chars++;
	1642	if ( uvc < 256 ) {
	1643	if ( folder ) {
	1644	U8 folded= folder[ (U8) uvc ];
	1645	if ( !trie->charmap[ folded ] ) {
	1646	trie->charmap[ folded ]=( ++trie->uniquecharcount );
	1647	TRIE_STORE_REVCHAR( folded );
	1648	}
	1649	}
	1650	if ( !trie->charmap[ uvc ] ) {
	1651	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1652	TRIE_STORE_REVCHAR( uvc );
	1653	}
	1654	if ( set_bit ) {
	1655	/* store the codepoint in the bitmap, and its folded
	1656	* equivalent. */
	1657	TRIE_BITMAP_SET(trie, uvc);
	1658
	1659	/* store the folded codepoint */
	1660	if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
	1661
	1662	if ( !UTF ) {
	1663	/* store first byte of utf8 representation of
	1664	variant codepoints */
	1665	if (! UNI_IS_INVARIANT(uvc)) {
	1666	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1667	}
	1668	}
	1669	set_bit = 0; /* We've done our bit :-) */
	1670	}
	1671	} else {
	1672	SV** svpp;
	1673	if ( !widecharmap )
	1674	widecharmap = newHV();
	1675
	1676	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1677
	1678	if ( !svpp )
	1679	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1680
	1681	if ( !SvTRUE( *svpp ) ) {
	1682	sv_setiv( *svpp, ++trie->uniquecharcount );
	1683	TRIE_STORE_REVCHAR(uvc);
	1684	}
	1685	}
	1686	}
	1687	if( cur == first ) {
	1688	trie->minlen = chars;
	1689	trie->maxlen = chars;
	1690	} else if (chars < trie->minlen) {
	1691	trie->minlen = chars;
	1692	} else if (chars > trie->maxlen) {
	1693	trie->maxlen = chars;
	1694	}
	1695	if (OP( noper ) == EXACTFU_SS) {
	1696	/* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
	1697	if (trie->minlen > 1)
	1698	trie->minlen= 1;
	1699	}
	1700	if (OP( noper ) == EXACTFU_TRICKYFOLD) {
	1701	/* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
	1702	* - We assume that any such sequence might match a 2 byte string */
	1703	if (trie->minlen > 2 )
	1704	trie->minlen= 2;
	1705	}
	1706
	1707	} /* end first pass */
	1708	DEBUG_TRIE_COMPILE_r(
	1709	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1710	(int)depth * 2 + 2,"",
	1711	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1712	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1713	(int)trie->minlen, (int)trie->maxlen )
	1714	);
	1715
	1716	/*
	1717	We now know what we are dealing with in terms of unique chars and
	1718	string sizes so we can calculate how much memory a naive
	1719	representation using a flat table will take. If it's over a reasonable
	1720	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1721	conservative but potentially much slower representation using an array
	1722	of lists.
	1723
	1724	At the end we convert both representations into the same compressed
	1725	form that will be used in regexec.c for matching with. The latter
	1726	is a form that cannot be used to construct with but has memory
	1727	properties similar to the list form and access properties similar
	1728	to the table form making it both suitable for fast searches and
	1729	small enough that its feasable to store for the duration of a program.
	1730
	1731	See the comment in the code where the compressed table is produced
	1732	inplace from the flat tabe representation for an explanation of how
	1733	the compression works.
	1734
	1735	*/
	1736
	1737
	1738	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1739	prev_states[1] = 0;
	1740
	1741	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1742	/*
	1743	Second Pass -- Array Of Lists Representation
	1744
	1745	Each state will be represented by a list of charid:state records
	1746	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1747	points of the allocated array. (See defines above).
	1748
	1749	We build the initial structure using the lists, and then convert
	1750	it into the compressed table form which allows faster lookups
	1751	(but cant be modified once converted).
	1752	*/
	1753
	1754	STRLEN transcount = 1;
	1755
	1756	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1757	"%*sCompiling trie using list compiler\n",
	1758	(int)depth * 2 + 2, ""));
	1759
	1760	trie->states = (reg_trie_state *)
	1761	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1762	sizeof(reg_trie_state) );
	1763	TRIE_LIST_NEW(1);
	1764	next_alloc = 2;
	1765
	1766	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1767
	1768	regnode *noper = NEXTOPER( cur );
	1769	U8 uc = (U8)STRING( noper );
	1770	const U8 *e = uc + STR_LEN( noper );
	1771	U32 state = 1; /* required init */
	1772	U16 charid = 0; /* sanity init */
	1773	U8 scan = (U8)NULL; /* sanity init */
	1774	STRLEN foldlen = 0; /* required init */
	1775	U32 wordlen = 0; /* required init */
	1776	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1777	STRLEN skiplen = 0;
	1778
	1779	if (OP(noper) == NOTHING) {
	1780	regnode *noper_next= regnext(noper);
	1781	if (noper_next != tail && OP(noper_next) == flags) {
	1782	noper = noper_next;
	1783	uc= (U8*)STRING(noper);
	1784	e= uc + STR_LEN(noper);
	1785	}
	1786	}
	1787
	1788	if (OP(noper) != NOTHING) {
	1789	for ( ; uc < e ; uc += len ) {
	1790
	1791	TRIE_READ_CHAR;
	1792
	1793	if ( uvc < 256 ) {
	1794	charid = trie->charmap[ uvc ];
	1795	} else {
	1796	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1797	if ( !svpp ) {
	1798	charid = 0;
	1799	} else {
	1800	charid=(U16)SvIV( *svpp );
	1801	}
	1802	}
	1803	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1804	if ( charid ) {
	1805
	1806	U16 check;
	1807	U32 newstate = 0;
	1808
	1809	charid--;
	1810	if ( !trie->states[ state ].trans.list ) {
	1811	TRIE_LIST_NEW( state );
	1812	}
	1813	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1814	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1815	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1816	break;
	1817	}
	1818	}
	1819	if ( ! newstate ) {
	1820	newstate = next_alloc++;
	1821	prev_states[newstate] = state;
	1822	TRIE_LIST_PUSH( state, charid, newstate );
	1823	transcount++;
	1824	}
	1825	state = newstate;
	1826	} else {
	1827	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1828	}
	1829	}
	1830	}
	1831	TRIE_HANDLE_WORD(state);
	1832
	1833	} /* end second pass */
	1834
	1835	/* next alloc is the NEXT state to be allocated */
	1836	trie->statecount = next_alloc;
	1837	trie->states = (reg_trie_state *)
	1838	PerlMemShared_realloc( trie->states,
	1839	next_alloc
	1840	* sizeof(reg_trie_state) );
	1841
	1842	/* and now dump it out before we compress it */
	1843	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1844	revcharmap, next_alloc,
	1845	depth+1)
	1846	);
	1847
	1848	trie->trans = (reg_trie_trans *)
	1849	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1850	{
	1851	U32 state;
	1852	U32 tp = 0;
	1853	U32 zp = 0;
	1854
	1855
	1856	for( state=1 ; state < next_alloc ; state ++ ) {
	1857	U32 base=0;
	1858
	1859	/*
	1860	DEBUG_TRIE_COMPILE_MORE_r(
	1861	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1862	);
	1863	*/
	1864
	1865	if (trie->states[state].trans.list) {
	1866	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1867	U16 maxid=minid;
	1868	U16 idx;
	1869
	1870	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1871	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1872	if ( forid < minid ) {
	1873	minid=forid;
	1874	} else if ( forid > maxid ) {
	1875	maxid=forid;
	1876	}
	1877	}
	1878	if ( transcount < tp + maxid - minid + 1) {
	1879	transcount *= 2;
	1880	trie->trans = (reg_trie_trans *)
	1881	PerlMemShared_realloc( trie->trans,
	1882	transcount
	1883	* sizeof(reg_trie_trans) );
	1884	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1885	}
	1886	base = trie->uniquecharcount + tp - minid;
	1887	if ( maxid == minid ) {
	1888	U32 set = 0;
	1889	for ( ; zp < tp ; zp++ ) {
	1890	if ( ! trie->trans[ zp ].next ) {
	1891	base = trie->uniquecharcount + zp - minid;
	1892	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1893	trie->trans[ zp ].check = state;
	1894	set = 1;
	1895	break;
	1896	}
	1897	}
	1898	if ( !set ) {
	1899	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1900	trie->trans[ tp ].check = state;
	1901	tp++;
	1902	zp = tp;
	1903	}
	1904	} else {
	1905	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1906	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1907	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1908	trie->trans[ tid ].check = state;
	1909	}
	1910	tp += ( maxid - minid + 1 );
	1911	}
	1912	Safefree(trie->states[ state ].trans.list);
	1913	}
	1914	/*
	1915	DEBUG_TRIE_COMPILE_MORE_r(
	1916	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1917	);
	1918	*/
	1919	trie->states[ state ].trans.base=base;
	1920	}
	1921	trie->lasttrans = tp + 1;
	1922	}
	1923	} else {
	1924	/*
	1925	Second Pass -- Flat Table Representation.
	1926
	1927	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1928	We know that we will need Charcount+1 trans at most to store the data
	1929	(one row per char at worst case) So we preallocate both structures
	1930	assuming worst case.
	1931
	1932	We then construct the trie using only the .next slots of the entry
	1933	structs.
	1934
	1935	We use the .check field of the first entry of the node temporarily to
	1936	make compression both faster and easier by keeping track of how many non
	1937	zero fields are in the node.
	1938
	1939	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1940	transition.
	1941
	1942	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1943	number representing the first entry of the node, and state as a
	1944	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1945	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1946	are 2 entrys per node. eg:
	1947
	1948	A B A B
	1949	1. 2 4 1. 3 7
	1950	2. 0 3 3. 0 5
	1951	3. 0 0 5. 0 0
	1952	4. 0 0 7. 0 0
	1953
	1954	The table is internally in the right hand, idx form. However as we also
	1955	have to deal with the states array which is indexed by nodenum we have to
	1956	use TRIE_NODENUM() to convert.
	1957
	1958	*/
	1959	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1960	"%*sCompiling trie using table compiler\n",
	1961	(int)depth * 2 + 2, ""));
	1962
	1963	trie->trans = (reg_trie_trans *)
	1964	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1965	* trie->uniquecharcount + 1,
	1966	sizeof(reg_trie_trans) );
	1967	trie->states = (reg_trie_state *)
	1968	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1969	sizeof(reg_trie_state) );
	1970	next_alloc = trie->uniquecharcount + 1;
	1971
	1972
	1973	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1974
	1975	regnode *noper = NEXTOPER( cur );
	1976	const U8 uc = (U8)STRING( noper );
	1977	const U8 *e = uc + STR_LEN( noper );
	1978
	1979	U32 state = 1; /* required init */
	1980
	1981	U16 charid = 0; /* sanity init */
	1982	U32 accept_state = 0; /* sanity init */
	1983	U8 scan = (U8)NULL; /* sanity init */
	1984
	1985	STRLEN foldlen = 0; /* required init */
	1986	U32 wordlen = 0; /* required init */
	1987	STRLEN skiplen = 0;
	1988	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1989
	1990	if (OP(noper) == NOTHING) {
	1991	regnode *noper_next= regnext(noper);
	1992	if (noper_next != tail && OP(noper_next) == flags) {
	1993	noper = noper_next;
	1994	uc= (U8*)STRING(noper);
	1995	e= uc + STR_LEN(noper);
	1996	}
	1997	}
	1998
	1999	if ( OP(noper) != NOTHING ) {
	2000	for ( ; uc < e ; uc += len ) {
	2001
	2002	TRIE_READ_CHAR;
	2003
	2004	if ( uvc < 256 ) {
	2005	charid = trie->charmap[ uvc ];
	2006	} else {
	2007	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	2008	charid = svpp ? (U16)SvIV(*svpp) : 0;
	2009	}
	2010	if ( charid ) {
	2011	charid--;
	2012	if ( !trie->trans[ state + charid ].next ) {
	2013	trie->trans[ state + charid ].next = next_alloc;
	2014	trie->trans[ state ].check++;
	2015	prev_states[TRIE_NODENUM(next_alloc)]
	2016	= TRIE_NODENUM(state);
	2017	next_alloc += trie->uniquecharcount;
	2018	}
	2019	state = trie->trans[ state + charid ].next;
	2020	} else {
	2021	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	2022	}
	2023	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	2024	}
	2025	}
	2026	accept_state = TRIE_NODENUM( state );
	2027	TRIE_HANDLE_WORD(accept_state);
	2028
	2029	} /* end second pass */
	2030
	2031	/* and now dump it out before we compress it */
	2032	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	2033	revcharmap,
	2034	next_alloc, depth+1));
	2035
	2036	{
	2037	/*
	2038	* Inplace compress the table.*
	2039
	2040	For sparse data sets the table constructed by the trie algorithm will
	2041	be mostly 0/FAIL transitions or to put it another way mostly empty.
	2042	(Note that leaf nodes will not contain any transitions.)
	2043
	2044	This algorithm compresses the tables by eliminating most such
	2045	transitions, at the cost of a modest bit of extra work during lookup:
	2046
	2047	- Each states[] entry contains a .base field which indicates the
	2048	index in the state[] array wheres its transition data is stored.
	2049
	2050	- If .base is 0 there are no valid transitions from that node.
	2051
	2052	- If .base is nonzero then charid is added to it to find an entry in
	2053	the trans array.
	2054
	2055	-If trans[states[state].base+charid].check!=state then the
	2056	transition is taken to be a 0/Fail transition. Thus if there are fail
	2057	transitions at the front of the node then the .base offset will point
	2058	somewhere inside the previous nodes data (or maybe even into a node
	2059	even earlier), but the .check field determines if the transition is
	2060	valid.
	2061
	2062	XXX - wrong maybe?
	2063	The following process inplace converts the table to the compressed
	2064	table: We first do not compress the root node 1,and mark all its
	2065	.check pointers as 1 and set its .base pointer as 1 as well. This
	2066	allows us to do a DFA construction from the compressed table later,
	2067	and ensures that any .base pointers we calculate later are greater
	2068	than 0.
	2069
	2070	- We set 'pos' to indicate the first entry of the second node.
	2071
	2072	- We then iterate over the columns of the node, finding the first and
	2073	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2074	and set the .check pointers accordingly, and advance pos
	2075	appropriately and repreat for the next node. Note that when we copy
	2076	the next pointers we have to convert them from the original
	2077	NODEIDX form to NODENUM form as the former is not valid post
	2078	compression.
	2079
	2080	- If a node has no transitions used we mark its base as 0 and do not
	2081	advance the pos pointer.
	2082
	2083	- If a node only has one transition we use a second pointer into the
	2084	structure to fill in allocated fail transitions from other states.
	2085	This pointer is independent of the main pointer and scans forward
	2086	looking for null transitions that are allocated to a state. When it
	2087	finds one it writes the single transition into the "hole". If the
	2088	pointer doesnt find one the single transition is appended as normal.
	2089
	2090	- Once compressed we can Renew/realloc the structures to release the
	2091	excess space.
	2092
	2093	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2094	specifically Fig 3.47 and the associated pseudocode.
	2095
	2096	demq
	2097	*/
	2098	const U32 laststate = TRIE_NODENUM( next_alloc );
	2099	U32 state, charid;
	2100	U32 pos = 0, zp=0;
	2101	trie->statecount = laststate;
	2102
	2103	for ( state = 1 ; state < laststate ; state++ ) {
	2104	U8 flag = 0;
	2105	const U32 stateidx = TRIE_NODEIDX( state );
	2106	const U32 o_used = trie->trans[ stateidx ].check;
	2107	U32 used = trie->trans[ stateidx ].check;
	2108	trie->trans[ stateidx ].check = 0;
	2109
	2110	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2111	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2112	if ( trie->trans[ stateidx + charid ].next ) {
	2113	if (o_used == 1) {
	2114	for ( ; zp < pos ; zp++ ) {
	2115	if ( ! trie->trans[ zp ].next ) {
	2116	break;
	2117	}
	2118	}
	2119	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2120	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2121	trie->trans[ zp ].check = state;
	2122	if ( ++zp > pos ) pos = zp;
	2123	break;
	2124	}
	2125	used--;
	2126	}
	2127	if ( !flag ) {
	2128	flag = 1;
	2129	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2130	}
	2131	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2132	trie->trans[ pos ].check = state;
	2133	pos++;
	2134	}
	2135	}
	2136	}
	2137	trie->lasttrans = pos + 1;
	2138	trie->states = (reg_trie_state *)
	2139	PerlMemShared_realloc( trie->states, laststate
	2140	* sizeof(reg_trie_state) );
	2141	DEBUG_TRIE_COMPILE_MORE_r(
	2142	PerlIO_printf( Perl_debug_log,
	2143	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2144	(int)depth * 2 + 2,"",
	2145	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2146	(IV)next_alloc,
	2147	(IV)pos,
	2148	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2149	);
	2150
	2151	} /* end table compress */
	2152	}
	2153	DEBUG_TRIE_COMPILE_MORE_r(
	2154	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2155	(int)depth * 2 + 2, "",
	2156	(UV)trie->statecount,
	2157	(UV)trie->lasttrans)
	2158	);
	2159	/* resize the trans array to remove unused space */
	2160	trie->trans = (reg_trie_trans *)
	2161	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2162	* sizeof(reg_trie_trans) );
	2163
	2164	{ /* Modify the program and insert the new TRIE node */
	2165	U8 nodetype =(U8)(flags & 0xFF);
	2166	char *str=NULL;
	2167
	2168	#ifdef DEBUGGING
	2169	regnode *optimize = NULL;
	2170	#ifdef RE_TRACK_PATTERN_OFFSETS
	2171
	2172	U32 mjd_offset = 0;
	2173	U32 mjd_nodelen = 0;
	2174	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2175	#endif /* DEBUGGING */
	2176	/*
	2177	This means we convert either the first branch or the first Exact,
	2178	depending on whether the thing following (in 'last') is a branch
	2179	or not and whther first is the startbranch (ie is it a sub part of
	2180	the alternation or is it the whole thing.)
	2181	Assuming its a sub part we convert the EXACT otherwise we convert
	2182	the whole branch sequence, including the first.
	2183	*/
	2184	/* Find the node we are going to overwrite */
	2185	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2186	/* branch sub-chain */
	2187	NEXT_OFF( first ) = (U16)(last - first);
	2188	#ifdef RE_TRACK_PATTERN_OFFSETS
	2189	DEBUG_r({
	2190	mjd_offset= Node_Offset((convert));
	2191	mjd_nodelen= Node_Length((convert));
	2192	});
	2193	#endif
	2194	/* whole branch chain */
	2195	}
	2196	#ifdef RE_TRACK_PATTERN_OFFSETS
	2197	else {
	2198	DEBUG_r({
	2199	const regnode *nop = NEXTOPER( convert );
	2200	mjd_offset= Node_Offset((nop));
	2201	mjd_nodelen= Node_Length((nop));
	2202	});
	2203	}
	2204	DEBUG_OPTIMISE_r(
	2205	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2206	(int)depth * 2 + 2, "",
	2207	(UV)mjd_offset, (UV)mjd_nodelen)
	2208	);
	2209	#endif
	2210	/* But first we check to see if there is a common prefix we can
	2211	split out as an EXACT and put in front of the TRIE node. */
	2212	trie->startstate= 1;
	2213	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2214	U32 state;
	2215	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2216	U32 ofs = 0;
	2217	I32 idx = -1;
	2218	U32 count = 0;
	2219	const U32 base = trie->states[ state ].trans.base;
	2220
	2221	if ( trie->states[state].wordnum )
	2222	count = 1;
	2223
	2224	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2225	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2226	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2227	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2228	{
	2229	if ( ++count > 1 ) {
	2230	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2231	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2232	if ( state == 1 ) break;
	2233	if ( count == 2 ) {
	2234	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2235	DEBUG_OPTIMISE_r(
	2236	PerlIO_printf(Perl_debug_log,
	2237	"%*sNew Start State=%"UVuf" Class: [",
	2238	(int)depth * 2 + 2, "",
	2239	(UV)state));
	2240	if (idx >= 0) {
	2241	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2242	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2243
	2244	TRIE_BITMAP_SET(trie,*ch);
	2245	if ( folder )
	2246	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2247	DEBUG_OPTIMISE_r(
	2248	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2249	);
	2250	}
	2251	}
	2252	TRIE_BITMAP_SET(trie,*ch);
	2253	if ( folder )
	2254	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2255	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2256	}
	2257	idx = ofs;
	2258	}
	2259	}
	2260	if ( count == 1 ) {
	2261	SV **tmp = av_fetch( revcharmap, idx, 0);
	2262	STRLEN len;
	2263	char ch = SvPV( tmp, len );
	2264	DEBUG_OPTIMISE_r({
	2265	SV *sv=sv_newmortal();
	2266	PerlIO_printf( Perl_debug_log,
	2267	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2268	(int)depth * 2 + 2, "",
	2269	(UV)state, (UV)idx,
	2270	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2271	PL_colors[0], PL_colors[1],
	2272	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2273	PERL_PV_ESCAPE_FIRSTCHAR
	2274	)
	2275	);
	2276	});
	2277	if ( state==1 ) {
	2278	OP( convert ) = nodetype;
	2279	str=STRING(convert);
	2280	STR_LEN(convert)=0;
	2281	}
	2282	STR_LEN(convert) += len;
	2283	while (len--)
	2284	str++ = ch++;
	2285	} else {
	2286	#ifdef DEBUGGING
	2287	if (state>1)
	2288	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2289	#endif
	2290	break;
	2291	}
	2292	}
	2293	trie->prefixlen = (state-1);
	2294	if (str) {
	2295	regnode *n = convert+NODE_SZ_STR(convert);
	2296	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2297	trie->startstate = state;
	2298	trie->minlen -= (state - 1);
	2299	trie->maxlen -= (state - 1);
	2300	#ifdef DEBUGGING
	2301	/* At least the UNICOS C compiler choked on this
	2302	* being argument to DEBUG_r(), so let's just have
	2303	* it right here. */
	2304	if (
	2305	#ifdef PERL_EXT_RE_BUILD
	2306	1
	2307	#else
	2308	DEBUG_r_TEST
	2309	#endif
	2310	) {
	2311	regnode *fix = convert;
	2312	U32 word = trie->wordcount;
	2313	mjd_nodelen++;
	2314	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2315	while( ++fix < n ) {
	2316	Set_Node_Offset_Length(fix, 0, 0);
	2317	}
	2318	while (word--) {
	2319	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2320	if (tmp) {
	2321	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2322	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2323	else
	2324	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2325	}
	2326	}
	2327	}
	2328	#endif
	2329	if (trie->maxlen) {
	2330	convert = n;
	2331	} else {
	2332	NEXT_OFF(convert) = (U16)(tail - convert);
	2333	DEBUG_r(optimize= n);
	2334	}
	2335	}
	2336	}
	2337	if (!jumper)
	2338	jumper = last;
	2339	if ( trie->maxlen ) {
	2340	NEXT_OFF( convert ) = (U16)(tail - convert);
	2341	ARG_SET( convert, data_slot );
	2342	/* Store the offset to the first unabsorbed branch in
	2343	jump[0], which is otherwise unused by the jump logic.
	2344	We use this when dumping a trie and during optimisation. */
	2345	if (trie->jump)
	2346	trie->jump[0] = (U16)(nextbranch - convert);
	2347
	2348	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2349	* and there is a bitmap
	2350	* and the first "jump target" node we found leaves enough room
	2351	* then convert the TRIE node into a TRIEC node, with the bitmap
	2352	* embedded inline in the opcode - this is hypothetically faster.
	2353	*/
	2354	if ( !trie->states[trie->startstate].wordnum
	2355	&& trie->bitmap
	2356	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2357	{
	2358	OP( convert ) = TRIEC;
	2359	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2360	PerlMemShared_free(trie->bitmap);
	2361	trie->bitmap= NULL;
	2362	} else
	2363	OP( convert ) = TRIE;
	2364
	2365	/* store the type in the flags */
	2366	convert->flags = nodetype;
	2367	DEBUG_r({
	2368	optimize = convert
	2369	+ NODE_STEP_REGNODE
	2370	+ regarglen[ OP( convert ) ];
	2371	});
	2372	/* XXX We really should free up the resource in trie now,
	2373	as we won't use them - (which resources?) dmq */
	2374	}
	2375	/* needed for dumping*/
	2376	DEBUG_r(if (optimize) {
	2377	regnode *opt = convert;
	2378
	2379	while ( ++opt < optimize) {
	2380	Set_Node_Offset_Length(opt,0,0);
	2381	}
	2382	/*
	2383	Try to clean up some of the debris left after the
	2384	optimisation.
	2385	*/
	2386	while( optimize < jumper ) {
	2387	mjd_nodelen += Node_Length((optimize));
	2388	OP( optimize ) = OPTIMIZED;
	2389	Set_Node_Offset_Length(optimize,0,0);
	2390	optimize++;
	2391	}
	2392	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2393	});
	2394	} /* end node insert */
	2395
	2396	/* Finish populating the prev field of the wordinfo array. Walk back
	2397	* from each accept state until we find another accept state, and if
	2398	* so, point the first word's .prev field at the second word. If the
	2399	* second already has a .prev field set, stop now. This will be the
	2400	* case either if we've already processed that word's accept state,
	2401	* or that state had multiple words, and the overspill words were
	2402	* already linked up earlier.
	2403	*/
	2404	{
	2405	U16 word;
	2406	U32 state;
	2407	U16 prev;
	2408
	2409	for (word=1; word <= trie->wordcount; word++) {
	2410	prev = 0;
	2411	if (trie->wordinfo[word].prev)
	2412	continue;
	2413	state = trie->wordinfo[word].accept;
	2414	while (state) {
	2415	state = prev_states[state];
	2416	if (!state)
	2417	break;
	2418	prev = trie->states[state].wordnum;
	2419	if (prev)
	2420	break;
	2421	}
	2422	trie->wordinfo[word].prev = prev;
	2423	}
	2424	Safefree(prev_states);
	2425	}
	2426
	2427
	2428	/* and now dump out the compressed format */
	2429	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2430
	2431	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2432	#ifdef DEBUGGING
	2433	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2434	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2435	#else
	2436	SvREFCNT_dec(revcharmap);
	2437	#endif
	2438	return trie->jump
	2439	? MADE_JUMP_TRIE
	2440	: trie->startstate>1
	2441	? MADE_EXACT_TRIE
	2442	: MADE_TRIE;
	2443	}
	2444
	2445	STATIC void
	2446	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2447	{
	2448	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2449
	2450	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2451	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2452	ISBN 0-201-10088-6
	2453
	2454	We find the fail state for each state in the trie, this state is the longest proper
	2455	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2456	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2457	the DFA not to have to restart after its tried and failed a word at a given point, it
	2458	simply continues as though it had been matching the other word in the first place.
	2459	Consider
	2460	'abcdgu'=~/abcdefg\|cdgu/
	2461	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2462	fail, which would bring us to the state representing 'd' in the second word where we would
	2463	try 'g' and succeed, proceeding to match 'cdgu'.
	2464	*/
	2465	/* add a fail transition */
	2466	const U32 trie_offset = ARG(source);
	2467	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2468	U32 *q;
	2469	const U32 ucharcount = trie->uniquecharcount;
	2470	const U32 numstates = trie->statecount;
	2471	const U32 ubound = trie->lasttrans + ucharcount;
	2472	U32 q_read = 0;
	2473	U32 q_write = 0;
	2474	U32 charid;
	2475	U32 base = trie->states[ 1 ].trans.base;
	2476	U32 *fail;
	2477	reg_ac_data *aho;
	2478	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2479	GET_RE_DEBUG_FLAGS_DECL;
	2480
	2481	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2482	#ifndef DEBUGGING
	2483	PERL_UNUSED_ARG(depth);
	2484	#endif
	2485
	2486
	2487	ARG_SET( stclass, data_slot );
	2488	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2489	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2490	aho->trie=trie_offset;
	2491	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2492	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2493	Newxz( q, numstates, U32);
	2494	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2495	aho->refcount = 1;
	2496	fail = aho->fail;
	2497	/* initialize fail[0..1] to be 1 so that we always have
	2498	a valid final fail state */
	2499	fail[ 0 ] = fail[ 1 ] = 1;
	2500
	2501	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2502	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2503	if ( newstate ) {
	2504	q[ q_write ] = newstate;
	2505	/* set to point at the root */
	2506	fail[ q[ q_write++ ] ]=1;
	2507	}
	2508	}
	2509	while ( q_read < q_write) {
	2510	const U32 cur = q[ q_read++ % numstates ];
	2511	base = trie->states[ cur ].trans.base;
	2512
	2513	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2514	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2515	if (ch_state) {
	2516	U32 fail_state = cur;
	2517	U32 fail_base;
	2518	do {
	2519	fail_state = fail[ fail_state ];
	2520	fail_base = aho->states[ fail_state ].trans.base;
	2521	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2522
	2523	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2524	fail[ ch_state ] = fail_state;
	2525	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2526	{
	2527	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2528	}
	2529	q[ q_write++ % numstates] = ch_state;
	2530	}
	2531	}
	2532	}
	2533	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2534	when we fail in state 1, this allows us to use the
	2535	charclass scan to find a valid start char. This is based on the principle
	2536	that theres a good chance the string being searched contains lots of stuff
	2537	that cant be a start char.
	2538	*/
	2539	fail[ 0 ] = fail[ 1 ] = 0;
	2540	DEBUG_TRIE_COMPILE_r({
	2541	PerlIO_printf(Perl_debug_log,
	2542	"%*sStclass Failtable (%"UVuf" states): 0",
	2543	(int)(depth * 2), "", (UV)numstates
	2544	);
	2545	for( q_read=1; q_read<numstates; q_read++ ) {
	2546	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2547	}
	2548	PerlIO_printf(Perl_debug_log, "\n");
	2549	});
	2550	Safefree(q);
	2551	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2552	}
	2553
	2554
	2555	/*
	2556	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2557	* These need to be revisited when a newer toolchain becomes available.
	2558	*/
	2559	#if defined(__sparc64__) && defined(__GNUC__)
	2560	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2561	# undef SPARC64_GCC_WORKAROUND
	2562	# define SPARC64_GCC_WORKAROUND 1
	2563	# endif
	2564	#endif
	2565
	2566	#define DEBUG_PEEP(str,scan,depth) \
	2567	DEBUG_OPTIMISE_r({if (scan){ \
	2568	SV * const mysv=sv_newmortal(); \
	2569	regnode *Next = regnext(scan); \
	2570	regprop(RExC_rx, mysv, scan); \
	2571	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2572	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2573	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2574	}});
	2575
	2576
	2577	/* The below joins as many adjacent EXACTish nodes as possible into a single
	2578	* one, and looks for problematic sequences of characters whose folds vs.
	2579	* non-folds have sufficiently different lengths, that the optimizer would be
	2580	* fooled into rejecting legitimate matches of them, and the trie construction
	2581	* code can't cope with them. The joining is only done if:
	2582	* 1) there is room in the current conglomerated node to entirely contain the
	2583	* next one.
	2584	* 2) they are the exact same node type
	2585	*
	2586	* The adjacent nodes actually may be separated by NOTHING kind nodes, and
	2587	* these get optimized out
	2588	*
	2589	* If there are problematic code sequences, *min_subtract is set to the delta
	2590	* that the minimum size of the node can be less than its actual size. And,
	2591	* the node type of the result is changed to reflect that it contains these
	2592	* sequences.
	2593	*
	2594	* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
	2595	* and contains LATIN SMALL LETTER SHARP S
	2596	*
	2597	* This is as good a place as any to discuss the design of handling these
	2598	* problematic sequences. It's been wrong in Perl for a very long time. There
	2599	* are three code points in Unicode whose folded lengths differ so much from
	2600	* the un-folded lengths that it causes problems for the optimizer and trie
	2601	* construction. Why only these are problematic, and not others where lengths
	2602	* also differ is something I (khw) do not understand. New versions of Unicode
	2603	* might add more such code points. Hopefully the logic in fold_grind.t that
	2604	* figures out what to test (in part by verifying that each size-combination
	2605	* gets tested) will catch any that do come along, so they can be added to the
	2606	* special handling below. The chances of new ones are actually rather small,
	2607	* as most, if not all, of the world's scripts that have casefolding have
	2608	* already been encoded by Unicode. Also, a number of Unicode's decisions were
	2609	* made to allow compatibility with pre-existing standards, and almost all of
	2610	* those have already been dealt with. These would otherwise be the most
	2611	* likely candidates for generating further tricky sequences. In other words,
	2612	* Unicode by itself is unlikely to add new ones unless it is for compatibility
	2613	* with pre-existing standards, and there aren't many of those left.
	2614	*
	2615	* The previous designs for dealing with these involved assigning a special
	2616	* node for them. This approach doesn't work, as evidenced by this example:
	2617	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	2618	* Both these fold to "sss", but if the pattern is parsed to create a node of
	2619	* that would match just the \xDF, it won't be able to handle the case where a
	2620	* successful match would have to cross the node's boundary. The new approach
	2621	* that hopefully generally solves the problem generates an EXACTFU_SS node
	2622	* that is "sss".
	2623	*
	2624	* There are a number of components to the approach (a lot of work for just
	2625	* three code points!):
	2626	* 1) This routine examines each EXACTFish node that could contain the
	2627	* problematic sequences. It returns in *min_subtract how much to
	2628	* subtract from the the actual length of the string to get a real minimum
	2629	* for one that could match it. This number is usually 0 except for the
	2630	* problematic sequences. This delta is used by the caller to adjust the
	2631	* min length of the match, and the delta between min and max, so that the
	2632	* optimizer doesn't reject these possibilities based on size constraints.
	2633	* 2) These sequences are not currently correctly handled by the trie code
	2634	* either, so it changes the joined node type to ops that are not handled
	2635	* by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
	2636	* 3) This is sufficient for the two Greek sequences (described below), but
	2637	* the one involving the Sharp s (\xDF) needs more. The node type
	2638	* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
	2639	* sequence in it. For non-UTF-8 patterns and strings, this is the only
	2640	* case where there is a possible fold length change. That means that a
	2641	* regular EXACTFU node without UTF-8 involvement doesn't have to concern
	2642	* itself with length changes, and so can be processed faster. regexec.c
	2643	* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
	2644	* is pre-folded by regcomp.c. This saves effort in regex matching.
	2645	* However, probably mostly for historical reasons, the pre-folding isn't
	2646	* done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
	2647	* nodes, as what they fold to isn't known until runtime.) The fold
	2648	* possibilities for the non-UTF8 patterns are quite simple, except for
	2649	* the sharp s. All the ones that don't involve a UTF-8 target string
	2650	* are members of a fold-pair, and arrays are set up for all of them
	2651	* that quickly find the other member of the pair. It might actually
	2652	* be faster to pre-fold these, but it isn't currently done, except for
	2653	* the sharp s. Code elsewhere in this file makes sure that it gets
	2654	* folded to 'ss', even if the pattern isn't UTF-8. This avoids the
	2655	* issues described in the next item.
	2656	* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
	2657	* 'ss' or not is not knowable at compile time. It will match iff the
	2658	* target string is in UTF-8, unlike the EXACTFU nodes, where it always
	2659	* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
	2660	* it can't be folded to "ss" at compile time, unlike EXACTFU does as
	2661	* described in item 3). An assumption that the optimizer part of
	2662	* regexec.c (probably unwittingly) makes is that a character in the
	2663	* pattern corresponds to at most a single character in the target string.
	2664	* (And I do mean character, and not byte here, unlike other parts of the
	2665	* documentation that have never been updated to account for multibyte
	2666	* Unicode.) This assumption is wrong only in this case, as all other
	2667	* cases are either 1-1 folds when no UTF-8 is involved; or is true by
	2668	* virtue of having this file pre-fold UTF-8 patterns. I'm
	2669	* reluctant to try to change this assumption, so instead the code punts.
	2670	* This routine examines EXACTF nodes for the sharp s, and returns a
	2671	* boolean indicating whether or not the node is an EXACTF node that
	2672	* contains a sharp s. When it is true, the caller sets a flag that later
	2673	* causes the optimizer in this file to not set values for the floating
	2674	* and fixed string lengths, and thus avoids the optimizer code in
	2675	* regexec.c that makes the invalid assumption. Thus, there is no
	2676	* optimization based on string lengths for EXACTF nodes that contain the
	2677	* sharp s. This only happens for /id rules (which means the pattern
	2678	* isn't in UTF-8).
	2679	*/
	2680
	2681	#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
	2682	if (PL_regkind[OP(scan)] == EXACT) \
	2683	join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
	2684
	2685	STATIC U32
	2686	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, UV min_subtract, bool has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
	2687	/* Merge several consecutive EXACTish nodes into one. */
	2688	regnode *n = regnext(scan);
	2689	U32 stringok = 1;
	2690	regnode *next = scan + NODE_SZ_STR(scan);
	2691	U32 merged = 0;
	2692	U32 stopnow = 0;
	2693	#ifdef DEBUGGING
	2694	regnode *stop = scan;
	2695	GET_RE_DEBUG_FLAGS_DECL;
	2696	#else
	2697	PERL_UNUSED_ARG(depth);
	2698	#endif
	2699
	2700	PERL_ARGS_ASSERT_JOIN_EXACT;
	2701	#ifndef EXPERIMENTAL_INPLACESCAN
	2702	PERL_UNUSED_ARG(flags);
	2703	PERL_UNUSED_ARG(val);
	2704	#endif
	2705	DEBUG_PEEP("join",scan,depth);
	2706
	2707	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	2708	* EXACT ones that are mergeable to the current one. */
	2709	while (n
	2710	&& (PL_regkind[OP(n)] == NOTHING
	2711	\|\| (stringok && OP(n) == OP(scan)))
	2712	&& NEXT_OFF(n)
	2713	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	2714	{
	2715
	2716	if (OP(n) == TAIL \|\| n > next)
	2717	stringok = 0;
	2718	if (PL_regkind[OP(n)] == NOTHING) {
	2719	DEBUG_PEEP("skip:",n,depth);
	2720	NEXT_OFF(scan) += NEXT_OFF(n);
	2721	next = n + NODE_STEP_REGNODE;
	2722	#ifdef DEBUGGING
	2723	if (stringok)
	2724	stop = n;
	2725	#endif
	2726	n = regnext(n);
	2727	}
	2728	else if (stringok) {
	2729	const unsigned int oldl = STR_LEN(scan);
	2730	regnode * const nnext = regnext(n);
	2731
	2732	if (oldl + STR_LEN(n) > U8_MAX)
	2733	break;
	2734
	2735	DEBUG_PEEP("merg",n,depth);
	2736	merged++;
	2737
	2738	NEXT_OFF(scan) += NEXT_OFF(n);
	2739	STR_LEN(scan) += STR_LEN(n);
	2740	next = n + NODE_SZ_STR(n);
	2741	/* Now we can overwrite n : /
	2742	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2743	#ifdef DEBUGGING
	2744	stop = next - 1;
	2745	#endif
	2746	n = nnext;
	2747	if (stopnow) break;
	2748	}
	2749
	2750	#ifdef EXPERIMENTAL_INPLACESCAN
	2751	if (flags && !NEXT_OFF(n)) {
	2752	DEBUG_PEEP("atch", val, depth);
	2753	if (reg_off_by_arg[OP(n)]) {
	2754	ARG_SET(n, val - n);
	2755	}
	2756	else {
	2757	NEXT_OFF(n) = val - n;
	2758	}
	2759	stopnow = 1;
	2760	}
	2761	#endif
	2762	}
	2763
	2764	*min_subtract = 0;
	2765	*has_exactf_sharp_s = FALSE;
	2766
	2767	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	2768	* can now analyze for sequences of problematic code points. (Prior to
	2769	* this final joining, sequences could have been split over boundaries, and
	2770	* hence missed). The sequences only happen in folding, hence for any
	2771	* non-EXACT EXACTish node */
	2772	if (OP(scan) != EXACT) {
	2773	U8 *s;
	2774	U8 * s0 = (U8*) STRING(scan);
	2775	U8 * const s_end = s0 + STR_LEN(scan);
	2776
	2777	/* The below is perhaps overboard, but this allows us to save a test
	2778	* each time through the loop at the expense of a mask. This is
	2779	* because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
	2780	* single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
	2781	* This uses an exclusive 'or' to find that bit and then inverts it to
	2782	* form a mask, with just a single 0, in the bit position where 'S' and
	2783	* 's' differ. */
	2784	const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
	2785	const U8 s_masked = 's' & S_or_s_mask;
	2786
	2787	/* One pass is made over the node's string looking for all the
	2788	* possibilities. to avoid some tests in the loop, there are two main
	2789	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	2790	* non-UTF-8 */
	2791	if (UTF) {
	2792
	2793	/* There are two problematic Greek code points in Unicode
	2794	* casefolding
	2795	*
	2796	* U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2797	* U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2798	*
	2799	* which casefold to
	2800	*
	2801	* Unicode UTF-8
	2802	*
	2803	* U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2804	* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2805	*
	2806	* This means that in case-insensitive matching (or "loose
	2807	* matching", as Unicode calls it), an EXACTF of length six (the
	2808	* UTF-8 encoded byte length of the above casefolded versions) can
	2809	* match a target string of length two (the byte length of UTF-8
	2810	* encoded U+0390 or U+03B0). This would rather mess up the
	2811	* minimum length computation. (there are other code points that
	2812	* also fold to these two sequences, but the delta is smaller)
	2813	*
	2814	* If these sequences are found, the minimum length is decreased by
	2815	* four (six minus two).
	2816	*
	2817	* Similarly, 'ss' may match the single char and byte LATIN SMALL
	2818	* LETTER SHARP S. We decrease the min length by 1 for each
	2819	* occurrence of 'ss' found */
	2820
	2821	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2822	# define U390_first_byte 0xb4
	2823	const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
	2824	# define U3B0_first_byte 0xb5
	2825	const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
	2826	#else
	2827	# define U390_first_byte 0xce
	2828	const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
	2829	# define U3B0_first_byte 0xcf
	2830	const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
	2831	#endif
	2832	const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
	2833	yields a net of 0 */
	2834	/* Examine the string for one of the problematic sequences */
	2835	for (s = s0;
	2836	s < s_end - 1; /* Can stop 1 before the end, as minimum length
	2837	* sequence we are looking for is 2 */
	2838	s += UTF8SKIP(s))
	2839	{
	2840
	2841	/* Look for the first byte in each problematic sequence */
	2842	switch (*s) {
	2843	/* We don't have to worry about other things that fold to
	2844	* 's' (such as the long s, U+017F), as all above-latin1
	2845	* code points have been pre-folded */
	2846	case 's':
	2847	case 'S':
	2848
	2849	/* Current character is an 's' or 'S'. If next one is
	2850	* as well, we have the dreaded sequence */
	2851	if (((*(s+1) & S_or_s_mask) == s_masked)
	2852	/* These two node types don't have special handling
	2853	* for 'ss' */
	2854	&& OP(scan) != EXACTFL && OP(scan) != EXACTFA)
	2855	{
	2856	*min_subtract += 1;
	2857	OP(scan) = EXACTFU_SS;
	2858	s++; /* No need to look at this character again */
	2859	}
	2860	break;
	2861
	2862	case U390_first_byte:
	2863	if (s_end - s >= len
	2864
	2865	/* The 1's are because are skipping comparing the
	2866	* first byte */
	2867	&& memEQ(s + 1, U390_tail, len - 1))
	2868	{
	2869	goto greek_sequence;
	2870	}
	2871	break;
	2872
	2873	case U3B0_first_byte:
	2874	if (! (s_end - s >= len
	2875	&& memEQ(s + 1, U3B0_tail, len - 1)))
	2876	{
	2877	break;
	2878	}
	2879	greek_sequence:
	2880	*min_subtract += 4;
	2881
	2882	/* This can't currently be handled by trie's, so change
	2883	* the node type to indicate this. If EXACTFA and
	2884	* EXACTFL were ever to be handled by trie's, this
	2885	* would have to be changed. If this node has already
	2886	* been changed to EXACTFU_SS in this loop, leave it as
	2887	* is. (I (khw) think it doesn't matter in regexec.c
	2888	* for UTF patterns, but no need to change it */
	2889	if (OP(scan) == EXACTFU) {
	2890	OP(scan) = EXACTFU_TRICKYFOLD;
	2891	}
	2892	s += 6; /* We already know what this sequence is. Skip
	2893	the rest of it */
	2894	break;
	2895	}
	2896	}
	2897	}
	2898	else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
	2899
	2900	/* Here, the pattern is not UTF-8. We need to look only for the
	2901	* 'ss' sequence, and in the EXACTF case, the sharp s, which can be
	2902	* in the final position. Otherwise we can stop looking 1 byte
	2903	* earlier because have to find both the first and second 's' */
	2904	const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
	2905
	2906	for (s = s0; s < upper; s++) {
	2907	switch (*s) {
	2908	case 'S':
	2909	case 's':
	2910	if (s_end - s > 1
	2911	&& ((*(s+1) & S_or_s_mask) == s_masked))
	2912	{
	2913	*min_subtract += 1;
	2914
	2915	/* EXACTF nodes need to know that the minimum
	2916	* length changed so that a sharp s in the string
	2917	* can match this ss in the pattern, but they
	2918	* remain EXACTF nodes, as they are not trie'able,
	2919	* so don't have to invent a new node type to
	2920	* exclude them from the trie code */
	2921	if (OP(scan) != EXACTF) {
	2922	OP(scan) = EXACTFU_SS;
	2923	}
	2924	s++;
	2925	}
	2926	break;
	2927	case LATIN_SMALL_LETTER_SHARP_S:
	2928	if (OP(scan) == EXACTF) {
	2929	*has_exactf_sharp_s = TRUE;
	2930	}
	2931	break;
	2932	}
	2933	}
	2934	}
	2935	}
	2936
	2937	#ifdef DEBUGGING
	2938	/* Allow dumping but overwriting the collection of skipped
	2939	* ops and/or strings with fake optimized ops */
	2940	n = scan + NODE_SZ_STR(scan);
	2941	while (n <= stop) {
	2942	OP(n) = OPTIMIZED;
	2943	FLAGS(n) = 0;
	2944	NEXT_OFF(n) = 0;
	2945	n++;
	2946	}
	2947	#endif
	2948	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2949	return stopnow;
	2950	}
	2951
	2952	/* REx optimizer. Converts nodes into quicker variants "in place".
	2953	Finds fixed substrings. */
	2954
	2955	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2956	to the position after last scanned or to NULL. */
	2957
	2958	#define INIT_AND_WITHP \
	2959	assert(!and_withp); \
	2960	Newx(and_withp,1,struct regnode_charclass_class); \
	2961	SAVEFREEPV(and_withp)
	2962
	2963	/* this is a chain of data about sub patterns we are processing that
	2964	need to be handled separately/specially in study_chunk. Its so
	2965	we can simulate recursion without losing state. */
	2966	struct scan_frame;
	2967	typedef struct scan_frame {
	2968	regnode last; / last node to process in this frame */
	2969	regnode next; / next node to process when last is reached */
	2970	struct scan_frame prev; /previous frame*/
	2971	I32 stop; /* what stopparen do we use */
	2972	} scan_frame;
	2973
	2974
	2975	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2976
	2977	#define CASE_SYNST_FNC(nAmE) \
	2978	case nAmE: \
	2979	if (flags & SCF_DO_STCLASS_AND) { \
	2980	for (value = 0; value < 256; value++) \
	2981	if (!is_ ## nAmE ## _cp(value)) \
	2982	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2983	} \
	2984	else { \
	2985	for (value = 0; value < 256; value++) \
	2986	if (is_ ## nAmE ## _cp(value)) \
	2987	ANYOF_BITMAP_SET(data->start_class, value); \
	2988	} \
	2989	break; \
	2990	case N ## nAmE: \
	2991	if (flags & SCF_DO_STCLASS_AND) { \
	2992	for (value = 0; value < 256; value++) \
	2993	if (is_ ## nAmE ## _cp(value)) \
	2994	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2995	} \
	2996	else { \
	2997	for (value = 0; value < 256; value++) \
	2998	if (!is_ ## nAmE ## _cp(value)) \
	2999	ANYOF_BITMAP_SET(data->start_class, value); \
	3000	} \
	3001	break
	3002
	3003
	3004
	3005	STATIC I32
	3006	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	3007	I32 minlenp, I32 deltap,
	3008	regnode *last,
	3009	scan_data_t *data,
	3010	I32 stopparen,
	3011	U8* recursed,
	3012	struct regnode_charclass_class *and_withp,
	3013	U32 flags, U32 depth)
	3014	/* scanp: Start here (read-write). */
	3015	/* deltap: Write maxlen-minlen here. */
	3016	/* last: Stop before this one. */
	3017	/* data: string data about the pattern */
	3018	/* stopparen: treat close N as END */
	3019	/* recursed: which subroutines have we recursed into */
	3020	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	3021	{
	3022	dVAR;
	3023	I32 min = 0, pars = 0, code;
	3024	regnode scan = scanp, *next;
	3025	I32 delta = 0;
	3026	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	3027	int is_inf_internal = 0; /* The studied chunk is infinite */
	3028	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	3029	scan_data_t data_fake;
	3030	SV *re_trie_maxbuff = NULL;
	3031	regnode *first_non_open = scan;
	3032	I32 stopmin = I32_MAX;
	3033	scan_frame *frame = NULL;
	3034	GET_RE_DEBUG_FLAGS_DECL;
	3035
	3036	PERL_ARGS_ASSERT_STUDY_CHUNK;
	3037
	3038	#ifdef DEBUGGING
	3039	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3040	#endif
	3041
	3042	if ( depth == 0 ) {
	3043	while (first_non_open && OP(first_non_open) == OPEN)
	3044	first_non_open=regnext(first_non_open);
	3045	}
	3046
	3047
	3048	fake_study_recurse:
	3049	while ( scan && OP(scan) != END && scan < last ){
	3050	UV min_subtract = 0; /* How much to subtract from the minimum node
	3051	length to get a real minimum (because the
	3052	folded version may be shorter) */
	3053	bool has_exactf_sharp_s = FALSE;
	3054	/* Peephole optimizer: */
	3055	DEBUG_STUDYDATA("Peep:", data,depth);
	3056	DEBUG_PEEP("Peep",scan,depth);
	3057
	3058	/* Its not clear to khw or hv why this is done here, and not in the
	3059	* clauses that deal with EXACT nodes. khw's guess is that it's
	3060	* because of a previous design */
	3061	JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
	3062
	3063	/* Follow the next-chain of the current node and optimize
	3064	away all the NOTHINGs from it. */
	3065	if (OP(scan) != CURLYX) {
	3066	const int max = (reg_off_by_arg[OP(scan)]
	3067	? I32_MAX
	3068	/* I32 may be smaller than U16 on CRAYs! */
	3069	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3070	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3071	int noff;
	3072	regnode *n = scan;
	3073
	3074	/* Skip NOTHING and LONGJMP. */
	3075	while ((n = regnext(n))
	3076	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3077	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3078	&& off + noff < max)
	3079	off += noff;
	3080	if (reg_off_by_arg[OP(scan)])
	3081	ARG(scan) = off;
	3082	else
	3083	NEXT_OFF(scan) = off;
	3084	}
	3085
	3086
	3087
	3088	/* The principal pseudo-switch. Cannot be a switch, since we
	3089	look into several different things. */
	3090	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3091	\|\| OP(scan) == IFTHEN) {
	3092	next = regnext(scan);
	3093	code = OP(scan);
	3094	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	3095
	3096	if (OP(next) == code \|\| code == IFTHEN) {
	3097	/* NOTE - There is similar code to this block below for handling
	3098	TRIE nodes on a re-study. If you change stuff here check there
	3099	too. */
	3100	I32 max1 = 0, min1 = I32_MAX, num = 0;
	3101	struct regnode_charclass_class accum;
	3102	regnode * const startbranch=scan;
	3103
	3104	if (flags & SCF_DO_SUBSTR)
	3105	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	3106	if (flags & SCF_DO_STCLASS)
	3107	cl_init_zero(pRExC_state, &accum);
	3108
	3109	while (OP(scan) == code) {
	3110	I32 deltanext, minnext, f = 0, fake;
	3111	struct regnode_charclass_class this_class;
	3112
	3113	num++;
	3114	data_fake.flags = 0;
	3115	if (data) {
	3116	data_fake.whilem_c = data->whilem_c;
	3117	data_fake.last_closep = data->last_closep;
	3118	}
	3119	else
	3120	data_fake.last_closep = &fake;
	3121
	3122	data_fake.pos_delta = delta;
	3123	next = regnext(scan);
	3124	scan = NEXTOPER(scan);
	3125	if (code != BRANCH)
	3126	scan = NEXTOPER(scan);
	3127	if (flags & SCF_DO_STCLASS) {
	3128	cl_init(pRExC_state, &this_class);
	3129	data_fake.start_class = &this_class;
	3130	f = SCF_DO_STCLASS_AND;
	3131	}
	3132	if (flags & SCF_WHILEM_VISITED_POS)
	3133	f \|= SCF_WHILEM_VISITED_POS;
	3134
	3135	/* we suppose the run is continuous, last=next...*/
	3136	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3137	next, &data_fake,
	3138	stopparen, recursed, NULL, f,depth+1);
	3139	if (min1 > minnext)
	3140	min1 = minnext;
	3141	if (max1 < minnext + deltanext)
	3142	max1 = minnext + deltanext;
	3143	if (deltanext == I32_MAX)
	3144	is_inf = is_inf_internal = 1;
	3145	scan = next;
	3146	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3147	pars++;
	3148	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3149	if ( stopmin > minnext)
	3150	stopmin = min + min1;
	3151	flags &= ~SCF_DO_SUBSTR;
	3152	if (data)
	3153	data->flags \|= SCF_SEEN_ACCEPT;
	3154	}
	3155	if (data) {
	3156	if (data_fake.flags & SF_HAS_EVAL)
	3157	data->flags \|= SF_HAS_EVAL;
	3158	data->whilem_c = data_fake.whilem_c;
	3159	}
	3160	if (flags & SCF_DO_STCLASS)
	3161	cl_or(pRExC_state, &accum, &this_class);
	3162	}
	3163	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3164	min1 = 0;
	3165	if (flags & SCF_DO_SUBSTR) {
	3166	data->pos_min += min1;
	3167	data->pos_delta += max1 - min1;
	3168	if (max1 != min1 \|\| is_inf)
	3169	data->longest = &(data->longest_float);
	3170	}
	3171	min += min1;
	3172	delta += max1 - min1;
	3173	if (flags & SCF_DO_STCLASS_OR) {
	3174	cl_or(pRExC_state, data->start_class, &accum);
	3175	if (min1) {
	3176	cl_and(data->start_class, and_withp);
	3177	flags &= ~SCF_DO_STCLASS;
	3178	}
	3179	}
	3180	else if (flags & SCF_DO_STCLASS_AND) {
	3181	if (min1) {
	3182	cl_and(data->start_class, &accum);
	3183	flags &= ~SCF_DO_STCLASS;
	3184	}
	3185	else {
	3186	/* Switch to OR mode: cache the old value of
	3187	* data->start_class */
	3188	INIT_AND_WITHP;
	3189	StructCopy(data->start_class, and_withp,
	3190	struct regnode_charclass_class);
	3191	flags &= ~SCF_DO_STCLASS_AND;
	3192	StructCopy(&accum, data->start_class,
	3193	struct regnode_charclass_class);
	3194	flags \|= SCF_DO_STCLASS_OR;
	3195	data->start_class->flags \|= ANYOF_EOS;
	3196	}
	3197	}
	3198
	3199	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	3200	/* demq.
	3201
	3202	Assuming this was/is a branch we are dealing with: 'scan' now
	3203	points at the item that follows the branch sequence, whatever
	3204	it is. We now start at the beginning of the sequence and look
	3205	for subsequences of
	3206
	3207	BRANCH->EXACT=>x1
	3208	BRANCH->EXACT=>x2
	3209	tail
	3210
	3211	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	3212
	3213	If we can find such a subsequence we need to turn the first
	3214	element into a trie and then add the subsequent branch exact
	3215	strings to the trie.
	3216
	3217	We have two cases
	3218
	3219	1. patterns where the whole set of branches can be converted.
	3220
	3221	2. patterns where only a subset can be converted.
	3222
	3223	In case 1 we can replace the whole set with a single regop
	3224	for the trie. In case 2 we need to keep the start and end
	3225	branches so
	3226
	3227	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3228	becomes BRANCH TRIE; BRANCH X;
	3229
	3230	There is an additional case, that being where there is a
	3231	common prefix, which gets split out into an EXACT like node
	3232	preceding the TRIE node.
	3233
	3234	If x(1..n)==tail then we can do a simple trie, if not we make
	3235	a "jump" trie, such that when we match the appropriate word
	3236	we "jump" to the appropriate tail node. Essentially we turn
	3237	a nested if into a case structure of sorts.
	3238
	3239	*/
	3240
	3241	int made=0;
	3242	if (!re_trie_maxbuff) {
	3243	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3244	if (!SvIOK(re_trie_maxbuff))
	3245	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3246	}
	3247	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3248	regnode *cur;
	3249	regnode first = (regnode )NULL;
	3250	regnode last = (regnode )NULL;
	3251	regnode *tail = scan;
	3252	U8 trietype = 0;
	3253	U32 count=0;
	3254
	3255	#ifdef DEBUGGING
	3256	SV * const mysv = sv_newmortal(); /* for dumping */
	3257	#endif
	3258	/* var tail is used because there may be a TAIL
	3259	regop in the way. Ie, the exacts will point to the
	3260	thing following the TAIL, but the last branch will
	3261	point at the TAIL. So we advance tail. If we
	3262	have nested (?:) we may have to move through several
	3263	tails.
	3264	*/
	3265
	3266	while ( OP( tail ) == TAIL ) {
	3267	/* this is the TAIL generated by (?:) */
	3268	tail = regnext( tail );
	3269	}
	3270
	3271
	3272	DEBUG_TRIE_COMPILE_r({
	3273	regprop(RExC_rx, mysv, tail );
	3274	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3275	(int)depth * 2 + 2, "",
	3276	"Looking for TRIE'able sequences. Tail node is: ",
	3277	SvPV_nolen_const( mysv )
	3278	);
	3279	});
	3280
	3281	/*
	3282
	3283	Step through the branches
	3284	cur represents each branch,
	3285	noper is the first thing to be matched as part of that branch
	3286	noper_next is the regnext() of that node.
	3287
	3288	We normally handle a case like this /FOO[xyz]\|BAR[pqr]/
	3289	via a "jump trie" but we also support building with NOJUMPTRIE,
	3290	which restricts the trie logic to structures like /FOO\|BAR/.
	3291
	3292	If noper is a trieable nodetype then the branch is a possible optimization
	3293	target. If we are building under NOJUMPTRIE then we require that noper_next
	3294	is the same as scan (our current position in the regex program).
	3295
	3296	Once we have two or more consecutive such branches we can create a
	3297	trie of the EXACT's contents and stitch it in place into the program.
	3298
	3299	If the sequence represents all of the branches in the alternation we
	3300	replace the entire thing with a single TRIE node.
	3301
	3302	Otherwise when it is a subsequence we need to stitch it in place and
	3303	replace only the relevant branches. This means the first branch has
	3304	to remain as it is used by the alternation logic, and its next pointer,
	3305	and needs to be repointed at the item on the branch chain following
	3306	the last branch we have optimized away.
	3307
	3308	This could be either a BRANCH, in which case the subsequence is internal,
	3309	or it could be the item following the branch sequence in which case the
	3310	subsequence is at the end (which does not necessarily mean the first node
	3311	is the start of the alternation).
	3312
	3313	TRIE_TYPE(X) is a define which maps the optype to a trietype.
	3314
	3315	optype \| trietype
	3316	----------------+-----------
	3317	NOTHING \| NOTHING
	3318	EXACT \| EXACT
	3319	EXACTFU \| EXACTFU
	3320	EXACTFU_SS \| EXACTFU
	3321	EXACTFU_TRICKYFOLD \| EXACTFU
	3322	EXACTFA \| 0
	3323
	3324
	3325	*/
	3326	#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
	3327	( EXACT == (X) ) ? EXACT : \
	3328	( EXACTFU == (X) \|\| EXACTFU_SS == (X) \|\| EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
	3329	0 )
	3330
	3331	/* dont use tail as the end marker for this traverse */
	3332	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3333	regnode * const noper = NEXTOPER( cur );
	3334	U8 noper_type = OP( noper );
	3335	U8 noper_trietype = TRIE_TYPE( noper_type );
	3336	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3337	regnode * const noper_next = regnext( noper );
	3338	U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
	3339	U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
	3340	#endif
	3341
	3342	DEBUG_TRIE_COMPILE_r({
	3343	regprop(RExC_rx, mysv, cur);
	3344	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3345	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3346
	3347	regprop(RExC_rx, mysv, noper);
	3348	PerlIO_printf( Perl_debug_log, " -> %s",
	3349	SvPV_nolen_const(mysv));
	3350
	3351	if ( noper_next ) {
	3352	regprop(RExC_rx, mysv, noper_next );
	3353	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3354	SvPV_nolen_const(mysv));
	3355	}
	3356	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
	3357	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
	3358	PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
	3359	);
	3360	});
	3361
	3362	/* Is noper a trieable nodetype that can be merged with the
	3363	* current trie (if there is one)? */
	3364	if ( noper_trietype
	3365	&&
	3366	(
	3367	( noper_trietype == NOTHING)
	3368	\|\| ( trietype == NOTHING )
	3369	\|\| ( trietype == noper_trietype )
	3370	)
	3371	#ifdef NOJUMPTRIE
	3372	&& noper_next == tail
	3373	#endif
	3374	&& count < U16_MAX)
	3375	{
	3376	/* Handle mergable triable node
	3377	* Either we are the first node in a new trieable sequence,
	3378	* in which case we do some bookkeeping, otherwise we update
	3379	* the end pointer. */
	3380	if ( !first ) {
	3381	first = cur;
	3382	if ( noper_trietype == NOTHING ) {
	3383	#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
	3384	regnode * const noper_next = regnext( noper );
	3385	U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
	3386	U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
	3387	#endif
	3388
	3389	if ( noper_next_trietype ) {
	3390	trietype = noper_next_trietype;
	3391	} else if (noper_next_type) {
	3392	/* a NOTHING regop is 1 regop wide. We need at least two
	3393	* for a trie so we can't merge this in */
	3394	first = NULL;
	3395	}
	3396	} else {
	3397	trietype = noper_trietype;
	3398	}
	3399	} else {
	3400	if ( trietype == NOTHING )
	3401	trietype = noper_trietype;
	3402	last = cur;
	3403	}
	3404	if (first)
	3405	count++;
	3406	} /* end handle mergable triable node */
	3407	else {
	3408	/* handle unmergable node -
	3409	* noper may either be a triable node which can not be tried
	3410	* together with the current trie, or a non triable node */
	3411	if ( last ) {
	3412	/* If last is set and trietype is not NOTHING then we have found
	3413	* at least two triable branch sequences in a row of a similar
	3414	* trietype so we can turn them into a trie. If/when we
	3415	* allow NOTHING to start a trie sequence this condition will be
	3416	* required, and it isn't expensive so we leave it in for now. */
	3417	if ( trietype != NOTHING )
	3418	make_trie( pRExC_state,
	3419	startbranch, first, cur, tail, count,
	3420	trietype, depth+1 );
	3421	last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
	3422	}
	3423	if ( noper_trietype
	3424	#ifdef NOJUMPTRIE
	3425	&& noper_next == tail
	3426	#endif
	3427	){
	3428	/* noper is triable, so we can start a new trie sequence */
	3429	count = 1;
	3430	first = cur;
	3431	trietype = noper_trietype;
	3432	} else if (first) {
	3433	/* if we already saw a first but the current node is not triable then we have
	3434	* to reset the first information. */
	3435	count = 0;
	3436	first = NULL;
	3437	trietype = 0;
	3438	}
	3439	} /* end handle unmergable node */
	3440	} /* loop over branches */
	3441	DEBUG_TRIE_COMPILE_r({
	3442	regprop(RExC_rx, mysv, cur);
	3443	PerlIO_printf( Perl_debug_log,
	3444	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3445	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3446
	3447	});
	3448	if ( last ) {
	3449	if ( trietype != NOTHING ) {
	3450	/* the last branch of the sequence was part of a trie,
	3451	* so we have to construct it here outside of the loop
	3452	*/
	3453	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
	3454	#ifdef TRIE_STUDY_OPT
	3455	if ( ((made == MADE_EXACT_TRIE &&
	3456	startbranch == first)
	3457	\|\| ( first_non_open == first )) &&
	3458	depth==0 ) {
	3459	flags \|= SCF_TRIE_RESTUDY;
	3460	if ( startbranch == first
	3461	&& scan == tail )
	3462	{
	3463	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3464	}
	3465	}
	3466	#endif
	3467	} else {
	3468	/* at this point we know whatever we have is a NOTHING sequence/branch
	3469	* AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
	3470	*/
	3471	if ( startbranch == first ) {
	3472	regnode *opt;
	3473	/* the entire thing is a NOTHING sequence, something like this:
	3474	* (?:\|) So we can turn it into a plain NOTHING op. */
	3475	DEBUG_TRIE_COMPILE_r({
	3476	regprop(RExC_rx, mysv, cur);
	3477	PerlIO_printf( Perl_debug_log,
	3478	"%s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth 2 + 2,
	3479	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3480
	3481	});
	3482	OP(startbranch)= NOTHING;
	3483	NEXT_OFF(startbranch)= tail - startbranch;
	3484	for ( opt= startbranch + 1; opt < tail ; opt++ )
	3485	OP(opt)= OPTIMIZED;
	3486	}
	3487	}
	3488	} /* end if ( last) */
	3489	} /* TRIE_MAXBUF is non zero */
	3490
	3491	} /* do trie */
	3492
	3493	}
	3494	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3495	scan = NEXTOPER(NEXTOPER(scan));
	3496	} else /* single branch is optimized. */
	3497	scan = NEXTOPER(scan);
	3498	continue;
	3499	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3500	scan_frame *newframe = NULL;
	3501	I32 paren;
	3502	regnode *start;
	3503	regnode *end;
	3504
	3505	if (OP(scan) != SUSPEND) {
	3506	/* set the pointer */
	3507	if (OP(scan) == GOSUB) {
	3508	paren = ARG(scan);
	3509	RExC_recurse[ARG2L(scan)] = scan;
	3510	start = RExC_open_parens[paren-1];
	3511	end = RExC_close_parens[paren-1];
	3512	} else {
	3513	paren = 0;
	3514	start = RExC_rxi->program + 1;
	3515	end = RExC_opend;
	3516	}
	3517	if (!recursed) {
	3518	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3519	SAVEFREEPV(recursed);
	3520	}
	3521	if (!PAREN_TEST(recursed,paren+1)) {
	3522	PAREN_SET(recursed,paren+1);
	3523	Newx(newframe,1,scan_frame);
	3524	} else {
	3525	if (flags & SCF_DO_SUBSTR) {
	3526	SCAN_COMMIT(pRExC_state,data,minlenp);
	3527	data->longest = &(data->longest_float);
	3528	}
	3529	is_inf = is_inf_internal = 1;
	3530	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3531	cl_anything(pRExC_state, data->start_class);
	3532	flags &= ~SCF_DO_STCLASS;
	3533	}
	3534	} else {
	3535	Newx(newframe,1,scan_frame);
	3536	paren = stopparen;
	3537	start = scan+2;
	3538	end = regnext(scan);
	3539	}
	3540	if (newframe) {
	3541	assert(start);
	3542	assert(end);
	3543	SAVEFREEPV(newframe);
	3544	newframe->next = regnext(scan);
	3545	newframe->last = last;
	3546	newframe->stop = stopparen;
	3547	newframe->prev = frame;
	3548
	3549	frame = newframe;
	3550	scan = start;
	3551	stopparen = paren;
	3552	last = end;
	3553
	3554	continue;
	3555	}
	3556	}
	3557	else if (OP(scan) == EXACT) {
	3558	I32 l = STR_LEN(scan);
	3559	UV uc;
	3560	if (UTF) {
	3561	const U8 * const s = (U8*)STRING(scan);
	3562	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3563	l = utf8_length(s, s + l);
	3564	} else {
	3565	uc = ((U8)STRING(scan));
	3566	}
	3567	min += l;
	3568	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3569	/* The code below prefers earlier match for fixed
	3570	offset, later match for variable offset. */
	3571	if (data->last_end == -1) { /* Update the start info. */
	3572	data->last_start_min = data->pos_min;
	3573	data->last_start_max = is_inf
	3574	? I32_MAX : data->pos_min + data->pos_delta;
	3575	}
	3576	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3577	if (UTF)
	3578	SvUTF8_on(data->last_found);
	3579	{
	3580	SV * const sv = data->last_found;
	3581	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3582	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3583	if (mg && mg->mg_len >= 0)
	3584	mg->mg_len += utf8_length((U8*)STRING(scan),
	3585	(U8*)STRING(scan)+STR_LEN(scan));
	3586	}
	3587	data->last_end = data->pos_min + l;
	3588	data->pos_min += l; /* As in the first entry. */
	3589	data->flags &= ~SF_BEFORE_EOL;
	3590	}
	3591	if (flags & SCF_DO_STCLASS_AND) {
	3592	/* Check whether it is compatible with what we know already! */
	3593	int compat = 1;
	3594
	3595
	3596	/* If compatible, we or it in below. It is compatible if is
	3597	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3598	* it's for a locale. Even if there isn't unicode semantics
	3599	* here, at runtime there may be because of matching against a
	3600	* utf8 string, so accept a possible false positive for
	3601	* latin1-range folds */
	3602	if (uc >= 0x100 \|\|
	3603	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3604	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3605	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3606	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3607	)
	3608	{
	3609	compat = 0;
	3610	}
	3611	ANYOF_CLASS_ZERO(data->start_class);
	3612	ANYOF_BITMAP_ZERO(data->start_class);
	3613	if (compat)
	3614	ANYOF_BITMAP_SET(data->start_class, uc);
	3615	else if (uc >= 0x100) {
	3616	int i;
	3617
	3618	/* Some Unicode code points fold to the Latin1 range; as
	3619	* XXX temporary code, instead of figuring out if this is
	3620	* one, just assume it is and set all the start class bits
	3621	* that could be some such above 255 code point's fold
	3622	* which will generate fals positives. As the code
	3623	* elsewhere that does compute the fold settles down, it
	3624	* can be extracted out and re-used here */
	3625	for (i = 0; i < 256; i++){
	3626	if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
	3627	ANYOF_BITMAP_SET(data->start_class, i);
	3628	}
	3629	}
	3630	}
	3631	data->start_class->flags &= ~ANYOF_EOS;
	3632	if (uc < 0x100)
	3633	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3634	}
	3635	else if (flags & SCF_DO_STCLASS_OR) {
	3636	/* false positive possible if the class is case-folded */
	3637	if (uc < 0x100)
	3638	ANYOF_BITMAP_SET(data->start_class, uc);
	3639	else
	3640	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3641	data->start_class->flags &= ~ANYOF_EOS;
	3642	cl_and(data->start_class, and_withp);
	3643	}
	3644	flags &= ~SCF_DO_STCLASS;
	3645	}
	3646	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3647	I32 l = STR_LEN(scan);
	3648	UV uc = ((U8)STRING(scan));
	3649
	3650	/* Search for fixed substrings supports EXACT only. */
	3651	if (flags & SCF_DO_SUBSTR) {
	3652	assert(data);
	3653	SCAN_COMMIT(pRExC_state, data, minlenp);
	3654	}
	3655	if (UTF) {
	3656	const U8 * const s = (U8 *)STRING(scan);
	3657	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3658	l = utf8_length(s, s + l);
	3659	}
	3660	else if (has_exactf_sharp_s) {
	3661	RExC_seen \|= REG_SEEN_EXACTF_SHARP_S;
	3662	}
	3663	min += l - min_subtract;
	3664	if (min < 0) {
	3665	min = 0;
	3666	}
	3667	delta += min_subtract;
	3668	if (flags & SCF_DO_SUBSTR) {
	3669	data->pos_min += l - min_subtract;
	3670	if (data->pos_min < 0) {
	3671	data->pos_min = 0;
	3672	}
	3673	data->pos_delta += min_subtract;
	3674	if (min_subtract) {
	3675	data->longest = &(data->longest_float);
	3676	}
	3677	}
	3678	if (flags & SCF_DO_STCLASS_AND) {
	3679	/* Check whether it is compatible with what we know already! */
	3680	int compat = 1;
	3681	if (uc >= 0x100 \|\|
	3682	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3683	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3684	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3685	{
	3686	compat = 0;
	3687	}
	3688	ANYOF_CLASS_ZERO(data->start_class);
	3689	ANYOF_BITMAP_ZERO(data->start_class);
	3690	if (compat) {
	3691	ANYOF_BITMAP_SET(data->start_class, uc);
	3692	data->start_class->flags &= ~ANYOF_EOS;
	3693	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3694	if (OP(scan) == EXACTFL) {
	3695	/* XXX This set is probably no longer necessary, and
	3696	* probably wrong as LOCALE now is on in the initial
	3697	* state */
	3698	data->start_class->flags \|= ANYOF_LOCALE;
	3699	}
	3700	else {
	3701
	3702	/* Also set the other member of the fold pair. In case
	3703	* that unicode semantics is called for at runtime, use
	3704	* the full latin1 fold. (Can't do this for locale,
	3705	* because not known until runtime) */
	3706	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3707
	3708	/* All other (EXACTFL handled above) folds except under
	3709	* /iaa that include s, S, and sharp_s also may include
	3710	* the others */
	3711	if (OP(scan) != EXACTFA) {
	3712	if (uc == 's' \|\| uc == 'S') {
	3713	ANYOF_BITMAP_SET(data->start_class,
	3714	LATIN_SMALL_LETTER_SHARP_S);
	3715	}
	3716	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3717	ANYOF_BITMAP_SET(data->start_class, 's');
	3718	ANYOF_BITMAP_SET(data->start_class, 'S');
	3719	}
	3720	}
	3721	}
	3722	}
	3723	else if (uc >= 0x100) {
	3724	int i;
	3725	for (i = 0; i < 256; i++){
	3726	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3727	ANYOF_BITMAP_SET(data->start_class, i);
	3728	}
	3729	}
	3730	}
	3731	}
	3732	else if (flags & SCF_DO_STCLASS_OR) {
	3733	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3734	/* false positive possible if the class is case-folded.
	3735	Assume that the locale settings are the same... */
	3736	if (uc < 0x100) {
	3737	ANYOF_BITMAP_SET(data->start_class, uc);
	3738	if (OP(scan) != EXACTFL) {
	3739
	3740	/* And set the other member of the fold pair, but
	3741	* can't do that in locale because not known until
	3742	* run-time */
	3743	ANYOF_BITMAP_SET(data->start_class,
	3744	PL_fold_latin1[uc]);
	3745
	3746	/* All folds except under /iaa that include s, S,
	3747	* and sharp_s also may include the others */
	3748	if (OP(scan) != EXACTFA) {
	3749	if (uc == 's' \|\| uc == 'S') {
	3750	ANYOF_BITMAP_SET(data->start_class,
	3751	LATIN_SMALL_LETTER_SHARP_S);
	3752	}
	3753	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3754	ANYOF_BITMAP_SET(data->start_class, 's');
	3755	ANYOF_BITMAP_SET(data->start_class, 'S');
	3756	}
	3757	}
	3758	}
	3759	}
	3760	data->start_class->flags &= ~ANYOF_EOS;
	3761	}
	3762	cl_and(data->start_class, and_withp);
	3763	}
	3764	flags &= ~SCF_DO_STCLASS;
	3765	}
	3766	else if (REGNODE_VARIES(OP(scan))) {
	3767	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3768	I32 f = flags, pos_before = 0;
	3769	regnode * const oscan = scan;
	3770	struct regnode_charclass_class this_class;
	3771	struct regnode_charclass_class *oclass = NULL;
	3772	I32 next_is_eval = 0;
	3773
	3774	switch (PL_regkind[OP(scan)]) {
	3775	case WHILEM: /* End of (?:...)* . */
	3776	scan = NEXTOPER(scan);
	3777	goto finish;
	3778	case PLUS:
	3779	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3780	next = NEXTOPER(scan);
	3781	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3782	mincount = 1;
	3783	maxcount = REG_INFTY;
	3784	next = regnext(scan);
	3785	scan = NEXTOPER(scan);
	3786	goto do_curly;
	3787	}
	3788	}
	3789	if (flags & SCF_DO_SUBSTR)
	3790	data->pos_min++;
	3791	min++;
	3792	/* Fall through. */
	3793	case STAR:
	3794	if (flags & SCF_DO_STCLASS) {
	3795	mincount = 0;
	3796	maxcount = REG_INFTY;
	3797	next = regnext(scan);
	3798	scan = NEXTOPER(scan);
	3799	goto do_curly;
	3800	}
	3801	is_inf = is_inf_internal = 1;
	3802	scan = regnext(scan);
	3803	if (flags & SCF_DO_SUBSTR) {
	3804	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3805	data->longest = &(data->longest_float);
	3806	}
	3807	goto optimize_curly_tail;
	3808	case CURLY:
	3809	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3810	&& (scan->flags == stopparen))
	3811	{
	3812	mincount = 1;
	3813	maxcount = 1;
	3814	} else {
	3815	mincount = ARG1(scan);
	3816	maxcount = ARG2(scan);
	3817	}
	3818	next = regnext(scan);
	3819	if (OP(scan) == CURLYX) {
	3820	I32 lp = (data ? *(data->last_closep) : 0);
	3821	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3822	}
	3823	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3824	next_is_eval = (OP(scan) == EVAL);
	3825	do_curly:
	3826	if (flags & SCF_DO_SUBSTR) {
	3827	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3828	pos_before = data->pos_min;
	3829	}
	3830	if (data) {
	3831	fl = data->flags;
	3832	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3833	if (is_inf)
	3834	data->flags \|= SF_IS_INF;
	3835	}
	3836	if (flags & SCF_DO_STCLASS) {
	3837	cl_init(pRExC_state, &this_class);
	3838	oclass = data->start_class;
	3839	data->start_class = &this_class;
	3840	f \|= SCF_DO_STCLASS_AND;
	3841	f &= ~SCF_DO_STCLASS_OR;
	3842	}
	3843	/* Exclude from super-linear cache processing any {n,m}
	3844	regops for which the combination of input pos and regex
	3845	pos is not enough information to determine if a match
	3846	will be possible.
	3847
	3848	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3849	regex pos at the \s*, the prospects for a match depend not
	3850	only on the input position but also on how many (bar\s*)
	3851	repeats into the {4,8} we are. */
	3852	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3853	f &= ~SCF_WHILEM_VISITED_POS;
	3854
	3855	/* This will finish on WHILEM, setting scan, or on NULL: */
	3856	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3857	last, data, stopparen, recursed, NULL,
	3858	(mincount == 0
	3859	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3860
	3861	if (flags & SCF_DO_STCLASS)
	3862	data->start_class = oclass;
	3863	if (mincount == 0 \|\| minnext == 0) {
	3864	if (flags & SCF_DO_STCLASS_OR) {
	3865	cl_or(pRExC_state, data->start_class, &this_class);
	3866	}
	3867	else if (flags & SCF_DO_STCLASS_AND) {
	3868	/* Switch to OR mode: cache the old value of
	3869	* data->start_class */
	3870	INIT_AND_WITHP;
	3871	StructCopy(data->start_class, and_withp,
	3872	struct regnode_charclass_class);
	3873	flags &= ~SCF_DO_STCLASS_AND;
	3874	StructCopy(&this_class, data->start_class,
	3875	struct regnode_charclass_class);
	3876	flags \|= SCF_DO_STCLASS_OR;
	3877	data->start_class->flags \|= ANYOF_EOS;
	3878	}
	3879	} else { /* Non-zero len */
	3880	if (flags & SCF_DO_STCLASS_OR) {
	3881	cl_or(pRExC_state, data->start_class, &this_class);
	3882	cl_and(data->start_class, and_withp);
	3883	}
	3884	else if (flags & SCF_DO_STCLASS_AND)
	3885	cl_and(data->start_class, &this_class);
	3886	flags &= ~SCF_DO_STCLASS;
	3887	}
	3888	if (!scan) /* It was not CURLYX, but CURLY. */
	3889	scan = next;
	3890	if ( /* ? quantifier ok, except for (?{ ... }) */
	3891	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3892	&& (minnext == 0) && (deltanext == 0)
	3893	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3894	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3895	{
	3896	ckWARNreg(RExC_parse,
	3897	"Quantifier unexpected on zero-length expression");
	3898	}
	3899
	3900	min += minnext * mincount;
	3901	is_inf_internal \|= ((maxcount == REG_INFTY
	3902	&& (minnext + deltanext) > 0)
	3903	\|\| deltanext == I32_MAX);
	3904	is_inf \|= is_inf_internal;
	3905	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3906
	3907	/* Try powerful optimization CURLYX => CURLYN. */
	3908	if ( OP(oscan) == CURLYX && data
	3909	&& data->flags & SF_IN_PAR
	3910	&& !(data->flags & SF_HAS_EVAL)
	3911	&& !deltanext && minnext == 1 ) {
	3912	/* Try to optimize to CURLYN. */
	3913	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3914	regnode * const nxt1 = nxt;
	3915	#ifdef DEBUGGING
	3916	regnode *nxt2;
	3917	#endif
	3918
	3919	/* Skip open. */
	3920	nxt = regnext(nxt);
	3921	if (!REGNODE_SIMPLE(OP(nxt))
	3922	&& !(PL_regkind[OP(nxt)] == EXACT
	3923	&& STR_LEN(nxt) == 1))
	3924	goto nogo;
	3925	#ifdef DEBUGGING
	3926	nxt2 = nxt;
	3927	#endif
	3928	nxt = regnext(nxt);
	3929	if (OP(nxt) != CLOSE)
	3930	goto nogo;
	3931	if (RExC_open_parens) {
	3932	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3933	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3934	}
	3935	/* Now we know that nxt2 is the only contents: */
	3936	oscan->flags = (U8)ARG(nxt);
	3937	OP(oscan) = CURLYN;
	3938	OP(nxt1) = NOTHING; /* was OPEN. */
	3939
	3940	#ifdef DEBUGGING
	3941	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3942	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3943	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3944	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3945	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3946	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3947	#endif
	3948	}
	3949	nogo:
	3950
	3951	/* Try optimization CURLYX => CURLYM. */
	3952	if ( OP(oscan) == CURLYX && data
	3953	&& !(data->flags & SF_HAS_PAR)
	3954	&& !(data->flags & SF_HAS_EVAL)
	3955	&& !deltanext /* atom is fixed width */
	3956	&& minnext != 0 /* CURLYM can't handle zero width */
	3957	) {
	3958	/* XXXX How to optimize if data == 0? */
	3959	/* Optimize to a simpler form. */
	3960	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3961	regnode *nxt2;
	3962
	3963	OP(oscan) = CURLYM;
	3964	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3965	&& (OP(nxt2) != WHILEM))
	3966	nxt = nxt2;
	3967	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3968	/* Need to optimize away parenths. */
	3969	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3970	/* Set the parenth number. */
	3971	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3972
	3973	oscan->flags = (U8)ARG(nxt);
	3974	if (RExC_open_parens) {
	3975	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3976	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3977	}
	3978	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3979	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3980
	3981	#ifdef DEBUGGING
	3982	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3983	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3984	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3985	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3986	#endif
	3987	#if 0
	3988	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3989	regnode *nnxt = regnext(nxt1);
	3990	if (nnxt == nxt) {
	3991	if (reg_off_by_arg[OP(nxt1)])
	3992	ARG_SET(nxt1, nxt2 - nxt1);
	3993	else if (nxt2 - nxt1 < U16_MAX)
	3994	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3995	else
	3996	OP(nxt) = NOTHING; /* Cannot beautify */
	3997	}
	3998	nxt1 = nnxt;
	3999	}
	4000	#endif
	4001	/* Optimize again: */
	4002	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	4003	NULL, stopparen, recursed, NULL, 0,depth+1);
	4004	}
	4005	else
	4006	oscan->flags = 0;
	4007	}
	4008	else if ((OP(oscan) == CURLYX)
	4009	&& (flags & SCF_WHILEM_VISITED_POS)
	4010	/* See the comment on a similar expression above.
	4011	However, this time it's not a subexpression
	4012	we care about, but the expression itself. */
	4013	&& (maxcount == REG_INFTY)
	4014	&& data && ++data->whilem_c < 16) {
	4015	/* This stays as CURLYX, we can put the count/of pair. */
	4016	/* Find WHILEM (as in regexec.c) */
	4017	regnode *nxt = oscan + NEXT_OFF(oscan);
	4018
	4019	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	4020	nxt += ARG(nxt);
	4021	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	4022	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	4023	}
	4024	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	4025	pars++;
	4026	if (flags & SCF_DO_SUBSTR) {
	4027	SV *last_str = NULL;
	4028	int counted = mincount != 0;
	4029
	4030	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	4031	#if defined(SPARC64_GCC_WORKAROUND)
	4032	I32 b = 0;
	4033	STRLEN l = 0;
	4034	const char *s = NULL;
	4035	I32 old = 0;
	4036
	4037	if (pos_before >= data->last_start_min)
	4038	b = pos_before;
	4039	else
	4040	b = data->last_start_min;
	4041
	4042	l = 0;
	4043	s = SvPV_const(data->last_found, l);
	4044	old = b - data->last_start_min;
	4045
	4046	#else
	4047	I32 b = pos_before >= data->last_start_min
	4048	? pos_before : data->last_start_min;
	4049	STRLEN l;
	4050	const char * const s = SvPV_const(data->last_found, l);
	4051	I32 old = b - data->last_start_min;
	4052	#endif
	4053
	4054	if (UTF)
	4055	old = utf8_hop((U8)s, old) - (U8)s;
	4056	l -= old;
	4057	/* Get the added string: */
	4058	last_str = newSVpvn_utf8(s + old, l, UTF);
	4059	if (deltanext == 0 && pos_before == b) {
	4060	/* What was added is a constant string */
	4061	if (mincount > 1) {
	4062	SvGROW(last_str, (mincount * l) + 1);
	4063	repeatcpy(SvPVX(last_str) + l,
	4064	SvPVX_const(last_str), l, mincount - 1);
	4065	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	4066	/* Add additional parts. */
	4067	SvCUR_set(data->last_found,
	4068	SvCUR(data->last_found) - l);
	4069	sv_catsv(data->last_found, last_str);
	4070	{
	4071	SV * sv = data->last_found;
	4072	MAGIC *mg =
	4073	SvUTF8(sv) && SvMAGICAL(sv) ?
	4074	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4075	if (mg && mg->mg_len >= 0)
	4076	mg->mg_len += CHR_SVLEN(last_str) - l;
	4077	}
	4078	data->last_end += l * (mincount - 1);
	4079	}
	4080	} else {
	4081	/* start offset must point into the last copy */
	4082	data->last_start_min += minnext * (mincount - 1);
	4083	data->last_start_max += is_inf ? I32_MAX
	4084	: (maxcount - 1) * (minnext + data->pos_delta);
	4085	}
	4086	}
	4087	/* It is counted once already... */
	4088	data->pos_min += minnext * (mincount - counted);
	4089	data->pos_delta += - counted * deltanext +
	4090	(minnext + deltanext) * maxcount - minnext * mincount;
	4091	if (mincount != maxcount) {
	4092	/* Cannot extend fixed substrings found inside
	4093	the group. */
	4094	SCAN_COMMIT(pRExC_state,data,minlenp);
	4095	if (mincount && last_str) {
	4096	SV * const sv = data->last_found;
	4097	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	4098	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4099
	4100	if (mg)
	4101	mg->mg_len = -1;
	4102	sv_setsv(sv, last_str);
	4103	data->last_end = data->pos_min;
	4104	data->last_start_min =
	4105	data->pos_min - CHR_SVLEN(last_str);
	4106	data->last_start_max = is_inf
	4107	? I32_MAX
	4108	: data->pos_min + data->pos_delta
	4109	- CHR_SVLEN(last_str);
	4110	}
	4111	data->longest = &(data->longest_float);
	4112	}
	4113	SvREFCNT_dec(last_str);
	4114	}
	4115	if (data && (fl & SF_HAS_EVAL))
	4116	data->flags \|= SF_HAS_EVAL;
	4117	optimize_curly_tail:
	4118	if (OP(oscan) != CURLYX) {
	4119	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	4120	&& NEXT_OFF(next))
	4121	NEXT_OFF(oscan) += NEXT_OFF(next);
	4122	}
	4123	continue;
	4124	default: /* REF, ANYOFV, and CLUMP only? */
	4125	if (flags & SCF_DO_SUBSTR) {
	4126	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4127	data->longest = &(data->longest_float);
	4128	}
	4129	is_inf = is_inf_internal = 1;
	4130	if (flags & SCF_DO_STCLASS_OR)
	4131	cl_anything(pRExC_state, data->start_class);
	4132	flags &= ~SCF_DO_STCLASS;
	4133	break;
	4134	}
	4135	}
	4136	else if (OP(scan) == LNBREAK) {
	4137	if (flags & SCF_DO_STCLASS) {
	4138	int value = 0;
	4139	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4140	if (flags & SCF_DO_STCLASS_AND) {
	4141	for (value = 0; value < 256; value++)
	4142	if (!is_VERTWS_cp(value))
	4143	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4144	}
	4145	else {
	4146	for (value = 0; value < 256; value++)
	4147	if (is_VERTWS_cp(value))
	4148	ANYOF_BITMAP_SET(data->start_class, value);
	4149	}
	4150	if (flags & SCF_DO_STCLASS_OR)
	4151	cl_and(data->start_class, and_withp);
	4152	flags &= ~SCF_DO_STCLASS;
	4153	}
	4154	min += 1;
	4155	delta += 1;
	4156	if (flags & SCF_DO_SUBSTR) {
	4157	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4158	data->pos_min += 1;
	4159	data->pos_delta += 1;
	4160	data->longest = &(data->longest_float);
	4161	}
	4162	}
	4163	else if (REGNODE_SIMPLE(OP(scan))) {
	4164	int value = 0;
	4165
	4166	if (flags & SCF_DO_SUBSTR) {
	4167	SCAN_COMMIT(pRExC_state,data,minlenp);
	4168	data->pos_min++;
	4169	}
	4170	min++;
	4171	if (flags & SCF_DO_STCLASS) {
	4172	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4173
	4174	/* Some of the logic below assumes that switching
	4175	locale on will only add false positives. */
	4176	switch (PL_regkind[OP(scan)]) {
	4177	case SANY:
	4178	default:
	4179	do_default:
	4180	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	4181	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4182	cl_anything(pRExC_state, data->start_class);
	4183	break;
	4184	case REG_ANY:
	4185	if (OP(scan) == SANY)
	4186	goto do_default;
	4187	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	4188	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	4189	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	4190	cl_anything(pRExC_state, data->start_class);
	4191	}
	4192	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	4193	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	4194	break;
	4195	case ANYOF:
	4196	if (flags & SCF_DO_STCLASS_AND)
	4197	cl_and(data->start_class,
	4198	(struct regnode_charclass_class*)scan);
	4199	else
	4200	cl_or(pRExC_state, data->start_class,
	4201	(struct regnode_charclass_class*)scan);
	4202	break;
	4203	case ALNUM:
	4204	if (flags & SCF_DO_STCLASS_AND) {
	4205	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4206	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	4207	if (OP(scan) == ALNUMU) {
	4208	for (value = 0; value < 256; value++) {
	4209	if (!isWORDCHAR_L1(value)) {
	4210	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4211	}
	4212	}
	4213	} else {
	4214	for (value = 0; value < 256; value++) {
	4215	if (!isALNUM(value)) {
	4216	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4217	}
	4218	}
	4219	}
	4220	}
	4221	}
	4222	else {
	4223	if (data->start_class->flags & ANYOF_LOCALE)
	4224	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	4225
	4226	/* Even if under locale, set the bits for non-locale
	4227	* in case it isn't a true locale-node. This will
	4228	* create false positives if it truly is locale */
	4229	if (OP(scan) == ALNUMU) {
	4230	for (value = 0; value < 256; value++) {
	4231	if (isWORDCHAR_L1(value)) {
	4232	ANYOF_BITMAP_SET(data->start_class, value);
	4233	}
	4234	}
	4235	} else {
	4236	for (value = 0; value < 256; value++) {
	4237	if (isALNUM(value)) {
	4238	ANYOF_BITMAP_SET(data->start_class, value);
	4239	}
	4240	}
	4241	}
	4242	}
	4243	break;
	4244	case NALNUM:
	4245	if (flags & SCF_DO_STCLASS_AND) {
	4246	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4247	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	4248	if (OP(scan) == NALNUMU) {
	4249	for (value = 0; value < 256; value++) {
	4250	if (isWORDCHAR_L1(value)) {
	4251	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4252	}
	4253	}
	4254	} else {
	4255	for (value = 0; value < 256; value++) {
	4256	if (isALNUM(value)) {
	4257	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4258	}
	4259	}
	4260	}
	4261	}
	4262	}
	4263	else {
	4264	if (data->start_class->flags & ANYOF_LOCALE)
	4265	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	4266
	4267	/* Even if under locale, set the bits for non-locale in
	4268	* case it isn't a true locale-node. This will create
	4269	* false positives if it truly is locale */
	4270	if (OP(scan) == NALNUMU) {
	4271	for (value = 0; value < 256; value++) {
	4272	if (! isWORDCHAR_L1(value)) {
	4273	ANYOF_BITMAP_SET(data->start_class, value);
	4274	}
	4275	}
	4276	} else {
	4277	for (value = 0; value < 256; value++) {
	4278	if (! isALNUM(value)) {
	4279	ANYOF_BITMAP_SET(data->start_class, value);
	4280	}
	4281	}
	4282	}
	4283	}
	4284	break;
	4285	case SPACE:
	4286	if (flags & SCF_DO_STCLASS_AND) {
	4287	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4288	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	4289	if (OP(scan) == SPACEU) {
	4290	for (value = 0; value < 256; value++) {
	4291	if (!isSPACE_L1(value)) {
	4292	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4293	}
	4294	}
	4295	} else {
	4296	for (value = 0; value < 256; value++) {
	4297	if (!isSPACE(value)) {
	4298	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4299	}
	4300	}
	4301	}
	4302	}
	4303	}
	4304	else {
	4305	if (data->start_class->flags & ANYOF_LOCALE) {
	4306	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	4307	}
	4308	if (OP(scan) == SPACEU) {
	4309	for (value = 0; value < 256; value++) {
	4310	if (isSPACE_L1(value)) {
	4311	ANYOF_BITMAP_SET(data->start_class, value);
	4312	}
	4313	}
	4314	} else {
	4315	for (value = 0; value < 256; value++) {
	4316	if (isSPACE(value)) {
	4317	ANYOF_BITMAP_SET(data->start_class, value);
	4318	}
	4319	}
	4320	}
	4321	}
	4322	break;
	4323	case NSPACE:
	4324	if (flags & SCF_DO_STCLASS_AND) {
	4325	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4326	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	4327	if (OP(scan) == NSPACEU) {
	4328	for (value = 0; value < 256; value++) {
	4329	if (isSPACE_L1(value)) {
	4330	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4331	}
	4332	}
	4333	} else {
	4334	for (value = 0; value < 256; value++) {
	4335	if (isSPACE(value)) {
	4336	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4337	}
	4338	}
	4339	}
	4340	}
	4341	}
	4342	else {
	4343	if (data->start_class->flags & ANYOF_LOCALE)
	4344	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	4345	if (OP(scan) == NSPACEU) {
	4346	for (value = 0; value < 256; value++) {
	4347	if (!isSPACE_L1(value)) {
	4348	ANYOF_BITMAP_SET(data->start_class, value);
	4349	}
	4350	}
	4351	}
	4352	else {
	4353	for (value = 0; value < 256; value++) {
	4354	if (!isSPACE(value)) {
	4355	ANYOF_BITMAP_SET(data->start_class, value);
	4356	}
	4357	}
	4358	}
	4359	}
	4360	break;
	4361	case DIGIT:
	4362	if (flags & SCF_DO_STCLASS_AND) {
	4363	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4364	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	4365	for (value = 0; value < 256; value++)
	4366	if (!isDIGIT(value))
	4367	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4368	}
	4369	}
	4370	else {
	4371	if (data->start_class->flags & ANYOF_LOCALE)
	4372	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	4373	for (value = 0; value < 256; value++)
	4374	if (isDIGIT(value))
	4375	ANYOF_BITMAP_SET(data->start_class, value);
	4376	}
	4377	break;
	4378	case NDIGIT:
	4379	if (flags & SCF_DO_STCLASS_AND) {
	4380	if (!(data->start_class->flags & ANYOF_LOCALE))
	4381	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	4382	for (value = 0; value < 256; value++)
	4383	if (isDIGIT(value))
	4384	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4385	}
	4386	else {
	4387	if (data->start_class->flags & ANYOF_LOCALE)
	4388	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	4389	for (value = 0; value < 256; value++)
	4390	if (!isDIGIT(value))
	4391	ANYOF_BITMAP_SET(data->start_class, value);
	4392	}
	4393	break;
	4394	CASE_SYNST_FNC(VERTWS);
	4395	CASE_SYNST_FNC(HORIZWS);
	4396
	4397	}
	4398	if (flags & SCF_DO_STCLASS_OR)
	4399	cl_and(data->start_class, and_withp);
	4400	flags &= ~SCF_DO_STCLASS;
	4401	}
	4402	}
	4403	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	4404	data->flags \|= (OP(scan) == MEOL
	4405	? SF_BEFORE_MEOL
	4406	: SF_BEFORE_SEOL);
	4407	}
	4408	else if ( PL_regkind[OP(scan)] == BRANCHJ
	4409	/* Lookbehind, or need to calculate parens/evals/stclass: */
	4410	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4411	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4412	if ( OP(scan) == UNLESSM &&
	4413	scan->flags == 0 &&
	4414	OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
	4415	OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
	4416	) {
	4417	regnode *opt;
	4418	regnode *upto= regnext(scan);
	4419	DEBUG_PARSE_r({
	4420	SV * const mysv_val=sv_newmortal();
	4421	DEBUG_STUDYDATA("OPFAIL",data,depth);
	4422
	4423	/DEBUG_PARSE_MSG("opfail");/
	4424	regprop(RExC_rx, mysv_val, upto);
	4425	PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
	4426	SvPV_nolen_const(mysv_val),
	4427	(IV)REG_NODE_NUM(upto),
	4428	(IV)(upto - scan)
	4429	);
	4430	});
	4431	OP(scan) = OPFAIL;
	4432	NEXT_OFF(scan) = upto - scan;
	4433	for (opt= scan + 1; opt < upto ; opt++)
	4434	OP(opt) = OPTIMIZED;
	4435	scan= upto;
	4436	continue;
	4437	}
	4438	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4439	\|\| OP(scan) == UNLESSM )
	4440	{
	4441	/* Negative Lookahead/lookbehind
	4442	In this case we can't do fixed string optimisation.
	4443	*/
	4444
	4445	I32 deltanext, minnext, fake = 0;
	4446	regnode *nscan;
	4447	struct regnode_charclass_class intrnl;
	4448	int f = 0;
	4449
	4450	data_fake.flags = 0;
	4451	if (data) {
	4452	data_fake.whilem_c = data->whilem_c;
	4453	data_fake.last_closep = data->last_closep;
	4454	}
	4455	else
	4456	data_fake.last_closep = &fake;
	4457	data_fake.pos_delta = delta;
	4458	if ( flags & SCF_DO_STCLASS && !scan->flags
	4459	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4460	cl_init(pRExC_state, &intrnl);
	4461	data_fake.start_class = &intrnl;
	4462	f \|= SCF_DO_STCLASS_AND;
	4463	}
	4464	if (flags & SCF_WHILEM_VISITED_POS)
	4465	f \|= SCF_WHILEM_VISITED_POS;
	4466	next = regnext(scan);
	4467	nscan = NEXTOPER(NEXTOPER(scan));
	4468	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4469	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4470	if (scan->flags) {
	4471	if (deltanext) {
	4472	FAIL("Variable length lookbehind not implemented");
	4473	}
	4474	else if (minnext > (I32)U8_MAX) {
	4475	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4476	}
	4477	scan->flags = (U8)minnext;
	4478	}
	4479	if (data) {
	4480	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4481	pars++;
	4482	if (data_fake.flags & SF_HAS_EVAL)
	4483	data->flags \|= SF_HAS_EVAL;
	4484	data->whilem_c = data_fake.whilem_c;
	4485	}
	4486	if (f & SCF_DO_STCLASS_AND) {
	4487	if (flags & SCF_DO_STCLASS_OR) {
	4488	/* OR before, AND after: ideally we would recurse with
	4489	* data_fake to get the AND applied by study of the
	4490	* remainder of the pattern, and then derecurse;
	4491	* * HACK * for now just treat as "no information".
	4492	* See [perl #56690].
	4493	*/
	4494	cl_init(pRExC_state, data->start_class);
	4495	} else {
	4496	/* AND before and after: combine and continue */
	4497	const int was = (data->start_class->flags & ANYOF_EOS);
	4498
	4499	cl_and(data->start_class, &intrnl);
	4500	if (was)
	4501	data->start_class->flags \|= ANYOF_EOS;
	4502	}
	4503	}
	4504	}
	4505	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4506	else {
	4507	/* Positive Lookahead/lookbehind
	4508	In this case we can do fixed string optimisation,
	4509	but we must be careful about it. Note in the case of
	4510	lookbehind the positions will be offset by the minimum
	4511	length of the pattern, something we won't know about
	4512	until after the recurse.
	4513	*/
	4514	I32 deltanext, fake = 0;
	4515	regnode *nscan;
	4516	struct regnode_charclass_class intrnl;
	4517	int f = 0;
	4518	/* We use SAVEFREEPV so that when the full compile
	4519	is finished perl will clean up the allocated
	4520	minlens when it's all done. This way we don't
	4521	have to worry about freeing them when we know
	4522	they wont be used, which would be a pain.
	4523	*/
	4524	I32 *minnextp;
	4525	Newx( minnextp, 1, I32 );
	4526	SAVEFREEPV(minnextp);
	4527
	4528	if (data) {
	4529	StructCopy(data, &data_fake, scan_data_t);
	4530	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4531	f \|= SCF_DO_SUBSTR;
	4532	if (scan->flags)
	4533	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4534	data_fake.last_found=newSVsv(data->last_found);
	4535	}
	4536	}
	4537	else
	4538	data_fake.last_closep = &fake;
	4539	data_fake.flags = 0;
	4540	data_fake.pos_delta = delta;
	4541	if (is_inf)
	4542	data_fake.flags \|= SF_IS_INF;
	4543	if ( flags & SCF_DO_STCLASS && !scan->flags
	4544	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4545	cl_init(pRExC_state, &intrnl);
	4546	data_fake.start_class = &intrnl;
	4547	f \|= SCF_DO_STCLASS_AND;
	4548	}
	4549	if (flags & SCF_WHILEM_VISITED_POS)
	4550	f \|= SCF_WHILEM_VISITED_POS;
	4551	next = regnext(scan);
	4552	nscan = NEXTOPER(NEXTOPER(scan));
	4553
	4554	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4555	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4556	if (scan->flags) {
	4557	if (deltanext) {
	4558	FAIL("Variable length lookbehind not implemented");
	4559	}
	4560	else if (*minnextp > (I32)U8_MAX) {
	4561	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4562	}
	4563	scan->flags = (U8)*minnextp;
	4564	}
	4565
	4566	*minnextp += min;
	4567
	4568	if (f & SCF_DO_STCLASS_AND) {
	4569	const int was = (data->start_class->flags & ANYOF_EOS);
	4570
	4571	cl_and(data->start_class, &intrnl);
	4572	if (was)
	4573	data->start_class->flags \|= ANYOF_EOS;
	4574	}
	4575	if (data) {
	4576	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4577	pars++;
	4578	if (data_fake.flags & SF_HAS_EVAL)
	4579	data->flags \|= SF_HAS_EVAL;
	4580	data->whilem_c = data_fake.whilem_c;
	4581	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4582	if (RExC_rx->minlen<*minnextp)
	4583	RExC_rx->minlen=*minnextp;
	4584	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4585	SvREFCNT_dec(data_fake.last_found);
	4586
	4587	if ( data_fake.minlen_fixed != minlenp )
	4588	{
	4589	data->offset_fixed= data_fake.offset_fixed;
	4590	data->minlen_fixed= data_fake.minlen_fixed;
	4591	data->lookbehind_fixed+= scan->flags;
	4592	}
	4593	if ( data_fake.minlen_float != minlenp )
	4594	{
	4595	data->minlen_float= data_fake.minlen_float;
	4596	data->offset_float_min=data_fake.offset_float_min;
	4597	data->offset_float_max=data_fake.offset_float_max;
	4598	data->lookbehind_float+= scan->flags;
	4599	}
	4600	}
	4601	}
	4602	}
	4603	#endif
	4604	}
	4605	else if (OP(scan) == OPEN) {
	4606	if (stopparen != (I32)ARG(scan))
	4607	pars++;
	4608	}
	4609	else if (OP(scan) == CLOSE) {
	4610	if (stopparen == (I32)ARG(scan)) {
	4611	break;
	4612	}
	4613	if ((I32)ARG(scan) == is_par) {
	4614	next = regnext(scan);
	4615
	4616	if ( next && (OP(next) != WHILEM) && next < last)
	4617	is_par = 0; /* Disable optimization */
	4618	}
	4619	if (data)
	4620	*(data->last_closep) = ARG(scan);
	4621	}
	4622	else if (OP(scan) == EVAL) {
	4623	if (data)
	4624	data->flags \|= SF_HAS_EVAL;
	4625	}
	4626	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4627	if (flags & SCF_DO_SUBSTR) {
	4628	SCAN_COMMIT(pRExC_state,data,minlenp);
	4629	flags &= ~SCF_DO_SUBSTR;
	4630	}
	4631	if (data && OP(scan)==ACCEPT) {
	4632	data->flags \|= SCF_SEEN_ACCEPT;
	4633	if (stopmin > min)
	4634	stopmin = min;
	4635	}
	4636	}
	4637	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4638	{
	4639	if (flags & SCF_DO_SUBSTR) {
	4640	SCAN_COMMIT(pRExC_state,data,minlenp);
	4641	data->longest = &(data->longest_float);
	4642	}
	4643	is_inf = is_inf_internal = 1;
	4644	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4645	cl_anything(pRExC_state, data->start_class);
	4646	flags &= ~SCF_DO_STCLASS;
	4647	}
	4648	else if (OP(scan) == GPOS) {
	4649	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4650	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4651	{
	4652	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4653	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4654	if (RExC_rx->gofs < (U32)min)
	4655	RExC_rx->gofs = min;
	4656	} else {
	4657	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4658	RExC_rx->gofs = 0;
	4659	}
	4660	}
	4661	#ifdef TRIE_STUDY_OPT
	4662	#ifdef FULL_TRIE_STUDY
	4663	else if (PL_regkind[OP(scan)] == TRIE) {
	4664	/* NOTE - There is similar code to this block above for handling
	4665	BRANCH nodes on the initial study. If you change stuff here
	4666	check there too. */
	4667	regnode *trie_node= scan;
	4668	regnode *tail= regnext(scan);
	4669	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4670	I32 max1 = 0, min1 = I32_MAX;
	4671	struct regnode_charclass_class accum;
	4672
	4673	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4674	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4675	if (flags & SCF_DO_STCLASS)
	4676	cl_init_zero(pRExC_state, &accum);
	4677
	4678	if (!trie->jump) {
	4679	min1= trie->minlen;
	4680	max1= trie->maxlen;
	4681	} else {
	4682	const regnode *nextbranch= NULL;
	4683	U32 word;
	4684
	4685	for ( word=1 ; word <= trie->wordcount ; word++)
	4686	{
	4687	I32 deltanext=0, minnext=0, f = 0, fake;
	4688	struct regnode_charclass_class this_class;
	4689
	4690	data_fake.flags = 0;
	4691	if (data) {
	4692	data_fake.whilem_c = data->whilem_c;
	4693	data_fake.last_closep = data->last_closep;
	4694	}
	4695	else
	4696	data_fake.last_closep = &fake;
	4697	data_fake.pos_delta = delta;
	4698	if (flags & SCF_DO_STCLASS) {
	4699	cl_init(pRExC_state, &this_class);
	4700	data_fake.start_class = &this_class;
	4701	f = SCF_DO_STCLASS_AND;
	4702	}
	4703	if (flags & SCF_WHILEM_VISITED_POS)
	4704	f \|= SCF_WHILEM_VISITED_POS;
	4705
	4706	if (trie->jump[word]) {
	4707	if (!nextbranch)
	4708	nextbranch = trie_node + trie->jump[0];
	4709	scan= trie_node + trie->jump[word];
	4710	/* We go from the jump point to the branch that follows
	4711	it. Note this means we need the vestigal unused branches
	4712	even though they arent otherwise used.
	4713	*/
	4714	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4715	&deltanext, (regnode *)nextbranch, &data_fake,
	4716	stopparen, recursed, NULL, f,depth+1);
	4717	}
	4718	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4719	nextbranch= regnext((regnode*)nextbranch);
	4720
	4721	if (min1 > (I32)(minnext + trie->minlen))
	4722	min1 = minnext + trie->minlen;
	4723	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4724	max1 = minnext + deltanext + trie->maxlen;
	4725	if (deltanext == I32_MAX)
	4726	is_inf = is_inf_internal = 1;
	4727
	4728	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4729	pars++;
	4730	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4731	if ( stopmin > min + min1)
	4732	stopmin = min + min1;
	4733	flags &= ~SCF_DO_SUBSTR;
	4734	if (data)
	4735	data->flags \|= SCF_SEEN_ACCEPT;
	4736	}
	4737	if (data) {
	4738	if (data_fake.flags & SF_HAS_EVAL)
	4739	data->flags \|= SF_HAS_EVAL;
	4740	data->whilem_c = data_fake.whilem_c;
	4741	}
	4742	if (flags & SCF_DO_STCLASS)
	4743	cl_or(pRExC_state, &accum, &this_class);
	4744	}
	4745	}
	4746	if (flags & SCF_DO_SUBSTR) {
	4747	data->pos_min += min1;
	4748	data->pos_delta += max1 - min1;
	4749	if (max1 != min1 \|\| is_inf)
	4750	data->longest = &(data->longest_float);
	4751	}
	4752	min += min1;
	4753	delta += max1 - min1;
	4754	if (flags & SCF_DO_STCLASS_OR) {
	4755	cl_or(pRExC_state, data->start_class, &accum);
	4756	if (min1) {
	4757	cl_and(data->start_class, and_withp);
	4758	flags &= ~SCF_DO_STCLASS;
	4759	}
	4760	}
	4761	else if (flags & SCF_DO_STCLASS_AND) {
	4762	if (min1) {
	4763	cl_and(data->start_class, &accum);
	4764	flags &= ~SCF_DO_STCLASS;
	4765	}
	4766	else {
	4767	/* Switch to OR mode: cache the old value of
	4768	* data->start_class */
	4769	INIT_AND_WITHP;
	4770	StructCopy(data->start_class, and_withp,
	4771	struct regnode_charclass_class);
	4772	flags &= ~SCF_DO_STCLASS_AND;
	4773	StructCopy(&accum, data->start_class,
	4774	struct regnode_charclass_class);
	4775	flags \|= SCF_DO_STCLASS_OR;
	4776	data->start_class->flags \|= ANYOF_EOS;
	4777	}
	4778	}
	4779	scan= tail;
	4780	continue;
	4781	}
	4782	#else
	4783	else if (PL_regkind[OP(scan)] == TRIE) {
	4784	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4785	U8*bang=NULL;
	4786
	4787	min += trie->minlen;
	4788	delta += (trie->maxlen - trie->minlen);
	4789	flags &= ~SCF_DO_STCLASS; /* xxx */
	4790	if (flags & SCF_DO_SUBSTR) {
	4791	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4792	data->pos_min += trie->minlen;
	4793	data->pos_delta += (trie->maxlen - trie->minlen);
	4794	if (trie->maxlen != trie->minlen)
	4795	data->longest = &(data->longest_float);
	4796	}
	4797	if (trie->jump) /* no more substrings -- for now /grr*/
	4798	flags &= ~SCF_DO_SUBSTR;
	4799	}
	4800	#endif /* old or new */
	4801	#endif /* TRIE_STUDY_OPT */
	4802
	4803	/* Else: zero-length, ignore. */
	4804	scan = regnext(scan);
	4805	}
	4806	if (frame) {
	4807	last = frame->last;
	4808	scan = frame->next;
	4809	stopparen = frame->stop;
	4810	frame = frame->prev;
	4811	goto fake_study_recurse;
	4812	}
	4813
	4814	finish:
	4815	assert(!frame);
	4816	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4817
	4818	*scanp = scan;
	4819	*deltap = is_inf_internal ? I32_MAX : delta;
	4820	if (flags & SCF_DO_SUBSTR && is_inf)
	4821	data->pos_delta = I32_MAX - data->pos_min;
	4822	if (is_par > (I32)U8_MAX)
	4823	is_par = 0;
	4824	if (is_par && pars==1 && data) {
	4825	data->flags \|= SF_IN_PAR;
	4826	data->flags &= ~SF_HAS_PAR;
	4827	}
	4828	else if (pars && data) {
	4829	data->flags \|= SF_HAS_PAR;
	4830	data->flags &= ~SF_IN_PAR;
	4831	}
	4832	if (flags & SCF_DO_STCLASS_OR)
	4833	cl_and(data->start_class, and_withp);
	4834	if (flags & SCF_TRIE_RESTUDY)
	4835	data->flags \|= SCF_TRIE_RESTUDY;
	4836
	4837	DEBUG_STUDYDATA("post-fin:",data,depth);
	4838
	4839	return min < stopmin ? min : stopmin;
	4840	}
	4841
	4842	STATIC U32
	4843	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4844	{
	4845	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4846
	4847	PERL_ARGS_ASSERT_ADD_DATA;
	4848
	4849	Renewc(RExC_rxi->data,
	4850	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4851	char, struct reg_data);
	4852	if(count)
	4853	Renew(RExC_rxi->data->what, count + n, U8);
	4854	else
	4855	Newx(RExC_rxi->data->what, n, U8);
	4856	RExC_rxi->data->count = count + n;
	4857	Copy(s, RExC_rxi->data->what + count, n, U8);
	4858	return count;
	4859	}
	4860
	4861	/XXX: todo make this not included in a non debugging perl /
	4862	#ifndef PERL_IN_XSUB_RE
	4863	void
	4864	Perl_reginitcolors(pTHX)
	4865	{
	4866	dVAR;
	4867	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4868	if (s) {
	4869	char *t = savepv(s);
	4870	int i = 0;
	4871	PL_colors[0] = t;
	4872	while (++i < 6) {
	4873	t = strchr(t, '\t');
	4874	if (t) {
	4875	*t = '\0';
	4876	PL_colors[i] = ++t;
	4877	}
	4878	else
	4879	PL_colors[i] = t = (char *)"";
	4880	}
	4881	} else {
	4882	int i = 0;
	4883	while (i < 6)
	4884	PL_colors[i++] = (char *)"";
	4885	}
	4886	PL_colorset = 1;
	4887	}
	4888	#endif
	4889
	4890
	4891	#ifdef TRIE_STUDY_OPT
	4892	#define CHECK_RESTUDY_GOTO \
	4893	if ( \
	4894	(data.flags & SCF_TRIE_RESTUDY) \
	4895	&& ! restudied++ \
	4896	) goto reStudy
	4897	#else
	4898	#define CHECK_RESTUDY_GOTO
	4899	#endif
	4900
	4901	/*
	4902	* pregcomp - compile a regular expression into internal code
	4903	*
	4904	* Decides which engine's compiler to call based on the hint currently in
	4905	* scope
	4906	*/
	4907
	4908	#ifndef PERL_IN_XSUB_RE
	4909
	4910	/* return the currently in-scope regex engine (or the default if none) */
	4911
	4912	regexp_engine const *
	4913	Perl_current_re_engine(pTHX)
	4914	{
	4915	dVAR;
	4916
	4917	if (IN_PERL_COMPILETIME) {
	4918	HV * const table = GvHV(PL_hintgv);
	4919	SV **ptr;
	4920
	4921	if (!table)
	4922	return &PL_core_reg_engine;
	4923	ptr = hv_fetchs(table, "regcomp", FALSE);
	4924	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4925	return &PL_core_reg_engine;
	4926	return INT2PTR(regexp_engine,SvIV(ptr));
	4927	}
	4928	else {
	4929	SV *ptr;
	4930	if (!PL_curcop->cop_hints_hash)
	4931	return &PL_core_reg_engine;
	4932	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
	4933	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4934	return &PL_core_reg_engine;
	4935	return INT2PTR(regexp_engine*,SvIV(ptr));
	4936	}
	4937	}
	4938
	4939
	4940	REGEXP *
	4941	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4942	{
	4943	dVAR;
	4944	regexp_engine const *eng = current_re_engine();
	4945	GET_RE_DEBUG_FLAGS_DECL;
	4946
	4947	PERL_ARGS_ASSERT_PREGCOMP;
	4948
	4949	/* Dispatch a request to compile a regexp to correct regexp engine. */
	4950	DEBUG_COMPILE_r({
	4951	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4952	PTR2UV(eng));
	4953	});
	4954	return CALLREGCOMP_ENG(eng, pattern, flags);
	4955	}
	4956	#endif
	4957
	4958	/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
	4959	* pattern rather than a list of OPs */
	4960
	4961	REGEXP *
	4962	Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
	4963	{
	4964	SV pat = pattern; / defeat constness! */
	4965	PERL_ARGS_ASSERT_RE_COMPILE;
	4966	return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
	4967	NULL, NULL, rx_flags, 0);
	4968	}
	4969
	4970	/* see if there are any run-time code blocks in the pattern.
	4971	* False positives are allowed */
	4972
	4973	static bool
	4974	S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
	4975	U32 pm_flags, char *pat, STRLEN plen)
	4976	{
	4977	int n = 0;
	4978	STRLEN s;
	4979
	4980	/* avoid infinitely recursing when we recompile the pattern parcelled up
	4981	* as qr'...'. A single constant qr// string can't have have any
	4982	* run-time component in it, and thus, no runtime code. (A non-qr
	4983	* string, however, can, e.g. $x =~ '(?{})') */
	4984	if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
	4985	return 0;
	4986
	4987	for (s = 0; s < plen; s++) {
	4988	if (n < pRExC_state->num_code_blocks
	4989	&& s == pRExC_state->code_blocks[n].start)
	4990	{
	4991	s = pRExC_state->code_blocks[n].end;
	4992	n++;
	4993	continue;
	4994	}
	4995	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
	4996	* positives here */
	4997	if (pat[s] == '(' && pat[s+1] == '?' &&
	4998	(pat[s+2] == '{' \|\| (pat[s+2] == '?' && pat[s+3] == '{'))
	4999	)
	5000	return 1;
	5001	}
	5002	return 0;
	5003	}
	5004
	5005	/* Handle run-time code blocks. We will already have compiled any direct
	5006	* or indirect literal code blocks. Now, take the pattern 'pat' and make a
	5007	* copy of it, but with any literal code blocks blanked out and
	5008	* appropriate chars escaped; then feed it into
	5009	*
	5010	* eval "qr'modified_pattern'"
	5011	*
	5012	* For example,
	5013	*
	5014	* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
	5015	*
	5016	* becomes
	5017	*
	5018	* qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
	5019	*
	5020	* After eval_sv()-ing that, grab any new code blocks from the returned qr
	5021	* and merge them with any code blocks of the original regexp.
	5022	*
	5023	* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
	5024	* instead, just save the qr and return FALSE; this tells our caller that
	5025	* the original pattern needs upgrading to utf8.
	5026	*/
	5027
	5028	bool
	5029	S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5030	char *pat, STRLEN plen)
	5031	{
	5032	SV *qr;
	5033
	5034	GET_RE_DEBUG_FLAGS_DECL;
	5035
	5036	if (pRExC_state->runtime_code_qr) {
	5037	/* this is the second time we've been called; this should
	5038	* only happen if the main pattern got upgraded to utf8
	5039	* during compilation; re-use the qr we compiled first time
	5040	* round (which should be utf8 too)
	5041	*/
	5042	qr = pRExC_state->runtime_code_qr;
	5043	pRExC_state->runtime_code_qr = NULL;
	5044	assert(RExC_utf8 && SvUTF8(qr));
	5045	}
	5046	else {
	5047	int n = 0;
	5048	STRLEN s;
	5049	char p, newpat;
	5050	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
	5051	SV sv, qr_ref;
	5052	dSP;
	5053
	5054	/* determine how many extra chars we need for ' and \ escaping */
	5055	for (s = 0; s < plen; s++) {
	5056	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5057	newlen++;
	5058	}
	5059
	5060	Newx(newpat, newlen, char);
	5061	p = newpat;
	5062	p++ = 'q'; p++ = 'r'; *p++ = '\'';
	5063
	5064	for (s = 0; s < plen; s++) {
	5065	if (n < pRExC_state->num_code_blocks
	5066	&& s == pRExC_state->code_blocks[n].start)
	5067	{
	5068	/* blank out literal code block */
	5069	assert(pat[s] == '(');
	5070	while (s <= pRExC_state->code_blocks[n].end) {
	5071	*p++ = ' ';
	5072	s++;
	5073	}
	5074	s--;
	5075	n++;
	5076	continue;
	5077	}
	5078	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5079	*p++ = '\\';
	5080	*p++ = pat[s];
	5081	}
	5082	*p++ = '\'';
	5083	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
	5084	*p++ = 'x';
	5085	*p++ = '\0';
	5086	DEBUG_COMPILE_r({
	5087	PerlIO_printf(Perl_debug_log,
	5088	"%sre-parsing pattern for runtime code:%s %s\n",
	5089	PL_colors[4],PL_colors[5],newpat);
	5090	});
	5091
	5092	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
	5093	Safefree(newpat);
	5094
	5095	ENTER;
	5096	SAVETMPS;
	5097	save_re_context();
	5098	PUSHSTACKi(PERLSI_REQUIRE);
	5099	/* this causes the toker to collapse \\ into \ when parsing
	5100	* qr''; normally only q'' does this. It also alters hints
	5101	* handling */
	5102	PL_reg_state.re_reparsing = TRUE;
	5103	eval_sv(sv, G_SCALAR);
	5104	SvREFCNT_dec(sv);
	5105	SPAGAIN;
	5106	qr_ref = POPs;
	5107	PUTBACK;
	5108	if (SvTRUE(ERRSV))
	5109	Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
	5110	assert(SvROK(qr_ref));
	5111	qr = SvRV(qr_ref);
	5112	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
	5113	/* the leaving below frees the tmp qr_ref.
	5114	* Give qr a life of its own */
	5115	SvREFCNT_inc(qr);
	5116	POPSTACK;
	5117	FREETMPS;
	5118	LEAVE;
	5119
	5120	}
	5121
	5122	if (!RExC_utf8 && SvUTF8(qr)) {
	5123	/* first time through; the pattern got upgraded; save the
	5124	* qr for the next time through */
	5125	assert(!pRExC_state->runtime_code_qr);
	5126	pRExC_state->runtime_code_qr = qr;
	5127	return 0;
	5128	}
	5129
	5130
	5131	/* extract any code blocks within the returned qr// */
	5132
	5133
	5134	/* merge the main (r1) and run-time (r2) code blocks into one */
	5135	{
	5136	RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
	5137	struct reg_code_block new_block, dst;
	5138	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
	5139	int i1 = 0, i2 = 0;
	5140
	5141	if (!r2->num_code_blocks) /* we guessed wrong */
	5142	return 1;
	5143
	5144	Newx(new_block,
	5145	r1->num_code_blocks + r2->num_code_blocks,
	5146	struct reg_code_block);
	5147	dst = new_block;
	5148
	5149	while ( i1 < r1->num_code_blocks
	5150	\|\| i2 < r2->num_code_blocks)
	5151	{
	5152	struct reg_code_block *src;
	5153	bool is_qr = 0;
	5154
	5155	if (i1 == r1->num_code_blocks) {
	5156	src = &r2->code_blocks[i2++];
	5157	is_qr = 1;
	5158	}
	5159	else if (i2 == r2->num_code_blocks)
	5160	src = &r1->code_blocks[i1++];
	5161	else if ( r1->code_blocks[i1].start
	5162	< r2->code_blocks[i2].start)
	5163	{
	5164	src = &r1->code_blocks[i1++];
	5165	assert(src->end < r2->code_blocks[i2].start);
	5166	}
	5167	else {
	5168	assert( r1->code_blocks[i1].start
	5169	> r2->code_blocks[i2].start);
	5170	src = &r2->code_blocks[i2++];
	5171	is_qr = 1;
	5172	assert(src->end < r1->code_blocks[i1].start);
	5173	}
	5174
	5175	assert(pat[src->start] == '(');
	5176	assert(pat[src->end] == ')');
	5177	dst->start = src->start;
	5178	dst->end = src->end;
	5179	dst->block = src->block;
	5180	dst->src_regex = is_qr ? (REGEXP) SvREFCNT_inc( (SV) qr)
	5181	: src->src_regex;
	5182	dst++;
	5183	}
	5184	r1->num_code_blocks += r2->num_code_blocks;
	5185	Safefree(r1->code_blocks);
	5186	r1->code_blocks = new_block;
	5187	}
	5188
	5189	SvREFCNT_dec(qr);
	5190	return 1;
	5191	}
	5192
	5193
	5194	/*
	5195	* Perl_re_op_compile - the perl internal RE engine's function to compile a
	5196	* regular expression into internal code.
	5197	* The pattern may be passed either as:
	5198	* a list of SVs (patternp plus pat_count)
	5199	* a list of OPs (expr)
	5200	* If both are passed, the SV list is used, but the OP list indicates
	5201	* which SVs are actually pre-compiled code blocks
	5202	*
	5203	* The SVs in the list have magic and qr overloading applied to them (and
	5204	* the list may be modified in-place with replacement SVs in the latter
	5205	* case).
	5206	*
	5207	* If the pattern hasn't changed from old_re, then old_re will be
	5208	* returned.
	5209	*
	5210	* eng is the current engine. If that engine has an op_comp method, then
	5211	* handle directly (i.e. we assume that op_comp was us); otherwise, just
	5212	* do the initial concatenation of arguments and pass on to the external
	5213	* engine.
	5214	*
	5215	* If is_bare_re is not null, set it to a boolean indicating whether the
	5216	* arg list reduced (after overloading) to a single bare regex which has
	5217	* been returned (i.e. /$qr/).
	5218	*
	5219	* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
	5220	*
	5221	* pm_flags contains the PMf_* flags, typically based on those from the
	5222	* pm_flags field of the related PMOP. Currently we're only interested in
	5223	* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
	5224	*
	5225	* We can't allocate space until we know how big the compiled form will be,
	5226	* but we can't compile it (and thus know how big it is) until we've got a
	5227	* place to put the code. So we cheat: we compile it twice, once with code
	5228	* generation turned off and size counting turned on, and once "for real".
	5229	* This also means that we don't allocate space until we are sure that the
	5230	* thing really will compile successfully, and we never have to move the
	5231	* code and thus invalidate pointers into it. (Note that it has to be in
	5232	* one piece because free() must be able to free it all.) [NB: not true in perl]
	5233	*
	5234	* Beware that the optimization-preparation code in here knows about some
	5235	* of the structure of the compiled regexp. [I'll say.]
	5236	*/
	5237
	5238	REGEXP *
	5239	Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
	5240	OP expr, const regexp_engine eng, REGEXP *VOL old_re,
	5241	bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
	5242	{
	5243	dVAR;
	5244	REGEXP *rx;
	5245	struct regexp *r;
	5246	register regexp_internal *ri;
	5247	STRLEN plen;
	5248	char * VOL exp;
	5249	char* xend;
	5250	regnode *scan;
	5251	I32 flags;
	5252	I32 minlen = 0;
	5253	U32 rx_flags;
	5254	SV * VOL pat;
	5255
	5256	/* these are all flags - maybe they should be turned
	5257	* into a single int with different bit masks */
	5258	I32 sawlookahead = 0;
	5259	I32 sawplus = 0;
	5260	I32 sawopen = 0;
	5261	bool used_setjump = FALSE;
	5262	regex_charset initial_charset = get_regex_charset(orig_rx_flags);
	5263	bool code_is_utf8 = 0;
	5264	bool VOL recompile = 0;
	5265	bool runtime_code = 0;
	5266	U8 jump_ret = 0;
	5267	dJMPENV;
	5268	scan_data_t data;
	5269	RExC_state_t RExC_state;
	5270	RExC_state_t * const pRExC_state = &RExC_state;
	5271	#ifdef TRIE_STUDY_OPT
	5272	int restudied;
	5273	RExC_state_t copyRExC_state;
	5274	#endif
	5275	GET_RE_DEBUG_FLAGS_DECL;
	5276
	5277	PERL_ARGS_ASSERT_RE_OP_COMPILE;
	5278
	5279	DEBUG_r(if (!PL_colorset) reginitcolors());
	5280
	5281	#ifndef PERL_IN_XSUB_RE
	5282	/* Initialize these here instead of as-needed, as is quick and avoids
	5283	* having to test them each time otherwise */
	5284	if (! PL_AboveLatin1) {
	5285	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	5286	PL_ASCII = _new_invlist_C_array(ASCII_invlist);
	5287	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	5288
	5289	PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
	5290	PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
	5291
	5292	PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
	5293	PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
	5294
	5295	PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
	5296	PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
	5297
	5298	PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
	5299
	5300	PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
	5301	PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
	5302
	5303	PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
	5304
	5305	PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
	5306	PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
	5307
	5308	PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
	5309	PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
	5310
	5311	PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
	5312	PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
	5313
	5314	PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
	5315	PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
	5316
	5317	PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
	5318	PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
	5319
	5320	PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
	5321	PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
	5322
	5323	PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
	5324	PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
	5325
	5326	PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
	5327	PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
	5328
	5329	PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
	5330
	5331	PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
	5332	PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
	5333
	5334	PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
	5335	PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
	5336	}
	5337	#endif
	5338
	5339	pRExC_state->code_blocks = NULL;
	5340	pRExC_state->num_code_blocks = 0;
	5341
	5342	if (is_bare_re)
	5343	*is_bare_re = FALSE;
	5344
	5345	if (expr && (expr->op_type == OP_LIST \|\|
	5346	(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
	5347
	5348	/* is the source UTF8, and how many code blocks are there? */
	5349	OP *o;
	5350	int ncode = 0;
	5351
	5352	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5353	if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
	5354	code_is_utf8 = 1;
	5355	else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
	5356	/* count of DO blocks */
	5357	ncode++;
	5358	}
	5359	if (ncode) {
	5360	pRExC_state->num_code_blocks = ncode;
	5361	Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
	5362	}
	5363	}
	5364
	5365	if (pat_count) {
	5366	/* handle a list of SVs */
	5367
	5368	SV **svp;
	5369
	5370	/* apply magic and RE overloading to each arg */
	5371	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5372	SV rx = svp;
	5373	SvGETMAGIC(rx);
	5374	if (SvROK(rx) && SvAMAGIC(rx)) {
	5375	SV *sv = AMG_CALLunary(rx, regexp_amg);
	5376	if (sv) {
	5377	if (SvROK(sv))
	5378	sv = SvRV(sv);
	5379	if (SvTYPE(sv) != SVt_REGEXP)
	5380	Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
	5381	*svp = sv;
	5382	}
	5383	}
	5384	}
	5385
	5386	if (pat_count > 1) {
	5387	/* concat multiple args and find any code block indexes */
	5388
	5389	OP *o = NULL;
	5390	int n = 0;
	5391	bool utf8 = 0;
	5392	STRLEN orig_patlen = 0;
	5393
	5394	if (pRExC_state->num_code_blocks) {
	5395	o = cLISTOPx(expr)->op_first;
	5396	assert(o->op_type == OP_PUSHMARK);
	5397	o = o->op_sibling;
	5398	}
	5399
	5400	pat = newSVpvn("", 0);
	5401	SAVEFREESV(pat);
	5402
	5403	/* determine if the pattern is going to be utf8 (needed
	5404	* in advance to align code block indices correctly).
	5405	* XXX This could fail to be detected for an arg with
	5406	* overloading but not concat overloading; but the main effect
	5407	* in this obscure case is to need a 'use re eval' for a
	5408	* literal code block */
	5409	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5410	if (SvUTF8(*svp))
	5411	utf8 = 1;
	5412	}
	5413	if (utf8)
	5414	SvUTF8_on(pat);
	5415
	5416	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5417	SV sv, msv = *svp;
	5418	SV *rx;
	5419	bool code = 0;
	5420	if (o) {
	5421	if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
	5422	assert(n < pRExC_state->num_code_blocks);
	5423	pRExC_state->code_blocks[n].start = SvCUR(pat);
	5424	pRExC_state->code_blocks[n].block = o;
	5425	pRExC_state->code_blocks[n].src_regex = NULL;
	5426	n++;
	5427	code = 1;
	5428	o = o->op_sibling; /* skip CONST */
	5429	assert(o);
	5430	}
	5431	o = o->op_sibling;;
	5432	}
	5433
	5434	if ((SvAMAGIC(pat) \|\| SvAMAGIC(msv)) &&
	5435	(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
	5436	{
	5437	sv_setsv(pat, sv);
	5438	/* overloading involved: all bets are off over literal
	5439	* code. Pretend we haven't seen it */
	5440	pRExC_state->num_code_blocks -= n;
	5441	n = 0;
	5442	rx = NULL;
	5443
	5444	}
	5445	else {
	5446	while (SvAMAGIC(msv)
	5447	&& (sv = AMG_CALLunary(msv, string_amg))
	5448	&& sv != msv)
	5449	{
	5450	msv = sv;
	5451	SvGETMAGIC(msv);
	5452	}
	5453	if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
	5454	msv = SvRV(msv);
	5455	orig_patlen = SvCUR(pat);
	5456	sv_catsv_nomg(pat, msv);
	5457	rx = msv;
	5458	if (code)
	5459	pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
	5460	}
	5461
	5462	/* extract any code blocks within any embedded qr//'s */
	5463	if (rx && SvTYPE(rx) == SVt_REGEXP
	5464	&& RX_ENGINE((REGEXP*)rx)->op_comp)
	5465	{
	5466
	5467	RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
	5468	if (ri->num_code_blocks) {
	5469	int i;
	5470	/* the presence of an embedded qr// with code means
	5471	* we should always recompile: the text of the
	5472	* qr// may not have changed, but it may be a
	5473	* different closure than last time */
	5474	recompile = 1;
	5475	Renew(pRExC_state->code_blocks,
	5476	pRExC_state->num_code_blocks + ri->num_code_blocks,
	5477	struct reg_code_block);
	5478	pRExC_state->num_code_blocks += ri->num_code_blocks;
	5479	for (i=0; i < ri->num_code_blocks; i++) {
	5480	struct reg_code_block src, dst;
	5481	STRLEN offset = orig_patlen
	5482	+ ((struct regexp *)SvANY(rx))->pre_prefix;
	5483	assert(n < pRExC_state->num_code_blocks);
	5484	src = &ri->code_blocks[i];
	5485	dst = &pRExC_state->code_blocks[n];
	5486	dst->start = src->start + offset;
	5487	dst->end = src->end + offset;
	5488	dst->block = src->block;
	5489	dst->src_regex = (REGEXP) SvREFCNT_inc( (SV)
	5490	src->src_regex
	5491	? src->src_regex
	5492	: (REGEXP*)rx);
	5493	n++;
	5494	}
	5495	}
	5496	}
	5497	}
	5498	SvSETMAGIC(pat);
	5499	}
	5500	else {
	5501	SV *sv;
	5502	pat = *patternp;
	5503	while (SvAMAGIC(pat)
	5504	&& (sv = AMG_CALLunary(pat, string_amg))
	5505	&& sv != pat)
	5506	{
	5507	pat = sv;
	5508	SvGETMAGIC(pat);
	5509	}
	5510	}
	5511
	5512	/* handle bare regex: foo =~ $re */
	5513	{
	5514	SV *re = pat;
	5515	if (SvROK(re))
	5516	re = SvRV(re);
	5517	if (SvTYPE(re) == SVt_REGEXP) {
	5518	if (is_bare_re)
	5519	*is_bare_re = TRUE;
	5520	SvREFCNT_inc(re);
	5521	Safefree(pRExC_state->code_blocks);
	5522	return (REGEXP*)re;
	5523	}
	5524	}
	5525	}
	5526	else {
	5527	/* not a list of SVs, so must be a list of OPs */
	5528	assert(expr);
	5529	if (expr->op_type == OP_LIST) {
	5530	int i = -1;
	5531	bool is_code = 0;
	5532	OP *o;
	5533
	5534	pat = newSVpvn("", 0);
	5535	SAVEFREESV(pat);
	5536	if (code_is_utf8)
	5537	SvUTF8_on(pat);
	5538
	5539	/* given a list of CONSTs and DO blocks in expr, append all
	5540	* the CONSTs to pat, and record the start and end of each
	5541	* code block in code_blocks[] (each DO{} op is followed by an
	5542	* OP_CONST containing the corresponding literal '(?{...})
	5543	* text)
	5544	*/
	5545	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5546	if (o->op_type == OP_CONST) {
	5547	sv_catsv(pat, cSVOPo_sv);
	5548	if (is_code) {
	5549	pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
	5550	is_code = 0;
	5551	}
	5552	}
	5553	else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
	5554	assert(i+1 < pRExC_state->num_code_blocks);
	5555	pRExC_state->code_blocks[++i].start = SvCUR(pat);
	5556	pRExC_state->code_blocks[i].block = o;
	5557	pRExC_state->code_blocks[i].src_regex = NULL;
	5558	is_code = 1;
	5559	}
	5560	}
	5561	}
	5562	else {
	5563	assert(expr->op_type == OP_CONST);
	5564	pat = cSVOPx_sv(expr);
	5565	}
	5566	}
	5567
	5568	exp = SvPV_nomg(pat, plen);
	5569
	5570	if (!eng->op_comp) {
	5571	if ((SvUTF8(pat) && IN_BYTES)
	5572	\|\| SvGMAGICAL(pat) \|\| SvAMAGIC(pat))
	5573	{
	5574	/* make a temporary copy; either to convert to bytes,
	5575	* or to avoid repeating get-magic / overloaded stringify */
	5576	pat = newSVpvn_flags(exp, plen, SVs_TEMP \|
	5577	(IN_BYTES ? 0 : SvUTF8(pat)));
	5578	}
	5579	Safefree(pRExC_state->code_blocks);
	5580	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
	5581	}
	5582
	5583	/* ignore the utf8ness if the pattern is 0 length */
	5584	RExC_utf8 = RExC_orig_utf8 = (plen == 0 \|\| IN_BYTES) ? 0 : SvUTF8(pat);
	5585	RExC_uni_semantics = 0;
	5586	RExC_contains_locale = 0;
	5587	pRExC_state->runtime_code_qr = NULL;
	5588
	5589	/**************** LONG JUMP TARGET HERE*********************/
	5590	/* Longjmp back to here if have to switch in midstream to utf8 */
	5591	if (! RExC_orig_utf8) {
	5592	JMPENV_PUSH(jump_ret);
	5593	used_setjump = TRUE;
	5594	}
	5595
	5596	if (jump_ret == 0) { /* First time through */
	5597	xend = exp + plen;
	5598
	5599	DEBUG_COMPILE_r({
	5600	SV *dsv= sv_newmortal();
	5601	RE_PV_QUOTED_DECL(s, RExC_utf8,
	5602	dsv, exp, plen, 60);
	5603	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	5604	PL_colors[4],PL_colors[5],s);
	5605	});
	5606	}
	5607	else { /* longjumped back */
	5608	U8 src, dst;
	5609	int n=0;
	5610	STRLEN s = 0, d = 0;
	5611	bool do_end = 0;
	5612
	5613	/* If the cause for the longjmp was other than changing to utf8, pop
	5614	* our own setjmp, and longjmp to the correct handler */
	5615	if (jump_ret != UTF8_LONGJMP) {
	5616	JMPENV_POP;
	5617	JMPENV_JUMP(jump_ret);
	5618	}
	5619
	5620	GET_RE_DEBUG_FLAGS;
	5621
	5622	/* It's possible to write a regexp in ascii that represents Unicode
	5623	codepoints outside of the byte range, such as via \x{100}. If we
	5624	detect such a sequence we have to convert the entire pattern to utf8
	5625	and then recompile, as our sizing calculation will have been based
	5626	on 1 byte == 1 character, but we will need to use utf8 to encode
	5627	at least some part of the pattern, and therefore must convert the whole
	5628	thing.
	5629	-- dmq */
	5630	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	5631	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	5632
	5633	/* upgrade pattern to UTF8, and if there are code blocks,
	5634	* recalculate the indices.
	5635	* This is essentially an unrolled Perl_bytes_to_utf8() */
	5636
	5637	src = (U8*)SvPV_nomg(pat, plen);
	5638	Newx(dst, plen * 2 + 1, U8);
	5639
	5640	while (s < plen) {
	5641	const UV uv = NATIVE_TO_ASCII(src[s]);
	5642	if (UNI_IS_INVARIANT(uv))
	5643	dst[d] = (U8)UTF_TO_NATIVE(uv);
	5644	else {
	5645	dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
	5646	dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
	5647	}
	5648	if (n < pRExC_state->num_code_blocks) {
	5649	if (!do_end && pRExC_state->code_blocks[n].start == s) {
	5650	pRExC_state->code_blocks[n].start = d;
	5651	assert(dst[d] == '(');
	5652	do_end = 1;
	5653	}
	5654	else if (do_end && pRExC_state->code_blocks[n].end == s) {
	5655	pRExC_state->code_blocks[n].end = d;
	5656	assert(dst[d] == ')');
	5657	do_end = 0;
	5658	n++;
	5659	}
	5660	}
	5661	s++;
	5662	d++;
	5663	}
	5664	dst[d] = '\0';
	5665	plen = d;
	5666	exp = (char*) dst;
	5667	xend = exp + plen;
	5668	SAVEFREEPV(exp);
	5669	RExC_orig_utf8 = RExC_utf8 = 1;
	5670	}
	5671
	5672	/* return old regex if pattern hasn't changed */
	5673
	5674	if ( old_re
	5675	&& !recompile
	5676	&& !!RX_UTF8(old_re) == !!RExC_utf8
	5677	&& RX_PRECOMP(old_re)
	5678	&& RX_PRELEN(old_re) == plen
	5679	&& memEQ(RX_PRECOMP(old_re), exp, plen))
	5680	{
	5681	/* with runtime code, always recompile */
	5682	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
	5683	exp, plen);
	5684	if (!runtime_code) {
	5685	ReREFCNT_inc(old_re);
	5686	if (used_setjump) {
	5687	JMPENV_POP;
	5688	}
	5689	Safefree(pRExC_state->code_blocks);
	5690	return old_re;
	5691	}
	5692	}
	5693	else if ((pm_flags & PMf_USE_RE_EVAL)
	5694	/* this second condition covers the non-regex literal case,
	5695	* i.e. $foo =~ '(?{})'. */
	5696	\|\| ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
	5697	&& (PL_hints & HINT_RE_EVAL))
	5698	)
	5699	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
	5700	exp, plen);
	5701
	5702	#ifdef TRIE_STUDY_OPT
	5703	restudied = 0;
	5704	#endif
	5705
	5706	rx_flags = orig_rx_flags;
	5707
	5708	if (initial_charset == REGEX_LOCALE_CHARSET) {
	5709	RExC_contains_locale = 1;
	5710	}
	5711	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	5712
	5713	/* Set to use unicode semantics if the pattern is in utf8 and has the
	5714	* 'depends' charset specified, as it means unicode when utf8 */
	5715	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5716	}
	5717
	5718	RExC_precomp = exp;
	5719	RExC_flags = rx_flags;
	5720	RExC_pm_flags = pm_flags;
	5721
	5722	if (runtime_code) {
	5723	if (PL_tainting && PL_tainted)
	5724	Perl_croak(aTHX_ "Eval-group in insecure regular expression");
	5725
	5726	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
	5727	/* whoops, we have a non-utf8 pattern, whilst run-time code
	5728	* got compiled as utf8. Try again with a utf8 pattern */
	5729	JMPENV_JUMP(UTF8_LONGJMP);
	5730	}
	5731	}
	5732	assert(!pRExC_state->runtime_code_qr);
	5733
	5734	RExC_sawback = 0;
	5735
	5736	RExC_seen = 0;
	5737	RExC_in_lookbehind = 0;
	5738	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	5739	RExC_extralen = 0;
	5740	RExC_override_recoding = 0;
	5741
	5742	/* First pass: determine size, legality. */
	5743	RExC_parse = exp;
	5744	RExC_start = exp;
	5745	RExC_end = xend;
	5746	RExC_naughty = 0;
	5747	RExC_npar = 1;
	5748	RExC_nestroot = 0;
	5749	RExC_size = 0L;
	5750	RExC_emit = &PL_regdummy;
	5751	RExC_whilem_seen = 0;
	5752	RExC_open_parens = NULL;
	5753	RExC_close_parens = NULL;
	5754	RExC_opend = NULL;
	5755	RExC_paren_names = NULL;
	5756	#ifdef DEBUGGING
	5757	RExC_paren_name_list = NULL;
	5758	#endif
	5759	RExC_recurse = NULL;
	5760	RExC_recurse_count = 0;
	5761	pRExC_state->code_index = 0;
	5762
	5763	#if 0 /* REGC() is (currently) a NOP at the first pass.
	5764	* Clever compilers notice this and complain. --jhi */
	5765	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	5766	#endif
	5767	DEBUG_PARSE_r(
	5768	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	5769	RExC_lastnum=0;
	5770	RExC_lastparse=NULL;
	5771	);
	5772	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5773	RExC_precomp = NULL;
	5774	Safefree(pRExC_state->code_blocks);
	5775	return(NULL);
	5776	}
	5777
	5778	/* Here, finished first pass. Get rid of any added setjmp */
	5779	if (used_setjump) {
	5780	JMPENV_POP;
	5781	}
	5782
	5783	DEBUG_PARSE_r({
	5784	PerlIO_printf(Perl_debug_log,
	5785	"Required size %"IVdf" nodes\n"
	5786	"Starting second pass (creation)\n",
	5787	(IV)RExC_size);
	5788	RExC_lastnum=0;
	5789	RExC_lastparse=NULL;
	5790	});
	5791
	5792	/* The first pass could have found things that force Unicode semantics */
	5793	if ((RExC_utf8 \|\| RExC_uni_semantics)
	5794	&& get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
	5795	{
	5796	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5797	}
	5798
	5799	/* Small enough for pointer-storage convention?
	5800	If extralen==0, this means that we will not need long jumps. */
	5801	if (RExC_size >= 0x10000L && RExC_extralen)
	5802	RExC_size += RExC_extralen;
	5803	else
	5804	RExC_extralen = 0;
	5805	if (RExC_whilem_seen > 15)
	5806	RExC_whilem_seen = 15;
	5807
	5808	/* Allocate space and zero-initialize. Note, the two step process
	5809	of zeroing when in debug mode, thus anything assigned has to
	5810	happen after that */
	5811	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	5812	r = (struct regexp*)SvANY(rx);
	5813	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	5814	char, regexp_internal);
	5815	if ( r == NULL \|\| ri == NULL )
	5816	FAIL("Regexp out of space");
	5817	#ifdef DEBUGGING
	5818	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	5819	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	5820	#else
	5821	/* bulk initialize base fields with 0. */
	5822	Zero(ri, sizeof(regexp_internal), char);
	5823	#endif
	5824
	5825	/* non-zero initialization begins here */
	5826	RXi_SET( r, ri );
	5827	r->engine= eng;
	5828	r->extflags = rx_flags;
	5829	if (pm_flags & PMf_IS_QR) {
	5830	ri->code_blocks = pRExC_state->code_blocks;
	5831	ri->num_code_blocks = pRExC_state->num_code_blocks;
	5832	}
	5833	else
	5834	SAVEFREEPV(pRExC_state->code_blocks);
	5835
	5836	{
	5837	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	5838	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	5839
	5840	/* The caret is output if there are any defaults: if not all the STD
	5841	* flags are set, or if no character set specifier is needed */
	5842	bool has_default =
	5843	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	5844	\|\| ! has_charset);
	5845	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	5846	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	5847	>> RXf_PMf_STD_PMMOD_SHIFT);
	5848	const char fptr = STD_PAT_MODS; /"msix"*/
	5849	char *p;
	5850	/* Allocate for the worst case, which is all the std flags are turned
	5851	* on. If more precision is desired, we could do a population count of
	5852	* the flags set. This could be done with a small lookup table, or by
	5853	* shifting, masking and adding, or even, when available, assembly
	5854	* language for a machine-language population count.
	5855	* We never output a minus, as all those are defaults, so are
	5856	* covered by the caret */
	5857	const STRLEN wraplen = plen + has_p + has_runon
	5858	+ has_default /* If needs a caret */
	5859
	5860	/* If needs a character set specifier */
	5861	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	5862	+ (sizeof(STD_PAT_MODS) - 1)
	5863	+ (sizeof("(?:)") - 1);
	5864
	5865	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	5866	SvPOK_on(rx);
	5867	if (RExC_utf8)
	5868	SvFLAGS(rx) \|= SVf_UTF8;
	5869	p++='('; p++='?';
	5870
	5871	/* If a default, cover it using the caret */
	5872	if (has_default) {
	5873	*p++= DEFAULT_PAT_MOD;
	5874	}
	5875	if (has_charset) {
	5876	STRLEN len;
	5877	const char* const name = get_regex_charset_name(r->extflags, &len);
	5878	Copy(name, p, len, char);
	5879	p += len;
	5880	}
	5881	if (has_p)
	5882	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	5883	{
	5884	char ch;
	5885	while((ch = *fptr++)) {
	5886	if(reganch & 1)
	5887	*p++ = ch;
	5888	reganch >>= 1;
	5889	}
	5890	}
	5891
	5892	*p++ = ':';
	5893	Copy(RExC_precomp, p, plen, char);
	5894	assert ((RX_WRAPPED(rx) - p) < 16);
	5895	r->pre_prefix = p - RX_WRAPPED(rx);
	5896	p += plen;
	5897	if (has_runon)
	5898	*p++ = '\n';
	5899	*p++ = ')';
	5900	*p = 0;
	5901	SvCUR_set(rx, p - SvPVX_const(rx));
	5902	}
	5903
	5904	r->intflags = 0;
	5905	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	5906
	5907	if (RExC_seen & REG_SEEN_RECURSE) {
	5908	Newxz(RExC_open_parens, RExC_npar,regnode *);
	5909	SAVEFREEPV(RExC_open_parens);
	5910	Newxz(RExC_close_parens,RExC_npar,regnode *);
	5911	SAVEFREEPV(RExC_close_parens);
	5912	}
	5913
	5914	/* Useful during FAIL. */
	5915	#ifdef RE_TRACK_PATTERN_OFFSETS
	5916	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	5917	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	5918	"%s %"UVuf" bytes for offset annotations.\n",
	5919	ri->u.offsets ? "Got" : "Couldn't get",
	5920	(UV)((2RExC_size+1) sizeof(U32))));
	5921	#endif
	5922	SetProgLen(ri,RExC_size);
	5923	RExC_rx_sv = rx;
	5924	RExC_rx = r;
	5925	RExC_rxi = ri;
	5926
	5927	/* Second pass: emit code. */
	5928	RExC_flags = rx_flags; /* don't let top level (?i) bleed */
	5929	RExC_pm_flags = pm_flags;
	5930	RExC_parse = exp;
	5931	RExC_end = xend;
	5932	RExC_naughty = 0;
	5933	RExC_npar = 1;
	5934	RExC_emit_start = ri->program;
	5935	RExC_emit = ri->program;
	5936	RExC_emit_bound = ri->program + RExC_size + 1;
	5937	pRExC_state->code_index = 0;
	5938
	5939	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	5940	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5941	ReREFCNT_dec(rx);
	5942	return(NULL);
	5943	}
	5944	/* XXXX To minimize changes to RE engine we always allocate
	5945	3-units-long substrs field. */
	5946	Newx(r->substrs, 1, struct reg_substr_data);
	5947	if (RExC_recurse_count) {
	5948	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	5949	SAVEFREEPV(RExC_recurse);
	5950	}
	5951
	5952	reStudy:
	5953	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	5954	Zero(r->substrs, 1, struct reg_substr_data);
	5955
	5956	#ifdef TRIE_STUDY_OPT
	5957	if (!restudied) {
	5958	StructCopy(&zero_scan_data, &data, scan_data_t);
	5959	copyRExC_state = RExC_state;
	5960	} else {
	5961	U32 seen=RExC_seen;
	5962	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	5963
	5964	RExC_state = copyRExC_state;
	5965	if (seen & REG_TOP_LEVEL_BRANCHES)
	5966	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	5967	else
	5968	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	5969	if (data.last_found) {
	5970	SvREFCNT_dec(data.longest_fixed);
	5971	SvREFCNT_dec(data.longest_float);
	5972	SvREFCNT_dec(data.last_found);
	5973	}
	5974	StructCopy(&zero_scan_data, &data, scan_data_t);
	5975	}
	5976	#else
	5977	StructCopy(&zero_scan_data, &data, scan_data_t);
	5978	#endif
	5979
	5980	/* Dig out information for optimizations. */
	5981	r->extflags = RExC_flags; /* was pm_op */
	5982	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	5983
	5984	if (UTF)
	5985	SvUTF8_on(rx); /* Unicode in it? */
	5986	ri->regstclass = NULL;
	5987	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	5988	r->intflags \|= PREGf_NAUGHTY;
	5989	scan = ri->program + 1; /* First BRANCH. */
	5990
	5991	/* testing for BRANCH here tells us whether there is "must appear"
	5992	data in the pattern. If there is then we can use it for optimisations */
	5993	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	5994	I32 fake;
	5995	STRLEN longest_float_length, longest_fixed_length;
	5996	struct regnode_charclass_class ch_class; /* pointed to by data */
	5997	int stclass_flag;
	5998	I32 last_close = 0; /* pointed to by data */
	5999	regnode *first= scan;
	6000	regnode *first_next= regnext(first);
	6001	/*
	6002	* Skip introductions and multiplicators >= 1
	6003	* so that we can extract the 'meat' of the pattern that must
	6004	* match in the large if() sequence following.
	6005	* NOTE that EXACT is NOT covered here, as it is normally
	6006	* picked up by the optimiser separately.
	6007	*
	6008	* This is unfortunate as the optimiser isnt handling lookahead
	6009	* properly currently.
	6010	*
	6011	*/
	6012	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	6013	/* An OR of one alternative - should not happen now. */
	6014	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	6015	/* for now we can't handle lookbehind IFMATCH*/
	6016	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	6017	(OP(first) == PLUS) \|\|
	6018	(OP(first) == MINMOD) \|\|
	6019	/* An {n,m} with n>0 */
	6020	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	6021	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	6022	{
	6023	/*
	6024	* the only op that could be a regnode is PLUS, all the rest
	6025	* will be regnode_1 or regnode_2.
	6026	*
	6027	*/
	6028	if (OP(first) == PLUS)
	6029	sawplus = 1;
	6030	else
	6031	first += regarglen[OP(first)];
	6032
	6033	first = NEXTOPER(first);
	6034	first_next= regnext(first);
	6035	}
	6036
	6037	/* Starting-point info. */
	6038	again:
	6039	DEBUG_PEEP("first:",first,0);
	6040	/* Ignore EXACT as we deal with it later. */
	6041	if (PL_regkind[OP(first)] == EXACT) {
	6042	if (OP(first) == EXACT)
	6043	NOOP; /* Empty, get anchored substr later. */
	6044	else
	6045	ri->regstclass = first;
	6046	}
	6047	#ifdef TRIE_STCLASS
	6048	else if (PL_regkind[OP(first)] == TRIE &&
	6049	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	6050	{
	6051	regnode *trie_op;
	6052	/* this can happen only on restudy */
	6053	if ( OP(first) == TRIE ) {
	6054	struct regnode_1 trieop = (struct regnode_1 )
	6055	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	6056	StructCopy(first,trieop,struct regnode_1);
	6057	trie_op=(regnode *)trieop;
	6058	} else {
	6059	struct regnode_charclass trieop = (struct regnode_charclass )
	6060	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	6061	StructCopy(first,trieop,struct regnode_charclass);
	6062	trie_op=(regnode *)trieop;
	6063	}
	6064	OP(trie_op)+=2;
	6065	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	6066	ri->regstclass = trie_op;
	6067	}
	6068	#endif
	6069	else if (REGNODE_SIMPLE(OP(first)))
	6070	ri->regstclass = first;
	6071	else if (PL_regkind[OP(first)] == BOUND \|\|
	6072	PL_regkind[OP(first)] == NBOUND)
	6073	ri->regstclass = first;
	6074	else if (PL_regkind[OP(first)] == BOL) {
	6075	r->extflags \|= (OP(first) == MBOL
	6076	? RXf_ANCH_MBOL
	6077	: (OP(first) == SBOL
	6078	? RXf_ANCH_SBOL
	6079	: RXf_ANCH_BOL));
	6080	first = NEXTOPER(first);
	6081	goto again;
	6082	}
	6083	else if (OP(first) == GPOS) {
	6084	r->extflags \|= RXf_ANCH_GPOS;
	6085	first = NEXTOPER(first);
	6086	goto again;
	6087	}
	6088	else if ((!sawopen \|\| !RExC_sawback) &&
	6089	(OP(first) == STAR &&
	6090	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	6091	!(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
	6092	{
	6093	/* turn .* into ^.* with an implied $=1 /
	6094	const int type =
	6095	(OP(NEXTOPER(first)) == REG_ANY)
	6096	? RXf_ANCH_MBOL
	6097	: RXf_ANCH_SBOL;
	6098	r->extflags \|= type;
	6099	r->intflags \|= PREGf_IMPLICIT;
	6100	first = NEXTOPER(first);
	6101	goto again;
	6102	}
	6103	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	6104	&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
	6105	/* x+ must match at the 1st pos of run of x's */
	6106	r->intflags \|= PREGf_SKIP;
	6107
	6108	/* Scan is after the zeroth branch, first is atomic matcher. */
	6109	#ifdef TRIE_STUDY_OPT
	6110	DEBUG_PARSE_r(
	6111	if (!restudied)
	6112	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6113	(IV)(first - scan + 1))
	6114	);
	6115	#else
	6116	DEBUG_PARSE_r(
	6117	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6118	(IV)(first - scan + 1))
	6119	);
	6120	#endif
	6121
	6122
	6123	/*
	6124	* If there's something expensive in the r.e., find the
	6125	* longest literal string that must appear and make it the
	6126	* regmust. Resolve ties in favor of later strings, since
	6127	* the regstart check works with the beginning of the r.e.
	6128	* and avoiding duplication strengthens checking. Not a
	6129	* strong reason, but sufficient in the absence of others.
	6130	* [Now we resolve ties in favor of the earlier string if
	6131	* it happens that c_offset_min has been invalidated, since the
	6132	* earlier string may buy us something the later one won't.]
	6133	*/
	6134
	6135	data.longest_fixed = newSVpvs("");
	6136	data.longest_float = newSVpvs("");
	6137	data.last_found = newSVpvs("");
	6138	data.longest = &(data.longest_fixed);
	6139	first = scan;
	6140	if (!ri->regstclass) {
	6141	cl_init(pRExC_state, &ch_class);
	6142	data.start_class = &ch_class;
	6143	stclass_flag = SCF_DO_STCLASS_AND;
	6144	} else /* XXXX Check for BOUND? */
	6145	stclass_flag = 0;
	6146	data.last_closep = &last_close;
	6147
	6148	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	6149	&data, -1, NULL, NULL,
	6150	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	6151
	6152
	6153	CHECK_RESTUDY_GOTO;
	6154
	6155
	6156	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	6157	&& data.last_start_min == 0 && data.last_end > 0
	6158	&& !RExC_seen_zerolen
	6159	&& !(RExC_seen & REG_SEEN_VERBARG)
	6160	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	6161	r->extflags \|= RXf_CHECK_ALL;
	6162	scan_commit(pRExC_state, &data,&minlen,0);
	6163	SvREFCNT_dec(data.last_found);
	6164
	6165	/* Note that code very similar to this but for anchored string
	6166	follows immediately below, changes may need to be made to both.
	6167	Be careful.
	6168	*/
	6169	longest_float_length = CHR_SVLEN(data.longest_float);
	6170	if (longest_float_length
	6171	\|\| (data.flags & SF_FL_BEFORE_EOL
	6172	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	6173	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	6174	{
	6175	I32 t,ml;
	6176
	6177	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	6178	if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	6179	\|\| (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	6180	&& data.offset_fixed == data.offset_float_min
	6181	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	6182	goto remove_float; /* As in (a)+. */
	6183
	6184	/* copy the information about the longest float from the reg_scan_data
	6185	over to the program. */
	6186	if (SvUTF8(data.longest_float)) {
	6187	r->float_utf8 = data.longest_float;
	6188	r->float_substr = NULL;
	6189	} else {
	6190	r->float_substr = data.longest_float;
	6191	r->float_utf8 = NULL;
	6192	}
	6193	/* float_end_shift is how many chars that must be matched that
	6194	follow this item. We calculate it ahead of time as once the
	6195	lookbehind offset is added in we lose the ability to correctly
	6196	calculate it.*/
	6197	ml = data.minlen_float ? *(data.minlen_float)
	6198	: (I32)longest_float_length;
	6199	r->float_end_shift = ml - data.offset_float_min
	6200	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	6201	+ data.lookbehind_float;
	6202	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	6203	r->float_max_offset = data.offset_float_max;
	6204	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	6205	r->float_max_offset -= data.lookbehind_float;
	6206
	6207	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	6208	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	6209	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	6210	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	6211	}
	6212	else {
	6213	remove_float:
	6214	r->float_substr = r->float_utf8 = NULL;
	6215	SvREFCNT_dec(data.longest_float);
	6216	longest_float_length = 0;
	6217	}
	6218
	6219	/* Note that code very similar to this but for floating string
	6220	is immediately above, changes may need to be made to both.
	6221	Be careful.
	6222	*/
	6223	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	6224
	6225	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	6226	if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
	6227	&& (longest_fixed_length
	6228	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	6229	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	6230	\|\| (RExC_flags & RXf_PMf_MULTILINE)))) )
	6231	{
	6232	I32 t,ml;
	6233
	6234	/* copy the information about the longest fixed
	6235	from the reg_scan_data over to the program. */
	6236	if (SvUTF8(data.longest_fixed)) {
	6237	r->anchored_utf8 = data.longest_fixed;
	6238	r->anchored_substr = NULL;
	6239	} else {
	6240	r->anchored_substr = data.longest_fixed;
	6241	r->anchored_utf8 = NULL;
	6242	}
	6243	/* fixed_end_shift is how many chars that must be matched that
	6244	follow this item. We calculate it ahead of time as once the
	6245	lookbehind offset is added in we lose the ability to correctly
	6246	calculate it.*/
	6247	ml = data.minlen_fixed ? *(data.minlen_fixed)
	6248	: (I32)longest_fixed_length;
	6249	r->anchored_end_shift = ml - data.offset_fixed
	6250	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	6251	+ data.lookbehind_fixed;
	6252	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	6253
	6254	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	6255	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	6256	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	6257	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	6258	}
	6259	else {
	6260	r->anchored_substr = r->anchored_utf8 = NULL;
	6261	SvREFCNT_dec(data.longest_fixed);
	6262	longest_fixed_length = 0;
	6263	}
	6264	if (ri->regstclass
	6265	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	6266	ri->regstclass = NULL;
	6267
	6268	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	6269	&& stclass_flag
	6270	&& !(data.start_class->flags & ANYOF_EOS)
	6271	&& !cl_is_anything(data.start_class))
	6272	{
	6273	const U32 n = add_data(pRExC_state, 1, "f");
	6274	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	6275
	6276	Newx(RExC_rxi->data->data[n], 1,
	6277	struct regnode_charclass_class);
	6278	StructCopy(data.start_class,
	6279	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6280	struct regnode_charclass_class);
	6281	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6282	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6283	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	6284	regprop(r, sv, (regnode*)data.start_class);
	6285	PerlIO_printf(Perl_debug_log,
	6286	"synthetic stclass \"%s\".\n",
	6287	SvPVX_const(sv));});
	6288	}
	6289
	6290	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	6291	if (longest_fixed_length > longest_float_length) {
	6292	r->check_end_shift = r->anchored_end_shift;
	6293	r->check_substr = r->anchored_substr;
	6294	r->check_utf8 = r->anchored_utf8;
	6295	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	6296	if (r->extflags & RXf_ANCH_SINGLE)
	6297	r->extflags \|= RXf_NOSCAN;
	6298	}
	6299	else {
	6300	r->check_end_shift = r->float_end_shift;
	6301	r->check_substr = r->float_substr;
	6302	r->check_utf8 = r->float_utf8;
	6303	r->check_offset_min = r->float_min_offset;
	6304	r->check_offset_max = r->float_max_offset;
	6305	}
	6306	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	6307	This should be changed ASAP! */
	6308	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	6309	r->extflags \|= RXf_USE_INTUIT;
	6310	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	6311	r->extflags \|= RXf_INTUIT_TAIL;
	6312	}
	6313	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	6314	if ( (STRLEN)minlen < longest_float_length )
	6315	minlen= longest_float_length;
	6316	if ( (STRLEN)minlen < longest_fixed_length )
	6317	minlen= longest_fixed_length;
	6318	*/
	6319	}
	6320	else {
	6321	/* Several toplevels. Best we can is to set minlen. */
	6322	I32 fake;
	6323	struct regnode_charclass_class ch_class;
	6324	I32 last_close = 0;
	6325
	6326	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	6327
	6328	scan = ri->program + 1;
	6329	cl_init(pRExC_state, &ch_class);
	6330	data.start_class = &ch_class;
	6331	data.last_closep = &last_close;
	6332
	6333
	6334	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	6335	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	6336
	6337	CHECK_RESTUDY_GOTO;
	6338
	6339	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	6340	= r->float_substr = r->float_utf8 = NULL;
	6341
	6342	if (!(data.start_class->flags & ANYOF_EOS)
	6343	&& !cl_is_anything(data.start_class))
	6344	{
	6345	const U32 n = add_data(pRExC_state, 1, "f");
	6346	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	6347
	6348	Newx(RExC_rxi->data->data[n], 1,
	6349	struct regnode_charclass_class);
	6350	StructCopy(data.start_class,
	6351	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6352	struct regnode_charclass_class);
	6353	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6354	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6355	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	6356	regprop(r, sv, (regnode*)data.start_class);
	6357	PerlIO_printf(Perl_debug_log,
	6358	"synthetic stclass \"%s\".\n",
	6359	SvPVX_const(sv));});
	6360	}
	6361	}
	6362
	6363	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	6364	the "real" pattern. */
	6365	DEBUG_OPTIMISE_r({
	6366	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	6367	(IV)minlen, (IV)r->minlen);
	6368	});
	6369	r->minlenret = minlen;
	6370	if (r->minlen < minlen)
	6371	r->minlen = minlen;
	6372
	6373	if (RExC_seen & REG_SEEN_GPOS)
	6374	r->extflags \|= RXf_GPOS_SEEN;
	6375	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	6376	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	6377	if (pRExC_state->num_code_blocks)
	6378	r->extflags \|= RXf_EVAL_SEEN;
	6379	if (RExC_seen & REG_SEEN_CANY)
	6380	r->extflags \|= RXf_CANY_SEEN;
	6381	if (RExC_seen & REG_SEEN_VERBARG)
	6382	r->intflags \|= PREGf_VERBARG_SEEN;
	6383	if (RExC_seen & REG_SEEN_CUTGROUP)
	6384	r->intflags \|= PREGf_CUTGROUP_SEEN;
	6385	if (pm_flags & PMf_USE_RE_EVAL)
	6386	r->intflags \|= PREGf_USE_RE_EVAL;
	6387	if (RExC_paren_names)
	6388	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	6389	else
	6390	RXp_PAREN_NAMES(r) = NULL;
	6391
	6392	#ifdef STUPID_PATTERN_CHECKS
	6393	if (RX_PRELEN(rx) == 0)
	6394	r->extflags \|= RXf_NULL;
	6395	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	6396	/* XXX: this should happen BEFORE we compile */
	6397	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	6398	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	6399	r->extflags \|= RXf_WHITE;
	6400	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	6401	r->extflags \|= RXf_START_ONLY;
	6402	#else
	6403	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	6404	/* XXX: this should happen BEFORE we compile */
	6405	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	6406	else {
	6407	regnode *first = ri->program + 1;
	6408	U8 fop = OP(first);
	6409
	6410	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	6411	r->extflags \|= RXf_NULL;
	6412	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	6413	r->extflags \|= RXf_START_ONLY;
	6414	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	6415	&& OP(regnext(first)) == END)
	6416	r->extflags \|= RXf_WHITE;
	6417	}
	6418	#endif
	6419	#ifdef DEBUGGING
	6420	if (RExC_paren_names) {
	6421	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	6422	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	6423	} else
	6424	#endif
	6425	ri->name_list_idx = 0;
	6426
	6427	if (RExC_recurse_count) {
	6428	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	6429	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	6430	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	6431	}
	6432	}
	6433	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	6434	/* assume we don't need to swap parens around before we match */
	6435
	6436	DEBUG_DUMP_r({
	6437	PerlIO_printf(Perl_debug_log,"Final program:\n");
	6438	regdump(r);
	6439	});
	6440	#ifdef RE_TRACK_PATTERN_OFFSETS
	6441	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	6442	const U32 len = ri->u.offsets[0];
	6443	U32 i;
	6444	GET_RE_DEBUG_FLAGS_DECL;
	6445	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	6446	for (i = 1; i <= len; i++) {
	6447	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	6448	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	6449	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	6450	}
	6451	PerlIO_printf(Perl_debug_log, "\n");
	6452	});
	6453	#endif
	6454	return rx;
	6455	}
	6456
	6457
	6458	SV*
	6459	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	6460	const U32 flags)
	6461	{
	6462	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	6463
	6464	PERL_UNUSED_ARG(value);
	6465
	6466	if (flags & RXapif_FETCH) {
	6467	return reg_named_buff_fetch(rx, key, flags);
	6468	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	6469	Perl_croak_no_modify(aTHX);
	6470	return NULL;
	6471	} else if (flags & RXapif_EXISTS) {
	6472	return reg_named_buff_exists(rx, key, flags)
	6473	? &PL_sv_yes
	6474	: &PL_sv_no;
	6475	} else if (flags & RXapif_REGNAMES) {
	6476	return reg_named_buff_all(rx, flags);
	6477	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	6478	return reg_named_buff_scalar(rx, flags);
	6479	} else {
	6480	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	6481	return NULL;
	6482	}
	6483	}
	6484
	6485	SV*
	6486	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	6487	const U32 flags)
	6488	{
	6489	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	6490	PERL_UNUSED_ARG(lastkey);
	6491
	6492	if (flags & RXapif_FIRSTKEY)
	6493	return reg_named_buff_firstkey(rx, flags);
	6494	else if (flags & RXapif_NEXTKEY)
	6495	return reg_named_buff_nextkey(rx, flags);
	6496	else {
	6497	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	6498	return NULL;
	6499	}
	6500	}
	6501
	6502	SV*
	6503	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	6504	const U32 flags)
	6505	{
	6506	AV *retarray = NULL;
	6507	SV *ret;
	6508	struct regexp const rx = (struct regexp )SvANY(r);
	6509
	6510	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	6511
	6512	if (flags & RXapif_ALL)
	6513	retarray=newAV();
	6514
	6515	if (rx && RXp_PAREN_NAMES(rx)) {
	6516	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	6517	if (he_str) {
	6518	IV i;
	6519	SV* sv_dat=HeVAL(he_str);
	6520	I32 nums=(I32)SvPVX(sv_dat);
	6521	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	6522	if ((I32)(rx->nparens) >= nums[i]
	6523	&& rx->offs[nums[i]].start != -1
	6524	&& rx->offs[nums[i]].end != -1)
	6525	{
	6526	ret = newSVpvs("");
	6527	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	6528	if (!retarray)
	6529	return ret;
	6530	} else {
	6531	if (retarray)
	6532	ret = newSVsv(&PL_sv_undef);
	6533	}
	6534	if (retarray)
	6535	av_push(retarray, ret);
	6536	}
	6537	if (retarray)
	6538	return newRV_noinc(MUTABLE_SV(retarray));
	6539	}
	6540	}
	6541	return NULL;
	6542	}
	6543
	6544	bool
	6545	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	6546	const U32 flags)
	6547	{
	6548	struct regexp const rx = (struct regexp )SvANY(r);
	6549
	6550	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	6551
	6552	if (rx && RXp_PAREN_NAMES(rx)) {
	6553	if (flags & RXapif_ALL) {
	6554	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	6555	} else {
	6556	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	6557	if (sv) {
	6558	SvREFCNT_dec(sv);
	6559	return TRUE;
	6560	} else {
	6561	return FALSE;
	6562	}
	6563	}
	6564	} else {
	6565	return FALSE;
	6566	}
	6567	}
	6568
	6569	SV*
	6570	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	6571	{
	6572	struct regexp const rx = (struct regexp )SvANY(r);
	6573
	6574	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	6575
	6576	if ( rx && RXp_PAREN_NAMES(rx) ) {
	6577	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	6578
	6579	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	6580	} else {
	6581	return FALSE;
	6582	}
	6583	}
	6584
	6585	SV*
	6586	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	6587	{
	6588	struct regexp const rx = (struct regexp )SvANY(r);
	6589	GET_RE_DEBUG_FLAGS_DECL;
	6590
	6591	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	6592
	6593	if (rx && RXp_PAREN_NAMES(rx)) {
	6594	HV *hv = RXp_PAREN_NAMES(rx);
	6595	HE *temphe;
	6596	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6597	IV i;
	6598	IV parno = 0;
	6599	SV* sv_dat = HeVAL(temphe);
	6600	I32 nums = (I32)SvPVX(sv_dat);
	6601	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6602	if ((I32)(rx->lastparen) >= nums[i] &&
	6603	rx->offs[nums[i]].start != -1 &&
	6604	rx->offs[nums[i]].end != -1)
	6605	{
	6606	parno = nums[i];
	6607	break;
	6608	}
	6609	}
	6610	if (parno \|\| flags & RXapif_ALL) {
	6611	return newSVhek(HeKEY_hek(temphe));
	6612	}
	6613	}
	6614	}
	6615	return NULL;
	6616	}
	6617
	6618	SV*
	6619	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	6620	{
	6621	SV *ret;
	6622	AV *av;
	6623	I32 length;
	6624	struct regexp const rx = (struct regexp )SvANY(r);
	6625
	6626	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	6627
	6628	if (rx && RXp_PAREN_NAMES(rx)) {
	6629	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	6630	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	6631	} else if (flags & RXapif_ONE) {
	6632	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	6633	av = MUTABLE_AV(SvRV(ret));
	6634	length = av_len(av);
	6635	SvREFCNT_dec(ret);
	6636	return newSViv(length + 1);
	6637	} else {
	6638	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	6639	return NULL;
	6640	}
	6641	}
	6642	return &PL_sv_undef;
	6643	}
	6644
	6645	SV*
	6646	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	6647	{
	6648	struct regexp const rx = (struct regexp )SvANY(r);
	6649	AV *av = newAV();
	6650
	6651	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	6652
	6653	if (rx && RXp_PAREN_NAMES(rx)) {
	6654	HV *hv= RXp_PAREN_NAMES(rx);
	6655	HE *temphe;
	6656	(void)hv_iterinit(hv);
	6657	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6658	IV i;
	6659	IV parno = 0;
	6660	SV* sv_dat = HeVAL(temphe);
	6661	I32 nums = (I32)SvPVX(sv_dat);
	6662	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6663	if ((I32)(rx->lastparen) >= nums[i] &&
	6664	rx->offs[nums[i]].start != -1 &&
	6665	rx->offs[nums[i]].end != -1)
	6666	{
	6667	parno = nums[i];
	6668	break;
	6669	}
	6670	}
	6671	if (parno \|\| flags & RXapif_ALL) {
	6672	av_push(av, newSVhek(HeKEY_hek(temphe)));
	6673	}
	6674	}
	6675	}
	6676
	6677	return newRV_noinc(MUTABLE_SV(av));
	6678	}
	6679
	6680	void
	6681	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	6682	SV * const sv)
	6683	{
	6684	struct regexp const rx = (struct regexp )SvANY(r);
	6685	char *s = NULL;
	6686	I32 i = 0;
	6687	I32 s1, t1;
	6688
	6689	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	6690
	6691	if (!rx->subbeg) {
	6692	sv_setsv(sv,&PL_sv_undef);
	6693	return;
	6694	}
	6695	else
	6696	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	6697	/* $` */
	6698	i = rx->offs[0].start;
	6699	s = rx->subbeg;
	6700	}
	6701	else
	6702	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	6703	/* $' */
	6704	s = rx->subbeg + rx->offs[0].end;
	6705	i = rx->sublen - rx->offs[0].end;
	6706	}
	6707	else
	6708	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	6709	(s1 = rx->offs[paren].start) != -1 &&
	6710	(t1 = rx->offs[paren].end) != -1)
	6711	{
	6712	/* $& $1 ... */
	6713	i = t1 - s1;
	6714	s = rx->subbeg + s1;
	6715	} else {
	6716	sv_setsv(sv,&PL_sv_undef);
	6717	return;
	6718	}
	6719	assert(rx->sublen >= (s - rx->subbeg) + i );
	6720	if (i >= 0) {
	6721	const int oldtainted = PL_tainted;
	6722	TAINT_NOT;
	6723	sv_setpvn(sv, s, i);
	6724	PL_tainted = oldtainted;
	6725	if ( (rx->extflags & RXf_CANY_SEEN)
	6726	? (RXp_MATCH_UTF8(rx)
	6727	&& (!i \|\| is_utf8_string((U8*)s, i)))
	6728	: (RXp_MATCH_UTF8(rx)) )
	6729	{
	6730	SvUTF8_on(sv);
	6731	}
	6732	else
	6733	SvUTF8_off(sv);
	6734	if (PL_tainting) {
	6735	if (RXp_MATCH_TAINTED(rx)) {
	6736	if (SvTYPE(sv) >= SVt_PVMG) {
	6737	MAGIC* const mg = SvMAGIC(sv);
	6738	MAGIC* mgt;
	6739	PL_tainted = 1;
	6740	SvMAGIC_set(sv, mg->mg_moremagic);
	6741	SvTAINT(sv);
	6742	if ((mgt = SvMAGIC(sv))) {
	6743	mg->mg_moremagic = mgt;
	6744	SvMAGIC_set(sv, mg);
	6745	}
	6746	} else {
	6747	PL_tainted = 1;
	6748	SvTAINT(sv);
	6749	}
	6750	} else
	6751	SvTAINTED_off(sv);
	6752	}
	6753	} else {
	6754	sv_setsv(sv,&PL_sv_undef);
	6755	return;
	6756	}
	6757	}
	6758
	6759	void
	6760	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	6761	SV const * const value)
	6762	{
	6763	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	6764
	6765	PERL_UNUSED_ARG(rx);
	6766	PERL_UNUSED_ARG(paren);
	6767	PERL_UNUSED_ARG(value);
	6768
	6769	if (!PL_localizing)
	6770	Perl_croak_no_modify(aTHX);
	6771	}
	6772
	6773	I32
	6774	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	6775	const I32 paren)
	6776	{
	6777	struct regexp const rx = (struct regexp )SvANY(r);
	6778	I32 i;
	6779	I32 s1, t1;
	6780
	6781	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	6782
	6783	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	6784	switch (paren) {
	6785	/* $` / ${^PREMATCH} */
	6786	case RX_BUFF_IDX_PREMATCH:
	6787	if (rx->offs[0].start != -1) {
	6788	i = rx->offs[0].start;
	6789	if (i > 0) {
	6790	s1 = 0;
	6791	t1 = i;
	6792	goto getlen;
	6793	}
	6794	}
	6795	return 0;
	6796	/* $' / ${^POSTMATCH} */
	6797	case RX_BUFF_IDX_POSTMATCH:
	6798	if (rx->offs[0].end != -1) {
	6799	i = rx->sublen - rx->offs[0].end;
	6800	if (i > 0) {
	6801	s1 = rx->offs[0].end;
	6802	t1 = rx->sublen;
	6803	goto getlen;
	6804	}
	6805	}
	6806	return 0;
	6807	/* $& / ${^MATCH}, $1, $2, ... */
	6808	default:
	6809	if (paren <= (I32)rx->nparens &&
	6810	(s1 = rx->offs[paren].start) != -1 &&
	6811	(t1 = rx->offs[paren].end) != -1)
	6812	{
	6813	i = t1 - s1;
	6814	goto getlen;
	6815	} else {
	6816	if (ckWARN(WARN_UNINITIALIZED))
	6817	report_uninit((const SV *)sv);
	6818	return 0;
	6819	}
	6820	}
	6821	getlen:
	6822	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	6823	const char * const s = rx->subbeg + s1;
	6824	const U8 *ep;
	6825	STRLEN el;
	6826
	6827	i = t1 - s1;
	6828	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	6829	i = el;
	6830	}
	6831	return i;
	6832	}
	6833
	6834	SV*
	6835	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	6836	{
	6837	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	6838	PERL_UNUSED_ARG(rx);
	6839	if (0)
	6840	return NULL;
	6841	else
	6842	return newSVpvs("Regexp");
	6843	}
	6844
	6845	/* Scans the name of a named buffer from the pattern.
	6846	* If flags is REG_RSN_RETURN_NULL returns null.
	6847	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	6848	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	6849	* to the parsed name as looked up in the RExC_paren_names hash.
	6850	* If there is an error throws a vFAIL().. type exception.
	6851	*/
	6852
	6853	#define REG_RSN_RETURN_NULL 0
	6854	#define REG_RSN_RETURN_NAME 1
	6855	#define REG_RSN_RETURN_DATA 2
	6856
	6857	STATIC SV*
	6858	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	6859	{
	6860	char *name_start = RExC_parse;
	6861
	6862	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	6863
	6864	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	6865	/* skip IDFIRST by using do...while */
	6866	if (UTF)
	6867	do {
	6868	RExC_parse += UTF8SKIP(RExC_parse);
	6869	} while (isALNUM_utf8((U8*)RExC_parse));
	6870	else
	6871	do {
	6872	RExC_parse++;
	6873	} while (isALNUM(*RExC_parse));
	6874	} else {
	6875	RExC_parse++; /* so the <- from the vFAIL is after the offending character */
	6876	vFAIL("Group name must start with a non-digit word character");
	6877	}
	6878	if ( flags ) {
	6879	SV* sv_name
	6880	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	6881	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	6882	if ( flags == REG_RSN_RETURN_NAME)
	6883	return sv_name;
	6884	else if (flags==REG_RSN_RETURN_DATA) {
	6885	HE *he_str = NULL;
	6886	SV *sv_dat = NULL;
	6887	if ( ! sv_name ) /* should not happen*/
	6888	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	6889	if (RExC_paren_names)
	6890	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	6891	if ( he_str )
	6892	sv_dat = HeVAL(he_str);
	6893	if ( ! sv_dat )
	6894	vFAIL("Reference to nonexistent named group");
	6895	return sv_dat;
	6896	}
	6897	else {
	6898	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	6899	(unsigned long) flags);
	6900	}
	6901	assert(0); /* NOT REACHED */
	6902	}
	6903	return NULL;
	6904	}
	6905
	6906	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	6907	int rem=(int)(RExC_end - RExC_parse); \
	6908	int cut; \
	6909	int num; \
	6910	int iscut=0; \
	6911	if (rem>10) { \
	6912	rem=10; \
	6913	iscut=1; \
	6914	} \
	6915	cut=10-rem; \
	6916	if (RExC_lastparse!=RExC_parse) \
	6917	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	6918	rem, RExC_parse, \
	6919	cut + 4, \
	6920	iscut ? "..." : "<" \
	6921	); \
	6922	else \
	6923	PerlIO_printf(Perl_debug_log,"%16s",""); \
	6924	\
	6925	if (SIZE_ONLY) \
	6926	num = RExC_size + 1; \
	6927	else \
	6928	num=REG_NODE_NUM(RExC_emit); \
	6929	if (RExC_lastnum!=num) \
	6930	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	6931	else \
	6932	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	6933	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	6934	(int)((depth*2)), "", \
	6935	(funcname) \
	6936	); \
	6937	RExC_lastnum=num; \
	6938	RExC_lastparse=RExC_parse; \
	6939	})
	6940
	6941
	6942
	6943	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	6944	DEBUG_PARSE_MSG((funcname)); \
	6945	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	6946	})
	6947	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	6948	DEBUG_PARSE_MSG((funcname)); \
	6949	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	6950	})
	6951
	6952	/* This section of code defines the inversion list object and its methods. The
	6953	* interfaces are highly subject to change, so as much as possible is static to
	6954	* this file. An inversion list is here implemented as a malloc'd C UV array
	6955	* with some added info that is placed as UVs at the beginning in a header
	6956	* portion. An inversion list for Unicode is an array of code points, sorted
	6957	* by ordinal number. The zeroth element is the first code point in the list.
	6958	* The 1th element is the first element beyond that not in the list. In other
	6959	* words, the first range is
	6960	* invlist[0]..(invlist[1]-1)
	6961	* The other ranges follow. Thus every element whose index is divisible by two
	6962	* marks the beginning of a range that is in the list, and every element not
	6963	* divisible by two marks the beginning of a range not in the list. A single
	6964	* element inversion list that contains the single code point N generally
	6965	* consists of two elements
	6966	* invlist[0] == N
	6967	* invlist[1] == N+1
	6968	* (The exception is when N is the highest representable value on the
	6969	* machine, in which case the list containing just it would be a single
	6970	* element, itself. By extension, if the last range in the list extends to
	6971	* infinity, then the first element of that range will be in the inversion list
	6972	* at a position that is divisible by two, and is the final element in the
	6973	* list.)
	6974	* Taking the complement (inverting) an inversion list is quite simple, if the
	6975	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	6976	* This implementation reserves an element at the beginning of each inversion list
	6977	* to contain 0 when the list contains 0, and contains 1 otherwise. The actual
	6978	* beginning of the list is either that element if 0, or the next one if 1.
	6979	*
	6980	* More about inversion lists can be found in "Unicode Demystified"
	6981	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	6982	* More will be coming when functionality is added later.
	6983	*
	6984	* The inversion list data structure is currently implemented as an SV pointing
	6985	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	6986	* array of UV whose memory management is automatically handled by the existing
	6987	* facilities for SV's.
	6988	*
	6989	* Some of the methods should always be private to the implementation, and some
	6990	* should eventually be made public */
	6991
	6992	#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
	6993	#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
	6994
	6995	/* This is a combination of a version and data structure type, so that one
	6996	* being passed in can be validated to be an inversion list of the correct
	6997	* vintage. When the structure of the header is changed, a new random number
	6998	* in the range 2**31-1 should be generated and the new() method changed to
	6999	* insert that at this location. Then, if an auxiliary program doesn't change
	7000	* correspondingly, it will be discovered immediately */
	7001	#define INVLIST_VERSION_ID_OFFSET 2
	7002	#define INVLIST_VERSION_ID 1064334010
	7003
	7004	/* For safety, when adding new elements, remember to #undef them at the end of
	7005	* the inversion list code section */
	7006
	7007	#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
	7008	/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
	7009	* contains the code point U+00000, and begins here. If 1, the inversion list
	7010	* doesn't contain U+0000, and it begins at the next UV in the array.
	7011	* Inverting an inversion list consists of adding or removing the 0 at the
	7012	* beginning of it. By reserving a space for that 0, inversion can be made
	7013	* very fast */
	7014
	7015	#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
	7016
	7017	/* Internally things are UVs */
	7018	#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
	7019	#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
	7020
	7021	#define INVLIST_INITIAL_LEN 10
	7022
	7023	PERL_STATIC_INLINE UV*
	7024	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	7025	{
	7026	/* Returns a pointer to the first element in the inversion list's array.
	7027	* This is called upon initialization of an inversion list. Where the
	7028	* array begins depends on whether the list has the code point U+0000
	7029	* in it or not. The other parameter tells it whether the code that
	7030	* follows this call is about to put a 0 in the inversion list or not.
	7031	* The first element is either the element with 0, if 0, or the next one,
	7032	* if 1 */
	7033
	7034	UV* zero = get_invlist_zero_addr(invlist);
	7035
	7036	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	7037
	7038	/* Must be empty */
	7039	assert(! *get_invlist_len_addr(invlist));
	7040
	7041	/* 1^1 = 0; 1^0 = 1 */
	7042	*zero = 1 ^ will_have_0;
	7043	return zero + *zero;
	7044	}
	7045
	7046	PERL_STATIC_INLINE UV*
	7047	S_invlist_array(pTHX_ SV* const invlist)
	7048	{
	7049	/* Returns the pointer to the inversion list's array. Every time the
	7050	* length changes, this needs to be called in case malloc or realloc moved
	7051	* it */
	7052
	7053	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	7054
	7055	/* Must not be empty. If these fail, you probably didn't check for <len>
	7056	* being non-zero before trying to get the array */
	7057	assert(*get_invlist_len_addr(invlist));
	7058	assert(*get_invlist_zero_addr(invlist) == 0
	7059	\|\| *get_invlist_zero_addr(invlist) == 1);
	7060
	7061	/* The array begins either at the element reserved for zero if the
	7062	* list contains 0 (that element will be set to 0), or otherwise the next
	7063	* element (in which case the reserved element will be set to 1). */
	7064	return (UV *) (get_invlist_zero_addr(invlist)
	7065	+ *get_invlist_zero_addr(invlist));
	7066	}
	7067
	7068	PERL_STATIC_INLINE UV*
	7069	S_get_invlist_len_addr(pTHX_ SV* invlist)
	7070	{
	7071	/* Return the address of the UV that contains the current number
	7072	* of used elements in the inversion list */
	7073
	7074	PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
	7075
	7076	return (UV ) (SvPVX(invlist) + (INVLIST_LEN_OFFSET sizeof (UV)));
	7077	}
	7078
	7079	PERL_STATIC_INLINE UV
	7080	S_invlist_len(pTHX_ SV* const invlist)
	7081	{
	7082	/* Returns the current number of elements stored in the inversion list's
	7083	* array */
	7084
	7085	PERL_ARGS_ASSERT_INVLIST_LEN;
	7086
	7087	return *get_invlist_len_addr(invlist);
	7088	}
	7089
	7090	PERL_STATIC_INLINE void
	7091	S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
	7092	{
	7093	/* Sets the current number of elements stored in the inversion list */
	7094
	7095	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	7096
	7097	*get_invlist_len_addr(invlist) = len;
	7098
	7099	assert(len <= SvLEN(invlist));
	7100
	7101	SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
	7102	/* If the list contains U+0000, that element is part of the header,
	7103	* and should not be counted as part of the array. It will contain
	7104	* 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
	7105	* subtract:
	7106	* SvCUR_set(invlist,
	7107	* TO_INTERNAL_SIZE(len
	7108	* - (*get_invlist_zero_addr(inv_list) ^ 1)));
	7109	* But, this is only valid if len is not 0. The consequences of not doing
	7110	* this is that the memory allocation code may think that 1 more UV is
	7111	* being used than actually is, and so might do an unnecessary grow. That
	7112	* seems worth not bothering to make this the precise amount.
	7113	*
	7114	* Note that when inverting, SvCUR shouldn't change */
	7115	}
	7116
	7117	PERL_STATIC_INLINE UV
	7118	S_invlist_max(pTHX_ SV* const invlist)
	7119	{
	7120	/* Returns the maximum number of elements storable in the inversion list's
	7121	* array, without having to realloc() */
	7122
	7123	PERL_ARGS_ASSERT_INVLIST_MAX;
	7124
	7125	return FROM_INTERNAL_SIZE(SvLEN(invlist));
	7126	}
	7127
	7128	PERL_STATIC_INLINE UV*
	7129	S_get_invlist_zero_addr(pTHX_ SV* invlist)
	7130	{
	7131	/* Return the address of the UV that is reserved to hold 0 if the inversion
	7132	* list contains 0. This has to be the last element of the heading, as the
	7133	* list proper starts with either it if 0, or the next element if not.
	7134	* (But we force it to contain either 0 or 1) */
	7135
	7136	PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
	7137
	7138	return (UV ) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET sizeof (UV)));
	7139	}
	7140
	7141	#ifndef PERL_IN_XSUB_RE
	7142	SV*
	7143	Perl__new_invlist(pTHX_ IV initial_size)
	7144	{
	7145
	7146	/* Return a pointer to a newly constructed inversion list, with enough
	7147	* space to store 'initial_size' elements. If that number is negative, a
	7148	* system default is used instead */
	7149
	7150	SV* new_list;
	7151
	7152	if (initial_size < 0) {
	7153	initial_size = INVLIST_INITIAL_LEN;
	7154	}
	7155
	7156	/* Allocate the initial space */
	7157	new_list = newSV(TO_INTERNAL_SIZE(initial_size));
	7158	invlist_set_len(new_list, 0);
	7159
	7160	/* Force iterinit() to be used to get iteration to work */
	7161	*get_invlist_iter_addr(new_list) = UV_MAX;
	7162
	7163	/* This should force a segfault if a method doesn't initialize this
	7164	* properly */
	7165	*get_invlist_zero_addr(new_list) = UV_MAX;
	7166
	7167	*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
	7168	#if HEADER_LENGTH != 4
	7169	# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
	7170	#endif
	7171
	7172	return new_list;
	7173	}
	7174	#endif
	7175
	7176	STATIC SV*
	7177	S__new_invlist_C_array(pTHX_ UV* list)
	7178	{
	7179	/* Return a pointer to a newly constructed inversion list, initialized to
	7180	* point to <list>, which has to be in the exact correct inversion list
	7181	* form, including internal fields. Thus this is a dangerous routine that
	7182	* should not be used in the wrong hands */
	7183
	7184	SV* invlist = newSV_type(SVt_PV);
	7185
	7186	PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
	7187
	7188	SvPV_set(invlist, (char *) list);
	7189	SvLEN_set(invlist, 0); /* Means we own the contents, and the system
	7190	shouldn't touch it */
	7191	SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
	7192
	7193	if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
	7194	Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
	7195	}
	7196
	7197	return invlist;
	7198	}
	7199
	7200	STATIC void
	7201	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	7202	{
	7203	/* Grow the maximum size of an inversion list */
	7204
	7205	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	7206
	7207	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
	7208	}
	7209
	7210	PERL_STATIC_INLINE void
	7211	S_invlist_trim(pTHX_ SV* const invlist)
	7212	{
	7213	PERL_ARGS_ASSERT_INVLIST_TRIM;
	7214
	7215	/* Change the length of the inversion list to how many entries it currently
	7216	* has */
	7217
	7218	SvPV_shrink_to_cur((SV *) invlist);
	7219	}
	7220
	7221	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	7222	* etc */
	7223	#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
	7224	#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
	7225
	7226	#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
	7227
	7228	STATIC void
	7229	S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	7230	{
	7231	/* Subject to change or removal. Append the range from 'start' to 'end' at
	7232	* the end of the inversion list. The range must be above any existing
	7233	* ones. */
	7234
	7235	UV* array;
	7236	UV max = invlist_max(invlist);
	7237	UV len = invlist_len(invlist);
	7238
	7239	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	7240
	7241	if (len == 0) { /* Empty lists must be initialized */
	7242	array = _invlist_array_init(invlist, start == 0);
	7243	}
	7244	else {
	7245	/* Here, the existing list is non-empty. The current max entry in the
	7246	* list is generally the first value not in the set, except when the
	7247	* set extends to the end of permissible values, in which case it is
	7248	* the first entry in that final set, and so this call is an attempt to
	7249	* append out-of-order */
	7250
	7251	UV final_element = len - 1;
	7252	array = invlist_array(invlist);
	7253	if (array[final_element] > start
	7254	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	7255	{
	7256	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	7257	array[final_element], start,
	7258	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	7259	}
	7260
	7261	/* Here, it is a legal append. If the new range begins with the first
	7262	* value not in the set, it is extending the set, so the new first
	7263	* value not in the set is one greater than the newly extended range.
	7264	* */
	7265	if (array[final_element] == start) {
	7266	if (end != UV_MAX) {
	7267	array[final_element] = end + 1;
	7268	}
	7269	else {
	7270	/* But if the end is the maximum representable on the machine,
	7271	* just let the range that this would extend to have no end */
	7272	invlist_set_len(invlist, len - 1);
	7273	}
	7274	return;
	7275	}
	7276	}
	7277
	7278	/* Here the new range doesn't extend any existing set. Add it */
	7279
	7280	len += 2; /* Includes an element each for the start and end of range */
	7281
	7282	/* If overflows the existing space, extend, which may cause the array to be
	7283	* moved */
	7284	if (max < len) {
	7285	invlist_extend(invlist, len);
	7286	invlist_set_len(invlist, len); /* Have to set len here to avoid assert
	7287	failure in invlist_array() */
	7288	array = invlist_array(invlist);
	7289	}
	7290	else {
	7291	invlist_set_len(invlist, len);
	7292	}
	7293
	7294	/* The next item on the list starts the range, the one after that is
	7295	* one past the new range. */
	7296	array[len - 2] = start;
	7297	if (end != UV_MAX) {
	7298	array[len - 1] = end + 1;
	7299	}
	7300	else {
	7301	/* But if the end is the maximum representable on the machine, just let
	7302	* the range have no end */
	7303	invlist_set_len(invlist, len - 1);
	7304	}
	7305	}
	7306
	7307	#ifndef PERL_IN_XSUB_RE
	7308
	7309	STATIC IV
	7310	S_invlist_search(pTHX_ SV* const invlist, const UV cp)
	7311	{
	7312	/* Searches the inversion list for the entry that contains the input code
	7313	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	7314	* return value is the index into the list's array of the range that
	7315	* contains <cp> */
	7316
	7317	IV low = 0;
	7318	IV high = invlist_len(invlist);
	7319	const UV * const array = invlist_array(invlist);
	7320
	7321	PERL_ARGS_ASSERT_INVLIST_SEARCH;
	7322
	7323	/* If list is empty or the code point is before the first element, return
	7324	* failure. */
	7325	if (high == 0 \|\| cp < array[0]) {
	7326	return -1;
	7327	}
	7328
	7329	/* Binary search. What we are looking for is <i> such that
	7330	* array[i] <= cp < array[i+1]
	7331	* The loop below converges on the i+1. */
	7332	while (low < high) {
	7333	IV mid = (low + high) / 2;
	7334	if (array[mid] <= cp) {
	7335	low = mid + 1;
	7336
	7337	/* We could do this extra test to exit the loop early.
	7338	if (cp < array[low]) {
	7339	return mid;
	7340	}
	7341	*/
	7342	}
	7343	else { /* cp < array[mid] */
	7344	high = mid;
	7345	}
	7346	}
	7347
	7348	return high - 1;
	7349	}
	7350
	7351	void
	7352	Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
	7353	{
	7354	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	7355	* but is used when the swash has an inversion list. This makes this much
	7356	* faster, as it uses a binary search instead of a linear one. This is
	7357	* intimately tied to that function, and perhaps should be in utf8.c,
	7358	* except it is intimately tied to inversion lists as well. It assumes
	7359	* that <swatch> is all 0's on input */
	7360
	7361	UV current = start;
	7362	const IV len = invlist_len(invlist);
	7363	IV i;
	7364	const UV * array;
	7365
	7366	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	7367
	7368	if (len == 0) { /* Empty inversion list */
	7369	return;
	7370	}
	7371
	7372	array = invlist_array(invlist);
	7373
	7374	/* Find which element it is */
	7375	i = invlist_search(invlist, start);
	7376
	7377	/* We populate from <start> to <end> */
	7378	while (current < end) {
	7379	UV upper;
	7380
	7381	/* The inversion list gives the results for every possible code point
	7382	* after the first one in the list. Only those ranges whose index is
	7383	* even are ones that the inversion list matches. For the odd ones,
	7384	* and if the initial code point is not in the list, we have to skip
	7385	* forward to the next element */
	7386	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	7387	i++;
	7388	if (i >= len) { /* Finished if beyond the end of the array */
	7389	return;
	7390	}
	7391	current = array[i];
	7392	if (current >= end) { /* Finished if beyond the end of what we
	7393	are populating */
	7394	return;
	7395	}
	7396	}
	7397	assert(current >= start);
	7398
	7399	/* The current range ends one below the next one, except don't go past
	7400	* <end> */
	7401	i++;
	7402	upper = (i < len && array[i] < end) ? array[i] : end;
	7403
	7404	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	7405	* for each code point in it */
	7406	for (; current < upper; current++) {
	7407	const STRLEN offset = (STRLEN)(current - start);
	7408	swatch[offset >> 3] \|= 1 << (offset & 7);
	7409	}
	7410
	7411	/* Quit if at the end of the list */
	7412	if (i >= len) {
	7413
	7414	/* But first, have to deal with the highest possible code point on
	7415	* the platform. The previous code assumes that <end> is one
	7416	* beyond where we want to populate, but that is impossible at the
	7417	* platform's infinity, so have to handle it specially */
	7418	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	7419	{
	7420	const STRLEN offset = (STRLEN)(end - start);
	7421	swatch[offset >> 3] \|= 1 << (offset & 7);
	7422	}
	7423	return;
	7424	}
	7425
	7426	/* Advance to the next range, which will be for code points not in the
	7427	* inversion list */
	7428	current = array[i];
	7429	}
	7430
	7431	return;
	7432	}
	7433
	7434
	7435	void
	7436	Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
	7437	{
	7438	/* Take the union of two inversion lists and point <output> to it. *output
	7439	* should be defined upon input, and if it points to one of the two lists,
	7440	* the reference count to that list will be decremented. The first list,
	7441	* <a>, may be NULL, in which case a copy of the second list is returned.
	7442	* If <complement_b> is TRUE, the union is taken of the complement
	7443	* (inversion) of <b> instead of b itself.
	7444	*
	7445	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7446	* Richard Gillam, published by Addison-Wesley, and explained at some
	7447	* length there. The preface says to incorporate its examples into your
	7448	* code at your own risk.
	7449	*
	7450	* The algorithm is like a merge sort.
	7451	*
	7452	* XXX A potential performance improvement is to keep track as we go along
	7453	* if only one of the inputs contributes to the result, meaning the other
	7454	* is a subset of that one. In that case, we can skip the final copy and
	7455	* return the larger of the input lists, but then outside code might need
	7456	* to keep track of whether to free the input list or not */
	7457
	7458	UV* array_a; /* a's array */
	7459	UV* array_b;
	7460	UV len_a; /* length of a's array */
	7461	UV len_b;
	7462
	7463	SV* u; /* the resulting union */
	7464	UV* array_u;
	7465	UV len_u;
	7466
	7467	UV i_a = 0; /* current index into a's array */
	7468	UV i_b = 0;
	7469	UV i_u = 0;
	7470
	7471	/* running count, as explained in the algorithm source book; items are
	7472	* stopped accumulating and are output when the count changes to/from 0.
	7473	* The count is incremented when we start a range that's in the set, and
	7474	* decremented when we start a range that's not in the set. So its range
	7475	* is 0 to 2. Only when the count is zero is something not in the set.
	7476	*/
	7477	UV count = 0;
	7478
	7479	PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
	7480	assert(a != b);
	7481
	7482	/* If either one is empty, the union is the other one */
	7483	if (a == NULL \|\| ((len_a = invlist_len(a)) == 0)) {
	7484	if (*output == a) {
	7485	if (a != NULL) {
	7486	SvREFCNT_dec(a);
	7487	}
	7488	}
	7489	if (*output != b) {
	7490	*output = invlist_clone(b);
	7491	if (complement_b) {
	7492	_invlist_invert(*output);
	7493	}
	7494	} /* else output already = b; /
	7495	return;
	7496	}
	7497	else if ((len_b = invlist_len(b)) == 0) {
	7498	if (*output == b) {
	7499	SvREFCNT_dec(b);
	7500	}
	7501
	7502	/* The complement of an empty list is a list that has everything in it,
	7503	* so the union with <a> includes everything too */
	7504	if (complement_b) {
	7505	if (a == *output) {
	7506	SvREFCNT_dec(a);
	7507	}
	7508	*output = _new_invlist(1);
	7509	_append_range_to_invlist(*output, 0, UV_MAX);
	7510	}
	7511	else if (*output != a) {
	7512	*output = invlist_clone(a);
	7513	}
	7514	/* else output already = a; /
	7515	return;
	7516	}
	7517
	7518	/* Here both lists exist and are non-empty */
	7519	array_a = invlist_array(a);
	7520	array_b = invlist_array(b);
	7521
	7522	/* If are to take the union of 'a' with the complement of b, set it
	7523	* up so are looking at b's complement. */
	7524	if (complement_b) {
	7525
	7526	/* To complement, we invert: if the first element is 0, remove it. To
	7527	* do this, we just pretend the array starts one later, and clear the
	7528	* flag as we don't have to do anything else later */
	7529	if (array_b[0] == 0) {
	7530	array_b++;
	7531	len_b--;
	7532	complement_b = FALSE;
	7533	}
	7534	else {
	7535
	7536	/* But if the first element is not zero, we unshift a 0 before the
	7537	* array. The data structure reserves a space for that 0 (which
	7538	* should be a '1' right now), so physical shifting is unneeded,
	7539	* but temporarily change that element to 0. Before exiting the
	7540	* routine, we must restore the element to '1' */
	7541	array_b--;
	7542	len_b++;
	7543	array_b[0] = 0;
	7544	}
	7545	}
	7546
	7547	/* Size the union for the worst case: that the sets are completely
	7548	* disjoint */
	7549	u = _new_invlist(len_a + len_b);
	7550
	7551	/* Will contain U+0000 if either component does */
	7552	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	7553	\|\| (len_b > 0 && array_b[0] == 0));
	7554
	7555	/* Go through each list item by item, stopping when exhausted one of
	7556	* them */
	7557	while (i_a < len_a && i_b < len_b) {
	7558	UV cp; /* The element to potentially add to the union's array */
	7559	bool cp_in_set; /* is it in the the input list's set or not */
	7560
	7561	/* We need to take one or the other of the two inputs for the union.
	7562	* Since we are merging two sorted lists, we take the smaller of the
	7563	* next items. In case of a tie, we take the one that is in its set
	7564	* first. If we took one not in the set first, it would decrement the
	7565	* count, possibly to 0 which would cause it to be output as ending the
	7566	* range, and the next time through we would take the same number, and
	7567	* output it again as beginning the next range. By doing it the
	7568	* opposite way, there is no possibility that the count will be
	7569	* momentarily decremented to 0, and thus the two adjoining ranges will
	7570	* be seamlessly merged. (In a tie and both are in the set or both not
	7571	* in the set, it doesn't matter which we take first.) */
	7572	if (array_a[i_a] < array_b[i_b]
	7573	\|\| (array_a[i_a] == array_b[i_b]
	7574	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7575	{
	7576	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7577	cp= array_a[i_a++];
	7578	}
	7579	else {
	7580	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7581	cp= array_b[i_b++];
	7582	}
	7583
	7584	/* Here, have chosen which of the two inputs to look at. Only output
	7585	* if the running count changes to/from 0, which marks the
	7586	* beginning/end of a range in that's in the set */
	7587	if (cp_in_set) {
	7588	if (count == 0) {
	7589	array_u[i_u++] = cp;
	7590	}
	7591	count++;
	7592	}
	7593	else {
	7594	count--;
	7595	if (count == 0) {
	7596	array_u[i_u++] = cp;
	7597	}
	7598	}
	7599	}
	7600
	7601	/* Here, we are finished going through at least one of the lists, which
	7602	* means there is something remaining in at most one. We check if the list
	7603	* that hasn't been exhausted is positioned such that we are in the middle
	7604	* of a range in its set or not. (i_a and i_b point to the element beyond
	7605	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	7606	* is potentially more to output.
	7607	* There are four cases:
	7608	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	7609	* in the union is entirely from the non-exhausted set.
	7610	* 2) Both were in their sets, count is 2. Nothing further should
	7611	* be output, as everything that remains will be in the exhausted
	7612	* list's set, hence in the union; decrementing to 1 but not 0 insures
	7613	* that
	7614	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	7615	* Nothing further should be output because the union includes
	7616	* everything from the exhausted set. Not decrementing ensures that.
	7617	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	7618	* decrementing to 0 insures that we look at the remainder of the
	7619	* non-exhausted set */
	7620	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7621	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7622	{
	7623	count--;
	7624	}
	7625
	7626	/* The final length is what we've output so far, plus what else is about to
	7627	* be output. (If 'count' is non-zero, then the input list we exhausted
	7628	* has everything remaining up to the machine's limit in its set, and hence
	7629	* in the union, so there will be no further output. */
	7630	len_u = i_u;
	7631	if (count == 0) {
	7632	/* At most one of the subexpressions will be non-zero */
	7633	len_u += (len_a - i_a) + (len_b - i_b);
	7634	}
	7635
	7636	/* Set result to final length, which can change the pointer to array_u, so
	7637	* re-find it */
	7638	if (len_u != invlist_len(u)) {
	7639	invlist_set_len(u, len_u);
	7640	invlist_trim(u);
	7641	array_u = invlist_array(u);
	7642	}
	7643
	7644	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	7645	* the other) ended with everything above it not in its set. That means
	7646	* that the remaining part of the union is precisely the same as the
	7647	* non-exhausted list, so can just copy it unchanged. (If both list were
	7648	* exhausted at the same time, then the operations below will be both 0.)
	7649	*/
	7650	if (count == 0) {
	7651	IV copy_count; /* At most one will have a non-zero copy count */
	7652	if ((copy_count = len_a - i_a) > 0) {
	7653	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	7654	}
	7655	else if ((copy_count = len_b - i_b) > 0) {
	7656	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	7657	}
	7658	}
	7659
	7660	/* We may be removing a reference to one of the inputs */
	7661	if (a == output \|\| b == output) {
	7662	SvREFCNT_dec(*output);
	7663	}
	7664
	7665	/* If we've changed b, restore it */
	7666	if (complement_b) {
	7667	array_b[0] = 1;
	7668	}
	7669
	7670	*output = u;
	7671	return;
	7672	}
	7673
	7674	void
	7675	Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
	7676	{
	7677	/* Take the intersection of two inversion lists and point <i> to it. *i
	7678	* should be defined upon input, and if it points to one of the two lists,
	7679	* the reference count to that list will be decremented.
	7680	* If <complement_b> is TRUE, the result will be the intersection of <a>
	7681	* and the complement (or inversion) of <b> instead of <b> directly.
	7682	*
	7683	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7684	* Richard Gillam, published by Addison-Wesley, and explained at some
	7685	* length there. The preface says to incorporate its examples into your
	7686	* code at your own risk. In fact, it had bugs
	7687	*
	7688	* The algorithm is like a merge sort, and is essentially the same as the
	7689	* union above
	7690	*/
	7691
	7692	UV* array_a; /* a's array */
	7693	UV* array_b;
	7694	UV len_a; /* length of a's array */
	7695	UV len_b;
	7696
	7697	SV* r; /* the resulting intersection */
	7698	UV* array_r;
	7699	UV len_r;
	7700
	7701	UV i_a = 0; /* current index into a's array */
	7702	UV i_b = 0;
	7703	UV i_r = 0;
	7704
	7705	/* running count, as explained in the algorithm source book; items are
	7706	* stopped accumulating and are output when the count changes to/from 2.
	7707	* The count is incremented when we start a range that's in the set, and
	7708	* decremented when we start a range that's not in the set. So its range
	7709	* is 0 to 2. Only when the count is 2 is something in the intersection.
	7710	*/
	7711	UV count = 0;
	7712
	7713	PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
	7714	assert(a != b);
	7715
	7716	/* Special case if either one is empty */
	7717	len_a = invlist_len(a);
	7718	if ((len_a == 0) \|\| ((len_b = invlist_len(b)) == 0)) {
	7719
	7720	if (len_a != 0 && complement_b) {
	7721
	7722	/* Here, 'a' is not empty, therefore from the above 'if', 'b' must
	7723	* be empty. Here, also we are using 'b's complement, which hence
	7724	* must be every possible code point. Thus the intersection is
	7725	* simply 'a'. */
	7726	if (*i != a) {
	7727	*i = invlist_clone(a);
	7728
	7729	if (*i == b) {
	7730	SvREFCNT_dec(b);
	7731	}
	7732	}
	7733	/* else i is already 'a' /
	7734	return;
	7735	}
	7736
	7737	/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
	7738	* intersection must be empty */
	7739	if (*i == a) {
	7740	SvREFCNT_dec(a);
	7741	}
	7742	else if (*i == b) {
	7743	SvREFCNT_dec(b);
	7744	}
	7745	*i = _new_invlist(0);
	7746	return;
	7747	}
	7748
	7749	/* Here both lists exist and are non-empty */
	7750	array_a = invlist_array(a);
	7751	array_b = invlist_array(b);
	7752
	7753	/* If are to take the intersection of 'a' with the complement of b, set it
	7754	* up so are looking at b's complement. */
	7755	if (complement_b) {
	7756
	7757	/* To complement, we invert: if the first element is 0, remove it. To
	7758	* do this, we just pretend the array starts one later, and clear the
	7759	* flag as we don't have to do anything else later */
	7760	if (array_b[0] == 0) {
	7761	array_b++;
	7762	len_b--;
	7763	complement_b = FALSE;
	7764	}
	7765	else {
	7766
	7767	/* But if the first element is not zero, we unshift a 0 before the
	7768	* array. The data structure reserves a space for that 0 (which
	7769	* should be a '1' right now), so physical shifting is unneeded,
	7770	* but temporarily change that element to 0. Before exiting the
	7771	* routine, we must restore the element to '1' */
	7772	array_b--;
	7773	len_b++;
	7774	array_b[0] = 0;
	7775	}
	7776	}
	7777
	7778	/* Size the intersection for the worst case: that the intersection ends up
	7779	* fragmenting everything to be completely disjoint */
	7780	r= _new_invlist(len_a + len_b);
	7781
	7782	/* Will contain U+0000 iff both components do */
	7783	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	7784	&& len_b > 0 && array_b[0] == 0);
	7785
	7786	/* Go through each list item by item, stopping when exhausted one of
	7787	* them */
	7788	while (i_a < len_a && i_b < len_b) {
	7789	UV cp; /* The element to potentially add to the intersection's
	7790	array */
	7791	bool cp_in_set; /* Is it in the input list's set or not */
	7792
	7793	/* We need to take one or the other of the two inputs for the
	7794	* intersection. Since we are merging two sorted lists, we take the
	7795	* smaller of the next items. In case of a tie, we take the one that
	7796	* is not in its set first (a difference from the union algorithm). If
	7797	* we took one in the set first, it would increment the count, possibly
	7798	* to 2 which would cause it to be output as starting a range in the
	7799	* intersection, and the next time through we would take that same
	7800	* number, and output it again as ending the set. By doing it the
	7801	* opposite of this, there is no possibility that the count will be
	7802	* momentarily incremented to 2. (In a tie and both are in the set or
	7803	* both not in the set, it doesn't matter which we take first.) */
	7804	if (array_a[i_a] < array_b[i_b]
	7805	\|\| (array_a[i_a] == array_b[i_b]
	7806	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7807	{
	7808	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7809	cp= array_a[i_a++];
	7810	}
	7811	else {
	7812	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7813	cp= array_b[i_b++];
	7814	}
	7815
	7816	/* Here, have chosen which of the two inputs to look at. Only output
	7817	* if the running count changes to/from 2, which marks the
	7818	* beginning/end of a range that's in the intersection */
	7819	if (cp_in_set) {
	7820	count++;
	7821	if (count == 2) {
	7822	array_r[i_r++] = cp;
	7823	}
	7824	}
	7825	else {
	7826	if (count == 2) {
	7827	array_r[i_r++] = cp;
	7828	}
	7829	count--;
	7830	}
	7831	}
	7832
	7833	/* Here, we are finished going through at least one of the lists, which
	7834	* means there is something remaining in at most one. We check if the list
	7835	* that has been exhausted is positioned such that we are in the middle
	7836	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	7837	* the ones we care about.) There are four cases:
	7838	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	7839	* nothing left in the intersection.
	7840	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	7841	* above 2. What should be output is exactly that which is in the
	7842	* non-exhausted set, as everything it has is also in the intersection
	7843	* set, and everything it doesn't have can't be in the intersection
	7844	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	7845	* gets incremented to 2. Like the previous case, the intersection is
	7846	* everything that remains in the non-exhausted set.
	7847	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	7848	* remains 1. And the intersection has nothing more. */
	7849	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7850	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7851	{
	7852	count++;
	7853	}
	7854
	7855	/* The final length is what we've output so far plus what else is in the
	7856	* intersection. At most one of the subexpressions below will be non-zero */
	7857	len_r = i_r;
	7858	if (count >= 2) {
	7859	len_r += (len_a - i_a) + (len_b - i_b);
	7860	}
	7861
	7862	/* Set result to final length, which can change the pointer to array_r, so
	7863	* re-find it */
	7864	if (len_r != invlist_len(r)) {
	7865	invlist_set_len(r, len_r);
	7866	invlist_trim(r);
	7867	array_r = invlist_array(r);
	7868	}
	7869
	7870	/* Finish outputting any remaining */
	7871	if (count >= 2) { /* At most one will have a non-zero copy count */
	7872	IV copy_count;
	7873	if ((copy_count = len_a - i_a) > 0) {
	7874	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	7875	}
	7876	else if ((copy_count = len_b - i_b) > 0) {
	7877	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	7878	}
	7879	}
	7880
	7881	/* We may be removing a reference to one of the inputs */
	7882	if (a == i \|\| b == i) {
	7883	SvREFCNT_dec(*i);
	7884	}
	7885
	7886	/* If we've changed b, restore it */
	7887	if (complement_b) {
	7888	array_b[0] = 1;
	7889	}
	7890
	7891	*i = r;
	7892	return;
	7893	}
	7894
	7895	SV*
	7896	Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	7897	{
	7898	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	7899	* set. A pointer to the inversion list is returned. This may actually be
	7900	* a new list, in which case the passed in one has been destroyed. The
	7901	* passed in inversion list can be NULL, in which case a new one is created
	7902	* with just the one range in it */
	7903
	7904	SV* range_invlist;
	7905	UV len;
	7906
	7907	if (invlist == NULL) {
	7908	invlist = _new_invlist(2);
	7909	len = 0;
	7910	}
	7911	else {
	7912	len = invlist_len(invlist);
	7913	}
	7914
	7915	/* If comes after the final entry, can just append it to the end */
	7916	if (len == 0
	7917	\|\| start >= invlist_array(invlist)
	7918	[invlist_len(invlist) - 1])
	7919	{
	7920	_append_range_to_invlist(invlist, start, end);
	7921	return invlist;
	7922	}
	7923
	7924	/* Here, can't just append things, create and return a new inversion list
	7925	* which is the union of this range and the existing inversion list */
	7926	range_invlist = _new_invlist(2);
	7927	_append_range_to_invlist(range_invlist, start, end);
	7928
	7929	_invlist_union(invlist, range_invlist, &invlist);
	7930
	7931	/* The temporary can be freed */
	7932	SvREFCNT_dec(range_invlist);
	7933
	7934	return invlist;
	7935	}
	7936
	7937	#endif
	7938
	7939	PERL_STATIC_INLINE SV*
	7940	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	7941	return _add_range_to_invlist(invlist, cp, cp);
	7942	}
	7943
	7944	#ifndef PERL_IN_XSUB_RE
	7945	void
	7946	Perl__invlist_invert(pTHX_ SV* const invlist)
	7947	{
	7948	/* Complement the input inversion list. This adds a 0 if the list didn't
	7949	* have a zero; removes it otherwise. As described above, the data
	7950	* structure is set up so that this is very efficient */
	7951
	7952	UV* len_pos = get_invlist_len_addr(invlist);
	7953
	7954	PERL_ARGS_ASSERT__INVLIST_INVERT;
	7955
	7956	/* The inverse of matching nothing is matching everything */
	7957	if (*len_pos == 0) {
	7958	_append_range_to_invlist(invlist, 0, UV_MAX);
	7959	return;
	7960	}
	7961
	7962	/* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
	7963	* zero element was a 0, so it is being removed, so the length decrements
	7964	* by 1; and vice-versa. SvCUR is unaffected */
	7965	if (*get_invlist_zero_addr(invlist) ^= 1) {
	7966	(*len_pos)--;
	7967	}
	7968	else {
	7969	(*len_pos)++;
	7970	}
	7971	}
	7972
	7973	void
	7974	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	7975	{
	7976	/* Complement the input inversion list (which must be a Unicode property,
	7977	* all of which don't match above the Unicode maximum code point.) And
	7978	* Perl has chosen to not have the inversion match above that either. This
	7979	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	7980	*/
	7981
	7982	UV len;
	7983	UV* array;
	7984
	7985	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	7986
	7987	_invlist_invert(invlist);
	7988
	7989	len = invlist_len(invlist);
	7990
	7991	if (len != 0) { /* If empty do nothing */
	7992	array = invlist_array(invlist);
	7993	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	7994	/* Add 0x110000. First, grow if necessary */
	7995	len++;
	7996	if (invlist_max(invlist) < len) {
	7997	invlist_extend(invlist, len);
	7998	array = invlist_array(invlist);
	7999	}
	8000	invlist_set_len(invlist, len);
	8001	array[len - 1] = PERL_UNICODE_MAX + 1;
	8002	}
	8003	else { /* Remove the 0x110000 */
	8004	invlist_set_len(invlist, len - 1);
	8005	}
	8006	}
	8007
	8008	return;
	8009	}
	8010	#endif
	8011
	8012	PERL_STATIC_INLINE SV*
	8013	S_invlist_clone(pTHX_ SV* const invlist)
	8014	{
	8015
	8016	/* Return a new inversion list that is a copy of the input one, which is
	8017	* unchanged */
	8018
	8019	/* Need to allocate extra space to accommodate Perl's addition of a
	8020	* trailing NUL to SvPV's, since it thinks they are always strings */
	8021	SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
	8022	STRLEN length = SvCUR(invlist);
	8023
	8024	PERL_ARGS_ASSERT_INVLIST_CLONE;
	8025
	8026	SvCUR_set(new_invlist, length); /* This isn't done automatically */
	8027	Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
	8028
	8029	return new_invlist;
	8030	}
	8031
	8032	PERL_STATIC_INLINE UV*
	8033	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	8034	{
	8035	/* Return the address of the UV that contains the current iteration
	8036	* position */
	8037
	8038	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	8039
	8040	return (UV ) (SvPVX(invlist) + (INVLIST_ITER_OFFSET sizeof (UV)));
	8041	}
	8042
	8043	PERL_STATIC_INLINE UV*
	8044	S_get_invlist_version_id_addr(pTHX_ SV* invlist)
	8045	{
	8046	/* Return the address of the UV that contains the version id. */
	8047
	8048	PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
	8049
	8050	return (UV ) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET sizeof (UV)));
	8051	}
	8052
	8053	PERL_STATIC_INLINE void
	8054	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	8055	{
	8056	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	8057
	8058	*get_invlist_iter_addr(invlist) = 0;
	8059	}
	8060
	8061	STATIC bool
	8062	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	8063	{
	8064	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	8065	* This call sets in <start> and <end>, the next range in <invlist>.
	8066	* Returns <TRUE> if successful and the next call will return the next
	8067	* range; <FALSE> if was already at the end of the list. If the latter,
	8068	* <start> and <end> are unchanged, and the next call to this function
	8069	* will start over at the beginning of the list */
	8070
	8071	UV* pos = get_invlist_iter_addr(invlist);
	8072	UV len = invlist_len(invlist);
	8073	UV *array;
	8074
	8075	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	8076
	8077	if (*pos >= len) {
	8078	pos = UV_MAX; / Force iternit() to be required next time */
	8079	return FALSE;
	8080	}
	8081
	8082	array = invlist_array(invlist);
	8083
	8084	start = array[(pos)++];
	8085
	8086	if (*pos >= len) {
	8087	*end = UV_MAX;
	8088	}
	8089	else {
	8090	end = array[(pos)++] - 1;
	8091	}
	8092
	8093	return TRUE;
	8094	}
	8095
	8096	#ifndef PERL_IN_XSUB_RE
	8097	SV *
	8098	Perl__invlist_contents(pTHX_ SV* const invlist)
	8099	{
	8100	/* Get the contents of an inversion list into a string SV so that they can
	8101	* be printed out. It uses the format traditionally done for debug tracing
	8102	*/
	8103
	8104	UV start, end;
	8105	SV* output = newSVpvs("\n");
	8106
	8107	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	8108
	8109	invlist_iterinit(invlist);
	8110	while (invlist_iternext(invlist, &start, &end)) {
	8111	if (end == UV_MAX) {
	8112	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	8113	}
	8114	else if (end != start) {
	8115	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	8116	start, end);
	8117	}
	8118	else {
	8119	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	8120	}
	8121	}
	8122
	8123	return output;
	8124	}
	8125	#endif
	8126
	8127	#if 0
	8128	void
	8129	S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
	8130	{
	8131	/* Dumps out the ranges in an inversion list. The string 'header'
	8132	* if present is output on a line before the first range */
	8133
	8134	UV start, end;
	8135
	8136	if (header && strlen(header)) {
	8137	PerlIO_printf(Perl_debug_log, "%s\n", header);
	8138	}
	8139	invlist_iterinit(invlist);
	8140	while (invlist_iternext(invlist, &start, &end)) {
	8141	if (end == UV_MAX) {
	8142	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
	8143	}
	8144	else {
	8145	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
	8146	}
	8147	}
	8148	}
	8149	#endif
	8150
	8151	#undef HEADER_LENGTH
	8152	#undef INVLIST_INITIAL_LENGTH
	8153	#undef TO_INTERNAL_SIZE
	8154	#undef FROM_INTERNAL_SIZE
	8155	#undef INVLIST_LEN_OFFSET
	8156	#undef INVLIST_ZERO_OFFSET
	8157	#undef INVLIST_ITER_OFFSET
	8158	#undef INVLIST_VERSION_ID
	8159
	8160	/* End of inversion list object */
	8161
	8162	/*
	8163	- reg - regular expression, i.e. main body or parenthesized thing
	8164	*
	8165	* Caller must absorb opening parenthesis.
	8166	*
	8167	* Combining parenthesis handling with the base level of regular expression
	8168	* is a trifle forced, but the need to tie the tails of the branches to what
	8169	* follows makes it hard to avoid.
	8170	*/
	8171	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	8172	#ifdef DEBUGGING
	8173	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	8174	#else
	8175	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	8176	#endif
	8177
	8178	STATIC regnode *
	8179	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	8180	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	8181	{
	8182	dVAR;
	8183	register regnode ret; / Will be the head of the group. */
	8184	register regnode *br;
	8185	register regnode *lastbr;
	8186	register regnode *ender = NULL;
	8187	register I32 parno = 0;
	8188	I32 flags;
	8189	U32 oregflags = RExC_flags;
	8190	bool have_branch = 0;
	8191	bool is_open = 0;
	8192	I32 freeze_paren = 0;
	8193	I32 after_freeze = 0;
	8194
	8195	/* for (?g), (?gc), and (?o) warnings; warning
	8196	about (?c) will warn about (?g) -- japhy */
	8197
	8198	#define WASTED_O 0x01
	8199	#define WASTED_G 0x02
	8200	#define WASTED_C 0x04
	8201	#define WASTED_GC (0x02\|0x04)
	8202	I32 wastedflags = 0x00;
	8203
	8204	char * parse_start = RExC_parse; /* MJD */
	8205	char * const oregcomp_parse = RExC_parse;
	8206
	8207	GET_RE_DEBUG_FLAGS_DECL;
	8208
	8209	PERL_ARGS_ASSERT_REG;
	8210	DEBUG_PARSE("reg ");
	8211
	8212	flagp = 0; / Tentatively. */
	8213
	8214
	8215	/* Make an OPEN node, if parenthesized. */
	8216	if (paren) {
	8217	if ( RExC_parse == '') { /* (VERB:ARG) /
	8218	char *start_verb = RExC_parse;
	8219	STRLEN verb_len = 0;
	8220	char *start_arg = NULL;
	8221	unsigned char op = 0;
	8222	int argok = 1;
	8223	int internal_argval = 0; /* internal_argval is only useful if !argok */
	8224	while ( RExC_parse && RExC_parse != ')' ) {
	8225	if ( *RExC_parse == ':' ) {
	8226	start_arg = RExC_parse + 1;
	8227	break;
	8228	}
	8229	RExC_parse++;
	8230	}
	8231	++start_verb;
	8232	verb_len = RExC_parse - start_verb;
	8233	if ( start_arg ) {
	8234	RExC_parse++;
	8235	while ( RExC_parse && RExC_parse != ')' )
	8236	RExC_parse++;
	8237	if ( *RExC_parse != ')' )
	8238	vFAIL("Unterminated verb pattern argument");
	8239	if ( RExC_parse == start_arg )
	8240	start_arg = NULL;
	8241	} else {
	8242	if ( *RExC_parse != ')' )
	8243	vFAIL("Unterminated verb pattern");
	8244	}
	8245
	8246	switch ( *start_verb ) {
	8247	case 'A': /* (ACCEPT) /
	8248	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	8249	op = ACCEPT;
	8250	internal_argval = RExC_nestroot;
	8251	}
	8252	break;
	8253	case 'C': /* (COMMIT) /
	8254	if ( memEQs(start_verb,verb_len,"COMMIT") )
	8255	op = COMMIT;
	8256	break;
	8257	case 'F': /* (FAIL) /
	8258	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	8259	op = OPFAIL;
	8260	argok = 0;
	8261	}
	8262	break;
	8263	case ':': /* (:NAME) /
	8264	case 'M': /* (MARK:NAME) /
	8265	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	8266	op = MARKPOINT;
	8267	argok = -1;
	8268	}
	8269	break;
	8270	case 'P': /* (PRUNE) /
	8271	if ( memEQs(start_verb,verb_len,"PRUNE") )
	8272	op = PRUNE;
	8273	break;
	8274	case 'S': /* (SKIP) /
	8275	if ( memEQs(start_verb,verb_len,"SKIP") )
	8276	op = SKIP;
	8277	break;
	8278	case 'T': /* (THEN) /
	8279	/* [19:06] <TimToady> :: is then */
	8280	if ( memEQs(start_verb,verb_len,"THEN") ) {
	8281	op = CUTGROUP;
	8282	RExC_seen \|= REG_SEEN_CUTGROUP;
	8283	}
	8284	break;
	8285	}
	8286	if ( ! op ) {
	8287	RExC_parse++;
	8288	vFAIL3("Unknown verb pattern '%.*s'",
	8289	verb_len, start_verb);
	8290	}
	8291	if ( argok ) {
	8292	if ( start_arg && internal_argval ) {
	8293	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8294	verb_len, start_verb);
	8295	} else if ( argok < 0 && !start_arg ) {
	8296	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	8297	verb_len, start_verb);
	8298	} else {
	8299	ret = reganode(pRExC_state, op, internal_argval);
	8300	if ( ! internal_argval && ! SIZE_ONLY ) {
	8301	if (start_arg) {
	8302	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	8303	ARG(ret) = add_data( pRExC_state, 1, "S" );
	8304	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	8305	ret->flags = 0;
	8306	} else {
	8307	ret->flags = 1;
	8308	}
	8309	}
	8310	}
	8311	if (!internal_argval)
	8312	RExC_seen \|= REG_SEEN_VERBARG;
	8313	} else if ( start_arg ) {
	8314	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8315	verb_len, start_verb);
	8316	} else {
	8317	ret = reg_node(pRExC_state, op);
	8318	}
	8319	nextchar(pRExC_state);
	8320	return ret;
	8321	} else
	8322	if (RExC_parse == '?') { / (?...) */
	8323	bool is_logical = 0;
	8324	const char * const seqstart = RExC_parse;
	8325	bool has_use_defaults = FALSE;
	8326
	8327	RExC_parse++;
	8328	paren = *RExC_parse++;
	8329	ret = NULL; /* For look-ahead/behind. */
	8330	switch (paren) {
	8331
	8332	case 'P': /* (?P...) variants for those used to PCRE/Python */
	8333	paren = *RExC_parse++;
	8334	if ( paren == '<') /* (?P<...>) named capture */
	8335	goto named_capture;
	8336	else if (paren == '>') { /* (?P>name) named recursion */
	8337	goto named_recursion;
	8338	}
	8339	else if (paren == '=') { /* (?P=...) named backref */
	8340	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	8341	you change this make sure you change that */
	8342	char* name_start = RExC_parse;
	8343	U32 num = 0;
	8344	SV *sv_dat = reg_scan_name(pRExC_state,
	8345	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8346	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	8347	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8348
	8349	if (!SIZE_ONLY) {
	8350	num = add_data( pRExC_state, 1, "S" );
	8351	RExC_rxi->data->data[num]=(void*)sv_dat;
	8352	SvREFCNT_inc_simple_void(sv_dat);
	8353	}
	8354	RExC_sawback = 1;
	8355	ret = reganode(pRExC_state,
	8356	((! FOLD)
	8357	? NREF
	8358	: (MORE_ASCII_RESTRICTED)
	8359	? NREFFA
	8360	: (AT_LEAST_UNI_SEMANTICS)
	8361	? NREFFU
	8362	: (LOC)
	8363	? NREFFL
	8364	: NREFF),
	8365	num);
	8366	*flagp \|= HASWIDTH;
	8367
	8368	Set_Node_Offset(ret, parse_start+1);
	8369	Set_Node_Cur_Length(ret); /* MJD */
	8370
	8371	nextchar(pRExC_state);
	8372	return ret;
	8373	}
	8374	RExC_parse++;
	8375	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8376	/NOTREACHED/
	8377	case '<': /* (?<...) */
	8378	if (*RExC_parse == '!')
	8379	paren = ',';
	8380	else if (*RExC_parse != '=')
	8381	named_capture:
	8382	{ /* (?<...>) */
	8383	char *name_start;
	8384	SV *svname;
	8385	paren= '>';
	8386	case '\'': /* (?'...') */
	8387	name_start= RExC_parse;
	8388	svname = reg_scan_name(pRExC_state,
	8389	SIZE_ONLY ? /* reverse test from the others */
	8390	REG_RSN_RETURN_NAME :
	8391	REG_RSN_RETURN_NULL);
	8392	if (RExC_parse == name_start) {
	8393	RExC_parse++;
	8394	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8395	/NOTREACHED/
	8396	}
	8397	if (*RExC_parse != paren)
	8398	vFAIL2("Sequence (?%c... not terminated",
	8399	paren=='>' ? '<' : paren);
	8400	if (SIZE_ONLY) {
	8401	HE *he_str;
	8402	SV *sv_dat = NULL;
	8403	if (!svname) /* shouldn't happen */
	8404	Perl_croak(aTHX_
	8405	"panic: reg_scan_name returned NULL");
	8406	if (!RExC_paren_names) {
	8407	RExC_paren_names= newHV();
	8408	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	8409	#ifdef DEBUGGING
	8410	RExC_paren_name_list= newAV();
	8411	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	8412	#endif
	8413	}
	8414	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	8415	if ( he_str )
	8416	sv_dat = HeVAL(he_str);
	8417	if ( ! sv_dat ) {
	8418	/* croak baby croak */
	8419	Perl_croak(aTHX_
	8420	"panic: paren_name hash element allocation failed");
	8421	} else if ( SvPOK(sv_dat) ) {
	8422	/* (?\|...) can mean we have dupes so scan to check
	8423	its already been stored. Maybe a flag indicating
	8424	we are inside such a construct would be useful,
	8425	but the arrays are likely to be quite small, so
	8426	for now we punt -- dmq */
	8427	IV count = SvIV(sv_dat);
	8428	I32 pv = (I32)SvPVX(sv_dat);
	8429	IV i;
	8430	for ( i = 0 ; i < count ; i++ ) {
	8431	if ( pv[i] == RExC_npar ) {
	8432	count = 0;
	8433	break;
	8434	}
	8435	}
	8436	if ( count ) {
	8437	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	8438	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	8439	pv[count] = RExC_npar;
	8440	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	8441	}
	8442	} else {
	8443	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	8444	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	8445	SvIOK_on(sv_dat);
	8446	SvIV_set(sv_dat, 1);
	8447	}
	8448	#ifdef DEBUGGING
	8449	/* Yes this does cause a memory leak in debugging Perls */
	8450	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	8451	SvREFCNT_dec(svname);
	8452	#endif
	8453
	8454	/sv_dump(sv_dat);/
	8455	}
	8456	nextchar(pRExC_state);
	8457	paren = 1;
	8458	goto capturing_parens;
	8459	}
	8460	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8461	RExC_in_lookbehind++;
	8462	RExC_parse++;
	8463	case '=': /* (?=...) */
	8464	RExC_seen_zerolen++;
	8465	break;
	8466	case '!': /* (?!...) */
	8467	RExC_seen_zerolen++;
	8468	if (*RExC_parse == ')') {
	8469	ret=reg_node(pRExC_state, OPFAIL);
	8470	nextchar(pRExC_state);
	8471	return ret;
	8472	}
	8473	break;
	8474	case '\|': /* (?\|...) */
	8475	/* branch reset, behave like a (?:...) except that
	8476	buffers in alternations share the same numbers */
	8477	paren = ':';
	8478	after_freeze = freeze_paren = RExC_npar;
	8479	break;
	8480	case ':': /* (?:...) */
	8481	case '>': /* (?>...) */
	8482	break;
	8483	case '$': /* (?$...) */
	8484	case '@': /* (?@...) */
	8485	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	8486	break;
	8487	case '#': /* (?#...) */
	8488	while (RExC_parse && RExC_parse != ')')
	8489	RExC_parse++;
	8490	if (*RExC_parse != ')')
	8491	FAIL("Sequence (?#... not terminated");
	8492	nextchar(pRExC_state);
	8493	*flagp = TRYAGAIN;
	8494	return NULL;
	8495	case '0' : /* (?0) */
	8496	case 'R' : /* (?R) */
	8497	if (*RExC_parse != ')')
	8498	FAIL("Sequence (?R) not terminated");
	8499	ret = reg_node(pRExC_state, GOSTART);
	8500	*flagp \|= POSTPONED;
	8501	nextchar(pRExC_state);
	8502	return ret;
	8503	/notreached/
	8504	{ /* named and numeric backreferences */
	8505	I32 num;
	8506	case '&': /* (?&NAME) */
	8507	parse_start = RExC_parse - 1;
	8508	named_recursion:
	8509	{
	8510	SV *sv_dat = reg_scan_name(pRExC_state,
	8511	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8512	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	8513	}
	8514	goto gen_recurse_regop;
	8515	assert(0); /* NOT REACHED */
	8516	case '+':
	8517	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	8518	RExC_parse++;
	8519	vFAIL("Illegal pattern");
	8520	}
	8521	goto parse_recursion;
	8522	/* NOT REACHED*/
	8523	case '-': /* (?-1) */
	8524	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	8525	RExC_parse--; /* rewind to let it be handled later */
	8526	goto parse_flags;
	8527	}
	8528	/FALLTHROUGH /
	8529	case '1': case '2': case '3': case '4': /* (?1) */
	8530	case '5': case '6': case '7': case '8': case '9':
	8531	RExC_parse--;
	8532	parse_recursion:
	8533	num = atoi(RExC_parse);
	8534	parse_start = RExC_parse - 1; /* MJD */
	8535	if (*RExC_parse == '-')
	8536	RExC_parse++;
	8537	while (isDIGIT(*RExC_parse))
	8538	RExC_parse++;
	8539	if (*RExC_parse!=')')
	8540	vFAIL("Expecting close bracket");
	8541
	8542	gen_recurse_regop:
	8543	if ( paren == '-' ) {
	8544	/*
	8545	Diagram of capture buffer numbering.
	8546	Top line is the normal capture buffer numbers
	8547	Bottom line is the negative indexing as from
	8548	the X (the (?-2))
	8549
	8550	+ 1 2 3 4 5 X 6 7
	8551	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	8552	- 5 4 3 2 1 X x x
	8553
	8554	*/
	8555	num = RExC_npar + num;
	8556	if (num < 1) {
	8557	RExC_parse++;
	8558	vFAIL("Reference to nonexistent group");
	8559	}
	8560	} else if ( paren == '+' ) {
	8561	num = RExC_npar + num - 1;
	8562	}
	8563
	8564	ret = reganode(pRExC_state, GOSUB, num);
	8565	if (!SIZE_ONLY) {
	8566	if (num > (I32)RExC_rx->nparens) {
	8567	RExC_parse++;
	8568	vFAIL("Reference to nonexistent group");
	8569	}
	8570	ARG2L_SET( ret, RExC_recurse_count++);
	8571	RExC_emit++;
	8572	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8573	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	8574	} else {
	8575	RExC_size++;
	8576	}
	8577	RExC_seen \|= REG_SEEN_RECURSE;
	8578	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	8579	Set_Node_Offset(ret, parse_start); /* MJD */
	8580
	8581	*flagp \|= POSTPONED;
	8582	nextchar(pRExC_state);
	8583	return ret;
	8584	} /* named and numeric backreferences */
	8585	assert(0); /* NOT REACHED */
	8586
	8587	case '?': /* (??...) */
	8588	is_logical = 1;
	8589	if (*RExC_parse != '{') {
	8590	RExC_parse++;
	8591	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8592	/NOTREACHED/
	8593	}
	8594	*flagp \|= POSTPONED;
	8595	paren = *RExC_parse++;
	8596	/* FALL THROUGH */
	8597	case '{': /* (?{...}) */
	8598	{
	8599	U32 n = 0;
	8600	struct reg_code_block *cb;
	8601
	8602	RExC_seen_zerolen++;
	8603
	8604	if ( !pRExC_state->num_code_blocks
	8605	\|\| pRExC_state->code_index >= pRExC_state->num_code_blocks
	8606	\|\| pRExC_state->code_blocks[pRExC_state->code_index].start
	8607	!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
	8608	- RExC_start)
	8609	) {
	8610	if (RExC_pm_flags & PMf_USE_RE_EVAL)
	8611	FAIL("panic: Sequence (?{...}): no code block found\n");
	8612	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	8613	}
	8614	/* this is a pre-compiled code block (?{...}) */
	8615	cb = &pRExC_state->code_blocks[pRExC_state->code_index];
	8616	RExC_parse = RExC_start + cb->end;
	8617	if (!SIZE_ONLY) {
	8618	OP *o = cb->block;
	8619	if (cb->src_regex) {
	8620	n = add_data(pRExC_state, 2, "rl");
	8621	RExC_rxi->data->data[n] =
	8622	(void)SvREFCNT_inc((SV)cb->src_regex);
	8623	RExC_rxi->data->data[n+1] = (void*)o;
	8624	}
	8625	else {
	8626	n = add_data(pRExC_state, 1,
	8627	(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
	8628	RExC_rxi->data->data[n] = (void*)o;
	8629	}
	8630	}
	8631	pRExC_state->code_index++;
	8632	nextchar(pRExC_state);
	8633
	8634	if (is_logical) {
	8635	regnode *eval;
	8636	ret = reg_node(pRExC_state, LOGICAL);
	8637	eval = reganode(pRExC_state, EVAL, n);
	8638	if (!SIZE_ONLY) {
	8639	ret->flags = 2;
	8640	/* for later propagation into (??{}) return value */
	8641	eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
	8642	}
	8643	REGTAIL(pRExC_state, ret, eval);
	8644	/* deal with the length of this later - MJD */
	8645	return ret;
	8646	}
	8647	ret = reganode(pRExC_state, EVAL, n);
	8648	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	8649	Set_Node_Offset(ret, parse_start);
	8650	return ret;
	8651	}
	8652	case '(': /* (?(?{...})...) and (?(?=...)...) */
	8653	{
	8654	int is_define= 0;
	8655	if (RExC_parse[0] == '?') { /* (?(?...)) */
	8656	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	8657	\|\| RExC_parse[1] == '<'
	8658	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	8659	I32 flag;
	8660
	8661	ret = reg_node(pRExC_state, LOGICAL);
	8662	if (!SIZE_ONLY)
	8663	ret->flags = 1;
	8664	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	8665	goto insert_if;
	8666	}
	8667	}
	8668	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	8669	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	8670	{
	8671	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	8672	char *name_start= RExC_parse++;
	8673	U32 num = 0;
	8674	SV *sv_dat=reg_scan_name(pRExC_state,
	8675	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8676	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	8677	vFAIL2("Sequence (?(%c... not terminated",
	8678	(ch == '>' ? '<' : ch));
	8679	RExC_parse++;
	8680	if (!SIZE_ONLY) {
	8681	num = add_data( pRExC_state, 1, "S" );
	8682	RExC_rxi->data->data[num]=(void*)sv_dat;
	8683	SvREFCNT_inc_simple_void(sv_dat);
	8684	}
	8685	ret = reganode(pRExC_state,NGROUPP,num);
	8686	goto insert_if_check_paren;
	8687	}
	8688	else if (RExC_parse[0] == 'D' &&
	8689	RExC_parse[1] == 'E' &&
	8690	RExC_parse[2] == 'F' &&
	8691	RExC_parse[3] == 'I' &&
	8692	RExC_parse[4] == 'N' &&
	8693	RExC_parse[5] == 'E')
	8694	{
	8695	ret = reganode(pRExC_state,DEFINEP,0);
	8696	RExC_parse +=6 ;
	8697	is_define = 1;
	8698	goto insert_if_check_paren;
	8699	}
	8700	else if (RExC_parse[0] == 'R') {
	8701	RExC_parse++;
	8702	parno = 0;
	8703	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	8704	parno = atoi(RExC_parse++);
	8705	while (isDIGIT(*RExC_parse))
	8706	RExC_parse++;
	8707	} else if (RExC_parse[0] == '&') {
	8708	SV *sv_dat;
	8709	RExC_parse++;
	8710	sv_dat = reg_scan_name(pRExC_state,
	8711	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8712	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	8713	}
	8714	ret = reganode(pRExC_state,INSUBP,parno);
	8715	goto insert_if_check_paren;
	8716	}
	8717	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	8718	/* (?(1)...) */
	8719	char c;
	8720	parno = atoi(RExC_parse++);
	8721
	8722	while (isDIGIT(*RExC_parse))
	8723	RExC_parse++;
	8724	ret = reganode(pRExC_state, GROUPP, parno);
	8725
	8726	insert_if_check_paren:
	8727	if ((c = *nextchar(pRExC_state)) != ')')
	8728	vFAIL("Switch condition not recognized");
	8729	insert_if:
	8730	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	8731	br = regbranch(pRExC_state, &flags, 1,depth+1);
	8732	if (br == NULL)
	8733	br = reganode(pRExC_state, LONGJMP, 0);
	8734	else
	8735	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	8736	c = *nextchar(pRExC_state);
	8737	if (flags&HASWIDTH)
	8738	*flagp \|= HASWIDTH;
	8739	if (c == '\|') {
	8740	if (is_define)
	8741	vFAIL("(?(DEFINE)....) does not allow branches");
	8742	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	8743	regbranch(pRExC_state, &flags, 1,depth+1);
	8744	REGTAIL(pRExC_state, ret, lastbr);
	8745	if (flags&HASWIDTH)
	8746	*flagp \|= HASWIDTH;
	8747	c = *nextchar(pRExC_state);
	8748	}
	8749	else
	8750	lastbr = NULL;
	8751	if (c != ')')
	8752	vFAIL("Switch (?(condition)... contains too many branches");
	8753	ender = reg_node(pRExC_state, TAIL);
	8754	REGTAIL(pRExC_state, br, ender);
	8755	if (lastbr) {
	8756	REGTAIL(pRExC_state, lastbr, ender);
	8757	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	8758	}
	8759	else
	8760	REGTAIL(pRExC_state, ret, ender);
	8761	RExC_size++; /* XXX WHY do we need this?!!
	8762	For large programs it seems to be required
	8763	but I can't figure out why. -- dmq*/
	8764	return ret;
	8765	}
	8766	else {
	8767	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	8768	}
	8769	}
	8770	case 0:
	8771	RExC_parse--; /* for vFAIL to print correctly */
	8772	vFAIL("Sequence (? incomplete");
	8773	break;
	8774	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	8775	that follow */
	8776	has_use_defaults = TRUE;
	8777	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	8778	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	8779	? REGEX_UNICODE_CHARSET
	8780	: REGEX_DEPENDS_CHARSET);
	8781	goto parse_flags;
	8782	default:
	8783	--RExC_parse;
	8784	parse_flags: /* (?i) */
	8785	{
	8786	U32 posflags = 0, negflags = 0;
	8787	U32 *flagsp = &posflags;
	8788	char has_charset_modifier = '\0';
	8789	regex_charset cs = get_regex_charset(RExC_flags);
	8790	if (cs == REGEX_DEPENDS_CHARSET
	8791	&& (RExC_utf8 \|\| RExC_uni_semantics))
	8792	{
	8793	cs = REGEX_UNICODE_CHARSET;
	8794	}
	8795
	8796	while (*RExC_parse) {
	8797	/* && strchr("iogcmsx", RExC_parse) /
	8798	/* (?g), (?gc) and (?o) are useless here
	8799	and must be globally applied -- japhy */
	8800	switch (*RExC_parse) {
	8801	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	8802	case LOCALE_PAT_MOD:
	8803	if (has_charset_modifier) {
	8804	goto excess_modifier;
	8805	}
	8806	else if (flagsp == &negflags) {
	8807	goto neg_modifier;
	8808	}
	8809	cs = REGEX_LOCALE_CHARSET;
	8810	has_charset_modifier = LOCALE_PAT_MOD;
	8811	RExC_contains_locale = 1;
	8812	break;
	8813	case UNICODE_PAT_MOD:
	8814	if (has_charset_modifier) {
	8815	goto excess_modifier;
	8816	}
	8817	else if (flagsp == &negflags) {
	8818	goto neg_modifier;
	8819	}
	8820	cs = REGEX_UNICODE_CHARSET;
	8821	has_charset_modifier = UNICODE_PAT_MOD;
	8822	break;
	8823	case ASCII_RESTRICT_PAT_MOD:
	8824	if (flagsp == &negflags) {
	8825	goto neg_modifier;
	8826	}
	8827	if (has_charset_modifier) {
	8828	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	8829	goto excess_modifier;
	8830	}
	8831	/* Doubled modifier implies more restricted */
	8832	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	8833	}
	8834	else {
	8835	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	8836	}
	8837	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	8838	break;
	8839	case DEPENDS_PAT_MOD:
	8840	if (has_use_defaults) {
	8841	goto fail_modifiers;
	8842	}
	8843	else if (flagsp == &negflags) {
	8844	goto neg_modifier;
	8845	}
	8846	else if (has_charset_modifier) {
	8847	goto excess_modifier;
	8848	}
	8849
	8850	/* The dual charset means unicode semantics if the
	8851	* pattern (or target, not known until runtime) are
	8852	* utf8, or something in the pattern indicates unicode
	8853	* semantics */
	8854	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	8855	? REGEX_UNICODE_CHARSET
	8856	: REGEX_DEPENDS_CHARSET;
	8857	has_charset_modifier = DEPENDS_PAT_MOD;
	8858	break;
	8859	excess_modifier:
	8860	RExC_parse++;
	8861	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	8862	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	8863	}
	8864	else if (has_charset_modifier == *(RExC_parse - 1)) {
	8865	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	8866	}
	8867	else {
	8868	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	8869	}
	8870	/NOTREACHED/
	8871	neg_modifier:
	8872	RExC_parse++;
	8873	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	8874	/NOTREACHED/
	8875	case ONCE_PAT_MOD: /* 'o' */
	8876	case GLOBAL_PAT_MOD: /* 'g' */
	8877	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8878	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	8879	if (! (wastedflags & wflagbit) ) {
	8880	wastedflags \|= wflagbit;
	8881	vWARN5(
	8882	RExC_parse + 1,
	8883	"Useless (%s%c) - %suse /%c modifier",
	8884	flagsp == &negflags ? "?-" : "?",
	8885	*RExC_parse,
	8886	flagsp == &negflags ? "don't " : "",
	8887	*RExC_parse
	8888	);
	8889	}
	8890	}
	8891	break;
	8892
	8893	case CONTINUE_PAT_MOD: /* 'c' */
	8894	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8895	if (! (wastedflags & WASTED_C) ) {
	8896	wastedflags \|= WASTED_GC;
	8897	vWARN3(
	8898	RExC_parse + 1,
	8899	"Useless (%sc) - %suse /gc modifier",
	8900	flagsp == &negflags ? "?-" : "?",
	8901	flagsp == &negflags ? "don't " : ""
	8902	);
	8903	}
	8904	}
	8905	break;
	8906	case KEEPCOPY_PAT_MOD: /* 'p' */
	8907	if (flagsp == &negflags) {
	8908	if (SIZE_ONLY)
	8909	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	8910	} else {
	8911	*flagsp \|= RXf_PMf_KEEPCOPY;
	8912	}
	8913	break;
	8914	case '-':
	8915	/* A flag is a default iff it is following a minus, so
	8916	* if there is a minus, it means will be trying to
	8917	* re-specify a default which is an error */
	8918	if (has_use_defaults \|\| flagsp == &negflags) {
	8919	fail_modifiers:
	8920	RExC_parse++;
	8921	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8922	/NOTREACHED/
	8923	}
	8924	flagsp = &negflags;
	8925	wastedflags = 0; /* reset so (?g-c) warns twice */
	8926	break;
	8927	case ':':
	8928	paren = ':';
	8929	/FALLTHROUGH/
	8930	case ')':
	8931	RExC_flags \|= posflags;
	8932	RExC_flags &= ~negflags;
	8933	set_regex_charset(&RExC_flags, cs);
	8934	if (paren != ':') {
	8935	oregflags \|= posflags;
	8936	oregflags &= ~negflags;
	8937	set_regex_charset(&oregflags, cs);
	8938	}
	8939	nextchar(pRExC_state);
	8940	if (paren != ':') {
	8941	*flagp = TRYAGAIN;
	8942	return NULL;
	8943	} else {
	8944	ret = NULL;
	8945	goto parse_rest;
	8946	}
	8947	/NOTREACHED/
	8948	default:
	8949	RExC_parse++;
	8950	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8951	/NOTREACHED/
	8952	}
	8953	++RExC_parse;
	8954	}
	8955	}} /* one for the default block, one for the switch */
	8956	}
	8957	else { /* (...) */
	8958	capturing_parens:
	8959	parno = RExC_npar;
	8960	RExC_npar++;
	8961
	8962	ret = reganode(pRExC_state, OPEN, parno);
	8963	if (!SIZE_ONLY ){
	8964	if (!RExC_nestroot)
	8965	RExC_nestroot = parno;
	8966	if (RExC_seen & REG_SEEN_RECURSE
	8967	&& !RExC_open_parens[parno-1])
	8968	{
	8969	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8970	"Setting open paren #%"IVdf" to %d\n",
	8971	(IV)parno, REG_NODE_NUM(ret)));
	8972	RExC_open_parens[parno-1]= ret;
	8973	}
	8974	}
	8975	Set_Node_Length(ret, 1); /* MJD */
	8976	Set_Node_Offset(ret, RExC_parse); /* MJD */
	8977	is_open = 1;
	8978	}
	8979	}
	8980	else /* ! paren */
	8981	ret = NULL;
	8982
	8983	parse_rest:
	8984	/* Pick up the branches, linking them together. */
	8985	parse_start = RExC_parse; /* MJD */
	8986	br = regbranch(pRExC_state, &flags, 1,depth+1);
	8987
	8988	/* branch_len = (paren != 0); */
	8989
	8990	if (br == NULL)
	8991	return(NULL);
	8992	if (*RExC_parse == '\|') {
	8993	if (!SIZE_ONLY && RExC_extralen) {
	8994	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	8995	}
	8996	else { /* MJD */
	8997	reginsert(pRExC_state, BRANCH, br, depth+1);
	8998	Set_Node_Length(br, paren != 0);
	8999	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	9000	}
	9001	have_branch = 1;
	9002	if (SIZE_ONLY)
	9003	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	9004	}
	9005	else if (paren == ':') {
	9006	*flagp \|= flags&SIMPLE;
	9007	}
	9008	if (is_open) { /* Starts with OPEN. */
	9009	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	9010	}
	9011	else if (paren != '?') /* Not Conditional */
	9012	ret = br;
	9013	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	9014	lastbr = br;
	9015	while (*RExC_parse == '\|') {
	9016	if (!SIZE_ONLY && RExC_extralen) {
	9017	ender = reganode(pRExC_state, LONGJMP,0);
	9018	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	9019	}
	9020	if (SIZE_ONLY)
	9021	RExC_extralen += 2; /* Account for LONGJMP. */
	9022	nextchar(pRExC_state);
	9023	if (freeze_paren) {
	9024	if (RExC_npar > after_freeze)
	9025	after_freeze = RExC_npar;
	9026	RExC_npar = freeze_paren;
	9027	}
	9028	br = regbranch(pRExC_state, &flags, 0, depth+1);
	9029
	9030	if (br == NULL)
	9031	return(NULL);
	9032	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	9033	lastbr = br;
	9034	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	9035	}
	9036
	9037	if (have_branch \|\| paren != ':') {
	9038	/* Make a closing node, and hook it on the end. */
	9039	switch (paren) {
	9040	case ':':
	9041	ender = reg_node(pRExC_state, TAIL);
	9042	break;
	9043	case 1:
	9044	ender = reganode(pRExC_state, CLOSE, parno);
	9045	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	9046	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9047	"Setting close paren #%"IVdf" to %d\n",
	9048	(IV)parno, REG_NODE_NUM(ender)));
	9049	RExC_close_parens[parno-1]= ender;
	9050	if (RExC_nestroot == parno)
	9051	RExC_nestroot = 0;
	9052	}
	9053	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	9054	Set_Node_Length(ender,1); /* MJD */
	9055	break;
	9056	case '<':
	9057	case ',':
	9058	case '=':
	9059	case '!':
	9060	*flagp &= ~HASWIDTH;
	9061	/* FALL THROUGH */
	9062	case '>':
	9063	ender = reg_node(pRExC_state, SUCCEED);
	9064	break;
	9065	case 0:
	9066	ender = reg_node(pRExC_state, END);
	9067	if (!SIZE_ONLY) {
	9068	assert(!RExC_opend); /* there can only be one! */
	9069	RExC_opend = ender;
	9070	}
	9071	break;
	9072	}
	9073	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9074	SV * const mysv_val1=sv_newmortal();
	9075	SV * const mysv_val2=sv_newmortal();
	9076	DEBUG_PARSE_MSG("lsbr");
	9077	regprop(RExC_rx, mysv_val1, lastbr);
	9078	regprop(RExC_rx, mysv_val2, ender);
	9079	PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9080	SvPV_nolen_const(mysv_val1),
	9081	(IV)REG_NODE_NUM(lastbr),
	9082	SvPV_nolen_const(mysv_val2),
	9083	(IV)REG_NODE_NUM(ender),
	9084	(IV)(ender - lastbr)
	9085	);
	9086	});
	9087	REGTAIL(pRExC_state, lastbr, ender);
	9088
	9089	if (have_branch && !SIZE_ONLY) {
	9090	char is_nothing= 1;
	9091	if (depth==1)
	9092	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	9093
	9094	/* Hook the tails of the branches to the closing node. */
	9095	for (br = ret; br; br = regnext(br)) {
	9096	const U8 op = PL_regkind[OP(br)];
	9097	if (op == BRANCH) {
	9098	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	9099	if (OP(NEXTOPER(br)) != NOTHING \|\| regnext(NEXTOPER(br)) != ender)
	9100	is_nothing= 0;
	9101	}
	9102	else if (op == BRANCHJ) {
	9103	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	9104	/* for now we always disable this optimisation * /
	9105	if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING \|\| regnext(NEXTOPER(NEXTOPER(br))) != ender)
	9106	*/
	9107	is_nothing= 0;
	9108	}
	9109	}
	9110	if (is_nothing) {
	9111	br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
	9112	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9113	SV * const mysv_val1=sv_newmortal();
	9114	SV * const mysv_val2=sv_newmortal();
	9115	DEBUG_PARSE_MSG("NADA");
	9116	regprop(RExC_rx, mysv_val1, ret);
	9117	regprop(RExC_rx, mysv_val2, ender);
	9118	PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9119	SvPV_nolen_const(mysv_val1),
	9120	(IV)REG_NODE_NUM(ret),
	9121	SvPV_nolen_const(mysv_val2),
	9122	(IV)REG_NODE_NUM(ender),
	9123	(IV)(ender - ret)
	9124	);
	9125	});
	9126	OP(br)= NOTHING;
	9127	if (OP(ender) == TAIL) {
	9128	NEXT_OFF(br)= 0;
	9129	RExC_emit= br + 1;
	9130	} else {
	9131	regnode *opt;
	9132	for ( opt= br + 1; opt < ender ; opt++ )
	9133	OP(opt)= OPTIMIZED;
	9134	NEXT_OFF(br)= ender - br;
	9135	}
	9136	}
	9137	}
	9138	}
	9139
	9140	{
	9141	const char *p;
	9142	static const char parens[] = "=!<,>";
	9143
	9144	if (paren && (p = strchr(parens, paren))) {
	9145	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	9146	int flag = (p - parens) > 1;
	9147
	9148	if (paren == '>')
	9149	node = SUSPEND, flag = 0;
	9150	reginsert(pRExC_state, node,ret, depth+1);
	9151	Set_Node_Cur_Length(ret);
	9152	Set_Node_Offset(ret, parse_start + 1);
	9153	ret->flags = flag;
	9154	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	9155	}
	9156	}
	9157
	9158	/* Check for proper termination. */
	9159	if (paren) {
	9160	RExC_flags = oregflags;
	9161	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	9162	RExC_parse = oregcomp_parse;
	9163	vFAIL("Unmatched (");
	9164	}
	9165	}
	9166	else if (!paren && RExC_parse < RExC_end) {
	9167	if (*RExC_parse == ')') {
	9168	RExC_parse++;
	9169	vFAIL("Unmatched )");
	9170	}
	9171	else
	9172	FAIL("Junk on end of regexp"); /* "Can't happen". */
	9173	assert(0); /* NOTREACHED */
	9174	}
	9175
	9176	if (RExC_in_lookbehind) {
	9177	RExC_in_lookbehind--;
	9178	}
	9179	if (after_freeze > RExC_npar)
	9180	RExC_npar = after_freeze;
	9181	return(ret);
	9182	}
	9183
	9184	/*
	9185	- regbranch - one alternative of an \| operator
	9186	*
	9187	* Implements the concatenation operator.
	9188	*/
	9189	STATIC regnode *
	9190	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	9191	{
	9192	dVAR;
	9193	register regnode *ret;
	9194	register regnode *chain = NULL;
	9195	register regnode *latest;
	9196	I32 flags = 0, c = 0;
	9197	GET_RE_DEBUG_FLAGS_DECL;
	9198
	9199	PERL_ARGS_ASSERT_REGBRANCH;
	9200
	9201	DEBUG_PARSE("brnc");
	9202
	9203	if (first)
	9204	ret = NULL;
	9205	else {
	9206	if (!SIZE_ONLY && RExC_extralen)
	9207	ret = reganode(pRExC_state, BRANCHJ,0);
	9208	else {
	9209	ret = reg_node(pRExC_state, BRANCH);
	9210	Set_Node_Length(ret, 1);
	9211	}
	9212	}
	9213
	9214	if (!first && SIZE_ONLY)
	9215	RExC_extralen += 1; /* BRANCHJ */
	9216
	9217	flagp = WORST; / Tentatively. */
	9218
	9219	RExC_parse--;
	9220	nextchar(pRExC_state);
	9221	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	9222	flags &= ~TRYAGAIN;
	9223	latest = regpiece(pRExC_state, &flags,depth+1);
	9224	if (latest == NULL) {
	9225	if (flags & TRYAGAIN)
	9226	continue;
	9227	return(NULL);
	9228	}
	9229	else if (ret == NULL)
	9230	ret = latest;
	9231	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	9232	if (chain == NULL) /* First piece. */
	9233	*flagp \|= flags&SPSTART;
	9234	else {
	9235	RExC_naughty++;
	9236	REGTAIL(pRExC_state, chain, latest);
	9237	}
	9238	chain = latest;
	9239	c++;
	9240	}
	9241	if (chain == NULL) { /* Loop ran zero times. */
	9242	chain = reg_node(pRExC_state, NOTHING);
	9243	if (ret == NULL)
	9244	ret = chain;
	9245	}
	9246	if (c == 1) {
	9247	*flagp \|= flags&SIMPLE;
	9248	}
	9249
	9250	return ret;
	9251	}
	9252
	9253	/*
	9254	- regpiece - something followed by possible [*+?]
	9255	*
	9256	* Note that the branching code sequences used for ? and the general cases
	9257	* of * and + are somewhat optimized: they use the same NOTHING node as
	9258	* both the endmarker for their branch list and the body of the last branch.
	9259	* It might seem that this node could be dispensed with entirely, but the
	9260	* endmarker role is not redundant.
	9261	*/
	9262	STATIC regnode *
	9263	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	9264	{
	9265	dVAR;
	9266	register regnode *ret;
	9267	register char op;
	9268	register char *next;
	9269	I32 flags;
	9270	const char * const origparse = RExC_parse;
	9271	I32 min;
	9272	I32 max = REG_INFTY;
	9273	#ifdef RE_TRACK_PATTERN_OFFSETS
	9274	char *parse_start;
	9275	#endif
	9276	const char *maxpos = NULL;
	9277	GET_RE_DEBUG_FLAGS_DECL;
	9278
	9279	PERL_ARGS_ASSERT_REGPIECE;
	9280
	9281	DEBUG_PARSE("piec");
	9282
	9283	ret = regatom(pRExC_state, &flags,depth+1);
	9284	if (ret == NULL) {
	9285	if (flags & TRYAGAIN)
	9286	*flagp \|= TRYAGAIN;
	9287	return(NULL);
	9288	}
	9289
	9290	op = *RExC_parse;
	9291
	9292	if (op == '{' && regcurly(RExC_parse)) {
	9293	maxpos = NULL;
	9294	#ifdef RE_TRACK_PATTERN_OFFSETS
	9295	parse_start = RExC_parse; /* MJD */
	9296	#endif
	9297	next = RExC_parse + 1;
	9298	while (isDIGIT(next) \|\| next == ',') {
	9299	if (*next == ',') {
	9300	if (maxpos)
	9301	break;
	9302	else
	9303	maxpos = next;
	9304	}
	9305	next++;
	9306	}
	9307	if (next == '}') { / got one */
	9308	if (!maxpos)
	9309	maxpos = next;
	9310	RExC_parse++;
	9311	min = atoi(RExC_parse);
	9312	if (*maxpos == ',')
	9313	maxpos++;
	9314	else
	9315	maxpos = RExC_parse;
	9316	max = atoi(maxpos);
	9317	if (!max && *maxpos != '0')
	9318	max = REG_INFTY; /* meaning "infinity" */
	9319	else if (max >= REG_INFTY)
	9320	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	9321	RExC_parse = next;
	9322	nextchar(pRExC_state);
	9323
	9324	do_curly:
	9325	if ((flags&SIMPLE)) {
	9326	RExC_naughty += 2 + RExC_naughty / 2;
	9327	reginsert(pRExC_state, CURLY, ret, depth+1);
	9328	Set_Node_Offset(ret, parse_start+1); /* MJD */
	9329	Set_Node_Cur_Length(ret);
	9330	}
	9331	else {
	9332	regnode * const w = reg_node(pRExC_state, WHILEM);
	9333
	9334	w->flags = 0;
	9335	REGTAIL(pRExC_state, ret, w);
	9336	if (!SIZE_ONLY && RExC_extralen) {
	9337	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	9338	reginsert(pRExC_state, NOTHING,ret, depth+1);
	9339	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	9340	}
	9341	reginsert(pRExC_state, CURLYX,ret, depth+1);
	9342	/* MJD hk */
	9343	Set_Node_Offset(ret, parse_start+1);
	9344	Set_Node_Length(ret,
	9345	op == '{' ? (RExC_parse - parse_start) : 1);
	9346
	9347	if (!SIZE_ONLY && RExC_extralen)
	9348	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	9349	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	9350	if (SIZE_ONLY)
	9351	RExC_whilem_seen++, RExC_extralen += 3;
	9352	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	9353	}
	9354	ret->flags = 0;
	9355
	9356	if (min > 0)
	9357	*flagp = WORST;
	9358	if (max > 0)
	9359	*flagp \|= HASWIDTH;
	9360	if (max < min)
	9361	vFAIL("Can't do {n,m} with n > m");
	9362	if (!SIZE_ONLY) {
	9363	ARG1_SET(ret, (U16)min);
	9364	ARG2_SET(ret, (U16)max);
	9365	}
	9366
	9367	goto nest_check;
	9368	}
	9369	}
	9370
	9371	if (!ISMULT1(op)) {
	9372	*flagp = flags;
	9373	return(ret);
	9374	}
	9375
	9376	#if 0 /* Now runtime fix should be reliable. */
	9377
	9378	/* if this is reinstated, don't forget to put this back into perldiag:
	9379
	9380	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	9381
	9382	(F) The part of the regexp subject to either the * or + quantifier
	9383	could match an empty string. The {#} shows in the regular
	9384	expression about where the problem was discovered.
	9385
	9386	*/
	9387
	9388	if (!(flags&HASWIDTH) && op != '?')
	9389	vFAIL("Regexp *+ operand could be empty");
	9390	#endif
	9391
	9392	#ifdef RE_TRACK_PATTERN_OFFSETS
	9393	parse_start = RExC_parse;
	9394	#endif
	9395	nextchar(pRExC_state);
	9396
	9397	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	9398
	9399	if (op == '*' && (flags&SIMPLE)) {
	9400	reginsert(pRExC_state, STAR, ret, depth+1);
	9401	ret->flags = 0;
	9402	RExC_naughty += 4;
	9403	}
	9404	else if (op == '*') {
	9405	min = 0;
	9406	goto do_curly;
	9407	}
	9408	else if (op == '+' && (flags&SIMPLE)) {
	9409	reginsert(pRExC_state, PLUS, ret, depth+1);
	9410	ret->flags = 0;
	9411	RExC_naughty += 3;
	9412	}
	9413	else if (op == '+') {
	9414	min = 1;
	9415	goto do_curly;
	9416	}
	9417	else if (op == '?') {
	9418	min = 0; max = 1;
	9419	goto do_curly;
	9420	}
	9421	nest_check:
	9422	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	9423	ckWARN3reg(RExC_parse,
	9424	"%.*s matches null string many times",
	9425	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	9426	origparse);
	9427	}
	9428
	9429	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	9430	nextchar(pRExC_state);
	9431	reginsert(pRExC_state, MINMOD, ret, depth+1);
	9432	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	9433	}
	9434	#ifndef REG_ALLOW_MINMOD_SUSPEND
	9435	else
	9436	#endif
	9437	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	9438	regnode *ender;
	9439	nextchar(pRExC_state);
	9440	ender = reg_node(pRExC_state, SUCCEED);
	9441	REGTAIL(pRExC_state, ret, ender);
	9442	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	9443	ret->flags = 0;
	9444	ender = reg_node(pRExC_state, TAIL);
	9445	REGTAIL(pRExC_state, ret, ender);
	9446	/ret= ender;/
	9447	}
	9448
	9449	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	9450	RExC_parse++;
	9451	vFAIL("Nested quantifiers");
	9452	}
	9453
	9454	return(ret);
	9455	}
	9456
	9457
	9458	/* reg_namedseq(pRExC_state,UVp, UV depth)
	9459
	9460	This is expected to be called by a parser routine that has
	9461	recognized '\N' and needs to handle the rest. RExC_parse is
	9462	expected to point at the first char following the N at the time
	9463	of the call.
	9464
	9465	The \N may be inside (indicated by valuep not being NULL) or outside a
	9466	character class.
	9467
	9468	\N may begin either a named sequence, or if outside a character class, mean
	9469	to match a non-newline. For non single-quoted regexes, the tokenizer has
	9470	attempted to decide which, and in the case of a named sequence converted it
	9471	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	9472	where c1... are the characters in the sequence. For single-quoted regexes,
	9473	the tokenizer passes the \N sequence through unchanged; this code will not
	9474	attempt to determine this nor expand those. The net effect is that if the
	9475	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	9476	signals that this \N occurrence means to match a non-newline.
	9477
	9478	Only the \N{U+...} form should occur in a character class, for the same
	9479	reason that '.' inside a character class means to just match a period: it
	9480	just doesn't make sense.
	9481
	9482	If valuep is non-null then it is assumed that we are parsing inside
	9483	of a charclass definition and the first codepoint in the resolved
	9484	string is returned via *valuep and the routine will return NULL.
	9485	In this mode if a multichar string is returned from the charnames
	9486	handler, a warning will be issued, and only the first char in the
	9487	sequence will be examined. If the string returned is zero length
	9488	then the value of *valuep is undefined and NON-NULL will
	9489	be returned to indicate failure. (This will NOT be a valid pointer
	9490	to a regnode.)
	9491
	9492	If valuep is null then it is assumed that we are parsing normal text and a
	9493	new EXACT node is inserted into the program containing the resolved string,
	9494	and a pointer to the new node is returned. But if the string is zero length
	9495	a NOTHING node is emitted instead.
	9496
	9497	On success RExC_parse is set to the char following the endbrace.
	9498	Parsing failures will generate a fatal error via vFAIL(...)
	9499	*/
	9500	STATIC regnode *
	9501	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp, U32 depth)
	9502	{
	9503	char * endbrace; /* '}' following the name */
	9504	regnode *ret = NULL;
	9505	char* p;
	9506
	9507	GET_RE_DEBUG_FLAGS_DECL;
	9508
	9509	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	9510
	9511	GET_RE_DEBUG_FLAGS;
	9512
	9513	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	9514	* modifier. The other meaning does not */
	9515	p = (RExC_flags & RXf_PMf_EXTENDED)
	9516	? regwhite( pRExC_state, RExC_parse )
	9517	: RExC_parse;
	9518
	9519	/* Disambiguate between \N meaning a named character versus \N meaning
	9520	* [^\n]. The former is assumed when it can't be the latter. */
	9521	if (*p != '{' \|\| regcurly(p)) {
	9522	RExC_parse = p;
	9523	if (valuep) {
	9524	/* no bare \N in a charclass */
	9525	vFAIL("\\N in a character class must be a named character: \\N{...}");
	9526	}
	9527	nextchar(pRExC_state);
	9528	ret = reg_node(pRExC_state, REG_ANY);
	9529	*flagp \|= HASWIDTH\|SIMPLE;
	9530	RExC_naughty++;
	9531	RExC_parse--;
	9532	Set_Node_Length(ret, 1); /* MJD */
	9533	return ret;
	9534	}
	9535
	9536	/* Here, we have decided it should be a named sequence */
	9537
	9538	/* The test above made sure that the next real character is a '{', but
	9539	* under the /x modifier, it could be separated by space (or a comment and
	9540	* \n) and this is not allowed (for consistency with \x{...} and the
	9541	* tokenizer handling of \N{NAME}). */
	9542	if (*RExC_parse != '{') {
	9543	vFAIL("Missing braces on \\N{}");
	9544	}
	9545
	9546	RExC_parse++; /* Skip past the '{' */
	9547
	9548	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	9549	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	9550	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	9551	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	9552	{
	9553	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	9554	vFAIL("\\N{NAME} must be resolved by the lexer");
	9555	}
	9556
	9557	if (endbrace == RExC_parse) { /* empty: \N{} */
	9558	if (! valuep) {
	9559	RExC_parse = endbrace + 1;
	9560	return reg_node(pRExC_state,NOTHING);
	9561	}
	9562
	9563	if (SIZE_ONLY) {
	9564	ckWARNreg(RExC_parse,
	9565	"Ignoring zero length \\N{} in character class"
	9566	);
	9567	RExC_parse = endbrace + 1;
	9568	}
	9569	*valuep = 0;
	9570	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	9571	}
	9572
	9573	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	9574	RExC_parse += 2; /* Skip past the 'U+' */
	9575
	9576	if (valuep) { /* In a bracketed char class */
	9577	/* We only pay attention to the first char of
	9578	multichar strings being returned. I kinda wonder
	9579	if this makes sense as it does change the behaviour
	9580	from earlier versions, OTOH that behaviour was broken
	9581	as well. XXX Solution is to recharacterize as
	9582	[rest-of-class]\|multi1\|multi2... */
	9583
	9584	STRLEN length_of_hex;
	9585	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	9586	\| PERL_SCAN_DISALLOW_PREFIX
	9587	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	9588
	9589	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	9590	if (endchar < endbrace) {
	9591	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	9592	}
	9593
	9594	length_of_hex = (STRLEN)(endchar - RExC_parse);
	9595	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	9596
	9597	/* The tokenizer should have guaranteed validity, but it's possible to
	9598	* bypass it by using single quoting, so check */
	9599	if (length_of_hex == 0
	9600	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	9601	{
	9602	RExC_parse += length_of_hex; /* Includes all the valid */
	9603	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	9604	? UTF8SKIP(RExC_parse)
	9605	: 1;
	9606	/* Guard against malformed utf8 */
	9607	if (RExC_parse >= endchar) RExC_parse = endchar;
	9608	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	9609	}
	9610
	9611	RExC_parse = endbrace + 1;
	9612	if (endchar == endbrace) return NULL;
	9613
	9614	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	9615	}
	9616	else { /* Not a char class */
	9617
	9618	/* What is done here is to convert this to a sub-pattern of the form
	9619	* (?:\x{char1}\x{char2}...)
	9620	* and then call reg recursively. That way, it retains its atomicness,
	9621	* while not having to worry about special handling that some code
	9622	* points may have. toke.c has converted the original Unicode values
	9623	* to native, so that we can just pass on the hex values unchanged. We
	9624	* do have to set a flag to keep recoding from happening in the
	9625	* recursion */
	9626
	9627	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	9628	STRLEN len;
	9629	char endchar; / Points to '.' or '}' ending cur char in the input
	9630	stream */
	9631	char *orig_end = RExC_end;
	9632
	9633	while (RExC_parse < endbrace) {
	9634
	9635	/* Code points are separated by dots. If none, there is only one
	9636	* code point, and is terminated by the brace */
	9637	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	9638
	9639	/* Convert to notation the rest of the code understands */
	9640	sv_catpv(substitute_parse, "\\x{");
	9641	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	9642	sv_catpv(substitute_parse, "}");
	9643
	9644	/* Point to the beginning of the next character in the sequence. */
	9645	RExC_parse = endchar + 1;
	9646	}
	9647	sv_catpv(substitute_parse, ")");
	9648
	9649	RExC_parse = SvPV(substitute_parse, len);
	9650
	9651	/* Don't allow empty number */
	9652	if (len < 8) {
	9653	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	9654	}
	9655	RExC_end = RExC_parse + len;
	9656
	9657	/* The values are Unicode, and therefore not subject to recoding */
	9658	RExC_override_recoding = 1;
	9659
	9660	ret = reg(pRExC_state, 1, flagp, depth+1);
	9661
	9662	RExC_parse = endbrace;
	9663	RExC_end = orig_end;
	9664	RExC_override_recoding = 0;
	9665
	9666	nextchar(pRExC_state);
	9667	}
	9668
	9669	return ret;
	9670	}
	9671
	9672
	9673	/*
	9674	* reg_recode
	9675	*
	9676	* It returns the code point in utf8 for the value in *encp.
	9677	* value: a code value in the source encoding
	9678	* encp: a pointer to an Encode object
	9679	*
	9680	* If the result from Encode is not a single character,
	9681	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	9682	*/
	9683	STATIC UV
	9684	S_reg_recode(pTHX_ const char value, SV **encp)
	9685	{
	9686	STRLEN numlen = 1;
	9687	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	9688	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	9689	const STRLEN newlen = SvCUR(sv);
	9690	UV uv = UNICODE_REPLACEMENT;
	9691
	9692	PERL_ARGS_ASSERT_REG_RECODE;
	9693
	9694	if (newlen)
	9695	uv = SvUTF8(sv)
	9696	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	9697	: (U8)s;
	9698
	9699	if (!newlen \|\| numlen != newlen) {
	9700	uv = UNICODE_REPLACEMENT;
	9701	*encp = NULL;
	9702	}
	9703	return uv;
	9704	}
	9705
	9706
	9707	/*
	9708	- regatom - the lowest level
	9709
	9710	Try to identify anything special at the start of the pattern. If there
	9711	is, then handle it as required. This may involve generating a single regop,
	9712	such as for an assertion; or it may involve recursing, such as to
	9713	handle a () structure.
	9714
	9715	If the string doesn't start with something special then we gobble up
	9716	as much literal text as we can.
	9717
	9718	Once we have been able to handle whatever type of thing started the
	9719	sequence, we return.
	9720
	9721	Note: we have to be careful with escapes, as they can be both literal
	9722	and special, and in the case of \10 and friends, context determines which.
	9723
	9724	A summary of the code structure is:
	9725
	9726	switch (first_byte) {
	9727	cases for each special:
	9728	handle this special;
	9729	break;
	9730	case '\\':
	9731	switch (2nd byte) {
	9732	cases for each unambiguous special:
	9733	handle this special;
	9734	break;
	9735	cases for each ambigous special/literal:
	9736	disambiguate;
	9737	if (special) handle here
	9738	else goto defchar;
	9739	default: // unambiguously literal:
	9740	goto defchar;
	9741	}
	9742	default: // is a literal char
	9743	// FALL THROUGH
	9744	defchar:
	9745	create EXACTish node for literal;
	9746	while (more input and node isn't full) {
	9747	switch (input_byte) {
	9748	cases for each special;
	9749	make sure parse pointer is set so that the next call to
	9750	regatom will see this special first
	9751	goto loopdone; // EXACTish node terminated by prev. char
	9752	default:
	9753	append char to EXACTISH node;
	9754	}
	9755	get next input byte;
	9756	}
	9757	loopdone:
	9758	}
	9759	return the generated node;
	9760
	9761	Specifically there are two separate switches for handling
	9762	escape sequences, with the one for handling literal escapes requiring
	9763	a dummy entry for all of the special escapes that are actually handled
	9764	by the other.
	9765	*/
	9766
	9767	STATIC regnode *
	9768	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	9769	{
	9770	dVAR;
	9771	register regnode *ret = NULL;
	9772	I32 flags;
	9773	char *parse_start = RExC_parse;
	9774	U8 op;
	9775	GET_RE_DEBUG_FLAGS_DECL;
	9776	DEBUG_PARSE("atom");
	9777	flagp = WORST; / Tentatively. */
	9778
	9779	PERL_ARGS_ASSERT_REGATOM;
	9780
	9781	tryagain:
	9782	switch ((U8)*RExC_parse) {
	9783	case '^':
	9784	RExC_seen_zerolen++;
	9785	nextchar(pRExC_state);
	9786	if (RExC_flags & RXf_PMf_MULTILINE)
	9787	ret = reg_node(pRExC_state, MBOL);
	9788	else if (RExC_flags & RXf_PMf_SINGLELINE)
	9789	ret = reg_node(pRExC_state, SBOL);
	9790	else
	9791	ret = reg_node(pRExC_state, BOL);
	9792	Set_Node_Length(ret, 1); /* MJD */
	9793	break;
	9794	case '$':
	9795	nextchar(pRExC_state);
	9796	if (*RExC_parse)
	9797	RExC_seen_zerolen++;
	9798	if (RExC_flags & RXf_PMf_MULTILINE)
	9799	ret = reg_node(pRExC_state, MEOL);
	9800	else if (RExC_flags & RXf_PMf_SINGLELINE)
	9801	ret = reg_node(pRExC_state, SEOL);
	9802	else
	9803	ret = reg_node(pRExC_state, EOL);
	9804	Set_Node_Length(ret, 1); /* MJD */
	9805	break;
	9806	case '.':
	9807	nextchar(pRExC_state);
	9808	if (RExC_flags & RXf_PMf_SINGLELINE)
	9809	ret = reg_node(pRExC_state, SANY);
	9810	else
	9811	ret = reg_node(pRExC_state, REG_ANY);
	9812	*flagp \|= HASWIDTH\|SIMPLE;
	9813	RExC_naughty++;
	9814	Set_Node_Length(ret, 1); /* MJD */
	9815	break;
	9816	case '[':
	9817	{
	9818	char * const oregcomp_parse = ++RExC_parse;
	9819	ret = regclass(pRExC_state,depth+1);
	9820	if (*RExC_parse != ']') {
	9821	RExC_parse = oregcomp_parse;
	9822	vFAIL("Unmatched [");
	9823	}
	9824	nextchar(pRExC_state);
	9825	*flagp \|= HASWIDTH\|SIMPLE;
	9826	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	9827	break;
	9828	}
	9829	case '(':
	9830	nextchar(pRExC_state);
	9831	ret = reg(pRExC_state, 1, &flags,depth+1);
	9832	if (ret == NULL) {
	9833	if (flags & TRYAGAIN) {
	9834	if (RExC_parse == RExC_end) {
	9835	/* Make parent create an empty node if needed. */
	9836	*flagp \|= TRYAGAIN;
	9837	return(NULL);
	9838	}
	9839	goto tryagain;
	9840	}
	9841	return(NULL);
	9842	}
	9843	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	9844	break;
	9845	case '\|':
	9846	case ')':
	9847	if (flags & TRYAGAIN) {
	9848	*flagp \|= TRYAGAIN;
	9849	return NULL;
	9850	}
	9851	vFAIL("Internal urp");
	9852	/* Supposed to be caught earlier. */
	9853	break;
	9854	case '?':
	9855	case '+':
	9856	case '*':
	9857	RExC_parse++;
	9858	vFAIL("Quantifier follows nothing");
	9859	break;
	9860	case '\\':
	9861	/* Special Escapes
	9862
	9863	This switch handles escape sequences that resolve to some kind
	9864	of special regop and not to literal text. Escape sequnces that
	9865	resolve to literal text are handled below in the switch marked
	9866	"Literal Escapes".
	9867
	9868	Every entry in this switch must have a corresponding entry
	9869	in the literal escape switch. However, the opposite is not
	9870	required, as the default for this switch is to jump to the
	9871	literal text handling code.
	9872	*/
	9873	switch ((U8)*++RExC_parse) {
	9874	/* Special Escapes */
	9875	case 'A':
	9876	RExC_seen_zerolen++;
	9877	ret = reg_node(pRExC_state, SBOL);
	9878	*flagp \|= SIMPLE;
	9879	goto finish_meta_pat;
	9880	case 'G':
	9881	ret = reg_node(pRExC_state, GPOS);
	9882	RExC_seen \|= REG_SEEN_GPOS;
	9883	*flagp \|= SIMPLE;
	9884	goto finish_meta_pat;
	9885	case 'K':
	9886	RExC_seen_zerolen++;
	9887	ret = reg_node(pRExC_state, KEEPS);
	9888	*flagp \|= SIMPLE;
	9889	/* XXX:dmq : disabling in-place substitution seems to
	9890	* be necessary here to avoid cases of memory corruption, as
	9891	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	9892	*/
	9893	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9894	goto finish_meta_pat;
	9895	case 'Z':
	9896	ret = reg_node(pRExC_state, SEOL);
	9897	*flagp \|= SIMPLE;
	9898	RExC_seen_zerolen++; /* Do not optimize RE away */
	9899	goto finish_meta_pat;
	9900	case 'z':
	9901	ret = reg_node(pRExC_state, EOS);
	9902	*flagp \|= SIMPLE;
	9903	RExC_seen_zerolen++; /* Do not optimize RE away */
	9904	goto finish_meta_pat;
	9905	case 'C':
	9906	ret = reg_node(pRExC_state, CANY);
	9907	RExC_seen \|= REG_SEEN_CANY;
	9908	*flagp \|= HASWIDTH\|SIMPLE;
	9909	goto finish_meta_pat;
	9910	case 'X':
	9911	ret = reg_node(pRExC_state, CLUMP);
	9912	*flagp \|= HASWIDTH;
	9913	goto finish_meta_pat;
	9914	case 'w':
	9915	op = ALNUM + get_regex_charset(RExC_flags);
	9916	if (op > ALNUMA) { /* /aa is same as /a */
	9917	op = ALNUMA;
	9918	}
	9919	ret = reg_node(pRExC_state, op);
	9920	*flagp \|= HASWIDTH\|SIMPLE;
	9921	goto finish_meta_pat;
	9922	case 'W':
	9923	op = NALNUM + get_regex_charset(RExC_flags);
	9924	if (op > NALNUMA) { /* /aa is same as /a */
	9925	op = NALNUMA;
	9926	}
	9927	ret = reg_node(pRExC_state, op);
	9928	*flagp \|= HASWIDTH\|SIMPLE;
	9929	goto finish_meta_pat;
	9930	case 'b':
	9931	RExC_seen_zerolen++;
	9932	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9933	op = BOUND + get_regex_charset(RExC_flags);
	9934	if (op > BOUNDA) { /* /aa is same as /a */
	9935	op = BOUNDA;
	9936	}
	9937	ret = reg_node(pRExC_state, op);
	9938	FLAGS(ret) = get_regex_charset(RExC_flags);
	9939	*flagp \|= SIMPLE;
	9940	goto finish_meta_pat;
	9941	case 'B':
	9942	RExC_seen_zerolen++;
	9943	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	9944	op = NBOUND + get_regex_charset(RExC_flags);
	9945	if (op > NBOUNDA) { /* /aa is same as /a */
	9946	op = NBOUNDA;
	9947	}
	9948	ret = reg_node(pRExC_state, op);
	9949	FLAGS(ret) = get_regex_charset(RExC_flags);
	9950	*flagp \|= SIMPLE;
	9951	goto finish_meta_pat;
	9952	case 's':
	9953	op = SPACE + get_regex_charset(RExC_flags);
	9954	if (op > SPACEA) { /* /aa is same as /a */
	9955	op = SPACEA;
	9956	}
	9957	ret = reg_node(pRExC_state, op);
	9958	*flagp \|= HASWIDTH\|SIMPLE;
	9959	goto finish_meta_pat;
	9960	case 'S':
	9961	op = NSPACE + get_regex_charset(RExC_flags);
	9962	if (op > NSPACEA) { /* /aa is same as /a */
	9963	op = NSPACEA;
	9964	}
	9965	ret = reg_node(pRExC_state, op);
	9966	*flagp \|= HASWIDTH\|SIMPLE;
	9967	goto finish_meta_pat;
	9968	case 'D':
	9969	op = NDIGIT;
	9970	goto join_D_and_d;
	9971	case 'd':
	9972	op = DIGIT;
	9973	join_D_and_d:
	9974	{
	9975	U8 offset = get_regex_charset(RExC_flags);
	9976	if (offset == REGEX_UNICODE_CHARSET) {
	9977	offset = REGEX_DEPENDS_CHARSET;
	9978	}
	9979	else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
	9980	offset = REGEX_ASCII_RESTRICTED_CHARSET;
	9981	}
	9982	op += offset;
	9983	}
	9984	ret = reg_node(pRExC_state, op);
	9985	*flagp \|= HASWIDTH\|SIMPLE;
	9986	goto finish_meta_pat;
	9987	case 'R':
	9988	ret = reg_node(pRExC_state, LNBREAK);
	9989	*flagp \|= HASWIDTH\|SIMPLE;
	9990	goto finish_meta_pat;
	9991	case 'h':
	9992	ret = reg_node(pRExC_state, HORIZWS);
	9993	*flagp \|= HASWIDTH\|SIMPLE;
	9994	goto finish_meta_pat;
	9995	case 'H':
	9996	ret = reg_node(pRExC_state, NHORIZWS);
	9997	*flagp \|= HASWIDTH\|SIMPLE;
	9998	goto finish_meta_pat;
	9999	case 'v':
	10000	ret = reg_node(pRExC_state, VERTWS);
	10001	*flagp \|= HASWIDTH\|SIMPLE;
	10002	goto finish_meta_pat;
	10003	case 'V':
	10004	ret = reg_node(pRExC_state, NVERTWS);
	10005	*flagp \|= HASWIDTH\|SIMPLE;
	10006	finish_meta_pat:
	10007	nextchar(pRExC_state);
	10008	Set_Node_Length(ret, 2); /* MJD */
	10009	break;
	10010	case 'p':
	10011	case 'P':
	10012	{
	10013	char* const oldregxend = RExC_end;
	10014	#ifdef DEBUGGING
	10015	char* parse_start = RExC_parse - 2;
	10016	#endif
	10017
	10018	if (RExC_parse[1] == '{') {
	10019	/* a lovely hack--pretend we saw [\pX] instead */
	10020	RExC_end = strchr(RExC_parse, '}');
	10021	if (!RExC_end) {
	10022	const U8 c = (U8)*RExC_parse;
	10023	RExC_parse += 2;
	10024	RExC_end = oldregxend;
	10025	vFAIL2("Missing right brace on \\%c{}", c);
	10026	}
	10027	RExC_end++;
	10028	}
	10029	else {
	10030	RExC_end = RExC_parse + 2;
	10031	if (RExC_end > oldregxend)
	10032	RExC_end = oldregxend;
	10033	}
	10034	RExC_parse--;
	10035
	10036	ret = regclass(pRExC_state,depth+1);
	10037
	10038	RExC_end = oldregxend;
	10039	RExC_parse--;
	10040
	10041	Set_Node_Offset(ret, parse_start + 2);
	10042	Set_Node_Cur_Length(ret);
	10043	nextchar(pRExC_state);
	10044	*flagp \|= HASWIDTH\|SIMPLE;
	10045	}
	10046	break;
	10047	case 'N':
	10048	/* Handle \N and \N{NAME} here and not below because it can be
	10049	multicharacter. join_exact() will join them up later on.
	10050	Also this makes sure that things like /\N{BLAH}+/ and
	10051	\N{BLAH} being multi char Just Happen. dmq*/
	10052	++RExC_parse;
	10053	ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
	10054	break;
	10055	case 'k': /* Handle \k<NAME> and \k'NAME' */
	10056	parse_named_seq:
	10057	{
	10058	char ch= RExC_parse[1];
	10059	if (ch != '<' && ch != '\'' && ch != '{') {
	10060	RExC_parse++;
	10061	vFAIL2("Sequence %.2s... not terminated",parse_start);
	10062	} else {
	10063	/* this pretty much dupes the code for (?P=...) in reg(), if
	10064	you change this make sure you change that */
	10065	char* name_start = (RExC_parse += 2);
	10066	U32 num = 0;
	10067	SV *sv_dat = reg_scan_name(pRExC_state,
	10068	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	10069	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	10070	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	10071	vFAIL2("Sequence %.3s... not terminated",parse_start);
	10072
	10073	if (!SIZE_ONLY) {
	10074	num = add_data( pRExC_state, 1, "S" );
	10075	RExC_rxi->data->data[num]=(void*)sv_dat;
	10076	SvREFCNT_inc_simple_void(sv_dat);
	10077	}
	10078
	10079	RExC_sawback = 1;
	10080	ret = reganode(pRExC_state,
	10081	((! FOLD)
	10082	? NREF
	10083	: (MORE_ASCII_RESTRICTED)
	10084	? NREFFA
	10085	: (AT_LEAST_UNI_SEMANTICS)
	10086	? NREFFU
	10087	: (LOC)
	10088	? NREFFL
	10089	: NREFF),
	10090	num);
	10091	*flagp \|= HASWIDTH;
	10092
	10093	/* override incorrect value set in reganode MJD */
	10094	Set_Node_Offset(ret, parse_start+1);
	10095	Set_Node_Cur_Length(ret); /* MJD */
	10096	nextchar(pRExC_state);
	10097
	10098	}
	10099	break;
	10100	}
	10101	case 'g':
	10102	case '1': case '2': case '3': case '4':
	10103	case '5': case '6': case '7': case '8': case '9':
	10104	{
	10105	I32 num;
	10106	bool isg = *RExC_parse == 'g';
	10107	bool isrel = 0;
	10108	bool hasbrace = 0;
	10109	if (isg) {
	10110	RExC_parse++;
	10111	if (*RExC_parse == '{') {
	10112	RExC_parse++;
	10113	hasbrace = 1;
	10114	}
	10115	if (*RExC_parse == '-') {
	10116	RExC_parse++;
	10117	isrel = 1;
	10118	}
	10119	if (hasbrace && !isDIGIT(*RExC_parse)) {
	10120	if (isrel) RExC_parse--;
	10121	RExC_parse -= 2;
	10122	goto parse_named_seq;
	10123	} }
	10124	num = atoi(RExC_parse);
	10125	if (isg && num == 0)
	10126	vFAIL("Reference to invalid group 0");
	10127	if (isrel) {
	10128	num = RExC_npar - num;
	10129	if (num < 1)
	10130	vFAIL("Reference to nonexistent or unclosed group");
	10131	}
	10132	if (!isg && num > 9 && num >= RExC_npar)
	10133	/* Probably a character specified in octal, e.g. \35 */
	10134	goto defchar;
	10135	else {
	10136	char * const parse_start = RExC_parse - 1; /* MJD */
	10137	while (isDIGIT(*RExC_parse))
	10138	RExC_parse++;
	10139	if (parse_start == RExC_parse - 1)
	10140	vFAIL("Unterminated \\g... pattern");
	10141	if (hasbrace) {
	10142	if (*RExC_parse != '}')
	10143	vFAIL("Unterminated \\g{...} pattern");
	10144	RExC_parse++;
	10145	}
	10146	if (!SIZE_ONLY) {
	10147	if (num > (I32)RExC_rx->nparens)
	10148	vFAIL("Reference to nonexistent group");
	10149	}
	10150	RExC_sawback = 1;
	10151	ret = reganode(pRExC_state,
	10152	((! FOLD)
	10153	? REF
	10154	: (MORE_ASCII_RESTRICTED)
	10155	? REFFA
	10156	: (AT_LEAST_UNI_SEMANTICS)
	10157	? REFFU
	10158	: (LOC)
	10159	? REFFL
	10160	: REFF),
	10161	num);
	10162	*flagp \|= HASWIDTH;
	10163
	10164	/* override incorrect value set in reganode MJD */
	10165	Set_Node_Offset(ret, parse_start+1);
	10166	Set_Node_Cur_Length(ret); /* MJD */
	10167	RExC_parse--;
	10168	nextchar(pRExC_state);
	10169	}
	10170	}
	10171	break;
	10172	case '\0':
	10173	if (RExC_parse >= RExC_end)
	10174	FAIL("Trailing \\");
	10175	/* FALL THROUGH */
	10176	default:
	10177	/* Do not generate "unrecognized" warnings here, we fall
	10178	back into the quick-grab loop below */
	10179	parse_start--;
	10180	goto defchar;
	10181	}
	10182	break;
	10183
	10184	case '#':
	10185	if (RExC_flags & RXf_PMf_EXTENDED) {
	10186	if ( reg_skipcomment( pRExC_state ) )
	10187	goto tryagain;
	10188	}
	10189	/* FALL THROUGH */
	10190
	10191	default:
	10192
	10193	parse_start = RExC_parse - 1;
	10194
	10195	RExC_parse++;
	10196
	10197	defchar: {
	10198	register STRLEN len;
	10199	register UV ender;
	10200	register char *p;
	10201	char *s;
	10202	STRLEN foldlen;
	10203	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	10204	U8 node_type;
	10205
	10206	/* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
	10207	* it is folded to 'ss' even if not utf8 */
	10208	bool is_exactfu_sharp_s;
	10209
	10210	ender = 0;
	10211	if (! FOLD) {
	10212	node_type = EXACT;
	10213	}
	10214	else {
	10215	node_type = get_regex_charset(RExC_flags);
	10216	if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) {
	10217	node_type--; /* /a is same as /u, and map /aa's offset to
	10218	what /a's would have been, so there is no
	10219	hole */
	10220	}
	10221	node_type += EXACTF;
	10222	}
	10223	ret = reg_node(pRExC_state, node_type);
	10224	s = STRING(ret);
	10225
	10226	/* XXX The node can hold up to 255 bytes, yet this only goes to
	10227	* 127. I (khw) do not know why. Keeping it somewhat less than
	10228	* 255 allows us to not have to worry about overflow due to
	10229	* converting to utf8 and fold expansion, but that value is
	10230	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	10231	* split up by this limit into a single one using the real max of
	10232	* 255. Even at 127, this breaks under rare circumstances. If
	10233	* folding, we do not want to split a node at a character that is a
	10234	* non-final in a multi-char fold, as an input string could just
	10235	* happen to want to match across the node boundary. The join
	10236	* would solve that problem if the join actually happens. But a
	10237	* series of more than two nodes in a row each of 127 would cause
	10238	* the first join to succeed to get to 254, but then there wouldn't
	10239	* be room for the next one, which could at be one of those split
	10240	* multi-char folds. I don't know of any fool-proof solution. One
	10241	* could back off to end with only a code point that isn't such a
	10242	* non-final, but it is possible for there not to be any in the
	10243	* entire node. */
	10244	for (len = 0, p = RExC_parse - 1;
	10245	len < 127 && p < RExC_end;
	10246	len++)
	10247	{
	10248	char * const oldp = p;
	10249
	10250	if (RExC_flags & RXf_PMf_EXTENDED)
	10251	p = regwhite( pRExC_state, p );
	10252	switch ((U8)*p) {
	10253	case '^':
	10254	case '$':
	10255	case '.':
	10256	case '[':
	10257	case '(':
	10258	case ')':
	10259	case '\|':
	10260	goto loopdone;
	10261	case '\\':
	10262	/* Literal Escapes Switch
	10263
	10264	This switch is meant to handle escape sequences that
	10265	resolve to a literal character.
	10266
	10267	Every escape sequence that represents something
	10268	else, like an assertion or a char class, is handled
	10269	in the switch marked 'Special Escapes' above in this
	10270	routine, but also has an entry here as anything that
	10271	isn't explicitly mentioned here will be treated as
	10272	an unescaped equivalent literal.
	10273	*/
	10274
	10275	switch ((U8)*++p) {
	10276	/* These are all the special escapes. */
	10277	case 'A': /* Start assertion */
	10278	case 'b': case 'B': /* Word-boundary assertion*/
	10279	case 'C': /* Single char !DANGEROUS! */
	10280	case 'd': case 'D': /* digit class */
	10281	case 'g': case 'G': /* generic-backref, pos assertion */
	10282	case 'h': case 'H': /* HORIZWS */
	10283	case 'k': case 'K': /* named backref, keep marker */
	10284	case 'N': /* named char sequence */
	10285	case 'p': case 'P': /* Unicode property */
	10286	case 'R': /* LNBREAK */
	10287	case 's': case 'S': /* space class */
	10288	case 'v': case 'V': /* VERTWS */
	10289	case 'w': case 'W': /* word class */
	10290	case 'X': /* eXtended Unicode "combining character sequence" */
	10291	case 'z': case 'Z': /* End of line/string assertion */
	10292	--p;
	10293	goto loopdone;
	10294
	10295	/* Anything after here is an escape that resolves to a
	10296	literal. (Except digits, which may or may not)
	10297	*/
	10298	case 'n':
	10299	ender = '\n';
	10300	p++;
	10301	break;
	10302	case 'r':
	10303	ender = '\r';
	10304	p++;
	10305	break;
	10306	case 't':
	10307	ender = '\t';
	10308	p++;
	10309	break;
	10310	case 'f':
	10311	ender = '\f';
	10312	p++;
	10313	break;
	10314	case 'e':
	10315	ender = ASCII_TO_NATIVE('\033');
	10316	p++;
	10317	break;
	10318	case 'a':
	10319	ender = ASCII_TO_NATIVE('\007');
	10320	p++;
	10321	break;
	10322	case 'o':
	10323	{
	10324	STRLEN brace_len = len;
	10325	UV result;
	10326	const char* error_msg;
	10327
	10328	bool valid = grok_bslash_o(p,
	10329	&result,
	10330	&brace_len,
	10331	&error_msg,
	10332	1);
	10333	p += brace_len;
	10334	if (! valid) {
	10335	RExC_parse = p; /* going to die anyway; point
	10336	to exact spot of failure */
	10337	vFAIL(error_msg);
	10338	}
	10339	else
	10340	{
	10341	ender = result;
	10342	}
	10343	if (PL_encoding && ender < 0x100) {
	10344	goto recode_encoding;
	10345	}
	10346	if (ender > 0xff) {
	10347	REQUIRE_UTF8;
	10348	}
	10349	break;
	10350	}
	10351	case 'x':
	10352	{
	10353	STRLEN brace_len = len;
	10354	UV result;
	10355	const char* error_msg;
	10356
	10357	bool valid = grok_bslash_x(p,
	10358	&result,
	10359	&brace_len,
	10360	&error_msg,
	10361	1);
	10362	p += brace_len;
	10363	if (! valid) {
	10364	RExC_parse = p; /* going to die anyway; point
	10365	to exact spot of failure */
	10366	vFAIL(error_msg);
	10367	}
	10368	else {
	10369	ender = result;
	10370	}
	10371	if (PL_encoding && ender < 0x100) {
	10372	goto recode_encoding;
	10373	}
	10374	if (ender > 0xff) {
	10375	REQUIRE_UTF8;
	10376	}
	10377	break;
	10378	}
	10379	case 'c':
	10380	p++;
	10381	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	10382	break;
	10383	case '0': case '1': case '2': case '3':case '4':
	10384	case '5': case '6': case '7':
	10385	if (*p == '0' \|\|
	10386	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	10387	{
	10388	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	10389	STRLEN numlen = 3;
	10390	ender = grok_oct(p, &numlen, &flags, NULL);
	10391	if (ender > 0xff) {
	10392	REQUIRE_UTF8;
	10393	}
	10394	p += numlen;
	10395	}
	10396	else {
	10397	--p;
	10398	goto loopdone;
	10399	}
	10400	if (PL_encoding && ender < 0x100)
	10401	goto recode_encoding;
	10402	break;
	10403	recode_encoding:
	10404	if (! RExC_override_recoding) {
	10405	SV* enc = PL_encoding;
	10406	ender = reg_recode((const char)(U8)ender, &enc);
	10407	if (!enc && SIZE_ONLY)
	10408	ckWARNreg(p, "Invalid escape in the specified encoding");
	10409	REQUIRE_UTF8;
	10410	}
	10411	break;
	10412	case '\0':
	10413	if (p >= RExC_end)
	10414	FAIL("Trailing \\");
	10415	/* FALL THROUGH */
	10416	default:
	10417	if (!SIZE_ONLY&& isALNUMC(*p)) {
	10418	ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
	10419	}
	10420	goto normal_default;
	10421	}
	10422	break;
	10423	case '{':
	10424	/* Currently we don't warn when the lbrace is at the start
	10425	* of a construct. This catches it in the middle of a
	10426	* literal string, or when its the first thing after
	10427	* something like "\b" */
	10428	if (! SIZE_ONLY
	10429	&& (len \|\| (p > RExC_start && isALPHA_A(*(p -1)))))
	10430	{
	10431	ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
	10432	}
	10433	/FALLTHROUGH/
	10434	default:
	10435	normal_default:
	10436	if (UTF8_IS_START(*p) && UTF) {
	10437	STRLEN numlen;
	10438	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	10439	&numlen, UTF8_ALLOW_DEFAULT);
	10440	p += numlen;
	10441	}
	10442	else
	10443	ender = (U8) *p++;
	10444	break;
	10445	} /* End of switch on the literal */
	10446
	10447	is_exactfu_sharp_s = (node_type == EXACTFU
	10448	&& ender == LATIN_SMALL_LETTER_SHARP_S);
	10449	if ( RExC_flags & RXf_PMf_EXTENDED)
	10450	p = regwhite( pRExC_state, p );
	10451	if ((UTF && FOLD) \|\| is_exactfu_sharp_s) {
	10452	/* Prime the casefolded buffer. Locale rules, which apply
	10453	* only to code points < 256, aren't known until execution,
	10454	* so for them, just output the original character using
	10455	* utf8. If we start to fold non-UTF patterns, be sure to
	10456	* update join_exact() */
	10457	if (LOC && ender < 256) {
	10458	if (UNI_IS_INVARIANT(ender)) {
	10459	*tmpbuf = (U8) ender;
	10460	foldlen = 1;
	10461	} else {
	10462	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	10463	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	10464	foldlen = 2;
	10465	}
	10466	}
	10467	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	10468	*/
	10469	ender = toLOWER(ender);
	10470	*tmpbuf = (U8) ender;
	10471	foldlen = 1;
	10472	}
	10473	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	10474
	10475	/* Locale and /aa require more selectivity about the
	10476	* fold, so are handled below. Otherwise, here, just
	10477	* use the fold */
	10478	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	10479	}
	10480	else {
	10481	/* Under locale rules or /aa we are not to mix,
	10482	* respectively, ords < 256 or ASCII with non-. So
	10483	* reject folds that mix them, using only the
	10484	* non-folded code point. So do the fold to a
	10485	* temporary, and inspect each character in it. */
	10486	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	10487	U8* s = trialbuf;
	10488	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	10489	U8* e = s + foldlen;
	10490	bool fold_ok = TRUE;
	10491
	10492	while (s < e) {
	10493	if (isASCII(*s)
	10494	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	10495	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	10496	{
	10497	fold_ok = FALSE;
	10498	break;
	10499	}
	10500	s += UTF8SKIP(s);
	10501	}
	10502	if (fold_ok) {
	10503	Copy(trialbuf, tmpbuf, foldlen, U8);
	10504	ender = tmpender;
	10505	}
	10506	else {
	10507	uvuni_to_utf8(tmpbuf, ender);
	10508	foldlen = UNISKIP(ender);
	10509	}
	10510	}
	10511	}
	10512	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	10513	if (len)
	10514	p = oldp;
	10515	else if (UTF \|\| is_exactfu_sharp_s) {
	10516	if (FOLD) {
	10517	/* Emit all the Unicode characters. */
	10518	STRLEN numlen;
	10519	for (foldbuf = tmpbuf;
	10520	foldlen;
	10521	foldlen -= numlen) {
	10522
	10523	/* tmpbuf has been constructed by us, so we
	10524	* know it is valid utf8 */
	10525	ender = valid_utf8_to_uvchr(foldbuf, &numlen);
	10526	if (numlen > 0) {
	10527	const STRLEN unilen = reguni(pRExC_state, ender, s);
	10528	s += unilen;
	10529	len += unilen;
	10530	/* In EBCDIC the numlen
	10531	* and unilen can differ. */
	10532	foldbuf += numlen;
	10533	if (numlen >= foldlen)
	10534	break;
	10535	}
	10536	else
	10537	break; /* "Can't happen." */
	10538	}
	10539	}
	10540	else {
	10541	const STRLEN unilen = reguni(pRExC_state, ender, s);
	10542	if (unilen > 0) {
	10543	s += unilen;
	10544	len += unilen;
	10545	}
	10546	}
	10547	}
	10548	else {
	10549	len++;
	10550	REGC((char)ender, s++);
	10551	}
	10552	break;
	10553	}
	10554	if (UTF \|\| is_exactfu_sharp_s) {
	10555	if (FOLD) {
	10556	/* Emit all the Unicode characters. */
	10557	STRLEN numlen;
	10558	for (foldbuf = tmpbuf;
	10559	foldlen;
	10560	foldlen -= numlen) {
	10561	ender = valid_utf8_to_uvchr(foldbuf, &numlen);
	10562	if (numlen > 0) {
	10563	const STRLEN unilen = reguni(pRExC_state, ender, s);
	10564	len += unilen;
	10565	s += unilen;
	10566	/* In EBCDIC the numlen
	10567	* and unilen can differ. */
	10568	foldbuf += numlen;
	10569	if (numlen >= foldlen)
	10570	break;
	10571	}
	10572	else
	10573	break;
	10574	}
	10575	}
	10576	else {
	10577	const STRLEN unilen = reguni(pRExC_state, ender, s);
	10578	if (unilen > 0) {
	10579	s += unilen;
	10580	len += unilen;
	10581	}
	10582	}
	10583	len--;
	10584	}
	10585	else {
	10586	REGC((char)ender, s++);
	10587	}
	10588	}
	10589	loopdone: /* Jumped to when encounters something that shouldn't be in
	10590	the node */
	10591	RExC_parse = p - 1;
	10592	Set_Node_Cur_Length(ret); /* MJD */
	10593	nextchar(pRExC_state);
	10594	{
	10595	/* len is STRLEN which is unsigned, need to copy to signed */
	10596	IV iv = len;
	10597	if (iv < 0)
	10598	vFAIL("Internal disaster");
	10599	}
	10600	if (len > 0)
	10601	*flagp \|= HASWIDTH;
	10602	if (len == 1 && UNI_IS_INVARIANT(ender))
	10603	*flagp \|= SIMPLE;
	10604
	10605	if (SIZE_ONLY)
	10606	RExC_size += STR_SZ(len);
	10607	else {
	10608	STR_LEN(ret) = len;
	10609	RExC_emit += STR_SZ(len);
	10610	}
	10611	}
	10612	break;
	10613	}
	10614
	10615	return(ret);
	10616	}
	10617
	10618	STATIC char *
	10619	S_regwhite( RExC_state_t pRExC_state, char p )
	10620	{
	10621	const char *e = RExC_end;
	10622
	10623	PERL_ARGS_ASSERT_REGWHITE;
	10624
	10625	while (p < e) {
	10626	if (isSPACE(*p))
	10627	++p;
	10628	else if (*p == '#') {
	10629	bool ended = 0;
	10630	do {
	10631	if (*p++ == '\n') {
	10632	ended = 1;
	10633	break;
	10634	}
	10635	} while (p < e);
	10636	if (!ended)
	10637	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	10638	}
	10639	else
	10640	break;
	10641	}
	10642	return p;
	10643	}
	10644
	10645	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	10646	Character classes ([:foo:]) can also be negated ([:^foo:]).
	10647	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	10648	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	10649	but trigger failures because they are currently unimplemented. */
	10650
	10651	#define POSIXCC_DONE(c) ((c) == ':')
	10652	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	10653	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	10654
	10655	STATIC I32
	10656	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	10657	{
	10658	dVAR;
	10659	I32 namedclass = OOB_NAMEDCLASS;
	10660
	10661	PERL_ARGS_ASSERT_REGPPOSIXCC;
	10662
	10663	if (value == '[' && RExC_parse + 1 < RExC_end &&
	10664	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	10665	POSIXCC(UCHARAT(RExC_parse))) {
	10666	const char c = UCHARAT(RExC_parse);
	10667	char* const s = RExC_parse++;
	10668
	10669	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	10670	RExC_parse++;
	10671	if (RExC_parse == RExC_end)
	10672	/* Grandfather lone [:, [=, [. */
	10673	RExC_parse = s;
	10674	else {
	10675	const char* const t = RExC_parse++; /* skip over the c */
	10676	assert(*t == c);
	10677
	10678	if (UCHARAT(RExC_parse) == ']') {
	10679	const char *posixcc = s + 1;
	10680	RExC_parse++; /* skip over the ending ] */
	10681
	10682	if (*s == ':') {
	10683	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	10684	const I32 skip = t - posixcc;
	10685
	10686	/* Initially switch on the length of the name. */
	10687	switch (skip) {
	10688	case 4:
	10689	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	10690	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	10691	break;
	10692	case 5:
	10693	/* Names all of length 5. */
	10694	/* alnum alpha ascii blank cntrl digit graph lower
	10695	print punct space upper */
	10696	/* Offset 4 gives the best switch position. */
	10697	switch (posixcc[4]) {
	10698	case 'a':
	10699	if (memEQ(posixcc, "alph", 4)) /* alpha */
	10700	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	10701	break;
	10702	case 'e':
	10703	if (memEQ(posixcc, "spac", 4)) /* space */
	10704	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	10705	break;
	10706	case 'h':
	10707	if (memEQ(posixcc, "grap", 4)) /* graph */
	10708	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	10709	break;
	10710	case 'i':
	10711	if (memEQ(posixcc, "asci", 4)) /* ascii */
	10712	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	10713	break;
	10714	case 'k':
	10715	if (memEQ(posixcc, "blan", 4)) /* blank */
	10716	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	10717	break;
	10718	case 'l':
	10719	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	10720	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	10721	break;
	10722	case 'm':
	10723	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	10724	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	10725	break;
	10726	case 'r':
	10727	if (memEQ(posixcc, "lowe", 4)) /* lower */
	10728	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	10729	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	10730	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	10731	break;
	10732	case 't':
	10733	if (memEQ(posixcc, "digi", 4)) /* digit */
	10734	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	10735	else if (memEQ(posixcc, "prin", 4)) /* print */
	10736	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	10737	else if (memEQ(posixcc, "punc", 4)) /* punct */
	10738	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	10739	break;
	10740	}
	10741	break;
	10742	case 6:
	10743	if (memEQ(posixcc, "xdigit", 6))
	10744	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	10745	break;
	10746	}
	10747
	10748	if (namedclass == OOB_NAMEDCLASS)
	10749	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	10750	t - s - 1, s + 1);
	10751	assert (posixcc[skip] == ':');
	10752	assert (posixcc[skip+1] == ']');
	10753	} else if (!SIZE_ONLY) {
	10754	/* [[=foo=]] and [[.foo.]] are still future. */
	10755
	10756	/* adjust RExC_parse so the warning shows after
	10757	the class closes */
	10758	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	10759	RExC_parse++;
	10760	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	10761	}
	10762	} else {
	10763	/* Maternal grandfather:
	10764	* "[:" ending in ":" but not in ":]" */
	10765	RExC_parse = s;
	10766	}
	10767	}
	10768	}
	10769
	10770	return namedclass;
	10771	}
	10772
	10773	STATIC void
	10774	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	10775	{
	10776	dVAR;
	10777
	10778	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	10779
	10780	if (POSIXCC(UCHARAT(RExC_parse))) {
	10781	const char *s = RExC_parse;
	10782	const char c = *s++;
	10783
	10784	while (isALNUM(*s))
	10785	s++;
	10786	if (s && c == s && s[1] == ']') {
	10787	ckWARN3reg(s+2,
	10788	"POSIX syntax [%c %c] belongs inside character classes",
	10789	c, c);
	10790
	10791	/* [[=foo=]] and [[.foo.]] are still future. */
	10792	if (POSIXCC_NOTYET(c)) {
	10793	/* adjust RExC_parse so the error shows after
	10794	the class closes */
	10795	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	10796	NOOP;
	10797	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	10798	}
	10799	}
	10800	}
	10801	}
	10802
	10803	/* Generate the code to add a full posix character <class> to the bracketed
	10804	* character class given by <node>. (<node> is needed only under locale rules)
	10805	* destlist is the inversion list for non-locale rules that this class is
	10806	* to be added to
	10807	* sourcelist is the ASCII-range inversion list to add under /a rules
	10808	* Xsourcelist is the full Unicode range list to use otherwise. */
	10809	#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	10810	if (LOC) { \
	10811	SV* scratch_list = NULL; \
	10812	\
	10813	/* Set this class in the node for runtime matching */ \
	10814	ANYOF_CLASS_SET(node, class); \
	10815	\
	10816	/* For above Latin1 code points, we use the full Unicode range */ \
	10817	_invlist_intersection(PL_AboveLatin1, \
	10818	Xsourcelist, \
	10819	&scratch_list); \
	10820	/* And set the output to it, adding instead if there already is an \
	10821	* output. Checking if <destlist> is NULL first saves an extra \
	10822	* clone. Its reference count will be decremented at the next \
	10823	* union, etc, or if this is the only instance, at the end of the \
	10824	* routine */ \
	10825	if (! destlist) { \
	10826	destlist = scratch_list; \
	10827	} \
	10828	else { \
	10829	_invlist_union(destlist, scratch_list, &destlist); \
	10830	SvREFCNT_dec(scratch_list); \
	10831	} \
	10832	} \
	10833	else { \
	10834	/* For non-locale, just add it to any existing list */ \
	10835	_invlist_union(destlist, \
	10836	(AT_LEAST_ASCII_RESTRICTED) \
	10837	? sourcelist \
	10838	: Xsourcelist, \
	10839	&destlist); \
	10840	}
	10841
	10842	/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
	10843	*/
	10844	#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	10845	if (LOC) { \
	10846	SV* scratch_list = NULL; \
	10847	ANYOF_CLASS_SET(node, class); \
	10848	_invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
	10849	if (! destlist) { \
	10850	destlist = scratch_list; \
	10851	} \
	10852	else { \
	10853	_invlist_union(destlist, scratch_list, &destlist); \
	10854	SvREFCNT_dec(scratch_list); \
	10855	} \
	10856	} \
	10857	else { \
	10858	_invlist_union_complement_2nd(destlist, \
	10859	(AT_LEAST_ASCII_RESTRICTED) \
	10860	? sourcelist \
	10861	: Xsourcelist, \
	10862	&destlist); \
	10863	/* Under /d, everything in the upper half of the Latin1 range \
	10864	* matches this complement */ \
	10865	if (DEPENDS_SEMANTICS) { \
	10866	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	10867	} \
	10868	}
	10869
	10870	/* Generate the code to add a posix character <class> to the bracketed
	10871	* character class given by <node>. (<node> is needed only under locale rules)
	10872	* destlist is the inversion list for non-locale rules that this class is
	10873	* to be added to
	10874	* sourcelist is the ASCII-range inversion list to add under /a rules
	10875	* l1_sourcelist is the Latin1 range list to use otherwise.
	10876	* Xpropertyname is the name to add to <run_time_list> of the property to
	10877	* specify the code points above Latin1 that will have to be
	10878	* determined at run-time
	10879	* run_time_list is a SV* that contains text names of properties that are to
	10880	* be computed at run time. This concatenates <Xpropertyname>
	10881	* to it, apppropriately
	10882	* This is essentially DO_POSIX, but we know only the Latin1 values at compile
	10883	* time */
	10884	#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	10885	l1_sourcelist, Xpropertyname, run_time_list) \
	10886	/* First, resolve whether to use the ASCII-only list or the L1 \
	10887	* list */ \
	10888	DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
	10889	((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
	10890	Xpropertyname, run_time_list)
	10891
	10892	#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
	10893	Xpropertyname, run_time_list) \
	10894	/* If not /a matching, there are going to be code points we will have \
	10895	* to defer to runtime to look-up */ \
	10896	if (! AT_LEAST_ASCII_RESTRICTED) { \
	10897	Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
	10898	} \
	10899	if (LOC) { \
	10900	ANYOF_CLASS_SET(node, class); \
	10901	} \
	10902	else { \
	10903	_invlist_union(destlist, sourcelist, &destlist); \
	10904	}
	10905
	10906	/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
	10907	* this and DO_N_POSIX */
	10908	#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	10909	l1_sourcelist, Xpropertyname, run_time_list) \
	10910	if (AT_LEAST_ASCII_RESTRICTED) { \
	10911	_invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
	10912	} \
	10913	else { \
	10914	Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
	10915	if (LOC) { \
	10916	ANYOF_CLASS_SET(node, namedclass); \
	10917	} \
	10918	else { \
	10919	SV* scratch_list = NULL; \
	10920	_invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
	10921	if (! destlist) { \
	10922	destlist = scratch_list; \
	10923	} \
	10924	else { \
	10925	_invlist_union(destlist, scratch_list, &destlist); \
	10926	SvREFCNT_dec(scratch_list); \
	10927	} \
	10928	if (DEPENDS_SEMANTICS) { \
	10929	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	10930	} \
	10931	} \
	10932	}
	10933
	10934	STATIC void
	10935	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	10936	{
	10937	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	10938	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	10939	* the multi-character folds of characters in the node */
	10940	SV *sv;
	10941
	10942	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	10943
	10944	if (! *alternate_ptr) {
	10945	*alternate_ptr = newAV();
	10946	}
	10947	sv = newSVpvn_utf8((char*)string, len, TRUE);
	10948	av_push(*alternate_ptr, sv);
	10949	return;
	10950	}
	10951
	10952	/*
	10953	parse a class specification and produce either an ANYOF node that
	10954	matches the pattern or perhaps will be optimized into an EXACTish node
	10955	instead. The node contains a bit map for the first 256 characters, with the
	10956	corresponding bit set if that character is in the list. For characters
	10957	above 255, a range list is used */
	10958
	10959	STATIC regnode *
	10960	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	10961	{
	10962	dVAR;
	10963	register UV nextvalue;
	10964	register IV prevvalue = OOB_UNICODE;
	10965	register IV range = 0;
	10966	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	10967	register regnode *ret;
	10968	STRLEN numlen;
	10969	IV namedclass;
	10970	char *rangebegin = NULL;
	10971	bool need_class = 0;
	10972	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	10973	SV *listsv = NULL;
	10974	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	10975	than just initialized. */
	10976	SV* properties = NULL; /* Code points that match \p{} \P{} */
	10977	UV element_count = 0; /* Number of distinct elements in the class.
	10978	Optimizations may be possible if this is tiny */
	10979	UV n;
	10980
	10981	/* Certain named classes have equivalents that can appear outside a
	10982	* character class, e.g. \w. These flags are set for these classes. The
	10983	* first flag indicates the op depends on the character set modifier, like
	10984	* /d, /u.... The second is for those that don't have this dependency. */
	10985	bool has_special_charset_op = FALSE;
	10986	bool has_special_non_charset_op = FALSE;
	10987
	10988	/* Unicode properties are stored in a swash; this holds the current one
	10989	* being parsed. If this swash is the only above-latin1 component of the
	10990	* character class, an optimization is to pass it directly on to the
	10991	* execution engine. Otherwise, it is set to NULL to indicate that there
	10992	* are other things in the class that have to be dealt with at execution
	10993	* time */
	10994	SV* swash = NULL; /* Code points that match \p{} \P{} */
	10995
	10996	/* Set if a component of this character class is user-defined; just passed
	10997	* on to the engine */
	10998	UV has_user_defined_property = 0;
	10999
	11000	/* inversion list of code points this node matches only when the target
	11001	* string is in UTF-8. (Because is under /d) */
	11002	SV* depends_list = NULL;
	11003
	11004	/* inversion list of code points this node matches. For much of the
	11005	* function, it includes only those that match regardless of the utf8ness
	11006	* of the target string */
	11007	SV* cp_list = NULL;
	11008
	11009	/* List of multi-character folds that are matched by this node */
	11010	AV* unicode_alternate = NULL;
	11011	#ifdef EBCDIC
	11012	/* In a range, counts how many 0-2 of the ends of it came from literals,
	11013	* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
	11014	UV literal_endpoint = 0;
	11015	#endif
	11016	UV stored = 0; /* how many chars stored in the bitmap */
	11017
	11018	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	11019	case we need to change the emitted regop to an EXACT. */
	11020	const char * orig_parse = RExC_parse;
	11021	GET_RE_DEBUG_FLAGS_DECL;
	11022
	11023	PERL_ARGS_ASSERT_REGCLASS;
	11024	#ifndef DEBUGGING
	11025	PERL_UNUSED_ARG(depth);
	11026	#endif
	11027
	11028	DEBUG_PARSE("clas");
	11029
	11030	/* Assume we are going to generate an ANYOF node. */
	11031	ret = reganode(pRExC_state, ANYOF, 0);
	11032
	11033
	11034	if (!SIZE_ONLY) {
	11035	ANYOF_FLAGS(ret) = 0;
	11036	}
	11037
	11038	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	11039	RExC_naughty++;
	11040	RExC_parse++;
	11041	if (!SIZE_ONLY)
	11042	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	11043
	11044	/* We have decided to not allow multi-char folds in inverted character
	11045	* classes, due to the confusion that can happen, especially with
	11046	* classes that are designed for a non-Unicode world: You have the
	11047	* peculiar case that:
	11048	"s s" =~ /^[^\xDF]+$/i => Y
	11049	"ss" =~ /^[^\xDF]+$/i => N
	11050	*
	11051	* See [perl #89750] */
	11052	allow_full_fold = FALSE;
	11053	}
	11054
	11055	if (SIZE_ONLY) {
	11056	RExC_size += ANYOF_SKIP;
	11057	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	11058	}
	11059	else {
	11060	RExC_emit += ANYOF_SKIP;
	11061	if (LOC) {
	11062	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	11063	}
	11064	ANYOF_BITMAP_ZERO(ret);
	11065	listsv = newSVpvs("# comment\n");
	11066	initial_listsv_len = SvCUR(listsv);
	11067	}
	11068
	11069	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	11070
	11071	if (!SIZE_ONLY && POSIXCC(nextvalue))
	11072	checkposixcc(pRExC_state);
	11073
	11074	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	11075	if (UCHARAT(RExC_parse) == ']')
	11076	goto charclassloop;
	11077
	11078	parseit:
	11079	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	11080
	11081	charclassloop:
	11082
	11083	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	11084
	11085	if (!range) {
	11086	rangebegin = RExC_parse;
	11087	element_count++;
	11088	}
	11089	if (UTF) {
	11090	value = utf8n_to_uvchr((U8*)RExC_parse,
	11091	RExC_end - RExC_parse,
	11092	&numlen, UTF8_ALLOW_DEFAULT);
	11093	RExC_parse += numlen;
	11094	}
	11095	else
	11096	value = UCHARAT(RExC_parse++);
	11097
	11098	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	11099	if (value == '[' && POSIXCC(nextvalue))
	11100	namedclass = regpposixcc(pRExC_state, value);
	11101	else if (value == '\\') {
	11102	if (UTF) {
	11103	value = utf8n_to_uvchr((U8*)RExC_parse,
	11104	RExC_end - RExC_parse,
	11105	&numlen, UTF8_ALLOW_DEFAULT);
	11106	RExC_parse += numlen;
	11107	}
	11108	else
	11109	value = UCHARAT(RExC_parse++);
	11110	/* Some compilers cannot handle switching on 64-bit integer
	11111	* values, therefore value cannot be an UV. Yes, this will
	11112	* be a problem later if we want switch on Unicode.
	11113	* A similar issue a little bit later when switching on
	11114	* namedclass. --jhi */
	11115	switch ((I32)value) {
	11116	case 'w': namedclass = ANYOF_ALNUM; break;
	11117	case 'W': namedclass = ANYOF_NALNUM; break;
	11118	case 's': namedclass = ANYOF_SPACE; break;
	11119	case 'S': namedclass = ANYOF_NSPACE; break;
	11120	case 'd': namedclass = ANYOF_DIGIT; break;
	11121	case 'D': namedclass = ANYOF_NDIGIT; break;
	11122	case 'v': namedclass = ANYOF_VERTWS; break;
	11123	case 'V': namedclass = ANYOF_NVERTWS; break;
	11124	case 'h': namedclass = ANYOF_HORIZWS; break;
	11125	case 'H': namedclass = ANYOF_NHORIZWS; break;
	11126	case 'N': /* Handle \N{NAME} in class */
	11127	{
	11128	/* We only pay attention to the first char of
	11129	multichar strings being returned. I kinda wonder
	11130	if this makes sense as it does change the behaviour
	11131	from earlier versions, OTOH that behaviour was broken
	11132	as well. */
	11133	UV v; /* value is register so we cant & it /grrr */
	11134	if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
	11135	goto parseit;
	11136	}
	11137	value= v;
	11138	}
	11139	break;
	11140	case 'p':
	11141	case 'P':
	11142	{
	11143	char *e;
	11144	if (RExC_parse >= RExC_end)
	11145	vFAIL2("Empty \\%c{}", (U8)value);
	11146	if (*RExC_parse == '{') {
	11147	const U8 c = (U8)value;
	11148	e = strchr(RExC_parse++, '}');
	11149	if (!e)
	11150	vFAIL2("Missing right brace on \\%c{}", c);
	11151	while (isSPACE(UCHARAT(RExC_parse)))
	11152	RExC_parse++;
	11153	if (e == RExC_parse)
	11154	vFAIL2("Empty \\%c{}", c);
	11155	n = e - RExC_parse;
	11156	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	11157	n--;
	11158	}
	11159	else {
	11160	e = RExC_parse;
	11161	n = 1;
	11162	}
	11163	if (!SIZE_ONLY) {
	11164	SV** invlistsvp;
	11165	SV* invlist;
	11166	char* name;
	11167	if (UCHARAT(RExC_parse) == '^') {
	11168	RExC_parse++;
	11169	n--;
	11170	value = value == 'p' ? 'P' : 'p'; /* toggle */
	11171	while (isSPACE(UCHARAT(RExC_parse))) {
	11172	RExC_parse++;
	11173	n--;
	11174	}
	11175	}
	11176	/* Try to get the definition of the property into
	11177	* <invlist>. If /i is in effect, the effective property
	11178	* will have its name be <__NAME_i>. The design is
	11179	* discussed in commit
	11180	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	11181	Newx(name, n + sizeof("_i__\n"), char);
	11182
	11183	sprintf(name, "%s%.*s%s\n",
	11184	(FOLD) ? "__" : "",
	11185	(int)n,
	11186	RExC_parse,
	11187	(FOLD) ? "_i" : ""
	11188	);
	11189
	11190	/* Look up the property name, and get its swash and
	11191	* inversion list, if the property is found */
	11192	if (swash) {
	11193	SvREFCNT_dec(swash);
	11194	}
	11195	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	11196	1, /* binary */
	11197	0, /* not tr/// */
	11198	TRUE, /* this routine will handle
	11199	undefined properties */
	11200	NULL, FALSE /* No inversion list */
	11201	);
	11202	if ( ! swash
	11203	\|\| ! SvROK(swash)
	11204	\|\| ! SvTYPE(SvRV(swash)) == SVt_PVHV
	11205	\|\| ! (invlistsvp =
	11206	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	11207	"INVLIST", FALSE))
	11208	\|\| ! (invlist = *invlistsvp))
	11209	{
	11210	if (swash) {
	11211	SvREFCNT_dec(swash);
	11212	swash = NULL;
	11213	}
	11214
	11215	/* Here didn't find it. It could be a user-defined
	11216	* property that will be available at run-time. Add it
	11217	* to the list to look up then */
	11218	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
	11219	(value == 'p' ? '+' : '!'),
	11220	name);
	11221	has_user_defined_property = 1;
	11222
	11223	/* We don't know yet, so have to assume that the
	11224	* property could match something in the Latin1 range,
	11225	* hence something that isn't utf8 */
	11226	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	11227	}
	11228	else {
	11229
	11230	/* Here, did get the swash and its inversion list. If
	11231	* the swash is from a user-defined property, then this
	11232	* whole character class should be regarded as such */
	11233	SV** user_defined_svp =
	11234	hv_fetchs(MUTABLE_HV(SvRV(swash)),
	11235	"USER_DEFINED", FALSE);
	11236	if (user_defined_svp) {
	11237	has_user_defined_property
	11238	\|= SvUV(*user_defined_svp);
	11239	}
	11240
	11241	/* Invert if asking for the complement */
	11242	if (value == 'P') {
	11243	_invlist_union_complement_2nd(properties, invlist, &properties);
	11244
	11245	/* The swash can't be used as-is, because we've
	11246	* inverted things; delay removing it to here after
	11247	* have copied its invlist above */
	11248	SvREFCNT_dec(swash);
	11249	swash = NULL;
	11250	}
	11251	else {
	11252	_invlist_union(properties, invlist, &properties);
	11253	}
	11254	}
	11255	Safefree(name);
	11256	}
	11257	RExC_parse = e + 1;
	11258	namedclass = ANYOF_MAX; /* no official name, but it's named */
	11259
	11260	/* \p means they want Unicode semantics */
	11261	RExC_uni_semantics = 1;
	11262	}
	11263	break;
	11264	case 'n': value = '\n'; break;
	11265	case 'r': value = '\r'; break;
	11266	case 't': value = '\t'; break;
	11267	case 'f': value = '\f'; break;
	11268	case 'b': value = '\b'; break;
	11269	case 'e': value = ASCII_TO_NATIVE('\033');break;
	11270	case 'a': value = ASCII_TO_NATIVE('\007');break;
	11271	case 'o':
	11272	RExC_parse--; /* function expects to be pointed at the 'o' */
	11273	{
	11274	const char* error_msg;
	11275	bool valid = grok_bslash_o(RExC_parse,
	11276	&value,
	11277	&numlen,
	11278	&error_msg,
	11279	SIZE_ONLY);
	11280	RExC_parse += numlen;
	11281	if (! valid) {
	11282	vFAIL(error_msg);
	11283	}
	11284	}
	11285	if (PL_encoding && value < 0x100) {
	11286	goto recode_encoding;
	11287	}
	11288	break;
	11289	case 'x':
	11290	RExC_parse--; /* function expects to be pointed at the 'x' */
	11291	{
	11292	const char* error_msg;
	11293	bool valid = grok_bslash_x(RExC_parse,
	11294	&value,
	11295	&numlen,
	11296	&error_msg,
	11297	1);
	11298	RExC_parse += numlen;
	11299	if (! valid) {
	11300	vFAIL(error_msg);
	11301	}
	11302	}
	11303	if (PL_encoding && value < 0x100)
	11304	goto recode_encoding;
	11305	break;
	11306	case 'c':
	11307	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	11308	break;
	11309	case '0': case '1': case '2': case '3': case '4':
	11310	case '5': case '6': case '7':
	11311	{
	11312	/* Take 1-3 octal digits */
	11313	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	11314	numlen = 3;
	11315	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	11316	RExC_parse += numlen;
	11317	if (PL_encoding && value < 0x100)
	11318	goto recode_encoding;
	11319	break;
	11320	}
	11321	recode_encoding:
	11322	if (! RExC_override_recoding) {
	11323	SV* enc = PL_encoding;
	11324	value = reg_recode((const char)(U8)value, &enc);
	11325	if (!enc && SIZE_ONLY)
	11326	ckWARNreg(RExC_parse,
	11327	"Invalid escape in the specified encoding");
	11328	break;
	11329	}
	11330	default:
	11331	/* Allow \_ to not give an error */
	11332	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	11333	ckWARN2reg(RExC_parse,
	11334	"Unrecognized escape \\%c in character class passed through",
	11335	(int)value);
	11336	}
	11337	break;
	11338	}
	11339	} /* end of \blah */
	11340	#ifdef EBCDIC
	11341	else
	11342	literal_endpoint++;
	11343	#endif
	11344
	11345	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	11346
	11347	/* What matches in a locale is not known until runtime, so need to
	11348	* (one time per class) allocate extra space to pass to regexec.
	11349	* The space will contain a bit for each named class that is to be
	11350	* matched against. This isn't needed for \p{} and pseudo-classes,
	11351	* as they are not affected by locale, and hence are dealt with
	11352	* separately */
	11353	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	11354	need_class = 1;
	11355	if (SIZE_ONLY) {
	11356	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	11357	}
	11358	else {
	11359	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	11360	ANYOF_CLASS_ZERO(ret);
	11361	}
	11362	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	11363	}
	11364
	11365	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	11366	* literal, as is the character that began the false range, i.e.
	11367	* the 'a' in the examples */
	11368	if (range) {
	11369	if (!SIZE_ONLY) {
	11370	const int w =
	11371	RExC_parse >= rangebegin ?
	11372	RExC_parse - rangebegin : 0;
	11373	ckWARN4reg(RExC_parse,
	11374	"False [] range \"%.s\"",
	11375	w, w, rangebegin);
	11376	cp_list = add_cp_to_invlist(cp_list, '-');
	11377	cp_list = add_cp_to_invlist(cp_list, prevvalue);
	11378	}
	11379
	11380	range = 0; /* this was not a true range */
	11381	element_count += 2; /* So counts for three values */
	11382	}
	11383
	11384	if (SIZE_ONLY) {
	11385
	11386	/* In the first pass, do a little extra work so below can
	11387	* possibly optimize the whole node to one of the nodes that
	11388	* correspond to the classes given below */
	11389
	11390	/* The optimization will only take place if there is a single
	11391	* element in the class, so can skip if there is more than one
	11392	*/
	11393	if (element_count == 1) {
	11394
	11395	/* Possible truncation here but in some 64-bit environments
	11396	* the compiler gets heartburn about switch on 64-bit values.
	11397	* A similar issue a little earlier when switching on value.
	11398	* --jhi */
	11399	switch ((I32)namedclass) {
	11400	case ANYOF_ALNUM:
	11401	case ANYOF_NALNUM:
	11402	case ANYOF_DIGIT:
	11403	case ANYOF_NDIGIT:
	11404	case ANYOF_SPACE:
	11405	case ANYOF_NSPACE:
	11406	has_special_charset_op = TRUE;
	11407	break;
	11408
	11409	case ANYOF_HORIZWS:
	11410	case ANYOF_NHORIZWS:
	11411	case ANYOF_VERTWS:
	11412	case ANYOF_NVERTWS:
	11413	has_special_non_charset_op = TRUE;
	11414	break;
	11415	}
	11416	}
	11417	}
	11418	else {
	11419	switch ((I32)namedclass) {
	11420
	11421	case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
	11422	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11423	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
	11424	break;
	11425	case ANYOF_NALNUMC:
	11426	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11427	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
	11428	break;
	11429	case ANYOF_ALPHA:
	11430	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11431	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
	11432	break;
	11433	case ANYOF_NALPHA:
	11434	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11435	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
	11436	break;
	11437	case ANYOF_ASCII:
	11438	if (LOC) {
	11439	ANYOF_CLASS_SET(ret, namedclass);
	11440	}
	11441	else {
	11442	_invlist_union(properties, PL_ASCII, &properties);
	11443	}
	11444	break;
	11445	case ANYOF_NASCII:
	11446	if (LOC) {
	11447	ANYOF_CLASS_SET(ret, namedclass);
	11448	}
	11449	else {
	11450	_invlist_union_complement_2nd(properties,
	11451	PL_ASCII, &properties);
	11452	if (DEPENDS_SEMANTICS) {
	11453	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL;
	11454	}
	11455	}
	11456	break;
	11457	case ANYOF_BLANK:
	11458	DO_POSIX(ret, namedclass, properties,
	11459	PL_PosixBlank, PL_XPosixBlank);
	11460	break;
	11461	case ANYOF_NBLANK:
	11462	DO_N_POSIX(ret, namedclass, properties,
	11463	PL_PosixBlank, PL_XPosixBlank);
	11464	break;
	11465	case ANYOF_CNTRL:
	11466	DO_POSIX(ret, namedclass, properties,
	11467	PL_PosixCntrl, PL_XPosixCntrl);
	11468	break;
	11469	case ANYOF_NCNTRL:
	11470	DO_N_POSIX(ret, namedclass, properties,
	11471	PL_PosixCntrl, PL_XPosixCntrl);
	11472	break;
	11473	case ANYOF_DIGIT:
	11474	/* There are no digits in the Latin1 range outside of
	11475	* ASCII, so call the macro that doesn't have to resolve
	11476	* them */
	11477	DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
	11478	PL_PosixDigit, "XPosixDigit", listsv);
	11479	has_special_charset_op = TRUE;
	11480	break;
	11481	case ANYOF_NDIGIT:
	11482	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11483	PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
	11484	has_special_charset_op = TRUE;
	11485	break;
	11486	case ANYOF_GRAPH:
	11487	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11488	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
	11489	break;
	11490	case ANYOF_NGRAPH:
	11491	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11492	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
	11493	break;
	11494	case ANYOF_HORIZWS:
	11495	/* For these, we use the cp_list, as /d doesn't make a
	11496	* difference in what these match. There would be problems
	11497	* if these characters had folds other than themselves, as
	11498	* cp_list is subject to folding. It turns out that \h
	11499	* is just a synonym for XPosixBlank */
	11500	_invlist_union(cp_list, PL_XPosixBlank, &cp_list);
	11501	has_special_non_charset_op = TRUE;
	11502	break;
	11503	case ANYOF_NHORIZWS:
	11504	_invlist_union_complement_2nd(cp_list,
	11505	PL_XPosixBlank, &cp_list);
	11506	has_special_non_charset_op = TRUE;
	11507	break;
	11508	case ANYOF_LOWER:
	11509	case ANYOF_NLOWER:
	11510	{ /* These require special handling, as they differ under
	11511	folding, matching Cased there (which in the ASCII range
	11512	is the same as Alpha */
	11513
	11514	SV* ascii_source;
	11515	SV* l1_source;
	11516	const char *Xname;
	11517
	11518	if (FOLD && ! LOC) {
	11519	ascii_source = PL_PosixAlpha;
	11520	l1_source = PL_L1Cased;
	11521	Xname = "Cased";
	11522	}
	11523	else {
	11524	ascii_source = PL_PosixLower;
	11525	l1_source = PL_L1PosixLower;
	11526	Xname = "XPosixLower";
	11527	}
	11528	if (namedclass == ANYOF_LOWER) {
	11529	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11530	ascii_source, l1_source, Xname, listsv);
	11531	}
	11532	else {
	11533	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	11534	properties, ascii_source, l1_source, Xname, listsv);
	11535	}
	11536	break;
	11537	}
	11538	case ANYOF_PRINT:
	11539	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11540	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
	11541	break;
	11542	case ANYOF_NPRINT:
	11543	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11544	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
	11545	break;
	11546	case ANYOF_PUNCT:
	11547	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11548	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
	11549	break;
	11550	case ANYOF_NPUNCT:
	11551	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11552	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
	11553	break;
	11554	case ANYOF_PSXSPC:
	11555	DO_POSIX(ret, namedclass, properties,
	11556	PL_PosixSpace, PL_XPosixSpace);
	11557	break;
	11558	case ANYOF_NPSXSPC:
	11559	DO_N_POSIX(ret, namedclass, properties,
	11560	PL_PosixSpace, PL_XPosixSpace);
	11561	break;
	11562	case ANYOF_SPACE:
	11563	DO_POSIX(ret, namedclass, properties,
	11564	PL_PerlSpace, PL_XPerlSpace);
	11565	has_special_charset_op = TRUE;
	11566	break;
	11567	case ANYOF_NSPACE:
	11568	DO_N_POSIX(ret, namedclass, properties,
	11569	PL_PerlSpace, PL_XPerlSpace);
	11570	has_special_charset_op = TRUE;
	11571	break;
	11572	case ANYOF_UPPER: /* Same as LOWER, above */
	11573	case ANYOF_NUPPER:
	11574	{
	11575	SV* ascii_source;
	11576	SV* l1_source;
	11577	const char *Xname;
	11578
	11579	if (FOLD && ! LOC) {
	11580	ascii_source = PL_PosixAlpha;
	11581	l1_source = PL_L1Cased;
	11582	Xname = "Cased";
	11583	}
	11584	else {
	11585	ascii_source = PL_PosixUpper;
	11586	l1_source = PL_L1PosixUpper;
	11587	Xname = "XPosixUpper";
	11588	}
	11589	if (namedclass == ANYOF_UPPER) {
	11590	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11591	ascii_source, l1_source, Xname, listsv);
	11592	}
	11593	else {
	11594	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	11595	properties, ascii_source, l1_source, Xname, listsv);
	11596	}
	11597	break;
	11598	}
	11599	case ANYOF_ALNUM: /* Really is 'Word' */
	11600	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11601	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
	11602	has_special_charset_op = TRUE;
	11603	break;
	11604	case ANYOF_NALNUM:
	11605	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
	11606	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
	11607	has_special_charset_op = TRUE;
	11608	break;
	11609	case ANYOF_VERTWS:
	11610	/* For these, we use the cp_list, as /d doesn't make a
	11611	* difference in what these match. There would be problems
	11612	* if these characters had folds other than themselves, as
	11613	* cp_list is subject to folding */
	11614	_invlist_union(cp_list, PL_VertSpace, &cp_list);
	11615	has_special_non_charset_op = TRUE;
	11616	break;
	11617	case ANYOF_NVERTWS:
	11618	_invlist_union_complement_2nd(cp_list,
	11619	PL_VertSpace, &cp_list);
	11620	has_special_non_charset_op = TRUE;
	11621	break;
	11622	case ANYOF_XDIGIT:
	11623	DO_POSIX(ret, namedclass, properties,
	11624	PL_PosixXDigit, PL_XPosixXDigit);
	11625	break;
	11626	case ANYOF_NXDIGIT:
	11627	DO_N_POSIX(ret, namedclass, properties,
	11628	PL_PosixXDigit, PL_XPosixXDigit);
	11629	break;
	11630	case ANYOF_MAX:
	11631	/* this is to handle \p and \P */
	11632	break;
	11633	default:
	11634	vFAIL("Invalid [::] class");
	11635	break;
	11636	}
	11637
	11638	continue;
	11639	}
	11640	} /* end of namedclass \blah */
	11641
	11642	if (range) {
	11643	if (prevvalue > (IV)value) /* b-a */ {
	11644	const int w = RExC_parse - rangebegin;
	11645	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	11646	range = 0; /* not a valid range */
	11647	}
	11648	}
	11649	else {
	11650	prevvalue = value; /* save the beginning of the range */
	11651	if (RExC_parse+1 < RExC_end
	11652	&& *RExC_parse == '-'
	11653	&& RExC_parse[1] != ']')
	11654	{
	11655	RExC_parse++;
	11656
	11657	/* a bad range like \w-, [:word:]- ? */
	11658	if (namedclass > OOB_NAMEDCLASS) {
	11659	if (ckWARN(WARN_REGEXP)) {
	11660	const int w =
	11661	RExC_parse >= rangebegin ?
	11662	RExC_parse - rangebegin : 0;
	11663	vWARN4(RExC_parse,
	11664	"False [] range \"%.s\"",
	11665	w, w, rangebegin);
	11666	}
	11667	if (!SIZE_ONLY)
	11668	cp_list = add_cp_to_invlist(cp_list, '-');
	11669	} else
	11670	range = 1; /* yeah, it's a range! */
	11671	continue; /* but do it the next time */
	11672	}
	11673	}
	11674
	11675	/* non-Latin1 code point implies unicode semantics. Must be set in
	11676	* pass1 so is there for the whole of pass 2 */
	11677	if (value > 255) {
	11678	RExC_uni_semantics = 1;
	11679	}
	11680
	11681	/* now is the next time */
	11682	if (!SIZE_ONLY) {
	11683	#ifndef EBCDIC
	11684	cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
	11685	#else
	11686	UV* this_range = _new_invlist(1);
	11687	_append_range_to_invlist(this_range, prevvalue, value);
	11688
	11689	/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
	11690	* If this range was specified using something like 'i-j', we want
	11691	* to include only the 'i' and the 'j', and not anything in
	11692	* between, so exclude non-ASCII, non-alphabetics from it.
	11693	* However, if the range was specified with something like
	11694	* [\x89-\x91] or [\x89-j], all code points within it should be
	11695	* included. literal_endpoint==2 means both ends of the range used
	11696	* a literal character, not \x{foo} */
	11697	if (literal_endpoint == 2
	11698	&& (prevvalue >= 'a' && value <= 'z')
	11699	\|\| (prevvalue >= 'A' && value <= 'Z'))
	11700	{
	11701	_invlist_intersection(this_range, PL_ASCII, &this_range, );
	11702	_invlist_intersection(this_range, PL_Alpha, &this_range, );
	11703	}
	11704	_invlist_union(cp_list, this_range, &cp_list);
	11705	literal_endpoint = 0;
	11706	#endif
	11707	}
	11708
	11709	range = 0; /* this range (if it was one) is done now */
	11710	}
	11711
	11712	/* [\w] can be optimized into \w, but not if there is anything else in the
	11713	* brackets (except for an initial '^' which indictes omplementing). We
	11714	* also can optimize the common special case /[0-9]/ into /\d/a */
	11715	if (element_count == 1 &&
	11716	(has_special_charset_op
	11717	\|\| has_special_non_charset_op
	11718	\|\| (prevvalue == '0' && value == '9')))
	11719	{
	11720	U8 op;
	11721	bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
	11722	const char * cur_parse = RExC_parse;
	11723
	11724	if (has_special_charset_op) {
	11725	U8 offset = get_regex_charset(RExC_flags);
	11726
	11727	/* /aa is the same as /a for these */
	11728	if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
	11729	offset = REGEX_ASCII_RESTRICTED_CHARSET;
	11730	}
	11731	switch ((I32)namedclass) {
	11732	case ANYOF_NALNUM:
	11733	invert = ! invert;
	11734	/* FALLTHROUGH */
	11735	case ANYOF_ALNUM:
	11736	op = ALNUM;
	11737	break;
	11738	case ANYOF_NSPACE:
	11739	invert = ! invert;
	11740	/* FALLTHROUGH */
	11741	case ANYOF_SPACE:
	11742	op = SPACE;
	11743	break;
	11744	case ANYOF_NDIGIT:
	11745	invert = ! invert;
	11746	/* FALLTHROUGH */
	11747	case ANYOF_DIGIT:
	11748	op = DIGIT;
	11749
	11750	/* There is no DIGITU */
	11751	if (offset == REGEX_UNICODE_CHARSET) {
	11752	offset = REGEX_DEPENDS_CHARSET;
	11753	}
	11754	break;
	11755	default:
	11756	Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
	11757	}
	11758
	11759	/* The number of varieties of each of these is the same, hence, so
	11760	* is the delta between the normal and complemented nodes */
	11761	if (invert) {
	11762	offset += NALNUM - ALNUM;
	11763	}
	11764
	11765	op += offset;
	11766	}
	11767	else if (has_special_non_charset_op) {
	11768	switch ((I32)namedclass) {
	11769	case ANYOF_NHORIZWS:
	11770	invert = ! invert;
	11771	/* FALLTHROUGH */
	11772	case ANYOF_HORIZWS:
	11773	op = HORIZWS;
	11774	break;
	11775	case ANYOF_NVERTWS:
	11776	invert = ! invert;
	11777	/* FALLTHROUGH */
	11778	case ANYOF_VERTWS:
	11779	op = VERTWS;
	11780	break;
	11781	default:
	11782	Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
	11783	}
	11784
	11785	/* The complement version of each of these nodes is adjacently next
	11786	* */
	11787	if (invert) {
	11788	op++;
	11789	}
	11790	}
	11791	else { /* The remaining possibility is [0-9] */
	11792	op = (invert) ? NDIGITA : DIGITA;
	11793	}
	11794
	11795	/* Throw away this ANYOF regnode, and emit the calculated one, which
	11796	* should correspond to the beginning, not current, state of the parse
	11797	*/
	11798	RExC_parse = (char *)orig_parse;
	11799	RExC_emit = (regnode *)orig_emit;
	11800	ret = reg_node(pRExC_state, op);
	11801	RExC_parse = (char *) cur_parse;
	11802
	11803	SvREFCNT_dec(listsv);
	11804	return ret;
	11805	}
	11806
	11807	if (SIZE_ONLY)
	11808	return ret;
	11809	/**** !SIZE_ONLY AFTER HERE *******/
	11810
	11811	/* If folding, we calculate all characters that could fold to or from the
	11812	* ones already on the list */
	11813	if (FOLD && cp_list) {
	11814	UV start, end; /* End points of code point ranges */
	11815
	11816	SV* fold_intersection = NULL;
	11817
	11818	const UV highest_index = invlist_len(cp_list) - 1;
	11819
	11820	/* In the Latin1 range, the characters that can be folded-to or -from
	11821	* are precisely the alphabetic characters. If the highest code point
	11822	* is within Latin1, we can use the compiled-in list, and not have to
	11823	* go out to disk. If the last element in the array is in the
	11824	* inversion list set, it starts a range that goes to infinity, so the
	11825	* maximum of the inversion list is definitely above Latin1.
	11826	* Otherwise, it starts a range that isn't in the set, so the max is
	11827	* one less than it */
	11828	if (! ELEMENT_RANGE_MATCHES_INVLIST(highest_index)
	11829	&& invlist_array(cp_list)[highest_index] <= 256)
	11830	{
	11831	_invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
	11832	}
	11833	else {
	11834
	11835	/* This is a list of all the characters that participate in folds
	11836	* (except marks, etc in multi-char folds */
	11837	if (! PL_utf8_foldable) {
	11838	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	11839	PL_utf8_foldable = _swash_to_invlist(swash);
	11840	SvREFCNT_dec(swash);
	11841	}
	11842
	11843	/* This is a hash that for a particular fold gives all characters
	11844	* that are involved in it */
	11845	if (! PL_utf8_foldclosures) {
	11846
	11847	/* If we were unable to find any folds, then we likely won't be
	11848	* able to find the closures. So just create an empty list.
	11849	* Folding will effectively be restricted to the non-Unicode
	11850	* rules hard-coded into Perl. (This case happens legitimately
	11851	* during compilation of Perl itself before the Unicode tables
	11852	* are generated) */
	11853	if (invlist_len(PL_utf8_foldable) == 0) {
	11854	PL_utf8_foldclosures = newHV();
	11855	}
	11856	else {
	11857	/* If the folds haven't been read in, call a fold function
	11858	* to force that */
	11859	if (! PL_utf8_tofold) {
	11860	U8 dummy[UTF8_MAXBYTES+1];
	11861	STRLEN dummy_len;
	11862
	11863	/* This particular string is above \xff in both UTF-8
	11864	* and UTFEBCDIC */
	11865	to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
	11866	assert(PL_utf8_tofold); /* Verify that worked */
	11867	}
	11868	PL_utf8_foldclosures =
	11869	_swash_inversion_hash(PL_utf8_tofold);
	11870	}
	11871	}
	11872
	11873	/* Only the characters in this class that participate in folds need
	11874	* be checked. Get the intersection of this class and all the
	11875	* possible characters that are foldable. This can quickly narrow
	11876	* down a large class */
	11877	_invlist_intersection(PL_utf8_foldable, cp_list,
	11878	&fold_intersection);
	11879	}
	11880
	11881	/* Now look at the foldable characters in this class individually */
	11882	invlist_iterinit(fold_intersection);
	11883	while (invlist_iternext(fold_intersection, &start, &end)) {
	11884	UV j;
	11885
	11886	/* Locale folding for Latin1 characters is deferred until runtime */
	11887	if (LOC && start < 256) {
	11888	start = 256;
	11889	}
	11890
	11891	/* Look at every character in the range */
	11892	for (j = start; j <= end; j++) {
	11893
	11894	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	11895	STRLEN foldlen;
	11896	UV f;
	11897
	11898	if (j < 256) {
	11899
	11900	/* We have the latin1 folding rules hard-coded here so that
	11901	* an innocent-looking character class, like /[ks]/i won't
	11902	* have to go out to disk to find the possible matches.
	11903	* XXX It would be better to generate these via regen, in
	11904	* case a new version of the Unicode standard adds new
	11905	* mappings, though that is not really likely, and may be
	11906	* caught by the default: case of the switch below. */
	11907
	11908	if (PL_fold_latin1[j] != j) {
	11909
	11910	/* ASCII is always matched; non-ASCII is matched only
	11911	* under Unicode rules */
	11912	if (isASCII(j) \|\| AT_LEAST_UNI_SEMANTICS) {
	11913	cp_list =
	11914	add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
	11915	}
	11916	else {
	11917	depends_list =
	11918	add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
	11919	}
	11920	}
	11921
	11922	if (HAS_NONLATIN1_FOLD_CLOSURE(j)
	11923	&& (! isASCII(j) \|\| ! MORE_ASCII_RESTRICTED))
	11924	{
	11925	/* Certain Latin1 characters have matches outside
	11926	* Latin1, or are multi-character. To get here, 'j' is
	11927	* one of those characters. None of these matches is
	11928	* valid for ASCII characters under /aa, which is why
	11929	* the 'if' just above excludes those. The matches
	11930	* fall into three categories:
	11931	* 1) They are singly folded-to or -from an above 255
	11932	* character, e.g., LATIN SMALL LETTER Y WITH
	11933	* DIAERESIS and LATIN CAPITAL LETTER Y WITH
	11934	* DIAERESIS;
	11935	* 2) They are part of a multi-char fold with another
	11936	* latin1 character; only LATIN SMALL LETTER
	11937	* SHARP S => "ss" fits this;
	11938	* 3) They are part of a multi-char fold with a
	11939	* character outside of Latin1, such as various
	11940	* ligatures.
	11941	* We aren't dealing fully with multi-char folds, except
	11942	* we do deal with the pattern containing a character
	11943	* that has a multi-char fold (not so much the inverse).
	11944	* For types 1) and 3), the matches only happen when the
	11945	* target string is utf8; that's not true for 2), and we
	11946	* set a flag for it.
	11947	*
	11948	* The code below adds the single fold closures for 'j'
	11949	* to the inversion list. */
	11950	switch (j) {
	11951	case 'k':
	11952	case 'K':
	11953	/* KELVIN SIGN */
	11954	cp_list =
	11955	add_cp_to_invlist(cp_list, 0x212A);
	11956	break;
	11957	case 's':
	11958	case 'S':
	11959	/* LATIN SMALL LETTER LONG S */
	11960	cp_list =
	11961	add_cp_to_invlist(cp_list, 0x017F);
	11962	break;
	11963	case MICRO_SIGN:
	11964	cp_list = add_cp_to_invlist(cp_list,
	11965	GREEK_SMALL_LETTER_MU);
	11966	cp_list = add_cp_to_invlist(cp_list,
	11967	GREEK_CAPITAL_LETTER_MU);
	11968	break;
	11969	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	11970	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	11971	/* ANGSTROM SIGN */
	11972	cp_list =
	11973	add_cp_to_invlist(cp_list, 0x212B);
	11974	break;
	11975	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	11976	cp_list = add_cp_to_invlist(cp_list,
	11977	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	11978	break;
	11979	case LATIN_SMALL_LETTER_SHARP_S:
	11980	cp_list = add_cp_to_invlist(cp_list,
	11981	LATIN_CAPITAL_LETTER_SHARP_S);
	11982
	11983	/* Under /a, /d, and /u, this can match the two
	11984	* chars "ss" */
	11985	if (! MORE_ASCII_RESTRICTED) {
	11986	add_alternate(&unicode_alternate,
	11987	(U8 *) "ss", 2);
	11988
	11989	/* And under /u or /a, it can match even if
	11990	* the target is not utf8 */
	11991	if (AT_LEAST_UNI_SEMANTICS) {
	11992	ANYOF_FLAGS(ret) \|=
	11993	ANYOF_NONBITMAP_NON_UTF8;
	11994	}
	11995	}
	11996	break;
	11997	case 'F': case 'f':
	11998	case 'I': case 'i':
	11999	case 'L': case 'l':
	12000	case 'T': case 't':
	12001	case 'A': case 'a':
	12002	case 'H': case 'h':
	12003	case 'J': case 'j':
	12004	case 'N': case 'n':
	12005	case 'W': case 'w':
	12006	case 'Y': case 'y':
	12007	/* These all are targets of multi-character
	12008	* folds from code points that require UTF8 to
	12009	* express, so they can't match unless the
	12010	* target string is in UTF-8, so no action here
	12011	* is necessary, as regexec.c properly handles
	12012	* the general case for UTF-8 matching */
	12013	break;
	12014	default:
	12015	/* Use deprecated warning to increase the
	12016	* chances of this being output */
	12017	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
	12018	break;
	12019	}
	12020	}
	12021	continue;
	12022	}
	12023
	12024	/* Here is an above Latin1 character. We don't have the rules
	12025	* hard-coded for it. First, get its fold */
	12026	f = _to_uni_fold_flags(j, foldbuf, &foldlen,
	12027	((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
	12028	\| ((LOC)
	12029	? FOLD_FLAGS_LOCALE
	12030	: (MORE_ASCII_RESTRICTED)
	12031	? FOLD_FLAGS_NOMIX_ASCII
	12032	: 0));
	12033
	12034	if (foldlen > (STRLEN)UNISKIP(f)) {
	12035
	12036	/* Any multicharacter foldings (disallowed in lookbehind
	12037	* patterns) require the following transform: [ABCDEF] ->
	12038	* (?:[ABCabcDEFd]\|pq\|rst) where E folds into "pq" and F
	12039	* folds into "rst", all other characters fold to single
	12040	* characters. We save away these multicharacter foldings,
	12041	* to be later saved as part of the additional "s" data. */
	12042	if (! RExC_in_lookbehind) {
	12043	U8* loc = foldbuf;
	12044	U8* e = foldbuf + foldlen;
	12045
	12046	/* If any of the folded characters of this are in the
	12047	* Latin1 range, tell the regex engine that this can
	12048	* match a non-utf8 target string. */
	12049	while (loc < e) {
	12050	if (UTF8_IS_INVARIANT(*loc)
	12051	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	12052	{
	12053	ANYOF_FLAGS(ret)
	12054	\|= ANYOF_NONBITMAP_NON_UTF8;
	12055	break;
	12056	}
	12057	loc += UTF8SKIP(loc);
	12058	}
	12059
	12060	add_alternate(&unicode_alternate, foldbuf, foldlen);
	12061	}
	12062	}
	12063	else {
	12064	/* Single character fold of above Latin1. Add everything
	12065	* in its fold closure to the list that this node should
	12066	* match */
	12067	SV** listp;
	12068
	12069	/* The fold closures data structure is a hash with the keys
	12070	* being every character that is folded to, like 'k', and
	12071	* the values each an array of everything that folds to its
	12072	* key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	12073	if ((listp = hv_fetch(PL_utf8_foldclosures,
	12074	(char *) foldbuf, foldlen, FALSE)))
	12075	{
	12076	AV* list = (AV) listp;
	12077	IV k;
	12078	for (k = 0; k <= av_len(list); k++) {
	12079	SV** c_p = av_fetch(list, k, FALSE);
	12080	UV c;
	12081	if (c_p == NULL) {
	12082	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	12083	}
	12084	c = SvUV(*c_p);
	12085
	12086	/* /aa doesn't allow folds between ASCII and non-;
	12087	* /l doesn't allow them between above and below
	12088	* 256 */
	12089	if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
	12090	\|\| (LOC && ((c < 256) != (j < 256))))
	12091	{
	12092	continue;
	12093	}
	12094
	12095	/* Folds involving non-ascii Latin1 characters
	12096	* under /d are added to a separate list */
	12097	if (isASCII(c) \|\| c > 255 \|\| AT_LEAST_UNI_SEMANTICS)
	12098	{
	12099	cp_list = add_cp_to_invlist(cp_list, c);
	12100	}
	12101	else {
	12102	depends_list = add_cp_to_invlist(depends_list, c);
	12103	}
	12104	}
	12105	}
	12106	}
	12107	}
	12108	}
	12109	SvREFCNT_dec(fold_intersection);
	12110	}
	12111
	12112	/* And combine the result (if any) with any inversion list from properties.
	12113	* The lists are kept separate up to now because we don't want to fold the
	12114	* properties */
	12115	if (properties) {
	12116	if (AT_LEAST_UNI_SEMANTICS) {
	12117	if (cp_list) {
	12118	_invlist_union(cp_list, properties, &cp_list);
	12119	SvREFCNT_dec(properties);
	12120	}
	12121	else {
	12122	cp_list = properties;
	12123	}
	12124	}
	12125	else {
	12126
	12127	/* Under /d, we put the things that match only when the target
	12128	* string is utf8, into a separate list */
	12129	SV* nonascii_but_latin1_properties = NULL;
	12130	_invlist_intersection(properties, PL_Latin1,
	12131	&nonascii_but_latin1_properties);
	12132	_invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
	12133	&nonascii_but_latin1_properties);
	12134	_invlist_subtract(properties, nonascii_but_latin1_properties,
	12135	&properties);
	12136	if (cp_list) {
	12137	_invlist_union(cp_list, properties, &cp_list);
	12138	SvREFCNT_dec(properties);
	12139	}
	12140	else {
	12141	cp_list = properties;
	12142	}
	12143
	12144	if (depends_list) {
	12145	_invlist_union(depends_list, nonascii_but_latin1_properties,
	12146	&depends_list);
	12147	SvREFCNT_dec(nonascii_but_latin1_properties);
	12148	}
	12149	else {
	12150	depends_list = nonascii_but_latin1_properties;
	12151	}
	12152	}
	12153	}
	12154
	12155	/* Here, we have calculated what code points should be in the character
	12156	* class.
	12157	*
	12158	* Now we can see about various optimizations. Fold calculation (which we
	12159	* did above) needs to take place before inversion. Otherwise /[^k]/i
	12160	* would invert to include K, which under /i would match k, which it
	12161	* shouldn't. */
	12162
	12163	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	12164	* set the FOLD flag yet, so this does optimize those. It doesn't
	12165	* optimize locale. Doing so perhaps could be done as long as there is
	12166	* nothing like \w in it; some thought also would have to be given to the
	12167	* interaction with above 0x100 chars */
	12168	if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
	12169	&& ! LOC
	12170	&& ! depends_list
	12171	&& ! unicode_alternate
	12172	&& SvCUR(listsv) == initial_listsv_len)
	12173	{
	12174	_invlist_invert(cp_list);
	12175
	12176	/* Any swash can't be used as-is, because we've inverted things */
	12177	if (swash) {
	12178	SvREFCNT_dec(swash);
	12179	swash = NULL;
	12180	}
	12181
	12182	/* Clear the invert flag since have just done it here */
	12183	ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
	12184	}
	12185
	12186	/* Here, <cp_list> contains all the code points we can determine at
	12187	* compile time that match under all conditions. Go through it, and
	12188	* for things that belong in the bitmap, put them there, and delete from
	12189	* <cp_list> */
	12190	if (cp_list) {
	12191
	12192	/* This gets set if we actually need to modify things */
	12193	bool change_invlist = FALSE;
	12194
	12195	UV start, end;
	12196
	12197	/* Start looking through <cp_list> */
	12198	invlist_iterinit(cp_list);
	12199	while (invlist_iternext(cp_list, &start, &end)) {
	12200	UV high;
	12201	int i;
	12202
	12203	/* Quit if are above what we should change */
	12204	if (start > 255) {
	12205	break;
	12206	}
	12207
	12208	change_invlist = TRUE;
	12209
	12210	/* Set all the bits in the range, up to the max that we are doing */
	12211	high = (end < 255) ? end : 255;
	12212	for (i = start; i <= (int) high; i++) {
	12213	if (! ANYOF_BITMAP_TEST(ret, i)) {
	12214	ANYOF_BITMAP_SET(ret, i);
	12215	stored++;
	12216	prevvalue = value;
	12217	value = i;
	12218	}
	12219	}
	12220	}
	12221
	12222	/* Done with loop; remove any code points that are in the bitmap from
	12223	* <cp_list> */
	12224	if (change_invlist) {
	12225	_invlist_subtract(cp_list, PL_Latin1, &cp_list);
	12226	}
	12227
	12228	/* If have completely emptied it, remove it completely */
	12229	if (invlist_len(cp_list) == 0) {
	12230	SvREFCNT_dec(cp_list);
	12231	cp_list = NULL;
	12232	}
	12233	}
	12234
	12235	/* Combine the two lists into one. */
	12236	if (depends_list) {
	12237	if (cp_list) {
	12238	_invlist_union(cp_list, depends_list, &cp_list);
	12239	SvREFCNT_dec(depends_list);
	12240	}
	12241	else {
	12242	cp_list = depends_list;
	12243	}
	12244	}
	12245
	12246	/* Folding in the bitmap is taken care of above, but not for locale (for
	12247	* which we have to wait to see what folding is in effect at runtime), and
	12248	* for some things not in the bitmap (only the upper latin folds in this
	12249	* case, as all other single-char folding has been set above). Set
	12250	* run-time fold flag for these */
	12251	if (FOLD && (LOC
	12252	\|\| (DEPENDS_SEMANTICS
	12253	&& cp_list
	12254	&& ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
	12255	\|\| unicode_alternate))
	12256	{
	12257	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	12258	}
	12259
	12260	/* A single character class can be "optimized" into an EXACTish node.
	12261	* Note that since we don't currently count how many characters there are
	12262	* outside the bitmap, we are XXX missing optimization possibilities for
	12263	* them. This optimization can't happen unless this is a truly single
	12264	* character class, which means that it can't be an inversion into a
	12265	* many-character class, and there must be no possibility of there being
	12266	* things outside the bitmap. 'stored' (only) for locales doesn't include
	12267	* \w, etc, so have to make a special test that they aren't present
	12268	*
	12269	* Similarly A 2-character class of the very special form like [bB] can be
	12270	* optimized into an EXACTFish node, but only for non-locales, and for
	12271	* characters which only have the two folds; so things like 'fF' and 'Ii'
	12272	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	12273	* FI'. */
	12274	if (! cp_list
	12275	&& ! unicode_alternate
	12276	&& SvCUR(listsv) == initial_listsv_len
	12277	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	12278	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	12279	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	12280	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	12281	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	12282	/* If the latest code point has a fold whose
	12283	* bit is set, it must be the only other one */
	12284	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	12285	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	12286	{
	12287	/* Note that the information needed to decide to do this optimization
	12288	* is not currently available until the 2nd pass, and that the actually
	12289	* used EXACTish node takes less space than the calculated ANYOF node,
	12290	* and hence the amount of space calculated in the first pass is larger
	12291	* than actually used, so this optimization doesn't gain us any space.
	12292	* But an EXACT node is faster than an ANYOF node, and can be combined
	12293	* with any adjacent EXACT nodes later by the optimizer for further
	12294	* gains. The speed of executing an EXACTF is similar to an ANYOF
	12295	* node, so the optimization advantage comes from the ability to join
	12296	* it to adjacent EXACT nodes */
	12297
	12298	const char * cur_parse= RExC_parse;
	12299	U8 op;
	12300	RExC_emit = (regnode *)orig_emit;
	12301	RExC_parse = (char *)orig_parse;
	12302
	12303	if (stored == 1) {
	12304
	12305	/* A locale node with one point can be folded; all the other cases
	12306	* with folding will have two points, since we calculate them above
	12307	*/
	12308	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	12309	op = EXACTFL;
	12310	}
	12311	else {
	12312	op = EXACT;
	12313	}
	12314	}
	12315	else { /* else 2 chars in the bit map: the folds of each other */
	12316
	12317	/* Use the folded value, which for the cases where we get here,
	12318	* is just the lower case of the current one (which may resolve to
	12319	* itself, or to the other one */
	12320	value = toLOWER_LATIN1(value);
	12321
	12322	/* To join adjacent nodes, they must be the exact EXACTish type.
	12323	* Try to use the most likely type, by using EXACTFA if possible,
	12324	* then EXACTFU if the regex calls for it, or is required because
	12325	* the character is non-ASCII. (If <value> is ASCII, its fold is
	12326	* also ASCII for the cases where we get here.) */
	12327	if (MORE_ASCII_RESTRICTED && isASCII(value)) {
	12328	op = EXACTFA;
	12329	}
	12330	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	12331	op = EXACTFU;
	12332	}
	12333	else { /* Otherwise, more likely to be EXACTF type */
	12334	op = EXACTF;
	12335	}
	12336	}
	12337
	12338	ret = reg_node(pRExC_state, op);
	12339	RExC_parse = (char *)cur_parse;
	12340	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	12341	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	12342	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	12343	STR_LEN(ret)= 2;
	12344	RExC_emit += STR_SZ(2);
	12345	}
	12346	else {
	12347	*STRING(ret)= (char)value;
	12348	STR_LEN(ret)= 1;
	12349	RExC_emit += STR_SZ(1);
	12350	}
	12351	SvREFCNT_dec(listsv);
	12352	return ret;
	12353	}
	12354
	12355	/* If there is a swash and more than one element, we can't use the swash in
	12356	* the optimization below. */
	12357	if (swash && element_count > 1) {
	12358	SvREFCNT_dec(swash);
	12359	swash = NULL;
	12360	}
	12361	if (! cp_list
	12362	&& SvCUR(listsv) == initial_listsv_len
	12363	&& ! unicode_alternate)
	12364	{
	12365	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	12366	SvREFCNT_dec(listsv);
	12367	SvREFCNT_dec(unicode_alternate);
	12368	}
	12369	else {
	12370	/* av[0] stores the character class description in its textual form:
	12371	* used later (regexec.c:Perl_regclass_swash()) to initialize the
	12372	* appropriate swash, and is also useful for dumping the regnode.
	12373	* av[1] if NULL, is a placeholder to later contain the swash computed
	12374	* from av[0]. But if no further computation need be done, the
	12375	* swash is stored there now.
	12376	* av[2] stores the multicharacter foldings, used later in
	12377	* regexec.c:S_reginclass().
	12378	* av[3] stores the cp_list inversion list for use in addition or
	12379	* instead of av[0]; not used if av[1] isn't NULL
	12380	* av[4] is set if any component of the class is from a user-defined
	12381	* property; not used if av[1] isn't NULL */
	12382	AV * const av = newAV();
	12383	SV *rv;
	12384
	12385	av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
	12386	? &PL_sv_undef
	12387	: listsv);
	12388	if (swash) {
	12389	av_store(av, 1, swash);
	12390	SvREFCNT_dec(cp_list);
	12391	}
	12392	else {
	12393	av_store(av, 1, NULL);
	12394	if (cp_list) {
	12395	av_store(av, 3, cp_list);
	12396	av_store(av, 4, newSVuv(has_user_defined_property));
	12397	}
	12398	}
	12399
	12400	/* Store any computed multi-char folds only if we are allowing
	12401	* them */
	12402	if (allow_full_fold) {
	12403	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	12404	if (unicode_alternate) { /* This node is variable length */
	12405	OP(ret) = ANYOFV;
	12406	}
	12407	}
	12408	else {
	12409	av_store(av, 2, NULL);
	12410	}
	12411	rv = newRV_noinc(MUTABLE_SV(av));
	12412	n = add_data(pRExC_state, 1, "s");
	12413	RExC_rxi->data->data[n] = (void*)rv;
	12414	ARG_SET(ret, n);
	12415	}
	12416	return ret;
	12417	}
	12418
	12419
	12420	/* reg_skipcomment()
	12421
	12422	Absorbs an /x style # comments from the input stream.
	12423	Returns true if there is more text remaining in the stream.
	12424	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	12425	terminates the pattern without including a newline.
	12426
	12427	Note its the callers responsibility to ensure that we are
	12428	actually in /x mode
	12429
	12430	*/
	12431
	12432	STATIC bool
	12433	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	12434	{
	12435	bool ended = 0;
	12436
	12437	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	12438
	12439	while (RExC_parse < RExC_end)
	12440	if (*RExC_parse++ == '\n') {
	12441	ended = 1;
	12442	break;
	12443	}
	12444	if (!ended) {
	12445	/* we ran off the end of the pattern without ending
	12446	the comment, so we have to add an \n when wrapping */
	12447	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	12448	return 0;
	12449	} else
	12450	return 1;
	12451	}
	12452
	12453	/* nextchar()
	12454
	12455	Advances the parse position, and optionally absorbs
	12456	"whitespace" from the inputstream.
	12457
	12458	Without /x "whitespace" means (?#...) style comments only,
	12459	with /x this means (?#...) and # comments and whitespace proper.
	12460
	12461	Returns the RExC_parse point from BEFORE the scan occurs.
	12462
	12463	This is the /x friendly way of saying RExC_parse++.
	12464	*/
	12465
	12466	STATIC char*
	12467	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	12468	{
	12469	char* const retval = RExC_parse++;
	12470
	12471	PERL_ARGS_ASSERT_NEXTCHAR;
	12472
	12473	for (;;) {
	12474	if (RExC_end - RExC_parse >= 3
	12475	&& *RExC_parse == '('
	12476	&& RExC_parse[1] == '?'
	12477	&& RExC_parse[2] == '#')
	12478	{
	12479	while (*RExC_parse != ')') {
	12480	if (RExC_parse == RExC_end)
	12481	FAIL("Sequence (?#... not terminated");
	12482	RExC_parse++;
	12483	}
	12484	RExC_parse++;
	12485	continue;
	12486	}
	12487	if (RExC_flags & RXf_PMf_EXTENDED) {
	12488	if (isSPACE(*RExC_parse)) {
	12489	RExC_parse++;
	12490	continue;
	12491	}
	12492	else if (*RExC_parse == '#') {
	12493	if ( reg_skipcomment( pRExC_state ) )
	12494	continue;
	12495	}
	12496	}
	12497	return retval;
	12498	}
	12499	}
	12500
	12501	/*
	12502	- reg_node - emit a node
	12503	*/
	12504	STATIC regnode * /* Location. */
	12505	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	12506	{
	12507	dVAR;
	12508	register regnode *ptr;
	12509	regnode * const ret = RExC_emit;
	12510	GET_RE_DEBUG_FLAGS_DECL;
	12511
	12512	PERL_ARGS_ASSERT_REG_NODE;
	12513
	12514	if (SIZE_ONLY) {
	12515	SIZE_ALIGN(RExC_size);
	12516	RExC_size += 1;
	12517	return(ret);
	12518	}
	12519	if (RExC_emit >= RExC_emit_bound)
	12520	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	12521	op, RExC_emit, RExC_emit_bound);
	12522
	12523	NODE_ALIGN_FILL(ret);
	12524	ptr = ret;
	12525	FILL_ADVANCE_NODE(ptr, op);
	12526	#ifdef RE_TRACK_PATTERN_OFFSETS
	12527	if (RExC_offsets) { /* MJD */
	12528	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	12529	"reg_node", __LINE__,
	12530	PL_reg_name[op],
	12531	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	12532	? "Overwriting end of array!\n" : "OK",
	12533	(UV)(RExC_emit - RExC_emit_start),
	12534	(UV)(RExC_parse - RExC_start),
	12535	(UV)RExC_offsets[0]));
	12536	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	12537	}
	12538	#endif
	12539	RExC_emit = ptr;
	12540	return(ret);
	12541	}
	12542
	12543	/*
	12544	- reganode - emit a node with an argument
	12545	*/
	12546	STATIC regnode * /* Location. */
	12547	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	12548	{
	12549	dVAR;
	12550	register regnode *ptr;
	12551	regnode * const ret = RExC_emit;
	12552	GET_RE_DEBUG_FLAGS_DECL;
	12553
	12554	PERL_ARGS_ASSERT_REGANODE;
	12555
	12556	if (SIZE_ONLY) {
	12557	SIZE_ALIGN(RExC_size);
	12558	RExC_size += 2;
	12559	/*
	12560	We can't do this:
	12561
	12562	assert(2==regarglen[op]+1);
	12563
	12564	Anything larger than this has to allocate the extra amount.
	12565	If we changed this to be:
	12566
	12567	RExC_size += (1 + regarglen[op]);
	12568
	12569	then it wouldn't matter. Its not clear what side effect
	12570	might come from that so its not done so far.
	12571	-- dmq
	12572	*/
	12573	return(ret);
	12574	}
	12575	if (RExC_emit >= RExC_emit_bound)
	12576	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	12577	op, RExC_emit, RExC_emit_bound);
	12578
	12579	NODE_ALIGN_FILL(ret);
	12580	ptr = ret;
	12581	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	12582	#ifdef RE_TRACK_PATTERN_OFFSETS
	12583	if (RExC_offsets) { /* MJD */
	12584	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	12585	"reganode",
	12586	__LINE__,
	12587	PL_reg_name[op],
	12588	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	12589	"Overwriting end of array!\n" : "OK",
	12590	(UV)(RExC_emit - RExC_emit_start),
	12591	(UV)(RExC_parse - RExC_start),
	12592	(UV)RExC_offsets[0]));
	12593	Set_Cur_Node_Offset;
	12594	}
	12595	#endif
	12596	RExC_emit = ptr;
	12597	return(ret);
	12598	}
	12599
	12600	/*
	12601	- reguni - emit (if appropriate) a Unicode character
	12602	*/
	12603	STATIC STRLEN
	12604	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	12605	{
	12606	dVAR;
	12607
	12608	PERL_ARGS_ASSERT_REGUNI;
	12609
	12610	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	12611	}
	12612
	12613	/*
	12614	- reginsert - insert an operator in front of already-emitted operand
	12615	*
	12616	* Means relocating the operand.
	12617	*/
	12618	STATIC void
	12619	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	12620	{
	12621	dVAR;
	12622	register regnode *src;
	12623	register regnode *dst;
	12624	register regnode *place;
	12625	const int offset = regarglen[(U8)op];
	12626	const int size = NODE_STEP_REGNODE + offset;
	12627	GET_RE_DEBUG_FLAGS_DECL;
	12628
	12629	PERL_ARGS_ASSERT_REGINSERT;
	12630	PERL_UNUSED_ARG(depth);
	12631	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	12632	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	12633	if (SIZE_ONLY) {
	12634	RExC_size += size;
	12635	return;
	12636	}
	12637
	12638	src = RExC_emit;
	12639	RExC_emit += size;
	12640	dst = RExC_emit;
	12641	if (RExC_open_parens) {
	12642	int paren;
	12643	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	12644	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	12645	if ( RExC_open_parens[paren] >= opnd ) {
	12646	/DEBUG_PARSE_FMT("open"," - %d",size);/
	12647	RExC_open_parens[paren] += size;
	12648	} else {
	12649	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	12650	}
	12651	if ( RExC_close_parens[paren] >= opnd ) {
	12652	/DEBUG_PARSE_FMT("close"," - %d",size);/
	12653	RExC_close_parens[paren] += size;
	12654	} else {
	12655	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	12656	}
	12657	}
	12658	}
	12659
	12660	while (src > opnd) {
	12661	StructCopy(--src, --dst, regnode);
	12662	#ifdef RE_TRACK_PATTERN_OFFSETS
	12663	if (RExC_offsets) { /* MJD 20010112 */
	12664	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	12665	"reg_insert",
	12666	__LINE__,
	12667	PL_reg_name[op],
	12668	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	12669	? "Overwriting end of array!\n" : "OK",
	12670	(UV)(src - RExC_emit_start),
	12671	(UV)(dst - RExC_emit_start),
	12672	(UV)RExC_offsets[0]));
	12673	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	12674	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	12675	}
	12676	#endif
	12677	}
	12678
	12679
	12680	place = opnd; /* Op node, where operand used to be. */
	12681	#ifdef RE_TRACK_PATTERN_OFFSETS
	12682	if (RExC_offsets) { /* MJD */
	12683	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	12684	"reginsert",
	12685	__LINE__,
	12686	PL_reg_name[op],
	12687	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	12688	? "Overwriting end of array!\n" : "OK",
	12689	(UV)(place - RExC_emit_start),
	12690	(UV)(RExC_parse - RExC_start),
	12691	(UV)RExC_offsets[0]));
	12692	Set_Node_Offset(place, RExC_parse);
	12693	Set_Node_Length(place, 1);
	12694	}
	12695	#endif
	12696	src = NEXTOPER(place);
	12697	FILL_ADVANCE_NODE(place, op);
	12698	Zero(src, offset, regnode);
	12699	}
	12700
	12701	/*
	12702	- regtail - set the next-pointer at the end of a node chain of p to val.
	12703	- SEE ALSO: regtail_study
	12704	*/
	12705	/* TODO: All three parms should be const */
	12706	STATIC void
	12707	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	12708	{
	12709	dVAR;
	12710	register regnode *scan;
	12711	GET_RE_DEBUG_FLAGS_DECL;
	12712
	12713	PERL_ARGS_ASSERT_REGTAIL;
	12714	#ifndef DEBUGGING
	12715	PERL_UNUSED_ARG(depth);
	12716	#endif
	12717
	12718	if (SIZE_ONLY)
	12719	return;
	12720
	12721	/* Find last node. */
	12722	scan = p;
	12723	for (;;) {
	12724	regnode * const temp = regnext(scan);
	12725	DEBUG_PARSE_r({
	12726	SV * const mysv=sv_newmortal();
	12727	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	12728	regprop(RExC_rx, mysv, scan);
	12729	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	12730	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	12731	(temp == NULL ? "->" : ""),
	12732	(temp == NULL ? PL_reg_name[OP(val)] : "")
	12733	);
	12734	});
	12735	if (temp == NULL)
	12736	break;
	12737	scan = temp;
	12738	}
	12739
	12740	if (reg_off_by_arg[OP(scan)]) {
	12741	ARG_SET(scan, val - scan);
	12742	}
	12743	else {
	12744	NEXT_OFF(scan) = val - scan;
	12745	}
	12746	}
	12747
	12748	#ifdef DEBUGGING
	12749	/*
	12750	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	12751	- Look for optimizable sequences at the same time.
	12752	- currently only looks for EXACT chains.
	12753
	12754	This is experimental code. The idea is to use this routine to perform
	12755	in place optimizations on branches and groups as they are constructed,
	12756	with the long term intention of removing optimization from study_chunk so
	12757	that it is purely analytical.
	12758
	12759	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	12760	to control which is which.
	12761
	12762	*/
	12763	/* TODO: All four parms should be const */
	12764
	12765	STATIC U8
	12766	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	12767	{
	12768	dVAR;
	12769	register regnode *scan;
	12770	U8 exact = PSEUDO;
	12771	#ifdef EXPERIMENTAL_INPLACESCAN
	12772	I32 min = 0;
	12773	#endif
	12774	GET_RE_DEBUG_FLAGS_DECL;
	12775
	12776	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	12777
	12778
	12779	if (SIZE_ONLY)
	12780	return exact;
	12781
	12782	/* Find last node. */
	12783
	12784	scan = p;
	12785	for (;;) {
	12786	regnode * const temp = regnext(scan);
	12787	#ifdef EXPERIMENTAL_INPLACESCAN
	12788	if (PL_regkind[OP(scan)] == EXACT) {
	12789	bool has_exactf_sharp_s; /* Unexamined in this routine */
	12790	if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
	12791	return EXACT;
	12792	}
	12793	#endif
	12794	if ( exact ) {
	12795	switch (OP(scan)) {
	12796	case EXACT:
	12797	case EXACTF:
	12798	case EXACTFA:
	12799	case EXACTFU:
	12800	case EXACTFU_SS:
	12801	case EXACTFU_TRICKYFOLD:
	12802	case EXACTFL:
	12803	if( exact == PSEUDO )
	12804	exact= OP(scan);
	12805	else if ( exact != OP(scan) )
	12806	exact= 0;
	12807	case NOTHING:
	12808	break;
	12809	default:
	12810	exact= 0;
	12811	}
	12812	}
	12813	DEBUG_PARSE_r({
	12814	SV * const mysv=sv_newmortal();
	12815	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	12816	regprop(RExC_rx, mysv, scan);
	12817	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	12818	SvPV_nolen_const(mysv),
	12819	REG_NODE_NUM(scan),
	12820	PL_reg_name[exact]);
	12821	});
	12822	if (temp == NULL)
	12823	break;
	12824	scan = temp;
	12825	}
	12826	DEBUG_PARSE_r({
	12827	SV * const mysv_val=sv_newmortal();
	12828	DEBUG_PARSE_MSG("");
	12829	regprop(RExC_rx, mysv_val, val);
	12830	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	12831	SvPV_nolen_const(mysv_val),
	12832	(IV)REG_NODE_NUM(val),
	12833	(IV)(val - scan)
	12834	);
	12835	});
	12836	if (reg_off_by_arg[OP(scan)]) {
	12837	ARG_SET(scan, val - scan);
	12838	}
	12839	else {
	12840	NEXT_OFF(scan) = val - scan;
	12841	}
	12842
	12843	return exact;
	12844	}
	12845	#endif
	12846
	12847	/*
	12848	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	12849	*/
	12850	#ifdef DEBUGGING
	12851	static void
	12852	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	12853	{
	12854	int bit;
	12855	int set=0;
	12856	regex_charset cs;
	12857
	12858	for (bit=0; bit<32; bit++) {
	12859	if (flags & (1<<bit)) {
	12860	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	12861	continue;
	12862	}
	12863	if (!set++ && lead)
	12864	PerlIO_printf(Perl_debug_log, "%s",lead);
	12865	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	12866	}
	12867	}
	12868	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	12869	if (!set++ && lead) {
	12870	PerlIO_printf(Perl_debug_log, "%s",lead);
	12871	}
	12872	switch (cs) {
	12873	case REGEX_UNICODE_CHARSET:
	12874	PerlIO_printf(Perl_debug_log, "UNICODE");
	12875	break;
	12876	case REGEX_LOCALE_CHARSET:
	12877	PerlIO_printf(Perl_debug_log, "LOCALE");
	12878	break;
	12879	case REGEX_ASCII_RESTRICTED_CHARSET:
	12880	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	12881	break;
	12882	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	12883	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	12884	break;
	12885	default:
	12886	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	12887	break;
	12888	}
	12889	}
	12890	if (lead) {
	12891	if (set)
	12892	PerlIO_printf(Perl_debug_log, "\n");
	12893	else
	12894	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	12895	}
	12896	}
	12897	#endif
	12898
	12899	void
	12900	Perl_regdump(pTHX_ const regexp *r)
	12901	{
	12902	#ifdef DEBUGGING
	12903	dVAR;
	12904	SV * const sv = sv_newmortal();
	12905	SV *dsv= sv_newmortal();
	12906	RXi_GET_DECL(r,ri);
	12907	GET_RE_DEBUG_FLAGS_DECL;
	12908
	12909	PERL_ARGS_ASSERT_REGDUMP;
	12910
	12911	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	12912
	12913	/* Header fields of interest. */
	12914	if (r->anchored_substr) {
	12915	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	12916	RE_SV_DUMPLEN(r->anchored_substr), 30);
	12917	PerlIO_printf(Perl_debug_log,
	12918	"anchored %s%s at %"IVdf" ",
	12919	s, RE_SV_TAIL(r->anchored_substr),
	12920	(IV)r->anchored_offset);
	12921	} else if (r->anchored_utf8) {
	12922	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	12923	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	12924	PerlIO_printf(Perl_debug_log,
	12925	"anchored utf8 %s%s at %"IVdf" ",
	12926	s, RE_SV_TAIL(r->anchored_utf8),
	12927	(IV)r->anchored_offset);
	12928	}
	12929	if (r->float_substr) {
	12930	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	12931	RE_SV_DUMPLEN(r->float_substr), 30);
	12932	PerlIO_printf(Perl_debug_log,
	12933	"floating %s%s at %"IVdf"..%"UVuf" ",
	12934	s, RE_SV_TAIL(r->float_substr),
	12935	(IV)r->float_min_offset, (UV)r->float_max_offset);
	12936	} else if (r->float_utf8) {
	12937	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	12938	RE_SV_DUMPLEN(r->float_utf8), 30);
	12939	PerlIO_printf(Perl_debug_log,
	12940	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	12941	s, RE_SV_TAIL(r->float_utf8),
	12942	(IV)r->float_min_offset, (UV)r->float_max_offset);
	12943	}
	12944	if (r->check_substr \|\| r->check_utf8)
	12945	PerlIO_printf(Perl_debug_log,
	12946	(const char *)
	12947	(r->check_substr == r->float_substr
	12948	&& r->check_utf8 == r->float_utf8
	12949	? "(checking floating" : "(checking anchored"));
	12950	if (r->extflags & RXf_NOSCAN)
	12951	PerlIO_printf(Perl_debug_log, " noscan");
	12952	if (r->extflags & RXf_CHECK_ALL)
	12953	PerlIO_printf(Perl_debug_log, " isall");
	12954	if (r->check_substr \|\| r->check_utf8)
	12955	PerlIO_printf(Perl_debug_log, ") ");
	12956
	12957	if (ri->regstclass) {
	12958	regprop(r, sv, ri->regstclass);
	12959	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	12960	}
	12961	if (r->extflags & RXf_ANCH) {
	12962	PerlIO_printf(Perl_debug_log, "anchored");
	12963	if (r->extflags & RXf_ANCH_BOL)
	12964	PerlIO_printf(Perl_debug_log, "(BOL)");
	12965	if (r->extflags & RXf_ANCH_MBOL)
	12966	PerlIO_printf(Perl_debug_log, "(MBOL)");
	12967	if (r->extflags & RXf_ANCH_SBOL)
	12968	PerlIO_printf(Perl_debug_log, "(SBOL)");
	12969	if (r->extflags & RXf_ANCH_GPOS)
	12970	PerlIO_printf(Perl_debug_log, "(GPOS)");
	12971	PerlIO_putc(Perl_debug_log, ' ');
	12972	}
	12973	if (r->extflags & RXf_GPOS_SEEN)
	12974	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	12975	if (r->intflags & PREGf_SKIP)
	12976	PerlIO_printf(Perl_debug_log, "plus ");
	12977	if (r->intflags & PREGf_IMPLICIT)
	12978	PerlIO_printf(Perl_debug_log, "implicit ");
	12979	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	12980	if (r->extflags & RXf_EVAL_SEEN)
	12981	PerlIO_printf(Perl_debug_log, "with eval ");
	12982	PerlIO_printf(Perl_debug_log, "\n");
	12983	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	12984	#else
	12985	PERL_ARGS_ASSERT_REGDUMP;
	12986	PERL_UNUSED_CONTEXT;
	12987	PERL_UNUSED_ARG(r);
	12988	#endif /* DEBUGGING */
	12989	}
	12990
	12991	/*
	12992	- regprop - printable representation of opcode
	12993	*/
	12994	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	12995	STMT_START { \
	12996	if (do_sep) { \
	12997	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	12998	if (flags & ANYOF_INVERT) \
	12999	/make sure the invert info is in each / \
	13000	sv_catpvs(sv, "^"); \
	13001	do_sep = 0; \
	13002	} \
	13003	} STMT_END
	13004
	13005	void
	13006	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	13007	{
	13008	#ifdef DEBUGGING
	13009	dVAR;
	13010	register int k;
	13011	RXi_GET_DECL(prog,progi);
	13012	GET_RE_DEBUG_FLAGS_DECL;
	13013
	13014	PERL_ARGS_ASSERT_REGPROP;
	13015
	13016	sv_setpvs(sv, "");
	13017
	13018	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	13019	/* It would be nice to FAIL() here, but this may be called from
	13020	regexec.c, and it would be hard to supply pRExC_state. */
	13021	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	13022	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	13023
	13024	k = PL_regkind[OP(o)];
	13025
	13026	if (k == EXACT) {
	13027	sv_catpvs(sv, " ");
	13028	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	13029	* is a crude hack but it may be the best for now since
	13030	* we have no flag "this EXACTish node was UTF-8"
	13031	* --jhi */
	13032	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	13033	PERL_PV_ESCAPE_UNI_DETECT \|
	13034	PERL_PV_ESCAPE_NONASCII \|
	13035	PERL_PV_PRETTY_ELLIPSES \|
	13036	PERL_PV_PRETTY_LTGT \|
	13037	PERL_PV_PRETTY_NOCLEAR
	13038	);
	13039	} else if (k == TRIE) {
	13040	/* print the details of the trie in dumpuntil instead, as
	13041	* progi->data isn't available here */
	13042	const char op = OP(o);
	13043	const U32 n = ARG(o);
	13044	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	13045	(reg_ac_data *)progi->data->data[n] :
	13046	NULL;
	13047	const reg_trie_data * const trie
	13048	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	13049
	13050	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	13051	DEBUG_TRIE_COMPILE_r(
	13052	Perl_sv_catpvf(aTHX_ sv,
	13053	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	13054	(UV)trie->startstate,
	13055	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	13056	(UV)trie->wordcount,
	13057	(UV)trie->minlen,
	13058	(UV)trie->maxlen,
	13059	(UV)TRIE_CHARCOUNT(trie),
	13060	(UV)trie->uniquecharcount
	13061	)
	13062	);
	13063	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	13064	int i;
	13065	int rangestart = -1;
	13066	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	13067	sv_catpvs(sv, "[");
	13068	for (i = 0; i <= 256; i++) {
	13069	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	13070	if (rangestart == -1)
	13071	rangestart = i;
	13072	} else if (rangestart != -1) {
	13073	if (i <= rangestart + 3)
	13074	for (; rangestart < i; rangestart++)
	13075	put_byte(sv, rangestart);
	13076	else {
	13077	put_byte(sv, rangestart);
	13078	sv_catpvs(sv, "-");
	13079	put_byte(sv, i - 1);
	13080	}
	13081	rangestart = -1;
	13082	}
	13083	}
	13084	sv_catpvs(sv, "]");
	13085	}
	13086
	13087	} else if (k == CURLY) {
	13088	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	13089	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	13090	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	13091	}
	13092	else if (k == WHILEM && o->flags) /* Ordinal/of */
	13093	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	13094	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	13095	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	13096	if ( RXp_PAREN_NAMES(prog) ) {
	13097	if ( k != REF \|\| (OP(o) < NREF)) {
	13098	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	13099	SV **name= av_fetch(list, ARG(o), 0 );
	13100	if (name)
	13101	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	13102	}
	13103	else {
	13104	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	13105	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	13106	I32 nums=(I32)SvPVX(sv_dat);
	13107	SV **name= av_fetch(list, nums[0], 0 );
	13108	I32 n;
	13109	if (name) {
	13110	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	13111	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	13112	(n ? "," : ""), (IV)nums[n]);
	13113	}
	13114	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	13115	}
	13116	}
	13117	}
	13118	} else if (k == GOSUB)
	13119	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	13120	else if (k == VERB) {
	13121	if (!o->flags)
	13122	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	13123	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	13124	} else if (k == LOGICAL)
	13125	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	13126	else if (k == ANYOF) {
	13127	int i, rangestart = -1;
	13128	const U8 flags = ANYOF_FLAGS(o);
	13129	int do_sep = 0;
	13130
	13131	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	13132	static const char * const anyofs[] = {
	13133	"\\w",
	13134	"\\W",
	13135	"\\s",
	13136	"\\S",
	13137	"\\d",
	13138	"\\D",
	13139	"[:alnum:]",
	13140	"[:^alnum:]",
	13141	"[:alpha:]",
	13142	"[:^alpha:]",
	13143	"[:ascii:]",
	13144	"[:^ascii:]",
	13145	"[:cntrl:]",
	13146	"[:^cntrl:]",
	13147	"[:graph:]",
	13148	"[:^graph:]",
	13149	"[:lower:]",
	13150	"[:^lower:]",
	13151	"[:print:]",
	13152	"[:^print:]",
	13153	"[:punct:]",
	13154	"[:^punct:]",
	13155	"[:upper:]",
	13156	"[:^upper:]",
	13157	"[:xdigit:]",
	13158	"[:^xdigit:]",
	13159	"[:space:]",
	13160	"[:^space:]",
	13161	"[:blank:]",
	13162	"[:^blank:]"
	13163	};
	13164
	13165	if (flags & ANYOF_LOCALE)
	13166	sv_catpvs(sv, "{loc}");
	13167	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	13168	sv_catpvs(sv, "{i}");
	13169	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	13170	if (flags & ANYOF_INVERT)
	13171	sv_catpvs(sv, "^");
	13172
	13173	/* output what the standard cp 0-255 bitmap matches */
	13174	for (i = 0; i <= 256; i++) {
	13175	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	13176	if (rangestart == -1)
	13177	rangestart = i;
	13178	} else if (rangestart != -1) {
	13179	if (i <= rangestart + 3)
	13180	for (; rangestart < i; rangestart++)
	13181	put_byte(sv, rangestart);
	13182	else {
	13183	put_byte(sv, rangestart);
	13184	sv_catpvs(sv, "-");
	13185	put_byte(sv, i - 1);
	13186	}
	13187	do_sep = 1;
	13188	rangestart = -1;
	13189	}
	13190	}
	13191
	13192	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	13193	/* output any special charclass tests (used entirely under use locale) */
	13194	if (ANYOF_CLASS_TEST_ANY_SET(o))
	13195	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	13196	if (ANYOF_CLASS_TEST(o,i)) {
	13197	sv_catpv(sv, anyofs[i]);
	13198	do_sep = 1;
	13199	}
	13200
	13201	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	13202
	13203	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	13204	sv_catpvs(sv, "{non-utf8-latin1-all}");
	13205	}
	13206
	13207	/* output information about the unicode matching */
	13208	if (flags & ANYOF_UNICODE_ALL)
	13209	sv_catpvs(sv, "{unicode_all}");
	13210	else if (ANYOF_NONBITMAP(o))
	13211	sv_catpvs(sv, "{unicode}");
	13212	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	13213	sv_catpvs(sv, "{outside bitmap}");
	13214
	13215	if (ANYOF_NONBITMAP(o)) {
	13216	SV lv; / Set if there is something outside the bit map */
	13217	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	13218	bool byte_output = FALSE; /* If something in the bitmap has been
	13219	output */
	13220
	13221	if (lv && lv != &PL_sv_undef) {
	13222	if (sw) {
	13223	U8 s[UTF8_MAXBYTES_CASE+1];
	13224
	13225	for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
	13226	uvchr_to_utf8(s, i);
	13227
	13228	if (i < 256
	13229	&& ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
	13230	things already
	13231	output as part
	13232	of the bitmap */
	13233	&& swash_fetch(sw, s, TRUE))
	13234	{
	13235	if (rangestart == -1)
	13236	rangestart = i;
	13237	} else if (rangestart != -1) {
	13238	byte_output = TRUE;
	13239	if (i <= rangestart + 3)
	13240	for (; rangestart < i; rangestart++) {
	13241	put_byte(sv, rangestart);
	13242	}
	13243	else {
	13244	put_byte(sv, rangestart);
	13245	sv_catpvs(sv, "-");
	13246	put_byte(sv, i-1);
	13247	}
	13248	rangestart = -1;
	13249	}
	13250	}
	13251	}
	13252
	13253	{
	13254	char *s = savesvpv(lv);
	13255	char * const origs = s;
	13256
	13257	while (s && s != '\n')
	13258	s++;
	13259
	13260	if (*s == '\n') {
	13261	const char * const t = ++s;
	13262
	13263	if (byte_output) {
	13264	sv_catpvs(sv, " ");
	13265	}
	13266
	13267	while (*s) {
	13268	if (*s == '\n') {
	13269
	13270	/* Truncate very long output */
	13271	if (s - origs > 256) {
	13272	Perl_sv_catpvf(aTHX_ sv,
	13273	"%.*s...",
	13274	(int) (s - origs - 1),
	13275	t);
	13276	goto out_dump;
	13277	}
	13278	*s = ' ';
	13279	}
	13280	else if (*s == '\t') {
	13281	*s = '-';
	13282	}
	13283	s++;
	13284	}
	13285	if (s[-1] == ' ')
	13286	s[-1] = 0;
	13287
	13288	sv_catpv(sv, t);
	13289	}
	13290
	13291	out_dump:
	13292
	13293	Safefree(origs);
	13294	}
	13295	SvREFCNT_dec(lv);
	13296	}
	13297	}
	13298
	13299	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	13300	}
	13301	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	13302	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	13303	#else
	13304	PERL_UNUSED_CONTEXT;
	13305	PERL_UNUSED_ARG(sv);
	13306	PERL_UNUSED_ARG(o);
	13307	PERL_UNUSED_ARG(prog);
	13308	#endif /* DEBUGGING */
	13309	}
	13310
	13311	SV *
	13312	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	13313	{ /* Assume that RE_INTUIT is set */
	13314	dVAR;
	13315	struct regexp const prog = (struct regexp )SvANY(r);
	13316	GET_RE_DEBUG_FLAGS_DECL;
	13317
	13318	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	13319	PERL_UNUSED_CONTEXT;
	13320
	13321	DEBUG_COMPILE_r(
	13322	{
	13323	const char * const s = SvPV_nolen_const(prog->check_substr
	13324	? prog->check_substr : prog->check_utf8);
	13325
	13326	if (!PL_colorset) reginitcolors();
	13327	PerlIO_printf(Perl_debug_log,
	13328	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	13329	PL_colors[4],
	13330	prog->check_substr ? "" : "utf8 ",
	13331	PL_colors[5],PL_colors[0],
	13332	s,
	13333	PL_colors[1],
	13334	(strlen(s) > 60 ? "..." : ""));
	13335	} );
	13336
	13337	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	13338	}
	13339
	13340	/*
	13341	pregfree()
	13342
	13343	handles refcounting and freeing the perl core regexp structure. When
	13344	it is necessary to actually free the structure the first thing it
	13345	does is call the 'free' method of the regexp_engine associated to
	13346	the regexp, allowing the handling of the void *pprivate; member
	13347	first. (This routine is not overridable by extensions, which is why
	13348	the extensions free is called first.)
	13349
	13350	See regdupe and regdupe_internal if you change anything here.
	13351	*/
	13352	#ifndef PERL_IN_XSUB_RE
	13353	void
	13354	Perl_pregfree(pTHX_ REGEXP *r)
	13355	{
	13356	SvREFCNT_dec(r);
	13357	}
	13358
	13359	void
	13360	Perl_pregfree2(pTHX_ REGEXP *rx)
	13361	{
	13362	dVAR;
	13363	struct regexp const r = (struct regexp )SvANY(rx);
	13364	GET_RE_DEBUG_FLAGS_DECL;
	13365
	13366	PERL_ARGS_ASSERT_PREGFREE2;
	13367
	13368	if (r->mother_re) {
	13369	ReREFCNT_dec(r->mother_re);
	13370	} else {
	13371	CALLREGFREE_PVT(rx); /* free the private data */
	13372	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	13373	}
	13374	if (r->substrs) {
	13375	SvREFCNT_dec(r->anchored_substr);
	13376	SvREFCNT_dec(r->anchored_utf8);
	13377	SvREFCNT_dec(r->float_substr);
	13378	SvREFCNT_dec(r->float_utf8);
	13379	Safefree(r->substrs);
	13380	}
	13381	RX_MATCH_COPY_FREE(rx);
	13382	#ifdef PERL_OLD_COPY_ON_WRITE
	13383	SvREFCNT_dec(r->saved_copy);
	13384	#endif
	13385	Safefree(r->offs);
	13386	SvREFCNT_dec(r->qr_anoncv);
	13387	}
	13388
	13389	/* reg_temp_copy()
	13390
	13391	This is a hacky workaround to the structural issue of match results
	13392	being stored in the regexp structure which is in turn stored in
	13393	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	13394	could be PL_curpm in multiple contexts, and could require multiple
	13395	result sets being associated with the pattern simultaneously, such
	13396	as when doing a recursive match with (??{$qr})
	13397
	13398	The solution is to make a lightweight copy of the regexp structure
	13399	when a qr// is returned from the code executed by (??{$qr}) this
	13400	lightweight copy doesn't actually own any of its data except for
	13401	the starp/end and the actual regexp structure itself.
	13402
	13403	*/
	13404
	13405
	13406	REGEXP *
	13407	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	13408	{
	13409	struct regexp *ret;
	13410	struct regexp const r = (struct regexp )SvANY(rx);
	13411
	13412	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	13413
	13414	if (!ret_x)
	13415	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	13416	ret = (struct regexp *)SvANY(ret_x);
	13417
	13418	(void)ReREFCNT_inc(rx);
	13419	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	13420	by pointing directly at the buffer, but flagging that the allocated
	13421	space in the copy is zero. As we've just done a struct copy, it's now
	13422	a case of zero-ing that, rather than copying the current length. */
	13423	SvPV_set(ret_x, RX_WRAPPED(rx));
	13424	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	13425	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	13426	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	13427	SvLEN_set(ret_x, 0);
	13428	SvSTASH_set(ret_x, NULL);
	13429	SvMAGIC_set(ret_x, NULL);
	13430	if (r->offs) {
	13431	const I32 npar = r->nparens+1;
	13432	Newx(ret->offs, npar, regexp_paren_pair);
	13433	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	13434	}
	13435	if (r->substrs) {
	13436	Newx(ret->substrs, 1, struct reg_substr_data);
	13437	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	13438
	13439	SvREFCNT_inc_void(ret->anchored_substr);
	13440	SvREFCNT_inc_void(ret->anchored_utf8);
	13441	SvREFCNT_inc_void(ret->float_substr);
	13442	SvREFCNT_inc_void(ret->float_utf8);
	13443
	13444	/* check_substr and check_utf8, if non-NULL, point to either their
	13445	anchored or float namesakes, and don't hold a second reference. */
	13446	}
	13447	RX_MATCH_COPIED_off(ret_x);
	13448	#ifdef PERL_OLD_COPY_ON_WRITE
	13449	ret->saved_copy = NULL;
	13450	#endif
	13451	ret->mother_re = rx;
	13452	SvREFCNT_inc_void(ret->qr_anoncv);
	13453
	13454	return ret_x;
	13455	}
	13456	#endif
	13457
	13458	/* regfree_internal()
	13459
	13460	Free the private data in a regexp. This is overloadable by
	13461	extensions. Perl takes care of the regexp structure in pregfree(),
	13462	this covers the *pprivate pointer which technically perl doesn't
	13463	know about, however of course we have to handle the
	13464	regexp_internal structure when no extension is in use.
	13465
	13466	Note this is called before freeing anything in the regexp
	13467	structure.
	13468	*/
	13469
	13470	void
	13471	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	13472	{
	13473	dVAR;
	13474	struct regexp const r = (struct regexp )SvANY(rx);
	13475	RXi_GET_DECL(r,ri);
	13476	GET_RE_DEBUG_FLAGS_DECL;
	13477
	13478	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	13479
	13480	DEBUG_COMPILE_r({
	13481	if (!PL_colorset)
	13482	reginitcolors();
	13483	{
	13484	SV *dsv= sv_newmortal();
	13485	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	13486	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	13487	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	13488	PL_colors[4],PL_colors[5],s);
	13489	}
	13490	});
	13491	#ifdef RE_TRACK_PATTERN_OFFSETS
	13492	if (ri->u.offsets)
	13493	Safefree(ri->u.offsets); /* 20010421 MJD */
	13494	#endif
	13495	if (ri->code_blocks) {
	13496	int n;
	13497	for (n = 0; n < ri->num_code_blocks; n++)
	13498	SvREFCNT_dec(ri->code_blocks[n].src_regex);
	13499	Safefree(ri->code_blocks);
	13500	}
	13501
	13502	if (ri->data) {
	13503	int n = ri->data->count;
	13504
	13505	while (--n >= 0) {
	13506	/* If you add a ->what type here, update the comment in regcomp.h */
	13507	switch (ri->data->what[n]) {
	13508	case 'a':
	13509	case 'r':
	13510	case 's':
	13511	case 'S':
	13512	case 'u':
	13513	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	13514	break;
	13515	case 'f':
	13516	Safefree(ri->data->data[n]);
	13517	break;
	13518	case 'l':
	13519	case 'L':
	13520	break;
	13521	case 'T':
	13522	{ /* Aho Corasick add-on structure for a trie node.
	13523	Used in stclass optimization only */
	13524	U32 refcount;
	13525	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	13526	OP_REFCNT_LOCK;
	13527	refcount = --aho->refcount;
	13528	OP_REFCNT_UNLOCK;
	13529	if ( !refcount ) {
	13530	PerlMemShared_free(aho->states);
	13531	PerlMemShared_free(aho->fail);
	13532	/* do this last!!!! */
	13533	PerlMemShared_free(ri->data->data[n]);
	13534	PerlMemShared_free(ri->regstclass);
	13535	}
	13536	}
	13537	break;
	13538	case 't':
	13539	{
	13540	/* trie structure. */
	13541	U32 refcount;
	13542	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	13543	OP_REFCNT_LOCK;
	13544	refcount = --trie->refcount;
	13545	OP_REFCNT_UNLOCK;
	13546	if ( !refcount ) {
	13547	PerlMemShared_free(trie->charmap);
	13548	PerlMemShared_free(trie->states);
	13549	PerlMemShared_free(trie->trans);
	13550	if (trie->bitmap)
	13551	PerlMemShared_free(trie->bitmap);
	13552	if (trie->jump)
	13553	PerlMemShared_free(trie->jump);
	13554	PerlMemShared_free(trie->wordinfo);
	13555	/* do this last!!!! */
	13556	PerlMemShared_free(ri->data->data[n]);
	13557	}
	13558	}
	13559	break;
	13560	default:
	13561	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	13562	}
	13563	}
	13564	Safefree(ri->data->what);
	13565	Safefree(ri->data);
	13566	}
	13567
	13568	Safefree(ri);
	13569	}
	13570
	13571	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	13572	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	13573	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	13574
	13575	/*
	13576	re_dup - duplicate a regexp.
	13577
	13578	This routine is expected to clone a given regexp structure. It is only
	13579	compiled under USE_ITHREADS.
	13580
	13581	After all of the core data stored in struct regexp is duplicated
	13582	the regexp_engine.dupe method is used to copy any private data
	13583	stored in the *pprivate pointer. This allows extensions to handle
	13584	any duplication it needs to do.
	13585
	13586	See pregfree() and regfree_internal() if you change anything here.
	13587	*/
	13588	#if defined(USE_ITHREADS)
	13589	#ifndef PERL_IN_XSUB_RE
	13590	void
	13591	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	13592	{
	13593	dVAR;
	13594	I32 npar;
	13595	const struct regexp r = (const struct regexp )SvANY(sstr);
	13596	struct regexp ret = (struct regexp )SvANY(dstr);
	13597
	13598	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	13599
	13600	npar = r->nparens+1;
	13601	Newx(ret->offs, npar, regexp_paren_pair);
	13602	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	13603	if(ret->swap) {
	13604	/* no need to copy these */
	13605	Newx(ret->swap, npar, regexp_paren_pair);
	13606	}
	13607
	13608	if (ret->substrs) {
	13609	/* Do it this way to avoid reading from *r after the StructCopy().
	13610	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	13611	cache, it doesn't matter. */
	13612	const bool anchored = r->check_substr
	13613	? r->check_substr == r->anchored_substr
	13614	: r->check_utf8 == r->anchored_utf8;
	13615	Newx(ret->substrs, 1, struct reg_substr_data);
	13616	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	13617
	13618	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	13619	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	13620	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	13621	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	13622
	13623	/* check_substr and check_utf8, if non-NULL, point to either their
	13624	anchored or float namesakes, and don't hold a second reference. */
	13625
	13626	if (ret->check_substr) {
	13627	if (anchored) {
	13628	assert(r->check_utf8 == r->anchored_utf8);
	13629	ret->check_substr = ret->anchored_substr;
	13630	ret->check_utf8 = ret->anchored_utf8;
	13631	} else {
	13632	assert(r->check_substr == r->float_substr);
	13633	assert(r->check_utf8 == r->float_utf8);
	13634	ret->check_substr = ret->float_substr;
	13635	ret->check_utf8 = ret->float_utf8;
	13636	}
	13637	} else if (ret->check_utf8) {
	13638	if (anchored) {
	13639	ret->check_utf8 = ret->anchored_utf8;
	13640	} else {
	13641	ret->check_utf8 = ret->float_utf8;
	13642	}
	13643	}
	13644	}
	13645
	13646	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	13647	ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
	13648
	13649	if (ret->pprivate)
	13650	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	13651
	13652	if (RX_MATCH_COPIED(dstr))
	13653	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	13654	else
	13655	ret->subbeg = NULL;
	13656	#ifdef PERL_OLD_COPY_ON_WRITE
	13657	ret->saved_copy = NULL;
	13658	#endif
	13659
	13660	if (ret->mother_re) {
	13661	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	13662	/* Our storage points directly to our mother regexp, but that's
	13663	1: a buffer in a different thread
	13664	2: something we no longer hold a reference on
	13665	so we need to copy it locally. */
	13666	/* Note we need to use SvCUR(), rather than
	13667	SvLEN(), on our mother_re, because it, in
	13668	turn, may well be pointing to its own mother_re. */
	13669	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	13670	SvCUR(ret->mother_re)+1));
	13671	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	13672	}
	13673	ret->mother_re = NULL;
	13674	}
	13675	ret->gofs = 0;
	13676	}
	13677	#endif /* PERL_IN_XSUB_RE */
	13678
	13679	/*
	13680	regdupe_internal()
	13681
	13682	This is the internal complement to regdupe() which is used to copy
	13683	the structure pointed to by the *pprivate pointer in the regexp.
	13684	This is the core version of the extension overridable cloning hook.
	13685	The regexp structure being duplicated will be copied by perl prior
	13686	to this and will be provided as the regexp *r argument, however
	13687	with the /old/ structures pprivate pointer value. Thus this routine
	13688	may override any copying normally done by perl.
	13689
	13690	It returns a pointer to the new regexp_internal structure.
	13691	*/
	13692
	13693	void *
	13694	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	13695	{
	13696	dVAR;
	13697	struct regexp const r = (struct regexp )SvANY(rx);
	13698	regexp_internal *reti;
	13699	int len;
	13700	RXi_GET_DECL(r,ri);
	13701
	13702	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	13703
	13704	len = ProgLen(ri);
	13705
	13706	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	13707	Copy(ri->program, reti->program, len+1, regnode);
	13708
	13709	reti->num_code_blocks = ri->num_code_blocks;
	13710	if (ri->code_blocks) {
	13711	int n;
	13712	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
	13713	struct reg_code_block);
	13714	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
	13715	struct reg_code_block);
	13716	for (n = 0; n < ri->num_code_blocks; n++)
	13717	reti->code_blocks[n].src_regex = (REGEXP*)
	13718	sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
	13719	}
	13720	else
	13721	reti->code_blocks = NULL;
	13722
	13723	reti->regstclass = NULL;
	13724
	13725	if (ri->data) {
	13726	struct reg_data *d;
	13727	const int count = ri->data->count;
	13728	int i;
	13729
	13730	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	13731	char, struct reg_data);
	13732	Newx(d->what, count, U8);
	13733
	13734	d->count = count;
	13735	for (i = 0; i < count; i++) {
	13736	d->what[i] = ri->data->what[i];
	13737	switch (d->what[i]) {
	13738	/* see also regcomp.h and regfree_internal() */
	13739	case 'a': /* actually an AV, but the dup function is identical. */
	13740	case 'r':
	13741	case 's':
	13742	case 'S':
	13743	case 'u': /* actually an HV, but the dup function is identical. */
	13744	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	13745	break;
	13746	case 'f':
	13747	/* This is cheating. */
	13748	Newx(d->data[i], 1, struct regnode_charclass_class);
	13749	StructCopy(ri->data->data[i], d->data[i],
	13750	struct regnode_charclass_class);
	13751	reti->regstclass = (regnode*)d->data[i];
	13752	break;
	13753	case 'T':
	13754	/* Trie stclasses are readonly and can thus be shared
	13755	* without duplication. We free the stclass in pregfree
	13756	* when the corresponding reg_ac_data struct is freed.
	13757	*/
	13758	reti->regstclass= ri->regstclass;
	13759	/* Fall through */
	13760	case 't':
	13761	OP_REFCNT_LOCK;
	13762	((reg_trie_data*)ri->data->data[i])->refcount++;
	13763	OP_REFCNT_UNLOCK;
	13764	/* Fall through */
	13765	case 'l':
	13766	case 'L':
	13767	d->data[i] = ri->data->data[i];
	13768	break;
	13769	default:
	13770	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	13771	}
	13772	}
	13773
	13774	reti->data = d;
	13775	}
	13776	else
	13777	reti->data = NULL;
	13778
	13779	reti->name_list_idx = ri->name_list_idx;
	13780
	13781	#ifdef RE_TRACK_PATTERN_OFFSETS
	13782	if (ri->u.offsets) {
	13783	Newx(reti->u.offsets, 2*len+1, U32);
	13784	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	13785	}
	13786	#else
	13787	SetProgLen(reti,len);
	13788	#endif
	13789
	13790	return (void*)reti;
	13791	}
	13792
	13793	#endif /* USE_ITHREADS */
	13794
	13795	#ifndef PERL_IN_XSUB_RE
	13796
	13797	/*
	13798	- regnext - dig the "next" pointer out of a node
	13799	*/
	13800	regnode *
	13801	Perl_regnext(pTHX_ register regnode *p)
	13802	{
	13803	dVAR;
	13804	register I32 offset;
	13805
	13806	if (!p)
	13807	return(NULL);
	13808
	13809	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	13810	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	13811	}
	13812
	13813	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	13814	if (offset == 0)
	13815	return(NULL);
	13816
	13817	return(p+offset);
	13818	}
	13819	#endif
	13820
	13821	STATIC void
	13822	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	13823	{
	13824	va_list args;
	13825	STRLEN l1 = strlen(pat1);
	13826	STRLEN l2 = strlen(pat2);
	13827	char buf[512];
	13828	SV *msv;
	13829	const char *message;
	13830
	13831	PERL_ARGS_ASSERT_RE_CROAK2;
	13832
	13833	if (l1 > 510)
	13834	l1 = 510;
	13835	if (l1 + l2 > 510)
	13836	l2 = 510 - l1;
	13837	Copy(pat1, buf, l1 , char);
	13838	Copy(pat2, buf + l1, l2 , char);
	13839	buf[l1 + l2] = '\n';
	13840	buf[l1 + l2 + 1] = '\0';
	13841	#ifdef I_STDARG
	13842	/* ANSI variant takes additional second argument */
	13843	va_start(args, pat2);
	13844	#else
	13845	va_start(args);
	13846	#endif
	13847	msv = vmess(buf, &args);
	13848	va_end(args);
	13849	message = SvPV_const(msv,l1);
	13850	if (l1 > 512)
	13851	l1 = 512;
	13852	Copy(message, buf, l1 , char);
	13853	buf[l1-1] = '\0'; /* Overwrite \n */
	13854	Perl_croak(aTHX_ "%s", buf);
	13855	}
	13856
	13857	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	13858
	13859	#ifndef PERL_IN_XSUB_RE
	13860	void
	13861	Perl_save_re_context(pTHX)
	13862	{
	13863	dVAR;
	13864
	13865	struct re_save_state *state;
	13866
	13867	SAVEVPTR(PL_curcop);
	13868	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	13869
	13870	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	13871	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	13872	SSPUSHUV(SAVEt_RE_STATE);
	13873
	13874	Copy(&PL_reg_state, state, 1, struct re_save_state);
	13875
	13876	PL_reg_oldsaved = NULL;
	13877	PL_reg_oldsavedlen = 0;
	13878	PL_reg_maxiter = 0;
	13879	PL_reg_leftiter = 0;
	13880	PL_reg_poscache = NULL;
	13881	PL_reg_poscache_size = 0;
	13882	#ifdef PERL_OLD_COPY_ON_WRITE
	13883	PL_nrs = NULL;
	13884	#endif
	13885
	13886	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	13887	if (PL_curpm) {
	13888	const REGEXP * const rx = PM_GETRE(PL_curpm);
	13889	if (rx) {
	13890	U32 i;
	13891	for (i = 1; i <= RX_NPARENS(rx); i++) {
	13892	char digits[TYPE_CHARS(long)];
	13893	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	13894	GV const const gvp
	13895	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	13896
	13897	if (gvp) {
	13898	GV * const gv = *gvp;
	13899	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	13900	save_scalar(gv);
	13901	}
	13902	}
	13903	}
	13904	}
	13905	}
	13906	#endif
	13907
	13908	static void
	13909	clear_re(pTHX_ void *r)
	13910	{
	13911	dVAR;
	13912	ReREFCNT_dec((REGEXP *)r);
	13913	}
	13914
	13915	#ifdef DEBUGGING
	13916
	13917	STATIC void
	13918	S_put_byte(pTHX_ SV *sv, int c)
	13919	{
	13920	PERL_ARGS_ASSERT_PUT_BYTE;
	13921
	13922	/* Our definition of isPRINT() ignores locales, so only bytes that are
	13923	not part of UTF-8 are considered printable. I assume that the same
	13924	holds for UTF-EBCDIC.
	13925	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	13926	which Wikipedia says:
	13927
	13928	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	13929	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	13930	identical, to the ASCII delete (DEL) or rubout control character.
	13931	) So the old condition can be simplified to !isPRINT(c) */
	13932	if (!isPRINT(c)) {
	13933	if (c < 256) {
	13934	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	13935	}
	13936	else {
	13937	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	13938	}
	13939	}
	13940	else {
	13941	const char string = c;
	13942	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	13943	sv_catpvs(sv, "\\");
	13944	sv_catpvn(sv, &string, 1);
	13945	}
	13946	}
	13947
	13948
	13949	#define CLEAR_OPTSTART \
	13950	if (optstart) STMT_START { \
	13951	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	13952	optstart=NULL; \
	13953	} STMT_END
	13954
	13955	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	13956
	13957	STATIC const regnode *
	13958	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	13959	const regnode last, const regnode plast,
	13960	SV* sv, I32 indent, U32 depth)
	13961	{
	13962	dVAR;
	13963	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	13964	register const regnode *next;
	13965	const regnode *optstart= NULL;
	13966
	13967	RXi_GET_DECL(r,ri);
	13968	GET_RE_DEBUG_FLAGS_DECL;
	13969
	13970	PERL_ARGS_ASSERT_DUMPUNTIL;
	13971
	13972	#ifdef DEBUG_DUMPUNTIL
	13973	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	13974	last ? last-start : 0,plast ? plast-start : 0);
	13975	#endif
	13976
	13977	if (plast && plast < last)
	13978	last= plast;
	13979
	13980	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	13981	/* While that wasn't END last time... */
	13982	NODE_ALIGN(node);
	13983	op = OP(node);
	13984	if (op == CLOSE \|\| op == WHILEM)
	13985	indent--;
	13986	next = regnext((regnode *)node);
	13987
	13988	/* Where, what. */
	13989	if (OP(node) == OPTIMIZED) {
	13990	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	13991	optstart = node;
	13992	else
	13993	goto after_print;
	13994	} else
	13995	CLEAR_OPTSTART;
	13996
	13997	regprop(r, sv, node);
	13998	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	13999	(int)(2*indent + 1), "", SvPVX_const(sv));
	14000
	14001	if (OP(node) != OPTIMIZED) {
	14002	if (next == NULL) /* Next ptr. */
	14003	PerlIO_printf(Perl_debug_log, " (0)");
	14004	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	14005	PerlIO_printf(Perl_debug_log, " (FAIL)");
	14006	else
	14007	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	14008	(void)PerlIO_putc(Perl_debug_log, '\n');
	14009	}
	14010
	14011	after_print:
	14012	if (PL_regkind[(U8)op] == BRANCHJ) {
	14013	assert(next);
	14014	{
	14015	register const regnode *nnode = (OP(next) == LONGJMP
	14016	? regnext((regnode *)next)
	14017	: next);
	14018	if (last && nnode > last)
	14019	nnode = last;
	14020	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	14021	}
	14022	}
	14023	else if (PL_regkind[(U8)op] == BRANCH) {
	14024	assert(next);
	14025	DUMPUNTIL(NEXTOPER(node), next);
	14026	}
	14027	else if ( PL_regkind[(U8)op] == TRIE ) {
	14028	const regnode *this_trie = node;
	14029	const char op = OP(node);
	14030	const U32 n = ARG(node);
	14031	const reg_ac_data * const ac = op>=AHOCORASICK ?
	14032	(reg_ac_data *)ri->data->data[n] :
	14033	NULL;
	14034	const reg_trie_data * const trie =
	14035	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	14036	#ifdef DEBUGGING
	14037	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	14038	#endif
	14039	const regnode *nextbranch= NULL;
	14040	I32 word_idx;
	14041	sv_setpvs(sv, "");
	14042	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	14043	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	14044
	14045	PerlIO_printf(Perl_debug_log, "%*s%s ",
	14046	(int)(2*(indent+3)), "",
	14047	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	14048	PL_colors[0], PL_colors[1],
	14049	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	14050	PERL_PV_PRETTY_ELLIPSES \|
	14051	PERL_PV_PRETTY_LTGT
	14052	)
	14053	: "???"
	14054	);
	14055	if (trie->jump) {
	14056	U16 dist= trie->jump[word_idx+1];
	14057	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	14058	(UV)((dist ? this_trie + dist : next) - start));
	14059	if (dist) {
	14060	if (!nextbranch)
	14061	nextbranch= this_trie + trie->jump[0];
	14062	DUMPUNTIL(this_trie + dist, nextbranch);
	14063	}
	14064	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	14065	nextbranch= regnext((regnode *)nextbranch);
	14066	} else {
	14067	PerlIO_printf(Perl_debug_log, "\n");
	14068	}
	14069	}
	14070	if (last && next > last)
	14071	node= last;
	14072	else
	14073	node= next;
	14074	}
	14075	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	14076	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	14077	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	14078	}
	14079	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	14080	assert(next);
	14081	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	14082	}
	14083	else if ( op == PLUS \|\| op == STAR) {
	14084	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	14085	}
	14086	else if (PL_regkind[(U8)op] == ANYOF) {
	14087	/* arglen 1 + class block */
	14088	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	14089	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	14090	node = NEXTOPER(node);
	14091	}
	14092	else if (PL_regkind[(U8)op] == EXACT) {
	14093	/* Literal string, where present. */
	14094	node += NODE_SZ_STR(node) - 1;
	14095	node = NEXTOPER(node);
	14096	}
	14097	else {
	14098	node = NEXTOPER(node);
	14099	node += regarglen[(U8)op];
	14100	}
	14101	if (op == CURLYX \|\| op == OPEN)
	14102	indent++;
	14103	}
	14104	CLEAR_OPTSTART;
	14105	#ifdef DEBUG_DUMPUNTIL
	14106	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	14107	#endif
	14108	return node;
	14109	}
	14110
	14111	#endif /* DEBUGGING */
	14112
	14113	/*
	14114	* Local variables:
	14115	* c-indentation-style: bsd
	14116	* c-basic-offset: 4
	14117	* indent-tabs-mode: nil
	14118	* End:
	14119	*
	14120	* ex: set ts=8 sts=4 sw=4 et:
	14121	*/