perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	EXTERN_C const struct regexp_engine my_reg_engine;
	85	#else
	86	# include "regcomp.h"
	87	#endif
	88
	89	#include "dquote_static.c"
	90	#include "charclass_invlists.h"
	91	#include "inline_invlist.c"
	92	#include "unicode_constants.h"
	93
	94	#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
	95	_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
	96	#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
	97	_HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
	98	#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
	99	#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
	100
	101	#ifndef STATIC
	102	#define STATIC static
	103	#endif
	104
	105
	106	struct RExC_state_t {
	107	U32 flags; /* RXf_* are we folding, multilining? */
	108	U32 pm_flags; /* PMf_* stuff from the calling PMOP */
	109	char precomp; / uncompiled string. */
	110	REGEXP rx_sv; / The SV that is the regexp. */
	111	regexp rx; / perl core regexp structure */
	112	regexp_internal rxi; / internal data for regexp object
	113	pprivate field */
	114	char start; / Start of input for compile */
	115	char end; / End of input for compile */
	116	char parse; / Input-scan pointer. */
	117	SSize_t whilem_seen; /* number of WHILEM in this expr */
	118	regnode emit_start; / Start of emitted-code area */
	119	regnode emit_bound; / First regnode outside of the
	120	allocated space */
	121	regnode emit; / Code-emit pointer; if = &emit_dummy,
	122	implies compiling, so don't emit */
	123	regnode_ssc emit_dummy; /* placeholder for emit to point to;
	124	large enough for the largest
	125	non-EXACTish node, so can use it as
	126	scratch in pass1 */
	127	I32 naughty; /* How bad is this pattern? */
	128	I32 sawback; /* Did we see \1, ...? */
	129	U32 seen;
	130	SSize_t size; /* Code size. */
	131	I32 npar; /* Capture buffer count, (OPEN) plus
	132	one. ("par" 0 is the whole
	133	pattern)*/
	134	I32 nestroot; /* root parens we are in - used by
	135	accept */
	136	I32 extralen;
	137	I32 seen_zerolen;
	138	regnode *open_parens; / pointers to open parens */
	139	regnode *close_parens; / pointers to close parens */
	140	regnode opend; / END node in program */
	141	I32 utf8; /* whether the pattern is utf8 or not */
	142	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	143	/* XXX use this for future optimisation of case
	144	* where pattern must be upgraded to utf8. */
	145	I32 uni_semantics; /* If a d charset modifier should use unicode
	146	rules, even if the pattern is not in
	147	utf8 */
	148	HV paren_names; / Paren names */
	149
	150	regnode *recurse; / Recurse regops */
	151	I32 recurse_count; /* Number of recurse regops */
	152	U8 study_chunk_recursed; / bitmap of which parens we have moved
	153	through */
	154	U32 study_chunk_recursed_bytes; /* bytes in bitmap */
	155	I32 in_lookbehind;
	156	I32 contains_locale;
	157	I32 contains_i;
	158	I32 override_recoding;
	159	I32 in_multi_char_class;
	160	struct reg_code_block code_blocks; / positions of literal (?{})
	161	within pattern */
	162	int num_code_blocks; /* size of code_blocks[] */
	163	int code_index; /* next code_blocks[] slot */
	164	SSize_t maxlen; /* mininum possible number of chars in string to match */
	165	#ifdef ADD_TO_REGEXEC
	166	char starttry; / -Dr: where regtry was called. */
	167	#define RExC_starttry (pRExC_state->starttry)
	168	#endif
	169	SV runtime_code_qr; / qr with the runtime code blocks */
	170	#ifdef DEBUGGING
	171	const char *lastparse;
	172	I32 lastnum;
	173	AV paren_name_list; / idx -> name */
	174	#define RExC_lastparse (pRExC_state->lastparse)
	175	#define RExC_lastnum (pRExC_state->lastnum)
	176	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	177	#endif
	178	};
	179
	180	#define RExC_flags (pRExC_state->flags)
	181	#define RExC_pm_flags (pRExC_state->pm_flags)
	182	#define RExC_precomp (pRExC_state->precomp)
	183	#define RExC_rx_sv (pRExC_state->rx_sv)
	184	#define RExC_rx (pRExC_state->rx)
	185	#define RExC_rxi (pRExC_state->rxi)
	186	#define RExC_start (pRExC_state->start)
	187	#define RExC_end (pRExC_state->end)
	188	#define RExC_parse (pRExC_state->parse)
	189	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	190	#ifdef RE_TRACK_PATTERN_OFFSETS
	191	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
	192	others */
	193	#endif
	194	#define RExC_emit (pRExC_state->emit)
	195	#define RExC_emit_dummy (pRExC_state->emit_dummy)
	196	#define RExC_emit_start (pRExC_state->emit_start)
	197	#define RExC_emit_bound (pRExC_state->emit_bound)
	198	#define RExC_naughty (pRExC_state->naughty)
	199	#define RExC_sawback (pRExC_state->sawback)
	200	#define RExC_seen (pRExC_state->seen)
	201	#define RExC_size (pRExC_state->size)
	202	#define RExC_maxlen (pRExC_state->maxlen)
	203	#define RExC_npar (pRExC_state->npar)
	204	#define RExC_nestroot (pRExC_state->nestroot)
	205	#define RExC_extralen (pRExC_state->extralen)
	206	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	207	#define RExC_utf8 (pRExC_state->utf8)
	208	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	209	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	210	#define RExC_open_parens (pRExC_state->open_parens)
	211	#define RExC_close_parens (pRExC_state->close_parens)
	212	#define RExC_opend (pRExC_state->opend)
	213	#define RExC_paren_names (pRExC_state->paren_names)
	214	#define RExC_recurse (pRExC_state->recurse)
	215	#define RExC_recurse_count (pRExC_state->recurse_count)
	216	#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
	217	#define RExC_study_chunk_recursed_bytes \
	218	(pRExC_state->study_chunk_recursed_bytes)
	219	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	220	#define RExC_contains_locale (pRExC_state->contains_locale)
	221	#define RExC_contains_i (pRExC_state->contains_i)
	222	#define RExC_override_recoding (pRExC_state->override_recoding)
	223	#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
	224
	225
	226	#define ISMULT1(c) ((c) == '*' \|\| (c) == '+' \|\| (c) == '?')
	227	#define ISMULT2(s) ((s) == '' \|\| (s) == '+' \|\| (s) == '?' \|\| \
	228	((*s) == '{' && regcurly(s)))
	229
	230	/*
	231	* Flags to be passed up and down.
	232	*/
	233	#define WORST 0 /* Worst case. */
	234	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	235
	236	/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
	237	* character. (There needs to be a case: in the switch statement in regexec.c
	238	* for any node marked SIMPLE.) Note that this is not the same thing as
	239	* REGNODE_SIMPLE */
	240	#define SIMPLE 0x02
	241	#define SPSTART 0x04 /* Starts with * or + */
	242	#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
	243	#define TRYAGAIN 0x10 /* Weeded out a declaration. */
	244	#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
	245
	246	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	247
	248	/* whether trie related optimizations are enabled */
	249	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	250	#define TRIE_STUDY_OPT
	251	#define FULL_TRIE_STUDY
	252	#define TRIE_STCLASS
	253	#endif
	254
	255
	256
	257	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	258	#define PBITVAL(paren) (1 << ((paren) & 7))
	259	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	260	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) \|= PBITVAL(paren)
	261	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	262
	263	#define REQUIRE_UTF8 STMT_START { \
	264	if (!UTF) { \
	265	*flagp = RESTART_UTF8; \
	266	return NULL; \
	267	} \
	268	} STMT_END
	269
	270	/* This converts the named class defined in regcomp.h to its equivalent class
	271	* number defined in handy.h. */
	272	#define namedclass_to_classnum(class) ((int) ((class) / 2))
	273	#define classnum_to_namedclass(classnum) ((classnum) * 2)
	274
	275	#define _invlist_union_complement_2nd(a, b, output) \
	276	_invlist_union_maybe_complement_2nd(a, b, TRUE, output)
	277	#define _invlist_intersection_complement_2nd(a, b, output) \
	278	_invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
	279
	280	/* About scan_data_t.
	281
	282	During optimisation we recurse through the regexp program performing
	283	various inplace (keyhole style) optimisations. In addition study_chunk
	284	and scan_commit populate this data structure with information about
	285	what strings MUST appear in the pattern. We look for the longest
	286	string that must appear at a fixed location, and we look for the
	287	longest string that may appear at a floating location. So for instance
	288	in the pattern:
	289
	290	/FOO[xX]A.*B[xX]BAR/
	291
	292	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	293	strings (because they follow a .* construct). study_chunk will identify
	294	both FOO and BAR as being the longest fixed and floating strings respectively.
	295
	296	The strings can be composites, for instance
	297
	298	/(f)(o)(o)/
	299
	300	will result in a composite fixed substring 'foo'.
	301
	302	For each string some basic information is maintained:
	303
	304	- offset or min_offset
	305	This is the position the string must appear at, or not before.
	306	It also implicitly (when combined with minlenp) tells us how many
	307	characters must match before the string we are searching for.
	308	Likewise when combined with minlenp and the length of the string it
	309	tells us how many characters must appear after the string we have
	310	found.
	311
	312	- max_offset
	313	Only used for floating strings. This is the rightmost point that
	314	the string can appear at. If set to SSize_t_MAX it indicates that the
	315	string can occur infinitely far to the right.
	316
	317	- minlenp
	318	A pointer to the minimum number of characters of the pattern that the
	319	string was found inside. This is important as in the case of positive
	320	lookahead or positive lookbehind we can have multiple patterns
	321	involved. Consider
	322
	323	/(?=FOO).*F/
	324
	325	The minimum length of the pattern overall is 3, the minimum length
	326	of the lookahead part is 3, but the minimum length of the part that
	327	will actually match is 1. So 'FOO's minimum length is 3, but the
	328	minimum length for the F is 1. This is important as the minimum length
	329	is used to determine offsets in front of and behind the string being
	330	looked for. Since strings can be composites this is the length of the
	331	pattern at the time it was committed with a scan_commit. Note that
	332	the length is calculated by study_chunk, so that the minimum lengths
	333	are not known until the full pattern has been compiled, thus the
	334	pointer to the value.
	335
	336	- lookbehind
	337
	338	In the case of lookbehind the string being searched for can be
	339	offset past the start point of the final matching string.
	340	If this value was just blithely removed from the min_offset it would
	341	invalidate some of the calculations for how many chars must match
	342	before or after (as they are derived from min_offset and minlen and
	343	the length of the string being searched for).
	344	When the final pattern is compiled and the data is moved from the
	345	scan_data_t structure into the regexp structure the information
	346	about lookbehind is factored in, with the information that would
	347	have been lost precalculated in the end_shift field for the
	348	associated string.
	349
	350	The fields pos_min and pos_delta are used to store the minimum offset
	351	and the delta to the maximum offset at the current point in the pattern.
	352
	353	*/
	354
	355	typedef struct scan_data_t {
	356	/I32 len_min; unused /
	357	/I32 len_delta; unused /
	358	SSize_t pos_min;
	359	SSize_t pos_delta;
	360	SV *last_found;
	361	SSize_t last_end; /* min value, <0 unless valid. */
	362	SSize_t last_start_min;
	363	SSize_t last_start_max;
	364	SV *longest; / Either &l_fixed, or &l_float. */
	365	SV longest_fixed; / longest fixed string found in pattern */
	366	SSize_t offset_fixed; /* offset where it starts */
	367	SSize_t minlen_fixed; / pointer to the minlen relevant to the string */
	368	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	369	SV longest_float; / longest floating string found in pattern */
	370	SSize_t offset_float_min; /* earliest point in string it can appear */
	371	SSize_t offset_float_max; /* latest point in string it can appear */
	372	SSize_t minlen_float; / pointer to the minlen relevant to the string */
	373	SSize_t lookbehind_float; /* is the pos of the string modified by LB */
	374	I32 flags;
	375	I32 whilem_c;
	376	SSize_t *last_closep;
	377	regnode_ssc *start_class;
	378	} scan_data_t;
	379
	380	/*
	381	* Forward declarations for pregcomp()'s friends.
	382	*/
	383
	384	static const scan_data_t zero_scan_data =
	385	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	386
	387	#define SF_BEFORE_EOL (SF_BEFORE_SEOL\|SF_BEFORE_MEOL)
	388	#define SF_BEFORE_SEOL 0x0001
	389	#define SF_BEFORE_MEOL 0x0002
	390	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL\|SF_FIX_BEFORE_MEOL)
	391	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL\|SF_FL_BEFORE_MEOL)
	392
	393	#define SF_FIX_SHIFT_EOL (+2)
	394	#define SF_FL_SHIFT_EOL (+4)
	395
	396	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	397	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	398
	399	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	400	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	401	#define SF_IS_INF 0x0040
	402	#define SF_HAS_PAR 0x0080
	403	#define SF_IN_PAR 0x0100
	404	#define SF_HAS_EVAL 0x0200
	405	#define SCF_DO_SUBSTR 0x0400
	406	#define SCF_DO_STCLASS_AND 0x0800
	407	#define SCF_DO_STCLASS_OR 0x1000
	408	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND\|SCF_DO_STCLASS_OR)
	409	#define SCF_WHILEM_VISITED_POS 0x2000
	410
	411	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	412	#define SCF_SEEN_ACCEPT 0x8000
	413	#define SCF_TRIE_DOING_RESTUDY 0x10000
	414
	415	#define UTF cBOOL(RExC_utf8)
	416
	417	/* The enums for all these are ordered so things work out correctly */
	418	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	419	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
	420	== REGEX_DEPENDS_CHARSET)
	421	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	422	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
	423	>= REGEX_UNICODE_CHARSET)
	424	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
	425	== REGEX_ASCII_RESTRICTED_CHARSET)
	426	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
	427	>= REGEX_ASCII_RESTRICTED_CHARSET)
	428	#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
	429	== REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	430
	431	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	432
	433	/* For programs that want to be strictly Unicode compatible by dying if any
	434	* attempt is made to match a non-Unicode code point against a Unicode
	435	* property. */
	436	#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
	437
	438	#define OOB_NAMEDCLASS -1
	439
	440	/* There is no code point that is out-of-bounds, so this is problematic. But
	441	* its only current use is to initialize a variable that is always set before
	442	* looked at. */
	443	#define OOB_UNICODE 0xDEADBEEF
	444
	445	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	446	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	447
	448
	449	/* length of regex to show in messages that don't mark a position within */
	450	#define RegexLengthToShowInErrorMessages 127
	451
	452	/*
	453	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	454	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	455	* op/pragma/warn/regcomp.
	456	*/
	457	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	458	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	459
	460	#define REPORT_LOCATION " in regex; marked by " MARKER1 \
	461	" in m/%"UTF8f MARKER2 "%"UTF8f"/"
	462
	463	#define REPORT_LOCATION_ARGS(offset) \
	464	UTF8fARG(UTF, offset, RExC_precomp), \
	465	UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
	466
	467	/*
	468	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	469	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	470	* "...".
	471	*/
	472	#define _FAIL(code) STMT_START { \
	473	const char *ellipses = ""; \
	474	IV len = RExC_end - RExC_precomp; \
	475	\
	476	if (!SIZE_ONLY) \
	477	SAVEFREESV(RExC_rx_sv); \
	478	if (len > RegexLengthToShowInErrorMessages) { \
	479	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	480	len = RegexLengthToShowInErrorMessages - 10; \
	481	ellipses = "..."; \
	482	} \
	483	code; \
	484	} STMT_END
	485
	486	#define FAIL(msg) _FAIL( \
	487	Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
	488	msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
	489
	490	#define FAIL2(msg,arg) _FAIL( \
	491	Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
	492	arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
	493
	494	/*
	495	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	496	*/
	497	#define Simple_vFAIL(m) STMT_START { \
	498	const IV offset = RExC_parse - RExC_precomp; \
	499	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	500	m, REPORT_LOCATION_ARGS(offset)); \
	501	} STMT_END
	502
	503	/*
	504	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	505	*/
	506	#define vFAIL(m) STMT_START { \
	507	if (!SIZE_ONLY) \
	508	SAVEFREESV(RExC_rx_sv); \
	509	Simple_vFAIL(m); \
	510	} STMT_END
	511
	512	/*
	513	* Like Simple_vFAIL(), but accepts two arguments.
	514	*/
	515	#define Simple_vFAIL2(m,a1) STMT_START { \
	516	const IV offset = RExC_parse - RExC_precomp; \
	517	S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
	518	REPORT_LOCATION_ARGS(offset)); \
	519	} STMT_END
	520
	521	/*
	522	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	523	*/
	524	#define vFAIL2(m,a1) STMT_START { \
	525	if (!SIZE_ONLY) \
	526	SAVEFREESV(RExC_rx_sv); \
	527	Simple_vFAIL2(m, a1); \
	528	} STMT_END
	529
	530
	531	/*
	532	* Like Simple_vFAIL(), but accepts three arguments.
	533	*/
	534	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	535	const IV offset = RExC_parse - RExC_precomp; \
	536	S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
	537	REPORT_LOCATION_ARGS(offset)); \
	538	} STMT_END
	539
	540	/*
	541	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	542	*/
	543	#define vFAIL3(m,a1,a2) STMT_START { \
	544	if (!SIZE_ONLY) \
	545	SAVEFREESV(RExC_rx_sv); \
	546	Simple_vFAIL3(m, a1, a2); \
	547	} STMT_END
	548
	549	/*
	550	* Like Simple_vFAIL(), but accepts four arguments.
	551	*/
	552	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	553	const IV offset = RExC_parse - RExC_precomp; \
	554	S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
	555	REPORT_LOCATION_ARGS(offset)); \
	556	} STMT_END
	557
	558	#define vFAIL4(m,a1,a2,a3) STMT_START { \
	559	if (!SIZE_ONLY) \
	560	SAVEFREESV(RExC_rx_sv); \
	561	Simple_vFAIL4(m, a1, a2, a3); \
	562	} STMT_END
	563
	564	/* A specialized version of vFAIL2 that works with UTF8f */
	565	#define vFAIL2utf8f(m, a1) STMT_START { \
	566	const IV offset = RExC_parse - RExC_precomp; \
	567	if (!SIZE_ONLY) \
	568	SAVEFREESV(RExC_rx_sv); \
	569	S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
	570	REPORT_LOCATION_ARGS(offset)); \
	571	} STMT_END
	572
	573
	574	/* m is not necessarily a "literal string", in this macro */
	575	#define reg_warn_non_literal_string(loc, m) STMT_START { \
	576	const IV offset = loc - RExC_precomp; \
	577	Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
	578	m, REPORT_LOCATION_ARGS(offset)); \
	579	} STMT_END
	580
	581	#define ckWARNreg(loc,m) STMT_START { \
	582	const IV offset = loc - RExC_precomp; \
	583	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	584	REPORT_LOCATION_ARGS(offset)); \
	585	} STMT_END
	586
	587	#define vWARN_dep(loc, m) STMT_START { \
	588	const IV offset = loc - RExC_precomp; \
	589	Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
	590	REPORT_LOCATION_ARGS(offset)); \
	591	} STMT_END
	592
	593	#define ckWARNdep(loc,m) STMT_START { \
	594	const IV offset = loc - RExC_precomp; \
	595	Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
	596	m REPORT_LOCATION, \
	597	REPORT_LOCATION_ARGS(offset)); \
	598	} STMT_END
	599
	600	#define ckWARNregdep(loc,m) STMT_START { \
	601	const IV offset = loc - RExC_precomp; \
	602	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	603	m REPORT_LOCATION, \
	604	REPORT_LOCATION_ARGS(offset)); \
	605	} STMT_END
	606
	607	#define ckWARN2reg_d(loc,m, a1) STMT_START { \
	608	const IV offset = loc - RExC_precomp; \
	609	Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
	610	m REPORT_LOCATION, \
	611	a1, REPORT_LOCATION_ARGS(offset)); \
	612	} STMT_END
	613
	614	#define ckWARN2reg(loc, m, a1) STMT_START { \
	615	const IV offset = loc - RExC_precomp; \
	616	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	617	a1, REPORT_LOCATION_ARGS(offset)); \
	618	} STMT_END
	619
	620	#define vWARN3(loc, m, a1, a2) STMT_START { \
	621	const IV offset = loc - RExC_precomp; \
	622	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	623	a1, a2, REPORT_LOCATION_ARGS(offset)); \
	624	} STMT_END
	625
	626	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	627	const IV offset = loc - RExC_precomp; \
	628	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	629	a1, a2, REPORT_LOCATION_ARGS(offset)); \
	630	} STMT_END
	631
	632	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	633	const IV offset = loc - RExC_precomp; \
	634	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	635	a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
	636	} STMT_END
	637
	638	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	639	const IV offset = loc - RExC_precomp; \
	640	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	641	a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
	642	} STMT_END
	643
	644	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	645	const IV offset = loc - RExC_precomp; \
	646	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	647	a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
	648	} STMT_END
	649
	650
	651	/* Allow for side effects in s */
	652	#define REGC(c,s) STMT_START { \
	653	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	654	} STMT_END
	655
	656	/* Macros for recording node offsets. 20001227 mjd@plover.com
	657	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	658	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	659	* Element 0 holds the number n.
	660	* Position is 1 indexed.
	661	*/
	662	#ifndef RE_TRACK_PATTERN_OFFSETS
	663	#define Set_Node_Offset_To_R(node,byte)
	664	#define Set_Node_Offset(node,byte)
	665	#define Set_Cur_Node_Offset
	666	#define Set_Node_Length_To_R(node,len)
	667	#define Set_Node_Length(node,len)
	668	#define Set_Node_Cur_Length(node,start)
	669	#define Node_Offset(n)
	670	#define Node_Length(n)
	671	#define Set_Node_Offset_Length(node,offset,len)
	672	#define ProgLen(ri) ri->u.proglen
	673	#define SetProgLen(ri,x) ri->u.proglen = x
	674	#else
	675	#define ProgLen(ri) ri->u.offsets[0]
	676	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	677	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	678	if (! SIZE_ONLY) { \
	679	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	680	__LINE__, (int)(node), (int)(byte))); \
	681	if((node) < 0) { \
	682	Perl_croak(aTHX_ "value of node is %d in Offset macro", \
	683	(int)(node)); \
	684	} else { \
	685	RExC_offsets[2*(node)-1] = (byte); \
	686	} \
	687	} \
	688	} STMT_END
	689
	690	#define Set_Node_Offset(node,byte) \
	691	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	692	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	693
	694	#define Set_Node_Length_To_R(node,len) STMT_START { \
	695	if (! SIZE_ONLY) { \
	696	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	697	__LINE__, (int)(node), (int)(len))); \
	698	if((node) < 0) { \
	699	Perl_croak(aTHX_ "value of node is %d in Length macro", \
	700	(int)(node)); \
	701	} else { \
	702	RExC_offsets[2*(node)] = (len); \
	703	} \
	704	} \
	705	} STMT_END
	706
	707	#define Set_Node_Length(node,len) \
	708	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	709	#define Set_Node_Cur_Length(node, start) \
	710	Set_Node_Length(node, RExC_parse - start)
	711
	712	/* Get offsets and lengths */
	713	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	714	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	715
	716	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	717	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	718	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	719	} STMT_END
	720	#endif
	721
	722	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	723	#define EXPERIMENTAL_INPLACESCAN
	724	#endif /PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS/
	725
	726	#define DEBUG_RExC_seen() \
	727	DEBUG_OPTIMISE_MORE_r({ \
	728	PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
	729	\
	730	if (RExC_seen & REG_ZERO_LEN_SEEN) \
	731	PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
	732	\
	733	if (RExC_seen & REG_LOOKBEHIND_SEEN) \
	734	PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
	735	\
	736	if (RExC_seen & REG_GPOS_SEEN) \
	737	PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
	738	\
	739	if (RExC_seen & REG_CANY_SEEN) \
	740	PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
	741	\
	742	if (RExC_seen & REG_RECURSE_SEEN) \
	743	PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
	744	\
	745	if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
	746	PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
	747	\
	748	if (RExC_seen & REG_VERBARG_SEEN) \
	749	PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
	750	\
	751	if (RExC_seen & REG_CUTGROUP_SEEN) \
	752	PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
	753	\
	754	if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
	755	PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
	756	\
	757	if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
	758	PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
	759	\
	760	if (RExC_seen & REG_GOSTART_SEEN) \
	761	PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
	762	\
	763	if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
	764	PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
	765	\
	766	PerlIO_printf(Perl_debug_log,"\n"); \
	767	});
	768
	769	#define DEBUG_STUDYDATA(str,data,depth) \
	770	DEBUG_OPTIMISE_MORE_r(if(data){ \
	771	PerlIO_printf(Perl_debug_log, \
	772	"%*s" str "Pos:%"IVdf"/%"IVdf \
	773	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	774	(int)(depth)*2, "", \
	775	(IV)((data)->pos_min), \
	776	(IV)((data)->pos_delta), \
	777	(UV)((data)->flags), \
	778	(IV)((data)->whilem_c), \
	779	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	780	is_inf ? "INF " : "" \
	781	); \
	782	if ((data)->last_found) \
	783	PerlIO_printf(Perl_debug_log, \
	784	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	785	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	786	SvPVX_const((data)->last_found), \
	787	(IV)((data)->last_end), \
	788	(IV)((data)->last_start_min), \
	789	(IV)((data)->last_start_max), \
	790	((data)->longest && \
	791	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	792	SvPVX_const((data)->longest_fixed), \
	793	(IV)((data)->offset_fixed), \
	794	((data)->longest && \
	795	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	796	SvPVX_const((data)->longest_float), \
	797	(IV)((data)->offset_float_min), \
	798	(IV)((data)->offset_float_max) \
	799	); \
	800	PerlIO_printf(Perl_debug_log,"\n"); \
	801	});
	802
	803	/* Mark that we cannot extend a found fixed substring at this point.
	804	Update the longest found anchored substring and the longest found
	805	floating substrings if needed. */
	806
	807	STATIC void
	808	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data,
	809	SSize_t *minlenp, int is_inf)
	810	{
	811	const STRLEN l = CHR_SVLEN(data->last_found);
	812	const STRLEN old_l = CHR_SVLEN(*data->longest);
	813	GET_RE_DEBUG_FLAGS_DECL;
	814
	815	PERL_ARGS_ASSERT_SCAN_COMMIT;
	816
	817	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	818	SvSetMagicSV(*data->longest, data->last_found);
	819	if (*data->longest == data->longest_fixed) {
	820	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	821	if (data->flags & SF_BEFORE_EOL)
	822	data->flags
	823	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	824	else
	825	data->flags &= ~SF_FIX_BEFORE_EOL;
	826	data->minlen_fixed=minlenp;
	827	data->lookbehind_fixed=0;
	828	}
	829	else { /* data->longest == data->longest_float /
	830	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	831	data->offset_float_max = (l
	832	? data->last_start_max
	833	: (data->pos_delta == SSize_t_MAX
	834	? SSize_t_MAX
	835	: data->pos_min + data->pos_delta));
	836	if (is_inf
	837	\|\| (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
	838	data->offset_float_max = SSize_t_MAX;
	839	if (data->flags & SF_BEFORE_EOL)
	840	data->flags
	841	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	842	else
	843	data->flags &= ~SF_FL_BEFORE_EOL;
	844	data->minlen_float=minlenp;
	845	data->lookbehind_float=0;
	846	}
	847	}
	848	SvCUR_set(data->last_found, 0);
	849	{
	850	SV * const sv = data->last_found;
	851	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	852	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	853	if (mg)
	854	mg->mg_len = 0;
	855	}
	856	}
	857	data->last_end = -1;
	858	data->flags &= ~SF_BEFORE_EOL;
	859	DEBUG_STUDYDATA("commit: ",data,0);
	860	}
	861
	862	/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
	863	* list that describes which code points it matches */
	864
	865	STATIC void
	866	S_ssc_anything(pTHX_ regnode_ssc *ssc)
	867	{
	868	/* Set the SSC 'ssc' to match an empty string or any code point */
	869
	870	PERL_ARGS_ASSERT_SSC_ANYTHING;
	871
	872	assert(is_ANYOF_SYNTHETIC(ssc));
	873
	874	ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
	875	_append_range_to_invlist(ssc->invlist, 0, UV_MAX);
	876	ANYOF_FLAGS(ssc) \|= ANYOF_EMPTY_STRING; /* Plus match empty string */
	877	}
	878
	879	STATIC int
	880	S_ssc_is_anything(const regnode_ssc *ssc)
	881	{
	882	/* Returns TRUE if the SSC 'ssc' can match the empty string and any code
	883	* point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
	884	* us anything: if the function returns TRUE, 'ssc' hasn't been restricted
	885	* in any way, so there's no point in using it */
	886
	887	UV start, end;
	888	bool ret;
	889
	890	PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
	891
	892	assert(is_ANYOF_SYNTHETIC(ssc));
	893
	894	if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
	895	return FALSE;
	896	}
	897
	898	/* See if the list consists solely of the range 0 - Infinity */
	899	invlist_iterinit(ssc->invlist);
	900	ret = invlist_iternext(ssc->invlist, &start, &end)
	901	&& start == 0
	902	&& end == UV_MAX;
	903
	904	invlist_iterfinish(ssc->invlist);
	905
	906	if (ret) {
	907	return TRUE;
	908	}
	909
	910	/* If e.g., both \w and \W are set, matches everything */
	911	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	912	int i;
	913	for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
	914	if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
	915	return TRUE;
	916	}
	917	}
	918	}
	919
	920	return FALSE;
	921	}
	922
	923	STATIC void
	924	S_ssc_init(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc)
	925	{
	926	/* Initializes the SSC 'ssc'. This includes setting it to match an empty
	927	* string, any code point, or any posix class under locale */
	928
	929	PERL_ARGS_ASSERT_SSC_INIT;
	930
	931	Zero(ssc, 1, regnode_ssc);
	932	set_ANYOF_SYNTHETIC(ssc);
	933	ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
	934	ssc_anything(ssc);
	935
	936	/* If any portion of the regex is to operate under locale rules,
	937	* initialization includes it. The reason this isn't done for all regexes
	938	* is that the optimizer was written under the assumption that locale was
	939	* all-or-nothing. Given the complexity and lack of documentation in the
	940	* optimizer, and that there are inadequate test cases for locale, many
	941	* parts of it may not work properly, it is safest to avoid locale unless
	942	* necessary. */
	943	if (RExC_contains_locale) {
	944	ANYOF_POSIXL_SETALL(ssc);
	945	}
	946	else {
	947	ANYOF_POSIXL_ZERO(ssc);
	948	}
	949	}
	950
	951	STATIC int
	952	S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
	953	const regnode_ssc *ssc)
	954	{
	955	/* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
	956	* to the list of code points matched, and locale posix classes; hence does
	957	* not check its flags) */
	958
	959	UV start, end;
	960	bool ret;
	961
	962	PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
	963
	964	assert(is_ANYOF_SYNTHETIC(ssc));
	965
	966	invlist_iterinit(ssc->invlist);
	967	ret = invlist_iternext(ssc->invlist, &start, &end)
	968	&& start == 0
	969	&& end == UV_MAX;
	970
	971	invlist_iterfinish(ssc->invlist);
	972
	973	if (! ret) {
	974	return FALSE;
	975	}
	976
	977	if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
	978	return FALSE;
	979	}
	980
	981	return TRUE;
	982	}
	983
	984	STATIC SV*
	985	S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
	986	const regnode_charclass* const node)
	987	{
	988	/* Returns a mortal inversion list defining which code points are matched
	989	* by 'node', which is of type ANYOF. Handles complementing the result if
	990	* appropriate. If some code points aren't knowable at this time, the
	991	* returned list must, and will, contain every code point that is a
	992	* possibility. */
	993
	994	SV* invlist = sv_2mortal(_new_invlist(0));
	995	SV* only_utf8_locale_invlist = NULL;
	996	unsigned int i;
	997	const U32 n = ARG(node);
	998	bool new_node_has_latin1 = FALSE;
	999
	1000	PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
	1001
	1002	/* Look at the data structure created by S_set_ANYOF_arg() */
	1003	if (n != ANYOF_NONBITMAP_EMPTY) {
	1004	SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
	1005	AV * const av = MUTABLE_AV(SvRV(rv));
	1006	SV **const ary = AvARRAY(av);
	1007	assert(RExC_rxi->data->what[n] == 's');
	1008
	1009	if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
	1010	invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
	1011	}
	1012	else if (ary[0] && ary[0] != &PL_sv_undef) {
	1013
	1014	/* Here, no compile-time swash, and there are things that won't be
	1015	* known until runtime -- we have to assume it could be anything */
	1016	return _add_range_to_invlist(invlist, 0, UV_MAX);
	1017	}
	1018	else if (ary[3] && ary[3] != &PL_sv_undef) {
	1019
	1020	/* Here no compile-time swash, and no run-time only data. Use the
	1021	* node's inversion list */
	1022	invlist = sv_2mortal(invlist_clone(ary[3]));
	1023	}
	1024
	1025	/* Get the code points valid only under UTF-8 locales */
	1026	if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
	1027	&& ary[2] && ary[2] != &PL_sv_undef)
	1028	{
	1029	only_utf8_locale_invlist = ary[2];
	1030	}
	1031	}
	1032
	1033	/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
	1034	* code points, and an inversion list for the others, but if there are code
	1035	* points that should match only conditionally on the target string being
	1036	* UTF-8, those are placed in the inversion list, and not the bitmap.
	1037	* Since there are circumstances under which they could match, they are
	1038	* included in the SSC. But if the ANYOF node is to be inverted, we have
	1039	* to exclude them here, so that when we invert below, the end result
	1040	* actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
	1041	* have to do this here before we add the unconditionally matched code
	1042	* points */
	1043	if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
	1044	_invlist_intersection_complement_2nd(invlist,
	1045	PL_UpperLatin1,
	1046	&invlist);
	1047	}
	1048
	1049	/* Add in the points from the bit map */
	1050	for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
	1051	if (ANYOF_BITMAP_TEST(node, i)) {
	1052	invlist = add_cp_to_invlist(invlist, i);
	1053	new_node_has_latin1 = TRUE;
	1054	}
	1055	}
	1056
	1057	/* If this can match all upper Latin1 code points, have to add them
	1058	* as well */
	1059	if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
	1060	_invlist_union(invlist, PL_UpperLatin1, &invlist);
	1061	}
	1062
	1063	/* Similarly for these */
	1064	if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
	1065	invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
	1066	}
	1067
	1068	if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
	1069	_invlist_invert(invlist);
	1070	}
	1071	else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
	1072
	1073	/* Under /li, any 0-255 could fold to any other 0-255, depending on the
	1074	* locale. We can skip this if there are no 0-255 at all. */
	1075	_invlist_union(invlist, PL_Latin1, &invlist);
	1076	}
	1077
	1078	/* Similarly add the UTF-8 locale possible matches. These have to be
	1079	* deferred until after the non-UTF-8 locale ones are taken care of just
	1080	* above, or it leads to wrong results under ANYOF_INVERT */
	1081	if (only_utf8_locale_invlist) {
	1082	_invlist_union_maybe_complement_2nd(invlist,
	1083	only_utf8_locale_invlist,
	1084	ANYOF_FLAGS(node) & ANYOF_INVERT,
	1085	&invlist);
	1086	}
	1087
	1088	return invlist;
	1089	}
	1090
	1091	/* These two functions currently do the exact same thing */
	1092	#define ssc_init_zero ssc_init
	1093
	1094	#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
	1095	#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
	1096
	1097	/* 'AND' a given class with another one. Can create false positives. 'ssc'
	1098	* should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
	1099	* 'and_with' is a regnode_charclass instead of a regnode_ssc. */
	1100
	1101	STATIC void
	1102	S_ssc_and(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc,
	1103	const regnode_charclass *and_with)
	1104	{
	1105	/* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
	1106	* another SSC or a regular ANYOF class. Can create false positives. */
	1107
	1108	SV* anded_cp_list;
	1109	U8 anded_flags;
	1110
	1111	PERL_ARGS_ASSERT_SSC_AND;
	1112
	1113	assert(is_ANYOF_SYNTHETIC(ssc));
	1114
	1115	/* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
	1116	* the code point inversion list and just the relevant flags */
	1117	if (is_ANYOF_SYNTHETIC(and_with)) {
	1118	anded_cp_list = ((regnode_ssc *)and_with)->invlist;
	1119	anded_flags = ANYOF_FLAGS(and_with);
	1120
	1121	/* XXX This is a kludge around what appears to be deficiencies in the
	1122	* optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
	1123	* there are paths through the optimizer where it doesn't get weeded
	1124	* out when it should. And if we don't make some extra provision for
	1125	* it like the code just below, it doesn't get added when it should.
	1126	* This solution is to add it only when AND'ing, which is here, and
	1127	* only when what is being AND'ed is the pristine, original node
	1128	* matching anything. Thus it is like adding it to ssc_anything() but
	1129	* only when the result is to be AND'ed. Probably the same solution
	1130	* could be adopted for the same problem we have with /l matching,
	1131	* which is solved differently in S_ssc_init(), and that would lead to
	1132	* fewer false positives than that solution has. But if this solution
	1133	* creates bugs, the consequences are only that a warning isn't raised
	1134	* that should be; while the consequences for having /l bugs is
	1135	* incorrect matches */
	1136	if (ssc_is_anything((regnode_ssc *)and_with)) {
	1137	anded_flags \|= ANYOF_WARN_SUPER;
	1138	}
	1139	}
	1140	else {
	1141	anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
	1142	anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
	1143	}
	1144
	1145	ANYOF_FLAGS(ssc) &= anded_flags;
	1146
	1147	/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
	1148	* C2 is the list of code points in 'and-with'; P2, its posix classes.
	1149	* 'and_with' may be inverted. When not inverted, we have the situation of
	1150	* computing:
	1151	* (C1 \| P1) & (C2 \| P2)
	1152	* = (C1 & (C2 \| P2)) \| (P1 & (C2 \| P2))
	1153	* = ((C1 & C2) \| (C1 & P2)) \| ((P1 & C2) \| (P1 & P2))
	1154	* <= ((C1 & C2) \| P2)) \| ( P1 \| (P1 & P2))
	1155	* <= ((C1 & C2) \| P1 \| P2)
	1156	* Alternatively, the last few steps could be:
	1157	* = ((C1 & C2) \| (C1 & P2)) \| ((P1 & C2) \| (P1 & P2))
	1158	* <= ((C1 & C2) \| C1 ) \| ( C2 \| (P1 & P2))
	1159	* <= (C1 \| C2 \| (P1 & P2))
	1160	* We favor the second approach if either P1 or P2 is non-empty. This is
	1161	* because these components are a barrier to doing optimizations, as what
	1162	* they match cannot be known until the moment of matching as they are
	1163	* dependent on the current locale, 'AND"ing them likely will reduce or
	1164	* eliminate them.
	1165	* But we can do better if we know that C1,P1 are in their initial state (a
	1166	* frequent occurrence), each matching everything:
	1167	* (<everything>) & (C2 \| P2) = C2 \| P2
	1168	* Similarly, if C2,P2 are in their initial state (again a frequent
	1169	* occurrence), the result is a no-op
	1170	* (C1 \| P1) & (<everything>) = C1 \| P1
	1171	*
	1172	* Inverted, we have
	1173	* (C1 \| P1) & ~(C2 \| P2) = (C1 \| P1) & (~C2 & ~P2)
	1174	* = (C1 & (~C2 & ~P2)) \| (P1 & (~C2 & ~P2))
	1175	* <= (C1 & ~C2) \| (P1 & ~P2)
	1176	* */
	1177
	1178	if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
	1179	&& ! is_ANYOF_SYNTHETIC(and_with))
	1180	{
	1181	unsigned int i;
	1182
	1183	ssc_intersection(ssc,
	1184	anded_cp_list,
	1185	FALSE /* Has already been inverted */
	1186	);
	1187
	1188	/* If either P1 or P2 is empty, the intersection will be also; can skip
	1189	* the loop */
	1190	if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
	1191	ANYOF_POSIXL_ZERO(ssc);
	1192	}
	1193	else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	1194
	1195	/* Note that the Posix class component P from 'and_with' actually
	1196	* looks like:
	1197	* P = Pa \| Pb \| ... \| Pn
	1198	* where each component is one posix class, such as in [\w\s].
	1199	* Thus
	1200	* ~P = ~(Pa \| Pb \| ... \| Pn)
	1201	* = ~Pa & ~Pb & ... & ~Pn
	1202	* <= ~Pa \| ~Pb \| ... \| ~Pn
	1203	* The last is something we can easily calculate, but unfortunately
	1204	* is likely to have many false positives. We could do better
	1205	* in some (but certainly not all) instances if two classes in
	1206	* P have known relationships. For example
	1207	* :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
	1208	* So
	1209	* :lower: & :print: = :lower:
	1210	* And similarly for classes that must be disjoint. For example,
	1211	* since \s and \w can have no elements in common based on rules in
	1212	* the POSIX standard,
	1213	* \w & ^\S = nothing
	1214	* Unfortunately, some vendor locales do not meet the Posix
	1215	* standard, in particular almost everything by Microsoft.
	1216	* The loop below just changes e.g., \w into \W and vice versa */
	1217
	1218	regnode_charclass_posixl temp;
	1219	int add = 1; /* To calculate the index of the complement */
	1220
	1221	ANYOF_POSIXL_ZERO(&temp);
	1222	for (i = 0; i < ANYOF_MAX; i++) {
	1223	assert(i % 2 != 0
	1224	\|\| ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
	1225	\|\| ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
	1226
	1227	if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
	1228	ANYOF_POSIXL_SET(&temp, i + add);
	1229	}
	1230	add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
	1231	}
	1232	ANYOF_POSIXL_AND(&temp, ssc);
	1233
	1234	} /* else ssc already has no posixes */
	1235	} /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
	1236	in its initial state */
	1237	else if (! is_ANYOF_SYNTHETIC(and_with)
	1238	\|\| ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
	1239	{
	1240	/* But if 'ssc' is in its initial state, the result is just 'and_with';
	1241	* copy it over 'ssc' */
	1242	if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
	1243	if (is_ANYOF_SYNTHETIC(and_with)) {
	1244	StructCopy(and_with, ssc, regnode_ssc);
	1245	}
	1246	else {
	1247	ssc->invlist = anded_cp_list;
	1248	ANYOF_POSIXL_ZERO(ssc);
	1249	if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
	1250	ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
	1251	}
	1252	}
	1253	}
	1254	else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
	1255	\|\| (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
	1256	{
	1257	/* One or the other of P1, P2 is non-empty. */
	1258	if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
	1259	ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
	1260	}
	1261	ssc_union(ssc, anded_cp_list, FALSE);
	1262	}
	1263	else { /* P1 = P2 = empty */
	1264	ssc_intersection(ssc, anded_cp_list, FALSE);
	1265	}
	1266	}
	1267	}
	1268
	1269	STATIC void
	1270	S_ssc_or(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc,
	1271	const regnode_charclass *or_with)
	1272	{
	1273	/* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
	1274	* another SSC or a regular ANYOF class. Can create false positives if
	1275	* 'or_with' is to be inverted. */
	1276
	1277	SV* ored_cp_list;
	1278	U8 ored_flags;
	1279
	1280	PERL_ARGS_ASSERT_SSC_OR;
	1281
	1282	assert(is_ANYOF_SYNTHETIC(ssc));
	1283
	1284	/* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
	1285	* the code point inversion list and just the relevant flags */
	1286	if (is_ANYOF_SYNTHETIC(or_with)) {
	1287	ored_cp_list = ((regnode_ssc*) or_with)->invlist;
	1288	ored_flags = ANYOF_FLAGS(or_with);
	1289	}
	1290	else {
	1291	ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
	1292	ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
	1293	}
	1294
	1295	ANYOF_FLAGS(ssc) \|= ored_flags;
	1296
	1297	/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
	1298	* C2 is the list of code points in 'or-with'; P2, its posix classes.
	1299	* 'or_with' may be inverted. When not inverted, we have the simple
	1300	* situation of computing:
	1301	* (C1 \| P1) \| (C2 \| P2) = (C1 \| C2) \| (P1 \| P2)
	1302	* If P1\|P2 yields a situation with both a class and its complement are
	1303	* set, like having both \w and \W, this matches all code points, and we
	1304	* can delete these from the P component of the ssc going forward. XXX We
	1305	* might be able to delete all the P components, but I (khw) am not certain
	1306	* about this, and it is better to be safe.
	1307	*
	1308	* Inverted, we have
	1309	* (C1 \| P1) \| ~(C2 \| P2) = (C1 \| P1) \| (~C2 & ~P2)
	1310	* <= (C1 \| P1) \| ~C2
	1311	* <= (C1 \| ~C2) \| P1
	1312	* (which results in actually simpler code than the non-inverted case)
	1313	* */
	1314
	1315	if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
	1316	&& ! is_ANYOF_SYNTHETIC(or_with))
	1317	{
	1318	/* We ignore P2, leaving P1 going forward */
	1319	} /* else Not inverted */
	1320	else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
	1321	ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
	1322	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	1323	unsigned int i;
	1324	for (i = 0; i < ANYOF_MAX; i += 2) {
	1325	if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
	1326	{
	1327	ssc_match_all_cp(ssc);
	1328	ANYOF_POSIXL_CLEAR(ssc, i);
	1329	ANYOF_POSIXL_CLEAR(ssc, i+1);
	1330	}
	1331	}
	1332	}
	1333	}
	1334
	1335	ssc_union(ssc,
	1336	ored_cp_list,
	1337	FALSE /* Already has been inverted */
	1338	);
	1339	}
	1340
	1341	PERL_STATIC_INLINE void
	1342	S_ssc_union(pTHX_ regnode_ssc ssc, SV const invlist, const bool invert2nd)
	1343	{
	1344	PERL_ARGS_ASSERT_SSC_UNION;
	1345
	1346	assert(is_ANYOF_SYNTHETIC(ssc));
	1347
	1348	_invlist_union_maybe_complement_2nd(ssc->invlist,
	1349	invlist,
	1350	invert2nd,
	1351	&ssc->invlist);
	1352	}
	1353
	1354	PERL_STATIC_INLINE void
	1355	S_ssc_intersection(pTHX_ regnode_ssc *ssc,
	1356	SV* const invlist,
	1357	const bool invert2nd)
	1358	{
	1359	PERL_ARGS_ASSERT_SSC_INTERSECTION;
	1360
	1361	assert(is_ANYOF_SYNTHETIC(ssc));
	1362
	1363	_invlist_intersection_maybe_complement_2nd(ssc->invlist,
	1364	invlist,
	1365	invert2nd,
	1366	&ssc->invlist);
	1367	}
	1368
	1369	PERL_STATIC_INLINE void
	1370	S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
	1371	{
	1372	PERL_ARGS_ASSERT_SSC_ADD_RANGE;
	1373
	1374	assert(is_ANYOF_SYNTHETIC(ssc));
	1375
	1376	ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
	1377	}
	1378
	1379	PERL_STATIC_INLINE void
	1380	S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
	1381	{
	1382	/* AND just the single code point 'cp' into the SSC 'ssc' */
	1383
	1384	SV* cp_list = _new_invlist(2);
	1385
	1386	PERL_ARGS_ASSERT_SSC_CP_AND;
	1387
	1388	assert(is_ANYOF_SYNTHETIC(ssc));
	1389
	1390	cp_list = add_cp_to_invlist(cp_list, cp);
	1391	ssc_intersection(ssc, cp_list,
	1392	FALSE /* Not inverted */
	1393	);
	1394	SvREFCNT_dec_NN(cp_list);
	1395	}
	1396
	1397	PERL_STATIC_INLINE void
	1398	S_ssc_clear_locale(regnode_ssc *ssc)
	1399	{
	1400	/* Set the SSC 'ssc' to not match any locale things */
	1401	PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
	1402
	1403	assert(is_ANYOF_SYNTHETIC(ssc));
	1404
	1405	ANYOF_POSIXL_ZERO(ssc);
	1406	ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
	1407	}
	1408
	1409	STATIC void
	1410	S_ssc_finalize(pTHX_ RExC_state_t pRExC_state, regnode_ssc ssc)
	1411	{
	1412	/* The inversion list in the SSC is marked mortal; now we need a more
	1413	* permanent copy, which is stored the same way that is done in a regular
	1414	* ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
	1415	* map */
	1416
	1417	SV* invlist = invlist_clone(ssc->invlist);
	1418
	1419	PERL_ARGS_ASSERT_SSC_FINALIZE;
	1420
	1421	assert(is_ANYOF_SYNTHETIC(ssc));
	1422
	1423	/* The code in this file assumes that all but these flags aren't relevant
	1424	* to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
	1425	* time we reach here */
	1426	assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
	1427
	1428	populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
	1429
	1430	set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
	1431	NULL, NULL, NULL, FALSE);
	1432
	1433	/* Make sure is clone-safe */
	1434	ssc->invlist = NULL;
	1435
	1436	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	1437	ANYOF_FLAGS(ssc) \|= ANYOF_POSIXL;
	1438	}
	1439
	1440	assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) \|\| RExC_contains_locale);
	1441	}
	1442
	1443	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	1444	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	1445	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	1446	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
	1447	? (TRIE_LIST_CUR( idx ) - 1) \
	1448	: 0 )
	1449
	1450
	1451	#ifdef DEBUGGING
	1452	/*
	1453	dump_trie(trie,widecharmap,revcharmap)
	1454	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1455	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1456
	1457	These routines dump out a trie in a somewhat readable format.
	1458	The _interim_ variants are used for debugging the interim
	1459	tables that are used to generate the final compressed
	1460	representation which is what dump_trie expects.
	1461
	1462	Part of the reason for their existence is to provide a form
	1463	of documentation as to how the different representations function.
	1464
	1465	*/
	1466
	1467	/*
	1468	Dumps the final compressed table form of the trie to Perl_debug_log.
	1469	Used for debugging make_trie().
	1470	*/
	1471
	1472	STATIC void
	1473	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1474	AV *revcharmap, U32 depth)
	1475	{
	1476	U32 state;
	1477	SV *sv=sv_newmortal();
	1478	int colwidth= widecharmap ? 6 : 4;
	1479	U16 word;
	1480	GET_RE_DEBUG_FLAGS_DECL;
	1481
	1482	PERL_ARGS_ASSERT_DUMP_TRIE;
	1483
	1484	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1485	(int)depth * 2 + 2,"",
	1486	"Match","Base","Ofs" );
	1487
	1488	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1489	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1490	if ( tmp ) {
	1491	PerlIO_printf( Perl_debug_log, "%*s",
	1492	colwidth,
	1493	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1494	PL_colors[0], PL_colors[1],
	1495	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1496	PERL_PV_ESCAPE_FIRSTCHAR
	1497	)
	1498	);
	1499	}
	1500	}
	1501	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1502	(int)depth * 2 + 2,"");
	1503
	1504	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1505	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1506	PerlIO_printf( Perl_debug_log, "\n");
	1507
	1508	for( state = 1 ; state < trie->statecount ; state++ ) {
	1509	const U32 base = trie->states[ state ].trans.base;
	1510
	1511	PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"\|",
	1512	(int)depth * 2 + 2,"", (UV)state);
	1513
	1514	if ( trie->states[ state ].wordnum ) {
	1515	PerlIO_printf( Perl_debug_log, " W%4X",
	1516	trie->states[ state ].wordnum );
	1517	} else {
	1518	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1519	}
	1520
	1521	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1522
	1523	if ( base ) {
	1524	U32 ofs = 0;
	1525
	1526	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1527	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1528	&& trie->trans[ base + ofs - trie->uniquecharcount ].check
	1529	!= state))
	1530	ofs++;
	1531
	1532	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1533
	1534	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1535	if ( ( base + ofs >= trie->uniquecharcount )
	1536	&& ( base + ofs - trie->uniquecharcount
	1537	< trie->lasttrans )
	1538	&& trie->trans[ base + ofs
	1539	- trie->uniquecharcount ].check == state )
	1540	{
	1541	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1542	colwidth,
	1543	(UV)trie->trans[ base + ofs
	1544	- trie->uniquecharcount ].next );
	1545	} else {
	1546	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1547	}
	1548	}
	1549
	1550	PerlIO_printf( Perl_debug_log, "]");
	1551
	1552	}
	1553	PerlIO_printf( Perl_debug_log, "\n" );
	1554	}
	1555	PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
	1556	(int)depth*2, "");
	1557	for (word=1; word <= trie->wordcount; word++) {
	1558	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1559	(int)word, (int)(trie->wordinfo[word].prev),
	1560	(int)(trie->wordinfo[word].len));
	1561	}
	1562	PerlIO_printf(Perl_debug_log, "\n" );
	1563	}
	1564	/*
	1565	Dumps a fully constructed but uncompressed trie in list form.
	1566	List tries normally only are used for construction when the number of
	1567	possible chars (trie->uniquecharcount) is very high.
	1568	Used for debugging make_trie().
	1569	*/
	1570	STATIC void
	1571	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1572	HV widecharmap, AV revcharmap, U32 next_alloc,
	1573	U32 depth)
	1574	{
	1575	U32 state;
	1576	SV *sv=sv_newmortal();
	1577	int colwidth= widecharmap ? 6 : 4;
	1578	GET_RE_DEBUG_FLAGS_DECL;
	1579
	1580	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1581
	1582	/* print out the table precompression. */
	1583	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1584	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1585	"------:-----+-----------------\n" );
	1586
	1587	for( state=1 ; state < next_alloc ; state ++ ) {
	1588	U16 charid;
	1589
	1590	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1591	(int)depth * 2 + 2,"", (UV)state );
	1592	if ( ! trie->states[ state ].wordnum ) {
	1593	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1594	} else {
	1595	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1596	trie->states[ state ].wordnum
	1597	);
	1598	}
	1599	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1600	SV ** const tmp = av_fetch( revcharmap,
	1601	TRIE_LIST_ITEM(state,charid).forid, 0);
	1602	if ( tmp ) {
	1603	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1604	colwidth,
	1605	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp),
	1606	colwidth,
	1607	PL_colors[0], PL_colors[1],
	1608	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
	1609	\| PERL_PV_ESCAPE_FIRSTCHAR
	1610	) ,
	1611	TRIE_LIST_ITEM(state,charid).forid,
	1612	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1613	);
	1614	if (!(charid % 10))
	1615	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1616	(int)((depth * 2) + 14), "");
	1617	}
	1618	}
	1619	PerlIO_printf( Perl_debug_log, "\n");
	1620	}
	1621	}
	1622
	1623	/*
	1624	Dumps a fully constructed but uncompressed trie in table form.
	1625	This is the normal DFA style state transition table, with a few
	1626	twists to facilitate compression later.
	1627	Used for debugging make_trie().
	1628	*/
	1629	STATIC void
	1630	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1631	HV widecharmap, AV revcharmap, U32 next_alloc,
	1632	U32 depth)
	1633	{
	1634	U32 state;
	1635	U16 charid;
	1636	SV *sv=sv_newmortal();
	1637	int colwidth= widecharmap ? 6 : 4;
	1638	GET_RE_DEBUG_FLAGS_DECL;
	1639
	1640	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1641
	1642	/*
	1643	print out the table precompression so that we can do a visual check
	1644	that they are identical.
	1645	*/
	1646
	1647	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1648
	1649	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1650	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1651	if ( tmp ) {
	1652	PerlIO_printf( Perl_debug_log, "%*s",
	1653	colwidth,
	1654	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1655	PL_colors[0], PL_colors[1],
	1656	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1657	PERL_PV_ESCAPE_FIRSTCHAR
	1658	)
	1659	);
	1660	}
	1661	}
	1662
	1663	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1664
	1665	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1666	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1667	}
	1668
	1669	PerlIO_printf( Perl_debug_log, "\n" );
	1670
	1671	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1672
	1673	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1674	(int)depth * 2 + 2,"",
	1675	(UV)TRIE_NODENUM( state ) );
	1676
	1677	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1678	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1679	if (v)
	1680	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1681	else
	1682	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1683	}
	1684	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1685	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
	1686	(UV)trie->trans[ state ].check );
	1687	} else {
	1688	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
	1689	(UV)trie->trans[ state ].check,
	1690	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1691	}
	1692	}
	1693	}
	1694
	1695	#endif
	1696
	1697
	1698	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1699	startbranch: the first branch in the whole branch sequence
	1700	first : start branch of sequence of branch-exact nodes.
	1701	May be the same as startbranch
	1702	last : Thing following the last branch.
	1703	May be the same as tail.
	1704	tail : item following the branch sequence
	1705	count : words in the sequence
	1706	flags : currently the OP() type we will be building one of /EXACT(\|F\|FA\|FU\|FU_SS)/
	1707	depth : indent depth
	1708
	1709	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1710
	1711	A trie is an N'ary tree where the branches are determined by digital
	1712	decomposition of the key. IE, at the root node you look up the 1st character and
	1713	follow that branch repeat until you find the end of the branches. Nodes can be
	1714	marked as "accepting" meaning they represent a complete word. Eg:
	1715
	1716	/he\|she\|his\|hers/
	1717
	1718	would convert into the following structure. Numbers represent states, letters
	1719	following numbers represent valid transitions on the letter from that state, if
	1720	the number is in square brackets it represents an accepting state, otherwise it
	1721	will be in parenthesis.
	1722
	1723	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	1724	\| \|
	1725	\| (2)
	1726	\| \|
	1727	(1) +-i->(6)-+-s->[7]
	1728	\|
	1729	+-s->(3)-+-h->(4)-+-e->[5]
	1730
	1731	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1732
	1733	This shows that when matching against the string 'hers' we will begin at state 1
	1734	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1735	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1736	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1737	single traverse. We store a mapping from accepting to state to which word was
	1738	matched, and then when we have multiple possibilities we try to complete the
	1739	rest of the regex in the order in which they occured in the alternation.
	1740
	1741	The only prior NFA like behaviour that would be changed by the TRIE support is
	1742	the silent ignoring of duplicate alternations which are of the form:
	1743
	1744	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1745
	1746	Thus EVAL blocks following a trie may be called a different number of times with
	1747	and without the optimisation. With the optimisations dupes will be silently
	1748	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1749	the following demonstrates:
	1750
	1751	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1752
	1753	which prints out 'word' three times, but
	1754
	1755	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1756
	1757	which doesnt print it out at all. This is due to other optimisations kicking in.
	1758
	1759	Example of what happens on a structural level:
	1760
	1761	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1762
	1763	1: CURLYM[1] {1,32767}(18)
	1764	5: BRANCH(8)
	1765	6: EXACT <ac>(16)
	1766	8: BRANCH(11)
	1767	9: EXACT <ad>(16)
	1768	11: BRANCH(14)
	1769	12: EXACT <ab>(16)
	1770	16: SUCCEED(0)
	1771	17: NOTHING(18)
	1772	18: END(0)
	1773
	1774	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1775	and should turn into:
	1776
	1777	1: CURLYM[1] {1,32767}(18)
	1778	5: TRIE(16)
	1779	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1780	<ac>
	1781	<ad>
	1782	<ab>
	1783	16: SUCCEED(0)
	1784	17: NOTHING(18)
	1785	18: END(0)
	1786
	1787	Cases where tail != last would be like /(?foo\|bar)baz/:
	1788
	1789	1: BRANCH(4)
	1790	2: EXACT <foo>(8)
	1791	4: BRANCH(7)
	1792	5: EXACT <bar>(8)
	1793	7: TAIL(8)
	1794	8: EXACT <baz>(10)
	1795	10: END(0)
	1796
	1797	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1798	and would end up looking like:
	1799
	1800	1: TRIE(8)
	1801	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1802	<foo>
	1803	<bar>
	1804	7: TAIL(8)
	1805	8: EXACT <baz>(10)
	1806	10: END(0)
	1807
	1808	d = uvchr_to_utf8_flags(d, uv, 0);
	1809
	1810	is the recommended Unicode-aware way of saying
	1811
	1812	*(d++) = uv;
	1813	*/
	1814
	1815	#define TRIE_STORE_REVCHAR(val) \
	1816	STMT_START { \
	1817	if (UTF) { \
	1818	SV zlopp = newSV(7); / XXX: optimize me */ \
	1819	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1820	unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
	1821	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1822	SvPOK_on(zlopp); \
	1823	SvUTF8_on(zlopp); \
	1824	av_push(revcharmap, zlopp); \
	1825	} else { \
	1826	char ooooff = (char)val; \
	1827	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1828	} \
	1829	} STMT_END
	1830
	1831	/* This gets the next character from the input, folding it if not already
	1832	* folded. */
	1833	#define TRIE_READ_CHAR STMT_START { \
	1834	wordlen++; \
	1835	if ( UTF ) { \
	1836	/* if it is UTF then it is either already folded, or does not need \
	1837	* folding */ \
	1838	uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
	1839	} \
	1840	else if (folder == PL_fold_latin1) { \
	1841	/* This folder implies Unicode rules, which in the range expressible \
	1842	* by not UTF is the lower case, with the two exceptions, one of \
	1843	* which should have been taken care of before calling this */ \
	1844	assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
	1845	uvc = toLOWER_L1(*uc); \
	1846	if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
	1847	len = 1; \
	1848	} else { \
	1849	/* raw data, will be folded later if needed */ \
	1850	uvc = (U32)*uc; \
	1851	len = 1; \
	1852	} \
	1853	} STMT_END
	1854
	1855
	1856
	1857	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1858	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1859	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1860	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1861	} \
	1862	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1863	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1864	TRIE_LIST_CUR( state )++; \
	1865	} STMT_END
	1866
	1867	#define TRIE_LIST_NEW(state) STMT_START { \
	1868	Newxz( trie->states[ state ].trans.list, \
	1869	4, reg_trie_trans_le ); \
	1870	TRIE_LIST_CUR( state ) = 1; \
	1871	TRIE_LIST_LEN( state ) = 4; \
	1872	} STMT_END
	1873
	1874	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1875	U16 dupe= trie->states[ state ].wordnum; \
	1876	regnode * const noper_next = regnext( noper ); \
	1877	\
	1878	DEBUG_r({ \
	1879	/* store the word for dumping */ \
	1880	SV* tmp; \
	1881	if (OP(noper) != NOTHING) \
	1882	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1883	else \
	1884	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1885	av_push( trie_words, tmp ); \
	1886	}); \
	1887	\
	1888	curword++; \
	1889	trie->wordinfo[curword].prev = 0; \
	1890	trie->wordinfo[curword].len = wordlen; \
	1891	trie->wordinfo[curword].accept = state; \
	1892	\
	1893	if ( noper_next < tail ) { \
	1894	if (!trie->jump) \
	1895	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
	1896	sizeof(U16) ); \
	1897	trie->jump[curword] = (U16)(noper_next - convert); \
	1898	if (!jumper) \
	1899	jumper = noper_next; \
	1900	if (!nextbranch) \
	1901	nextbranch= regnext(cur); \
	1902	} \
	1903	\
	1904	if ( dupe ) { \
	1905	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1906	/* chain, so that when the bits of chain are later */\
	1907	/* linked together, the dups appear in the chain */\
	1908	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1909	trie->wordinfo[dupe].prev = curword; \
	1910	} else { \
	1911	/* we haven't inserted this word yet. */ \
	1912	trie->states[ state ].wordnum = curword; \
	1913	} \
	1914	} STMT_END
	1915
	1916
	1917	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1918	( ( base + charid >= ucharcount \
	1919	&& base + charid < ubound \
	1920	&& state == trie->trans[ base - ucharcount + charid ].check \
	1921	&& trie->trans[ base - ucharcount + charid ].next ) \
	1922	? trie->trans[ base - ucharcount + charid ].next \
	1923	: ( state==1 ? special : 0 ) \
	1924	)
	1925
	1926	#define MADE_TRIE 1
	1927	#define MADE_JUMP_TRIE 2
	1928	#define MADE_EXACT_TRIE 4
	1929
	1930	STATIC I32
	1931	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch,
	1932	regnode first, regnode last, regnode *tail,
	1933	U32 word_count, U32 flags, U32 depth)
	1934	{
	1935	/* first pass, loop through and scan words */
	1936	reg_trie_data *trie;
	1937	HV *widecharmap = NULL;
	1938	AV *revcharmap = newAV();
	1939	regnode *cur;
	1940	STRLEN len = 0;
	1941	UV uvc = 0;
	1942	U16 curword = 0;
	1943	U32 next_alloc = 0;
	1944	regnode *jumper = NULL;
	1945	regnode *nextbranch = NULL;
	1946	regnode *convert = NULL;
	1947	U32 prev_states; / temp array mapping each state to previous one */
	1948	/* we just use folder as a flag in utf8 */
	1949	const U8 * folder = NULL;
	1950
	1951	#ifdef DEBUGGING
	1952	const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
	1953	AV *trie_words = NULL;
	1954	/* along with revcharmap, this only used during construction but both are
	1955	* useful during debugging so we store them in the struct when debugging.
	1956	*/
	1957	#else
	1958	const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
	1959	STRLEN trie_charcount=0;
	1960	#endif
	1961	SV *re_trie_maxbuff;
	1962	GET_RE_DEBUG_FLAGS_DECL;
	1963
	1964	PERL_ARGS_ASSERT_MAKE_TRIE;
	1965	#ifndef DEBUGGING
	1966	PERL_UNUSED_ARG(depth);
	1967	#endif
	1968
	1969	switch (flags) {
	1970	case EXACT: break;
	1971	case EXACTFA:
	1972	case EXACTFU_SS:
	1973	case EXACTFU: folder = PL_fold_latin1; break;
	1974	case EXACTF: folder = PL_fold; break;
	1975	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
	1976	}
	1977
	1978	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1979	trie->refcount = 1;
	1980	trie->startstate = 1;
	1981	trie->wordcount = word_count;
	1982	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1983	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1984	if (flags == EXACT)
	1985	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1986	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1987	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1988
	1989	DEBUG_r({
	1990	trie_words = newAV();
	1991	});
	1992
	1993	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1994	assert(re_trie_maxbuff);
	1995	if (!SvIOK(re_trie_maxbuff)) {
	1996	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1997	}
	1998	DEBUG_TRIE_COMPILE_r({
	1999	PerlIO_printf( Perl_debug_log,
	2000	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	2001	(int)depth * 2 + 2, "",
	2002	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	2003	REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
	2004	});
	2005
	2006	/* Find the node we are going to overwrite */
	2007	if ( first == startbranch && OP( last ) != BRANCH ) {
	2008	/* whole branch chain */
	2009	convert = first;
	2010	} else {
	2011	/* branch sub-chain */
	2012	convert = NEXTOPER( first );
	2013	}
	2014
	2015	/* -- First loop and Setup --
	2016
	2017	We first traverse the branches and scan each word to determine if it
	2018	contains widechars, and how many unique chars there are, this is
	2019	important as we have to build a table with at least as many columns as we
	2020	have unique chars.
	2021
	2022	We use an array of integers to represent the character codes 0..255
	2023	(trie->charmap) and we use a an HV* to store Unicode characters. We use
	2024	the native representation of the character value as the key and IV's for
	2025	the coded index.
	2026
	2027	TODO If we keep track of how many times each character is used we can
	2028	remap the columns so that the table compression later on is more
	2029	efficient in terms of memory by ensuring the most common value is in the
	2030	middle and the least common are on the outside. IMO this would be better
	2031	than a most to least common mapping as theres a decent chance the most
	2032	common letter will share a node with the least common, meaning the node
	2033	will not be compressible. With a middle is most common approach the worst
	2034	case is when we have the least common nodes twice.
	2035
	2036	*/
	2037
	2038	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	2039	regnode *noper = NEXTOPER( cur );
	2040	const U8 uc = (U8)STRING( noper );
	2041	const U8 *e = uc + STR_LEN( noper );
	2042	int foldlen = 0;
	2043	U32 wordlen = 0; /* required init */
	2044	STRLEN minchars = 0;
	2045	STRLEN maxchars = 0;
	2046	bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
	2047	bitmap?*/
	2048
	2049	if (OP(noper) == NOTHING) {
	2050	regnode *noper_next= regnext(noper);
	2051	if (noper_next != tail && OP(noper_next) == flags) {
	2052	noper = noper_next;
	2053	uc= (U8*)STRING(noper);
	2054	e= uc + STR_LEN(noper);
	2055	trie->minlen= STR_LEN(noper);
	2056	} else {
	2057	trie->minlen= 0;
	2058	continue;
	2059	}
	2060	}
	2061
	2062	if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
	2063	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	2064	regardless of encoding */
	2065	if (OP( noper ) == EXACTFU_SS) {
	2066	/* false positives are ok, so just set this */
	2067	TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
	2068	}
	2069	}
	2070	for ( ; uc < e ; uc += len ) { /* Look at each char in the current
	2071	branch */
	2072	TRIE_CHARCOUNT(trie)++;
	2073	TRIE_READ_CHAR;
	2074
	2075	/* TRIE_READ_CHAR returns the current character, or its fold if /i
	2076	* is in effect. Under /i, this character can match itself, or
	2077	* anything that folds to it. If not under /i, it can match just
	2078	* itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
	2079	* all fold to k, and all are single characters. But some folds
	2080	* expand to more than one character, so for example LATIN SMALL
	2081	* LIGATURE FFI folds to the three character sequence 'ffi'. If
	2082	* the string beginning at 'uc' is 'ffi', it could be matched by
	2083	* three characters, or just by the one ligature character. (It
	2084	* could also be matched by two characters: LATIN SMALL LIGATURE FF
	2085	* followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
	2086	* (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
	2087	* match.) The trie needs to know the minimum and maximum number
	2088	* of characters that could match so that it can use size alone to
	2089	* quickly reject many match attempts. The max is simple: it is
	2090	* the number of folded characters in this branch (since a fold is
	2091	* never shorter than what folds to it. */
	2092
	2093	maxchars++;
	2094
	2095	/* And the min is equal to the max if not under /i (indicated by
	2096	* 'folder' being NULL), or there are no multi-character folds. If
	2097	* there is a multi-character fold, the min is incremented just
	2098	* once, for the character that folds to the sequence. Each
	2099	* character in the sequence needs to be added to the list below of
	2100	* characters in the trie, but we count only the first towards the
	2101	* min number of characters needed. This is done through the
	2102	* variable 'foldlen', which is returned by the macros that look
	2103	* for these sequences as the number of bytes the sequence
	2104	* occupies. Each time through the loop, we decrement 'foldlen' by
	2105	* how many bytes the current char occupies. Only when it reaches
	2106	* 0 do we increment 'minchars' or look for another multi-character
	2107	* sequence. */
	2108	if (folder == NULL) {
	2109	minchars++;
	2110	}
	2111	else if (foldlen > 0) {
	2112	foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
	2113	}
	2114	else {
	2115	minchars++;
	2116
	2117	/* See if *uc is the beginning of a multi-character fold. If
	2118	* so, we decrement the length remaining to look at, to account
	2119	* for the current character this iteration. (We can use 'uc'
	2120	* instead of the fold returned by TRIE_READ_CHAR because for
	2121	* non-UTF, the latin1_safe macro is smart enough to account
	2122	* for all the unfolded characters, and because for UTF, the
	2123	* string will already have been folded earlier in the
	2124	* compilation process */
	2125	if (UTF) {
	2126	if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
	2127	foldlen -= UTF8SKIP(uc);
	2128	}
	2129	}
	2130	else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
	2131	foldlen--;
	2132	}
	2133	}
	2134
	2135	/* The current character (and any potential folds) should be added
	2136	* to the possible matching characters for this position in this
	2137	* branch */
	2138	if ( uvc < 256 ) {
	2139	if ( folder ) {
	2140	U8 folded= folder[ (U8) uvc ];
	2141	if ( !trie->charmap[ folded ] ) {
	2142	trie->charmap[ folded ]=( ++trie->uniquecharcount );
	2143	TRIE_STORE_REVCHAR( folded );
	2144	}
	2145	}
	2146	if ( !trie->charmap[ uvc ] ) {
	2147	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	2148	TRIE_STORE_REVCHAR( uvc );
	2149	}
	2150	if ( set_bit ) {
	2151	/* store the codepoint in the bitmap, and its folded
	2152	* equivalent. */
	2153	TRIE_BITMAP_SET(trie, uvc);
	2154
	2155	/* store the folded codepoint */
	2156	if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
	2157
	2158	if ( !UTF ) {
	2159	/* store first byte of utf8 representation of
	2160	variant codepoints */
	2161	if (! UVCHR_IS_INVARIANT(uvc)) {
	2162	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	2163	}
	2164	}
	2165	set_bit = 0; /* We've done our bit :-) */
	2166	}
	2167	} else {
	2168
	2169	/* XXX We could come up with the list of code points that fold
	2170	* to this using PL_utf8_foldclosures, except not for
	2171	* multi-char folds, as there may be multiple combinations
	2172	* there that could work, which needs to wait until runtime to
	2173	* resolve (The comment about LIGATURE FFI above is such an
	2174	* example */
	2175
	2176	SV** svpp;
	2177	if ( !widecharmap )
	2178	widecharmap = newHV();
	2179
	2180	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	2181
	2182	if ( !svpp )
	2183	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	2184
	2185	if ( !SvTRUE( *svpp ) ) {
	2186	sv_setiv( *svpp, ++trie->uniquecharcount );
	2187	TRIE_STORE_REVCHAR(uvc);
	2188	}
	2189	}
	2190	} /* end loop through characters in this branch of the trie */
	2191
	2192	/* We take the min and max for this branch and combine to find the min
	2193	* and max for all branches processed so far */
	2194	if( cur == first ) {
	2195	trie->minlen = minchars;
	2196	trie->maxlen = maxchars;
	2197	} else if (minchars < trie->minlen) {
	2198	trie->minlen = minchars;
	2199	} else if (maxchars > trie->maxlen) {
	2200	trie->maxlen = maxchars;
	2201	}
	2202	} /* end first pass */
	2203	DEBUG_TRIE_COMPILE_r(
	2204	PerlIO_printf( Perl_debug_log,
	2205	"%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	2206	(int)depth * 2 + 2,"",
	2207	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	2208	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	2209	(int)trie->minlen, (int)trie->maxlen )
	2210	);
	2211
	2212	/*
	2213	We now know what we are dealing with in terms of unique chars and
	2214	string sizes so we can calculate how much memory a naive
	2215	representation using a flat table will take. If it's over a reasonable
	2216	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	2217	conservative but potentially much slower representation using an array
	2218	of lists.
	2219
	2220	At the end we convert both representations into the same compressed
	2221	form that will be used in regexec.c for matching with. The latter
	2222	is a form that cannot be used to construct with but has memory
	2223	properties similar to the list form and access properties similar
	2224	to the table form making it both suitable for fast searches and
	2225	small enough that its feasable to store for the duration of a program.
	2226
	2227	See the comment in the code where the compressed table is produced
	2228	inplace from the flat tabe representation for an explanation of how
	2229	the compression works.
	2230
	2231	*/
	2232
	2233
	2234	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	2235	prev_states[1] = 0;
	2236
	2237	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
	2238	> SvIV(re_trie_maxbuff) )
	2239	{
	2240	/*
	2241	Second Pass -- Array Of Lists Representation
	2242
	2243	Each state will be represented by a list of charid:state records
	2244	(reg_trie_trans_le) the first such element holds the CUR and LEN
	2245	points of the allocated array. (See defines above).
	2246
	2247	We build the initial structure using the lists, and then convert
	2248	it into the compressed table form which allows faster lookups
	2249	(but cant be modified once converted).
	2250	*/
	2251
	2252	STRLEN transcount = 1;
	2253
	2254	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	2255	"%*sCompiling trie using list compiler\n",
	2256	(int)depth * 2 + 2, ""));
	2257
	2258	trie->states = (reg_trie_state *)
	2259	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	2260	sizeof(reg_trie_state) );
	2261	TRIE_LIST_NEW(1);
	2262	next_alloc = 2;
	2263
	2264	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	2265
	2266	regnode *noper = NEXTOPER( cur );
	2267	U8 uc = (U8)STRING( noper );
	2268	const U8 *e = uc + STR_LEN( noper );
	2269	U32 state = 1; /* required init */
	2270	U16 charid = 0; /* sanity init */
	2271	U32 wordlen = 0; /* required init */
	2272
	2273	if (OP(noper) == NOTHING) {
	2274	regnode *noper_next= regnext(noper);
	2275	if (noper_next != tail && OP(noper_next) == flags) {
	2276	noper = noper_next;
	2277	uc= (U8*)STRING(noper);
	2278	e= uc + STR_LEN(noper);
	2279	}
	2280	}
	2281
	2282	if (OP(noper) != NOTHING) {
	2283	for ( ; uc < e ; uc += len ) {
	2284
	2285	TRIE_READ_CHAR;
	2286
	2287	if ( uvc < 256 ) {
	2288	charid = trie->charmap[ uvc ];
	2289	} else {
	2290	SV** const svpp = hv_fetch( widecharmap,
	2291	(char*)&uvc,
	2292	sizeof( UV ),
	2293	0);
	2294	if ( !svpp ) {
	2295	charid = 0;
	2296	} else {
	2297	charid=(U16)SvIV( *svpp );
	2298	}
	2299	}
	2300	/* charid is now 0 if we dont know the char read, or
	2301	* nonzero if we do */
	2302	if ( charid ) {
	2303
	2304	U16 check;
	2305	U32 newstate = 0;
	2306
	2307	charid--;
	2308	if ( !trie->states[ state ].trans.list ) {
	2309	TRIE_LIST_NEW( state );
	2310	}
	2311	for ( check = 1;
	2312	check <= TRIE_LIST_USED( state );
	2313	check++ )
	2314	{
	2315	if ( TRIE_LIST_ITEM( state, check ).forid
	2316	== charid )
	2317	{
	2318	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	2319	break;
	2320	}
	2321	}
	2322	if ( ! newstate ) {
	2323	newstate = next_alloc++;
	2324	prev_states[newstate] = state;
	2325	TRIE_LIST_PUSH( state, charid, newstate );
	2326	transcount++;
	2327	}
	2328	state = newstate;
	2329	} else {
	2330	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	2331	}
	2332	}
	2333	}
	2334	TRIE_HANDLE_WORD(state);
	2335
	2336	} /* end second pass */
	2337
	2338	/* next alloc is the NEXT state to be allocated */
	2339	trie->statecount = next_alloc;
	2340	trie->states = (reg_trie_state *)
	2341	PerlMemShared_realloc( trie->states,
	2342	next_alloc
	2343	* sizeof(reg_trie_state) );
	2344
	2345	/* and now dump it out before we compress it */
	2346	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	2347	revcharmap, next_alloc,
	2348	depth+1)
	2349	);
	2350
	2351	trie->trans = (reg_trie_trans *)
	2352	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	2353	{
	2354	U32 state;
	2355	U32 tp = 0;
	2356	U32 zp = 0;
	2357
	2358
	2359	for( state=1 ; state < next_alloc ; state ++ ) {
	2360	U32 base=0;
	2361
	2362	/*
	2363	DEBUG_TRIE_COMPILE_MORE_r(
	2364	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	2365	);
	2366	*/
	2367
	2368	if (trie->states[state].trans.list) {
	2369	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	2370	U16 maxid=minid;
	2371	U16 idx;
	2372
	2373	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	2374	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	2375	if ( forid < minid ) {
	2376	minid=forid;
	2377	} else if ( forid > maxid ) {
	2378	maxid=forid;
	2379	}
	2380	}
	2381	if ( transcount < tp + maxid - minid + 1) {
	2382	transcount *= 2;
	2383	trie->trans = (reg_trie_trans *)
	2384	PerlMemShared_realloc( trie->trans,
	2385	transcount
	2386	* sizeof(reg_trie_trans) );
	2387	Zero( trie->trans + (transcount / 2),
	2388	transcount / 2,
	2389	reg_trie_trans );
	2390	}
	2391	base = trie->uniquecharcount + tp - minid;
	2392	if ( maxid == minid ) {
	2393	U32 set = 0;
	2394	for ( ; zp < tp ; zp++ ) {
	2395	if ( ! trie->trans[ zp ].next ) {
	2396	base = trie->uniquecharcount + zp - minid;
	2397	trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
	2398	1).newstate;
	2399	trie->trans[ zp ].check = state;
	2400	set = 1;
	2401	break;
	2402	}
	2403	}
	2404	if ( !set ) {
	2405	trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
	2406	1).newstate;
	2407	trie->trans[ tp ].check = state;
	2408	tp++;
	2409	zp = tp;
	2410	}
	2411	} else {
	2412	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	2413	const U32 tid = base
	2414	- trie->uniquecharcount
	2415	+ TRIE_LIST_ITEM( state, idx ).forid;
	2416	trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
	2417	idx ).newstate;
	2418	trie->trans[ tid ].check = state;
	2419	}
	2420	tp += ( maxid - minid + 1 );
	2421	}
	2422	Safefree(trie->states[ state ].trans.list);
	2423	}
	2424	/*
	2425	DEBUG_TRIE_COMPILE_MORE_r(
	2426	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	2427	);
	2428	*/
	2429	trie->states[ state ].trans.base=base;
	2430	}
	2431	trie->lasttrans = tp + 1;
	2432	}
	2433	} else {
	2434	/*
	2435	Second Pass -- Flat Table Representation.
	2436
	2437	we dont use the 0 slot of either trans[] or states[] so we add 1 to
	2438	each. We know that we will need Charcount+1 trans at most to store
	2439	the data (one row per char at worst case) So we preallocate both
	2440	structures assuming worst case.
	2441
	2442	We then construct the trie using only the .next slots of the entry
	2443	structs.
	2444
	2445	We use the .check field of the first entry of the node temporarily
	2446	to make compression both faster and easier by keeping track of how
	2447	many non zero fields are in the node.
	2448
	2449	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	2450	transition.
	2451
	2452	There are two terms at use here: state as a TRIE_NODEIDX() which is
	2453	a number representing the first entry of the node, and state as a
	2454	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
	2455	and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
	2456	if there are 2 entrys per node. eg:
	2457
	2458	A B A B
	2459	1. 2 4 1. 3 7
	2460	2. 0 3 3. 0 5
	2461	3. 0 0 5. 0 0
	2462	4. 0 0 7. 0 0
	2463
	2464	The table is internally in the right hand, idx form. However as we
	2465	also have to deal with the states array which is indexed by nodenum
	2466	we have to use TRIE_NODENUM() to convert.
	2467
	2468	*/
	2469	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	2470	"%*sCompiling trie using table compiler\n",
	2471	(int)depth * 2 + 2, ""));
	2472
	2473	trie->trans = (reg_trie_trans *)
	2474	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	2475	* trie->uniquecharcount + 1,
	2476	sizeof(reg_trie_trans) );
	2477	trie->states = (reg_trie_state *)
	2478	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	2479	sizeof(reg_trie_state) );
	2480	next_alloc = trie->uniquecharcount + 1;
	2481
	2482
	2483	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	2484
	2485	regnode *noper = NEXTOPER( cur );
	2486	const U8 uc = (U8)STRING( noper );
	2487	const U8 *e = uc + STR_LEN( noper );
	2488
	2489	U32 state = 1; /* required init */
	2490
	2491	U16 charid = 0; /* sanity init */
	2492	U32 accept_state = 0; /* sanity init */
	2493
	2494	U32 wordlen = 0; /* required init */
	2495
	2496	if (OP(noper) == NOTHING) {
	2497	regnode *noper_next= regnext(noper);
	2498	if (noper_next != tail && OP(noper_next) == flags) {
	2499	noper = noper_next;
	2500	uc= (U8*)STRING(noper);
	2501	e= uc + STR_LEN(noper);
	2502	}
	2503	}
	2504
	2505	if ( OP(noper) != NOTHING ) {
	2506	for ( ; uc < e ; uc += len ) {
	2507
	2508	TRIE_READ_CHAR;
	2509
	2510	if ( uvc < 256 ) {
	2511	charid = trie->charmap[ uvc ];
	2512	} else {
	2513	SV* const * const svpp = hv_fetch( widecharmap,
	2514	(char*)&uvc,
	2515	sizeof( UV ),
	2516	0);
	2517	charid = svpp ? (U16)SvIV(*svpp) : 0;
	2518	}
	2519	if ( charid ) {
	2520	charid--;
	2521	if ( !trie->trans[ state + charid ].next ) {
	2522	trie->trans[ state + charid ].next = next_alloc;
	2523	trie->trans[ state ].check++;
	2524	prev_states[TRIE_NODENUM(next_alloc)]
	2525	= TRIE_NODENUM(state);
	2526	next_alloc += trie->uniquecharcount;
	2527	}
	2528	state = trie->trans[ state + charid ].next;
	2529	} else {
	2530	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	2531	}
	2532	/* charid is now 0 if we dont know the char read, or
	2533	* nonzero if we do */
	2534	}
	2535	}
	2536	accept_state = TRIE_NODENUM( state );
	2537	TRIE_HANDLE_WORD(accept_state);
	2538
	2539	} /* end second pass */
	2540
	2541	/* and now dump it out before we compress it */
	2542	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	2543	revcharmap,
	2544	next_alloc, depth+1));
	2545
	2546	{
	2547	/*
	2548	* Inplace compress the table.*
	2549
	2550	For sparse data sets the table constructed by the trie algorithm will
	2551	be mostly 0/FAIL transitions or to put it another way mostly empty.
	2552	(Note that leaf nodes will not contain any transitions.)
	2553
	2554	This algorithm compresses the tables by eliminating most such
	2555	transitions, at the cost of a modest bit of extra work during lookup:
	2556
	2557	- Each states[] entry contains a .base field which indicates the
	2558	index in the state[] array wheres its transition data is stored.
	2559
	2560	- If .base is 0 there are no valid transitions from that node.
	2561
	2562	- If .base is nonzero then charid is added to it to find an entry in
	2563	the trans array.
	2564
	2565	-If trans[states[state].base+charid].check!=state then the
	2566	transition is taken to be a 0/Fail transition. Thus if there are fail
	2567	transitions at the front of the node then the .base offset will point
	2568	somewhere inside the previous nodes data (or maybe even into a node
	2569	even earlier), but the .check field determines if the transition is
	2570	valid.
	2571
	2572	XXX - wrong maybe?
	2573	The following process inplace converts the table to the compressed
	2574	table: We first do not compress the root node 1,and mark all its
	2575	.check pointers as 1 and set its .base pointer as 1 as well. This
	2576	allows us to do a DFA construction from the compressed table later,
	2577	and ensures that any .base pointers we calculate later are greater
	2578	than 0.
	2579
	2580	- We set 'pos' to indicate the first entry of the second node.
	2581
	2582	- We then iterate over the columns of the node, finding the first and
	2583	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2584	and set the .check pointers accordingly, and advance pos
	2585	appropriately and repreat for the next node. Note that when we copy
	2586	the next pointers we have to convert them from the original
	2587	NODEIDX form to NODENUM form as the former is not valid post
	2588	compression.
	2589
	2590	- If a node has no transitions used we mark its base as 0 and do not
	2591	advance the pos pointer.
	2592
	2593	- If a node only has one transition we use a second pointer into the
	2594	structure to fill in allocated fail transitions from other states.
	2595	This pointer is independent of the main pointer and scans forward
	2596	looking for null transitions that are allocated to a state. When it
	2597	finds one it writes the single transition into the "hole". If the
	2598	pointer doesnt find one the single transition is appended as normal.
	2599
	2600	- Once compressed we can Renew/realloc the structures to release the
	2601	excess space.
	2602
	2603	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2604	specifically Fig 3.47 and the associated pseudocode.
	2605
	2606	demq
	2607	*/
	2608	const U32 laststate = TRIE_NODENUM( next_alloc );
	2609	U32 state, charid;
	2610	U32 pos = 0, zp=0;
	2611	trie->statecount = laststate;
	2612
	2613	for ( state = 1 ; state < laststate ; state++ ) {
	2614	U8 flag = 0;
	2615	const U32 stateidx = TRIE_NODEIDX( state );
	2616	const U32 o_used = trie->trans[ stateidx ].check;
	2617	U32 used = trie->trans[ stateidx ].check;
	2618	trie->trans[ stateidx ].check = 0;
	2619
	2620	for ( charid = 0;
	2621	used && charid < trie->uniquecharcount;
	2622	charid++ )
	2623	{
	2624	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2625	if ( trie->trans[ stateidx + charid ].next ) {
	2626	if (o_used == 1) {
	2627	for ( ; zp < pos ; zp++ ) {
	2628	if ( ! trie->trans[ zp ].next ) {
	2629	break;
	2630	}
	2631	}
	2632	trie->states[ state ].trans.base
	2633	= zp
	2634	+ trie->uniquecharcount
	2635	- charid ;
	2636	trie->trans[ zp ].next
	2637	= SAFE_TRIE_NODENUM( trie->trans[ stateidx
	2638	+ charid ].next );
	2639	trie->trans[ zp ].check = state;
	2640	if ( ++zp > pos ) pos = zp;
	2641	break;
	2642	}
	2643	used--;
	2644	}
	2645	if ( !flag ) {
	2646	flag = 1;
	2647	trie->states[ state ].trans.base
	2648	= pos + trie->uniquecharcount - charid ;
	2649	}
	2650	trie->trans[ pos ].next
	2651	= SAFE_TRIE_NODENUM(
	2652	trie->trans[ stateidx + charid ].next );
	2653	trie->trans[ pos ].check = state;
	2654	pos++;
	2655	}
	2656	}
	2657	}
	2658	trie->lasttrans = pos + 1;
	2659	trie->states = (reg_trie_state *)
	2660	PerlMemShared_realloc( trie->states, laststate
	2661	* sizeof(reg_trie_state) );
	2662	DEBUG_TRIE_COMPILE_MORE_r(
	2663	PerlIO_printf( Perl_debug_log,
	2664	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2665	(int)depth * 2 + 2,"",
	2666	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
	2667	+ 1 ),
	2668	(IV)next_alloc,
	2669	(IV)pos,
	2670	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2671	);
	2672
	2673	} /* end table compress */
	2674	}
	2675	DEBUG_TRIE_COMPILE_MORE_r(
	2676	PerlIO_printf(Perl_debug_log,
	2677	"%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2678	(int)depth * 2 + 2, "",
	2679	(UV)trie->statecount,
	2680	(UV)trie->lasttrans)
	2681	);
	2682	/* resize the trans array to remove unused space */
	2683	trie->trans = (reg_trie_trans *)
	2684	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2685	* sizeof(reg_trie_trans) );
	2686
	2687	{ /* Modify the program and insert the new TRIE node */
	2688	U8 nodetype =(U8)(flags & 0xFF);
	2689	char *str=NULL;
	2690
	2691	#ifdef DEBUGGING
	2692	regnode *optimize = NULL;
	2693	#ifdef RE_TRACK_PATTERN_OFFSETS
	2694
	2695	U32 mjd_offset = 0;
	2696	U32 mjd_nodelen = 0;
	2697	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2698	#endif /* DEBUGGING */
	2699	/*
	2700	This means we convert either the first branch or the first Exact,
	2701	depending on whether the thing following (in 'last') is a branch
	2702	or not and whther first is the startbranch (ie is it a sub part of
	2703	the alternation or is it the whole thing.)
	2704	Assuming its a sub part we convert the EXACT otherwise we convert
	2705	the whole branch sequence, including the first.
	2706	*/
	2707	/* Find the node we are going to overwrite */
	2708	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2709	/* branch sub-chain */
	2710	NEXT_OFF( first ) = (U16)(last - first);
	2711	#ifdef RE_TRACK_PATTERN_OFFSETS
	2712	DEBUG_r({
	2713	mjd_offset= Node_Offset((convert));
	2714	mjd_nodelen= Node_Length((convert));
	2715	});
	2716	#endif
	2717	/* whole branch chain */
	2718	}
	2719	#ifdef RE_TRACK_PATTERN_OFFSETS
	2720	else {
	2721	DEBUG_r({
	2722	const regnode *nop = NEXTOPER( convert );
	2723	mjd_offset= Node_Offset((nop));
	2724	mjd_nodelen= Node_Length((nop));
	2725	});
	2726	}
	2727	DEBUG_OPTIMISE_r(
	2728	PerlIO_printf(Perl_debug_log,
	2729	"%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2730	(int)depth * 2 + 2, "",
	2731	(UV)mjd_offset, (UV)mjd_nodelen)
	2732	);
	2733	#endif
	2734	/* But first we check to see if there is a common prefix we can
	2735	split out as an EXACT and put in front of the TRIE node. */
	2736	trie->startstate= 1;
	2737	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2738	U32 state;
	2739	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2740	U32 ofs = 0;
	2741	I32 idx = -1;
	2742	U32 count = 0;
	2743	const U32 base = trie->states[ state ].trans.base;
	2744
	2745	if ( trie->states[state].wordnum )
	2746	count = 1;
	2747
	2748	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2749	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2750	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2751	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2752	{
	2753	if ( ++count > 1 ) {
	2754	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2755	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2756	if ( state == 1 ) break;
	2757	if ( count == 2 ) {
	2758	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2759	DEBUG_OPTIMISE_r(
	2760	PerlIO_printf(Perl_debug_log,
	2761	"%*sNew Start State=%"UVuf" Class: [",
	2762	(int)depth * 2 + 2, "",
	2763	(UV)state));
	2764	if (idx >= 0) {
	2765	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2766	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2767
	2768	TRIE_BITMAP_SET(trie,*ch);
	2769	if ( folder )
	2770	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2771	DEBUG_OPTIMISE_r(
	2772	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2773	);
	2774	}
	2775	}
	2776	TRIE_BITMAP_SET(trie,*ch);
	2777	if ( folder )
	2778	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2779	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2780	}
	2781	idx = ofs;
	2782	}
	2783	}
	2784	if ( count == 1 ) {
	2785	SV **tmp = av_fetch( revcharmap, idx, 0);
	2786	STRLEN len;
	2787	char ch = SvPV( tmp, len );
	2788	DEBUG_OPTIMISE_r({
	2789	SV *sv=sv_newmortal();
	2790	PerlIO_printf( Perl_debug_log,
	2791	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2792	(int)depth * 2 + 2, "",
	2793	(UV)state, (UV)idx,
	2794	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2795	PL_colors[0], PL_colors[1],
	2796	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2797	PERL_PV_ESCAPE_FIRSTCHAR
	2798	)
	2799	);
	2800	});
	2801	if ( state==1 ) {
	2802	OP( convert ) = nodetype;
	2803	str=STRING(convert);
	2804	STR_LEN(convert)=0;
	2805	}
	2806	STR_LEN(convert) += len;
	2807	while (len--)
	2808	str++ = ch++;
	2809	} else {
	2810	#ifdef DEBUGGING
	2811	if (state>1)
	2812	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2813	#endif
	2814	break;
	2815	}
	2816	}
	2817	trie->prefixlen = (state-1);
	2818	if (str) {
	2819	regnode *n = convert+NODE_SZ_STR(convert);
	2820	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2821	trie->startstate = state;
	2822	trie->minlen -= (state - 1);
	2823	trie->maxlen -= (state - 1);
	2824	#ifdef DEBUGGING
	2825	/* At least the UNICOS C compiler choked on this
	2826	* being argument to DEBUG_r(), so let's just have
	2827	* it right here. */
	2828	if (
	2829	#ifdef PERL_EXT_RE_BUILD
	2830	1
	2831	#else
	2832	DEBUG_r_TEST
	2833	#endif
	2834	) {
	2835	regnode *fix = convert;
	2836	U32 word = trie->wordcount;
	2837	mjd_nodelen++;
	2838	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2839	while( ++fix < n ) {
	2840	Set_Node_Offset_Length(fix, 0, 0);
	2841	}
	2842	while (word--) {
	2843	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2844	if (tmp) {
	2845	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2846	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2847	else
	2848	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2849	}
	2850	}
	2851	}
	2852	#endif
	2853	if (trie->maxlen) {
	2854	convert = n;
	2855	} else {
	2856	NEXT_OFF(convert) = (U16)(tail - convert);
	2857	DEBUG_r(optimize= n);
	2858	}
	2859	}
	2860	}
	2861	if (!jumper)
	2862	jumper = last;
	2863	if ( trie->maxlen ) {
	2864	NEXT_OFF( convert ) = (U16)(tail - convert);
	2865	ARG_SET( convert, data_slot );
	2866	/* Store the offset to the first unabsorbed branch in
	2867	jump[0], which is otherwise unused by the jump logic.
	2868	We use this when dumping a trie and during optimisation. */
	2869	if (trie->jump)
	2870	trie->jump[0] = (U16)(nextbranch - convert);
	2871
	2872	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2873	* and there is a bitmap
	2874	* and the first "jump target" node we found leaves enough room
	2875	* then convert the TRIE node into a TRIEC node, with the bitmap
	2876	* embedded inline in the opcode - this is hypothetically faster.
	2877	*/
	2878	if ( !trie->states[trie->startstate].wordnum
	2879	&& trie->bitmap
	2880	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2881	{
	2882	OP( convert ) = TRIEC;
	2883	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2884	PerlMemShared_free(trie->bitmap);
	2885	trie->bitmap= NULL;
	2886	} else
	2887	OP( convert ) = TRIE;
	2888
	2889	/* store the type in the flags */
	2890	convert->flags = nodetype;
	2891	DEBUG_r({
	2892	optimize = convert
	2893	+ NODE_STEP_REGNODE
	2894	+ regarglen[ OP( convert ) ];
	2895	});
	2896	/* XXX We really should free up the resource in trie now,
	2897	as we won't use them - (which resources?) dmq */
	2898	}
	2899	/* needed for dumping*/
	2900	DEBUG_r(if (optimize) {
	2901	regnode *opt = convert;
	2902
	2903	while ( ++opt < optimize) {
	2904	Set_Node_Offset_Length(opt,0,0);
	2905	}
	2906	/*
	2907	Try to clean up some of the debris left after the
	2908	optimisation.
	2909	*/
	2910	while( optimize < jumper ) {
	2911	mjd_nodelen += Node_Length((optimize));
	2912	OP( optimize ) = OPTIMIZED;
	2913	Set_Node_Offset_Length(optimize,0,0);
	2914	optimize++;
	2915	}
	2916	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2917	});
	2918	} /* end node insert */
	2919
	2920	/* Finish populating the prev field of the wordinfo array. Walk back
	2921	* from each accept state until we find another accept state, and if
	2922	* so, point the first word's .prev field at the second word. If the
	2923	* second already has a .prev field set, stop now. This will be the
	2924	* case either if we've already processed that word's accept state,
	2925	* or that state had multiple words, and the overspill words were
	2926	* already linked up earlier.
	2927	*/
	2928	{
	2929	U16 word;
	2930	U32 state;
	2931	U16 prev;
	2932
	2933	for (word=1; word <= trie->wordcount; word++) {
	2934	prev = 0;
	2935	if (trie->wordinfo[word].prev)
	2936	continue;
	2937	state = trie->wordinfo[word].accept;
	2938	while (state) {
	2939	state = prev_states[state];
	2940	if (!state)
	2941	break;
	2942	prev = trie->states[state].wordnum;
	2943	if (prev)
	2944	break;
	2945	}
	2946	trie->wordinfo[word].prev = prev;
	2947	}
	2948	Safefree(prev_states);
	2949	}
	2950
	2951
	2952	/* and now dump out the compressed format */
	2953	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2954
	2955	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2956	#ifdef DEBUGGING
	2957	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2958	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2959	#else
	2960	SvREFCNT_dec_NN(revcharmap);
	2961	#endif
	2962	return trie->jump
	2963	? MADE_JUMP_TRIE
	2964	: trie->startstate>1
	2965	? MADE_EXACT_TRIE
	2966	: MADE_TRIE;
	2967	}
	2968
	2969	STATIC regnode *
	2970	S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t pRExC_state, regnode source, U32 depth)
	2971	{
	2972	/* The Trie is constructed and compressed now so we can build a fail array if
	2973	* it's needed
	2974
	2975	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
	2976	3.32 in the
	2977	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
	2978	Ullman 1985/88
	2979	ISBN 0-201-10088-6
	2980
	2981	We find the fail state for each state in the trie, this state is the longest
	2982	proper suffix of the current state's 'word' that is also a proper prefix of
	2983	another word in our trie. State 1 represents the word '' and is thus the
	2984	default fail state. This allows the DFA not to have to restart after its
	2985	tried and failed a word at a given point, it simply continues as though it
	2986	had been matching the other word in the first place.
	2987	Consider
	2988	'abcdgu'=~/abcdefg\|cdgu/
	2989	When we get to 'd' we are still matching the first word, we would encounter
	2990	'g' which would fail, which would bring us to the state representing 'd' in
	2991	the second word where we would try 'g' and succeed, proceeding to match
	2992	'cdgu'.
	2993	*/
	2994	/* add a fail transition */
	2995	const U32 trie_offset = ARG(source);
	2996	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2997	U32 *q;
	2998	const U32 ucharcount = trie->uniquecharcount;
	2999	const U32 numstates = trie->statecount;
	3000	const U32 ubound = trie->lasttrans + ucharcount;
	3001	U32 q_read = 0;
	3002	U32 q_write = 0;
	3003	U32 charid;
	3004	U32 base = trie->states[ 1 ].trans.base;
	3005	U32 *fail;
	3006	reg_ac_data *aho;
	3007	const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
	3008	regnode *stclass;
	3009	GET_RE_DEBUG_FLAGS_DECL;
	3010
	3011	PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
	3012	PERL_UNUSED_CONTEXT;
	3013	#ifndef DEBUGGING
	3014	PERL_UNUSED_ARG(depth);
	3015	#endif
	3016
	3017	if ( OP(source) == TRIE ) {
	3018	struct regnode_1 op = (struct regnode_1 )
	3019	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	3020	StructCopy(source,op,struct regnode_1);
	3021	stclass = (regnode *)op;
	3022	} else {
	3023	struct regnode_charclass op = (struct regnode_charclass )
	3024	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	3025	StructCopy(source,op,struct regnode_charclass);
	3026	stclass = (regnode *)op;
	3027	}
	3028	OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
	3029
	3030	ARG_SET( stclass, data_slot );
	3031	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	3032	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	3033	aho->trie=trie_offset;
	3034	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	3035	Copy( trie->states, aho->states, numstates, reg_trie_state );
	3036	Newxz( q, numstates, U32);
	3037	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	3038	aho->refcount = 1;
	3039	fail = aho->fail;
	3040	/* initialize fail[0..1] to be 1 so that we always have
	3041	a valid final fail state */
	3042	fail[ 0 ] = fail[ 1 ] = 1;
	3043
	3044	for ( charid = 0; charid < ucharcount ; charid++ ) {
	3045	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	3046	if ( newstate ) {
	3047	q[ q_write ] = newstate;
	3048	/* set to point at the root */
	3049	fail[ q[ q_write++ ] ]=1;
	3050	}
	3051	}
	3052	while ( q_read < q_write) {
	3053	const U32 cur = q[ q_read++ % numstates ];
	3054	base = trie->states[ cur ].trans.base;
	3055
	3056	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	3057	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	3058	if (ch_state) {
	3059	U32 fail_state = cur;
	3060	U32 fail_base;
	3061	do {
	3062	fail_state = fail[ fail_state ];
	3063	fail_base = aho->states[ fail_state ].trans.base;
	3064	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	3065
	3066	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	3067	fail[ ch_state ] = fail_state;
	3068	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	3069	{
	3070	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	3071	}
	3072	q[ q_write++ % numstates] = ch_state;
	3073	}
	3074	}
	3075	}
	3076	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	3077	when we fail in state 1, this allows us to use the
	3078	charclass scan to find a valid start char. This is based on the principle
	3079	that theres a good chance the string being searched contains lots of stuff
	3080	that cant be a start char.
	3081	*/
	3082	fail[ 0 ] = fail[ 1 ] = 0;
	3083	DEBUG_TRIE_COMPILE_r({
	3084	PerlIO_printf(Perl_debug_log,
	3085	"%*sStclass Failtable (%"UVuf" states): 0",
	3086	(int)(depth * 2), "", (UV)numstates
	3087	);
	3088	for( q_read=1; q_read<numstates; q_read++ ) {
	3089	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	3090	}
	3091	PerlIO_printf(Perl_debug_log, "\n");
	3092	});
	3093	Safefree(q);
	3094	/RExC_seen \|= REG_TRIEDFA_SEEN;/
	3095	return stclass;
	3096	}
	3097
	3098
	3099	#define DEBUG_PEEP(str,scan,depth) \
	3100	DEBUG_OPTIMISE_r({if (scan){ \
	3101	SV * const mysv=sv_newmortal(); \
	3102	regnode *Next = regnext(scan); \
	3103	regprop(RExC_rx, mysv, scan, NULL); \
	3104	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	3105	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	3106	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	3107	}});
	3108
	3109
	3110	/* The below joins as many adjacent EXACTish nodes as possible into a single
	3111	* one. The regop may be changed if the node(s) contain certain sequences that
	3112	* require special handling. The joining is only done if:
	3113	* 1) there is room in the current conglomerated node to entirely contain the
	3114	* next one.
	3115	* 2) they are the exact same node type
	3116	*
	3117	* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
	3118	* these get optimized out
	3119	*
	3120	* If a node is to match under /i (folded), the number of characters it matches
	3121	* can be different than its character length if it contains a multi-character
	3122	* fold. *min_subtract is set to the total delta number of characters of the
	3123	* input nodes.
	3124	*
	3125	* And *unfolded_multi_char is set to indicate whether or not the node contains
	3126	* an unfolded multi-char fold. This happens when whether the fold is valid or
	3127	* not won't be known until runtime; namely for EXACTF nodes that contain LATIN
	3128	* SMALL LETTER SHARP S, as only if the target string being matched against
	3129	* turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
	3130	* folding rules depend on the locale in force at runtime. (Multi-char folds
	3131	* whose components are all above the Latin1 range are not run-time locale
	3132	* dependent, and have already been folded by the time this function is
	3133	* called.)
	3134	*
	3135	* This is as good a place as any to discuss the design of handling these
	3136	* multi-character fold sequences. It's been wrong in Perl for a very long
	3137	* time. There are three code points in Unicode whose multi-character folds
	3138	* were long ago discovered to mess things up. The previous designs for
	3139	* dealing with these involved assigning a special node for them. This
	3140	* approach doesn't always work, as evidenced by this example:
	3141	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	3142	* Both sides fold to "sss", but if the pattern is parsed to create a node that
	3143	* would match just the \xDF, it won't be able to handle the case where a
	3144	* successful match would have to cross the node's boundary. The new approach
	3145	* that hopefully generally solves the problem generates an EXACTFU_SS node
	3146	* that is "sss" in this case.
	3147	*
	3148	* It turns out that there are problems with all multi-character folds, and not
	3149	* just these three. Now the code is general, for all such cases. The
	3150	* approach taken is:
	3151	* 1) This routine examines each EXACTFish node that could contain multi-
	3152	* character folded sequences. Since a single character can fold into
	3153	* such a sequence, the minimum match length for this node is less than
	3154	* the number of characters in the node. This routine returns in
	3155	* *min_subtract how many characters to subtract from the the actual
	3156	* length of the string to get a real minimum match length; it is 0 if
	3157	* there are no multi-char foldeds. This delta is used by the caller to
	3158	* adjust the min length of the match, and the delta between min and max,
	3159	* so that the optimizer doesn't reject these possibilities based on size
	3160	* constraints.
	3161	* 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
	3162	* is used for an EXACTFU node that contains at least one "ss" sequence in
	3163	* it. For non-UTF-8 patterns and strings, this is the only case where
	3164	* there is a possible fold length change. That means that a regular
	3165	* EXACTFU node without UTF-8 involvement doesn't have to concern itself
	3166	* with length changes, and so can be processed faster. regexec.c takes
	3167	* advantage of this. Generally, an EXACTFish node that is in UTF-8 is
	3168	* pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
	3169	* known until runtime). This saves effort in regex matching. However,
	3170	* the pre-folding isn't done for non-UTF8 patterns because the fold of
	3171	* the MICRO SIGN requires UTF-8, and we don't want to slow things down by
	3172	* forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
	3173	* again, EXACTFL) nodes fold to isn't known until runtime. The fold
	3174	* possibilities for the non-UTF8 patterns are quite simple, except for
	3175	* the sharp s. All the ones that don't involve a UTF-8 target string are
	3176	* members of a fold-pair, and arrays are set up for all of them so that
	3177	* the other member of the pair can be found quickly. Code elsewhere in
	3178	* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
	3179	* 'ss', even if the pattern isn't UTF-8. This avoids the issues
	3180	* described in the next item.
	3181	* 3) A problem remains for unfolded multi-char folds. (These occur when the
	3182	* validity of the fold won't be known until runtime, and so must remain
	3183	* unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
	3184	* nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
	3185	* be an EXACTF node with a UTF-8 pattern.) They also occur for various
	3186	* folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
	3187	* The reason this is a problem is that the optimizer part of regexec.c
	3188	* (probably unwittingly, in Perl_regexec_flags()) makes an assumption
	3189	* that a character in the pattern corresponds to at most a single
	3190	* character in the target string. (And I do mean character, and not byte
	3191	* here, unlike other parts of the documentation that have never been
	3192	* updated to account for multibyte Unicode.) sharp s in EXACTF and
	3193	* EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
	3194	* it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
	3195	* nodes, violate the assumption, and they are the only instances where it
	3196	* is violated. I'm reluctant to try to change the assumption, as the
	3197	* code involved is impenetrable to me (khw), so instead the code here
	3198	* punts. This routine examines EXACTFL nodes, and (when the pattern
	3199	* isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
	3200	* boolean indicating whether or not the node contains such a fold. When
	3201	* it is true, the caller sets a flag that later causes the optimizer in
	3202	* this file to not set values for the floating and fixed string lengths,
	3203	* and thus avoids the optimizer code in regexec.c that makes the invalid
	3204	* assumption. Thus, there is no optimization based on string lengths for
	3205	* EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
	3206	* EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
	3207	* assumption is wrong only in these cases is that all other non-UTF-8
	3208	* folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
	3209	* their expanded versions. (Again, we can't prefold sharp s to 'ss' in
	3210	* EXACTF nodes because we don't know at compile time if it actually
	3211	* matches 'ss' or not. For EXACTF nodes it will match iff the target
	3212	* string is in UTF-8. This is in contrast to EXACTFU nodes, where it
	3213	* always matches; and EXACTFA where it never does. In an EXACTFA node in
	3214	* a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
	3215	* problem; but in a non-UTF8 pattern, folding it to that above-Latin1
	3216	* string would require the pattern to be forced into UTF-8, the overhead
	3217	* of which we want to avoid. Similarly the unfolded multi-char folds in
	3218	* EXACTFL nodes will match iff the locale at the time of match is a UTF-8
	3219	* locale.)
	3220	*
	3221	* Similarly, the code that generates tries doesn't currently handle
	3222	* not-already-folded multi-char folds, and it looks like a pain to change
	3223	* that. Therefore, trie generation of EXACTFA nodes with the sharp s
	3224	* doesn't work. Instead, such an EXACTFA is turned into a new regnode,
	3225	* EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
	3226	* using /iaa matching will be doing so almost entirely with ASCII
	3227	* strings, so this should rarely be encountered in practice */
	3228
	3229	#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
	3230	if (PL_regkind[OP(scan)] == EXACT) \
	3231	join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
	3232
	3233	STATIC U32
	3234	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan,
	3235	UV min_subtract, bool unfolded_multi_char,
	3236	U32 flags,regnode *val, U32 depth)
	3237	{
	3238	/* Merge several consecutive EXACTish nodes into one. */
	3239	regnode *n = regnext(scan);
	3240	U32 stringok = 1;
	3241	regnode *next = scan + NODE_SZ_STR(scan);
	3242	U32 merged = 0;
	3243	U32 stopnow = 0;
	3244	#ifdef DEBUGGING
	3245	regnode *stop = scan;
	3246	GET_RE_DEBUG_FLAGS_DECL;
	3247	#else
	3248	PERL_UNUSED_ARG(depth);
	3249	#endif
	3250
	3251	PERL_ARGS_ASSERT_JOIN_EXACT;
	3252	#ifndef EXPERIMENTAL_INPLACESCAN
	3253	PERL_UNUSED_ARG(flags);
	3254	PERL_UNUSED_ARG(val);
	3255	#endif
	3256	DEBUG_PEEP("join",scan,depth);
	3257
	3258	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	3259	* EXACT ones that are mergeable to the current one. */
	3260	while (n
	3261	&& (PL_regkind[OP(n)] == NOTHING
	3262	\|\| (stringok && OP(n) == OP(scan)))
	3263	&& NEXT_OFF(n)
	3264	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	3265	{
	3266
	3267	if (OP(n) == TAIL \|\| n > next)
	3268	stringok = 0;
	3269	if (PL_regkind[OP(n)] == NOTHING) {
	3270	DEBUG_PEEP("skip:",n,depth);
	3271	NEXT_OFF(scan) += NEXT_OFF(n);
	3272	next = n + NODE_STEP_REGNODE;
	3273	#ifdef DEBUGGING
	3274	if (stringok)
	3275	stop = n;
	3276	#endif
	3277	n = regnext(n);
	3278	}
	3279	else if (stringok) {
	3280	const unsigned int oldl = STR_LEN(scan);
	3281	regnode * const nnext = regnext(n);
	3282
	3283	/* XXX I (khw) kind of doubt that this works on platforms (should
	3284	* Perl ever run on one) where U8_MAX is above 255 because of lots
	3285	* of other assumptions */
	3286	/* Don't join if the sum can't fit into a single node */
	3287	if (oldl + STR_LEN(n) > U8_MAX)
	3288	break;
	3289
	3290	DEBUG_PEEP("merg",n,depth);
	3291	merged++;
	3292
	3293	NEXT_OFF(scan) += NEXT_OFF(n);
	3294	STR_LEN(scan) += STR_LEN(n);
	3295	next = n + NODE_SZ_STR(n);
	3296	/* Now we can overwrite n : /
	3297	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	3298	#ifdef DEBUGGING
	3299	stop = next - 1;
	3300	#endif
	3301	n = nnext;
	3302	if (stopnow) break;
	3303	}
	3304
	3305	#ifdef EXPERIMENTAL_INPLACESCAN
	3306	if (flags && !NEXT_OFF(n)) {
	3307	DEBUG_PEEP("atch", val, depth);
	3308	if (reg_off_by_arg[OP(n)]) {
	3309	ARG_SET(n, val - n);
	3310	}
	3311	else {
	3312	NEXT_OFF(n) = val - n;
	3313	}
	3314	stopnow = 1;
	3315	}
	3316	#endif
	3317	}
	3318
	3319	*min_subtract = 0;
	3320	*unfolded_multi_char = FALSE;
	3321
	3322	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	3323	* can now analyze for sequences of problematic code points. (Prior to
	3324	* this final joining, sequences could have been split over boundaries, and
	3325	* hence missed). The sequences only happen in folding, hence for any
	3326	* non-EXACT EXACTish node */
	3327	if (OP(scan) != EXACT) {
	3328	U8* s0 = (U8*) STRING(scan);
	3329	U8* s = s0;
	3330	U8* s_end = s0 + STR_LEN(scan);
	3331
	3332	int total_count_delta = 0; /* Total delta number of characters that
	3333	multi-char folds expand to */
	3334
	3335	/* One pass is made over the node's string looking for all the
	3336	* possibilities. To avoid some tests in the loop, there are two main
	3337	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	3338	* non-UTF-8 */
	3339	if (UTF) {
	3340	U8* folded = NULL;
	3341
	3342	if (OP(scan) == EXACTFL) {
	3343	U8 *d;
	3344
	3345	/* An EXACTFL node would already have been changed to another
	3346	* node type unless there is at least one character in it that
	3347	* is problematic; likely a character whose fold definition
	3348	* won't be known until runtime, and so has yet to be folded.
	3349	* For all but the UTF-8 locale, folds are 1-1 in length, but
	3350	* to handle the UTF-8 case, we need to create a temporary
	3351	* folded copy using UTF-8 locale rules in order to analyze it.
	3352	* This is because our macros that look to see if a sequence is
	3353	* a multi-char fold assume everything is folded (otherwise the
	3354	* tests in those macros would be too complicated and slow).
	3355	* Note that here, the non-problematic folds will have already
	3356	* been done, so we can just copy such characters. We actually
	3357	* don't completely fold the EXACTFL string. We skip the
	3358	* unfolded multi-char folds, as that would just create work
	3359	* below to figure out the size they already are */
	3360
	3361	Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
	3362	d = folded;
	3363	while (s < s_end) {
	3364	STRLEN s_len = UTF8SKIP(s);
	3365	if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
	3366	Copy(s, d, s_len, U8);
	3367	d += s_len;
	3368	}
	3369	else if (is_FOLDS_TO_MULTI_utf8(s)) {
	3370	*unfolded_multi_char = TRUE;
	3371	Copy(s, d, s_len, U8);
	3372	d += s_len;
	3373	}
	3374	else if (isASCII(*s)) {
	3375	(d++) = toFOLD(s);
	3376	}
	3377	else {
	3378	STRLEN len;
	3379	_to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
	3380	d += len;
	3381	}
	3382	s += s_len;
	3383	}
	3384
	3385	/* Point the remainder of the routine to look at our temporary
	3386	* folded copy */
	3387	s = folded;
	3388	s_end = d;
	3389	} /* End of creating folded copy of EXACTFL string */
	3390
	3391	/* Examine the string for a multi-character fold sequence. UTF-8
	3392	* patterns have all characters pre-folded by the time this code is
	3393	* executed */
	3394	while (s < s_end - 1) /* Can stop 1 before the end, as minimum
	3395	length sequence we are looking for is 2 */
	3396	{
	3397	int count = 0; /* How many characters in a multi-char fold */
	3398	int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
	3399	if (! len) { /* Not a multi-char fold: get next char */
	3400	s += UTF8SKIP(s);
	3401	continue;
	3402	}
	3403
	3404	/* Nodes with 'ss' require special handling, except for
	3405	* EXACTFA-ish for which there is no multi-char fold to this */
	3406	if (len == 2 && s == 's' && (s+1) == 's'
	3407	&& OP(scan) != EXACTFA
	3408	&& OP(scan) != EXACTFA_NO_TRIE)
	3409	{
	3410	count = 2;
	3411	if (OP(scan) != EXACTFL) {
	3412	OP(scan) = EXACTFU_SS;
	3413	}
	3414	s += 2;
	3415	}
	3416	else { /* Here is a generic multi-char fold. */
	3417	U8* multi_end = s + len;
	3418
	3419	/* Count how many characters are in it. In the case of
	3420	* /aa, no folds which contain ASCII code points are
	3421	* allowed, so check for those, and skip if found. */
	3422	if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
	3423	count = utf8_length(s, multi_end);
	3424	s = multi_end;
	3425	}
	3426	else {
	3427	while (s < multi_end) {
	3428	if (isASCII(*s)) {
	3429	s++;
	3430	goto next_iteration;
	3431	}
	3432	else {
	3433	s += UTF8SKIP(s);
	3434	}
	3435	count++;
	3436	}
	3437	}
	3438	}
	3439
	3440	/* The delta is how long the sequence is minus 1 (1 is how long
	3441	* the character that folds to the sequence is) */
	3442	total_count_delta += count - 1;
	3443	next_iteration: ;
	3444	}
	3445
	3446	/* We created a temporary folded copy of the string in EXACTFL
	3447	* nodes. Therefore we need to be sure it doesn't go below zero,
	3448	* as the real string could be shorter */
	3449	if (OP(scan) == EXACTFL) {
	3450	int total_chars = utf8_length((U8*) STRING(scan),
	3451	(U8*) STRING(scan) + STR_LEN(scan));
	3452	if (total_count_delta > total_chars) {
	3453	total_count_delta = total_chars;
	3454	}
	3455	}
	3456
	3457	*min_subtract += total_count_delta;
	3458	Safefree(folded);
	3459	}
	3460	else if (OP(scan) == EXACTFA) {
	3461
	3462	/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
	3463	* fold to the ASCII range (and there are no existing ones in the
	3464	* upper latin1 range). But, as outlined in the comments preceding
	3465	* this function, we need to flag any occurrences of the sharp s.
	3466	* This character forbids trie formation (because of added
	3467	* complexity) */
	3468	while (s < s_end) {
	3469	if (*s == LATIN_SMALL_LETTER_SHARP_S) {
	3470	OP(scan) = EXACTFA_NO_TRIE;
	3471	*unfolded_multi_char = TRUE;
	3472	break;
	3473	}
	3474	s++;
	3475	continue;
	3476	}
	3477	}
	3478	else {
	3479
	3480	/* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
	3481	* folds that are all Latin1. As explained in the comments
	3482	* preceding this function, we look also for the sharp s in EXACTF
	3483	* and EXACTFL nodes; it can be in the final position. Otherwise
	3484	* we can stop looking 1 byte earlier because have to find at least
	3485	* two characters for a multi-fold */
	3486	const U8* upper = (OP(scan) == EXACTF \|\| OP(scan) == EXACTFL)
	3487	? s_end
	3488	: s_end -1;
	3489
	3490	while (s < upper) {
	3491	int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
	3492	if (! len) { /* Not a multi-char fold. */
	3493	if (*s == LATIN_SMALL_LETTER_SHARP_S
	3494	&& (OP(scan) == EXACTF \|\| OP(scan) == EXACTFL))
	3495	{
	3496	*unfolded_multi_char = TRUE;
	3497	}
	3498	s++;
	3499	continue;
	3500	}
	3501
	3502	if (len == 2
	3503	&& isALPHA_FOLD_EQ(*s, 's')
	3504	&& isALPHA_FOLD_EQ(*(s+1), 's'))
	3505	{
	3506
	3507	/* EXACTF nodes need to know that the minimum length
	3508	* changed so that a sharp s in the string can match this
	3509	* ss in the pattern, but they remain EXACTF nodes, as they
	3510	* won't match this unless the target string is is UTF-8,
	3511	* which we don't know until runtime. EXACTFL nodes can't
	3512	* transform into EXACTFU nodes */
	3513	if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
	3514	OP(scan) = EXACTFU_SS;
	3515	}
	3516	}
	3517
	3518	*min_subtract += len - 1;
	3519	s += len;
	3520	}
	3521	}
	3522	}
	3523
	3524	#ifdef DEBUGGING
	3525	/* Allow dumping but overwriting the collection of skipped
	3526	* ops and/or strings with fake optimized ops */
	3527	n = scan + NODE_SZ_STR(scan);
	3528	while (n <= stop) {
	3529	OP(n) = OPTIMIZED;
	3530	FLAGS(n) = 0;
	3531	NEXT_OFF(n) = 0;
	3532	n++;
	3533	}
	3534	#endif
	3535	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	3536	return stopnow;
	3537	}
	3538
	3539	/* REx optimizer. Converts nodes into quicker variants "in place".
	3540	Finds fixed substrings. */
	3541
	3542	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	3543	to the position after last scanned or to NULL. */
	3544
	3545	#define INIT_AND_WITHP \
	3546	assert(!and_withp); \
	3547	Newx(and_withp,1, regnode_ssc); \
	3548	SAVEFREEPV(and_withp)
	3549
	3550	/* this is a chain of data about sub patterns we are processing that
	3551	need to be handled separately/specially in study_chunk. Its so
	3552	we can simulate recursion without losing state. */
	3553	struct scan_frame;
	3554	typedef struct scan_frame {
	3555	regnode last; / last node to process in this frame */
	3556	regnode next; / next node to process when last is reached */
	3557	struct scan_frame prev; /previous frame*/
	3558	U32 prev_recursed_depth;
	3559	I32 stop; /* what stopparen do we use */
	3560	} scan_frame;
	3561
	3562
	3563	STATIC SSize_t
	3564	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	3565	SSize_t minlenp, SSize_t deltap,
	3566	regnode *last,
	3567	scan_data_t *data,
	3568	I32 stopparen,
	3569	U32 recursed_depth,
	3570	regnode_ssc *and_withp,
	3571	U32 flags, U32 depth)
	3572	/* scanp: Start here (read-write). */
	3573	/* deltap: Write maxlen-minlen here. */
	3574	/* last: Stop before this one. */
	3575	/* data: string data about the pattern */
	3576	/* stopparen: treat close N as END */
	3577	/* recursed: which subroutines have we recursed into */
	3578	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	3579	{
	3580	/* There must be at least this number of characters to match */
	3581	SSize_t min = 0;
	3582	I32 pars = 0, code;
	3583	regnode scan = scanp, *next;
	3584	SSize_t delta = 0;
	3585	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	3586	int is_inf_internal = 0; /* The studied chunk is infinite */
	3587	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	3588	scan_data_t data_fake;
	3589	SV *re_trie_maxbuff = NULL;
	3590	regnode *first_non_open = scan;
	3591	SSize_t stopmin = SSize_t_MAX;
	3592	scan_frame *frame = NULL;
	3593	GET_RE_DEBUG_FLAGS_DECL;
	3594
	3595	PERL_ARGS_ASSERT_STUDY_CHUNK;
	3596
	3597	#ifdef DEBUGGING
	3598	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3599	#endif
	3600	if ( depth == 0 ) {
	3601	while (first_non_open && OP(first_non_open) == OPEN)
	3602	first_non_open=regnext(first_non_open);
	3603	}
	3604
	3605
	3606	fake_study_recurse:
	3607	while ( scan && OP(scan) != END && scan < last ){
	3608	UV min_subtract = 0; /* How mmany chars to subtract from the minimum
	3609	node length to get a real minimum (because
	3610	the folded version may be shorter) */
	3611	bool unfolded_multi_char = FALSE;
	3612	/* Peephole optimizer: */
	3613	DEBUG_OPTIMISE_MORE_r(
	3614	{
	3615	PerlIO_printf(Perl_debug_log,
	3616	"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
	3617	((int) depth*2), "", (long)stopparen,
	3618	(unsigned long)depth, (unsigned long)recursed_depth);
	3619	if (recursed_depth) {
	3620	U32 i;
	3621	U32 j;
	3622	for ( j = 0 ; j < recursed_depth ; j++ ) {
	3623	PerlIO_printf(Perl_debug_log,"[");
	3624	for ( i = 0 ; i < (U32)RExC_npar ; i++ )
	3625	PerlIO_printf(Perl_debug_log,"%d",
	3626	PAREN_TEST(RExC_study_chunk_recursed +
	3627	(j * RExC_study_chunk_recursed_bytes), i)
	3628	? 1 : 0
	3629	);
	3630	PerlIO_printf(Perl_debug_log,"]");
	3631	}
	3632	}
	3633	PerlIO_printf(Perl_debug_log,"\n");
	3634	}
	3635	);
	3636	DEBUG_STUDYDATA("Peep:", data, depth);
	3637	DEBUG_PEEP("Peep", scan, depth);
	3638
	3639
	3640	/* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
	3641	* which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
	3642	* by a different invocation of reg() -- Yves
	3643	*/
	3644	JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
	3645
	3646	/* Follow the next-chain of the current node and optimize
	3647	away all the NOTHINGs from it. */
	3648	if (OP(scan) != CURLYX) {
	3649	const int max = (reg_off_by_arg[OP(scan)]
	3650	? I32_MAX
	3651	/* I32 may be smaller than U16 on CRAYs! */
	3652	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3653	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3654	int noff;
	3655	regnode *n = scan;
	3656
	3657	/* Skip NOTHING and LONGJMP. */
	3658	while ((n = regnext(n))
	3659	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3660	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3661	&& off + noff < max)
	3662	off += noff;
	3663	if (reg_off_by_arg[OP(scan)])
	3664	ARG(scan) = off;
	3665	else
	3666	NEXT_OFF(scan) = off;
	3667	}
	3668
	3669
	3670
	3671	/* The principal pseudo-switch. Cannot be a switch, since we
	3672	look into several different things. */
	3673	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3674	\|\| OP(scan) == IFTHEN) {
	3675	next = regnext(scan);
	3676	code = OP(scan);
	3677	/* demq: the op(next)==code check is to see if we have
	3678	* "branch-branch" AFAICT */
	3679
	3680	if (OP(next) == code \|\| code == IFTHEN) {
	3681	/* NOTE - There is similar code to this block below for
	3682	* handling TRIE nodes on a re-study. If you change stuff here
	3683	* check there too. */
	3684	SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
	3685	regnode_ssc accum;
	3686	regnode * const startbranch=scan;
	3687
	3688	if (flags & SCF_DO_SUBSTR) {
	3689	/* Cannot merge strings after this. */
	3690	scan_commit(pRExC_state, data, minlenp, is_inf);
	3691	}
	3692
	3693	if (flags & SCF_DO_STCLASS)
	3694	ssc_init_zero(pRExC_state, &accum);
	3695
	3696	while (OP(scan) == code) {
	3697	SSize_t deltanext, minnext, fake;
	3698	I32 f = 0;
	3699	regnode_ssc this_class;
	3700
	3701	num++;
	3702	data_fake.flags = 0;
	3703	if (data) {
	3704	data_fake.whilem_c = data->whilem_c;
	3705	data_fake.last_closep = data->last_closep;
	3706	}
	3707	else
	3708	data_fake.last_closep = &fake;
	3709
	3710	data_fake.pos_delta = delta;
	3711	next = regnext(scan);
	3712	scan = NEXTOPER(scan);
	3713	if (code != BRANCH)
	3714	scan = NEXTOPER(scan);
	3715	if (flags & SCF_DO_STCLASS) {
	3716	ssc_init(pRExC_state, &this_class);
	3717	data_fake.start_class = &this_class;
	3718	f = SCF_DO_STCLASS_AND;
	3719	}
	3720	if (flags & SCF_WHILEM_VISITED_POS)
	3721	f \|= SCF_WHILEM_VISITED_POS;
	3722
	3723	/* we suppose the run is continuous, last=next...*/
	3724	minnext = study_chunk(pRExC_state, &scan, minlenp,
	3725	&deltanext, next, &data_fake, stopparen,
	3726	recursed_depth, NULL, f,depth+1);
	3727	if (min1 > minnext)
	3728	min1 = minnext;
	3729	if (deltanext == SSize_t_MAX) {
	3730	is_inf = is_inf_internal = 1;
	3731	max1 = SSize_t_MAX;
	3732	} else if (max1 < minnext + deltanext)
	3733	max1 = minnext + deltanext;
	3734	scan = next;
	3735	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3736	pars++;
	3737	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3738	if ( stopmin > minnext)
	3739	stopmin = min + min1;
	3740	flags &= ~SCF_DO_SUBSTR;
	3741	if (data)
	3742	data->flags \|= SCF_SEEN_ACCEPT;
	3743	}
	3744	if (data) {
	3745	if (data_fake.flags & SF_HAS_EVAL)
	3746	data->flags \|= SF_HAS_EVAL;
	3747	data->whilem_c = data_fake.whilem_c;
	3748	}
	3749	if (flags & SCF_DO_STCLASS)
	3750	ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
	3751	}
	3752	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3753	min1 = 0;
	3754	if (flags & SCF_DO_SUBSTR) {
	3755	data->pos_min += min1;
	3756	if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
	3757	data->pos_delta = SSize_t_MAX;
	3758	else
	3759	data->pos_delta += max1 - min1;
	3760	if (max1 != min1 \|\| is_inf)
	3761	data->longest = &(data->longest_float);
	3762	}
	3763	min += min1;
	3764	if (delta == SSize_t_MAX
	3765	\|\| SSize_t_MAX - delta - (max1 - min1) < 0)
	3766	delta = SSize_t_MAX;
	3767	else
	3768	delta += max1 - min1;
	3769	if (flags & SCF_DO_STCLASS_OR) {
	3770	ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
	3771	if (min1) {
	3772	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	3773	flags &= ~SCF_DO_STCLASS;
	3774	}
	3775	}
	3776	else if (flags & SCF_DO_STCLASS_AND) {
	3777	if (min1) {
	3778	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	3779	flags &= ~SCF_DO_STCLASS;
	3780	}
	3781	else {
	3782	/* Switch to OR mode: cache the old value of
	3783	* data->start_class */
	3784	INIT_AND_WITHP;
	3785	StructCopy(data->start_class, and_withp, regnode_ssc);
	3786	flags &= ~SCF_DO_STCLASS_AND;
	3787	StructCopy(&accum, data->start_class, regnode_ssc);
	3788	flags \|= SCF_DO_STCLASS_OR;
	3789	}
	3790	}
	3791
	3792	if (PERL_ENABLE_TRIE_OPTIMISATION &&
	3793	OP( startbranch ) == BRANCH )
	3794	{
	3795	/* demq.
	3796
	3797	Assuming this was/is a branch we are dealing with: 'scan'
	3798	now points at the item that follows the branch sequence,
	3799	whatever it is. We now start at the beginning of the
	3800	sequence and look for subsequences of
	3801
	3802	BRANCH->EXACT=>x1
	3803	BRANCH->EXACT=>x2
	3804	tail
	3805
	3806	which would be constructed from a pattern like
	3807	/A\|LIST\|OF\|WORDS/
	3808
	3809	If we can find such a subsequence we need to turn the first
	3810	element into a trie and then add the subsequent branch exact
	3811	strings to the trie.
	3812
	3813	We have two cases
	3814
	3815	1. patterns where the whole set of branches can be
	3816	converted.
	3817
	3818	2. patterns where only a subset can be converted.
	3819
	3820	In case 1 we can replace the whole set with a single regop
	3821	for the trie. In case 2 we need to keep the start and end
	3822	branches so
	3823
	3824	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3825	becomes BRANCH TRIE; BRANCH X;
	3826
	3827	There is an additional case, that being where there is a
	3828	common prefix, which gets split out into an EXACT like node
	3829	preceding the TRIE node.
	3830
	3831	If x(1..n)==tail then we can do a simple trie, if not we make
	3832	a "jump" trie, such that when we match the appropriate word
	3833	we "jump" to the appropriate tail node. Essentially we turn
	3834	a nested if into a case structure of sorts.
	3835
	3836	*/
	3837
	3838	int made=0;
	3839	if (!re_trie_maxbuff) {
	3840	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3841	if (!SvIOK(re_trie_maxbuff))
	3842	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3843	}
	3844	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3845	regnode *cur;
	3846	regnode first = (regnode )NULL;
	3847	regnode last = (regnode )NULL;
	3848	regnode *tail = scan;
	3849	U8 trietype = 0;
	3850	U32 count=0;
	3851
	3852	#ifdef DEBUGGING
	3853	SV * const mysv = sv_newmortal(); /* for dumping */
	3854	#endif
	3855	/* var tail is used because there may be a TAIL
	3856	regop in the way. Ie, the exacts will point to the
	3857	thing following the TAIL, but the last branch will
	3858	point at the TAIL. So we advance tail. If we
	3859	have nested (?:) we may have to move through several
	3860	tails.
	3861	*/
	3862
	3863	while ( OP( tail ) == TAIL ) {
	3864	/* this is the TAIL generated by (?:) */
	3865	tail = regnext( tail );
	3866	}
	3867
	3868
	3869	DEBUG_TRIE_COMPILE_r({
	3870	regprop(RExC_rx, mysv, tail, NULL);
	3871	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3872	(int)depth * 2 + 2, "",
	3873	"Looking for TRIE'able sequences. Tail node is: ",
	3874	SvPV_nolen_const( mysv )
	3875	);
	3876	});
	3877
	3878	/*
	3879
	3880	Step through the branches
	3881	cur represents each branch,
	3882	noper is the first thing to be matched as part
	3883	of that branch
	3884	noper_next is the regnext() of that node.
	3885
	3886	We normally handle a case like this
	3887	/FOO[xyz]\|BAR[pqr]/ via a "jump trie" but we also
	3888	support building with NOJUMPTRIE, which restricts
	3889	the trie logic to structures like /FOO\|BAR/.
	3890
	3891	If noper is a trieable nodetype then the branch is
	3892	a possible optimization target. If we are building
	3893	under NOJUMPTRIE then we require that noper_next is
	3894	the same as scan (our current position in the regex
	3895	program).
	3896
	3897	Once we have two or more consecutive such branches
	3898	we can create a trie of the EXACT's contents and
	3899	stitch it in place into the program.
	3900
	3901	If the sequence represents all of the branches in
	3902	the alternation we replace the entire thing with a
	3903	single TRIE node.
	3904
	3905	Otherwise when it is a subsequence we need to
	3906	stitch it in place and replace only the relevant
	3907	branches. This means the first branch has to remain
	3908	as it is used by the alternation logic, and its
	3909	next pointer, and needs to be repointed at the item
	3910	on the branch chain following the last branch we
	3911	have optimized away.
	3912
	3913	This could be either a BRANCH, in which case the
	3914	subsequence is internal, or it could be the item
	3915	following the branch sequence in which case the
	3916	subsequence is at the end (which does not
	3917	necessarily mean the first node is the start of the
	3918	alternation).
	3919
	3920	TRIE_TYPE(X) is a define which maps the optype to a
	3921	trietype.
	3922
	3923	optype \| trietype
	3924	----------------+-----------
	3925	NOTHING \| NOTHING
	3926	EXACT \| EXACT
	3927	EXACTFU \| EXACTFU
	3928	EXACTFU_SS \| EXACTFU
	3929	EXACTFA \| EXACTFA
	3930
	3931
	3932	*/
	3933	#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
	3934	( EXACT == (X) ) ? EXACT : \
	3935	( EXACTFU == (X) \|\| EXACTFU_SS == (X) ) ? EXACTFU : \
	3936	( EXACTFA == (X) ) ? EXACTFA : \
	3937	0 )
	3938
	3939	/* dont use tail as the end marker for this traverse */
	3940	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3941	regnode * const noper = NEXTOPER( cur );
	3942	U8 noper_type = OP( noper );
	3943	U8 noper_trietype = TRIE_TYPE( noper_type );
	3944	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3945	regnode * const noper_next = regnext( noper );
	3946	U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
	3947	U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
	3948	#endif
	3949
	3950	DEBUG_TRIE_COMPILE_r({
	3951	regprop(RExC_rx, mysv, cur, NULL);
	3952	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3953	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3954
	3955	regprop(RExC_rx, mysv, noper, NULL);
	3956	PerlIO_printf( Perl_debug_log, " -> %s",
	3957	SvPV_nolen_const(mysv));
	3958
	3959	if ( noper_next ) {
	3960	regprop(RExC_rx, mysv, noper_next, NULL);
	3961	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3962	SvPV_nolen_const(mysv));
	3963	}
	3964	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
	3965	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
	3966	PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
	3967	);
	3968	});
	3969
	3970	/* Is noper a trieable nodetype that can be merged
	3971	* with the current trie (if there is one)? */
	3972	if ( noper_trietype
	3973	&&
	3974	(
	3975	( noper_trietype == NOTHING)
	3976	\|\| ( trietype == NOTHING )
	3977	\|\| ( trietype == noper_trietype )
	3978	)
	3979	#ifdef NOJUMPTRIE
	3980	&& noper_next == tail
	3981	#endif
	3982	&& count < U16_MAX)
	3983	{
	3984	/* Handle mergable triable node Either we are
	3985	* the first node in a new trieable sequence,
	3986	* in which case we do some bookkeeping,
	3987	* otherwise we update the end pointer. */
	3988	if ( !first ) {
	3989	first = cur;
	3990	if ( noper_trietype == NOTHING ) {
	3991	#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
	3992	regnode * const noper_next = regnext( noper );
	3993	U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
	3994	U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
	3995	#endif
	3996
	3997	if ( noper_next_trietype ) {
	3998	trietype = noper_next_trietype;
	3999	} else if (noper_next_type) {
	4000	/* a NOTHING regop is 1 regop wide.
	4001	* We need at least two for a trie
	4002	* so we can't merge this in */
	4003	first = NULL;
	4004	}
	4005	} else {
	4006	trietype = noper_trietype;
	4007	}
	4008	} else {
	4009	if ( trietype == NOTHING )
	4010	trietype = noper_trietype;
	4011	last = cur;
	4012	}
	4013	if (first)
	4014	count++;
	4015	} /* end handle mergable triable node */
	4016	else {
	4017	/* handle unmergable node -
	4018	* noper may either be a triable node which can
	4019	* not be tried together with the current trie,
	4020	* or a non triable node */
	4021	if ( last ) {
	4022	/* If last is set and trietype is not
	4023	* NOTHING then we have found at least two
	4024	* triable branch sequences in a row of a
	4025	* similar trietype so we can turn them
	4026	* into a trie. If/when we allow NOTHING to
	4027	* start a trie sequence this condition
	4028	* will be required, and it isn't expensive
	4029	* so we leave it in for now. */
	4030	if ( trietype && trietype != NOTHING )
	4031	make_trie( pRExC_state,
	4032	startbranch, first, cur, tail,
	4033	count, trietype, depth+1 );
	4034	last = NULL; /* note: we clear/update
	4035	first, trietype etc below,
	4036	so we dont do it here */
	4037	}
	4038	if ( noper_trietype
	4039	#ifdef NOJUMPTRIE
	4040	&& noper_next == tail
	4041	#endif
	4042	){
	4043	/* noper is triable, so we can start a new
	4044	* trie sequence */
	4045	count = 1;
	4046	first = cur;
	4047	trietype = noper_trietype;
	4048	} else if (first) {
	4049	/* if we already saw a first but the
	4050	* current node is not triable then we have
	4051	* to reset the first information. */
	4052	count = 0;
	4053	first = NULL;
	4054	trietype = 0;
	4055	}
	4056	} /* end handle unmergable node */
	4057	} /* loop over branches */
	4058	DEBUG_TRIE_COMPILE_r({
	4059	regprop(RExC_rx, mysv, cur, NULL);
	4060	PerlIO_printf( Perl_debug_log,
	4061	"%*s- %s (%d) <SCAN FINISHED>\n",
	4062	(int)depth * 2 + 2,
	4063	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	4064
	4065	});
	4066	if ( last && trietype ) {
	4067	if ( trietype != NOTHING ) {
	4068	/* the last branch of the sequence was part of
	4069	* a trie, so we have to construct it here
	4070	* outside of the loop */
	4071	made= make_trie( pRExC_state, startbranch,
	4072	first, scan, tail, count,
	4073	trietype, depth+1 );
	4074	#ifdef TRIE_STUDY_OPT
	4075	if ( ((made == MADE_EXACT_TRIE &&
	4076	startbranch == first)
	4077	\|\| ( first_non_open == first )) &&
	4078	depth==0 ) {
	4079	flags \|= SCF_TRIE_RESTUDY;
	4080	if ( startbranch == first
	4081	&& scan == tail )
	4082	{
	4083	RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
	4084	}
	4085	}
	4086	#endif
	4087	} else {
	4088	/* at this point we know whatever we have is a
	4089	* NOTHING sequence/branch AND if 'startbranch'
	4090	* is 'first' then we can turn the whole thing
	4091	* into a NOTHING
	4092	*/
	4093	if ( startbranch == first ) {
	4094	regnode *opt;
	4095	/* the entire thing is a NOTHING sequence,
	4096	* something like this: (?:\|) So we can
	4097	* turn it into a plain NOTHING op. */
	4098	DEBUG_TRIE_COMPILE_r({
	4099	regprop(RExC_rx, mysv, cur, NULL);
	4100	PerlIO_printf( Perl_debug_log,
	4101	"%s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth 2 + 2,
	4102	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	4103
	4104	});
	4105	OP(startbranch)= NOTHING;
	4106	NEXT_OFF(startbranch)= tail - startbranch;
	4107	for ( opt= startbranch + 1; opt < tail ; opt++ )
	4108	OP(opt)= OPTIMIZED;
	4109	}
	4110	}
	4111	} /* end if ( last) */
	4112	} /* TRIE_MAXBUF is non zero */
	4113
	4114	} /* do trie */
	4115
	4116	}
	4117	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	4118	scan = NEXTOPER(NEXTOPER(scan));
	4119	} else /* single branch is optimized. */
	4120	scan = NEXTOPER(scan);
	4121	continue;
	4122	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	4123	scan_frame *newframe = NULL;
	4124	I32 paren;
	4125	regnode *start;
	4126	regnode *end;
	4127	U32 my_recursed_depth= recursed_depth;
	4128
	4129	if (OP(scan) != SUSPEND) {
	4130	/* set the pointer */
	4131	if (OP(scan) == GOSUB) {
	4132	paren = ARG(scan);
	4133	RExC_recurse[ARG2L(scan)] = scan;
	4134	start = RExC_open_parens[paren-1];
	4135	end = RExC_close_parens[paren-1];
	4136	} else {
	4137	paren = 0;
	4138	start = RExC_rxi->program + 1;
	4139	end = RExC_opend;
	4140	}
	4141	if (!recursed_depth
	4142	\|\|
	4143	!PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
	4144	) {
	4145	if (!recursed_depth) {
	4146	Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
	4147	} else {
	4148	Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
	4149	RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
	4150	RExC_study_chunk_recursed_bytes, U8);
	4151	}
	4152	/* we havent recursed into this paren yet, so recurse into it */
	4153	DEBUG_STUDYDATA("set:", data,depth);
	4154	PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
	4155	my_recursed_depth= recursed_depth + 1;
	4156	Newx(newframe,1,scan_frame);
	4157	} else {
	4158	DEBUG_STUDYDATA("inf:", data,depth);
	4159	/* some form of infinite recursion, assume infinite length
	4160	* */
	4161	if (flags & SCF_DO_SUBSTR) {
	4162	scan_commit(pRExC_state, data, minlenp, is_inf);
	4163	data->longest = &(data->longest_float);
	4164	}
	4165	is_inf = is_inf_internal = 1;
	4166	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4167	ssc_anything(data->start_class);
	4168	flags &= ~SCF_DO_STCLASS;
	4169	}
	4170	} else {
	4171	Newx(newframe,1,scan_frame);
	4172	paren = stopparen;
	4173	start = scan+2;
	4174	end = regnext(scan);
	4175	}
	4176	if (newframe) {
	4177	assert(start);
	4178	assert(end);
	4179	SAVEFREEPV(newframe);
	4180	newframe->next = regnext(scan);
	4181	newframe->last = last;
	4182	newframe->stop = stopparen;
	4183	newframe->prev = frame;
	4184	newframe->prev_recursed_depth = recursed_depth;
	4185
	4186	DEBUG_STUDYDATA("frame-new:",data,depth);
	4187	DEBUG_PEEP("fnew", scan, depth);
	4188
	4189	frame = newframe;
	4190	scan = start;
	4191	stopparen = paren;
	4192	last = end;
	4193	depth = depth + 1;
	4194	recursed_depth= my_recursed_depth;
	4195
	4196	continue;
	4197	}
	4198	}
	4199	else if (OP(scan) == EXACT) {
	4200	SSize_t l = STR_LEN(scan);
	4201	UV uc;
	4202	if (UTF) {
	4203	const U8 * const s = (U8*)STRING(scan);
	4204	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	4205	l = utf8_length(s, s + l);
	4206	} else {
	4207	uc = ((U8)STRING(scan));
	4208	}
	4209	min += l;
	4210	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	4211	/* The code below prefers earlier match for fixed
	4212	offset, later match for variable offset. */
	4213	if (data->last_end == -1) { /* Update the start info. */
	4214	data->last_start_min = data->pos_min;
	4215	data->last_start_max = is_inf
	4216	? SSize_t_MAX : data->pos_min + data->pos_delta;
	4217	}
	4218	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	4219	if (UTF)
	4220	SvUTF8_on(data->last_found);
	4221	{
	4222	SV * const sv = data->last_found;
	4223	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	4224	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4225	if (mg && mg->mg_len >= 0)
	4226	mg->mg_len += utf8_length((U8*)STRING(scan),
	4227	(U8*)STRING(scan)+STR_LEN(scan));
	4228	}
	4229	data->last_end = data->pos_min + l;
	4230	data->pos_min += l; /* As in the first entry. */
	4231	data->flags &= ~SF_BEFORE_EOL;
	4232	}
	4233
	4234	/* ANDing the code point leaves at most it, and not in locale, and
	4235	* can't match null string */
	4236	if (flags & SCF_DO_STCLASS_AND) {
	4237	ssc_cp_and(data->start_class, uc);
	4238	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4239	ssc_clear_locale(data->start_class);
	4240	}
	4241	else if (flags & SCF_DO_STCLASS_OR) {
	4242	ssc_add_cp(data->start_class, uc);
	4243	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	4244
	4245	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	4246	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4247	}
	4248	flags &= ~SCF_DO_STCLASS;
	4249	}
	4250	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
	4251	EXACTFish */
	4252	SSize_t l = STR_LEN(scan);
	4253	UV uc = ((U8)STRING(scan));
	4254	SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
	4255	separate code points */
	4256	const U8 * s = (U8*)STRING(scan);
	4257
	4258	/* Search for fixed substrings supports EXACT only. */
	4259	if (flags & SCF_DO_SUBSTR) {
	4260	assert(data);
	4261	scan_commit(pRExC_state, data, minlenp, is_inf);
	4262	}
	4263	if (UTF) {
	4264	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	4265	l = utf8_length(s, s + l);
	4266	}
	4267	if (unfolded_multi_char) {
	4268	RExC_seen \|= REG_UNFOLDED_MULTI_SEEN;
	4269	}
	4270	min += l - min_subtract;
	4271	assert (min >= 0);
	4272	delta += min_subtract;
	4273	if (flags & SCF_DO_SUBSTR) {
	4274	data->pos_min += l - min_subtract;
	4275	if (data->pos_min < 0) {
	4276	data->pos_min = 0;
	4277	}
	4278	data->pos_delta += min_subtract;
	4279	if (min_subtract) {
	4280	data->longest = &(data->longest_float);
	4281	}
	4282	}
	4283
	4284	if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
	4285	ssc_clear_locale(data->start_class);
	4286	}
	4287
	4288	if (! UTF) {
	4289
	4290	/* We punt and assume can match anything if the node begins
	4291	* with a multi-character fold. Things are complicated. For
	4292	* example, /ffi/i could match any of:
	4293	* "\N{LATIN SMALL LIGATURE FFI}"
	4294	* "\N{LATIN SMALL LIGATURE FF}I"
	4295	* "F\N{LATIN SMALL LIGATURE FI}"
	4296	* plus several other things; and making sure we have all the
	4297	* possibilities is hard. */
	4298	if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
	4299	EXACTF_invlist =
	4300	_add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
	4301	}
	4302	else {
	4303
	4304	/* Any Latin1 range character can potentially match any
	4305	* other depending on the locale */
	4306	if (OP(scan) == EXACTFL) {
	4307	_invlist_union(EXACTF_invlist, PL_Latin1,
	4308	&EXACTF_invlist);
	4309	}
	4310	else {
	4311	/* But otherwise, it matches at least itself. We can
	4312	* quickly tell if it has a distinct fold, and if so,
	4313	* it matches that as well */
	4314	EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
	4315	if (IS_IN_SOME_FOLD_L1(uc)) {
	4316	EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
	4317	PL_fold_latin1[uc]);
	4318	}
	4319	}
	4320
	4321	/* Some characters match above-Latin1 ones under /i. This
	4322	* is true of EXACTFL ones when the locale is UTF-8 */
	4323	if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
	4324	&& (! isASCII(uc) \|\| (OP(scan) != EXACTFA
	4325	&& OP(scan) != EXACTFA_NO_TRIE)))
	4326	{
	4327	add_above_Latin1_folds(pRExC_state,
	4328	(U8) uc,
	4329	&EXACTF_invlist);
	4330	}
	4331	}
	4332	}
	4333	else { /* Pattern is UTF-8 */
	4334	U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
	4335	STRLEN foldlen = UTF8SKIP(s);
	4336	const U8* e = s + STR_LEN(scan);
	4337	SV** listp;
	4338
	4339	/* The only code points that aren't folded in a UTF EXACTFish
	4340	* node are are the problematic ones in EXACTFL nodes */
	4341	if (OP(scan) == EXACTFL
	4342	&& is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
	4343	{
	4344	/* We need to check for the possibility that this EXACTFL
	4345	* node begins with a multi-char fold. Therefore we fold
	4346	* the first few characters of it so that we can make that
	4347	* check */
	4348	U8 *d = folded;
	4349	int i;
	4350
	4351	for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
	4352	if (isASCII(*s)) {
	4353	(d++) = (U8) toFOLD(s);
	4354	s++;
	4355	}
	4356	else {
	4357	STRLEN len;
	4358	to_utf8_fold(s, d, &len);
	4359	d += len;
	4360	s += UTF8SKIP(s);
	4361	}
	4362	}
	4363
	4364	/* And set up so the code below that looks in this folded
	4365	* buffer instead of the node's string */
	4366	e = d;
	4367	foldlen = UTF8SKIP(folded);
	4368	s = folded;
	4369	}
	4370
	4371	/* When we reach here 's' points to the fold of the first
	4372	* character(s) of the node; and 'e' points to far enough along
	4373	* the folded string to be just past any possible multi-char
	4374	* fold. 'foldlen' is the length in bytes of the first
	4375	* character in 's'
	4376	*
	4377	* Unlike the non-UTF-8 case, the macro for determining if a
	4378	* string is a multi-char fold requires all the characters to
	4379	* already be folded. This is because of all the complications
	4380	* if not. Note that they are folded anyway, except in EXACTFL
	4381	* nodes. Like the non-UTF case above, we punt if the node
	4382	* begins with a multi-char fold */
	4383
	4384	if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
	4385	EXACTF_invlist =
	4386	_add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
	4387	}
	4388	else { /* Single char fold */
	4389
	4390	/* It matches all the things that fold to it, which are
	4391	* found in PL_utf8_foldclosures (including itself) */
	4392	EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
	4393	if (! PL_utf8_foldclosures) {
	4394	_load_PL_utf8_foldclosures();
	4395	}
	4396	if ((listp = hv_fetch(PL_utf8_foldclosures,
	4397	(char *) s, foldlen, FALSE)))
	4398	{
	4399	AV* list = (AV) listp;
	4400	IV k;
	4401	for (k = 0; k <= av_tindex(list); k++) {
	4402	SV** c_p = av_fetch(list, k, FALSE);
	4403	UV c;
	4404	assert(c_p);
	4405
	4406	c = SvUV(*c_p);
	4407
	4408	/* /aa doesn't allow folds between ASCII and non- */
	4409	if ((OP(scan) == EXACTFA \|\| OP(scan) == EXACTFA_NO_TRIE)
	4410	&& isASCII(c) != isASCII(uc))
	4411	{
	4412	continue;
	4413	}
	4414
	4415	EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
	4416	}
	4417	}
	4418	}
	4419	}
	4420	if (flags & SCF_DO_STCLASS_AND) {
	4421	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4422	ANYOF_POSIXL_ZERO(data->start_class);
	4423	ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
	4424	}
	4425	else if (flags & SCF_DO_STCLASS_OR) {
	4426	ssc_union(data->start_class, EXACTF_invlist, FALSE);
	4427	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	4428
	4429	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	4430	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4431	}
	4432	flags &= ~SCF_DO_STCLASS;
	4433	SvREFCNT_dec(EXACTF_invlist);
	4434	}
	4435	else if (REGNODE_VARIES(OP(scan))) {
	4436	SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
	4437	I32 fl = 0, f = flags;
	4438	regnode * const oscan = scan;
	4439	regnode_ssc this_class;
	4440	regnode_ssc *oclass = NULL;
	4441	I32 next_is_eval = 0;
	4442
	4443	switch (PL_regkind[OP(scan)]) {
	4444	case WHILEM: /* End of (?:...)* . */
	4445	scan = NEXTOPER(scan);
	4446	goto finish;
	4447	case PLUS:
	4448	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	4449	next = NEXTOPER(scan);
	4450	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	4451	mincount = 1;
	4452	maxcount = REG_INFTY;
	4453	next = regnext(scan);
	4454	scan = NEXTOPER(scan);
	4455	goto do_curly;
	4456	}
	4457	}
	4458	if (flags & SCF_DO_SUBSTR)
	4459	data->pos_min++;
	4460	min++;
	4461	/* FALLTHROUGH */
	4462	case STAR:
	4463	if (flags & SCF_DO_STCLASS) {
	4464	mincount = 0;
	4465	maxcount = REG_INFTY;
	4466	next = regnext(scan);
	4467	scan = NEXTOPER(scan);
	4468	goto do_curly;
	4469	}
	4470	if (flags & SCF_DO_SUBSTR) {
	4471	scan_commit(pRExC_state, data, minlenp, is_inf);
	4472	/* Cannot extend fixed substrings */
	4473	data->longest = &(data->longest_float);
	4474	}
	4475	is_inf = is_inf_internal = 1;
	4476	scan = regnext(scan);
	4477	goto optimize_curly_tail;
	4478	case CURLY:
	4479	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	4480	&& (scan->flags == stopparen))
	4481	{
	4482	mincount = 1;
	4483	maxcount = 1;
	4484	} else {
	4485	mincount = ARG1(scan);
	4486	maxcount = ARG2(scan);
	4487	}
	4488	next = regnext(scan);
	4489	if (OP(scan) == CURLYX) {
	4490	I32 lp = (data ? *(data->last_closep) : 0);
	4491	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	4492	}
	4493	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	4494	next_is_eval = (OP(scan) == EVAL);
	4495	do_curly:
	4496	if (flags & SCF_DO_SUBSTR) {
	4497	if (mincount == 0)
	4498	scan_commit(pRExC_state, data, minlenp, is_inf);
	4499	/* Cannot extend fixed substrings */
	4500	pos_before = data->pos_min;
	4501	}
	4502	if (data) {
	4503	fl = data->flags;
	4504	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	4505	if (is_inf)
	4506	data->flags \|= SF_IS_INF;
	4507	}
	4508	if (flags & SCF_DO_STCLASS) {
	4509	ssc_init(pRExC_state, &this_class);
	4510	oclass = data->start_class;
	4511	data->start_class = &this_class;
	4512	f \|= SCF_DO_STCLASS_AND;
	4513	f &= ~SCF_DO_STCLASS_OR;
	4514	}
	4515	/* Exclude from super-linear cache processing any {n,m}
	4516	regops for which the combination of input pos and regex
	4517	pos is not enough information to determine if a match
	4518	will be possible.
	4519
	4520	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	4521	regex pos at the \s*, the prospects for a match depend not
	4522	only on the input position but also on how many (bar\s*)
	4523	repeats into the {4,8} we are. */
	4524	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	4525	f &= ~SCF_WHILEM_VISITED_POS;
	4526
	4527	/* This will finish on WHILEM, setting scan, or on NULL: */
	4528	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	4529	last, data, stopparen, recursed_depth, NULL,
	4530	(mincount == 0
	4531	? (f & ~SCF_DO_SUBSTR)
	4532	: f)
	4533	,depth+1);
	4534
	4535	if (flags & SCF_DO_STCLASS)
	4536	data->start_class = oclass;
	4537	if (mincount == 0 \|\| minnext == 0) {
	4538	if (flags & SCF_DO_STCLASS_OR) {
	4539	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	4540	}
	4541	else if (flags & SCF_DO_STCLASS_AND) {
	4542	/* Switch to OR mode: cache the old value of
	4543	* data->start_class */
	4544	INIT_AND_WITHP;
	4545	StructCopy(data->start_class, and_withp, regnode_ssc);
	4546	flags &= ~SCF_DO_STCLASS_AND;
	4547	StructCopy(&this_class, data->start_class, regnode_ssc);
	4548	flags \|= SCF_DO_STCLASS_OR;
	4549	ANYOF_FLAGS(data->start_class) \|= ANYOF_EMPTY_STRING;
	4550	}
	4551	} else { /* Non-zero len */
	4552	if (flags & SCF_DO_STCLASS_OR) {
	4553	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	4554	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	4555	}
	4556	else if (flags & SCF_DO_STCLASS_AND)
	4557	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	4558	flags &= ~SCF_DO_STCLASS;
	4559	}
	4560	if (!scan) /* It was not CURLYX, but CURLY. */
	4561	scan = next;
	4562	if (!(flags & SCF_TRIE_DOING_RESTUDY)
	4563	/* ? quantifier ok, except for (?{ ... }) */
	4564	&& (next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	4565	&& (minnext == 0) && (deltanext == 0)
	4566	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	4567	&& maxcount <= REG_INFTY/3) /* Complement check for big
	4568	count */
	4569	{
	4570	/* Fatal warnings may leak the regexp without this: */
	4571	SAVEFREESV(RExC_rx_sv);
	4572	ckWARNreg(RExC_parse,
	4573	"Quantifier unexpected on zero-length expression");
	4574	(void)ReREFCNT_inc(RExC_rx_sv);
	4575	}
	4576
	4577	min += minnext * mincount;
	4578	is_inf_internal \|= deltanext == SSize_t_MAX
	4579	\|\| (maxcount == REG_INFTY && minnext + deltanext > 0);
	4580	is_inf \|= is_inf_internal;
	4581	if (is_inf) {
	4582	delta = SSize_t_MAX;
	4583	} else {
	4584	delta += (minnext + deltanext) * maxcount
	4585	- minnext * mincount;
	4586	}
	4587	/* Try powerful optimization CURLYX => CURLYN. */
	4588	if ( OP(oscan) == CURLYX && data
	4589	&& data->flags & SF_IN_PAR
	4590	&& !(data->flags & SF_HAS_EVAL)
	4591	&& !deltanext && minnext == 1 ) {
	4592	/* Try to optimize to CURLYN. */
	4593	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	4594	regnode * const nxt1 = nxt;
	4595	#ifdef DEBUGGING
	4596	regnode *nxt2;
	4597	#endif
	4598
	4599	/* Skip open. */
	4600	nxt = regnext(nxt);
	4601	if (!REGNODE_SIMPLE(OP(nxt))
	4602	&& !(PL_regkind[OP(nxt)] == EXACT
	4603	&& STR_LEN(nxt) == 1))
	4604	goto nogo;
	4605	#ifdef DEBUGGING
	4606	nxt2 = nxt;
	4607	#endif
	4608	nxt = regnext(nxt);
	4609	if (OP(nxt) != CLOSE)
	4610	goto nogo;
	4611	if (RExC_open_parens) {
	4612	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	4613	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	4614	}
	4615	/* Now we know that nxt2 is the only contents: */
	4616	oscan->flags = (U8)ARG(nxt);
	4617	OP(oscan) = CURLYN;
	4618	OP(nxt1) = NOTHING; /* was OPEN. */
	4619
	4620	#ifdef DEBUGGING
	4621	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	4622	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	4623	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	4624	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	4625	OP(nxt + 1) = OPTIMIZED; /* was count. */
	4626	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	4627	#endif
	4628	}
	4629	nogo:
	4630
	4631	/* Try optimization CURLYX => CURLYM. */
	4632	if ( OP(oscan) == CURLYX && data
	4633	&& !(data->flags & SF_HAS_PAR)
	4634	&& !(data->flags & SF_HAS_EVAL)
	4635	&& !deltanext /* atom is fixed width */
	4636	&& minnext != 0 /* CURLYM can't handle zero width */
	4637
	4638	/* Nor characters whose fold at run-time may be
	4639	* multi-character */
	4640	&& ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
	4641	) {
	4642	/* XXXX How to optimize if data == 0? */
	4643	/* Optimize to a simpler form. */
	4644	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	4645	regnode *nxt2;
	4646
	4647	OP(oscan) = CURLYM;
	4648	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	4649	&& (OP(nxt2) != WHILEM))
	4650	nxt = nxt2;
	4651	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	4652	/* Need to optimize away parenths. */
	4653	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	4654	/* Set the parenth number. */
	4655	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	4656
	4657	oscan->flags = (U8)ARG(nxt);
	4658	if (RExC_open_parens) {
	4659	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	4660	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	4661	}
	4662	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	4663	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	4664
	4665	#ifdef DEBUGGING
	4666	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	4667	OP(nxt + 1) = OPTIMIZED; /* was count. */
	4668	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	4669	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	4670	#endif
	4671	#if 0
	4672	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	4673	regnode *nnxt = regnext(nxt1);
	4674	if (nnxt == nxt) {
	4675	if (reg_off_by_arg[OP(nxt1)])
	4676	ARG_SET(nxt1, nxt2 - nxt1);
	4677	else if (nxt2 - nxt1 < U16_MAX)
	4678	NEXT_OFF(nxt1) = nxt2 - nxt1;
	4679	else
	4680	OP(nxt) = NOTHING; /* Cannot beautify */
	4681	}
	4682	nxt1 = nnxt;
	4683	}
	4684	#endif
	4685	/* Optimize again: */
	4686	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	4687	NULL, stopparen, recursed_depth, NULL, 0,depth+1);
	4688	}
	4689	else
	4690	oscan->flags = 0;
	4691	}
	4692	else if ((OP(oscan) == CURLYX)
	4693	&& (flags & SCF_WHILEM_VISITED_POS)
	4694	/* See the comment on a similar expression above.
	4695	However, this time it's not a subexpression
	4696	we care about, but the expression itself. */
	4697	&& (maxcount == REG_INFTY)
	4698	&& data && ++data->whilem_c < 16) {
	4699	/* This stays as CURLYX, we can put the count/of pair. */
	4700	/* Find WHILEM (as in regexec.c) */
	4701	regnode *nxt = oscan + NEXT_OFF(oscan);
	4702
	4703	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	4704	nxt += ARG(nxt);
	4705	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	4706	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	4707	}
	4708	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	4709	pars++;
	4710	if (flags & SCF_DO_SUBSTR) {
	4711	SV *last_str = NULL;
	4712	STRLEN last_chrs = 0;
	4713	int counted = mincount != 0;
	4714
	4715	if (data->last_end > 0 && mincount != 0) { /* Ends with a
	4716	string. */
	4717	SSize_t b = pos_before >= data->last_start_min
	4718	? pos_before : data->last_start_min;
	4719	STRLEN l;
	4720	const char * const s = SvPV_const(data->last_found, l);
	4721	SSize_t old = b - data->last_start_min;
	4722
	4723	if (UTF)
	4724	old = utf8_hop((U8)s, old) - (U8)s;
	4725	l -= old;
	4726	/* Get the added string: */
	4727	last_str = newSVpvn_utf8(s + old, l, UTF);
	4728	last_chrs = UTF ? utf8_length((U8*)(s + old),
	4729	(U8*)(s + old + l)) : l;
	4730	if (deltanext == 0 && pos_before == b) {
	4731	/* What was added is a constant string */
	4732	if (mincount > 1) {
	4733
	4734	SvGROW(last_str, (mincount * l) + 1);
	4735	repeatcpy(SvPVX(last_str) + l,
	4736	SvPVX_const(last_str), l,
	4737	mincount - 1);
	4738	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	4739	/* Add additional parts. */
	4740	SvCUR_set(data->last_found,
	4741	SvCUR(data->last_found) - l);
	4742	sv_catsv(data->last_found, last_str);
	4743	{
	4744	SV * sv = data->last_found;
	4745	MAGIC *mg =
	4746	SvUTF8(sv) && SvMAGICAL(sv) ?
	4747	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4748	if (mg && mg->mg_len >= 0)
	4749	mg->mg_len += last_chrs * (mincount-1);
	4750	}
	4751	last_chrs *= mincount;
	4752	data->last_end += l * (mincount - 1);
	4753	}
	4754	} else {
	4755	/* start offset must point into the last copy */
	4756	data->last_start_min += minnext * (mincount - 1);
	4757	data->last_start_max += is_inf ? SSize_t_MAX
	4758	: (maxcount - 1) * (minnext + data->pos_delta);
	4759	}
	4760	}
	4761	/* It is counted once already... */
	4762	data->pos_min += minnext * (mincount - counted);
	4763	#if 0
	4764	PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
	4765	" SSize_t_MAX=%"UVuf" minnext=%"UVuf
	4766	" maxcount=%"UVuf" mincount=%"UVuf"\n",
	4767	(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
	4768	(UV)mincount);
	4769	if (deltanext != SSize_t_MAX)
	4770	PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
	4771	(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
	4772	- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
	4773	#endif
	4774	if (deltanext == SSize_t_MAX
	4775	\|\| -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
	4776	data->pos_delta = SSize_t_MAX;
	4777	else
	4778	data->pos_delta += - counted * deltanext +
	4779	(minnext + deltanext) * maxcount - minnext * mincount;
	4780	if (mincount != maxcount) {
	4781	/* Cannot extend fixed substrings found inside
	4782	the group. */
	4783	scan_commit(pRExC_state, data, minlenp, is_inf);
	4784	if (mincount && last_str) {
	4785	SV * const sv = data->last_found;
	4786	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	4787	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4788
	4789	if (mg)
	4790	mg->mg_len = -1;
	4791	sv_setsv(sv, last_str);
	4792	data->last_end = data->pos_min;
	4793	data->last_start_min = data->pos_min - last_chrs;
	4794	data->last_start_max = is_inf
	4795	? SSize_t_MAX
	4796	: data->pos_min + data->pos_delta - last_chrs;
	4797	}
	4798	data->longest = &(data->longest_float);
	4799	}
	4800	SvREFCNT_dec(last_str);
	4801	}
	4802	if (data && (fl & SF_HAS_EVAL))
	4803	data->flags \|= SF_HAS_EVAL;
	4804	optimize_curly_tail:
	4805	if (OP(oscan) != CURLYX) {
	4806	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	4807	&& NEXT_OFF(next))
	4808	NEXT_OFF(oscan) += NEXT_OFF(next);
	4809	}
	4810	continue;
	4811
	4812	default:
	4813	#ifdef DEBUGGING
	4814	Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
	4815	OP(scan));
	4816	#endif
	4817	case REF:
	4818	case CLUMP:
	4819	if (flags & SCF_DO_SUBSTR) {
	4820	/* Cannot expect anything... */
	4821	scan_commit(pRExC_state, data, minlenp, is_inf);
	4822	data->longest = &(data->longest_float);
	4823	}
	4824	is_inf = is_inf_internal = 1;
	4825	if (flags & SCF_DO_STCLASS_OR) {
	4826	if (OP(scan) == CLUMP) {
	4827	/* Actually is any start char, but very few code points
	4828	* aren't start characters */
	4829	ssc_match_all_cp(data->start_class);
	4830	}
	4831	else {
	4832	ssc_anything(data->start_class);
	4833	}
	4834	}
	4835	flags &= ~SCF_DO_STCLASS;
	4836	break;
	4837	}
	4838	}
	4839	else if (OP(scan) == LNBREAK) {
	4840	if (flags & SCF_DO_STCLASS) {
	4841	if (flags & SCF_DO_STCLASS_AND) {
	4842	ssc_intersection(data->start_class,
	4843	PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
	4844	ssc_clear_locale(data->start_class);
	4845	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4846	}
	4847	else if (flags & SCF_DO_STCLASS_OR) {
	4848	ssc_union(data->start_class,
	4849	PL_XPosix_ptrs[_CC_VERTSPACE],
	4850	FALSE);
	4851	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	4852
	4853	/* See commit msg for
	4854	* 749e076fceedeb708a624933726e7989f2302f6a */
	4855	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4856	}
	4857	flags &= ~SCF_DO_STCLASS;
	4858	}
	4859	min++;
	4860	delta++; /* Because of the 2 char string cr-lf */
	4861	if (flags & SCF_DO_SUBSTR) {
	4862	/* Cannot expect anything... */
	4863	scan_commit(pRExC_state, data, minlenp, is_inf);
	4864	data->pos_min += 1;
	4865	data->pos_delta += 1;
	4866	data->longest = &(data->longest_float);
	4867	}
	4868	}
	4869	else if (REGNODE_SIMPLE(OP(scan))) {
	4870
	4871	if (flags & SCF_DO_SUBSTR) {
	4872	scan_commit(pRExC_state, data, minlenp, is_inf);
	4873	data->pos_min++;
	4874	}
	4875	min++;
	4876	if (flags & SCF_DO_STCLASS) {
	4877	bool invert = 0;
	4878	SV* my_invlist = sv_2mortal(_new_invlist(0));
	4879	U8 namedclass;
	4880
	4881	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	4882	ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
	4883
	4884	/* Some of the logic below assumes that switching
	4885	locale on will only add false positives. */
	4886	switch (OP(scan)) {
	4887
	4888	default:
	4889	#ifdef DEBUGGING
	4890	Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
	4891	OP(scan));
	4892	#endif
	4893	case CANY:
	4894	case SANY:
	4895	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4896	ssc_match_all_cp(data->start_class);
	4897	break;
	4898
	4899	case REG_ANY:
	4900	{
	4901	SV* REG_ANY_invlist = _new_invlist(2);
	4902	REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
	4903	'\n');
	4904	if (flags & SCF_DO_STCLASS_OR) {
	4905	ssc_union(data->start_class,
	4906	REG_ANY_invlist,
	4907	TRUE /* TRUE => invert, hence all but \n
	4908	*/
	4909	);
	4910	}
	4911	else if (flags & SCF_DO_STCLASS_AND) {
	4912	ssc_intersection(data->start_class,
	4913	REG_ANY_invlist,
	4914	TRUE /* TRUE => invert */
	4915	);
	4916	ssc_clear_locale(data->start_class);
	4917	}
	4918	SvREFCNT_dec_NN(REG_ANY_invlist);
	4919	}
	4920	break;
	4921
	4922	case ANYOF:
	4923	if (flags & SCF_DO_STCLASS_AND)
	4924	ssc_and(pRExC_state, data->start_class,
	4925	(regnode_charclass *) scan);
	4926	else
	4927	ssc_or(pRExC_state, data->start_class,
	4928	(regnode_charclass *) scan);
	4929	break;
	4930
	4931	case NPOSIXL:
	4932	invert = 1;
	4933	/* FALLTHROUGH */
	4934
	4935	case POSIXL:
	4936	namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
	4937	if (flags & SCF_DO_STCLASS_AND) {
	4938	bool was_there = cBOOL(
	4939	ANYOF_POSIXL_TEST(data->start_class,
	4940	namedclass));
	4941	ANYOF_POSIXL_ZERO(data->start_class);
	4942	if (was_there) { /* Do an AND */
	4943	ANYOF_POSIXL_SET(data->start_class, namedclass);
	4944	}
	4945	/* No individual code points can now match */
	4946	data->start_class->invlist
	4947	= sv_2mortal(_new_invlist(0));
	4948	}
	4949	else {
	4950	int complement = namedclass + ((invert) ? -1 : 1);
	4951
	4952	assert(flags & SCF_DO_STCLASS_OR);
	4953
	4954	/* If the complement of this class was already there,
	4955	* the result is that they match all code points,
	4956	* (\d + \D == everything). Remove the classes from
	4957	* future consideration. Locale is not relevant in
	4958	* this case */
	4959	if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
	4960	ssc_match_all_cp(data->start_class);
	4961	ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
	4962	ANYOF_POSIXL_CLEAR(data->start_class, complement);
	4963	}
	4964	else { /* The usual case; just add this class to the
	4965	existing set */
	4966	ANYOF_POSIXL_SET(data->start_class, namedclass);
	4967	}
	4968	}
	4969	break;
	4970
	4971	case NPOSIXA: /* For these, we always know the exact set of
	4972	what's matched */
	4973	invert = 1;
	4974	/* FALLTHROUGH */
	4975	case POSIXA:
	4976	if (FLAGS(scan) == _CC_ASCII) {
	4977	my_invlist = PL_XPosix_ptrs[_CC_ASCII];
	4978	}
	4979	else {
	4980	_invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
	4981	PL_XPosix_ptrs[_CC_ASCII],
	4982	&my_invlist);
	4983	}
	4984	goto join_posix;
	4985
	4986	case NPOSIXD:
	4987	case NPOSIXU:
	4988	invert = 1;
	4989	/* FALLTHROUGH */
	4990	case POSIXD:
	4991	case POSIXU:
	4992	my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
	4993
	4994	/* NPOSIXD matches all upper Latin1 code points unless the
	4995	* target string being matched is UTF-8, which is
	4996	* unknowable until match time. Since we are going to
	4997	* invert, we want to get rid of all of them so that the
	4998	* inversion will match all */
	4999	if (OP(scan) == NPOSIXD) {
	5000	_invlist_subtract(my_invlist, PL_UpperLatin1,
	5001	&my_invlist);
	5002	}
	5003
	5004	join_posix:
	5005
	5006	if (flags & SCF_DO_STCLASS_AND) {
	5007	ssc_intersection(data->start_class, my_invlist, invert);
	5008	ssc_clear_locale(data->start_class);
	5009	}
	5010	else {
	5011	assert(flags & SCF_DO_STCLASS_OR);
	5012	ssc_union(data->start_class, my_invlist, invert);
	5013	}
	5014	}
	5015	if (flags & SCF_DO_STCLASS_OR)
	5016	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	5017	flags &= ~SCF_DO_STCLASS;
	5018	}
	5019	}
	5020	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	5021	data->flags \|= (OP(scan) == MEOL
	5022	? SF_BEFORE_MEOL
	5023	: SF_BEFORE_SEOL);
	5024	scan_commit(pRExC_state, data, minlenp, is_inf);
	5025
	5026	}
	5027	else if ( PL_regkind[OP(scan)] == BRANCHJ
	5028	/* Lookbehind, or need to calculate parens/evals/stclass: */
	5029	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	5030	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM))
	5031	{
	5032	if ( OP(scan) == UNLESSM &&
	5033	scan->flags == 0 &&
	5034	OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
	5035	OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
	5036	) {
	5037	regnode *opt;
	5038	regnode *upto= regnext(scan);
	5039	DEBUG_PARSE_r({
	5040	SV * const mysv_val=sv_newmortal();
	5041	DEBUG_STUDYDATA("OPFAIL",data,depth);
	5042
	5043	/DEBUG_PARSE_MSG("opfail");/
	5044	regprop(RExC_rx, mysv_val, upto, NULL);
	5045	PerlIO_printf(Perl_debug_log,
	5046	"~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
	5047	SvPV_nolen_const(mysv_val),
	5048	(IV)REG_NODE_NUM(upto),
	5049	(IV)(upto - scan)
	5050	);
	5051	});
	5052	OP(scan) = OPFAIL;
	5053	NEXT_OFF(scan) = upto - scan;
	5054	for (opt= scan + 1; opt < upto ; opt++)
	5055	OP(opt) = OPTIMIZED;
	5056	scan= upto;
	5057	continue;
	5058	}
	5059	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	5060	\|\| OP(scan) == UNLESSM )
	5061	{
	5062	/* Negative Lookahead/lookbehind
	5063	In this case we can't do fixed string optimisation.
	5064	*/
	5065
	5066	SSize_t deltanext, minnext, fake = 0;
	5067	regnode *nscan;
	5068	regnode_ssc intrnl;
	5069	int f = 0;
	5070
	5071	data_fake.flags = 0;
	5072	if (data) {
	5073	data_fake.whilem_c = data->whilem_c;
	5074	data_fake.last_closep = data->last_closep;
	5075	}
	5076	else
	5077	data_fake.last_closep = &fake;
	5078	data_fake.pos_delta = delta;
	5079	if ( flags & SCF_DO_STCLASS && !scan->flags
	5080	&& OP(scan) == IFMATCH ) { /* Lookahead */
	5081	ssc_init(pRExC_state, &intrnl);
	5082	data_fake.start_class = &intrnl;
	5083	f \|= SCF_DO_STCLASS_AND;
	5084	}
	5085	if (flags & SCF_WHILEM_VISITED_POS)
	5086	f \|= SCF_WHILEM_VISITED_POS;
	5087	next = regnext(scan);
	5088	nscan = NEXTOPER(NEXTOPER(scan));
	5089	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	5090	last, &data_fake, stopparen,
	5091	recursed_depth, NULL, f, depth+1);
	5092	if (scan->flags) {
	5093	if (deltanext) {
	5094	FAIL("Variable length lookbehind not implemented");
	5095	}
	5096	else if (minnext > (I32)U8_MAX) {
	5097	FAIL2("Lookbehind longer than %"UVuf" not implemented",
	5098	(UV)U8_MAX);
	5099	}
	5100	scan->flags = (U8)minnext;
	5101	}
	5102	if (data) {
	5103	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	5104	pars++;
	5105	if (data_fake.flags & SF_HAS_EVAL)
	5106	data->flags \|= SF_HAS_EVAL;
	5107	data->whilem_c = data_fake.whilem_c;
	5108	}
	5109	if (f & SCF_DO_STCLASS_AND) {
	5110	if (flags & SCF_DO_STCLASS_OR) {
	5111	/* OR before, AND after: ideally we would recurse with
	5112	* data_fake to get the AND applied by study of the
	5113	* remainder of the pattern, and then derecurse;
	5114	* * HACK * for now just treat as "no information".
	5115	* See [perl #56690].
	5116	*/
	5117	ssc_init(pRExC_state, data->start_class);
	5118	} else {
	5119	/* AND before and after: combine and continue. These
	5120	* assertions are zero-length, so can match an EMPTY
	5121	* string */
	5122	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
	5123	ANYOF_FLAGS(data->start_class) \|= ANYOF_EMPTY_STRING;
	5124	}
	5125	}
	5126	}
	5127	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	5128	else {
	5129	/* Positive Lookahead/lookbehind
	5130	In this case we can do fixed string optimisation,
	5131	but we must be careful about it. Note in the case of
	5132	lookbehind the positions will be offset by the minimum
	5133	length of the pattern, something we won't know about
	5134	until after the recurse.
	5135	*/
	5136	SSize_t deltanext, fake = 0;
	5137	regnode *nscan;
	5138	regnode_ssc intrnl;
	5139	int f = 0;
	5140	/* We use SAVEFREEPV so that when the full compile
	5141	is finished perl will clean up the allocated
	5142	minlens when it's all done. This way we don't
	5143	have to worry about freeing them when we know
	5144	they wont be used, which would be a pain.
	5145	*/
	5146	SSize_t *minnextp;
	5147	Newx( minnextp, 1, SSize_t );
	5148	SAVEFREEPV(minnextp);
	5149
	5150	if (data) {
	5151	StructCopy(data, &data_fake, scan_data_t);
	5152	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	5153	f \|= SCF_DO_SUBSTR;
	5154	if (scan->flags)
	5155	scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
	5156	data_fake.last_found=newSVsv(data->last_found);
	5157	}
	5158	}
	5159	else
	5160	data_fake.last_closep = &fake;
	5161	data_fake.flags = 0;
	5162	data_fake.pos_delta = delta;
	5163	if (is_inf)
	5164	data_fake.flags \|= SF_IS_INF;
	5165	if ( flags & SCF_DO_STCLASS && !scan->flags
	5166	&& OP(scan) == IFMATCH ) { /* Lookahead */
	5167	ssc_init(pRExC_state, &intrnl);
	5168	data_fake.start_class = &intrnl;
	5169	f \|= SCF_DO_STCLASS_AND;
	5170	}
	5171	if (flags & SCF_WHILEM_VISITED_POS)
	5172	f \|= SCF_WHILEM_VISITED_POS;
	5173	next = regnext(scan);
	5174	nscan = NEXTOPER(NEXTOPER(scan));
	5175
	5176	*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
	5177	&deltanext, last, &data_fake,
	5178	stopparen, recursed_depth, NULL,
	5179	f,depth+1);
	5180	if (scan->flags) {
	5181	if (deltanext) {
	5182	FAIL("Variable length lookbehind not implemented");
	5183	}
	5184	else if (*minnextp > (I32)U8_MAX) {
	5185	FAIL2("Lookbehind longer than %"UVuf" not implemented",
	5186	(UV)U8_MAX);
	5187	}
	5188	scan->flags = (U8)*minnextp;
	5189	}
	5190
	5191	*minnextp += min;
	5192
	5193	if (f & SCF_DO_STCLASS_AND) {
	5194	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
	5195	ANYOF_FLAGS(data->start_class) \|= ANYOF_EMPTY_STRING;
	5196	}
	5197	if (data) {
	5198	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	5199	pars++;
	5200	if (data_fake.flags & SF_HAS_EVAL)
	5201	data->flags \|= SF_HAS_EVAL;
	5202	data->whilem_c = data_fake.whilem_c;
	5203	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	5204	if (RExC_rx->minlen<*minnextp)
	5205	RExC_rx->minlen=*minnextp;
	5206	scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
	5207	SvREFCNT_dec_NN(data_fake.last_found);
	5208
	5209	if ( data_fake.minlen_fixed != minlenp )
	5210	{
	5211	data->offset_fixed= data_fake.offset_fixed;
	5212	data->minlen_fixed= data_fake.minlen_fixed;
	5213	data->lookbehind_fixed+= scan->flags;
	5214	}
	5215	if ( data_fake.minlen_float != minlenp )
	5216	{
	5217	data->minlen_float= data_fake.minlen_float;
	5218	data->offset_float_min=data_fake.offset_float_min;
	5219	data->offset_float_max=data_fake.offset_float_max;
	5220	data->lookbehind_float+= scan->flags;
	5221	}
	5222	}
	5223	}
	5224	}
	5225	#endif
	5226	}
	5227	else if (OP(scan) == OPEN) {
	5228	if (stopparen != (I32)ARG(scan))
	5229	pars++;
	5230	}
	5231	else if (OP(scan) == CLOSE) {
	5232	if (stopparen == (I32)ARG(scan)) {
	5233	break;
	5234	}
	5235	if ((I32)ARG(scan) == is_par) {
	5236	next = regnext(scan);
	5237
	5238	if ( next && (OP(next) != WHILEM) && next < last)
	5239	is_par = 0; /* Disable optimization */
	5240	}
	5241	if (data)
	5242	*(data->last_closep) = ARG(scan);
	5243	}
	5244	else if (OP(scan) == EVAL) {
	5245	if (data)
	5246	data->flags \|= SF_HAS_EVAL;
	5247	}
	5248	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	5249	if (flags & SCF_DO_SUBSTR) {
	5250	scan_commit(pRExC_state, data, minlenp, is_inf);
	5251	flags &= ~SCF_DO_SUBSTR;
	5252	}
	5253	if (data && OP(scan)==ACCEPT) {
	5254	data->flags \|= SCF_SEEN_ACCEPT;
	5255	if (stopmin > min)
	5256	stopmin = min;
	5257	}
	5258	}
	5259	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	5260	{
	5261	if (flags & SCF_DO_SUBSTR) {
	5262	scan_commit(pRExC_state, data, minlenp, is_inf);
	5263	data->longest = &(data->longest_float);
	5264	}
	5265	is_inf = is_inf_internal = 1;
	5266	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	5267	ssc_anything(data->start_class);
	5268	flags &= ~SCF_DO_STCLASS;
	5269	}
	5270	else if (OP(scan) == GPOS) {
	5271	if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
	5272	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	5273	{
	5274	if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
	5275	RExC_rx->intflags \|= PREGf_ANCH_GPOS;
	5276	if (RExC_rx->gofs < (STRLEN)min)
	5277	RExC_rx->gofs = min;
	5278	} else {
	5279	RExC_rx->intflags \|= PREGf_GPOS_FLOAT;
	5280	RExC_rx->gofs = 0;
	5281	}
	5282	}
	5283	#ifdef TRIE_STUDY_OPT
	5284	#ifdef FULL_TRIE_STUDY
	5285	else if (PL_regkind[OP(scan)] == TRIE) {
	5286	/* NOTE - There is similar code to this block above for handling
	5287	BRANCH nodes on the initial study. If you change stuff here
	5288	check there too. */
	5289	regnode *trie_node= scan;
	5290	regnode *tail= regnext(scan);
	5291	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	5292	SSize_t max1 = 0, min1 = SSize_t_MAX;
	5293	regnode_ssc accum;
	5294
	5295	if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
	5296	/* Cannot merge strings after this. */
	5297	scan_commit(pRExC_state, data, minlenp, is_inf);
	5298	}
	5299	if (flags & SCF_DO_STCLASS)
	5300	ssc_init_zero(pRExC_state, &accum);
	5301
	5302	if (!trie->jump) {
	5303	min1= trie->minlen;
	5304	max1= trie->maxlen;
	5305	} else {
	5306	const regnode *nextbranch= NULL;
	5307	U32 word;
	5308
	5309	for ( word=1 ; word <= trie->wordcount ; word++)
	5310	{
	5311	SSize_t deltanext=0, minnext=0, f = 0, fake;
	5312	regnode_ssc this_class;
	5313
	5314	data_fake.flags = 0;
	5315	if (data) {
	5316	data_fake.whilem_c = data->whilem_c;
	5317	data_fake.last_closep = data->last_closep;
	5318	}
	5319	else
	5320	data_fake.last_closep = &fake;
	5321	data_fake.pos_delta = delta;
	5322	if (flags & SCF_DO_STCLASS) {
	5323	ssc_init(pRExC_state, &this_class);
	5324	data_fake.start_class = &this_class;
	5325	f = SCF_DO_STCLASS_AND;
	5326	}
	5327	if (flags & SCF_WHILEM_VISITED_POS)
	5328	f \|= SCF_WHILEM_VISITED_POS;
	5329
	5330	if (trie->jump[word]) {
	5331	if (!nextbranch)
	5332	nextbranch = trie_node + trie->jump[0];
	5333	scan= trie_node + trie->jump[word];
	5334	/* We go from the jump point to the branch that follows
	5335	it. Note this means we need the vestigal unused
	5336	branches even though they arent otherwise used. */
	5337	minnext = study_chunk(pRExC_state, &scan, minlenp,
	5338	&deltanext, (regnode *)nextbranch, &data_fake,
	5339	stopparen, recursed_depth, NULL, f,depth+1);
	5340	}
	5341	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	5342	nextbranch= regnext((regnode*)nextbranch);
	5343
	5344	if (min1 > (SSize_t)(minnext + trie->minlen))
	5345	min1 = minnext + trie->minlen;
	5346	if (deltanext == SSize_t_MAX) {
	5347	is_inf = is_inf_internal = 1;
	5348	max1 = SSize_t_MAX;
	5349	} else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
	5350	max1 = minnext + deltanext + trie->maxlen;
	5351
	5352	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	5353	pars++;
	5354	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	5355	if ( stopmin > min + min1)
	5356	stopmin = min + min1;
	5357	flags &= ~SCF_DO_SUBSTR;
	5358	if (data)
	5359	data->flags \|= SCF_SEEN_ACCEPT;
	5360	}
	5361	if (data) {
	5362	if (data_fake.flags & SF_HAS_EVAL)
	5363	data->flags \|= SF_HAS_EVAL;
	5364	data->whilem_c = data_fake.whilem_c;
	5365	}
	5366	if (flags & SCF_DO_STCLASS)
	5367	ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
	5368	}
	5369	}
	5370	if (flags & SCF_DO_SUBSTR) {
	5371	data->pos_min += min1;
	5372	data->pos_delta += max1 - min1;
	5373	if (max1 != min1 \|\| is_inf)
	5374	data->longest = &(data->longest_float);
	5375	}
	5376	min += min1;
	5377	delta += max1 - min1;
	5378	if (flags & SCF_DO_STCLASS_OR) {
	5379	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	5380	if (min1) {
	5381	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	5382	flags &= ~SCF_DO_STCLASS;
	5383	}
	5384	}
	5385	else if (flags & SCF_DO_STCLASS_AND) {
	5386	if (min1) {
	5387	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	5388	flags &= ~SCF_DO_STCLASS;
	5389	}
	5390	else {
	5391	/* Switch to OR mode: cache the old value of
	5392	* data->start_class */
	5393	INIT_AND_WITHP;
	5394	StructCopy(data->start_class, and_withp, regnode_ssc);
	5395	flags &= ~SCF_DO_STCLASS_AND;
	5396	StructCopy(&accum, data->start_class, regnode_ssc);
	5397	flags \|= SCF_DO_STCLASS_OR;
	5398	}
	5399	}
	5400	scan= tail;
	5401	continue;
	5402	}
	5403	#else
	5404	else if (PL_regkind[OP(scan)] == TRIE) {
	5405	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	5406	U8*bang=NULL;
	5407
	5408	min += trie->minlen;
	5409	delta += (trie->maxlen - trie->minlen);
	5410	flags &= ~SCF_DO_STCLASS; /* xxx */
	5411	if (flags & SCF_DO_SUBSTR) {
	5412	/* Cannot expect anything... */
	5413	scan_commit(pRExC_state, data, minlenp, is_inf);
	5414	data->pos_min += trie->minlen;
	5415	data->pos_delta += (trie->maxlen - trie->minlen);
	5416	if (trie->maxlen != trie->minlen)
	5417	data->longest = &(data->longest_float);
	5418	}
	5419	if (trie->jump) /* no more substrings -- for now /grr*/
	5420	flags &= ~SCF_DO_SUBSTR;
	5421	}
	5422	#endif /* old or new */
	5423	#endif /* TRIE_STUDY_OPT */
	5424
	5425	/* Else: zero-length, ignore. */
	5426	scan = regnext(scan);
	5427	}
	5428	/* If we are exiting a recursion we can unset its recursed bit
	5429	* and allow ourselves to enter it again - no danger of an
	5430	* infinite loop there.
	5431	if (stopparen > -1 && recursed) {
	5432	DEBUG_STUDYDATA("unset:", data,depth);
	5433	PAREN_UNSET( recursed, stopparen);
	5434	}
	5435	*/
	5436	if (frame) {
	5437	DEBUG_STUDYDATA("frame-end:",data,depth);
	5438	DEBUG_PEEP("fend", scan, depth);
	5439	/* restore previous context */
	5440	last = frame->last;
	5441	scan = frame->next;
	5442	stopparen = frame->stop;
	5443	recursed_depth = frame->prev_recursed_depth;
	5444	depth = depth - 1;
	5445
	5446	frame = frame->prev;
	5447	goto fake_study_recurse;
	5448	}
	5449
	5450	finish:
	5451	assert(!frame);
	5452	DEBUG_STUDYDATA("pre-fin:",data,depth);
	5453
	5454	*scanp = scan;
	5455	*deltap = is_inf_internal ? SSize_t_MAX : delta;
	5456
	5457	if (flags & SCF_DO_SUBSTR && is_inf)
	5458	data->pos_delta = SSize_t_MAX - data->pos_min;
	5459	if (is_par > (I32)U8_MAX)
	5460	is_par = 0;
	5461	if (is_par && pars==1 && data) {
	5462	data->flags \|= SF_IN_PAR;
	5463	data->flags &= ~SF_HAS_PAR;
	5464	}
	5465	else if (pars && data) {
	5466	data->flags \|= SF_HAS_PAR;
	5467	data->flags &= ~SF_IN_PAR;
	5468	}
	5469	if (flags & SCF_DO_STCLASS_OR)
	5470	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	5471	if (flags & SCF_TRIE_RESTUDY)
	5472	data->flags \|= SCF_TRIE_RESTUDY;
	5473
	5474	DEBUG_STUDYDATA("post-fin:",data,depth);
	5475
	5476	{
	5477	SSize_t final_minlen= min < stopmin ? min : stopmin;
	5478
	5479	if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
	5480	RExC_maxlen = final_minlen + delta;
	5481	}
	5482	return final_minlen;
	5483	}
	5484	/* not-reached */
	5485	}
	5486
	5487	STATIC U32
	5488	S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
	5489	{
	5490	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	5491
	5492	PERL_ARGS_ASSERT_ADD_DATA;
	5493
	5494	Renewc(RExC_rxi->data,
	5495	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	5496	char, struct reg_data);
	5497	if(count)
	5498	Renew(RExC_rxi->data->what, count + n, U8);
	5499	else
	5500	Newx(RExC_rxi->data->what, n, U8);
	5501	RExC_rxi->data->count = count + n;
	5502	Copy(s, RExC_rxi->data->what + count, n, U8);
	5503	return count;
	5504	}
	5505
	5506	/*XXX: todo make this not included in a non debugging perl, but appears to be
	5507	* used anyway there, in 'use re' */
	5508	#ifndef PERL_IN_XSUB_RE
	5509	void
	5510	Perl_reginitcolors(pTHX)
	5511	{
	5512	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	5513	if (s) {
	5514	char *t = savepv(s);
	5515	int i = 0;
	5516	PL_colors[0] = t;
	5517	while (++i < 6) {
	5518	t = strchr(t, '\t');
	5519	if (t) {
	5520	*t = '\0';
	5521	PL_colors[i] = ++t;
	5522	}
	5523	else
	5524	PL_colors[i] = t = (char *)"";
	5525	}
	5526	} else {
	5527	int i = 0;
	5528	while (i < 6)
	5529	PL_colors[i++] = (char *)"";
	5530	}
	5531	PL_colorset = 1;
	5532	}
	5533	#endif
	5534
	5535
	5536	#ifdef TRIE_STUDY_OPT
	5537	#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
	5538	STMT_START { \
	5539	if ( \
	5540	(data.flags & SCF_TRIE_RESTUDY) \
	5541	&& ! restudied++ \
	5542	) { \
	5543	dOsomething; \
	5544	goto reStudy; \
	5545	} \
	5546	} STMT_END
	5547	#else
	5548	#define CHECK_RESTUDY_GOTO_butfirst
	5549	#endif
	5550
	5551	/*
	5552	* pregcomp - compile a regular expression into internal code
	5553	*
	5554	* Decides which engine's compiler to call based on the hint currently in
	5555	* scope
	5556	*/
	5557
	5558	#ifndef PERL_IN_XSUB_RE
	5559
	5560	/* return the currently in-scope regex engine (or the default if none) */
	5561
	5562	regexp_engine const *
	5563	Perl_current_re_engine(pTHX)
	5564	{
	5565	if (IN_PERL_COMPILETIME) {
	5566	HV * const table = GvHV(PL_hintgv);
	5567	SV **ptr;
	5568
	5569	if (!table \|\| !(PL_hints & HINT_LOCALIZE_HH))
	5570	return &PL_core_reg_engine;
	5571	ptr = hv_fetchs(table, "regcomp", FALSE);
	5572	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	5573	return &PL_core_reg_engine;
	5574	return INT2PTR(regexp_engine,SvIV(ptr));
	5575	}
	5576	else {
	5577	SV *ptr;
	5578	if (!PL_curcop->cop_hints_hash)
	5579	return &PL_core_reg_engine;
	5580	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
	5581	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	5582	return &PL_core_reg_engine;
	5583	return INT2PTR(regexp_engine*,SvIV(ptr));
	5584	}
	5585	}
	5586
	5587
	5588	REGEXP *
	5589	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	5590	{
	5591	regexp_engine const *eng = current_re_engine();
	5592	GET_RE_DEBUG_FLAGS_DECL;
	5593
	5594	PERL_ARGS_ASSERT_PREGCOMP;
	5595
	5596	/* Dispatch a request to compile a regexp to correct regexp engine. */
	5597	DEBUG_COMPILE_r({
	5598	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	5599	PTR2UV(eng));
	5600	});
	5601	return CALLREGCOMP_ENG(eng, pattern, flags);
	5602	}
	5603	#endif
	5604
	5605	/* public(ish) entry point for the perl core's own regex compiling code.
	5606	* It's actually a wrapper for Perl_re_op_compile that only takes an SV
	5607	* pattern rather than a list of OPs, and uses the internal engine rather
	5608	* than the current one */
	5609
	5610	REGEXP *
	5611	Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
	5612	{
	5613	SV pat = pattern; / defeat constness! */
	5614	PERL_ARGS_ASSERT_RE_COMPILE;
	5615	return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
	5616	#ifdef PERL_IN_XSUB_RE
	5617	&my_reg_engine,
	5618	#else
	5619	&PL_core_reg_engine,
	5620	#endif
	5621	NULL, NULL, rx_flags, 0);
	5622	}
	5623
	5624
	5625	/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
	5626	* blocks, recalculate the indices. Update pat_p and plen_p in-place to
	5627	* point to the realloced string and length.
	5628	*
	5629	* This is essentially a copy of Perl_bytes_to_utf8() with the code index
	5630	* stuff added */
	5631
	5632	static void
	5633	S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
	5634	char *pat_p, STRLEN plen_p, int num_code_blocks)
	5635	{
	5636	U8 const src = (U8)*pat_p;
	5637	U8 *dst;
	5638	int n=0;
	5639	STRLEN s = 0, d = 0;
	5640	bool do_end = 0;
	5641	GET_RE_DEBUG_FLAGS_DECL;
	5642
	5643	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	5644	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	5645
	5646	Newx(dst, plen_p 2 + 1, U8);
	5647
	5648	while (s < *plen_p) {
	5649	if (NATIVE_BYTE_IS_INVARIANT(src[s]))
	5650	dst[d] = src[s];
	5651	else {
	5652	dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
	5653	dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
	5654	}
	5655	if (n < num_code_blocks) {
	5656	if (!do_end && pRExC_state->code_blocks[n].start == s) {
	5657	pRExC_state->code_blocks[n].start = d;
	5658	assert(dst[d] == '(');
	5659	do_end = 1;
	5660	}
	5661	else if (do_end && pRExC_state->code_blocks[n].end == s) {
	5662	pRExC_state->code_blocks[n].end = d;
	5663	assert(dst[d] == ')');
	5664	do_end = 0;
	5665	n++;
	5666	}
	5667	}
	5668	s++;
	5669	d++;
	5670	}
	5671	dst[d] = '\0';
	5672	*plen_p = d;
	5673	pat_p = (char) dst;
	5674	SAVEFREEPV(*pat_p);
	5675	RExC_orig_utf8 = RExC_utf8 = 1;
	5676	}
	5677
	5678
	5679
	5680	/* S_concat_pat(): concatenate a list of args to the pattern string pat,
	5681	* while recording any code block indices, and handling overloading,
	5682	* nested qr// objects etc. If pat is null, it will allocate a new
	5683	* string, or just return the first arg, if there's only one.
	5684	*
	5685	* Returns the malloced/updated pat.
	5686	* patternp and pat_count is the array of SVs to be concatted;
	5687	* oplist is the optional list of ops that generated the SVs;
	5688	* recompile_p is a pointer to a boolean that will be set if
	5689	* the regex will need to be recompiled.
	5690	* delim, if non-null is an SV that will be inserted between each element
	5691	*/
	5692
	5693	static SV*
	5694	S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
	5695	SV pat, SV * const patternp, int pat_count,
	5696	OP oplist, bool recompile_p, SV *delim)
	5697	{
	5698	SV **svp;
	5699	int n = 0;
	5700	bool use_delim = FALSE;
	5701	bool alloced = FALSE;
	5702
	5703	/* if we know we have at least two args, create an empty string,
	5704	* then concatenate args to that. For no args, return an empty string */
	5705	if (!pat && pat_count != 1) {
	5706	pat = newSVpvs("");
	5707	SAVEFREESV(pat);
	5708	alloced = TRUE;
	5709	}
	5710
	5711	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5712	SV *sv;
	5713	SV *rx = NULL;
	5714	STRLEN orig_patlen = 0;
	5715	bool code = 0;
	5716	SV msv = use_delim ? delim : svp;
	5717	if (!msv) msv = &PL_sv_undef;
	5718
	5719	/* if we've got a delimiter, we go round the loop twice for each
	5720	* svp slot (except the last), using the delimiter the second
	5721	* time round */
	5722	if (use_delim) {
	5723	svp--;
	5724	use_delim = FALSE;
	5725	}
	5726	else if (delim)
	5727	use_delim = TRUE;
	5728
	5729	if (SvTYPE(msv) == SVt_PVAV) {
	5730	/* we've encountered an interpolated array within
	5731	* the pattern, e.g. /...@a..../. Expand the list of elements,
	5732	* then recursively append elements.
	5733	* The code in this block is based on S_pushav() */
	5734
	5735	AV const av = (AV)msv;
	5736	const SSize_t maxarg = AvFILL(av) + 1;
	5737	SV **array;
	5738
	5739	if (oplist) {
	5740	assert(oplist->op_type == OP_PADAV
	5741	\|\| oplist->op_type == OP_RV2AV);
	5742	oplist = OP_SIBLING(oplist);
	5743	}
	5744
	5745	if (SvRMAGICAL(av)) {
	5746	SSize_t i;
	5747
	5748	Newx(array, maxarg, SV*);
	5749	SAVEFREEPV(array);
	5750	for (i=0; i < maxarg; i++) {
	5751	SV ** const svp = av_fetch(av, i, FALSE);
	5752	array[i] = svp ? *svp : &PL_sv_undef;
	5753	}
	5754	}
	5755	else
	5756	array = AvARRAY(av);
	5757
	5758	pat = S_concat_pat(aTHX_ pRExC_state, pat,
	5759	array, maxarg, NULL, recompile_p,
	5760	/* $" */
	5761	GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
	5762
	5763	continue;
	5764	}
	5765
	5766
	5767	/* we make the assumption here that each op in the list of
	5768	* op_siblings maps to one SV pushed onto the stack,
	5769	* except for code blocks, with have both an OP_NULL and
	5770	* and OP_CONST.
	5771	* This allows us to match up the list of SVs against the
	5772	* list of OPs to find the next code block.
	5773	*
	5774	* Note that PUSHMARK PADSV PADSV ..
	5775	* is optimised to
	5776	* PADRANGE PADSV PADSV ..
	5777	* so the alignment still works. */
	5778
	5779	if (oplist) {
	5780	if (oplist->op_type == OP_NULL
	5781	&& (oplist->op_flags & OPf_SPECIAL))
	5782	{
	5783	assert(n < pRExC_state->num_code_blocks);
	5784	pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
	5785	pRExC_state->code_blocks[n].block = oplist;
	5786	pRExC_state->code_blocks[n].src_regex = NULL;
	5787	n++;
	5788	code = 1;
	5789	oplist = OP_SIBLING(oplist); /* skip CONST */
	5790	assert(oplist);
	5791	}
	5792	oplist = OP_SIBLING(oplist);;
	5793	}
	5794
	5795	/* apply magic and QR overloading to arg */
	5796
	5797	SvGETMAGIC(msv);
	5798	if (SvROK(msv) && SvAMAGIC(msv)) {
	5799	SV *sv = AMG_CALLunary(msv, regexp_amg);
	5800	if (sv) {
	5801	if (SvROK(sv))
	5802	sv = SvRV(sv);
	5803	if (SvTYPE(sv) != SVt_REGEXP)
	5804	Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
	5805	msv = sv;
	5806	}
	5807	}
	5808
	5809	/* try concatenation overload ... */
	5810	if (pat && (SvAMAGIC(pat) \|\| SvAMAGIC(msv)) &&
	5811	(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
	5812	{
	5813	sv_setsv(pat, sv);
	5814	/* overloading involved: all bets are off over literal
	5815	* code. Pretend we haven't seen it */
	5816	pRExC_state->num_code_blocks -= n;
	5817	n = 0;
	5818	}
	5819	else {
	5820	/* ... or failing that, try "" overload */
	5821	while (SvAMAGIC(msv)
	5822	&& (sv = AMG_CALLunary(msv, string_amg))
	5823	&& sv != msv
	5824	&& !( SvROK(msv)
	5825	&& SvROK(sv)
	5826	&& SvRV(msv) == SvRV(sv))
	5827	) {
	5828	msv = sv;
	5829	SvGETMAGIC(msv);
	5830	}
	5831	if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
	5832	msv = SvRV(msv);
	5833
	5834	if (pat) {
	5835	/* this is a partially unrolled
	5836	* sv_catsv_nomg(pat, msv);
	5837	* that allows us to adjust code block indices if
	5838	* needed */
	5839	STRLEN dlen;
	5840	char *dst = SvPV_force_nomg(pat, dlen);
	5841	orig_patlen = dlen;
	5842	if (SvUTF8(msv) && !SvUTF8(pat)) {
	5843	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
	5844	sv_setpvn(pat, dst, dlen);
	5845	SvUTF8_on(pat);
	5846	}
	5847	sv_catsv_nomg(pat, msv);
	5848	rx = msv;
	5849	}
	5850	else
	5851	pat = msv;
	5852
	5853	if (code)
	5854	pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
	5855	}
	5856
	5857	/* extract any code blocks within any embedded qr//'s */
	5858	if (rx && SvTYPE(rx) == SVt_REGEXP
	5859	&& RX_ENGINE((REGEXP*)rx)->op_comp)
	5860	{
	5861
	5862	RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
	5863	if (ri->num_code_blocks) {
	5864	int i;
	5865	/* the presence of an embedded qr// with code means
	5866	* we should always recompile: the text of the
	5867	* qr// may not have changed, but it may be a
	5868	* different closure than last time */
	5869	*recompile_p = 1;
	5870	Renew(pRExC_state->code_blocks,
	5871	pRExC_state->num_code_blocks + ri->num_code_blocks,
	5872	struct reg_code_block);
	5873	pRExC_state->num_code_blocks += ri->num_code_blocks;
	5874
	5875	for (i=0; i < ri->num_code_blocks; i++) {
	5876	struct reg_code_block src, dst;
	5877	STRLEN offset = orig_patlen
	5878	+ ReANY((REGEXP *)rx)->pre_prefix;
	5879	assert(n < pRExC_state->num_code_blocks);
	5880	src = &ri->code_blocks[i];
	5881	dst = &pRExC_state->code_blocks[n];
	5882	dst->start = src->start + offset;
	5883	dst->end = src->end + offset;
	5884	dst->block = src->block;
	5885	dst->src_regex = (REGEXP) SvREFCNT_inc( (SV)
	5886	src->src_regex
	5887	? src->src_regex
	5888	: (REGEXP*)rx);
	5889	n++;
	5890	}
	5891	}
	5892	}
	5893	}
	5894	/* avoid calling magic multiple times on a single element e.g. =~ $qr */
	5895	if (alloced)
	5896	SvSETMAGIC(pat);
	5897
	5898	return pat;
	5899	}
	5900
	5901
	5902
	5903	/* see if there are any run-time code blocks in the pattern.
	5904	* False positives are allowed */
	5905
	5906	static bool
	5907	S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5908	char *pat, STRLEN plen)
	5909	{
	5910	int n = 0;
	5911	STRLEN s;
	5912
	5913	PERL_UNUSED_CONTEXT;
	5914
	5915	for (s = 0; s < plen; s++) {
	5916	if (n < pRExC_state->num_code_blocks
	5917	&& s == pRExC_state->code_blocks[n].start)
	5918	{
	5919	s = pRExC_state->code_blocks[n].end;
	5920	n++;
	5921	continue;
	5922	}
	5923	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
	5924	* positives here */
	5925	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
	5926	(pat[s+2] == '{'
	5927	\|\| (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
	5928	)
	5929	return 1;
	5930	}
	5931	return 0;
	5932	}
	5933
	5934	/* Handle run-time code blocks. We will already have compiled any direct
	5935	* or indirect literal code blocks. Now, take the pattern 'pat' and make a
	5936	* copy of it, but with any literal code blocks blanked out and
	5937	* appropriate chars escaped; then feed it into
	5938	*
	5939	* eval "qr'modified_pattern'"
	5940	*
	5941	* For example,
	5942	*
	5943	* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
	5944	*
	5945	* becomes
	5946	*
	5947	* qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
	5948	*
	5949	* After eval_sv()-ing that, grab any new code blocks from the returned qr
	5950	* and merge them with any code blocks of the original regexp.
	5951	*
	5952	* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
	5953	* instead, just save the qr and return FALSE; this tells our caller that
	5954	* the original pattern needs upgrading to utf8.
	5955	*/
	5956
	5957	static bool
	5958	S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5959	char *pat, STRLEN plen)
	5960	{
	5961	SV *qr;
	5962
	5963	GET_RE_DEBUG_FLAGS_DECL;
	5964
	5965	if (pRExC_state->runtime_code_qr) {
	5966	/* this is the second time we've been called; this should
	5967	* only happen if the main pattern got upgraded to utf8
	5968	* during compilation; re-use the qr we compiled first time
	5969	* round (which should be utf8 too)
	5970	*/
	5971	qr = pRExC_state->runtime_code_qr;
	5972	pRExC_state->runtime_code_qr = NULL;
	5973	assert(RExC_utf8 && SvUTF8(qr));
	5974	}
	5975	else {
	5976	int n = 0;
	5977	STRLEN s;
	5978	char p, newpat;
	5979	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
	5980	SV sv, qr_ref;
	5981	dSP;
	5982
	5983	/* determine how many extra chars we need for ' and \ escaping */
	5984	for (s = 0; s < plen; s++) {
	5985	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5986	newlen++;
	5987	}
	5988
	5989	Newx(newpat, newlen, char);
	5990	p = newpat;
	5991	p++ = 'q'; p++ = 'r'; *p++ = '\'';
	5992
	5993	for (s = 0; s < plen; s++) {
	5994	if (n < pRExC_state->num_code_blocks
	5995	&& s == pRExC_state->code_blocks[n].start)
	5996	{
	5997	/* blank out literal code block */
	5998	assert(pat[s] == '(');
	5999	while (s <= pRExC_state->code_blocks[n].end) {
	6000	*p++ = '_';
	6001	s++;
	6002	}
	6003	s--;
	6004	n++;
	6005	continue;
	6006	}
	6007	if (pat[s] == '\'' \|\| pat[s] == '\\')
	6008	*p++ = '\\';
	6009	*p++ = pat[s];
	6010	}
	6011	*p++ = '\'';
	6012	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
	6013	*p++ = 'x';
	6014	*p++ = '\0';
	6015	DEBUG_COMPILE_r({
	6016	PerlIO_printf(Perl_debug_log,
	6017	"%sre-parsing pattern for runtime code:%s %s\n",
	6018	PL_colors[4],PL_colors[5],newpat);
	6019	});
	6020
	6021	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
	6022	Safefree(newpat);
	6023
	6024	ENTER;
	6025	SAVETMPS;
	6026	save_re_context();
	6027	PUSHSTACKi(PERLSI_REQUIRE);
	6028	/* G_RE_REPARSING causes the toker to collapse \\ into \ when
	6029	* parsing qr''; normally only q'' does this. It also alters
	6030	* hints handling */
	6031	eval_sv(sv, G_SCALAR\|G_RE_REPARSING);
	6032	SvREFCNT_dec_NN(sv);
	6033	SPAGAIN;
	6034	qr_ref = POPs;
	6035	PUTBACK;
	6036	{
	6037	SV * const errsv = ERRSV;
	6038	if (SvTRUE_NN(errsv))
	6039	{
	6040	Safefree(pRExC_state->code_blocks);
	6041	/* use croak_sv ? */
	6042	Perl_croak_nocontext("%"SVf, SVfARG(errsv));
	6043	}
	6044	}
	6045	assert(SvROK(qr_ref));
	6046	qr = SvRV(qr_ref);
	6047	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
	6048	/* the leaving below frees the tmp qr_ref.
	6049	* Give qr a life of its own */
	6050	SvREFCNT_inc(qr);
	6051	POPSTACK;
	6052	FREETMPS;
	6053	LEAVE;
	6054
	6055	}
	6056
	6057	if (!RExC_utf8 && SvUTF8(qr)) {
	6058	/* first time through; the pattern got upgraded; save the
	6059	* qr for the next time through */
	6060	assert(!pRExC_state->runtime_code_qr);
	6061	pRExC_state->runtime_code_qr = qr;
	6062	return 0;
	6063	}
	6064
	6065
	6066	/* extract any code blocks within the returned qr// */
	6067
	6068
	6069	/* merge the main (r1) and run-time (r2) code blocks into one */
	6070	{
	6071	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
	6072	struct reg_code_block new_block, dst;
	6073	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
	6074	int i1 = 0, i2 = 0;
	6075
	6076	if (!r2->num_code_blocks) /* we guessed wrong */
	6077	{
	6078	SvREFCNT_dec_NN(qr);
	6079	return 1;
	6080	}
	6081
	6082	Newx(new_block,
	6083	r1->num_code_blocks + r2->num_code_blocks,
	6084	struct reg_code_block);
	6085	dst = new_block;
	6086
	6087	while ( i1 < r1->num_code_blocks
	6088	\|\| i2 < r2->num_code_blocks)
	6089	{
	6090	struct reg_code_block *src;
	6091	bool is_qr = 0;
	6092
	6093	if (i1 == r1->num_code_blocks) {
	6094	src = &r2->code_blocks[i2++];
	6095	is_qr = 1;
	6096	}
	6097	else if (i2 == r2->num_code_blocks)
	6098	src = &r1->code_blocks[i1++];
	6099	else if ( r1->code_blocks[i1].start
	6100	< r2->code_blocks[i2].start)
	6101	{
	6102	src = &r1->code_blocks[i1++];
	6103	assert(src->end < r2->code_blocks[i2].start);
	6104	}
	6105	else {
	6106	assert( r1->code_blocks[i1].start
	6107	> r2->code_blocks[i2].start);
	6108	src = &r2->code_blocks[i2++];
	6109	is_qr = 1;
	6110	assert(src->end < r1->code_blocks[i1].start);
	6111	}
	6112
	6113	assert(pat[src->start] == '(');
	6114	assert(pat[src->end] == ')');
	6115	dst->start = src->start;
	6116	dst->end = src->end;
	6117	dst->block = src->block;
	6118	dst->src_regex = is_qr ? (REGEXP) SvREFCNT_inc( (SV) qr)
	6119	: src->src_regex;
	6120	dst++;
	6121	}
	6122	r1->num_code_blocks += r2->num_code_blocks;
	6123	Safefree(r1->code_blocks);
	6124	r1->code_blocks = new_block;
	6125	}
	6126
	6127	SvREFCNT_dec_NN(qr);
	6128	return 1;
	6129	}
	6130
	6131
	6132	STATIC bool
	6133	S_setup_longest(pTHX_ RExC_state_t pRExC_state, SV sv_longest,
	6134	SV rx_utf8, SV rx_substr, SSize_t* rx_end_shift,
	6135	SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
	6136	STRLEN longest_length, bool eol, bool meol)
	6137	{
	6138	/* This is the common code for setting up the floating and fixed length
	6139	* string data extracted from Perl_re_op_compile() below. Returns a boolean
	6140	* as to whether succeeded or not */
	6141
	6142	I32 t;
	6143	SSize_t ml;
	6144
	6145	if (! (longest_length
	6146	\|\| (eol /* Can't have SEOL and MULTI */
	6147	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)))
	6148	)
	6149	/* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
	6150	\|\| (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
	6151	{
	6152	return FALSE;
	6153	}
	6154
	6155	/* copy the information about the longest from the reg_scan_data
	6156	over to the program. */
	6157	if (SvUTF8(sv_longest)) {
	6158	*rx_utf8 = sv_longest;
	6159	*rx_substr = NULL;
	6160	} else {
	6161	*rx_substr = sv_longest;
	6162	*rx_utf8 = NULL;
	6163	}
	6164	/* end_shift is how many chars that must be matched that
	6165	follow this item. We calculate it ahead of time as once the
	6166	lookbehind offset is added in we lose the ability to correctly
	6167	calculate it.*/
	6168	ml = minlen ? *(minlen) : (SSize_t)longest_length;
	6169	*rx_end_shift = ml - offset
	6170	- longest_length + (SvTAIL(sv_longest) != 0)
	6171	+ lookbehind;
	6172
	6173	t = (eol/* Can't have SEOL and MULTI */
	6174	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)));
	6175	fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
	6176
	6177	return TRUE;
	6178	}
	6179
	6180	/*
	6181	* Perl_re_op_compile - the perl internal RE engine's function to compile a
	6182	* regular expression into internal code.
	6183	* The pattern may be passed either as:
	6184	* a list of SVs (patternp plus pat_count)
	6185	* a list of OPs (expr)
	6186	* If both are passed, the SV list is used, but the OP list indicates
	6187	* which SVs are actually pre-compiled code blocks
	6188	*
	6189	* The SVs in the list have magic and qr overloading applied to them (and
	6190	* the list may be modified in-place with replacement SVs in the latter
	6191	* case).
	6192	*
	6193	* If the pattern hasn't changed from old_re, then old_re will be
	6194	* returned.
	6195	*
	6196	* eng is the current engine. If that engine has an op_comp method, then
	6197	* handle directly (i.e. we assume that op_comp was us); otherwise, just
	6198	* do the initial concatenation of arguments and pass on to the external
	6199	* engine.
	6200	*
	6201	* If is_bare_re is not null, set it to a boolean indicating whether the
	6202	* arg list reduced (after overloading) to a single bare regex which has
	6203	* been returned (i.e. /$qr/).
	6204	*
	6205	* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
	6206	*
	6207	* pm_flags contains the PMf_* flags, typically based on those from the
	6208	* pm_flags field of the related PMOP. Currently we're only interested in
	6209	* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
	6210	*
	6211	* We can't allocate space until we know how big the compiled form will be,
	6212	* but we can't compile it (and thus know how big it is) until we've got a
	6213	* place to put the code. So we cheat: we compile it twice, once with code
	6214	* generation turned off and size counting turned on, and once "for real".
	6215	* This also means that we don't allocate space until we are sure that the
	6216	* thing really will compile successfully, and we never have to move the
	6217	* code and thus invalidate pointers into it. (Note that it has to be in
	6218	* one piece because free() must be able to free it all.) [NB: not true in perl]
	6219	*
	6220	* Beware that the optimization-preparation code in here knows about some
	6221	* of the structure of the compiled regexp. [I'll say.]
	6222	*/
	6223
	6224	REGEXP *
	6225	Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
	6226	OP expr, const regexp_engine eng, REGEXP *old_re,
	6227	bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
	6228	{
	6229	REGEXP *rx;
	6230	struct regexp *r;
	6231	regexp_internal *ri;
	6232	STRLEN plen;
	6233	char *exp;
	6234	regnode *scan;
	6235	I32 flags;
	6236	SSize_t minlen = 0;
	6237	U32 rx_flags;
	6238	SV *pat;
	6239	SV *code_blocksv = NULL;
	6240	SV** new_patternp = patternp;
	6241
	6242	/* these are all flags - maybe they should be turned
	6243	* into a single int with different bit masks */
	6244	I32 sawlookahead = 0;
	6245	I32 sawplus = 0;
	6246	I32 sawopen = 0;
	6247	I32 sawminmod = 0;
	6248
	6249	regex_charset initial_charset = get_regex_charset(orig_rx_flags);
	6250	bool recompile = 0;
	6251	bool runtime_code = 0;
	6252	scan_data_t data;
	6253	RExC_state_t RExC_state;
	6254	RExC_state_t * const pRExC_state = &RExC_state;
	6255	#ifdef TRIE_STUDY_OPT
	6256	int restudied = 0;
	6257	RExC_state_t copyRExC_state;
	6258	#endif
	6259	GET_RE_DEBUG_FLAGS_DECL;
	6260
	6261	PERL_ARGS_ASSERT_RE_OP_COMPILE;
	6262
	6263	DEBUG_r(if (!PL_colorset) reginitcolors());
	6264
	6265	#ifndef PERL_IN_XSUB_RE
	6266	/* Initialize these here instead of as-needed, as is quick and avoids
	6267	* having to test them each time otherwise */
	6268	if (! PL_AboveLatin1) {
	6269	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	6270	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	6271	PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
	6272	PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
	6273	PL_HasMultiCharFold =
	6274	_new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
	6275	}
	6276	#endif
	6277
	6278	pRExC_state->code_blocks = NULL;
	6279	pRExC_state->num_code_blocks = 0;
	6280
	6281	if (is_bare_re)
	6282	*is_bare_re = FALSE;
	6283
	6284	if (expr && (expr->op_type == OP_LIST \|\|
	6285	(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
	6286	/* allocate code_blocks if needed */
	6287	OP *o;
	6288	int ncode = 0;
	6289
	6290	for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
	6291	if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
	6292	ncode++; /* count of DO blocks */
	6293	if (ncode) {
	6294	pRExC_state->num_code_blocks = ncode;
	6295	Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
	6296	}
	6297	}
	6298
	6299	if (!pat_count) {
	6300	/* compile-time pattern with just OP_CONSTs and DO blocks */
	6301
	6302	int n;
	6303	OP *o;
	6304
	6305	/* find how many CONSTs there are */
	6306	assert(expr);
	6307	n = 0;
	6308	if (expr->op_type == OP_CONST)
	6309	n = 1;
	6310	else
	6311	for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
	6312	if (o->op_type == OP_CONST)
	6313	n++;
	6314	}
	6315
	6316	/* fake up an SV array */
	6317
	6318	assert(!new_patternp);
	6319	Newx(new_patternp, n, SV*);
	6320	SAVEFREEPV(new_patternp);
	6321	pat_count = n;
	6322
	6323	n = 0;
	6324	if (expr->op_type == OP_CONST)
	6325	new_patternp[n] = cSVOPx_sv(expr);
	6326	else
	6327	for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
	6328	if (o->op_type == OP_CONST)
	6329	new_patternp[n++] = cSVOPo_sv;
	6330	}
	6331
	6332	}
	6333
	6334	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	6335	"Assembling pattern from %d elements%s\n", pat_count,
	6336	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	6337
	6338	/* set expr to the first arg op */
	6339
	6340	if (pRExC_state->num_code_blocks
	6341	&& expr->op_type != OP_CONST)
	6342	{
	6343	expr = cLISTOPx(expr)->op_first;
	6344	assert( expr->op_type == OP_PUSHMARK
	6345	\|\| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
	6346	\|\| expr->op_type == OP_PADRANGE);
	6347	expr = OP_SIBLING(expr);
	6348	}
	6349
	6350	pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
	6351	expr, &recompile, NULL);
	6352
	6353	/* handle bare (possibly after overloading) regex: foo =~ $re */
	6354	{
	6355	SV *re = pat;
	6356	if (SvROK(re))
	6357	re = SvRV(re);
	6358	if (SvTYPE(re) == SVt_REGEXP) {
	6359	if (is_bare_re)
	6360	*is_bare_re = TRUE;
	6361	SvREFCNT_inc(re);
	6362	Safefree(pRExC_state->code_blocks);
	6363	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	6364	"Precompiled pattern%s\n",
	6365	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	6366
	6367	return (REGEXP*)re;
	6368	}
	6369	}
	6370
	6371	exp = SvPV_nomg(pat, plen);
	6372
	6373	if (!eng->op_comp) {
	6374	if ((SvUTF8(pat) && IN_BYTES)
	6375	\|\| SvGMAGICAL(pat) \|\| SvAMAGIC(pat))
	6376	{
	6377	/* make a temporary copy; either to convert to bytes,
	6378	* or to avoid repeating get-magic / overloaded stringify */
	6379	pat = newSVpvn_flags(exp, plen, SVs_TEMP \|
	6380	(IN_BYTES ? 0 : SvUTF8(pat)));
	6381	}
	6382	Safefree(pRExC_state->code_blocks);
	6383	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
	6384	}
	6385
	6386	/* ignore the utf8ness if the pattern is 0 length */
	6387	RExC_utf8 = RExC_orig_utf8 = (plen == 0 \|\| IN_BYTES) ? 0 : SvUTF8(pat);
	6388	RExC_uni_semantics = 0;
	6389	RExC_contains_locale = 0;
	6390	RExC_contains_i = 0;
	6391	pRExC_state->runtime_code_qr = NULL;
	6392
	6393	DEBUG_COMPILE_r({
	6394	SV *dsv= sv_newmortal();
	6395	RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
	6396	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	6397	PL_colors[4],PL_colors[5],s);
	6398	});
	6399
	6400	redo_first_pass:
	6401	/* we jump here if we upgrade the pattern to utf8 and have to
	6402	* recompile */
	6403
	6404	if ((pm_flags & PMf_USE_RE_EVAL)
	6405	/* this second condition covers the non-regex literal case,
	6406	* i.e. $foo =~ '(?{})'. */
	6407	\|\| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
	6408	)
	6409	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
	6410
	6411	/* return old regex if pattern hasn't changed */
	6412	/* XXX: note in the below we have to check the flags as well as the
	6413	* pattern.
	6414	*
	6415	* Things get a touch tricky as we have to compare the utf8 flag
	6416	* independently from the compile flags. */
	6417
	6418	if ( old_re
	6419	&& !recompile
	6420	&& !!RX_UTF8(old_re) == !!RExC_utf8
	6421	&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
	6422	&& RX_PRECOMP(old_re)
	6423	&& RX_PRELEN(old_re) == plen
	6424	&& memEQ(RX_PRECOMP(old_re), exp, plen)
	6425	&& !runtime_code /* with runtime code, always recompile */ )
	6426	{
	6427	Safefree(pRExC_state->code_blocks);
	6428	return old_re;
	6429	}
	6430
	6431	rx_flags = orig_rx_flags;
	6432
	6433	if (rx_flags & PMf_FOLD) {
	6434	RExC_contains_i = 1;
	6435	}
	6436	if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	6437
	6438	/* Set to use unicode semantics if the pattern is in utf8 and has the
	6439	* 'depends' charset specified, as it means unicode when utf8 */
	6440	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	6441	}
	6442
	6443	RExC_precomp = exp;
	6444	RExC_flags = rx_flags;
	6445	RExC_pm_flags = pm_flags;
	6446
	6447	if (runtime_code) {
	6448	if (TAINTING_get && TAINT_get)
	6449	Perl_croak(aTHX_ "Eval-group in insecure regular expression");
	6450
	6451	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
	6452	/* whoops, we have a non-utf8 pattern, whilst run-time code
	6453	* got compiled as utf8. Try again with a utf8 pattern */
	6454	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	6455	pRExC_state->num_code_blocks);
	6456	goto redo_first_pass;
	6457	}
	6458	}
	6459	assert(!pRExC_state->runtime_code_qr);
	6460
	6461	RExC_sawback = 0;
	6462
	6463	RExC_seen = 0;
	6464	RExC_maxlen = 0;
	6465	RExC_in_lookbehind = 0;
	6466	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	6467	RExC_extralen = 0;
	6468	RExC_override_recoding = 0;
	6469	RExC_in_multi_char_class = 0;
	6470
	6471	/* First pass: determine size, legality. */
	6472	RExC_parse = exp;
	6473	RExC_start = exp;
	6474	RExC_end = exp + plen;
	6475	RExC_naughty = 0;
	6476	RExC_npar = 1;
	6477	RExC_nestroot = 0;
	6478	RExC_size = 0L;
	6479	RExC_emit = (regnode *) &RExC_emit_dummy;
	6480	RExC_whilem_seen = 0;
	6481	RExC_open_parens = NULL;
	6482	RExC_close_parens = NULL;
	6483	RExC_opend = NULL;
	6484	RExC_paren_names = NULL;
	6485	#ifdef DEBUGGING
	6486	RExC_paren_name_list = NULL;
	6487	#endif
	6488	RExC_recurse = NULL;
	6489	RExC_study_chunk_recursed = NULL;
	6490	RExC_study_chunk_recursed_bytes= 0;
	6491	RExC_recurse_count = 0;
	6492	pRExC_state->code_index = 0;
	6493
	6494	#if 0 /* REGC() is (currently) a NOP at the first pass.
	6495	* Clever compilers notice this and complain. --jhi */
	6496	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	6497	#endif
	6498	DEBUG_PARSE_r(
	6499	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	6500	RExC_lastnum=0;
	6501	RExC_lastparse=NULL;
	6502	);
	6503	/* reg may croak on us, not giving us a chance to free
	6504	pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
	6505	need it to survive as long as the regexp (qr/(?{})/).
	6506	We must check that code_blocksv is not already set, because we may
	6507	have jumped back to restart the sizing pass. */
	6508	if (pRExC_state->code_blocks && !code_blocksv) {
	6509	code_blocksv = newSV_type(SVt_PV);
	6510	SAVEFREESV(code_blocksv);
	6511	SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
	6512	SvLEN_set(code_blocksv, 1); /sufficient to make sv_clear free it/
	6513	}
	6514	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	6515	/* It's possible to write a regexp in ascii that represents Unicode
	6516	codepoints outside of the byte range, such as via \x{100}. If we
	6517	detect such a sequence we have to convert the entire pattern to utf8
	6518	and then recompile, as our sizing calculation will have been based
	6519	on 1 byte == 1 character, but we will need to use utf8 to encode
	6520	at least some part of the pattern, and therefore must convert the whole
	6521	thing.
	6522	-- dmq */
	6523	if (flags & RESTART_UTF8) {
	6524	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	6525	pRExC_state->num_code_blocks);
	6526	goto redo_first_pass;
	6527	}
	6528	Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
	6529	}
	6530	if (code_blocksv)
	6531	SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
	6532
	6533	DEBUG_PARSE_r({
	6534	PerlIO_printf(Perl_debug_log,
	6535	"Required size %"IVdf" nodes\n"
	6536	"Starting second pass (creation)\n",
	6537	(IV)RExC_size);
	6538	RExC_lastnum=0;
	6539	RExC_lastparse=NULL;
	6540	});
	6541
	6542	/* The first pass could have found things that force Unicode semantics */
	6543	if ((RExC_utf8 \|\| RExC_uni_semantics)
	6544	&& get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
	6545	{
	6546	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	6547	}
	6548
	6549	/* Small enough for pointer-storage convention?
	6550	If extralen==0, this means that we will not need long jumps. */
	6551	if (RExC_size >= 0x10000L && RExC_extralen)
	6552	RExC_size += RExC_extralen;
	6553	else
	6554	RExC_extralen = 0;
	6555	if (RExC_whilem_seen > 15)
	6556	RExC_whilem_seen = 15;
	6557
	6558	/* Allocate space and zero-initialize. Note, the two step process
	6559	of zeroing when in debug mode, thus anything assigned has to
	6560	happen after that */
	6561	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	6562	r = ReANY(rx);
	6563	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	6564	char, regexp_internal);
	6565	if ( r == NULL \|\| ri == NULL )
	6566	FAIL("Regexp out of space");
	6567	#ifdef DEBUGGING
	6568	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	6569	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	6570	char);
	6571	#else
	6572	/* bulk initialize base fields with 0. */
	6573	Zero(ri, sizeof(regexp_internal), char);
	6574	#endif
	6575
	6576	/* non-zero initialization begins here */
	6577	RXi_SET( r, ri );
	6578	r->engine= eng;
	6579	r->extflags = rx_flags;
	6580	RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
	6581
	6582	if (pm_flags & PMf_IS_QR) {
	6583	ri->code_blocks = pRExC_state->code_blocks;
	6584	ri->num_code_blocks = pRExC_state->num_code_blocks;
	6585	}
	6586	else
	6587	{
	6588	int n;
	6589	for (n = 0; n < pRExC_state->num_code_blocks; n++)
	6590	if (pRExC_state->code_blocks[n].src_regex)
	6591	SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
	6592	SAVEFREEPV(pRExC_state->code_blocks);
	6593	}
	6594
	6595	{
	6596	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	6597	bool has_charset = (get_regex_charset(r->extflags)
	6598	!= REGEX_DEPENDS_CHARSET);
	6599
	6600	/* The caret is output if there are any defaults: if not all the STD
	6601	* flags are set, or if no character set specifier is needed */
	6602	bool has_default =
	6603	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	6604	\|\| ! has_charset);
	6605	bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
	6606	== REG_RUN_ON_COMMENT_SEEN);
	6607	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	6608	>> RXf_PMf_STD_PMMOD_SHIFT);
	6609	const char fptr = STD_PAT_MODS; /"msix"*/
	6610	char *p;
	6611	/* Allocate for the worst case, which is all the std flags are turned
	6612	* on. If more precision is desired, we could do a population count of
	6613	* the flags set. This could be done with a small lookup table, or by
	6614	* shifting, masking and adding, or even, when available, assembly
	6615	* language for a machine-language population count.
	6616	* We never output a minus, as all those are defaults, so are
	6617	* covered by the caret */
	6618	const STRLEN wraplen = plen + has_p + has_runon
	6619	+ has_default /* If needs a caret */
	6620
	6621	/* If needs a character set specifier */
	6622	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	6623	+ (sizeof(STD_PAT_MODS) - 1)
	6624	+ (sizeof("(?:)") - 1);
	6625
	6626	Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
	6627	r->xpv_len_u.xpvlenu_pv = p;
	6628	if (RExC_utf8)
	6629	SvFLAGS(rx) \|= SVf_UTF8;
	6630	p++='('; p++='?';
	6631
	6632	/* If a default, cover it using the caret */
	6633	if (has_default) {
	6634	*p++= DEFAULT_PAT_MOD;
	6635	}
	6636	if (has_charset) {
	6637	STRLEN len;
	6638	const char* const name = get_regex_charset_name(r->extflags, &len);
	6639	Copy(name, p, len, char);
	6640	p += len;
	6641	}
	6642	if (has_p)
	6643	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	6644	{
	6645	char ch;
	6646	while((ch = *fptr++)) {
	6647	if(reganch & 1)
	6648	*p++ = ch;
	6649	reganch >>= 1;
	6650	}
	6651	}
	6652
	6653	*p++ = ':';
	6654	Copy(RExC_precomp, p, plen, char);
	6655	assert ((RX_WRAPPED(rx) - p) < 16);
	6656	r->pre_prefix = p - RX_WRAPPED(rx);
	6657	p += plen;
	6658	if (has_runon)
	6659	*p++ = '\n';
	6660	*p++ = ')';
	6661	*p = 0;
	6662	SvCUR_set(rx, p - RX_WRAPPED(rx));
	6663	}
	6664
	6665	r->intflags = 0;
	6666	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	6667
	6668	/* setup various meta data about recursion, this all requires
	6669	* RExC_npar to be correctly set, and a bit later on we clear it */
	6670	if (RExC_seen & REG_RECURSE_SEEN) {
	6671	Newxz(RExC_open_parens, RExC_npar,regnode *);
	6672	SAVEFREEPV(RExC_open_parens);
	6673	Newxz(RExC_close_parens,RExC_npar,regnode *);
	6674	SAVEFREEPV(RExC_close_parens);
	6675	}
	6676	if (RExC_seen & (REG_RECURSE_SEEN \| REG_GOSTART_SEEN)) {
	6677	/* Note, RExC_npar is 1 + the number of parens in a pattern.
	6678	* So its 1 if there are no parens. */
	6679	RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
	6680	((RExC_npar & 0x07) != 0);
	6681	Newx(RExC_study_chunk_recursed,
	6682	RExC_study_chunk_recursed_bytes * RExC_npar, U8);
	6683	SAVEFREEPV(RExC_study_chunk_recursed);
	6684	}
	6685
	6686	/* Useful during FAIL. */
	6687	#ifdef RE_TRACK_PATTERN_OFFSETS
	6688	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	6689	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	6690	"%s %"UVuf" bytes for offset annotations.\n",
	6691	ri->u.offsets ? "Got" : "Couldn't get",
	6692	(UV)((2RExC_size+1) sizeof(U32))));
	6693	#endif
	6694	SetProgLen(ri,RExC_size);
	6695	RExC_rx_sv = rx;
	6696	RExC_rx = r;
	6697	RExC_rxi = ri;
	6698
	6699	/* Second pass: emit code. */
	6700	RExC_flags = rx_flags; /* don't let top level (?i) bleed */
	6701	RExC_pm_flags = pm_flags;
	6702	RExC_parse = exp;
	6703	RExC_end = exp + plen;
	6704	RExC_naughty = 0;
	6705	RExC_npar = 1;
	6706	RExC_emit_start = ri->program;
	6707	RExC_emit = ri->program;
	6708	RExC_emit_bound = ri->program + RExC_size + 1;
	6709	pRExC_state->code_index = 0;
	6710
	6711	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	6712	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	6713	ReREFCNT_dec(rx);
	6714	Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
	6715	}
	6716	/* XXXX To minimize changes to RE engine we always allocate
	6717	3-units-long substrs field. */
	6718	Newx(r->substrs, 1, struct reg_substr_data);
	6719	if (RExC_recurse_count) {
	6720	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	6721	SAVEFREEPV(RExC_recurse);
	6722	}
	6723
	6724	reStudy:
	6725	r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
	6726	Zero(r->substrs, 1, struct reg_substr_data);
	6727	if (RExC_study_chunk_recursed)
	6728	Zero(RExC_study_chunk_recursed,
	6729	RExC_study_chunk_recursed_bytes * RExC_npar, U8);
	6730
	6731	#ifdef TRIE_STUDY_OPT
	6732	if (!restudied) {
	6733	StructCopy(&zero_scan_data, &data, scan_data_t);
	6734	copyRExC_state = RExC_state;
	6735	} else {
	6736	U32 seen=RExC_seen;
	6737	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	6738
	6739	RExC_state = copyRExC_state;
	6740	if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
	6741	RExC_seen \|= REG_TOP_LEVEL_BRANCHES_SEEN;
	6742	else
	6743	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
	6744	StructCopy(&zero_scan_data, &data, scan_data_t);
	6745	}
	6746	#else
	6747	StructCopy(&zero_scan_data, &data, scan_data_t);
	6748	#endif
	6749
	6750	/* Dig out information for optimizations. */
	6751	r->extflags = RExC_flags; /* was pm_op */
	6752	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	6753
	6754	if (UTF)
	6755	SvUTF8_on(rx); /* Unicode in it? */
	6756	ri->regstclass = NULL;
	6757	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	6758	r->intflags \|= PREGf_NAUGHTY;
	6759	scan = ri->program + 1; /* First BRANCH. */
	6760
	6761	/* testing for BRANCH here tells us whether there is "must appear"
	6762	data in the pattern. If there is then we can use it for optimisations */
	6763	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
	6764	*/
	6765	SSize_t fake;
	6766	STRLEN longest_float_length, longest_fixed_length;
	6767	regnode_ssc ch_class; /* pointed to by data */
	6768	int stclass_flag;
	6769	SSize_t last_close = 0; /* pointed to by data */
	6770	regnode *first= scan;
	6771	regnode *first_next= regnext(first);
	6772	/*
	6773	* Skip introductions and multiplicators >= 1
	6774	* so that we can extract the 'meat' of the pattern that must
	6775	* match in the large if() sequence following.
	6776	* NOTE that EXACT is NOT covered here, as it is normally
	6777	* picked up by the optimiser separately.
	6778	*
	6779	* This is unfortunate as the optimiser isnt handling lookahead
	6780	* properly currently.
	6781	*
	6782	*/
	6783	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	6784	/* An OR of one alternative - should not happen now. */
	6785	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	6786	/* for now we can't handle lookbehind IFMATCH*/
	6787	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	6788	(OP(first) == PLUS) \|\|
	6789	(OP(first) == MINMOD) \|\|
	6790	/* An {n,m} with n>0 */
	6791	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	6792	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	6793	{
	6794	/*
	6795	* the only op that could be a regnode is PLUS, all the rest
	6796	* will be regnode_1 or regnode_2.
	6797	*
	6798	* (yves doesn't think this is true)
	6799	*/
	6800	if (OP(first) == PLUS)
	6801	sawplus = 1;
	6802	else {
	6803	if (OP(first) == MINMOD)
	6804	sawminmod = 1;
	6805	first += regarglen[OP(first)];
	6806	}
	6807	first = NEXTOPER(first);
	6808	first_next= regnext(first);
	6809	}
	6810
	6811	/* Starting-point info. */
	6812	again:
	6813	DEBUG_PEEP("first:",first,0);
	6814	/* Ignore EXACT as we deal with it later. */
	6815	if (PL_regkind[OP(first)] == EXACT) {
	6816	if (OP(first) == EXACT)
	6817	NOOP; /* Empty, get anchored substr later. */
	6818	else
	6819	ri->regstclass = first;
	6820	}
	6821	#ifdef TRIE_STCLASS
	6822	else if (PL_regkind[OP(first)] == TRIE &&
	6823	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	6824	{
	6825	/* this can happen only on restudy */
	6826	ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
	6827	}
	6828	#endif
	6829	else if (REGNODE_SIMPLE(OP(first)))
	6830	ri->regstclass = first;
	6831	else if (PL_regkind[OP(first)] == BOUND \|\|
	6832	PL_regkind[OP(first)] == NBOUND)
	6833	ri->regstclass = first;
	6834	else if (PL_regkind[OP(first)] == BOL) {
	6835	r->intflags \|= (OP(first) == MBOL
	6836	? PREGf_ANCH_MBOL
	6837	: (OP(first) == SBOL
	6838	? PREGf_ANCH_SBOL
	6839	: PREGf_ANCH_BOL));
	6840	first = NEXTOPER(first);
	6841	goto again;
	6842	}
	6843	else if (OP(first) == GPOS) {
	6844	r->intflags \|= PREGf_ANCH_GPOS;
	6845	first = NEXTOPER(first);
	6846	goto again;
	6847	}
	6848	else if ((!sawopen \|\| !RExC_sawback) &&
	6849	!sawlookahead &&
	6850	(OP(first) == STAR &&
	6851	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	6852	!(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
	6853	{
	6854	/* turn .* into ^.* with an implied $=1 /
	6855	const int type =
	6856	(OP(NEXTOPER(first)) == REG_ANY)
	6857	? PREGf_ANCH_MBOL
	6858	: PREGf_ANCH_SBOL;
	6859	r->intflags \|= (type \| PREGf_IMPLICIT);
	6860	first = NEXTOPER(first);
	6861	goto again;
	6862	}
	6863	if (sawplus && !sawminmod && !sawlookahead
	6864	&& (!sawopen \|\| !RExC_sawback)
	6865	&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
	6866	/* x+ must match at the 1st pos of run of x's */
	6867	r->intflags \|= PREGf_SKIP;
	6868
	6869	/* Scan is after the zeroth branch, first is atomic matcher. */
	6870	#ifdef TRIE_STUDY_OPT
	6871	DEBUG_PARSE_r(
	6872	if (!restudied)
	6873	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6874	(IV)(first - scan + 1))
	6875	);
	6876	#else
	6877	DEBUG_PARSE_r(
	6878	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6879	(IV)(first - scan + 1))
	6880	);
	6881	#endif
	6882
	6883
	6884	/*
	6885	* If there's something expensive in the r.e., find the
	6886	* longest literal string that must appear and make it the
	6887	* regmust. Resolve ties in favor of later strings, since
	6888	* the regstart check works with the beginning of the r.e.
	6889	* and avoiding duplication strengthens checking. Not a
	6890	* strong reason, but sufficient in the absence of others.
	6891	* [Now we resolve ties in favor of the earlier string if
	6892	* it happens that c_offset_min has been invalidated, since the
	6893	* earlier string may buy us something the later one won't.]
	6894	*/
	6895
	6896	data.longest_fixed = newSVpvs("");
	6897	data.longest_float = newSVpvs("");
	6898	data.last_found = newSVpvs("");
	6899	data.longest = &(data.longest_fixed);
	6900	ENTER_with_name("study_chunk");
	6901	SAVEFREESV(data.longest_fixed);
	6902	SAVEFREESV(data.longest_float);
	6903	SAVEFREESV(data.last_found);
	6904	first = scan;
	6905	if (!ri->regstclass) {
	6906	ssc_init(pRExC_state, &ch_class);
	6907	data.start_class = &ch_class;
	6908	stclass_flag = SCF_DO_STCLASS_AND;
	6909	} else /* XXXX Check for BOUND? */
	6910	stclass_flag = 0;
	6911	data.last_closep = &last_close;
	6912
	6913	DEBUG_RExC_seen();
	6914	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
	6915	scan + RExC_size, /* Up to end */
	6916	&data, -1, 0, NULL,
	6917	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag
	6918	\| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
	6919	0);
	6920
	6921
	6922	CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
	6923
	6924
	6925	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	6926	&& data.last_start_min == 0 && data.last_end > 0
	6927	&& !RExC_seen_zerolen
	6928	&& !(RExC_seen & REG_VERBARG_SEEN)
	6929	&& !(RExC_seen & REG_GPOS_SEEN)
	6930	){
	6931	r->extflags \|= RXf_CHECK_ALL;
	6932	}
	6933	scan_commit(pRExC_state, &data,&minlen,0);
	6934
	6935	longest_float_length = CHR_SVLEN(data.longest_float);
	6936
	6937	if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	6938	&& data.offset_fixed == data.offset_float_min
	6939	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	6940	&& S_setup_longest (aTHX_ pRExC_state,
	6941	data.longest_float,
	6942	&(r->float_utf8),
	6943	&(r->float_substr),
	6944	&(r->float_end_shift),
	6945	data.lookbehind_float,
	6946	data.offset_float_min,
	6947	data.minlen_float,
	6948	longest_float_length,
	6949	cBOOL(data.flags & SF_FL_BEFORE_EOL),
	6950	cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
	6951	{
	6952	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	6953	r->float_max_offset = data.offset_float_max;
	6954	if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
	6955	r->float_max_offset -= data.lookbehind_float;
	6956	SvREFCNT_inc_simple_void_NN(data.longest_float);
	6957	}
	6958	else {
	6959	r->float_substr = r->float_utf8 = NULL;
	6960	longest_float_length = 0;
	6961	}
	6962
	6963	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	6964
	6965	if (S_setup_longest (aTHX_ pRExC_state,
	6966	data.longest_fixed,
	6967	&(r->anchored_utf8),
	6968	&(r->anchored_substr),
	6969	&(r->anchored_end_shift),
	6970	data.lookbehind_fixed,
	6971	data.offset_fixed,
	6972	data.minlen_fixed,
	6973	longest_fixed_length,
	6974	cBOOL(data.flags & SF_FIX_BEFORE_EOL),
	6975	cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
	6976	{
	6977	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	6978	SvREFCNT_inc_simple_void_NN(data.longest_fixed);
	6979	}
	6980	else {
	6981	r->anchored_substr = r->anchored_utf8 = NULL;
	6982	longest_fixed_length = 0;
	6983	}
	6984	LEAVE_with_name("study_chunk");
	6985
	6986	if (ri->regstclass
	6987	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	6988	ri->regstclass = NULL;
	6989
	6990	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	6991	&& stclass_flag
	6992	&& ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
	6993	&& !ssc_is_anything(data.start_class))
	6994	{
	6995	const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
	6996
	6997	ssc_finalize(pRExC_state, data.start_class);
	6998
	6999	Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
	7000	StructCopy(data.start_class,
	7001	(regnode_ssc*)RExC_rxi->data->data[n],
	7002	regnode_ssc);
	7003	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	7004	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	7005	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	7006	regprop(r, sv, (regnode*)data.start_class, NULL);
	7007	PerlIO_printf(Perl_debug_log,
	7008	"synthetic stclass \"%s\".\n",
	7009	SvPVX_const(sv));});
	7010	data.start_class = NULL;
	7011	}
	7012
	7013	/* A temporary algorithm prefers floated substr to fixed one to dig
	7014	* more info. */
	7015	if (longest_fixed_length > longest_float_length) {
	7016	r->substrs->check_ix = 0;
	7017	r->check_end_shift = r->anchored_end_shift;
	7018	r->check_substr = r->anchored_substr;
	7019	r->check_utf8 = r->anchored_utf8;
	7020	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	7021	if (r->intflags & (PREGf_ANCH_SBOL\|PREGf_ANCH_GPOS))
	7022	r->intflags \|= PREGf_NOSCAN;
	7023	}
	7024	else {
	7025	r->substrs->check_ix = 1;
	7026	r->check_end_shift = r->float_end_shift;
	7027	r->check_substr = r->float_substr;
	7028	r->check_utf8 = r->float_utf8;
	7029	r->check_offset_min = r->float_min_offset;
	7030	r->check_offset_max = r->float_max_offset;
	7031	}
	7032	if ((r->check_substr \|\| r->check_utf8) ) {
	7033	r->extflags \|= RXf_USE_INTUIT;
	7034	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	7035	r->extflags \|= RXf_INTUIT_TAIL;
	7036	}
	7037	r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
	7038
	7039	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	7040	if ( (STRLEN)minlen < longest_float_length )
	7041	minlen= longest_float_length;
	7042	if ( (STRLEN)minlen < longest_fixed_length )
	7043	minlen= longest_fixed_length;
	7044	*/
	7045	}
	7046	else {
	7047	/* Several toplevels. Best we can is to set minlen. */
	7048	SSize_t fake;
	7049	regnode_ssc ch_class;
	7050	SSize_t last_close = 0;
	7051
	7052	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	7053
	7054	scan = ri->program + 1;
	7055	ssc_init(pRExC_state, &ch_class);
	7056	data.start_class = &ch_class;
	7057	data.last_closep = &last_close;
	7058
	7059	DEBUG_RExC_seen();
	7060	minlen = study_chunk(pRExC_state,
	7061	&scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
	7062	SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS\|(restudied
	7063	? SCF_TRIE_DOING_RESTUDY
	7064	: 0),
	7065	0);
	7066
	7067	CHECK_RESTUDY_GOTO_butfirst(NOOP);
	7068
	7069	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	7070	= r->float_substr = r->float_utf8 = NULL;
	7071
	7072	if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
	7073	&& ! ssc_is_anything(data.start_class))
	7074	{
	7075	const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
	7076
	7077	ssc_finalize(pRExC_state, data.start_class);
	7078
	7079	Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
	7080	StructCopy(data.start_class,
	7081	(regnode_ssc*)RExC_rxi->data->data[n],
	7082	regnode_ssc);
	7083	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	7084	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	7085	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	7086	regprop(r, sv, (regnode*)data.start_class, NULL);
	7087	PerlIO_printf(Perl_debug_log,
	7088	"synthetic stclass \"%s\".\n",
	7089	SvPVX_const(sv));});
	7090	data.start_class = NULL;
	7091	}
	7092	}
	7093
	7094	if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
	7095	r->extflags \|= RXf_UNBOUNDED_QUANTIFIER_SEEN;
	7096	r->maxlen = REG_INFTY;
	7097	}
	7098	else {
	7099	r->maxlen = RExC_maxlen;
	7100	}
	7101
	7102	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	7103	the "real" pattern. */
	7104	DEBUG_OPTIMISE_r({
	7105	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
	7106	(IV)minlen, (IV)r->minlen, RExC_maxlen);
	7107	});
	7108	r->minlenret = minlen;
	7109	if (r->minlen < minlen)
	7110	r->minlen = minlen;
	7111
	7112	if (RExC_seen & REG_GPOS_SEEN)
	7113	r->intflags \|= PREGf_GPOS_SEEN;
	7114	if (RExC_seen & REG_LOOKBEHIND_SEEN)
	7115	r->extflags \|= RXf_NO_INPLACE_SUBST; /* inplace might break the
	7116	lookbehind */
	7117	if (pRExC_state->num_code_blocks)
	7118	r->extflags \|= RXf_EVAL_SEEN;
	7119	if (RExC_seen & REG_CANY_SEEN)
	7120	r->intflags \|= PREGf_CANY_SEEN;
	7121	if (RExC_seen & REG_VERBARG_SEEN)
	7122	{
	7123	r->intflags \|= PREGf_VERBARG_SEEN;
	7124	r->extflags \|= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
	7125	}
	7126	if (RExC_seen & REG_CUTGROUP_SEEN)
	7127	r->intflags \|= PREGf_CUTGROUP_SEEN;
	7128	if (pm_flags & PMf_USE_RE_EVAL)
	7129	r->intflags \|= PREGf_USE_RE_EVAL;
	7130	if (RExC_paren_names)
	7131	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	7132	else
	7133	RXp_PAREN_NAMES(r) = NULL;
	7134
	7135	/* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
	7136	* so it can be used in pp.c */
	7137	if (r->intflags & PREGf_ANCH)
	7138	r->extflags \|= RXf_IS_ANCHORED;
	7139
	7140
	7141	{
	7142	/* this is used to identify "special" patterns that might result
	7143	* in Perl NOT calling the regex engine and instead doing the match "itself",
	7144	* particularly special cases in split//. By having the regex compiler
	7145	* do this pattern matching at a regop level (instead of by inspecting the pattern)
	7146	* we avoid weird issues with equivalent patterns resulting in different behavior,
	7147	* AND we allow non Perl engines to get the same optimizations by the setting the
	7148	* flags appropriately - Yves */
	7149	regnode *first = ri->program + 1;
	7150	U8 fop = OP(first);
	7151	regnode *next = NEXTOPER(first);
	7152	U8 nop = OP(next);
	7153
	7154	if (PL_regkind[fop] == NOTHING && nop == END)
	7155	r->extflags \|= RXf_NULL;
	7156	else if (PL_regkind[fop] == BOL && nop == END)
	7157	r->extflags \|= RXf_START_ONLY;
	7158	else if (fop == PLUS
	7159	&& PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
	7160	&& OP(regnext(first)) == END)
	7161	r->extflags \|= RXf_WHITE;
	7162	else if ( r->extflags & RXf_SPLIT
	7163	&& fop == EXACT
	7164	&& STR_LEN(first) == 1
	7165	&& *(STRING(first)) == ' '
	7166	&& OP(regnext(first)) == END )
	7167	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	7168
	7169	}
	7170
	7171	if (RExC_contains_locale) {
	7172	RXp_EXTFLAGS(r) \|= RXf_TAINTED;
	7173	}
	7174
	7175	#ifdef DEBUGGING
	7176	if (RExC_paren_names) {
	7177	ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
	7178	ri->data->data[ri->name_list_idx]
	7179	= (void*)SvREFCNT_inc(RExC_paren_name_list);
	7180	} else
	7181	#endif
	7182	ri->name_list_idx = 0;
	7183
	7184	if (RExC_recurse_count) {
	7185	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	7186	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	7187	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	7188	}
	7189	}
	7190	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	7191	/* assume we don't need to swap parens around before we match */
	7192
	7193	DEBUG_DUMP_r({
	7194	DEBUG_RExC_seen();
	7195	PerlIO_printf(Perl_debug_log,"Final program:\n");
	7196	regdump(r);
	7197	});
	7198	#ifdef RE_TRACK_PATTERN_OFFSETS
	7199	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	7200	const STRLEN len = ri->u.offsets[0];
	7201	STRLEN i;
	7202	GET_RE_DEBUG_FLAGS_DECL;
	7203	PerlIO_printf(Perl_debug_log,
	7204	"Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	7205	for (i = 1; i <= len; i++) {
	7206	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	7207	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	7208	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	7209	}
	7210	PerlIO_printf(Perl_debug_log, "\n");
	7211	});
	7212	#endif
	7213
	7214	#ifdef USE_ITHREADS
	7215	/* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
	7216	* by setting the regexp SV to readonly-only instead. If the
	7217	* pattern's been recompiled, the USEDness should remain. */
	7218	if (old_re && SvREADONLY(old_re))
	7219	SvREADONLY_on(rx);
	7220	#endif
	7221	return rx;
	7222	}
	7223
	7224
	7225	SV*
	7226	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	7227	const U32 flags)
	7228	{
	7229	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	7230
	7231	PERL_UNUSED_ARG(value);
	7232
	7233	if (flags & RXapif_FETCH) {
	7234	return reg_named_buff_fetch(rx, key, flags);
	7235	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	7236	Perl_croak_no_modify();
	7237	return NULL;
	7238	} else if (flags & RXapif_EXISTS) {
	7239	return reg_named_buff_exists(rx, key, flags)
	7240	? &PL_sv_yes
	7241	: &PL_sv_no;
	7242	} else if (flags & RXapif_REGNAMES) {
	7243	return reg_named_buff_all(rx, flags);
	7244	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	7245	return reg_named_buff_scalar(rx, flags);
	7246	} else {
	7247	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	7248	return NULL;
	7249	}
	7250	}
	7251
	7252	SV*
	7253	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	7254	const U32 flags)
	7255	{
	7256	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	7257	PERL_UNUSED_ARG(lastkey);
	7258
	7259	if (flags & RXapif_FIRSTKEY)
	7260	return reg_named_buff_firstkey(rx, flags);
	7261	else if (flags & RXapif_NEXTKEY)
	7262	return reg_named_buff_nextkey(rx, flags);
	7263	else {
	7264	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
	7265	(int)flags);
	7266	return NULL;
	7267	}
	7268	}
	7269
	7270	SV*
	7271	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	7272	const U32 flags)
	7273	{
	7274	AV *retarray = NULL;
	7275	SV *ret;
	7276	struct regexp *const rx = ReANY(r);
	7277
	7278	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	7279
	7280	if (flags & RXapif_ALL)
	7281	retarray=newAV();
	7282
	7283	if (rx && RXp_PAREN_NAMES(rx)) {
	7284	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	7285	if (he_str) {
	7286	IV i;
	7287	SV* sv_dat=HeVAL(he_str);
	7288	I32 nums=(I32)SvPVX(sv_dat);
	7289	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	7290	if ((I32)(rx->nparens) >= nums[i]
	7291	&& rx->offs[nums[i]].start != -1
	7292	&& rx->offs[nums[i]].end != -1)
	7293	{
	7294	ret = newSVpvs("");
	7295	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	7296	if (!retarray)
	7297	return ret;
	7298	} else {
	7299	if (retarray)
	7300	ret = newSVsv(&PL_sv_undef);
	7301	}
	7302	if (retarray)
	7303	av_push(retarray, ret);
	7304	}
	7305	if (retarray)
	7306	return newRV_noinc(MUTABLE_SV(retarray));
	7307	}
	7308	}
	7309	return NULL;
	7310	}
	7311
	7312	bool
	7313	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	7314	const U32 flags)
	7315	{
	7316	struct regexp *const rx = ReANY(r);
	7317
	7318	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	7319
	7320	if (rx && RXp_PAREN_NAMES(rx)) {
	7321	if (flags & RXapif_ALL) {
	7322	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	7323	} else {
	7324	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	7325	if (sv) {
	7326	SvREFCNT_dec_NN(sv);
	7327	return TRUE;
	7328	} else {
	7329	return FALSE;
	7330	}
	7331	}
	7332	} else {
	7333	return FALSE;
	7334	}
	7335	}
	7336
	7337	SV*
	7338	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	7339	{
	7340	struct regexp *const rx = ReANY(r);
	7341
	7342	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	7343
	7344	if ( rx && RXp_PAREN_NAMES(rx) ) {
	7345	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	7346
	7347	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	7348	} else {
	7349	return FALSE;
	7350	}
	7351	}
	7352
	7353	SV*
	7354	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	7355	{
	7356	struct regexp *const rx = ReANY(r);
	7357	GET_RE_DEBUG_FLAGS_DECL;
	7358
	7359	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	7360
	7361	if (rx && RXp_PAREN_NAMES(rx)) {
	7362	HV *hv = RXp_PAREN_NAMES(rx);
	7363	HE *temphe;
	7364	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	7365	IV i;
	7366	IV parno = 0;
	7367	SV* sv_dat = HeVAL(temphe);
	7368	I32 nums = (I32)SvPVX(sv_dat);
	7369	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	7370	if ((I32)(rx->lastparen) >= nums[i] &&
	7371	rx->offs[nums[i]].start != -1 &&
	7372	rx->offs[nums[i]].end != -1)
	7373	{
	7374	parno = nums[i];
	7375	break;
	7376	}
	7377	}
	7378	if (parno \|\| flags & RXapif_ALL) {
	7379	return newSVhek(HeKEY_hek(temphe));
	7380	}
	7381	}
	7382	}
	7383	return NULL;
	7384	}
	7385
	7386	SV*
	7387	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	7388	{
	7389	SV *ret;
	7390	AV *av;
	7391	SSize_t length;
	7392	struct regexp *const rx = ReANY(r);
	7393
	7394	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	7395
	7396	if (rx && RXp_PAREN_NAMES(rx)) {
	7397	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	7398	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	7399	} else if (flags & RXapif_ONE) {
	7400	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	7401	av = MUTABLE_AV(SvRV(ret));
	7402	length = av_tindex(av);
	7403	SvREFCNT_dec_NN(ret);
	7404	return newSViv(length + 1);
	7405	} else {
	7406	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
	7407	(int)flags);
	7408	return NULL;
	7409	}
	7410	}
	7411	return &PL_sv_undef;
	7412	}
	7413
	7414	SV*
	7415	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	7416	{
	7417	struct regexp *const rx = ReANY(r);
	7418	AV *av = newAV();
	7419
	7420	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	7421
	7422	if (rx && RXp_PAREN_NAMES(rx)) {
	7423	HV *hv= RXp_PAREN_NAMES(rx);
	7424	HE *temphe;
	7425	(void)hv_iterinit(hv);
	7426	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	7427	IV i;
	7428	IV parno = 0;
	7429	SV* sv_dat = HeVAL(temphe);
	7430	I32 nums = (I32)SvPVX(sv_dat);
	7431	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	7432	if ((I32)(rx->lastparen) >= nums[i] &&
	7433	rx->offs[nums[i]].start != -1 &&
	7434	rx->offs[nums[i]].end != -1)
	7435	{
	7436	parno = nums[i];
	7437	break;
	7438	}
	7439	}
	7440	if (parno \|\| flags & RXapif_ALL) {
	7441	av_push(av, newSVhek(HeKEY_hek(temphe)));
	7442	}
	7443	}
	7444	}
	7445
	7446	return newRV_noinc(MUTABLE_SV(av));
	7447	}
	7448
	7449	void
	7450	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	7451	SV * const sv)
	7452	{
	7453	struct regexp *const rx = ReANY(r);
	7454	char *s = NULL;
	7455	SSize_t i = 0;
	7456	SSize_t s1, t1;
	7457	I32 n = paren;
	7458
	7459	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	7460
	7461	if ( n == RX_BUFF_IDX_CARET_PREMATCH
	7462	\|\| n == RX_BUFF_IDX_CARET_FULLMATCH
	7463	\|\| n == RX_BUFF_IDX_CARET_POSTMATCH
	7464	)
	7465	{
	7466	bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
	7467	if (!keepcopy) {
	7468	/* on something like
	7469	* $r = qr/.../;
	7470	* /$qr/p;
	7471	* the KEEPCOPY is set on the PMOP rather than the regex */
	7472	if (PL_curpm && r == PM_GETRE(PL_curpm))
	7473	keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
	7474	}
	7475	if (!keepcopy)
	7476	goto ret_undef;
	7477	}
	7478
	7479	if (!rx->subbeg)
	7480	goto ret_undef;
	7481
	7482	if (n == RX_BUFF_IDX_CARET_FULLMATCH)
	7483	/* no need to distinguish between them any more */
	7484	n = RX_BUFF_IDX_FULLMATCH;
	7485
	7486	if ((n == RX_BUFF_IDX_PREMATCH \|\| n == RX_BUFF_IDX_CARET_PREMATCH)
	7487	&& rx->offs[0].start != -1)
	7488	{
	7489	/* $`, ${^PREMATCH} */
	7490	i = rx->offs[0].start;
	7491	s = rx->subbeg;
	7492	}
	7493	else
	7494	if ((n == RX_BUFF_IDX_POSTMATCH \|\| n == RX_BUFF_IDX_CARET_POSTMATCH)
	7495	&& rx->offs[0].end != -1)
	7496	{
	7497	/* $', ${^POSTMATCH} */
	7498	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
	7499	i = rx->sublen + rx->suboffset - rx->offs[0].end;
	7500	}
	7501	else
	7502	if ( 0 <= n && n <= (I32)rx->nparens &&
	7503	(s1 = rx->offs[n].start) != -1 &&
	7504	(t1 = rx->offs[n].end) != -1)
	7505	{
	7506	/* $&, ${^MATCH}, $1 ... */
	7507	i = t1 - s1;
	7508	s = rx->subbeg + s1 - rx->suboffset;
	7509	} else {
	7510	goto ret_undef;
	7511	}
	7512
	7513	assert(s >= rx->subbeg);
	7514	assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
	7515	if (i >= 0) {
	7516	#ifdef NO_TAINT_SUPPORT
	7517	sv_setpvn(sv, s, i);
	7518	#else
	7519	const int oldtainted = TAINT_get;
	7520	TAINT_NOT;
	7521	sv_setpvn(sv, s, i);
	7522	TAINT_set(oldtainted);
	7523	#endif
	7524	if ( (rx->intflags & PREGf_CANY_SEEN)
	7525	? (RXp_MATCH_UTF8(rx)
	7526	&& (!i \|\| is_utf8_string((U8*)s, i)))
	7527	: (RXp_MATCH_UTF8(rx)) )
	7528	{
	7529	SvUTF8_on(sv);
	7530	}
	7531	else
	7532	SvUTF8_off(sv);
	7533	if (TAINTING_get) {
	7534	if (RXp_MATCH_TAINTED(rx)) {
	7535	if (SvTYPE(sv) >= SVt_PVMG) {
	7536	MAGIC* const mg = SvMAGIC(sv);
	7537	MAGIC* mgt;
	7538	TAINT;
	7539	SvMAGIC_set(sv, mg->mg_moremagic);
	7540	SvTAINT(sv);
	7541	if ((mgt = SvMAGIC(sv))) {
	7542	mg->mg_moremagic = mgt;
	7543	SvMAGIC_set(sv, mg);
	7544	}
	7545	} else {
	7546	TAINT;
	7547	SvTAINT(sv);
	7548	}
	7549	} else
	7550	SvTAINTED_off(sv);
	7551	}
	7552	} else {
	7553	ret_undef:
	7554	sv_setsv(sv,&PL_sv_undef);
	7555	return;
	7556	}
	7557	}
	7558
	7559	void
	7560	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	7561	SV const * const value)
	7562	{
	7563	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	7564
	7565	PERL_UNUSED_ARG(rx);
	7566	PERL_UNUSED_ARG(paren);
	7567	PERL_UNUSED_ARG(value);
	7568
	7569	if (!PL_localizing)
	7570	Perl_croak_no_modify();
	7571	}
	7572
	7573	I32
	7574	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	7575	const I32 paren)
	7576	{
	7577	struct regexp *const rx = ReANY(r);
	7578	I32 i;
	7579	I32 s1, t1;
	7580
	7581	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	7582
	7583	if ( paren == RX_BUFF_IDX_CARET_PREMATCH
	7584	\|\| paren == RX_BUFF_IDX_CARET_FULLMATCH
	7585	\|\| paren == RX_BUFF_IDX_CARET_POSTMATCH
	7586	)
	7587	{
	7588	bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
	7589	if (!keepcopy) {
	7590	/* on something like
	7591	* $r = qr/.../;
	7592	* /$qr/p;
	7593	* the KEEPCOPY is set on the PMOP rather than the regex */
	7594	if (PL_curpm && r == PM_GETRE(PL_curpm))
	7595	keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
	7596	}
	7597	if (!keepcopy)
	7598	goto warn_undef;
	7599	}
	7600
	7601	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	7602	switch (paren) {
	7603	case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
	7604	case RX_BUFF_IDX_PREMATCH: /* $` */
	7605	if (rx->offs[0].start != -1) {
	7606	i = rx->offs[0].start;
	7607	if (i > 0) {
	7608	s1 = 0;
	7609	t1 = i;
	7610	goto getlen;
	7611	}
	7612	}
	7613	return 0;
	7614
	7615	case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
	7616	case RX_BUFF_IDX_POSTMATCH: /* $' */
	7617	if (rx->offs[0].end != -1) {
	7618	i = rx->sublen - rx->offs[0].end;
	7619	if (i > 0) {
	7620	s1 = rx->offs[0].end;
	7621	t1 = rx->sublen;
	7622	goto getlen;
	7623	}
	7624	}
	7625	return 0;
	7626
	7627	default: /* $& / ${^MATCH}, $1, $2, ... */
	7628	if (paren <= (I32)rx->nparens &&
	7629	(s1 = rx->offs[paren].start) != -1 &&
	7630	(t1 = rx->offs[paren].end) != -1)
	7631	{
	7632	i = t1 - s1;
	7633	goto getlen;
	7634	} else {
	7635	warn_undef:
	7636	if (ckWARN(WARN_UNINITIALIZED))
	7637	report_uninit((const SV *)sv);
	7638	return 0;
	7639	}
	7640	}
	7641	getlen:
	7642	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	7643	const char * const s = rx->subbeg - rx->suboffset + s1;
	7644	const U8 *ep;
	7645	STRLEN el;
	7646
	7647	i = t1 - s1;
	7648	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	7649	i = el;
	7650	}
	7651	return i;
	7652	}
	7653
	7654	SV*
	7655	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	7656	{
	7657	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	7658	PERL_UNUSED_ARG(rx);
	7659	if (0)
	7660	return NULL;
	7661	else
	7662	return newSVpvs("Regexp");
	7663	}
	7664
	7665	/* Scans the name of a named buffer from the pattern.
	7666	* If flags is REG_RSN_RETURN_NULL returns null.
	7667	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	7668	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	7669	* to the parsed name as looked up in the RExC_paren_names hash.
	7670	* If there is an error throws a vFAIL().. type exception.
	7671	*/
	7672
	7673	#define REG_RSN_RETURN_NULL 0
	7674	#define REG_RSN_RETURN_NAME 1
	7675	#define REG_RSN_RETURN_DATA 2
	7676
	7677	STATIC SV*
	7678	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	7679	{
	7680	char *name_start = RExC_parse;
	7681
	7682	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	7683
	7684	assert (RExC_parse <= RExC_end);
	7685	if (RExC_parse == RExC_end) NOOP;
	7686	else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	7687	/* skip IDFIRST by using do...while */
	7688	if (UTF)
	7689	do {
	7690	RExC_parse += UTF8SKIP(RExC_parse);
	7691	} while (isWORDCHAR_utf8((U8*)RExC_parse));
	7692	else
	7693	do {
	7694	RExC_parse++;
	7695	} while (isWORDCHAR(*RExC_parse));
	7696	} else {
	7697	RExC_parse++; /* so the <- from the vFAIL is after the offending
	7698	character */
	7699	vFAIL("Group name must start with a non-digit word character");
	7700	}
	7701	if ( flags ) {
	7702	SV* sv_name
	7703	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	7704	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	7705	if ( flags == REG_RSN_RETURN_NAME)
	7706	return sv_name;
	7707	else if (flags==REG_RSN_RETURN_DATA) {
	7708	HE *he_str = NULL;
	7709	SV *sv_dat = NULL;
	7710	if ( ! sv_name ) /* should not happen*/
	7711	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	7712	if (RExC_paren_names)
	7713	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	7714	if ( he_str )
	7715	sv_dat = HeVAL(he_str);
	7716	if ( ! sv_dat )
	7717	vFAIL("Reference to nonexistent named group");
	7718	return sv_dat;
	7719	}
	7720	else {
	7721	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	7722	(unsigned long) flags);
	7723	}
	7724	assert(0); /* NOT REACHED */
	7725	}
	7726	return NULL;
	7727	}
	7728
	7729	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	7730	int rem=(int)(RExC_end - RExC_parse); \
	7731	int cut; \
	7732	int num; \
	7733	int iscut=0; \
	7734	if (rem>10) { \
	7735	rem=10; \
	7736	iscut=1; \
	7737	} \
	7738	cut=10-rem; \
	7739	if (RExC_lastparse!=RExC_parse) \
	7740	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	7741	rem, RExC_parse, \
	7742	cut + 4, \
	7743	iscut ? "..." : "<" \
	7744	); \
	7745	else \
	7746	PerlIO_printf(Perl_debug_log,"%16s",""); \
	7747	\
	7748	if (SIZE_ONLY) \
	7749	num = RExC_size + 1; \
	7750	else \
	7751	num=REG_NODE_NUM(RExC_emit); \
	7752	if (RExC_lastnum!=num) \
	7753	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	7754	else \
	7755	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	7756	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	7757	(int)((depth*2)), "", \
	7758	(funcname) \
	7759	); \
	7760	RExC_lastnum=num; \
	7761	RExC_lastparse=RExC_parse; \
	7762	})
	7763
	7764
	7765
	7766	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	7767	DEBUG_PARSE_MSG((funcname)); \
	7768	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	7769	})
	7770	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	7771	DEBUG_PARSE_MSG((funcname)); \
	7772	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	7773	})
	7774
	7775	/* This section of code defines the inversion list object and its methods. The
	7776	* interfaces are highly subject to change, so as much as possible is static to
	7777	* this file. An inversion list is here implemented as a malloc'd C UV array
	7778	* as an SVt_INVLIST scalar.
	7779	*
	7780	* An inversion list for Unicode is an array of code points, sorted by ordinal
	7781	* number. The zeroth element is the first code point in the list. The 1th
	7782	* element is the first element beyond that not in the list. In other words,
	7783	* the first range is
	7784	* invlist[0]..(invlist[1]-1)
	7785	* The other ranges follow. Thus every element whose index is divisible by two
	7786	* marks the beginning of a range that is in the list, and every element not
	7787	* divisible by two marks the beginning of a range not in the list. A single
	7788	* element inversion list that contains the single code point N generally
	7789	* consists of two elements
	7790	* invlist[0] == N
	7791	* invlist[1] == N+1
	7792	* (The exception is when N is the highest representable value on the
	7793	* machine, in which case the list containing just it would be a single
	7794	* element, itself. By extension, if the last range in the list extends to
	7795	* infinity, then the first element of that range will be in the inversion list
	7796	* at a position that is divisible by two, and is the final element in the
	7797	* list.)
	7798	* Taking the complement (inverting) an inversion list is quite simple, if the
	7799	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	7800	* This implementation reserves an element at the beginning of each inversion
	7801	* list to always contain 0; there is an additional flag in the header which
	7802	* indicates if the list begins at the 0, or is offset to begin at the next
	7803	* element.
	7804	*
	7805	* More about inversion lists can be found in "Unicode Demystified"
	7806	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	7807	* More will be coming when functionality is added later.
	7808	*
	7809	* The inversion list data structure is currently implemented as an SV pointing
	7810	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	7811	* array of UV whose memory management is automatically handled by the existing
	7812	* facilities for SV's.
	7813	*
	7814	* Some of the methods should always be private to the implementation, and some
	7815	* should eventually be made public */
	7816
	7817	/* The header definitions are in F<inline_invlist.c> */
	7818
	7819	PERL_STATIC_INLINE UV*
	7820	S__invlist_array_init(SV* const invlist, const bool will_have_0)
	7821	{
	7822	/* Returns a pointer to the first element in the inversion list's array.
	7823	* This is called upon initialization of an inversion list. Where the
	7824	* array begins depends on whether the list has the code point U+0000 in it
	7825	* or not. The other parameter tells it whether the code that follows this
	7826	* call is about to put a 0 in the inversion list or not. The first
	7827	* element is either the element reserved for 0, if TRUE, or the element
	7828	* after it, if FALSE */
	7829
	7830	bool* offset = get_invlist_offset_addr(invlist);
	7831	UV* zero_addr = (UV *) SvPVX(invlist);
	7832
	7833	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	7834
	7835	/* Must be empty */
	7836	assert(! _invlist_len(invlist));
	7837
	7838	*zero_addr = 0;
	7839
	7840	/* 1^1 = 0; 1^0 = 1 */
	7841	*offset = 1 ^ will_have_0;
	7842	return zero_addr + *offset;
	7843	}
	7844
	7845	PERL_STATIC_INLINE UV*
	7846	S_invlist_array(SV* const invlist)
	7847	{
	7848	/* Returns the pointer to the inversion list's array. Every time the
	7849	* length changes, this needs to be called in case malloc or realloc moved
	7850	* it */
	7851
	7852	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	7853
	7854	/* Must not be empty. If these fail, you probably didn't check for <len>
	7855	* being non-zero before trying to get the array */
	7856	assert(_invlist_len(invlist));
	7857
	7858	/* The very first element always contains zero, The array begins either
	7859	* there, or if the inversion list is offset, at the element after it.
	7860	* The offset header field determines which; it contains 0 or 1 to indicate
	7861	* how much additionally to add */
	7862	assert(0 == *(SvPVX(invlist)));
	7863	return ((UV ) SvPVX(invlist) + get_invlist_offset_addr(invlist));
	7864	}
	7865
	7866	PERL_STATIC_INLINE void
	7867	S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
	7868	{
	7869	/* Sets the current number of elements stored in the inversion list.
	7870	* Updates SvCUR correspondingly */
	7871	PERL_UNUSED_CONTEXT;
	7872	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	7873
	7874	assert(SvTYPE(invlist) == SVt_INVLIST);
	7875
	7876	SvCUR_set(invlist,
	7877	(len == 0)
	7878	? 0
	7879	: TO_INTERNAL_SIZE(len + offset));
	7880	assert(SvLEN(invlist) == 0 \|\| SvCUR(invlist) <= SvLEN(invlist));
	7881	}
	7882
	7883	PERL_STATIC_INLINE IV*
	7884	S_get_invlist_previous_index_addr(SV* invlist)
	7885	{
	7886	/* Return the address of the IV that is reserved to hold the cached index
	7887	* */
	7888	PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
	7889
	7890	assert(SvTYPE(invlist) == SVt_INVLIST);
	7891
	7892	return &(((XINVLIST*) SvANY(invlist))->prev_index);
	7893	}
	7894
	7895	PERL_STATIC_INLINE IV
	7896	S_invlist_previous_index(SV* const invlist)
	7897	{
	7898	/* Returns cached index of previous search */
	7899
	7900	PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
	7901
	7902	return *get_invlist_previous_index_addr(invlist);
	7903	}
	7904
	7905	PERL_STATIC_INLINE void
	7906	S_invlist_set_previous_index(SV* const invlist, const IV index)
	7907	{
	7908	/* Caches <index> for later retrieval */
	7909
	7910	PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
	7911
	7912	assert(index == 0 \|\| index < (int) _invlist_len(invlist));
	7913
	7914	*get_invlist_previous_index_addr(invlist) = index;
	7915	}
	7916
	7917	PERL_STATIC_INLINE UV
	7918	S_invlist_max(SV* const invlist)
	7919	{
	7920	/* Returns the maximum number of elements storable in the inversion list's
	7921	* array, without having to realloc() */
	7922
	7923	PERL_ARGS_ASSERT_INVLIST_MAX;
	7924
	7925	assert(SvTYPE(invlist) == SVt_INVLIST);
	7926
	7927	/* Assumes worst case, in which the 0 element is not counted in the
	7928	* inversion list, so subtracts 1 for that */
	7929	return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
	7930	? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
	7931	: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
	7932	}
	7933
	7934	#ifndef PERL_IN_XSUB_RE
	7935	SV*
	7936	Perl__new_invlist(pTHX_ IV initial_size)
	7937	{
	7938
	7939	/* Return a pointer to a newly constructed inversion list, with enough
	7940	* space to store 'initial_size' elements. If that number is negative, a
	7941	* system default is used instead */
	7942
	7943	SV* new_list;
	7944
	7945	if (initial_size < 0) {
	7946	initial_size = 10;
	7947	}
	7948
	7949	/* Allocate the initial space */
	7950	new_list = newSV_type(SVt_INVLIST);
	7951
	7952	/* First 1 is in case the zero element isn't in the list; second 1 is for
	7953	* trailing NUL */
	7954	SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
	7955	invlist_set_len(new_list, 0, 0);
	7956
	7957	/* Force iterinit() to be used to get iteration to work */
	7958	*get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
	7959
	7960	*get_invlist_previous_index_addr(new_list) = 0;
	7961
	7962	return new_list;
	7963	}
	7964
	7965	SV*
	7966	Perl__new_invlist_C_array(pTHX_ const UV* const list)
	7967	{
	7968	/* Return a pointer to a newly constructed inversion list, initialized to
	7969	* point to <list>, which has to be in the exact correct inversion list
	7970	* form, including internal fields. Thus this is a dangerous routine that
	7971	* should not be used in the wrong hands. The passed in 'list' contains
	7972	* several header fields at the beginning that are not part of the
	7973	* inversion list body proper */
	7974
	7975	const STRLEN length = (STRLEN) list[0];
	7976	const UV version_id = list[1];
	7977	const bool offset = cBOOL(list[2]);
	7978	#define HEADER_LENGTH 3
	7979	/* If any of the above changes in any way, you must change HEADER_LENGTH
	7980	* (if appropriate) and regenerate INVLIST_VERSION_ID by running
	7981	* perl -E 'say int(rand 2**31-1)'
	7982	*/
	7983	#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
	7984	data structure type, so that one being
	7985	passed in can be validated to be an
	7986	inversion list of the correct vintage.
	7987	*/
	7988
	7989	SV* invlist = newSV_type(SVt_INVLIST);
	7990
	7991	PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
	7992
	7993	if (version_id != INVLIST_VERSION_ID) {
	7994	Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
	7995	}
	7996
	7997	/* The generated array passed in includes header elements that aren't part
	7998	* of the list proper, so start it just after them */
	7999	SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
	8000
	8001	SvLEN_set(invlist, 0); /* Means we own the contents, and the system
	8002	shouldn't touch it */
	8003
	8004	*(get_invlist_offset_addr(invlist)) = offset;
	8005
	8006	/* The 'length' passed to us is the physical number of elements in the
	8007	* inversion list. But if there is an offset the logical number is one
	8008	* less than that */
	8009	invlist_set_len(invlist, length - offset, offset);
	8010
	8011	invlist_set_previous_index(invlist, 0);
	8012
	8013	/* Initialize the iteration pointer. */
	8014	invlist_iterfinish(invlist);
	8015
	8016	SvREADONLY_on(invlist);
	8017
	8018	return invlist;
	8019	}
	8020	#endif /* ifndef PERL_IN_XSUB_RE */
	8021
	8022	STATIC void
	8023	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	8024	{
	8025	/* Grow the maximum size of an inversion list */
	8026
	8027	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	8028
	8029	assert(SvTYPE(invlist) == SVt_INVLIST);
	8030
	8031	/* Add one to account for the zero element at the beginning which may not
	8032	* be counted by the calling parameters */
	8033	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
	8034	}
	8035
	8036	PERL_STATIC_INLINE void
	8037	S_invlist_trim(SV* const invlist)
	8038	{
	8039	PERL_ARGS_ASSERT_INVLIST_TRIM;
	8040
	8041	assert(SvTYPE(invlist) == SVt_INVLIST);
	8042
	8043	/* Change the length of the inversion list to how many entries it currently
	8044	* has */
	8045	SvPV_shrink_to_cur((SV *) invlist);
	8046	}
	8047
	8048	STATIC void
	8049	S__append_range_to_invlist(pTHX_ SV* const invlist,
	8050	const UV start, const UV end)
	8051	{
	8052	/* Subject to change or removal. Append the range from 'start' to 'end' at
	8053	* the end of the inversion list. The range must be above any existing
	8054	* ones. */
	8055
	8056	UV* array;
	8057	UV max = invlist_max(invlist);
	8058	UV len = _invlist_len(invlist);
	8059	bool offset;
	8060
	8061	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	8062
	8063	if (len == 0) { /* Empty lists must be initialized */
	8064	offset = start != 0;
	8065	array = _invlist_array_init(invlist, ! offset);
	8066	}
	8067	else {
	8068	/* Here, the existing list is non-empty. The current max entry in the
	8069	* list is generally the first value not in the set, except when the
	8070	* set extends to the end of permissible values, in which case it is
	8071	* the first entry in that final set, and so this call is an attempt to
	8072	* append out-of-order */
	8073
	8074	UV final_element = len - 1;
	8075	array = invlist_array(invlist);
	8076	if (array[final_element] > start
	8077	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	8078	{
	8079	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	8080	array[final_element], start,
	8081	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	8082	}
	8083
	8084	/* Here, it is a legal append. If the new range begins with the first
	8085	* value not in the set, it is extending the set, so the new first
	8086	* value not in the set is one greater than the newly extended range.
	8087	* */
	8088	offset = *get_invlist_offset_addr(invlist);
	8089	if (array[final_element] == start) {
	8090	if (end != UV_MAX) {
	8091	array[final_element] = end + 1;
	8092	}
	8093	else {
	8094	/* But if the end is the maximum representable on the machine,
	8095	* just let the range that this would extend to have no end */
	8096	invlist_set_len(invlist, len - 1, offset);
	8097	}
	8098	return;
	8099	}
	8100	}
	8101
	8102	/* Here the new range doesn't extend any existing set. Add it */
	8103
	8104	len += 2; /* Includes an element each for the start and end of range */
	8105
	8106	/* If wll overflow the existing space, extend, which may cause the array to
	8107	* be moved */
	8108	if (max < len) {
	8109	invlist_extend(invlist, len);
	8110
	8111	/* Have to set len here to avoid assert failure in invlist_array() */
	8112	invlist_set_len(invlist, len, offset);
	8113
	8114	array = invlist_array(invlist);
	8115	}
	8116	else {
	8117	invlist_set_len(invlist, len, offset);
	8118	}
	8119
	8120	/* The next item on the list starts the range, the one after that is
	8121	* one past the new range. */
	8122	array[len - 2] = start;
	8123	if (end != UV_MAX) {
	8124	array[len - 1] = end + 1;
	8125	}
	8126	else {
	8127	/* But if the end is the maximum representable on the machine, just let
	8128	* the range have no end */
	8129	invlist_set_len(invlist, len - 1, offset);
	8130	}
	8131	}
	8132
	8133	#ifndef PERL_IN_XSUB_RE
	8134
	8135	IV
	8136	Perl__invlist_search(SV* const invlist, const UV cp)
	8137	{
	8138	/* Searches the inversion list for the entry that contains the input code
	8139	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	8140	* return value is the index into the list's array of the range that
	8141	* contains <cp> */
	8142
	8143	IV low = 0;
	8144	IV mid;
	8145	IV high = _invlist_len(invlist);
	8146	const IV highest_element = high - 1;
	8147	const UV* array;
	8148
	8149	PERL_ARGS_ASSERT__INVLIST_SEARCH;
	8150
	8151	/* If list is empty, return failure. */
	8152	if (high == 0) {
	8153	return -1;
	8154	}
	8155
	8156	/* (We can't get the array unless we know the list is non-empty) */
	8157	array = invlist_array(invlist);
	8158
	8159	mid = invlist_previous_index(invlist);
	8160	assert(mid >=0 && mid <= highest_element);
	8161
	8162	/* <mid> contains the cache of the result of the previous call to this
	8163	* function (0 the first time). See if this call is for the same result,
	8164	* or if it is for mid-1. This is under the theory that calls to this
	8165	* function will often be for related code points that are near each other.
	8166	* And benchmarks show that caching gives better results. We also test
	8167	* here if the code point is within the bounds of the list. These tests
	8168	* replace others that would have had to be made anyway to make sure that
	8169	* the array bounds were not exceeded, and these give us extra information
	8170	* at the same time */
	8171	if (cp >= array[mid]) {
	8172	if (cp >= array[highest_element]) {
	8173	return highest_element;
	8174	}
	8175
	8176	/* Here, array[mid] <= cp < array[highest_element]. This means that
	8177	* the final element is not the answer, so can exclude it; it also
	8178	* means that <mid> is not the final element, so can refer to 'mid + 1'
	8179	* safely */
	8180	if (cp < array[mid + 1]) {
	8181	return mid;
	8182	}
	8183	high--;
	8184	low = mid + 1;
	8185	}
	8186	else { /* cp < aray[mid] */
	8187	if (cp < array[0]) { /* Fail if outside the array */
	8188	return -1;
	8189	}
	8190	high = mid;
	8191	if (cp >= array[mid - 1]) {
	8192	goto found_entry;
	8193	}
	8194	}
	8195
	8196	/* Binary search. What we are looking for is <i> such that
	8197	* array[i] <= cp < array[i+1]
	8198	* The loop below converges on the i+1. Note that there may not be an
	8199	* (i+1)th element in the array, and things work nonetheless */
	8200	while (low < high) {
	8201	mid = (low + high) / 2;
	8202	assert(mid <= highest_element);
	8203	if (array[mid] <= cp) { /* cp >= array[mid] */
	8204	low = mid + 1;
	8205
	8206	/* We could do this extra test to exit the loop early.
	8207	if (cp < array[low]) {
	8208	return mid;
	8209	}
	8210	*/
	8211	}
	8212	else { /* cp < array[mid] */
	8213	high = mid;
	8214	}
	8215	}
	8216
	8217	found_entry:
	8218	high--;
	8219	invlist_set_previous_index(invlist, high);
	8220	return high;
	8221	}
	8222
	8223	void
	8224	Perl__invlist_populate_swatch(SV* const invlist,
	8225	const UV start, const UV end, U8* swatch)
	8226	{
	8227	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	8228	* but is used when the swash has an inversion list. This makes this much
	8229	* faster, as it uses a binary search instead of a linear one. This is
	8230	* intimately tied to that function, and perhaps should be in utf8.c,
	8231	* except it is intimately tied to inversion lists as well. It assumes
	8232	* that <swatch> is all 0's on input */
	8233
	8234	UV current = start;
	8235	const IV len = _invlist_len(invlist);
	8236	IV i;
	8237	const UV * array;
	8238
	8239	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	8240
	8241	if (len == 0) { /* Empty inversion list */
	8242	return;
	8243	}
	8244
	8245	array = invlist_array(invlist);
	8246
	8247	/* Find which element it is */
	8248	i = _invlist_search(invlist, start);
	8249
	8250	/* We populate from <start> to <end> */
	8251	while (current < end) {
	8252	UV upper;
	8253
	8254	/* The inversion list gives the results for every possible code point
	8255	* after the first one in the list. Only those ranges whose index is
	8256	* even are ones that the inversion list matches. For the odd ones,
	8257	* and if the initial code point is not in the list, we have to skip
	8258	* forward to the next element */
	8259	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	8260	i++;
	8261	if (i >= len) { /* Finished if beyond the end of the array */
	8262	return;
	8263	}
	8264	current = array[i];
	8265	if (current >= end) { /* Finished if beyond the end of what we
	8266	are populating */
	8267	if (LIKELY(end < UV_MAX)) {
	8268	return;
	8269	}
	8270
	8271	/* We get here when the upper bound is the maximum
	8272	* representable on the machine, and we are looking for just
	8273	* that code point. Have to special case it */
	8274	i = len;
	8275	goto join_end_of_list;
	8276	}
	8277	}
	8278	assert(current >= start);
	8279
	8280	/* The current range ends one below the next one, except don't go past
	8281	* <end> */
	8282	i++;
	8283	upper = (i < len && array[i] < end) ? array[i] : end;
	8284
	8285	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	8286	* for each code point in it */
	8287	for (; current < upper; current++) {
	8288	const STRLEN offset = (STRLEN)(current - start);
	8289	swatch[offset >> 3] \|= 1 << (offset & 7);
	8290	}
	8291
	8292	join_end_of_list:
	8293
	8294	/* Quit if at the end of the list */
	8295	if (i >= len) {
	8296
	8297	/* But first, have to deal with the highest possible code point on
	8298	* the platform. The previous code assumes that <end> is one
	8299	* beyond where we want to populate, but that is impossible at the
	8300	* platform's infinity, so have to handle it specially */
	8301	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	8302	{
	8303	const STRLEN offset = (STRLEN)(end - start);
	8304	swatch[offset >> 3] \|= 1 << (offset & 7);
	8305	}
	8306	return;
	8307	}
	8308
	8309	/* Advance to the next range, which will be for code points not in the
	8310	* inversion list */
	8311	current = array[i];
	8312	}
	8313
	8314	return;
	8315	}
	8316
	8317	void
	8318	Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
	8319	const bool complement_b, SV** output)
	8320	{
	8321	/* Take the union of two inversion lists and point <output> to it. *output
	8322	* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
	8323	* the reference count to that list will be decremented if not already a
	8324	* temporary (mortal); otherwise *output will be made correspondingly
	8325	* mortal. The first list, <a>, may be NULL, in which case a copy of the
	8326	* second list is returned. If <complement_b> is TRUE, the union is taken
	8327	* of the complement (inversion) of <b> instead of b itself.
	8328	*
	8329	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	8330	* Richard Gillam, published by Addison-Wesley, and explained at some
	8331	* length there. The preface says to incorporate its examples into your
	8332	* code at your own risk.
	8333	*
	8334	* The algorithm is like a merge sort.
	8335	*
	8336	* XXX A potential performance improvement is to keep track as we go along
	8337	* if only one of the inputs contributes to the result, meaning the other
	8338	* is a subset of that one. In that case, we can skip the final copy and
	8339	* return the larger of the input lists, but then outside code might need
	8340	* to keep track of whether to free the input list or not */
	8341
	8342	const UV* array_a; /* a's array */
	8343	const UV* array_b;
	8344	UV len_a; /* length of a's array */
	8345	UV len_b;
	8346
	8347	SV* u; /* the resulting union */
	8348	UV* array_u;
	8349	UV len_u;
	8350
	8351	UV i_a = 0; /* current index into a's array */
	8352	UV i_b = 0;
	8353	UV i_u = 0;
	8354
	8355	/* running count, as explained in the algorithm source book; items are
	8356	* stopped accumulating and are output when the count changes to/from 0.
	8357	* The count is incremented when we start a range that's in the set, and
	8358	* decremented when we start a range that's not in the set. So its range
	8359	* is 0 to 2. Only when the count is zero is something not in the set.
	8360	*/
	8361	UV count = 0;
	8362
	8363	PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
	8364	assert(a != b);
	8365
	8366	/* If either one is empty, the union is the other one */
	8367	if (a == NULL \|\| ((len_a = _invlist_len(a)) == 0)) {
	8368	bool make_temp = FALSE; /* Should we mortalize the result? */
	8369
	8370	if (*output == a) {
	8371	if (a != NULL) {
	8372	if (! (make_temp = cBOOL(SvTEMP(a)))) {
	8373	SvREFCNT_dec_NN(a);
	8374	}
	8375	}
	8376	}
	8377	if (*output != b) {
	8378	*output = invlist_clone(b);
	8379	if (complement_b) {
	8380	_invlist_invert(*output);
	8381	}
	8382	} /* else output already = b; /
	8383
	8384	if (make_temp) {
	8385	sv_2mortal(*output);
	8386	}
	8387	return;
	8388	}
	8389	else if ((len_b = _invlist_len(b)) == 0) {
	8390	bool make_temp = FALSE;
	8391	if (*output == b) {
	8392	if (! (make_temp = cBOOL(SvTEMP(b)))) {
	8393	SvREFCNT_dec_NN(b);
	8394	}
	8395	}
	8396
	8397	/* The complement of an empty list is a list that has everything in it,
	8398	* so the union with <a> includes everything too */
	8399	if (complement_b) {
	8400	if (a == *output) {
	8401	if (! (make_temp = cBOOL(SvTEMP(a)))) {
	8402	SvREFCNT_dec_NN(a);
	8403	}
	8404	}
	8405	*output = _new_invlist(1);
	8406	_append_range_to_invlist(*output, 0, UV_MAX);
	8407	}
	8408	else if (*output != a) {
	8409	*output = invlist_clone(a);
	8410	}
	8411	/* else output already = a; /
	8412
	8413	if (make_temp) {
	8414	sv_2mortal(*output);
	8415	}
	8416	return;
	8417	}
	8418
	8419	/* Here both lists exist and are non-empty */
	8420	array_a = invlist_array(a);
	8421	array_b = invlist_array(b);
	8422
	8423	/* If are to take the union of 'a' with the complement of b, set it
	8424	* up so are looking at b's complement. */
	8425	if (complement_b) {
	8426
	8427	/* To complement, we invert: if the first element is 0, remove it. To
	8428	* do this, we just pretend the array starts one later */
	8429	if (array_b[0] == 0) {
	8430	array_b++;
	8431	len_b--;
	8432	}
	8433	else {
	8434
	8435	/* But if the first element is not zero, we pretend the list starts
	8436	* at the 0 that is always stored immediately before the array. */
	8437	array_b--;
	8438	len_b++;
	8439	}
	8440	}
	8441
	8442	/* Size the union for the worst case: that the sets are completely
	8443	* disjoint */
	8444	u = _new_invlist(len_a + len_b);
	8445
	8446	/* Will contain U+0000 if either component does */
	8447	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	8448	\|\| (len_b > 0 && array_b[0] == 0));
	8449
	8450	/* Go through each list item by item, stopping when exhausted one of
	8451	* them */
	8452	while (i_a < len_a && i_b < len_b) {
	8453	UV cp; /* The element to potentially add to the union's array */
	8454	bool cp_in_set; /* is it in the the input list's set or not */
	8455
	8456	/* We need to take one or the other of the two inputs for the union.
	8457	* Since we are merging two sorted lists, we take the smaller of the
	8458	* next items. In case of a tie, we take the one that is in its set
	8459	* first. If we took one not in the set first, it would decrement the
	8460	* count, possibly to 0 which would cause it to be output as ending the
	8461	* range, and the next time through we would take the same number, and
	8462	* output it again as beginning the next range. By doing it the
	8463	* opposite way, there is no possibility that the count will be
	8464	* momentarily decremented to 0, and thus the two adjoining ranges will
	8465	* be seamlessly merged. (In a tie and both are in the set or both not
	8466	* in the set, it doesn't matter which we take first.) */
	8467	if (array_a[i_a] < array_b[i_b]
	8468	\|\| (array_a[i_a] == array_b[i_b]
	8469	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	8470	{
	8471	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	8472	cp= array_a[i_a++];
	8473	}
	8474	else {
	8475	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	8476	cp = array_b[i_b++];
	8477	}
	8478
	8479	/* Here, have chosen which of the two inputs to look at. Only output
	8480	* if the running count changes to/from 0, which marks the
	8481	* beginning/end of a range in that's in the set */
	8482	if (cp_in_set) {
	8483	if (count == 0) {
	8484	array_u[i_u++] = cp;
	8485	}
	8486	count++;
	8487	}
	8488	else {
	8489	count--;
	8490	if (count == 0) {
	8491	array_u[i_u++] = cp;
	8492	}
	8493	}
	8494	}
	8495
	8496	/* Here, we are finished going through at least one of the lists, which
	8497	* means there is something remaining in at most one. We check if the list
	8498	* that hasn't been exhausted is positioned such that we are in the middle
	8499	* of a range in its set or not. (i_a and i_b point to the element beyond
	8500	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	8501	* is potentially more to output.
	8502	* There are four cases:
	8503	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	8504	* in the union is entirely from the non-exhausted set.
	8505	* 2) Both were in their sets, count is 2. Nothing further should
	8506	* be output, as everything that remains will be in the exhausted
	8507	* list's set, hence in the union; decrementing to 1 but not 0 insures
	8508	* that
	8509	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	8510	* Nothing further should be output because the union includes
	8511	* everything from the exhausted set. Not decrementing ensures that.
	8512	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	8513	* decrementing to 0 insures that we look at the remainder of the
	8514	* non-exhausted set */
	8515	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	8516	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	8517	{
	8518	count--;
	8519	}
	8520
	8521	/* The final length is what we've output so far, plus what else is about to
	8522	* be output. (If 'count' is non-zero, then the input list we exhausted
	8523	* has everything remaining up to the machine's limit in its set, and hence
	8524	* in the union, so there will be no further output. */
	8525	len_u = i_u;
	8526	if (count == 0) {
	8527	/* At most one of the subexpressions will be non-zero */
	8528	len_u += (len_a - i_a) + (len_b - i_b);
	8529	}
	8530
	8531	/* Set result to final length, which can change the pointer to array_u, so
	8532	* re-find it */
	8533	if (len_u != _invlist_len(u)) {
	8534	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
	8535	invlist_trim(u);
	8536	array_u = invlist_array(u);
	8537	}
	8538
	8539	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	8540	* the other) ended with everything above it not in its set. That means
	8541	* that the remaining part of the union is precisely the same as the
	8542	* non-exhausted list, so can just copy it unchanged. (If both list were
	8543	* exhausted at the same time, then the operations below will be both 0.)
	8544	*/
	8545	if (count == 0) {
	8546	IV copy_count; /* At most one will have a non-zero copy count */
	8547	if ((copy_count = len_a - i_a) > 0) {
	8548	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	8549	}
	8550	else if ((copy_count = len_b - i_b) > 0) {
	8551	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	8552	}
	8553	}
	8554
	8555	/* We may be removing a reference to one of the inputs. If so, the output
	8556	* is made mortal if the input was. (Mortal SVs shouldn't have their ref
	8557	* count decremented) */
	8558	if (a == output \|\| b == output) {
	8559	assert(! invlist_is_iterating(*output));
	8560	if ((SvTEMP(*output))) {
	8561	sv_2mortal(u);
	8562	}
	8563	else {
	8564	SvREFCNT_dec_NN(*output);
	8565	}
	8566	}
	8567
	8568	*output = u;
	8569
	8570	return;
	8571	}
	8572
	8573	void
	8574	Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
	8575	const bool complement_b, SV** i)
	8576	{
	8577	/* Take the intersection of two inversion lists and point <i> to it. *i
	8578	* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
	8579	* the reference count to that list will be decremented if not already a
	8580	* temporary (mortal); otherwise *i will be made correspondingly mortal.
	8581	* The first list, <a>, may be NULL, in which case an empty list is
	8582	* returned. If <complement_b> is TRUE, the result will be the
	8583	* intersection of <a> and the complement (or inversion) of <b> instead of
	8584	* <b> directly.
	8585	*
	8586	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	8587	* Richard Gillam, published by Addison-Wesley, and explained at some
	8588	* length there. The preface says to incorporate its examples into your
	8589	* code at your own risk. In fact, it had bugs
	8590	*
	8591	* The algorithm is like a merge sort, and is essentially the same as the
	8592	* union above
	8593	*/
	8594
	8595	const UV* array_a; /* a's array */
	8596	const UV* array_b;
	8597	UV len_a; /* length of a's array */
	8598	UV len_b;
	8599
	8600	SV* r; /* the resulting intersection */
	8601	UV* array_r;
	8602	UV len_r;
	8603
	8604	UV i_a = 0; /* current index into a's array */
	8605	UV i_b = 0;
	8606	UV i_r = 0;
	8607
	8608	/* running count, as explained in the algorithm source book; items are
	8609	* stopped accumulating and are output when the count changes to/from 2.
	8610	* The count is incremented when we start a range that's in the set, and
	8611	* decremented when we start a range that's not in the set. So its range
	8612	* is 0 to 2. Only when the count is 2 is something in the intersection.
	8613	*/
	8614	UV count = 0;
	8615
	8616	PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
	8617	assert(a != b);
	8618
	8619	/* Special case if either one is empty */
	8620	len_a = (a == NULL) ? 0 : _invlist_len(a);
	8621	if ((len_a == 0) \|\| ((len_b = _invlist_len(b)) == 0)) {
	8622	bool make_temp = FALSE;
	8623
	8624	if (len_a != 0 && complement_b) {
	8625
	8626	/* Here, 'a' is not empty, therefore from the above 'if', 'b' must
	8627	* be empty. Here, also we are using 'b's complement, which hence
	8628	* must be every possible code point. Thus the intersection is
	8629	* simply 'a'. */
	8630	if (*i != a) {
	8631	if (*i == b) {
	8632	if (! (make_temp = cBOOL(SvTEMP(b)))) {
	8633	SvREFCNT_dec_NN(b);
	8634	}
	8635	}
	8636
	8637	*i = invlist_clone(a);
	8638	}
	8639	/* else i is already 'a' /
	8640
	8641	if (make_temp) {
	8642	sv_2mortal(*i);
	8643	}
	8644	return;
	8645	}
	8646
	8647	/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
	8648	* intersection must be empty */
	8649	if (*i == a) {
	8650	if (! (make_temp = cBOOL(SvTEMP(a)))) {
	8651	SvREFCNT_dec_NN(a);
	8652	}
	8653	}
	8654	else if (*i == b) {
	8655	if (! (make_temp = cBOOL(SvTEMP(b)))) {
	8656	SvREFCNT_dec_NN(b);
	8657	}
	8658	}
	8659	*i = _new_invlist(0);
	8660	if (make_temp) {
	8661	sv_2mortal(*i);
	8662	}
	8663
	8664	return;
	8665	}
	8666
	8667	/* Here both lists exist and are non-empty */
	8668	array_a = invlist_array(a);
	8669	array_b = invlist_array(b);
	8670
	8671	/* If are to take the intersection of 'a' with the complement of b, set it
	8672	* up so are looking at b's complement. */
	8673	if (complement_b) {
	8674
	8675	/* To complement, we invert: if the first element is 0, remove it. To
	8676	* do this, we just pretend the array starts one later */
	8677	if (array_b[0] == 0) {
	8678	array_b++;
	8679	len_b--;
	8680	}
	8681	else {
	8682
	8683	/* But if the first element is not zero, we pretend the list starts
	8684	* at the 0 that is always stored immediately before the array. */
	8685	array_b--;
	8686	len_b++;
	8687	}
	8688	}
	8689
	8690	/* Size the intersection for the worst case: that the intersection ends up
	8691	* fragmenting everything to be completely disjoint */
	8692	r= _new_invlist(len_a + len_b);
	8693
	8694	/* Will contain U+0000 iff both components do */
	8695	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	8696	&& len_b > 0 && array_b[0] == 0);
	8697
	8698	/* Go through each list item by item, stopping when exhausted one of
	8699	* them */
	8700	while (i_a < len_a && i_b < len_b) {
	8701	UV cp; /* The element to potentially add to the intersection's
	8702	array */
	8703	bool cp_in_set; /* Is it in the input list's set or not */
	8704
	8705	/* We need to take one or the other of the two inputs for the
	8706	* intersection. Since we are merging two sorted lists, we take the
	8707	* smaller of the next items. In case of a tie, we take the one that
	8708	* is not in its set first (a difference from the union algorithm). If
	8709	* we took one in the set first, it would increment the count, possibly
	8710	* to 2 which would cause it to be output as starting a range in the
	8711	* intersection, and the next time through we would take that same
	8712	* number, and output it again as ending the set. By doing it the
	8713	* opposite of this, there is no possibility that the count will be
	8714	* momentarily incremented to 2. (In a tie and both are in the set or
	8715	* both not in the set, it doesn't matter which we take first.) */
	8716	if (array_a[i_a] < array_b[i_b]
	8717	\|\| (array_a[i_a] == array_b[i_b]
	8718	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	8719	{
	8720	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	8721	cp= array_a[i_a++];
	8722	}
	8723	else {
	8724	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	8725	cp= array_b[i_b++];
	8726	}
	8727
	8728	/* Here, have chosen which of the two inputs to look at. Only output
	8729	* if the running count changes to/from 2, which marks the
	8730	* beginning/end of a range that's in the intersection */
	8731	if (cp_in_set) {
	8732	count++;
	8733	if (count == 2) {
	8734	array_r[i_r++] = cp;
	8735	}
	8736	}
	8737	else {
	8738	if (count == 2) {
	8739	array_r[i_r++] = cp;
	8740	}
	8741	count--;
	8742	}
	8743	}
	8744
	8745	/* Here, we are finished going through at least one of the lists, which
	8746	* means there is something remaining in at most one. We check if the list
	8747	* that has been exhausted is positioned such that we are in the middle
	8748	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	8749	* the ones we care about.) There are four cases:
	8750	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	8751	* nothing left in the intersection.
	8752	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	8753	* above 2. What should be output is exactly that which is in the
	8754	* non-exhausted set, as everything it has is also in the intersection
	8755	* set, and everything it doesn't have can't be in the intersection
	8756	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	8757	* gets incremented to 2. Like the previous case, the intersection is
	8758	* everything that remains in the non-exhausted set.
	8759	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	8760	* remains 1. And the intersection has nothing more. */
	8761	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	8762	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	8763	{
	8764	count++;
	8765	}
	8766
	8767	/* The final length is what we've output so far plus what else is in the
	8768	* intersection. At most one of the subexpressions below will be non-zero
	8769	* */
	8770	len_r = i_r;
	8771	if (count >= 2) {
	8772	len_r += (len_a - i_a) + (len_b - i_b);
	8773	}
	8774
	8775	/* Set result to final length, which can change the pointer to array_r, so
	8776	* re-find it */
	8777	if (len_r != _invlist_len(r)) {
	8778	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
	8779	invlist_trim(r);
	8780	array_r = invlist_array(r);
	8781	}
	8782
	8783	/* Finish outputting any remaining */
	8784	if (count >= 2) { /* At most one will have a non-zero copy count */
	8785	IV copy_count;
	8786	if ((copy_count = len_a - i_a) > 0) {
	8787	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	8788	}
	8789	else if ((copy_count = len_b - i_b) > 0) {
	8790	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	8791	}
	8792	}
	8793
	8794	/* We may be removing a reference to one of the inputs. If so, the output
	8795	* is made mortal if the input was. (Mortal SVs shouldn't have their ref
	8796	* count decremented) */
	8797	if (a == i \|\| b == i) {
	8798	assert(! invlist_is_iterating(*i));
	8799	if (SvTEMP(*i)) {
	8800	sv_2mortal(r);
	8801	}
	8802	else {
	8803	SvREFCNT_dec_NN(*i);
	8804	}
	8805	}
	8806
	8807	*i = r;
	8808
	8809	return;
	8810	}
	8811
	8812	SV*
	8813	Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	8814	{
	8815	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	8816	* set. A pointer to the inversion list is returned. This may actually be
	8817	* a new list, in which case the passed in one has been destroyed. The
	8818	* passed in inversion list can be NULL, in which case a new one is created
	8819	* with just the one range in it */
	8820
	8821	SV* range_invlist;
	8822	UV len;
	8823
	8824	if (invlist == NULL) {
	8825	invlist = _new_invlist(2);
	8826	len = 0;
	8827	}
	8828	else {
	8829	len = _invlist_len(invlist);
	8830	}
	8831
	8832	/* If comes after the final entry actually in the list, can just append it
	8833	* to the end, */
	8834	if (len == 0
	8835	\|\| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
	8836	&& start >= invlist_array(invlist)[len - 1]))
	8837	{
	8838	_append_range_to_invlist(invlist, start, end);
	8839	return invlist;
	8840	}
	8841
	8842	/* Here, can't just append things, create and return a new inversion list
	8843	* which is the union of this range and the existing inversion list */
	8844	range_invlist = _new_invlist(2);
	8845	_append_range_to_invlist(range_invlist, start, end);
	8846
	8847	_invlist_union(invlist, range_invlist, &invlist);
	8848
	8849	/* The temporary can be freed */
	8850	SvREFCNT_dec_NN(range_invlist);
	8851
	8852	return invlist;
	8853	}
	8854
	8855	SV*
	8856	Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
	8857	UV** other_elements_ptr)
	8858	{
	8859	/* Create and return an inversion list whose contents are to be populated
	8860	* by the caller. The caller gives the number of elements (in 'size') and
	8861	* the very first element ('element0'). This function will set
	8862	* '*other_elements_ptr' to an array of UVs, where the remaining elements
	8863	* are to be placed.
	8864	*
	8865	* Obviously there is some trust involved that the caller will properly
	8866	* fill in the other elements of the array.
	8867	*
	8868	* (The first element needs to be passed in, as the underlying code does
	8869	* things differently depending on whether it is zero or non-zero) */
	8870
	8871	SV* invlist = _new_invlist(size);
	8872	bool offset;
	8873
	8874	PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
	8875
	8876	_append_range_to_invlist(invlist, element0, element0);
	8877	offset = *get_invlist_offset_addr(invlist);
	8878
	8879	invlist_set_len(invlist, size, offset);
	8880	*other_elements_ptr = invlist_array(invlist) + 1;
	8881	return invlist;
	8882	}
	8883
	8884	#endif
	8885
	8886	PERL_STATIC_INLINE SV*
	8887	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	8888	return _add_range_to_invlist(invlist, cp, cp);
	8889	}
	8890
	8891	#ifndef PERL_IN_XSUB_RE
	8892	void
	8893	Perl__invlist_invert(pTHX_ SV* const invlist)
	8894	{
	8895	/* Complement the input inversion list. This adds a 0 if the list didn't
	8896	* have a zero; removes it otherwise. As described above, the data
	8897	* structure is set up so that this is very efficient */
	8898
	8899	PERL_ARGS_ASSERT__INVLIST_INVERT;
	8900
	8901	assert(! invlist_is_iterating(invlist));
	8902
	8903	/* The inverse of matching nothing is matching everything */
	8904	if (_invlist_len(invlist) == 0) {
	8905	_append_range_to_invlist(invlist, 0, UV_MAX);
	8906	return;
	8907	}
	8908
	8909	get_invlist_offset_addr(invlist) = ! get_invlist_offset_addr(invlist);
	8910	}
	8911
	8912	#endif
	8913
	8914	PERL_STATIC_INLINE SV*
	8915	S_invlist_clone(pTHX_ SV* const invlist)
	8916	{
	8917
	8918	/* Return a new inversion list that is a copy of the input one, which is
	8919	* unchanged. The new list will not be mortal even if the old one was. */
	8920
	8921	/* Need to allocate extra space to accommodate Perl's addition of a
	8922	* trailing NUL to SvPV's, since it thinks they are always strings */
	8923	SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
	8924	STRLEN physical_length = SvCUR(invlist);
	8925	bool offset = *(get_invlist_offset_addr(invlist));
	8926
	8927	PERL_ARGS_ASSERT_INVLIST_CLONE;
	8928
	8929	*(get_invlist_offset_addr(new_invlist)) = offset;
	8930	invlist_set_len(new_invlist, _invlist_len(invlist), offset);
	8931	Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
	8932
	8933	return new_invlist;
	8934	}
	8935
	8936	PERL_STATIC_INLINE STRLEN*
	8937	S_get_invlist_iter_addr(SV* invlist)
	8938	{
	8939	/* Return the address of the UV that contains the current iteration
	8940	* position */
	8941
	8942	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	8943
	8944	assert(SvTYPE(invlist) == SVt_INVLIST);
	8945
	8946	return &(((XINVLIST*) SvANY(invlist))->iterator);
	8947	}
	8948
	8949	PERL_STATIC_INLINE void
	8950	S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
	8951	{
	8952	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	8953
	8954	*get_invlist_iter_addr(invlist) = 0;
	8955	}
	8956
	8957	PERL_STATIC_INLINE void
	8958	S_invlist_iterfinish(SV* invlist)
	8959	{
	8960	/* Terminate iterator for invlist. This is to catch development errors.
	8961	* Any iteration that is interrupted before completed should call this
	8962	* function. Functions that add code points anywhere else but to the end
	8963	* of an inversion list assert that they are not in the middle of an
	8964	* iteration. If they were, the addition would make the iteration
	8965	* problematical: if the iteration hadn't reached the place where things
	8966	* were being added, it would be ok */
	8967
	8968	PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
	8969
	8970	*get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
	8971	}
	8972
	8973	STATIC bool
	8974	S_invlist_iternext(SV* invlist, UV* start, UV* end)
	8975	{
	8976	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	8977	* This call sets in <start> and <end>, the next range in <invlist>.
	8978	* Returns <TRUE> if successful and the next call will return the next
	8979	* range; <FALSE> if was already at the end of the list. If the latter,
	8980	* <start> and <end> are unchanged, and the next call to this function
	8981	* will start over at the beginning of the list */
	8982
	8983	STRLEN* pos = get_invlist_iter_addr(invlist);
	8984	UV len = _invlist_len(invlist);
	8985	UV *array;
	8986
	8987	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	8988
	8989	if (*pos >= len) {
	8990	pos = (STRLEN) UV_MAX; / Force iterinit() to be required next time */
	8991	return FALSE;
	8992	}
	8993
	8994	array = invlist_array(invlist);
	8995
	8996	start = array[(pos)++];
	8997
	8998	if (*pos >= len) {
	8999	*end = UV_MAX;
	9000	}
	9001	else {
	9002	end = array[(pos)++] - 1;
	9003	}
	9004
	9005	return TRUE;
	9006	}
	9007
	9008	PERL_STATIC_INLINE bool
	9009	S_invlist_is_iterating(SV* const invlist)
	9010	{
	9011	PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
	9012
	9013	return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
	9014	}
	9015
	9016	PERL_STATIC_INLINE UV
	9017	S_invlist_highest(SV* const invlist)
	9018	{
	9019	/* Returns the highest code point that matches an inversion list. This API
	9020	* has an ambiguity, as it returns 0 under either the highest is actually
	9021	* 0, or if the list is empty. If this distinction matters to you, check
	9022	* for emptiness before calling this function */
	9023
	9024	UV len = _invlist_len(invlist);
	9025	UV *array;
	9026
	9027	PERL_ARGS_ASSERT_INVLIST_HIGHEST;
	9028
	9029	if (len == 0) {
	9030	return 0;
	9031	}
	9032
	9033	array = invlist_array(invlist);
	9034
	9035	/* The last element in the array in the inversion list always starts a
	9036	* range that goes to infinity. That range may be for code points that are
	9037	* matched in the inversion list, or it may be for ones that aren't
	9038	* matched. In the latter case, the highest code point in the set is one
	9039	* less than the beginning of this range; otherwise it is the final element
	9040	* of this range: infinity */
	9041	return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
	9042	? UV_MAX
	9043	: array[len - 1] - 1;
	9044	}
	9045
	9046	#ifndef PERL_IN_XSUB_RE
	9047	SV *
	9048	Perl__invlist_contents(pTHX_ SV* const invlist)
	9049	{
	9050	/* Get the contents of an inversion list into a string SV so that they can
	9051	* be printed out. It uses the format traditionally done for debug tracing
	9052	*/
	9053
	9054	UV start, end;
	9055	SV* output = newSVpvs("\n");
	9056
	9057	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	9058
	9059	assert(! invlist_is_iterating(invlist));
	9060
	9061	invlist_iterinit(invlist);
	9062	while (invlist_iternext(invlist, &start, &end)) {
	9063	if (end == UV_MAX) {
	9064	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	9065	}
	9066	else if (end != start) {
	9067	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	9068	start, end);
	9069	}
	9070	else {
	9071	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	9072	}
	9073	}
	9074
	9075	return output;
	9076	}
	9077	#endif
	9078
	9079	#ifndef PERL_IN_XSUB_RE
	9080	void
	9081	Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
	9082	const char * const indent, SV* const invlist)
	9083	{
	9084	/* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
	9085	* inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
	9086	* the string 'indent'. The output looks like this:
	9087	[0] 0x000A .. 0x000D
	9088	[2] 0x0085
	9089	[4] 0x2028 .. 0x2029
	9090	[6] 0x3104 .. INFINITY
	9091	* This means that the first range of code points matched by the list are
	9092	* 0xA through 0xD; the second range contains only the single code point
	9093	* 0x85, etc. An inversion list is an array of UVs. Two array elements
	9094	* are used to define each range (except if the final range extends to
	9095	* infinity, only a single element is needed). The array index of the
	9096	* first element for the corresponding range is given in brackets. */
	9097
	9098	UV start, end;
	9099	STRLEN count = 0;
	9100
	9101	PERL_ARGS_ASSERT__INVLIST_DUMP;
	9102
	9103	if (invlist_is_iterating(invlist)) {
	9104	Perl_dump_indent(aTHX_ level, file,
	9105	"%sCan't dump inversion list because is in middle of iterating\n",
	9106	indent);
	9107	return;
	9108	}
	9109
	9110	invlist_iterinit(invlist);
	9111	while (invlist_iternext(invlist, &start, &end)) {
	9112	if (end == UV_MAX) {
	9113	Perl_dump_indent(aTHX_ level, file,
	9114	"%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
	9115	indent, (UV)count, start);
	9116	}
	9117	else if (end != start) {
	9118	Perl_dump_indent(aTHX_ level, file,
	9119	"%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
	9120	indent, (UV)count, start, end);
	9121	}
	9122	else {
	9123	Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
	9124	indent, (UV)count, start);
	9125	}
	9126	count += 2;
	9127	}
	9128	}
	9129
	9130	void
	9131	Perl__load_PL_utf8_foldclosures (pTHX)
	9132	{
	9133	assert(! PL_utf8_foldclosures);
	9134
	9135	/* If the folds haven't been read in, call a fold function
	9136	* to force that */
	9137	if (! PL_utf8_tofold) {
	9138	U8 dummy[UTF8_MAXBYTES_CASE+1];
	9139
	9140	/* This string is just a short named one above \xff */
	9141	to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
	9142	assert(PL_utf8_tofold); /* Verify that worked */
	9143	}
	9144	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	9145	}
	9146	#endif
	9147
	9148	#ifdef PERL_ARGS_ASSERT__INVLISTEQ
	9149	bool
	9150	S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
	9151	{
	9152	/* Return a boolean as to if the two passed in inversion lists are
	9153	* identical. The final argument, if TRUE, says to take the complement of
	9154	* the second inversion list before doing the comparison */
	9155
	9156	const UV* array_a = invlist_array(a);
	9157	const UV* array_b = invlist_array(b);
	9158	UV len_a = _invlist_len(a);
	9159	UV len_b = _invlist_len(b);
	9160
	9161	UV i = 0; /* current index into the arrays */
	9162	bool retval = TRUE; /* Assume are identical until proven otherwise */
	9163
	9164	PERL_ARGS_ASSERT__INVLISTEQ;
	9165
	9166	/* If are to compare 'a' with the complement of b, set it
	9167	* up so are looking at b's complement. */
	9168	if (complement_b) {
	9169
	9170	/* The complement of nothing is everything, so <a> would have to have
	9171	* just one element, starting at zero (ending at infinity) */
	9172	if (len_b == 0) {
	9173	return (len_a == 1 && array_a[0] == 0);
	9174	}
	9175	else if (array_b[0] == 0) {
	9176
	9177	/* Otherwise, to complement, we invert. Here, the first element is
	9178	* 0, just remove it. To do this, we just pretend the array starts
	9179	* one later */
	9180
	9181	array_b++;
	9182	len_b--;
	9183	}
	9184	else {
	9185
	9186	/* But if the first element is not zero, we pretend the list starts
	9187	* at the 0 that is always stored immediately before the array. */
	9188	array_b--;
	9189	len_b++;
	9190	}
	9191	}
	9192
	9193	/* Make sure that the lengths are the same, as well as the final element
	9194	* before looping through the remainder. (Thus we test the length, final,
	9195	* and first elements right off the bat) */
	9196	if (len_a != len_b \|\| array_a[len_a-1] != array_b[len_a-1]) {
	9197	retval = FALSE;
	9198	}
	9199	else for (i = 0; i < len_a - 1; i++) {
	9200	if (array_a[i] != array_b[i]) {
	9201	retval = FALSE;
	9202	break;
	9203	}
	9204	}
	9205
	9206	return retval;
	9207	}
	9208	#endif
	9209
	9210	#undef HEADER_LENGTH
	9211	#undef TO_INTERNAL_SIZE
	9212	#undef FROM_INTERNAL_SIZE
	9213	#undef INVLIST_VERSION_ID
	9214
	9215	/* End of inversion list object */
	9216
	9217	STATIC void
	9218	S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
	9219	{
	9220	/* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
	9221	* constructs, and updates RExC_flags with them. On input, RExC_parse
	9222	* should point to the first flag; it is updated on output to point to the
	9223	* final ')' or ':'. There needs to be at least one flag, or this will
	9224	* abort */
	9225
	9226	/* for (?g), (?gc), and (?o) warnings; warning
	9227	about (?c) will warn about (?g) -- japhy */
	9228
	9229	#define WASTED_O 0x01
	9230	#define WASTED_G 0x02
	9231	#define WASTED_C 0x04
	9232	#define WASTED_GC (WASTED_G\|WASTED_C)
	9233	I32 wastedflags = 0x00;
	9234	U32 posflags = 0, negflags = 0;
	9235	U32 *flagsp = &posflags;
	9236	char has_charset_modifier = '\0';
	9237	regex_charset cs;
	9238	bool has_use_defaults = FALSE;
	9239	const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
	9240
	9241	PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
	9242
	9243	/* '^' as an initial flag sets certain defaults */
	9244	if (UCHARAT(RExC_parse) == '^') {
	9245	RExC_parse++;
	9246	has_use_defaults = TRUE;
	9247	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	9248	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	9249	? REGEX_UNICODE_CHARSET
	9250	: REGEX_DEPENDS_CHARSET);
	9251	}
	9252
	9253	cs = get_regex_charset(RExC_flags);
	9254	if (cs == REGEX_DEPENDS_CHARSET
	9255	&& (RExC_utf8 \|\| RExC_uni_semantics))
	9256	{
	9257	cs = REGEX_UNICODE_CHARSET;
	9258	}
	9259
	9260	while (*RExC_parse) {
	9261	/* && strchr("iogcmsx", RExC_parse) /
	9262	/* (?g), (?gc) and (?o) are useless here
	9263	and must be globally applied -- japhy */
	9264	switch (*RExC_parse) {
	9265
	9266	/* Code for the imsx flags */
	9267	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	9268
	9269	case LOCALE_PAT_MOD:
	9270	if (has_charset_modifier) {
	9271	goto excess_modifier;
	9272	}
	9273	else if (flagsp == &negflags) {
	9274	goto neg_modifier;
	9275	}
	9276	cs = REGEX_LOCALE_CHARSET;
	9277	has_charset_modifier = LOCALE_PAT_MOD;
	9278	break;
	9279	case UNICODE_PAT_MOD:
	9280	if (has_charset_modifier) {
	9281	goto excess_modifier;
	9282	}
	9283	else if (flagsp == &negflags) {
	9284	goto neg_modifier;
	9285	}
	9286	cs = REGEX_UNICODE_CHARSET;
	9287	has_charset_modifier = UNICODE_PAT_MOD;
	9288	break;
	9289	case ASCII_RESTRICT_PAT_MOD:
	9290	if (flagsp == &negflags) {
	9291	goto neg_modifier;
	9292	}
	9293	if (has_charset_modifier) {
	9294	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	9295	goto excess_modifier;
	9296	}
	9297	/* Doubled modifier implies more restricted */
	9298	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	9299	}
	9300	else {
	9301	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	9302	}
	9303	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	9304	break;
	9305	case DEPENDS_PAT_MOD:
	9306	if (has_use_defaults) {
	9307	goto fail_modifiers;
	9308	}
	9309	else if (flagsp == &negflags) {
	9310	goto neg_modifier;
	9311	}
	9312	else if (has_charset_modifier) {
	9313	goto excess_modifier;
	9314	}
	9315
	9316	/* The dual charset means unicode semantics if the
	9317	* pattern (or target, not known until runtime) are
	9318	* utf8, or something in the pattern indicates unicode
	9319	* semantics */
	9320	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	9321	? REGEX_UNICODE_CHARSET
	9322	: REGEX_DEPENDS_CHARSET;
	9323	has_charset_modifier = DEPENDS_PAT_MOD;
	9324	break;
	9325	excess_modifier:
	9326	RExC_parse++;
	9327	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	9328	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	9329	}
	9330	else if (has_charset_modifier == *(RExC_parse - 1)) {
	9331	vFAIL2("Regexp modifier \"%c\" may not appear twice",
	9332	*(RExC_parse - 1));
	9333	}
	9334	else {
	9335	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	9336	}
	9337	/NOTREACHED/
	9338	neg_modifier:
	9339	RExC_parse++;
	9340	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
	9341	*(RExC_parse - 1));
	9342	/NOTREACHED/
	9343	case ONCE_PAT_MOD: /* 'o' */
	9344	case GLOBAL_PAT_MOD: /* 'g' */
	9345	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	9346	const I32 wflagbit = *RExC_parse == 'o'
	9347	? WASTED_O
	9348	: WASTED_G;
	9349	if (! (wastedflags & wflagbit) ) {
	9350	wastedflags \|= wflagbit;
	9351	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	9352	vWARN5(
	9353	RExC_parse + 1,
	9354	"Useless (%s%c) - %suse /%c modifier",
	9355	flagsp == &negflags ? "?-" : "?",
	9356	*RExC_parse,
	9357	flagsp == &negflags ? "don't " : "",
	9358	*RExC_parse
	9359	);
	9360	}
	9361	}
	9362	break;
	9363
	9364	case CONTINUE_PAT_MOD: /* 'c' */
	9365	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	9366	if (! (wastedflags & WASTED_C) ) {
	9367	wastedflags \|= WASTED_GC;
	9368	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	9369	vWARN3(
	9370	RExC_parse + 1,
	9371	"Useless (%sc) - %suse /gc modifier",
	9372	flagsp == &negflags ? "?-" : "?",
	9373	flagsp == &negflags ? "don't " : ""
	9374	);
	9375	}
	9376	}
	9377	break;
	9378	case KEEPCOPY_PAT_MOD: /* 'p' */
	9379	if (flagsp == &negflags) {
	9380	if (SIZE_ONLY)
	9381	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	9382	} else {
	9383	*flagsp \|= RXf_PMf_KEEPCOPY;
	9384	}
	9385	break;
	9386	case '-':
	9387	/* A flag is a default iff it is following a minus, so
	9388	* if there is a minus, it means will be trying to
	9389	* re-specify a default which is an error */
	9390	if (has_use_defaults \|\| flagsp == &negflags) {
	9391	goto fail_modifiers;
	9392	}
	9393	flagsp = &negflags;
	9394	wastedflags = 0; /* reset so (?g-c) warns twice */
	9395	break;
	9396	case ':':
	9397	case ')':
	9398	RExC_flags \|= posflags;
	9399	RExC_flags &= ~negflags;
	9400	set_regex_charset(&RExC_flags, cs);
	9401	if (RExC_flags & RXf_PMf_FOLD) {
	9402	RExC_contains_i = 1;
	9403	}
	9404	return;
	9405	/NOTREACHED/
	9406	default:
	9407	fail_modifiers:
	9408	RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
	9409	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	9410	vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
	9411	UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
	9412	/NOTREACHED/
	9413	}
	9414
	9415	++RExC_parse;
	9416	}
	9417	}
	9418
	9419	/*
	9420	- reg - regular expression, i.e. main body or parenthesized thing
	9421	*
	9422	* Caller must absorb opening parenthesis.
	9423	*
	9424	* Combining parenthesis handling with the base level of regular expression
	9425	* is a trifle forced, but the need to tie the tails of the branches to what
	9426	* follows makes it hard to avoid.
	9427	*/
	9428	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	9429	#ifdef DEBUGGING
	9430	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	9431	#else
	9432	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	9433	#endif
	9434
	9435	/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
	9436	flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
	9437	needs to be restarted.
	9438	Otherwise would only return NULL if regbranch() returns NULL, which
	9439	cannot happen. */
	9440	STATIC regnode *
	9441	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	9442	/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
	9443	* 2 is like 1, but indicates that nextchar() has been called to advance
	9444	* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
	9445	* this flag alerts us to the need to check for that */
	9446	{
	9447	regnode ret; / Will be the head of the group. */
	9448	regnode *br;
	9449	regnode *lastbr;
	9450	regnode *ender = NULL;
	9451	I32 parno = 0;
	9452	I32 flags;
	9453	U32 oregflags = RExC_flags;
	9454	bool have_branch = 0;
	9455	bool is_open = 0;
	9456	I32 freeze_paren = 0;
	9457	I32 after_freeze = 0;
	9458	I32 num; /* numeric backreferences */
	9459
	9460	char * parse_start = RExC_parse; /* MJD */
	9461	char * const oregcomp_parse = RExC_parse;
	9462
	9463	GET_RE_DEBUG_FLAGS_DECL;
	9464
	9465	PERL_ARGS_ASSERT_REG;
	9466	DEBUG_PARSE("reg ");
	9467
	9468	flagp = 0; / Tentatively. */
	9469
	9470
	9471	/* Make an OPEN node, if parenthesized. */
	9472	if (paren) {
	9473
	9474	/* Under /x, space and comments can be gobbled up between the '(' and
	9475	* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
	9476	* intervening space, as the sequence is a token, and a token should be
	9477	* indivisible */
	9478	bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
	9479
	9480	if ( RExC_parse == '') { /* (VERB:ARG) /
	9481	char *start_verb = RExC_parse;
	9482	STRLEN verb_len = 0;
	9483	char *start_arg = NULL;
	9484	unsigned char op = 0;
	9485	int argok = 1;
	9486	int internal_argval = 0; /* internal_argval is only useful if
	9487	!argok */
	9488
	9489	if (has_intervening_patws) {
	9490	RExC_parse++;
	9491	vFAIL("In '(VERB...)', the '(' and '' must be adjacent");
	9492	}
	9493	while ( RExC_parse && RExC_parse != ')' ) {
	9494	if ( *RExC_parse == ':' ) {
	9495	start_arg = RExC_parse + 1;
	9496	break;
	9497	}
	9498	RExC_parse++;
	9499	}
	9500	++start_verb;
	9501	verb_len = RExC_parse - start_verb;
	9502	if ( start_arg ) {
	9503	RExC_parse++;
	9504	while ( RExC_parse && RExC_parse != ')' )
	9505	RExC_parse++;
	9506	if ( *RExC_parse != ')' )
	9507	vFAIL("Unterminated verb pattern argument");
	9508	if ( RExC_parse == start_arg )
	9509	start_arg = NULL;
	9510	} else {
	9511	if ( *RExC_parse != ')' )
	9512	vFAIL("Unterminated verb pattern");
	9513	}
	9514
	9515	switch ( *start_verb ) {
	9516	case 'A': /* (ACCEPT) /
	9517	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	9518	op = ACCEPT;
	9519	internal_argval = RExC_nestroot;
	9520	}
	9521	break;
	9522	case 'C': /* (COMMIT) /
	9523	if ( memEQs(start_verb,verb_len,"COMMIT") )
	9524	op = COMMIT;
	9525	break;
	9526	case 'F': /* (FAIL) /
	9527	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	9528	op = OPFAIL;
	9529	argok = 0;
	9530	}
	9531	break;
	9532	case ':': /* (:NAME) /
	9533	case 'M': /* (MARK:NAME) /
	9534	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	9535	op = MARKPOINT;
	9536	argok = -1;
	9537	}
	9538	break;
	9539	case 'P': /* (PRUNE) /
	9540	if ( memEQs(start_verb,verb_len,"PRUNE") )
	9541	op = PRUNE;
	9542	break;
	9543	case 'S': /* (SKIP) /
	9544	if ( memEQs(start_verb,verb_len,"SKIP") )
	9545	op = SKIP;
	9546	break;
	9547	case 'T': /* (THEN) /
	9548	/* [19:06] <TimToady> :: is then */
	9549	if ( memEQs(start_verb,verb_len,"THEN") ) {
	9550	op = CUTGROUP;
	9551	RExC_seen \|= REG_CUTGROUP_SEEN;
	9552	}
	9553	break;
	9554	}
	9555	if ( ! op ) {
	9556	RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
	9557	vFAIL2utf8f(
	9558	"Unknown verb pattern '%"UTF8f"'",
	9559	UTF8fARG(UTF, verb_len, start_verb));
	9560	}
	9561	if ( argok ) {
	9562	if ( start_arg && internal_argval ) {
	9563	vFAIL3("Verb pattern '%.*s' may not have an argument",
	9564	verb_len, start_verb);
	9565	} else if ( argok < 0 && !start_arg ) {
	9566	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	9567	verb_len, start_verb);
	9568	} else {
	9569	ret = reganode(pRExC_state, op, internal_argval);
	9570	if ( ! internal_argval && ! SIZE_ONLY ) {
	9571	if (start_arg) {
	9572	SV *sv = newSVpvn( start_arg,
	9573	RExC_parse - start_arg);
	9574	ARG(ret) = add_data( pRExC_state,
	9575	STR_WITH_LEN("S"));
	9576	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	9577	ret->flags = 0;
	9578	} else {
	9579	ret->flags = 1;
	9580	}
	9581	}
	9582	}
	9583	if (!internal_argval)
	9584	RExC_seen \|= REG_VERBARG_SEEN;
	9585	} else if ( start_arg ) {
	9586	vFAIL3("Verb pattern '%.*s' may not have an argument",
	9587	verb_len, start_verb);
	9588	} else {
	9589	ret = reg_node(pRExC_state, op);
	9590	}
	9591	nextchar(pRExC_state);
	9592	return ret;
	9593	}
	9594	else if (RExC_parse == '?') { / (?...) */
	9595	bool is_logical = 0;
	9596	const char * const seqstart = RExC_parse;
	9597	const char * endptr;
	9598	if (has_intervening_patws) {
	9599	RExC_parse++;
	9600	vFAIL("In '(?...)', the '(' and '?' must be adjacent");
	9601	}
	9602
	9603	RExC_parse++;
	9604	paren = *RExC_parse++;
	9605	ret = NULL; /* For look-ahead/behind. */
	9606	switch (paren) {
	9607
	9608	case 'P': /* (?P...) variants for those used to PCRE/Python */
	9609	paren = *RExC_parse++;
	9610	if ( paren == '<') /* (?P<...>) named capture */
	9611	goto named_capture;
	9612	else if (paren == '>') { /* (?P>name) named recursion */
	9613	goto named_recursion;
	9614	}
	9615	else if (paren == '=') { /* (?P=...) named backref */
	9616	/* this pretty much dupes the code for \k<NAME> in
	9617	* regatom(), if you change this make sure you change that
	9618	* */
	9619	char* name_start = RExC_parse;
	9620	U32 num = 0;
	9621	SV *sv_dat = reg_scan_name(pRExC_state,
	9622	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9623	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	9624	/* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
	9625	vFAIL2("Sequence %.3s... not terminated",parse_start);
	9626
	9627	if (!SIZE_ONLY) {
	9628	num = add_data( pRExC_state, STR_WITH_LEN("S"));
	9629	RExC_rxi->data->data[num]=(void*)sv_dat;
	9630	SvREFCNT_inc_simple_void(sv_dat);
	9631	}
	9632	RExC_sawback = 1;
	9633	ret = reganode(pRExC_state,
	9634	((! FOLD)
	9635	? NREF
	9636	: (ASCII_FOLD_RESTRICTED)
	9637	? NREFFA
	9638	: (AT_LEAST_UNI_SEMANTICS)
	9639	? NREFFU
	9640	: (LOC)
	9641	? NREFFL
	9642	: NREFF),
	9643	num);
	9644	*flagp \|= HASWIDTH;
	9645
	9646	Set_Node_Offset(ret, parse_start+1);
	9647	Set_Node_Cur_Length(ret, parse_start);
	9648
	9649	nextchar(pRExC_state);
	9650	return ret;
	9651	}
	9652	RExC_parse++;
	9653	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	9654	vFAIL3("Sequence (%.*s...) not recognized",
	9655	RExC_parse-seqstart, seqstart);
	9656	/NOTREACHED/
	9657	case '<': /* (?<...) */
	9658	if (*RExC_parse == '!')
	9659	paren = ',';
	9660	else if (*RExC_parse != '=')
	9661	named_capture:
	9662	{ /* (?<...>) */
	9663	char *name_start;
	9664	SV *svname;
	9665	paren= '>';
	9666	case '\'': /* (?'...') */
	9667	name_start= RExC_parse;
	9668	svname = reg_scan_name(pRExC_state,
	9669	SIZE_ONLY /* reverse test from the others */
	9670	? REG_RSN_RETURN_NAME
	9671	: REG_RSN_RETURN_NULL);
	9672	if (RExC_parse == name_start \|\| *RExC_parse != paren)
	9673	vFAIL2("Sequence (?%c... not terminated",
	9674	paren=='>' ? '<' : paren);
	9675	if (SIZE_ONLY) {
	9676	HE *he_str;
	9677	SV *sv_dat = NULL;
	9678	if (!svname) /* shouldn't happen */
	9679	Perl_croak(aTHX_
	9680	"panic: reg_scan_name returned NULL");
	9681	if (!RExC_paren_names) {
	9682	RExC_paren_names= newHV();
	9683	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	9684	#ifdef DEBUGGING
	9685	RExC_paren_name_list= newAV();
	9686	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	9687	#endif
	9688	}
	9689	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	9690	if ( he_str )
	9691	sv_dat = HeVAL(he_str);
	9692	if ( ! sv_dat ) {
	9693	/* croak baby croak */
	9694	Perl_croak(aTHX_
	9695	"panic: paren_name hash element allocation failed");
	9696	} else if ( SvPOK(sv_dat) ) {
	9697	/* (?\|...) can mean we have dupes so scan to check
	9698	its already been stored. Maybe a flag indicating
	9699	we are inside such a construct would be useful,
	9700	but the arrays are likely to be quite small, so
	9701	for now we punt -- dmq */
	9702	IV count = SvIV(sv_dat);
	9703	I32 pv = (I32)SvPVX(sv_dat);
	9704	IV i;
	9705	for ( i = 0 ; i < count ; i++ ) {
	9706	if ( pv[i] == RExC_npar ) {
	9707	count = 0;
	9708	break;
	9709	}
	9710	}
	9711	if ( count ) {
	9712	pv = (I32*)SvGROW(sv_dat,
	9713	SvCUR(sv_dat) + sizeof(I32)+1);
	9714	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	9715	pv[count] = RExC_npar;
	9716	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	9717	}
	9718	} else {
	9719	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	9720	sv_setpvn(sv_dat, (char *)&(RExC_npar),
	9721	sizeof(I32));
	9722	SvIOK_on(sv_dat);
	9723	SvIV_set(sv_dat, 1);
	9724	}
	9725	#ifdef DEBUGGING
	9726	/* Yes this does cause a memory leak in debugging Perls
	9727	* */
	9728	if (!av_store(RExC_paren_name_list,
	9729	RExC_npar, SvREFCNT_inc(svname)))
	9730	SvREFCNT_dec_NN(svname);
	9731	#endif
	9732
	9733	/sv_dump(sv_dat);/
	9734	}
	9735	nextchar(pRExC_state);
	9736	paren = 1;
	9737	goto capturing_parens;
	9738	}
	9739	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	9740	RExC_in_lookbehind++;
	9741	RExC_parse++;
	9742	/* FALLTHROUGH */
	9743	case '=': /* (?=...) */
	9744	RExC_seen_zerolen++;
	9745	break;
	9746	case '!': /* (?!...) */
	9747	RExC_seen_zerolen++;
	9748	if (*RExC_parse == ')') {
	9749	ret=reg_node(pRExC_state, OPFAIL);
	9750	nextchar(pRExC_state);
	9751	return ret;
	9752	}
	9753	break;
	9754	case '\|': /* (?\|...) */
	9755	/* branch reset, behave like a (?:...) except that
	9756	buffers in alternations share the same numbers */
	9757	paren = ':';
	9758	after_freeze = freeze_paren = RExC_npar;
	9759	break;
	9760	case ':': /* (?:...) */
	9761	case '>': /* (?>...) */
	9762	break;
	9763	case '$': /* (?$...) */
	9764	case '@': /* (?@...) */
	9765	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	9766	break;
	9767	case '0' : /* (?0) */
	9768	case 'R' : /* (?R) */
	9769	if (*RExC_parse != ')')
	9770	FAIL("Sequence (?R) not terminated");
	9771	ret = reg_node(pRExC_state, GOSTART);
	9772	RExC_seen \|= REG_GOSTART_SEEN;
	9773	*flagp \|= POSTPONED;
	9774	nextchar(pRExC_state);
	9775	return ret;
	9776	/notreached/
	9777	/* named and numeric backreferences */
	9778	case '&': /* (?&NAME) */
	9779	parse_start = RExC_parse - 1;
	9780	named_recursion:
	9781	{
	9782	SV *sv_dat = reg_scan_name(pRExC_state,
	9783	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9784	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	9785	}
	9786	if (RExC_parse == RExC_end \|\| *RExC_parse != ')')
	9787	vFAIL("Sequence (?&... not terminated");
	9788	goto gen_recurse_regop;
	9789	assert(0); /* NOT REACHED */
	9790	case '+':
	9791	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	9792	RExC_parse++;
	9793	vFAIL("Illegal pattern");
	9794	}
	9795	goto parse_recursion;
	9796	/* NOT REACHED*/
	9797	case '-': /* (?-1) */
	9798	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	9799	RExC_parse--; /* rewind to let it be handled later */
	9800	goto parse_flags;
	9801	}
	9802	/* FALLTHROUGH */
	9803	case '1': case '2': case '3': case '4': /* (?1) */
	9804	case '5': case '6': case '7': case '8': case '9':
	9805	RExC_parse--;
	9806	parse_recursion:
	9807	{
	9808	bool is_neg = FALSE;
	9809	parse_start = RExC_parse - 1; /* MJD */
	9810	if (*RExC_parse == '-') {
	9811	RExC_parse++;
	9812	is_neg = TRUE;
	9813	}
	9814	num = grok_atou(RExC_parse, &endptr);
	9815	if (endptr)
	9816	RExC_parse = (char*)endptr;
	9817	if (is_neg) {
	9818	/* Some limit for num? */
	9819	num = -num;
	9820	}
	9821	}
	9822	if (*RExC_parse!=')')
	9823	vFAIL("Expecting close bracket");
	9824
	9825	gen_recurse_regop:
	9826	if ( paren == '-' ) {
	9827	/*
	9828	Diagram of capture buffer numbering.
	9829	Top line is the normal capture buffer numbers
	9830	Bottom line is the negative indexing as from
	9831	the X (the (?-2))
	9832
	9833	+ 1 2 3 4 5 X 6 7
	9834	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	9835	- 5 4 3 2 1 X x x
	9836
	9837	*/
	9838	num = RExC_npar + num;
	9839	if (num < 1) {
	9840	RExC_parse++;
	9841	vFAIL("Reference to nonexistent group");
	9842	}
	9843	} else if ( paren == '+' ) {
	9844	num = RExC_npar + num - 1;
	9845	}
	9846
	9847	ret = reganode(pRExC_state, GOSUB, num);
	9848	if (!SIZE_ONLY) {
	9849	if (num > (I32)RExC_rx->nparens) {
	9850	RExC_parse++;
	9851	vFAIL("Reference to nonexistent group");
	9852	}
	9853	ARG2L_SET( ret, RExC_recurse_count++);
	9854	RExC_emit++;
	9855	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9856	"Recurse #%"UVuf" to %"IVdf"\n",
	9857	(UV)ARG(ret), (IV)ARG2L(ret)));
	9858	} else {
	9859	RExC_size++;
	9860	}
	9861	RExC_seen \|= REG_RECURSE_SEEN;
	9862	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	9863	Set_Node_Offset(ret, parse_start); /* MJD */
	9864
	9865	*flagp \|= POSTPONED;
	9866	nextchar(pRExC_state);
	9867	return ret;
	9868
	9869	assert(0); /* NOT REACHED */
	9870
	9871	case '?': /* (??...) */
	9872	is_logical = 1;
	9873	if (*RExC_parse != '{') {
	9874	RExC_parse++;
	9875	/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
	9876	vFAIL2utf8f(
	9877	"Sequence (%"UTF8f"...) not recognized",
	9878	UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
	9879	/NOTREACHED/
	9880	}
	9881	*flagp \|= POSTPONED;
	9882	paren = *RExC_parse++;
	9883	/* FALLTHROUGH */
	9884	case '{': /* (?{...}) */
	9885	{
	9886	U32 n = 0;
	9887	struct reg_code_block *cb;
	9888
	9889	RExC_seen_zerolen++;
	9890
	9891	if ( !pRExC_state->num_code_blocks
	9892	\|\| pRExC_state->code_index >= pRExC_state->num_code_blocks
	9893	\|\| pRExC_state->code_blocks[pRExC_state->code_index].start
	9894	!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
	9895	- RExC_start)
	9896	) {
	9897	if (RExC_pm_flags & PMf_USE_RE_EVAL)
	9898	FAIL("panic: Sequence (?{...}): no code block found\n");
	9899	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	9900	}
	9901	/* this is a pre-compiled code block (?{...}) */
	9902	cb = &pRExC_state->code_blocks[pRExC_state->code_index];
	9903	RExC_parse = RExC_start + cb->end;
	9904	if (!SIZE_ONLY) {
	9905	OP *o = cb->block;
	9906	if (cb->src_regex) {
	9907	n = add_data(pRExC_state, STR_WITH_LEN("rl"));
	9908	RExC_rxi->data->data[n] =
	9909	(void)SvREFCNT_inc((SV)cb->src_regex);
	9910	RExC_rxi->data->data[n+1] = (void*)o;
	9911	}
	9912	else {
	9913	n = add_data(pRExC_state,
	9914	(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
	9915	RExC_rxi->data->data[n] = (void*)o;
	9916	}
	9917	}
	9918	pRExC_state->code_index++;
	9919	nextchar(pRExC_state);
	9920
	9921	if (is_logical) {
	9922	regnode *eval;
	9923	ret = reg_node(pRExC_state, LOGICAL);
	9924	eval = reganode(pRExC_state, EVAL, n);
	9925	if (!SIZE_ONLY) {
	9926	ret->flags = 2;
	9927	/* for later propagation into (??{}) return value */
	9928	eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
	9929	}
	9930	REGTAIL(pRExC_state, ret, eval);
	9931	/* deal with the length of this later - MJD */
	9932	return ret;
	9933	}
	9934	ret = reganode(pRExC_state, EVAL, n);
	9935	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	9936	Set_Node_Offset(ret, parse_start);
	9937	return ret;
	9938	}
	9939	case '(': /* (?(?{...})...) and (?(?=...)...) */
	9940	{
	9941	int is_define= 0;
	9942	if (RExC_parse[0] == '?') { /* (?(?...)) */
	9943	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	9944	\|\| RExC_parse[1] == '<'
	9945	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	9946	I32 flag;
	9947	regnode *tail;
	9948
	9949	ret = reg_node(pRExC_state, LOGICAL);
	9950	if (!SIZE_ONLY)
	9951	ret->flags = 1;
	9952
	9953	tail = reg(pRExC_state, 1, &flag, depth+1);
	9954	if (flag & RESTART_UTF8) {
	9955	*flagp = RESTART_UTF8;
	9956	return NULL;
	9957	}
	9958	REGTAIL(pRExC_state, ret, tail);
	9959	goto insert_if;
	9960	}
	9961	/* Fall through to ‘Unknown switch condition’ at the
	9962	end of the if/else chain. */
	9963	}
	9964	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	9965	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	9966	{
	9967	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	9968	char *name_start= RExC_parse++;
	9969	U32 num = 0;
	9970	SV *sv_dat=reg_scan_name(pRExC_state,
	9971	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9972	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	9973	vFAIL2("Sequence (?(%c... not terminated",
	9974	(ch == '>' ? '<' : ch));
	9975	RExC_parse++;
	9976	if (!SIZE_ONLY) {
	9977	num = add_data( pRExC_state, STR_WITH_LEN("S"));
	9978	RExC_rxi->data->data[num]=(void*)sv_dat;
	9979	SvREFCNT_inc_simple_void(sv_dat);
	9980	}
	9981	ret = reganode(pRExC_state,NGROUPP,num);
	9982	goto insert_if_check_paren;
	9983	}
	9984	else if (RExC_parse[0] == 'D' &&
	9985	RExC_parse[1] == 'E' &&
	9986	RExC_parse[2] == 'F' &&
	9987	RExC_parse[3] == 'I' &&
	9988	RExC_parse[4] == 'N' &&
	9989	RExC_parse[5] == 'E')
	9990	{
	9991	ret = reganode(pRExC_state,DEFINEP,0);
	9992	RExC_parse +=6 ;
	9993	is_define = 1;
	9994	goto insert_if_check_paren;
	9995	}
	9996	else if (RExC_parse[0] == 'R') {
	9997	RExC_parse++;
	9998	parno = 0;
	9999	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	10000	parno = grok_atou(RExC_parse, &endptr);
	10001	if (endptr)
	10002	RExC_parse = (char*)endptr;
	10003	} else if (RExC_parse[0] == '&') {
	10004	SV *sv_dat;
	10005	RExC_parse++;
	10006	sv_dat = reg_scan_name(pRExC_state,
	10007	SIZE_ONLY
	10008	? REG_RSN_RETURN_NULL
	10009	: REG_RSN_RETURN_DATA);
	10010	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	10011	}
	10012	ret = reganode(pRExC_state,INSUBP,parno);
	10013	goto insert_if_check_paren;
	10014	}
	10015	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	10016	/* (?(1)...) */
	10017	char c;
	10018	char *tmp;
	10019	parno = grok_atou(RExC_parse, &endptr);
	10020	if (endptr)
	10021	RExC_parse = (char*)endptr;
	10022	ret = reganode(pRExC_state, GROUPP, parno);
	10023
	10024	insert_if_check_paren:
	10025	if (*(tmp = nextchar(pRExC_state)) != ')') {
	10026	/* nextchar also skips comments, so undo its work
	10027	* and skip over the the next character.
	10028	*/
	10029	RExC_parse = tmp;
	10030	RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
	10031	vFAIL("Switch condition not recognized");
	10032	}
	10033	insert_if:
	10034	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	10035	br = regbranch(pRExC_state, &flags, 1,depth+1);
	10036	if (br == NULL) {
	10037	if (flags & RESTART_UTF8) {
	10038	*flagp = RESTART_UTF8;
	10039	return NULL;
	10040	}
	10041	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
	10042	(UV) flags);
	10043	} else
	10044	REGTAIL(pRExC_state, br, reganode(pRExC_state,
	10045	LONGJMP, 0));
	10046	c = *nextchar(pRExC_state);
	10047	if (flags&HASWIDTH)
	10048	*flagp \|= HASWIDTH;
	10049	if (c == '\|') {
	10050	if (is_define)
	10051	vFAIL("(?(DEFINE)....) does not allow branches");
	10052
	10053	/* Fake one for optimizer. */
	10054	lastbr = reganode(pRExC_state, IFTHEN, 0);
	10055
	10056	if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
	10057	if (flags & RESTART_UTF8) {
	10058	*flagp = RESTART_UTF8;
	10059	return NULL;
	10060	}
	10061	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
	10062	(UV) flags);
	10063	}
	10064	REGTAIL(pRExC_state, ret, lastbr);
	10065	if (flags&HASWIDTH)
	10066	*flagp \|= HASWIDTH;
	10067	c = *nextchar(pRExC_state);
	10068	}
	10069	else
	10070	lastbr = NULL;
	10071	if (c != ')')
	10072	vFAIL("Switch (?(condition)... contains too many branches");
	10073	ender = reg_node(pRExC_state, TAIL);
	10074	REGTAIL(pRExC_state, br, ender);
	10075	if (lastbr) {
	10076	REGTAIL(pRExC_state, lastbr, ender);
	10077	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	10078	}
	10079	else
	10080	REGTAIL(pRExC_state, ret, ender);
	10081	RExC_size++; /* XXX WHY do we need this?!!
	10082	For large programs it seems to be required
	10083	but I can't figure out why. -- dmq*/
	10084	return ret;
	10085	}
	10086	RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
	10087	vFAIL("Unknown switch condition (?(...))");
	10088	}
	10089	case '[': /* (?[ ... ]) */
	10090	return handle_regex_sets(pRExC_state, NULL, flagp, depth,
	10091	oregcomp_parse);
	10092	case 0:
	10093	RExC_parse--; /* for vFAIL to print correctly */
	10094	vFAIL("Sequence (? incomplete");
	10095	break;
	10096	default: /* e.g., (?i) */
	10097	--RExC_parse;
	10098	parse_flags:
	10099	parse_lparen_question_flags(pRExC_state);
	10100	if (UCHARAT(RExC_parse) != ':') {
	10101	nextchar(pRExC_state);
	10102	*flagp = TRYAGAIN;
	10103	return NULL;
	10104	}
	10105	paren = ':';
	10106	nextchar(pRExC_state);
	10107	ret = NULL;
	10108	goto parse_rest;
	10109	} /* end switch */
	10110	}
	10111	else { /* (...) */
	10112	capturing_parens:
	10113	parno = RExC_npar;
	10114	RExC_npar++;
	10115
	10116	ret = reganode(pRExC_state, OPEN, parno);
	10117	if (!SIZE_ONLY ){
	10118	if (!RExC_nestroot)
	10119	RExC_nestroot = parno;
	10120	if (RExC_seen & REG_RECURSE_SEEN
	10121	&& !RExC_open_parens[parno-1])
	10122	{
	10123	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	10124	"Setting open paren #%"IVdf" to %d\n",
	10125	(IV)parno, REG_NODE_NUM(ret)));
	10126	RExC_open_parens[parno-1]= ret;
	10127	}
	10128	}
	10129	Set_Node_Length(ret, 1); /* MJD */
	10130	Set_Node_Offset(ret, RExC_parse); /* MJD */
	10131	is_open = 1;
	10132	}
	10133	}
	10134	else /* ! paren */
	10135	ret = NULL;
	10136
	10137	parse_rest:
	10138	/* Pick up the branches, linking them together. */
	10139	parse_start = RExC_parse; /* MJD */
	10140	br = regbranch(pRExC_state, &flags, 1,depth+1);
	10141
	10142	/* branch_len = (paren != 0); */
	10143
	10144	if (br == NULL) {
	10145	if (flags & RESTART_UTF8) {
	10146	*flagp = RESTART_UTF8;
	10147	return NULL;
	10148	}
	10149	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
	10150	}
	10151	if (*RExC_parse == '\|') {
	10152	if (!SIZE_ONLY && RExC_extralen) {
	10153	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	10154	}
	10155	else { /* MJD */
	10156	reginsert(pRExC_state, BRANCH, br, depth+1);
	10157	Set_Node_Length(br, paren != 0);
	10158	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	10159	}
	10160	have_branch = 1;
	10161	if (SIZE_ONLY)
	10162	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	10163	}
	10164	else if (paren == ':') {
	10165	*flagp \|= flags&SIMPLE;
	10166	}
	10167	if (is_open) { /* Starts with OPEN. */
	10168	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	10169	}
	10170	else if (paren != '?') /* Not Conditional */
	10171	ret = br;
	10172	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	10173	lastbr = br;
	10174	while (*RExC_parse == '\|') {
	10175	if (!SIZE_ONLY && RExC_extralen) {
	10176	ender = reganode(pRExC_state, LONGJMP,0);
	10177
	10178	/* Append to the previous. */
	10179	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	10180	}
	10181	if (SIZE_ONLY)
	10182	RExC_extralen += 2; /* Account for LONGJMP. */
	10183	nextchar(pRExC_state);
	10184	if (freeze_paren) {
	10185	if (RExC_npar > after_freeze)
	10186	after_freeze = RExC_npar;
	10187	RExC_npar = freeze_paren;
	10188	}
	10189	br = regbranch(pRExC_state, &flags, 0, depth+1);
	10190
	10191	if (br == NULL) {
	10192	if (flags & RESTART_UTF8) {
	10193	*flagp = RESTART_UTF8;
	10194	return NULL;
	10195	}
	10196	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
	10197	}
	10198	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	10199	lastbr = br;
	10200	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	10201	}
	10202
	10203	if (have_branch \|\| paren != ':') {
	10204	/* Make a closing node, and hook it on the end. */
	10205	switch (paren) {
	10206	case ':':
	10207	ender = reg_node(pRExC_state, TAIL);
	10208	break;
	10209	case 1: case 2:
	10210	ender = reganode(pRExC_state, CLOSE, parno);
	10211	if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
	10212	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	10213	"Setting close paren #%"IVdf" to %d\n",
	10214	(IV)parno, REG_NODE_NUM(ender)));
	10215	RExC_close_parens[parno-1]= ender;
	10216	if (RExC_nestroot == parno)
	10217	RExC_nestroot = 0;
	10218	}
	10219	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	10220	Set_Node_Length(ender,1); /* MJD */
	10221	break;
	10222	case '<':
	10223	case ',':
	10224	case '=':
	10225	case '!':
	10226	*flagp &= ~HASWIDTH;
	10227	/* FALLTHROUGH */
	10228	case '>':
	10229	ender = reg_node(pRExC_state, SUCCEED);
	10230	break;
	10231	case 0:
	10232	ender = reg_node(pRExC_state, END);
	10233	if (!SIZE_ONLY) {
	10234	assert(!RExC_opend); /* there can only be one! */
	10235	RExC_opend = ender;
	10236	}
	10237	break;
	10238	}
	10239	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	10240	SV * const mysv_val1=sv_newmortal();
	10241	SV * const mysv_val2=sv_newmortal();
	10242	DEBUG_PARSE_MSG("lsbr");
	10243	regprop(RExC_rx, mysv_val1, lastbr, NULL);
	10244	regprop(RExC_rx, mysv_val2, ender, NULL);
	10245	PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	10246	SvPV_nolen_const(mysv_val1),
	10247	(IV)REG_NODE_NUM(lastbr),
	10248	SvPV_nolen_const(mysv_val2),
	10249	(IV)REG_NODE_NUM(ender),
	10250	(IV)(ender - lastbr)
	10251	);
	10252	});
	10253	REGTAIL(pRExC_state, lastbr, ender);
	10254
	10255	if (have_branch && !SIZE_ONLY) {
	10256	char is_nothing= 1;
	10257	if (depth==1)
	10258	RExC_seen \|= REG_TOP_LEVEL_BRANCHES_SEEN;
	10259
	10260	/* Hook the tails of the branches to the closing node. */
	10261	for (br = ret; br; br = regnext(br)) {
	10262	const U8 op = PL_regkind[OP(br)];
	10263	if (op == BRANCH) {
	10264	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	10265	if ( OP(NEXTOPER(br)) != NOTHING
	10266	\|\| regnext(NEXTOPER(br)) != ender)
	10267	is_nothing= 0;
	10268	}
	10269	else if (op == BRANCHJ) {
	10270	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	10271	/* for now we always disable this optimisation * /
	10272	if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
	10273	\|\| regnext(NEXTOPER(NEXTOPER(br))) != ender)
	10274	*/
	10275	is_nothing= 0;
	10276	}
	10277	}
	10278	if (is_nothing) {
	10279	br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
	10280	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	10281	SV * const mysv_val1=sv_newmortal();
	10282	SV * const mysv_val2=sv_newmortal();
	10283	DEBUG_PARSE_MSG("NADA");
	10284	regprop(RExC_rx, mysv_val1, ret, NULL);
	10285	regprop(RExC_rx, mysv_val2, ender, NULL);
	10286	PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	10287	SvPV_nolen_const(mysv_val1),
	10288	(IV)REG_NODE_NUM(ret),
	10289	SvPV_nolen_const(mysv_val2),
	10290	(IV)REG_NODE_NUM(ender),
	10291	(IV)(ender - ret)
	10292	);
	10293	});
	10294	OP(br)= NOTHING;
	10295	if (OP(ender) == TAIL) {
	10296	NEXT_OFF(br)= 0;
	10297	RExC_emit= br + 1;
	10298	} else {
	10299	regnode *opt;
	10300	for ( opt= br + 1; opt < ender ; opt++ )
	10301	OP(opt)= OPTIMIZED;
	10302	NEXT_OFF(br)= ender - br;
	10303	}
	10304	}
	10305	}
	10306	}
	10307
	10308	{
	10309	const char *p;
	10310	static const char parens[] = "=!<,>";
	10311
	10312	if (paren && (p = strchr(parens, paren))) {
	10313	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	10314	int flag = (p - parens) > 1;
	10315
	10316	if (paren == '>')
	10317	node = SUSPEND, flag = 0;
	10318	reginsert(pRExC_state, node,ret, depth+1);
	10319	Set_Node_Cur_Length(ret, parse_start);
	10320	Set_Node_Offset(ret, parse_start + 1);
	10321	ret->flags = flag;
	10322	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	10323	}
	10324	}
	10325
	10326	/* Check for proper termination. */
	10327	if (paren) {
	10328	/* restore original flags, but keep (?p) */
	10329	RExC_flags = oregflags \| (RExC_flags & RXf_PMf_KEEPCOPY);
	10330	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	10331	RExC_parse = oregcomp_parse;
	10332	vFAIL("Unmatched (");
	10333	}
	10334	}
	10335	else if (!paren && RExC_parse < RExC_end) {
	10336	if (*RExC_parse == ')') {
	10337	RExC_parse++;
	10338	vFAIL("Unmatched )");
	10339	}
	10340	else
	10341	FAIL("Junk on end of regexp"); /* "Can't happen". */
	10342	assert(0); /* NOTREACHED */
	10343	}
	10344
	10345	if (RExC_in_lookbehind) {
	10346	RExC_in_lookbehind--;
	10347	}
	10348	if (after_freeze > RExC_npar)
	10349	RExC_npar = after_freeze;
	10350	return(ret);
	10351	}
	10352
	10353	/*
	10354	- regbranch - one alternative of an \| operator
	10355	*
	10356	* Implements the concatenation operator.
	10357	*
	10358	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	10359	* restarted.
	10360	*/
	10361	STATIC regnode *
	10362	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	10363	{
	10364	regnode *ret;
	10365	regnode *chain = NULL;
	10366	regnode *latest;
	10367	I32 flags = 0, c = 0;
	10368	GET_RE_DEBUG_FLAGS_DECL;
	10369
	10370	PERL_ARGS_ASSERT_REGBRANCH;
	10371
	10372	DEBUG_PARSE("brnc");
	10373
	10374	if (first)
	10375	ret = NULL;
	10376	else {
	10377	if (!SIZE_ONLY && RExC_extralen)
	10378	ret = reganode(pRExC_state, BRANCHJ,0);
	10379	else {
	10380	ret = reg_node(pRExC_state, BRANCH);
	10381	Set_Node_Length(ret, 1);
	10382	}
	10383	}
	10384
	10385	if (!first && SIZE_ONLY)
	10386	RExC_extralen += 1; /* BRANCHJ */
	10387
	10388	flagp = WORST; / Tentatively. */
	10389
	10390	RExC_parse--;
	10391	nextchar(pRExC_state);
	10392	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	10393	flags &= ~TRYAGAIN;
	10394	latest = regpiece(pRExC_state, &flags,depth+1);
	10395	if (latest == NULL) {
	10396	if (flags & TRYAGAIN)
	10397	continue;
	10398	if (flags & RESTART_UTF8) {
	10399	*flagp = RESTART_UTF8;
	10400	return NULL;
	10401	}
	10402	FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
	10403	}
	10404	else if (ret == NULL)
	10405	ret = latest;
	10406	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	10407	if (chain == NULL) /* First piece. */
	10408	*flagp \|= flags&SPSTART;
	10409	else {
	10410	RExC_naughty++;
	10411	REGTAIL(pRExC_state, chain, latest);
	10412	}
	10413	chain = latest;
	10414	c++;
	10415	}
	10416	if (chain == NULL) { /* Loop ran zero times. */
	10417	chain = reg_node(pRExC_state, NOTHING);
	10418	if (ret == NULL)
	10419	ret = chain;
	10420	}
	10421	if (c == 1) {
	10422	*flagp \|= flags&SIMPLE;
	10423	}
	10424
	10425	return ret;
	10426	}
	10427
	10428	/*
	10429	- regpiece - something followed by possible [*+?]
	10430	*
	10431	* Note that the branching code sequences used for ? and the general cases
	10432	* of * and + are somewhat optimized: they use the same NOTHING node as
	10433	* both the endmarker for their branch list and the body of the last branch.
	10434	* It might seem that this node could be dispensed with entirely, but the
	10435	* endmarker role is not redundant.
	10436	*
	10437	* Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
	10438	* TRYAGAIN.
	10439	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	10440	* restarted.
	10441	*/
	10442	STATIC regnode *
	10443	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	10444	{
	10445	regnode *ret;
	10446	char op;
	10447	char *next;
	10448	I32 flags;
	10449	const char * const origparse = RExC_parse;
	10450	I32 min;
	10451	I32 max = REG_INFTY;
	10452	#ifdef RE_TRACK_PATTERN_OFFSETS
	10453	char *parse_start;
	10454	#endif
	10455	const char *maxpos = NULL;
	10456
	10457	/* Save the original in case we change the emitted regop to a FAIL. */
	10458	regnode * const orig_emit = RExC_emit;
	10459
	10460	GET_RE_DEBUG_FLAGS_DECL;
	10461
	10462	PERL_ARGS_ASSERT_REGPIECE;
	10463
	10464	DEBUG_PARSE("piec");
	10465
	10466	ret = regatom(pRExC_state, &flags,depth+1);
	10467	if (ret == NULL) {
	10468	if (flags & (TRYAGAIN\|RESTART_UTF8))
	10469	*flagp \|= flags & (TRYAGAIN\|RESTART_UTF8);
	10470	else
	10471	FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
	10472	return(NULL);
	10473	}
	10474
	10475	op = *RExC_parse;
	10476
	10477	if (op == '{' && regcurly(RExC_parse)) {
	10478	maxpos = NULL;
	10479	#ifdef RE_TRACK_PATTERN_OFFSETS
	10480	parse_start = RExC_parse; /* MJD */
	10481	#endif
	10482	next = RExC_parse + 1;
	10483	while (isDIGIT(next) \|\| next == ',') {
	10484	if (*next == ',') {
	10485	if (maxpos)
	10486	break;
	10487	else
	10488	maxpos = next;
	10489	}
	10490	next++;
	10491	}
	10492	if (next == '}') { / got one */
	10493	const char* endptr;
	10494	if (!maxpos)
	10495	maxpos = next;
	10496	RExC_parse++;
	10497	min = grok_atou(RExC_parse, &endptr);
	10498	if (*maxpos == ',')
	10499	maxpos++;
	10500	else
	10501	maxpos = RExC_parse;
	10502	max = grok_atou(maxpos, &endptr);
	10503	if (!max && *maxpos != '0')
	10504	max = REG_INFTY; /* meaning "infinity" */
	10505	else if (max >= REG_INFTY)
	10506	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	10507	RExC_parse = next;
	10508	nextchar(pRExC_state);
	10509	if (max < min) { /* If can't match, warn and optimize to fail
	10510	unconditionally */
	10511	if (SIZE_ONLY) {
	10512	ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
	10513
	10514	/* We can't back off the size because we have to reserve
	10515	* enough space for all the things we are about to throw
	10516	* away, but we can shrink it by the ammount we are about
	10517	* to re-use here */
	10518	RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
	10519	}
	10520	else {
	10521	RExC_emit = orig_emit;
	10522	}
	10523	ret = reg_node(pRExC_state, OPFAIL);
	10524	return ret;
	10525	}
	10526	else if (min == max
	10527	&& RExC_parse < RExC_end
	10528	&& (RExC_parse == '?' \|\| RExC_parse == '+'))
	10529	{
	10530	if (SIZE_ONLY) {
	10531	ckWARN2reg(RExC_parse + 1,
	10532	"Useless use of greediness modifier '%c'",
	10533	*RExC_parse);
	10534	}
	10535	/* Absorb the modifier, so later code doesn't see nor use
	10536	* it */
	10537	nextchar(pRExC_state);
	10538	}
	10539
	10540	do_curly:
	10541	if ((flags&SIMPLE)) {
	10542	RExC_naughty += 2 + RExC_naughty / 2;
	10543	reginsert(pRExC_state, CURLY, ret, depth+1);
	10544	Set_Node_Offset(ret, parse_start+1); /* MJD */
	10545	Set_Node_Cur_Length(ret, parse_start);
	10546	}
	10547	else {
	10548	regnode * const w = reg_node(pRExC_state, WHILEM);
	10549
	10550	w->flags = 0;
	10551	REGTAIL(pRExC_state, ret, w);
	10552	if (!SIZE_ONLY && RExC_extralen) {
	10553	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	10554	reginsert(pRExC_state, NOTHING,ret, depth+1);
	10555	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	10556	}
	10557	reginsert(pRExC_state, CURLYX,ret, depth+1);
	10558	/* MJD hk */
	10559	Set_Node_Offset(ret, parse_start+1);
	10560	Set_Node_Length(ret,
	10561	op == '{' ? (RExC_parse - parse_start) : 1);
	10562
	10563	if (!SIZE_ONLY && RExC_extralen)
	10564	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	10565	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	10566	if (SIZE_ONLY)
	10567	RExC_whilem_seen++, RExC_extralen += 3;
	10568	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	10569	}
	10570	ret->flags = 0;
	10571
	10572	if (min > 0)
	10573	*flagp = WORST;
	10574	if (max > 0)
	10575	*flagp \|= HASWIDTH;
	10576	if (!SIZE_ONLY) {
	10577	ARG1_SET(ret, (U16)min);
	10578	ARG2_SET(ret, (U16)max);
	10579	}
	10580	if (max == REG_INFTY)
	10581	RExC_seen \|= REG_UNBOUNDED_QUANTIFIER_SEEN;
	10582
	10583	goto nest_check;
	10584	}
	10585	}
	10586
	10587	if (!ISMULT1(op)) {
	10588	*flagp = flags;
	10589	return(ret);
	10590	}
	10591
	10592	#if 0 /* Now runtime fix should be reliable. */
	10593
	10594	/* if this is reinstated, don't forget to put this back into perldiag:
	10595
	10596	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	10597
	10598	(F) The part of the regexp subject to either the * or + quantifier
	10599	could match an empty string. The {#} shows in the regular
	10600	expression about where the problem was discovered.
	10601
	10602	*/
	10603
	10604	if (!(flags&HASWIDTH) && op != '?')
	10605	vFAIL("Regexp *+ operand could be empty");
	10606	#endif
	10607
	10608	#ifdef RE_TRACK_PATTERN_OFFSETS
	10609	parse_start = RExC_parse;
	10610	#endif
	10611	nextchar(pRExC_state);
	10612
	10613	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	10614
	10615	if (op == '*' && (flags&SIMPLE)) {
	10616	reginsert(pRExC_state, STAR, ret, depth+1);
	10617	ret->flags = 0;
	10618	RExC_naughty += 4;
	10619	RExC_seen \|= REG_UNBOUNDED_QUANTIFIER_SEEN;
	10620	}
	10621	else if (op == '*') {
	10622	min = 0;
	10623	goto do_curly;
	10624	}
	10625	else if (op == '+' && (flags&SIMPLE)) {
	10626	reginsert(pRExC_state, PLUS, ret, depth+1);
	10627	ret->flags = 0;
	10628	RExC_naughty += 3;
	10629	RExC_seen \|= REG_UNBOUNDED_QUANTIFIER_SEEN;
	10630	}
	10631	else if (op == '+') {
	10632	min = 1;
	10633	goto do_curly;
	10634	}
	10635	else if (op == '?') {
	10636	min = 0; max = 1;
	10637	goto do_curly;
	10638	}
	10639	nest_check:
	10640	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	10641	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
	10642	ckWARN2reg(RExC_parse,
	10643	"%"UTF8f" matches null string many times",
	10644	UTF8fARG(UTF, (RExC_parse >= origparse
	10645	? RExC_parse - origparse
	10646	: 0),
	10647	origparse));
	10648	(void)ReREFCNT_inc(RExC_rx_sv);
	10649	}
	10650
	10651	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	10652	nextchar(pRExC_state);
	10653	reginsert(pRExC_state, MINMOD, ret, depth+1);
	10654	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	10655	}
	10656	else
	10657	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	10658	regnode *ender;
	10659	nextchar(pRExC_state);
	10660	ender = reg_node(pRExC_state, SUCCEED);
	10661	REGTAIL(pRExC_state, ret, ender);
	10662	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	10663	ret->flags = 0;
	10664	ender = reg_node(pRExC_state, TAIL);
	10665	REGTAIL(pRExC_state, ret, ender);
	10666	}
	10667
	10668	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	10669	RExC_parse++;
	10670	vFAIL("Nested quantifiers");
	10671	}
	10672
	10673	return(ret);
	10674	}
	10675
	10676	STATIC bool
	10677	S_grok_bslash_N(pTHX_ RExC_state_t pRExC_state, regnode* node_p,
	10678	UV valuep, I32 flagp, U32 depth, bool in_char_class,
	10679	const bool strict /* Apply stricter parsing rules? */
	10680	)
	10681	{
	10682
	10683	/* This is expected to be called by a parser routine that has recognized '\N'
	10684	and needs to handle the rest. RExC_parse is expected to point at the first
	10685	char following the N at the time of the call. On successful return,
	10686	RExC_parse has been updated to point to just after the sequence identified
	10687	by this routine, and <*flagp> has been updated.
	10688
	10689	The \N may be inside (indicated by the boolean <in_char_class>) or outside a
	10690	character class.
	10691
	10692	\N may begin either a named sequence, or if outside a character class, mean
	10693	to match a non-newline. For non single-quoted regexes, the tokenizer has
	10694	attempted to decide which, and in the case of a named sequence, converted it
	10695	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	10696	where c1... are the characters in the sequence. For single-quoted regexes,
	10697	the tokenizer passes the \N sequence through unchanged; this code will not
	10698	attempt to determine this nor expand those, instead raising a syntax error.
	10699	The net effect is that if the beginning of the passed-in pattern isn't '{U+'
	10700	or there is no '}', it signals that this \N occurrence means to match a
	10701	non-newline.
	10702
	10703	Only the \N{U+...} form should occur in a character class, for the same
	10704	reason that '.' inside a character class means to just match a period: it
	10705	just doesn't make sense.
	10706
	10707	The function raises an error (via vFAIL), and doesn't return for various
	10708	syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
	10709	success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
	10710	RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
	10711	only possible if node_p is non-NULL.
	10712
	10713
	10714	If <valuep> is non-null, it means the caller can accept an input sequence
	10715	consisting of a just a single code point; <*valuep> is set to that value
	10716	if the input is such.
	10717
	10718	If <node_p> is non-null it signifies that the caller can accept any other
	10719	legal sequence (i.e., one that isn't just a single code point). <*node_p>
	10720	is set as follows:
	10721	1) \N means not-a-NL: points to a newly created REG_ANY node;
	10722	2) \N{}: points to a new NOTHING node;
	10723	3) otherwise: points to a new EXACT node containing the resolved
	10724	string.
	10725	Note that FALSE is returned for single code point sequences if <valuep> is
	10726	null.
	10727	*/
	10728
	10729	char * endbrace; /* '}' following the name */
	10730	char* p;
	10731	char endchar; / Points to '.' or '}' ending cur char in the input
	10732	stream */
	10733	bool has_multiple_chars; /* true if the input stream contains a sequence of
	10734	more than one character */
	10735
	10736	GET_RE_DEBUG_FLAGS_DECL;
	10737
	10738	PERL_ARGS_ASSERT_GROK_BSLASH_N;
	10739
	10740	GET_RE_DEBUG_FLAGS;
	10741
	10742	assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
	10743
	10744	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	10745	* modifier. The other meaning does not, so use a temporary until we find
	10746	* out which we are being called with */
	10747	p = (RExC_flags & RXf_PMf_EXTENDED)
	10748	? regpatws(pRExC_state, RExC_parse,
	10749	TRUE) /* means recognize comments */
	10750	: RExC_parse;
	10751
	10752	/* Disambiguate between \N meaning a named character versus \N meaning
	10753	* [^\n]. The former is assumed when it can't be the latter. */
	10754	if (*p != '{' \|\| regcurly(p)) {
	10755	RExC_parse = p;
	10756	if (! node_p) {
	10757	/* no bare \N allowed in a charclass */
	10758	if (in_char_class) {
	10759	vFAIL("\\N in a character class must be a named character: \\N{...}");
	10760	}
	10761	return FALSE;
	10762	}
	10763	RExC_parse--; /* Need to back off so nextchar() doesn't skip the
	10764	current char */
	10765	nextchar(pRExC_state);
	10766	*node_p = reg_node(pRExC_state, REG_ANY);
	10767	*flagp \|= HASWIDTH\|SIMPLE;
	10768	RExC_naughty++;
	10769	Set_Node_Length(node_p, 1); / MJD */
	10770	return TRUE;
	10771	}
	10772
	10773	/* Here, we have decided it should be a named character or sequence */
	10774
	10775	/* The test above made sure that the next real character is a '{', but
	10776	* under the /x modifier, it could be separated by space (or a comment and
	10777	* \n) and this is not allowed (for consistency with \x{...} and the
	10778	* tokenizer handling of \N{NAME}). */
	10779	if (*RExC_parse != '{') {
	10780	vFAIL("Missing braces on \\N{}");
	10781	}
	10782
	10783	RExC_parse++; /* Skip past the '{' */
	10784
	10785	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	10786	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	10787	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
	10788	*/
	10789	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
	10790	*/
	10791	{
	10792	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	10793	vFAIL("\\N{NAME} must be resolved by the lexer");
	10794	}
	10795
	10796	if (endbrace == RExC_parse) { /* empty: \N{} */
	10797	bool ret = TRUE;
	10798	if (node_p) {
	10799	*node_p = reg_node(pRExC_state,NOTHING);
	10800	}
	10801	else if (in_char_class) {
	10802	if (SIZE_ONLY && in_char_class) {
	10803	if (strict) {
	10804	RExC_parse++; /* Position after the "}" */
	10805	vFAIL("Zero length \\N{}");
	10806	}
	10807	else {
	10808	ckWARNreg(RExC_parse,
	10809	"Ignoring zero length \\N{} in character class");
	10810	}
	10811	}
	10812	ret = FALSE;
	10813	}
	10814	else {
	10815	return FALSE;
	10816	}
	10817	nextchar(pRExC_state);
	10818	return ret;
	10819	}
	10820
	10821	RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
	10822	RExC_parse += 2; /* Skip past the 'U+' */
	10823
	10824	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	10825
	10826	/* Code points are separated by dots. If none, there is only one code
	10827	* point, and is terminated by the brace */
	10828	has_multiple_chars = (endchar < endbrace);
	10829
	10830	if (valuep && (! has_multiple_chars \|\| in_char_class)) {
	10831	/* We only pay attention to the first char of
	10832	multichar strings being returned in char classes. I kinda wonder
	10833	if this makes sense as it does change the behaviour
	10834	from earlier versions, OTOH that behaviour was broken
	10835	as well. XXX Solution is to recharacterize as
	10836	[rest-of-class]\|multi1\|multi2... */
	10837
	10838	STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
	10839	I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
	10840	\| PERL_SCAN_DISALLOW_PREFIX
	10841	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	10842
	10843	*valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
	10844
	10845	/* The tokenizer should have guaranteed validity, but it's possible to
	10846	* bypass it by using single quoting, so check */
	10847	if (length_of_hex == 0
	10848	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	10849	{
	10850	RExC_parse += length_of_hex; /* Includes all the valid */
	10851	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	10852	? UTF8SKIP(RExC_parse)
	10853	: 1;
	10854	/* Guard against malformed utf8 */
	10855	if (RExC_parse >= endchar) {
	10856	RExC_parse = endchar;
	10857	}
	10858	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	10859	}
	10860
	10861	if (in_char_class && has_multiple_chars) {
	10862	if (strict) {
	10863	RExC_parse = endbrace;
	10864	vFAIL("\\N{} in character class restricted to one character");
	10865	}
	10866	else {
	10867	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	10868	}
	10869	}
	10870
	10871	RExC_parse = endbrace + 1;
	10872	}
	10873	else if (! node_p \|\| ! has_multiple_chars) {
	10874
	10875	/* Here, the input is legal, but not according to the caller's
	10876	* options. We fail without advancing the parse, so that the
	10877	* caller can try again */
	10878	RExC_parse = p;
	10879	return FALSE;
	10880	}
	10881	else {
	10882
	10883	/* What is done here is to convert this to a sub-pattern of the form
	10884	* (?:\x{char1}\x{char2}...)
	10885	* and then call reg recursively. That way, it retains its atomicness,
	10886	* while not having to worry about special handling that some code
	10887	* points may have. toke.c has converted the original Unicode values
	10888	* to native, so that we can just pass on the hex values unchanged. We
	10889	* do have to set a flag to keep recoding from happening in the
	10890	* recursion */
	10891
	10892	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	10893	STRLEN len;
	10894	char *orig_end = RExC_end;
	10895	I32 flags;
	10896
	10897	while (RExC_parse < endbrace) {
	10898
	10899	/* Convert to notation the rest of the code understands */
	10900	sv_catpv(substitute_parse, "\\x{");
	10901	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	10902	sv_catpv(substitute_parse, "}");
	10903
	10904	/* Point to the beginning of the next character in the sequence. */
	10905	RExC_parse = endchar + 1;
	10906	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	10907	}
	10908	sv_catpv(substitute_parse, ")");
	10909
	10910	RExC_parse = SvPV(substitute_parse, len);
	10911
	10912	/* Don't allow empty number */
	10913	if (len < 8) {
	10914	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	10915	}
	10916	RExC_end = RExC_parse + len;
	10917
	10918	/* The values are Unicode, and therefore not subject to recoding */
	10919	RExC_override_recoding = 1;
	10920
	10921	if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
	10922	if (flags & RESTART_UTF8) {
	10923	*flagp = RESTART_UTF8;
	10924	return FALSE;
	10925	}
	10926	FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
	10927	(UV) flags);
	10928	}
	10929	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	10930
	10931	RExC_parse = endbrace;
	10932	RExC_end = orig_end;
	10933	RExC_override_recoding = 0;
	10934
	10935	nextchar(pRExC_state);
	10936	}
	10937
	10938	return TRUE;
	10939	}
	10940
	10941
	10942	/*
	10943	* reg_recode
	10944	*
	10945	* It returns the code point in utf8 for the value in *encp.
	10946	* value: a code value in the source encoding
	10947	* encp: a pointer to an Encode object
	10948	*
	10949	* If the result from Encode is not a single character,
	10950	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	10951	*/
	10952	STATIC UV
	10953	S_reg_recode(pTHX_ const char value, SV **encp)
	10954	{
	10955	STRLEN numlen = 1;
	10956	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	10957	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	10958	const STRLEN newlen = SvCUR(sv);
	10959	UV uv = UNICODE_REPLACEMENT;
	10960
	10961	PERL_ARGS_ASSERT_REG_RECODE;
	10962
	10963	if (newlen)
	10964	uv = SvUTF8(sv)
	10965	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	10966	: (U8)s;
	10967
	10968	if (!newlen \|\| numlen != newlen) {
	10969	uv = UNICODE_REPLACEMENT;
	10970	*encp = NULL;
	10971	}
	10972	return uv;
	10973	}
	10974
	10975	PERL_STATIC_INLINE U8
	10976	S_compute_EXACTish(RExC_state_t *pRExC_state)
	10977	{
	10978	U8 op;
	10979
	10980	PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
	10981
	10982	if (! FOLD) {
	10983	return EXACT;
	10984	}
	10985
	10986	op = get_regex_charset(RExC_flags);
	10987	if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
	10988	op--; /* /a is same as /u, and map /aa's offset to what /a's would have
	10989	been, so there is no hole */
	10990	}
	10991
	10992	return op + EXACTF;
	10993	}
	10994
	10995	PERL_STATIC_INLINE void
	10996	S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
	10997	regnode node, I32 flagp, STRLEN len, UV code_point,
	10998	bool downgradable)
	10999	{
	11000	/* This knows the details about sizing an EXACTish node, setting flags for
	11001	* it (by setting <*flagp>, and potentially populating it with a single
	11002	* character.
	11003	*
	11004	* If <len> (the length in bytes) is non-zero, this function assumes that
	11005	* the node has already been populated, and just does the sizing. In this
	11006	* case <code_point> should be the final code point that has already been
	11007	* placed into the node. This value will be ignored except that under some
	11008	* circumstances <*flagp> is set based on it.
	11009	*
	11010	* If <len> is zero, the function assumes that the node is to contain only
	11011	* the single character given by <code_point> and calculates what <len>
	11012	* should be. In pass 1, it sizes the node appropriately. In pass 2, it
	11013	* additionally will populate the node's STRING with <code_point> or its
	11014	* fold if folding.
	11015	*
	11016	* In both cases <*flagp> is appropriately set
	11017	*
	11018	* It knows that under FOLD, the Latin Sharp S and UTF characters above
	11019	* 255, must be folded (the former only when the rules indicate it can
	11020	* match 'ss')
	11021	*
	11022	* When it does the populating, it looks at the flag 'downgradable'. If
	11023	* true with a node that folds, it checks if the single code point
	11024	* participates in a fold, and if not downgrades the node to an EXACT.
	11025	* This helps the optimizer */
	11026
	11027	bool len_passed_in = cBOOL(len != 0);
	11028	U8 character[UTF8_MAXBYTES_CASE+1];
	11029
	11030	PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
	11031
	11032	/* Don't bother to check for downgrading in PASS1, as it doesn't make any
	11033	* sizing difference, and is extra work that is thrown away */
	11034	if (downgradable && ! PASS2) {
	11035	downgradable = FALSE;
	11036	}
	11037
	11038	if (! len_passed_in) {
	11039	if (UTF) {
	11040	if (UNI_IS_INVARIANT(code_point)) {
	11041	if (LOC \|\| ! FOLD) { /* /l defers folding until runtime */
	11042	*character = (U8) code_point;
	11043	}
	11044	else { /* Here is /i and not /l (toFOLD() is defined on just
	11045	ASCII, which isn't the same thing as INVARIANT on
	11046	EBCDIC, but it works there, as the extra invariants
	11047	fold to themselves) */
	11048	*character = toFOLD((U8) code_point);
	11049
	11050	/* We can downgrade to an EXACT node if this character
	11051	* isn't a folding one. Note that this assumes that
	11052	* nothing above Latin1 folds to some other invariant than
	11053	* one of these alphabetics; otherwise we would also have
	11054	* to check:
	11055	* && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
	11056	* \|\| ASCII_FOLD_RESTRICTED))
	11057	*/
	11058	if (downgradable && PL_fold[code_point] == code_point) {
	11059	OP(node) = EXACT;
	11060	}
	11061	}
	11062	len = 1;
	11063	}
	11064	else if (FOLD && (! LOC
	11065	\|\| ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
	11066	{ /* Folding, and ok to do so now */
	11067	UV folded = _to_uni_fold_flags(
	11068	code_point,
	11069	character,
	11070	&len,
	11071	FOLD_FLAGS_FULL \| ((ASCII_FOLD_RESTRICTED)
	11072	? FOLD_FLAGS_NOMIX_ASCII
	11073	: 0));
	11074	if (downgradable
	11075	&& folded == code_point
	11076	&& ! _invlist_contains_cp(PL_utf8_foldable, code_point))
	11077	{
	11078	OP(node) = EXACT;
	11079	}
	11080	}
	11081	else if (code_point <= MAX_UTF8_TWO_BYTE) {
	11082
	11083	/* Not folding this cp, and can output it directly */
	11084	*character = UTF8_TWO_BYTE_HI(code_point);
	11085	*(character + 1) = UTF8_TWO_BYTE_LO(code_point);
	11086	len = 2;
	11087	}
	11088	else {
	11089	uvchr_to_utf8( character, code_point);
	11090	len = UTF8SKIP(character);
	11091	}
	11092	} /* Else pattern isn't UTF8. */
	11093	else if (! FOLD) {
	11094	*character = (U8) code_point;
	11095	len = 1;
	11096	} /* Else is folded non-UTF8 */
	11097	else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
	11098
	11099	/* We don't fold any non-UTF8 except possibly the Sharp s (see
	11100	* comments at join_exact()); */
	11101	*character = (U8) code_point;
	11102	len = 1;
	11103
	11104	/* Can turn into an EXACT node if we know the fold at compile time,
	11105	* and it folds to itself and doesn't particpate in other folds */
	11106	if (downgradable
	11107	&& ! LOC
	11108	&& PL_fold_latin1[code_point] == code_point
	11109	&& (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
	11110	\|\| (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
	11111	{
	11112	OP(node) = EXACT;
	11113	}
	11114	} /* else is Sharp s. May need to fold it */
	11115	else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
	11116	*character = 's';
	11117	*(character + 1) = 's';
	11118	len = 2;
	11119	}
	11120	else {
	11121	*character = LATIN_SMALL_LETTER_SHARP_S;
	11122	len = 1;
	11123	}
	11124	}
	11125
	11126	if (SIZE_ONLY) {
	11127	RExC_size += STR_SZ(len);
	11128	}
	11129	else {
	11130	RExC_emit += STR_SZ(len);
	11131	STR_LEN(node) = len;
	11132	if (! len_passed_in) {
	11133	Copy((char *) character, STRING(node), len, char);
	11134	}
	11135	}
	11136
	11137	*flagp \|= HASWIDTH;
	11138
	11139	/* A single character node is SIMPLE, except for the special-cased SHARP S
	11140	* under /di. */
	11141	if ((len == 1 \|\| (UTF && len == UNISKIP(code_point)))
	11142	&& (code_point != LATIN_SMALL_LETTER_SHARP_S
	11143	\|\| ! FOLD \|\| ! DEPENDS_SEMANTICS))
	11144	{
	11145	*flagp \|= SIMPLE;
	11146	}
	11147
	11148	/* The OP may not be well defined in PASS1 */
	11149	if (PASS2 && OP(node) == EXACTFL) {
	11150	RExC_contains_locale = 1;
	11151	}
	11152	}
	11153
	11154
	11155	/* Parse backref decimal value, unless it's too big to sensibly be a backref,
	11156	* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
	11157
	11158	static I32
	11159	S_backref_value(char *p)
	11160	{
	11161	const char* endptr;
	11162	UV val = grok_atou(p, &endptr);
	11163	if (endptr == p \|\| endptr == NULL \|\| val > I32_MAX)
	11164	return I32_MAX;
	11165	return (I32)val;
	11166	}
	11167
	11168
	11169	/*
	11170	- regatom - the lowest level
	11171
	11172	Try to identify anything special at the start of the pattern. If there
	11173	is, then handle it as required. This may involve generating a single regop,
	11174	such as for an assertion; or it may involve recursing, such as to
	11175	handle a () structure.
	11176
	11177	If the string doesn't start with something special then we gobble up
	11178	as much literal text as we can.
	11179
	11180	Once we have been able to handle whatever type of thing started the
	11181	sequence, we return.
	11182
	11183	Note: we have to be careful with escapes, as they can be both literal
	11184	and special, and in the case of \10 and friends, context determines which.
	11185
	11186	A summary of the code structure is:
	11187
	11188	switch (first_byte) {
	11189	cases for each special:
	11190	handle this special;
	11191	break;
	11192	case '\\':
	11193	switch (2nd byte) {
	11194	cases for each unambiguous special:
	11195	handle this special;
	11196	break;
	11197	cases for each ambigous special/literal:
	11198	disambiguate;
	11199	if (special) handle here
	11200	else goto defchar;
	11201	default: // unambiguously literal:
	11202	goto defchar;
	11203	}
	11204	default: // is a literal char
	11205	// FALL THROUGH
	11206	defchar:
	11207	create EXACTish node for literal;
	11208	while (more input and node isn't full) {
	11209	switch (input_byte) {
	11210	cases for each special;
	11211	make sure parse pointer is set so that the next call to
	11212	regatom will see this special first
	11213	goto loopdone; // EXACTish node terminated by prev. char
	11214	default:
	11215	append char to EXACTISH node;
	11216	}
	11217	get next input byte;
	11218	}
	11219	loopdone:
	11220	}
	11221	return the generated node;
	11222
	11223	Specifically there are two separate switches for handling
	11224	escape sequences, with the one for handling literal escapes requiring
	11225	a dummy entry for all of the special escapes that are actually handled
	11226	by the other.
	11227
	11228	Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
	11229	TRYAGAIN.
	11230	Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	11231	restarted.
	11232	Otherwise does not return NULL.
	11233	*/
	11234
	11235	STATIC regnode *
	11236	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	11237	{
	11238	regnode *ret = NULL;
	11239	I32 flags = 0;
	11240	char *parse_start = RExC_parse;
	11241	U8 op;
	11242	int invert = 0;
	11243	U8 arg;
	11244
	11245	GET_RE_DEBUG_FLAGS_DECL;
	11246
	11247	flagp = WORST; / Tentatively. */
	11248
	11249	DEBUG_PARSE("atom");
	11250
	11251	PERL_ARGS_ASSERT_REGATOM;
	11252
	11253	tryagain:
	11254	switch ((U8)*RExC_parse) {
	11255	case '^':
	11256	RExC_seen_zerolen++;
	11257	nextchar(pRExC_state);
	11258	if (RExC_flags & RXf_PMf_MULTILINE)
	11259	ret = reg_node(pRExC_state, MBOL);
	11260	else if (RExC_flags & RXf_PMf_SINGLELINE)
	11261	ret = reg_node(pRExC_state, SBOL);
	11262	else
	11263	ret = reg_node(pRExC_state, BOL);
	11264	Set_Node_Length(ret, 1); /* MJD */
	11265	break;
	11266	case '$':
	11267	nextchar(pRExC_state);
	11268	if (*RExC_parse)
	11269	RExC_seen_zerolen++;
	11270	if (RExC_flags & RXf_PMf_MULTILINE)
	11271	ret = reg_node(pRExC_state, MEOL);
	11272	else if (RExC_flags & RXf_PMf_SINGLELINE)
	11273	ret = reg_node(pRExC_state, SEOL);
	11274	else
	11275	ret = reg_node(pRExC_state, EOL);
	11276	Set_Node_Length(ret, 1); /* MJD */
	11277	break;
	11278	case '.':
	11279	nextchar(pRExC_state);
	11280	if (RExC_flags & RXf_PMf_SINGLELINE)
	11281	ret = reg_node(pRExC_state, SANY);
	11282	else
	11283	ret = reg_node(pRExC_state, REG_ANY);
	11284	*flagp \|= HASWIDTH\|SIMPLE;
	11285	RExC_naughty++;
	11286	Set_Node_Length(ret, 1); /* MJD */
	11287	break;
	11288	case '[':
	11289	{
	11290	char * const oregcomp_parse = ++RExC_parse;
	11291	ret = regclass(pRExC_state, flagp,depth+1,
	11292	FALSE, /* means parse the whole char class */
	11293	TRUE, /* allow multi-char folds */
	11294	FALSE, /* don't silence non-portable warnings. */
	11295	NULL);
	11296	if (*RExC_parse != ']') {
	11297	RExC_parse = oregcomp_parse;
	11298	vFAIL("Unmatched [");
	11299	}
	11300	if (ret == NULL) {
	11301	if (*flagp & RESTART_UTF8)
	11302	return NULL;
	11303	FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
	11304	(UV) *flagp);
	11305	}
	11306	nextchar(pRExC_state);
	11307	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	11308	break;
	11309	}
	11310	case '(':
	11311	nextchar(pRExC_state);
	11312	ret = reg(pRExC_state, 2, &flags,depth+1);
	11313	if (ret == NULL) {
	11314	if (flags & TRYAGAIN) {
	11315	if (RExC_parse == RExC_end) {
	11316	/* Make parent create an empty node if needed. */
	11317	*flagp \|= TRYAGAIN;
	11318	return(NULL);
	11319	}
	11320	goto tryagain;
	11321	}
	11322	if (flags & RESTART_UTF8) {
	11323	*flagp = RESTART_UTF8;
	11324	return NULL;
	11325	}
	11326	FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
	11327	(UV) flags);
	11328	}
	11329	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	11330	break;
	11331	case '\|':
	11332	case ')':
	11333	if (flags & TRYAGAIN) {
	11334	*flagp \|= TRYAGAIN;
	11335	return NULL;
	11336	}
	11337	vFAIL("Internal urp");
	11338	/* Supposed to be caught earlier. */
	11339	break;
	11340	case '?':
	11341	case '+':
	11342	case '*':
	11343	RExC_parse++;
	11344	vFAIL("Quantifier follows nothing");
	11345	break;
	11346	case '\\':
	11347	/* Special Escapes
	11348
	11349	This switch handles escape sequences that resolve to some kind
	11350	of special regop and not to literal text. Escape sequnces that
	11351	resolve to literal text are handled below in the switch marked
	11352	"Literal Escapes".
	11353
	11354	Every entry in this switch must have a corresponding entry
	11355	in the literal escape switch. However, the opposite is not
	11356	required, as the default for this switch is to jump to the
	11357	literal text handling code.
	11358	*/
	11359	switch ((U8)*++RExC_parse) {
	11360	/* Special Escapes */
	11361	case 'A':
	11362	RExC_seen_zerolen++;
	11363	ret = reg_node(pRExC_state, SBOL);
	11364	*flagp \|= SIMPLE;
	11365	goto finish_meta_pat;
	11366	case 'G':
	11367	ret = reg_node(pRExC_state, GPOS);
	11368	RExC_seen \|= REG_GPOS_SEEN;
	11369	*flagp \|= SIMPLE;
	11370	goto finish_meta_pat;
	11371	case 'K':
	11372	RExC_seen_zerolen++;
	11373	ret = reg_node(pRExC_state, KEEPS);
	11374	*flagp \|= SIMPLE;
	11375	/* XXX:dmq : disabling in-place substitution seems to
	11376	* be necessary here to avoid cases of memory corruption, as
	11377	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	11378	*/
	11379	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	11380	goto finish_meta_pat;
	11381	case 'Z':
	11382	ret = reg_node(pRExC_state, SEOL);
	11383	*flagp \|= SIMPLE;
	11384	RExC_seen_zerolen++; /* Do not optimize RE away */
	11385	goto finish_meta_pat;
	11386	case 'z':
	11387	ret = reg_node(pRExC_state, EOS);
	11388	*flagp \|= SIMPLE;
	11389	RExC_seen_zerolen++; /* Do not optimize RE away */
	11390	goto finish_meta_pat;
	11391	case 'C':
	11392	ret = reg_node(pRExC_state, CANY);
	11393	RExC_seen \|= REG_CANY_SEEN;
	11394	*flagp \|= HASWIDTH\|SIMPLE;
	11395	if (SIZE_ONLY) {
	11396	ckWARNdep(RExC_parse+1, "\\C is deprecated");
	11397	}
	11398	goto finish_meta_pat;
	11399	case 'X':
	11400	ret = reg_node(pRExC_state, CLUMP);
	11401	*flagp \|= HASWIDTH;
	11402	goto finish_meta_pat;
	11403
	11404	case 'W':
	11405	invert = 1;
	11406	/* FALLTHROUGH */
	11407	case 'w':
	11408	arg = ANYOF_WORDCHAR;
	11409	goto join_posix;
	11410
	11411	case 'b':
	11412	RExC_seen_zerolen++;
	11413	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	11414	op = BOUND + get_regex_charset(RExC_flags);
	11415	if (op > BOUNDA) { /* /aa is same as /a */
	11416	op = BOUNDA;
	11417	}
	11418	else if (op == BOUNDL) {
	11419	RExC_contains_locale = 1;
	11420	}
	11421	ret = reg_node(pRExC_state, op);
	11422	FLAGS(ret) = get_regex_charset(RExC_flags);
	11423	*flagp \|= SIMPLE;
	11424	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	11425	/* diag_listed_as: Use "%s" instead of "%s" */
	11426	vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
	11427	}
	11428	goto finish_meta_pat;
	11429	case 'B':
	11430	RExC_seen_zerolen++;
	11431	RExC_seen \|= REG_LOOKBEHIND_SEEN;
	11432	op = NBOUND + get_regex_charset(RExC_flags);
	11433	if (op > NBOUNDA) { /* /aa is same as /a */
	11434	op = NBOUNDA;
	11435	}
	11436	else if (op == NBOUNDL) {
	11437	RExC_contains_locale = 1;
	11438	}
	11439	ret = reg_node(pRExC_state, op);
	11440	FLAGS(ret) = get_regex_charset(RExC_flags);
	11441	*flagp \|= SIMPLE;
	11442	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	11443	/* diag_listed_as: Use "%s" instead of "%s" */
	11444	vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
	11445	}
	11446	goto finish_meta_pat;
	11447
	11448	case 'D':
	11449	invert = 1;
	11450	/* FALLTHROUGH */
	11451	case 'd':
	11452	arg = ANYOF_DIGIT;
	11453	goto join_posix;
	11454
	11455	case 'R':
	11456	ret = reg_node(pRExC_state, LNBREAK);
	11457	*flagp \|= HASWIDTH\|SIMPLE;
	11458	goto finish_meta_pat;
	11459
	11460	case 'H':
	11461	invert = 1;
	11462	/* FALLTHROUGH */
	11463	case 'h':
	11464	arg = ANYOF_BLANK;
	11465	op = POSIXU;
	11466	goto join_posix_op_known;
	11467
	11468	case 'V':
	11469	invert = 1;
	11470	/* FALLTHROUGH */
	11471	case 'v':
	11472	arg = ANYOF_VERTWS;
	11473	op = POSIXU;
	11474	goto join_posix_op_known;
	11475
	11476	case 'S':
	11477	invert = 1;
	11478	/* FALLTHROUGH */
	11479	case 's':
	11480	arg = ANYOF_SPACE;
	11481
	11482	join_posix:
	11483
	11484	op = POSIXD + get_regex_charset(RExC_flags);
	11485	if (op > POSIXA) { /* /aa is same as /a */
	11486	op = POSIXA;
	11487	}
	11488	else if (op == POSIXL) {
	11489	RExC_contains_locale = 1;
	11490	}
	11491
	11492	join_posix_op_known:
	11493
	11494	if (invert) {
	11495	op += NPOSIXD - POSIXD;
	11496	}
	11497
	11498	ret = reg_node(pRExC_state, op);
	11499	if (! SIZE_ONLY) {
	11500	FLAGS(ret) = namedclass_to_classnum(arg);
	11501	}
	11502
	11503	*flagp \|= HASWIDTH\|SIMPLE;
	11504	/* FALLTHROUGH */
	11505
	11506	finish_meta_pat:
	11507	nextchar(pRExC_state);
	11508	Set_Node_Length(ret, 2); /* MJD */
	11509	break;
	11510	case 'p':
	11511	case 'P':
	11512	{
	11513	#ifdef DEBUGGING
	11514	char* parse_start = RExC_parse - 2;
	11515	#endif
	11516
	11517	RExC_parse--;
	11518
	11519	ret = regclass(pRExC_state, flagp,depth+1,
	11520	TRUE, /* means just parse this element */
	11521	FALSE, /* don't allow multi-char folds */
	11522	FALSE, /* don't silence non-portable warnings.
	11523	It would be a bug if these returned
	11524	non-portables */
	11525	NULL);
	11526	/* regclass() can only return RESTART_UTF8 if multi-char folds
	11527	are allowed. */
	11528	if (!ret)
	11529	FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
	11530	(UV) *flagp);
	11531
	11532	RExC_parse--;
	11533
	11534	Set_Node_Offset(ret, parse_start + 2);
	11535	Set_Node_Cur_Length(ret, parse_start);
	11536	nextchar(pRExC_state);
	11537	}
	11538	break;
	11539	case 'N':
	11540	/* Handle \N and \N{NAME} with multiple code points here and not
	11541	* below because it can be multicharacter. join_exact() will join
	11542	* them up later on. Also this makes sure that things like
	11543	* /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
	11544	* The options to the grok function call causes it to fail if the
	11545	* sequence is just a single code point. We then go treat it as
	11546	* just another character in the current EXACT node, and hence it
	11547	* gets uniform treatment with all the other characters. The
	11548	* special treatment for quantifiers is not needed for such single
	11549	* character sequences */
	11550	++RExC_parse;
	11551	if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
	11552	FALSE /* not strict */ )) {
	11553	if (*flagp & RESTART_UTF8)
	11554	return NULL;
	11555	RExC_parse--;
	11556	goto defchar;
	11557	}
	11558	break;
	11559	case 'k': /* Handle \k<NAME> and \k'NAME' */
	11560	parse_named_seq:
	11561	{
	11562	char ch= RExC_parse[1];
	11563	if (ch != '<' && ch != '\'' && ch != '{') {
	11564	RExC_parse++;
	11565	/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
	11566	vFAIL2("Sequence %.2s... not terminated",parse_start);
	11567	} else {
	11568	/* this pretty much dupes the code for (?P=...) in reg(), if
	11569	you change this make sure you change that */
	11570	char* name_start = (RExC_parse += 2);
	11571	U32 num = 0;
	11572	SV *sv_dat = reg_scan_name(pRExC_state,
	11573	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	11574	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	11575	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	11576	/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
	11577	vFAIL2("Sequence %.3s... not terminated",parse_start);
	11578
	11579	if (!SIZE_ONLY) {
	11580	num = add_data( pRExC_state, STR_WITH_LEN("S"));
	11581	RExC_rxi->data->data[num]=(void*)sv_dat;
	11582	SvREFCNT_inc_simple_void(sv_dat);
	11583	}
	11584
	11585	RExC_sawback = 1;
	11586	ret = reganode(pRExC_state,
	11587	((! FOLD)
	11588	? NREF
	11589	: (ASCII_FOLD_RESTRICTED)
	11590	? NREFFA
	11591	: (AT_LEAST_UNI_SEMANTICS)
	11592	? NREFFU
	11593	: (LOC)
	11594	? NREFFL
	11595	: NREFF),
	11596	num);
	11597	*flagp \|= HASWIDTH;
	11598
	11599	/* override incorrect value set in reganode MJD */
	11600	Set_Node_Offset(ret, parse_start+1);
	11601	Set_Node_Cur_Length(ret, parse_start);
	11602	nextchar(pRExC_state);
	11603
	11604	}
	11605	break;
	11606	}
	11607	case 'g':
	11608	case '1': case '2': case '3': case '4':
	11609	case '5': case '6': case '7': case '8': case '9':
	11610	{
	11611	I32 num;
	11612	bool hasbrace = 0;
	11613
	11614	if (*RExC_parse == 'g') {
	11615	bool isrel = 0;
	11616
	11617	RExC_parse++;
	11618	if (*RExC_parse == '{') {
	11619	RExC_parse++;
	11620	hasbrace = 1;
	11621	}
	11622	if (*RExC_parse == '-') {
	11623	RExC_parse++;
	11624	isrel = 1;
	11625	}
	11626	if (hasbrace && !isDIGIT(*RExC_parse)) {
	11627	if (isrel) RExC_parse--;
	11628	RExC_parse -= 2;
	11629	goto parse_named_seq;
	11630	}
	11631
	11632	num = S_backref_value(RExC_parse);
	11633	if (num == 0)
	11634	vFAIL("Reference to invalid group 0");
	11635	else if (num == I32_MAX) {
	11636	if (isDIGIT(*RExC_parse))
	11637	vFAIL("Reference to nonexistent group");
	11638	else
	11639	vFAIL("Unterminated \\g... pattern");
	11640	}
	11641
	11642	if (isrel) {
	11643	num = RExC_npar - num;
	11644	if (num < 1)
	11645	vFAIL("Reference to nonexistent or unclosed group");
	11646	}
	11647	}
	11648	else {
	11649	num = S_backref_value(RExC_parse);
	11650	/* bare \NNN might be backref or octal - if it is larger than or equal
	11651	* RExC_npar then it is assumed to be and octal escape.
	11652	* Note RExC_npar is +1 from the actual number of parens*/
	11653	if (num == I32_MAX \|\| (num > 9 && num >= RExC_npar
	11654	&& RExC_parse != '8' && RExC_parse != '9'))
	11655	{
	11656	/* Probably a character specified in octal, e.g. \35 */
	11657	goto defchar;
	11658	}
	11659	}
	11660
	11661	/* at this point RExC_parse definitely points to a backref
	11662	* number */
	11663	{
	11664	#ifdef RE_TRACK_PATTERN_OFFSETS
	11665	char * const parse_start = RExC_parse - 1; /* MJD */
	11666	#endif
	11667	while (isDIGIT(*RExC_parse))
	11668	RExC_parse++;
	11669	if (hasbrace) {
	11670	if (*RExC_parse != '}')
	11671	vFAIL("Unterminated \\g{...} pattern");
	11672	RExC_parse++;
	11673	}
	11674	if (!SIZE_ONLY) {
	11675	if (num > (I32)RExC_rx->nparens)
	11676	vFAIL("Reference to nonexistent group");
	11677	}
	11678	RExC_sawback = 1;
	11679	ret = reganode(pRExC_state,
	11680	((! FOLD)
	11681	? REF
	11682	: (ASCII_FOLD_RESTRICTED)
	11683	? REFFA
	11684	: (AT_LEAST_UNI_SEMANTICS)
	11685	? REFFU
	11686	: (LOC)
	11687	? REFFL
	11688	: REFF),
	11689	num);
	11690	*flagp \|= HASWIDTH;
	11691
	11692	/* override incorrect value set in reganode MJD */
	11693	Set_Node_Offset(ret, parse_start+1);
	11694	Set_Node_Cur_Length(ret, parse_start);
	11695	RExC_parse--;
	11696	nextchar(pRExC_state);
	11697	}
	11698	}
	11699	break;
	11700	case '\0':
	11701	if (RExC_parse >= RExC_end)
	11702	FAIL("Trailing \\");
	11703	/* FALLTHROUGH */
	11704	default:
	11705	/* Do not generate "unrecognized" warnings here, we fall
	11706	back into the quick-grab loop below */
	11707	parse_start--;
	11708	goto defchar;
	11709	}
	11710	break;
	11711
	11712	case '#':
	11713	if (RExC_flags & RXf_PMf_EXTENDED) {
	11714	RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
	11715	if (RExC_parse < RExC_end)
	11716	goto tryagain;
	11717	}
	11718	/* FALLTHROUGH */
	11719
	11720	default:
	11721
	11722	parse_start = RExC_parse - 1;
	11723
	11724	RExC_parse++;
	11725
	11726	defchar: {
	11727	STRLEN len = 0;
	11728	UV ender = 0;
	11729	char *p;
	11730	char *s;
	11731	#define MAX_NODE_STRING_SIZE 127
	11732	char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
	11733	char *s0;
	11734	U8 upper_parse = MAX_NODE_STRING_SIZE;
	11735	U8 node_type = compute_EXACTish(pRExC_state);
	11736	bool next_is_quantifier;
	11737	char * oldp = NULL;
	11738
	11739	/* We can convert EXACTF nodes to EXACTFU if they contain only
	11740	* characters that match identically regardless of the target
	11741	* string's UTF8ness. The reason to do this is that EXACTF is not
	11742	* trie-able, EXACTFU is.
	11743	*
	11744	* Similarly, we can convert EXACTFL nodes to EXACTFU if they
	11745	* contain only above-Latin1 characters (hence must be in UTF8),
	11746	* which don't participate in folds with Latin1-range characters,
	11747	* as the latter's folds aren't known until runtime. (We don't
	11748	* need to figure this out until pass 2) */
	11749	bool maybe_exactfu = PASS2
	11750	&& (node_type == EXACTF \|\| node_type == EXACTFL);
	11751
	11752	/* If a folding node contains only code points that don't
	11753	* participate in folds, it can be changed into an EXACT node,
	11754	* which allows the optimizer more things to look for */
	11755	bool maybe_exact;
	11756
	11757	ret = reg_node(pRExC_state, node_type);
	11758
	11759	/* In pass1, folded, we use a temporary buffer instead of the
	11760	* actual node, as the node doesn't exist yet */
	11761	s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
	11762
	11763	s0 = s;
	11764
	11765	reparse:
	11766
	11767	/* We do the EXACTFish to EXACT node only if folding. (And we
	11768	* don't need to figure this out until pass 2) */
	11769	maybe_exact = FOLD && PASS2;
	11770
	11771	/* XXX The node can hold up to 255 bytes, yet this only goes to
	11772	* 127. I (khw) do not know why. Keeping it somewhat less than
	11773	* 255 allows us to not have to worry about overflow due to
	11774	* converting to utf8 and fold expansion, but that value is
	11775	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	11776	* split up by this limit into a single one using the real max of
	11777	* 255. Even at 127, this breaks under rare circumstances. If
	11778	* folding, we do not want to split a node at a character that is a
	11779	* non-final in a multi-char fold, as an input string could just
	11780	* happen to want to match across the node boundary. The join
	11781	* would solve that problem if the join actually happens. But a
	11782	* series of more than two nodes in a row each of 127 would cause
	11783	* the first join to succeed to get to 254, but then there wouldn't
	11784	* be room for the next one, which could at be one of those split
	11785	* multi-char folds. I don't know of any fool-proof solution. One
	11786	* could back off to end with only a code point that isn't such a
	11787	* non-final, but it is possible for there not to be any in the
	11788	* entire node. */
	11789	for (p = RExC_parse - 1;
	11790	len < upper_parse && p < RExC_end;
	11791	len++)
	11792	{
	11793	oldp = p;
	11794
	11795	if (RExC_flags & RXf_PMf_EXTENDED)
	11796	p = regpatws(pRExC_state, p,
	11797	TRUE); /* means recognize comments */
	11798	switch ((U8)*p) {
	11799	case '^':
	11800	case '$':
	11801	case '.':
	11802	case '[':
	11803	case '(':
	11804	case ')':
	11805	case '\|':
	11806	goto loopdone;
	11807	case '\\':
	11808	/* Literal Escapes Switch
	11809
	11810	This switch is meant to handle escape sequences that
	11811	resolve to a literal character.
	11812
	11813	Every escape sequence that represents something
	11814	else, like an assertion or a char class, is handled
	11815	in the switch marked 'Special Escapes' above in this
	11816	routine, but also has an entry here as anything that
	11817	isn't explicitly mentioned here will be treated as
	11818	an unescaped equivalent literal.
	11819	*/
	11820
	11821	switch ((U8)*++p) {
	11822	/* These are all the special escapes. */
	11823	case 'A': /* Start assertion */
	11824	case 'b': case 'B': /* Word-boundary assertion*/
	11825	case 'C': /* Single char !DANGEROUS! */
	11826	case 'd': case 'D': /* digit class */
	11827	case 'g': case 'G': /* generic-backref, pos assertion */
	11828	case 'h': case 'H': /* HORIZWS */
	11829	case 'k': case 'K': /* named backref, keep marker */
	11830	case 'p': case 'P': /* Unicode property */
	11831	case 'R': /* LNBREAK */
	11832	case 's': case 'S': /* space class */
	11833	case 'v': case 'V': /* VERTWS */
	11834	case 'w': case 'W': /* word class */
	11835	case 'X': /* eXtended Unicode "combining
	11836	character sequence" */
	11837	case 'z': case 'Z': /* End of line/string assertion */
	11838	--p;
	11839	goto loopdone;
	11840
	11841	/* Anything after here is an escape that resolves to a
	11842	literal. (Except digits, which may or may not)
	11843	*/
	11844	case 'n':
	11845	ender = '\n';
	11846	p++;
	11847	break;
	11848	case 'N': /* Handle a single-code point named character. */
	11849	/* The options cause it to fail if a multiple code
	11850	* point sequence. Handle those in the switch() above
	11851	* */
	11852	RExC_parse = p + 1;
	11853	if (! grok_bslash_N(pRExC_state, NULL, &ender,
	11854	flagp, depth, FALSE,
	11855	FALSE /* not strict */ ))
	11856	{
	11857	if (*flagp & RESTART_UTF8)
	11858	FAIL("panic: grok_bslash_N set RESTART_UTF8");
	11859	RExC_parse = p = oldp;
	11860	goto loopdone;
	11861	}
	11862	p = RExC_parse;
	11863	if (ender > 0xff) {
	11864	REQUIRE_UTF8;
	11865	}
	11866	break;
	11867	case 'r':
	11868	ender = '\r';
	11869	p++;
	11870	break;
	11871	case 't':
	11872	ender = '\t';
	11873	p++;
	11874	break;
	11875	case 'f':
	11876	ender = '\f';
	11877	p++;
	11878	break;
	11879	case 'e':
	11880	ender = ESC_NATIVE;
	11881	p++;
	11882	break;
	11883	case 'a':
	11884	ender = '\a';
	11885	p++;
	11886	break;
	11887	case 'o':
	11888	{
	11889	UV result;
	11890	const char* error_msg;
	11891
	11892	bool valid = grok_bslash_o(&p,
	11893	&result,
	11894	&error_msg,
	11895	TRUE, /* out warnings */
	11896	FALSE, /* not strict */
	11897	TRUE, /* Output warnings
	11898	for non-
	11899	portables */
	11900	UTF);
	11901	if (! valid) {
	11902	RExC_parse = p; /* going to die anyway; point
	11903	to exact spot of failure */
	11904	vFAIL(error_msg);
	11905	}
	11906	ender = result;
	11907	if (PL_encoding && ender < 0x100) {
	11908	goto recode_encoding;
	11909	}
	11910	if (ender > 0xff) {
	11911	REQUIRE_UTF8;
	11912	}
	11913	break;
	11914	}
	11915	case 'x':
	11916	{
	11917	UV result = UV_MAX; /* initialize to erroneous
	11918	value */
	11919	const char* error_msg;
	11920
	11921	bool valid = grok_bslash_x(&p,
	11922	&result,
	11923	&error_msg,
	11924	TRUE, /* out warnings */
	11925	FALSE, /* not strict */
	11926	TRUE, /* Output warnings
	11927	for non-
	11928	portables */
	11929	UTF);
	11930	if (! valid) {
	11931	RExC_parse = p; /* going to die anyway; point
	11932	to exact spot of failure */
	11933	vFAIL(error_msg);
	11934	}
	11935	ender = result;
	11936
	11937	if (PL_encoding && ender < 0x100) {
	11938	goto recode_encoding;
	11939	}
	11940	if (ender > 0xff) {
	11941	REQUIRE_UTF8;
	11942	}
	11943	break;
	11944	}
	11945	case 'c':
	11946	p++;
	11947	ender = grok_bslash_c(*p++, SIZE_ONLY);
	11948	break;
	11949	case '8': case '9': /* must be a backreference */
	11950	--p;
	11951	goto loopdone;
	11952	case '1': case '2': case '3':case '4':
	11953	case '5': case '6': case '7':
	11954	/* When we parse backslash escapes there is ambiguity
	11955	* between backreferences and octal escapes. Any escape
	11956	* from \1 - \9 is a backreference, any multi-digit
	11957	* escape which does not start with 0 and which when
	11958	* evaluated as decimal could refer to an already
	11959	* parsed capture buffer is a backslash. Anything else
	11960	* is octal.
	11961	*
	11962	* Note this implies that \118 could be interpreted as
	11963	* 118 OR as "\11" . "8" depending on whether there
	11964	* were 118 capture buffers defined already in the
	11965	* pattern. */
	11966
	11967	/* NOTE, RExC_npar is 1 more than the actual number of
	11968	* parens we have seen so far, hence the < RExC_npar below. */
	11969
	11970	if ( !isDIGIT(p[1]) \|\| S_backref_value(p) < RExC_npar)
	11971	{ /* Not to be treated as an octal constant, go
	11972	find backref */
	11973	--p;
	11974	goto loopdone;
	11975	}
	11976	/* FALLTHROUGH */
	11977	case '0':
	11978	{
	11979	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	11980	STRLEN numlen = 3;
	11981	ender = grok_oct(p, &numlen, &flags, NULL);
	11982	if (ender > 0xff) {
	11983	REQUIRE_UTF8;
	11984	}
	11985	p += numlen;
	11986	if (SIZE_ONLY /* like \08, \178 */
	11987	&& numlen < 3
	11988	&& p < RExC_end
	11989	&& isDIGIT(*p) && ckWARN(WARN_REGEXP))
	11990	{
	11991	reg_warn_non_literal_string(
	11992	p + 1,
	11993	form_short_octal_warning(p, numlen));
	11994	}
	11995	}
	11996	if (PL_encoding && ender < 0x100)
	11997	goto recode_encoding;
	11998	break;
	11999	recode_encoding:
	12000	if (! RExC_override_recoding) {
	12001	SV* enc = PL_encoding;
	12002	ender = reg_recode((const char)(U8)ender, &enc);
	12003	if (!enc && SIZE_ONLY)
	12004	ckWARNreg(p, "Invalid escape in the specified encoding");
	12005	REQUIRE_UTF8;
	12006	}
	12007	break;
	12008	case '\0':
	12009	if (p >= RExC_end)
	12010	FAIL("Trailing \\");
	12011	/* FALLTHROUGH */
	12012	default:
	12013	if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
	12014	/* Include any { following the alpha to emphasize
	12015	* that it could be part of an escape at some point
	12016	* in the future */
	12017	int len = (isALPHA(p) && (p + 1) == '{') ? 2 : 1;
	12018	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	12019	}
	12020	goto normal_default;
	12021	} /* End of switch on '\' */
	12022	break;
	12023	case '{':
	12024	/* Currently we don't warn when the lbrace is at the start
	12025	* of a construct. This catches it in the middle of a
	12026	* literal string, or when its the first thing after
	12027	* something like "\b" */
	12028	if (! SIZE_ONLY
	12029	&& (len \|\| (p > RExC_start && isALPHA_A(*(p -1)))))
	12030	{
	12031	ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
	12032	}
	12033	/FALLTHROUGH/
	12034	default: /* A literal character */
	12035	normal_default:
	12036	if (UTF8_IS_START(*p) && UTF) {
	12037	STRLEN numlen;
	12038	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	12039	&numlen, UTF8_ALLOW_DEFAULT);
	12040	p += numlen;
	12041	}
	12042	else
	12043	ender = (U8) *p++;
	12044	break;
	12045	} /* End of switch on the literal */
	12046
	12047	/* Here, have looked at the literal character and <ender>
	12048	* contains its ordinal, <p> points to the character after it
	12049	*/
	12050
	12051	if ( RExC_flags & RXf_PMf_EXTENDED)
	12052	p = regpatws(pRExC_state, p,
	12053	TRUE); /* means recognize comments */
	12054
	12055	/* If the next thing is a quantifier, it applies to this
	12056	* character only, which means that this character has to be in
	12057	* its own node and can't just be appended to the string in an
	12058	* existing node, so if there are already other characters in
	12059	* the node, close the node with just them, and set up to do
	12060	* this character again next time through, when it will be the
	12061	* only thing in its new node */
	12062	if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
	12063	{
	12064	p = oldp;
	12065	goto loopdone;
	12066	}
	12067
	12068	if (! FOLD /* The simple case, just append the literal */
	12069	\|\| (LOC /* Also don't fold for tricky chars under /l */
	12070	&& is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
	12071	{
	12072	if (UTF) {
	12073	const STRLEN unilen = reguni(pRExC_state, ender, s);
	12074	if (unilen > 0) {
	12075	s += unilen;
	12076	len += unilen;
	12077	}
	12078
	12079	/* The loop increments <len> each time, as all but this
	12080	* path (and one other) through it add a single byte to
	12081	* the EXACTish node. But this one has changed len to
	12082	* be the correct final value, so subtract one to
	12083	* cancel out the increment that follows */
	12084	len--;
	12085	}
	12086	else {
	12087	REGC((char)ender, s++);
	12088	}
	12089
	12090	/* Can get here if folding only if is one of the /l
	12091	* characters whose fold depends on the locale. The
	12092	* occurrence of any of these indicate that we can't
	12093	* simplify things */
	12094	if (FOLD) {
	12095	maybe_exact = FALSE;
	12096	maybe_exactfu = FALSE;
	12097	}
	12098	}
	12099	else /* FOLD */
	12100	if (! ( UTF
	12101	/* See comments for join_exact() as to why we fold this
	12102	* non-UTF at compile time */
	12103	\|\| (node_type == EXACTFU
	12104	&& ender == LATIN_SMALL_LETTER_SHARP_S)))
	12105	{
	12106	/* Here, are folding and are not UTF-8 encoded; therefore
	12107	* the character must be in the range 0-255, and is not /l
	12108	* (Not /l because we already handled these under /l in
	12109	* is_PROBLEMATIC_LOCALE_FOLD_cp */
	12110	if (IS_IN_SOME_FOLD_L1(ender)) {
	12111	maybe_exact = FALSE;
	12112
	12113	/* See if the character's fold differs between /d and
	12114	* /u. This includes the multi-char fold SHARP S to
	12115	* 'ss' */
	12116	if (maybe_exactfu
	12117	&& (PL_fold[ender] != PL_fold_latin1[ender]
	12118	\|\| ender == LATIN_SMALL_LETTER_SHARP_S
	12119	\|\| (len > 0
	12120	&& isALPHA_FOLD_EQ(ender, 's')
	12121	&& isALPHA_FOLD_EQ(*(s-1), 's'))))
	12122	{
	12123	maybe_exactfu = FALSE;
	12124	}
	12125	}
	12126
	12127	/* Even when folding, we store just the input character, as
	12128	* we have an array that finds its fold quickly */
	12129	*(s++) = (char) ender;
	12130	}
	12131	else { /* FOLD and UTF */
	12132	/* Unlike the non-fold case, we do actually have to
	12133	* calculate the results here in pass 1. This is for two
	12134	* reasons, the folded length may be longer than the
	12135	* unfolded, and we have to calculate how many EXACTish
	12136	* nodes it will take; and we may run out of room in a node
	12137	* in the middle of a potential multi-char fold, and have
	12138	* to back off accordingly. (Hence we can't use REGC for
	12139	* the simple case just below.) */
	12140
	12141	UV folded;
	12142	if (isASCII(ender)) {
	12143	folded = toFOLD(ender);
	12144	*(s)++ = (U8) folded;
	12145	}
	12146	else {
	12147	STRLEN foldlen;
	12148
	12149	folded = _to_uni_fold_flags(
	12150	ender,
	12151	(U8 *) s,
	12152	&foldlen,
	12153	FOLD_FLAGS_FULL \| ((ASCII_FOLD_RESTRICTED)
	12154	? FOLD_FLAGS_NOMIX_ASCII
	12155	: 0));
	12156	s += foldlen;
	12157
	12158	/* The loop increments <len> each time, as all but this
	12159	* path (and one other) through it add a single byte to
	12160	* the EXACTish node. But this one has changed len to
	12161	* be the correct final value, so subtract one to
	12162	* cancel out the increment that follows */
	12163	len += foldlen - 1;
	12164	}
	12165	/* If this node only contains non-folding code points so
	12166	* far, see if this new one is also non-folding */
	12167	if (maybe_exact) {
	12168	if (folded != ender) {
	12169	maybe_exact = FALSE;
	12170	}
	12171	else {
	12172	/* Here the fold is the original; we have to check
	12173	* further to see if anything folds to it */
	12174	if (_invlist_contains_cp(PL_utf8_foldable,
	12175	ender))
	12176	{
	12177	maybe_exact = FALSE;
	12178	}
	12179	}
	12180	}
	12181	ender = folded;
	12182	}
	12183
	12184	if (next_is_quantifier) {
	12185
	12186	/* Here, the next input is a quantifier, and to get here,
	12187	* the current character is the only one in the node.
	12188	* Also, here <len> doesn't include the final byte for this
	12189	* character */
	12190	len++;
	12191	goto loopdone;
	12192	}
	12193
	12194	} /* End of loop through literal characters */
	12195
	12196	/* Here we have either exhausted the input or ran out of room in
	12197	* the node. (If we encountered a character that can't be in the
	12198	* node, transfer is made directly to <loopdone>, and so we
	12199	* wouldn't have fallen off the end of the loop.) In the latter
	12200	* case, we artificially have to split the node into two, because
	12201	* we just don't have enough space to hold everything. This
	12202	* creates a problem if the final character participates in a
	12203	* multi-character fold in the non-final position, as a match that
	12204	* should have occurred won't, due to the way nodes are matched,
	12205	* and our artificial boundary. So back off until we find a non-
	12206	* problematic character -- one that isn't at the beginning or
	12207	* middle of such a fold. (Either it doesn't participate in any
	12208	* folds, or appears only in the final position of all the folds it
	12209	* does participate in.) A better solution with far fewer false
	12210	* positives, and that would fill the nodes more completely, would
	12211	* be to actually have available all the multi-character folds to
	12212	* test against, and to back-off only far enough to be sure that
	12213	* this node isn't ending with a partial one. <upper_parse> is set
	12214	* further below (if we need to reparse the node) to include just
	12215	* up through that final non-problematic character that this code
	12216	* identifies, so when it is set to less than the full node, we can
	12217	* skip the rest of this */
	12218	if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
	12219
	12220	const STRLEN full_len = len;
	12221
	12222	assert(len >= MAX_NODE_STRING_SIZE);
	12223
	12224	/* Here, <s> points to the final byte of the final character.
	12225	* Look backwards through the string until find a non-
	12226	* problematic character */
	12227
	12228	if (! UTF) {
	12229
	12230	/* This has no multi-char folds to non-UTF characters */
	12231	if (ASCII_FOLD_RESTRICTED) {
	12232	goto loopdone;
	12233	}
	12234
	12235	while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
	12236	len = s - s0 + 1;
	12237	}
	12238	else {
	12239	if (! PL_NonL1NonFinalFold) {
	12240	PL_NonL1NonFinalFold = _new_invlist_C_array(
	12241	NonL1_Perl_Non_Final_Folds_invlist);
	12242	}
	12243
	12244	/* Point to the first byte of the final character */
	12245	s = (char ) utf8_hop((U8 ) s, -1);
	12246
	12247	while (s >= s0) { /* Search backwards until find
	12248	non-problematic char */
	12249	if (UTF8_IS_INVARIANT(*s)) {
	12250
	12251	/* There are no ascii characters that participate
	12252	* in multi-char folds under /aa. In EBCDIC, the
	12253	* non-ascii invariants are all control characters,
	12254	* so don't ever participate in any folds. */
	12255	if (ASCII_FOLD_RESTRICTED
	12256	\|\| ! IS_NON_FINAL_FOLD(*s))
	12257	{
	12258	break;
	12259	}
	12260	}
	12261	else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
	12262	if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
	12263	s, (s+1))))
	12264	{
	12265	break;
	12266	}
	12267	}
	12268	else if (! _invlist_contains_cp(
	12269	PL_NonL1NonFinalFold,
	12270	valid_utf8_to_uvchr((U8 *) s, NULL)))
	12271	{
	12272	break;
	12273	}
	12274
	12275	/* Here, the current character is problematic in that
	12276	* it does occur in the non-final position of some
	12277	* fold, so try the character before it, but have to
	12278	* special case the very first byte in the string, so
	12279	* we don't read outside the string */
	12280	s = (s == s0) ? s -1 : (char ) utf8_hop((U8 ) s, -1);
	12281	} /* End of loop backwards through the string */
	12282
	12283	/* If there were only problematic characters in the string,
	12284	* <s> will point to before s0, in which case the length
	12285	* should be 0, otherwise include the length of the
	12286	* non-problematic character just found */
	12287	len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
	12288	}
	12289
	12290	/* Here, have found the final character, if any, that is
	12291	* non-problematic as far as ending the node without splitting
	12292	* it across a potential multi-char fold. <len> contains the
	12293	* number of bytes in the node up-to and including that
	12294	* character, or is 0 if there is no such character, meaning
	12295	* the whole node contains only problematic characters. In
	12296	* this case, give up and just take the node as-is. We can't
	12297	* do any better */
	12298	if (len == 0) {
	12299	len = full_len;
	12300
	12301	/* If the node ends in an 's' we make sure it stays EXACTF,
	12302	* as if it turns into an EXACTFU, it could later get
	12303	* joined with another 's' that would then wrongly match
	12304	* the sharp s */
	12305	if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
	12306	{
	12307	maybe_exactfu = FALSE;
	12308	}
	12309	} else {
	12310
	12311	/* Here, the node does contain some characters that aren't
	12312	* problematic. If one such is the final character in the
	12313	* node, we are done */
	12314	if (len == full_len) {
	12315	goto loopdone;
	12316	}
	12317	else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
	12318
	12319	/* If the final character is problematic, but the
	12320	* penultimate is not, back-off that last character to
	12321	* later start a new node with it */
	12322	p = oldp;
	12323	goto loopdone;
	12324	}
	12325
	12326	/* Here, the final non-problematic character is earlier
	12327	* in the input than the penultimate character. What we do
	12328	* is reparse from the beginning, going up only as far as
	12329	* this final ok one, thus guaranteeing that the node ends
	12330	* in an acceptable character. The reason we reparse is
	12331	* that we know how far in the character is, but we don't
	12332	* know how to correlate its position with the input parse.
	12333	* An alternate implementation would be to build that
	12334	* correlation as we go along during the original parse,
	12335	* but that would entail extra work for every node, whereas
	12336	* this code gets executed only when the string is too
	12337	* large for the node, and the final two characters are
	12338	* problematic, an infrequent occurrence. Yet another
	12339	* possible strategy would be to save the tail of the
	12340	* string, and the next time regatom is called, initialize
	12341	* with that. The problem with this is that unless you
	12342	* back off one more character, you won't be guaranteed
	12343	* regatom will get called again, unless regbranch,
	12344	* regpiece ... are also changed. If you do back off that
	12345	* extra character, so that there is input guaranteed to
	12346	* force calling regatom, you can't handle the case where
	12347	* just the first character in the node is acceptable. I
	12348	* (khw) decided to try this method which doesn't have that
	12349	* pitfall; if performance issues are found, we can do a
	12350	* combination of the current approach plus that one */
	12351	upper_parse = len;
	12352	len = 0;
	12353	s = s0;
	12354	goto reparse;
	12355	}
	12356	} /* End of verifying node ends with an appropriate char */
	12357
	12358	loopdone: /* Jumped to when encounters something that shouldn't be in
	12359	the node */
	12360
	12361	/* I (khw) don't know if you can get here with zero length, but the
	12362	* old code handled this situation by creating a zero-length EXACT
	12363	* node. Might as well be NOTHING instead */
	12364	if (len == 0) {
	12365	OP(ret) = NOTHING;
	12366	}
	12367	else {
	12368	if (FOLD) {
	12369	/* If 'maybe_exact' is still set here, means there are no
	12370	* code points in the node that participate in folds;
	12371	* similarly for 'maybe_exactfu' and code points that match
	12372	* differently depending on UTF8ness of the target string
	12373	* (for /u), or depending on locale for /l */
	12374	if (maybe_exact) {
	12375	OP(ret) = EXACT;
	12376	}
	12377	else if (maybe_exactfu) {
	12378	OP(ret) = EXACTFU;
	12379	}
	12380	}
	12381	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
	12382	FALSE /* Don't look to see if could
	12383	be turned into an EXACT
	12384	node, as we have already
	12385	computed that */
	12386	);
	12387	}
	12388
	12389	RExC_parse = p - 1;
	12390	Set_Node_Cur_Length(ret, parse_start);
	12391	nextchar(pRExC_state);
	12392	{
	12393	/* len is STRLEN which is unsigned, need to copy to signed */
	12394	IV iv = len;
	12395	if (iv < 0)
	12396	vFAIL("Internal disaster");
	12397	}
	12398
	12399	} /* End of label 'defchar:' */
	12400	break;
	12401	} /* End of giant switch on input character */
	12402
	12403	return(ret);
	12404	}
	12405
	12406	STATIC char *
	12407	S_regpatws(RExC_state_t pRExC_state, char p , const bool recognize_comment )
	12408	{
	12409	/* Returns the next non-pattern-white space, non-comment character (the
	12410	* latter only if 'recognize_comment is true) in the string p, which is
	12411	* ended by RExC_end. See also reg_skipcomment */
	12412	const char *e = RExC_end;
	12413
	12414	PERL_ARGS_ASSERT_REGPATWS;
	12415
	12416	while (p < e) {
	12417	STRLEN len;
	12418	if ((len = is_PATWS_safe(p, e, UTF))) {
	12419	p += len;
	12420	}
	12421	else if (recognize_comment && *p == '#') {
	12422	p = reg_skipcomment(pRExC_state, p);
	12423	}
	12424	else
	12425	break;
	12426	}
	12427	return p;
	12428	}
	12429
	12430	STATIC void
	12431	S_populate_ANYOF_from_invlist(pTHX_ regnode node, SV* invlist_ptr)
	12432	{
	12433	/* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
	12434	* sets up the bitmap and any flags, removing those code points from the
	12435	* inversion list, setting it to NULL should it become completely empty */
	12436
	12437	PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
	12438	assert(PL_regkind[OP(node)] == ANYOF);
	12439
	12440	ANYOF_BITMAP_ZERO(node);
	12441	if (*invlist_ptr) {
	12442
	12443	/* This gets set if we actually need to modify things */
	12444	bool change_invlist = FALSE;
	12445
	12446	UV start, end;
	12447
	12448	/* Start looking through invlist_ptr /
	12449	invlist_iterinit(*invlist_ptr);
	12450	while (invlist_iternext(*invlist_ptr, &start, &end)) {
	12451	UV high;
	12452	int i;
	12453
	12454	if (end == UV_MAX && start <= 256) {
	12455	ANYOF_FLAGS(node) \|= ANYOF_ABOVE_LATIN1_ALL;
	12456	}
	12457	else if (end >= 256) {
	12458	ANYOF_FLAGS(node) \|= ANYOF_UTF8;
	12459	}
	12460
	12461	/* Quit if are above what we should change */
	12462	if (start >= NUM_ANYOF_CODE_POINTS) {
	12463	break;
	12464	}
	12465
	12466	change_invlist = TRUE;
	12467
	12468	/* Set all the bits in the range, up to the max that we are doing */
	12469	high = (end < NUM_ANYOF_CODE_POINTS - 1)
	12470	? end
	12471	: NUM_ANYOF_CODE_POINTS - 1;
	12472	for (i = start; i <= (int) high; i++) {
	12473	if (! ANYOF_BITMAP_TEST(node, i)) {
	12474	ANYOF_BITMAP_SET(node, i);
	12475	}
	12476	}
	12477	}
	12478	invlist_iterfinish(*invlist_ptr);
	12479
	12480	/* Done with loop; remove any code points that are in the bitmap from
	12481	* *invlist_ptr; similarly for code points above latin1 if we have a
	12482	* flag to match all of them anyways */
	12483	if (change_invlist) {
	12484	_invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
	12485	}
	12486	if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
	12487	_invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
	12488	}
	12489
	12490	/* If have completely emptied it, remove it completely */
	12491	if (_invlist_len(*invlist_ptr) == 0) {
	12492	SvREFCNT_dec_NN(*invlist_ptr);
	12493	*invlist_ptr = NULL;
	12494	}
	12495	}
	12496	}
	12497
	12498	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	12499	Character classes ([:foo:]) can also be negated ([:^foo:]).
	12500	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	12501	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	12502	but trigger failures because they are currently unimplemented. */
	12503
	12504	#define POSIXCC_DONE(c) ((c) == ':')
	12505	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	12506	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	12507
	12508	PERL_STATIC_INLINE I32
	12509	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
	12510	{
	12511	I32 namedclass = OOB_NAMEDCLASS;
	12512
	12513	PERL_ARGS_ASSERT_REGPPOSIXCC;
	12514
	12515	if (value == '[' && RExC_parse + 1 < RExC_end &&
	12516	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	12517	POSIXCC(UCHARAT(RExC_parse)))
	12518	{
	12519	const char c = UCHARAT(RExC_parse);
	12520	char* const s = RExC_parse++;
	12521
	12522	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	12523	RExC_parse++;
	12524	if (RExC_parse == RExC_end) {
	12525	if (strict) {
	12526
	12527	/* Try to give a better location for the error (than the end of
	12528	* the string) by looking for the matching ']' */
	12529	RExC_parse = s;
	12530	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	12531	RExC_parse++;
	12532	}
	12533	vFAIL2("Unmatched '%c' in POSIX class", c);
	12534	}
	12535	/* Grandfather lone [:, [=, [. */
	12536	RExC_parse = s;
	12537	}
	12538	else {
	12539	const char* const t = RExC_parse++; /* skip over the c */
	12540	assert(*t == c);
	12541
	12542	if (UCHARAT(RExC_parse) == ']') {
	12543	const char *posixcc = s + 1;
	12544	RExC_parse++; /* skip over the ending ] */
	12545
	12546	if (*s == ':') {
	12547	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	12548	const I32 skip = t - posixcc;
	12549
	12550	/* Initially switch on the length of the name. */
	12551	switch (skip) {
	12552	case 4:
	12553	if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
	12554	this is the Perl \w
	12555	*/
	12556	namedclass = ANYOF_WORDCHAR;
	12557	break;
	12558	case 5:
	12559	/* Names all of length 5. */
	12560	/* alnum alpha ascii blank cntrl digit graph lower
	12561	print punct space upper */
	12562	/* Offset 4 gives the best switch position. */
	12563	switch (posixcc[4]) {
	12564	case 'a':
	12565	if (memEQ(posixcc, "alph", 4)) /* alpha */
	12566	namedclass = ANYOF_ALPHA;
	12567	break;
	12568	case 'e':
	12569	if (memEQ(posixcc, "spac", 4)) /* space */
	12570	namedclass = ANYOF_PSXSPC;
	12571	break;
	12572	case 'h':
	12573	if (memEQ(posixcc, "grap", 4)) /* graph */
	12574	namedclass = ANYOF_GRAPH;
	12575	break;
	12576	case 'i':
	12577	if (memEQ(posixcc, "asci", 4)) /* ascii */
	12578	namedclass = ANYOF_ASCII;
	12579	break;
	12580	case 'k':
	12581	if (memEQ(posixcc, "blan", 4)) /* blank */
	12582	namedclass = ANYOF_BLANK;
	12583	break;
	12584	case 'l':
	12585	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	12586	namedclass = ANYOF_CNTRL;
	12587	break;
	12588	case 'm':
	12589	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	12590	namedclass = ANYOF_ALPHANUMERIC;
	12591	break;
	12592	case 'r':
	12593	if (memEQ(posixcc, "lowe", 4)) /* lower */
	12594	namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
	12595	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	12596	namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
	12597	break;
	12598	case 't':
	12599	if (memEQ(posixcc, "digi", 4)) /* digit */
	12600	namedclass = ANYOF_DIGIT;
	12601	else if (memEQ(posixcc, "prin", 4)) /* print */
	12602	namedclass = ANYOF_PRINT;
	12603	else if (memEQ(posixcc, "punc", 4)) /* punct */
	12604	namedclass = ANYOF_PUNCT;
	12605	break;
	12606	}
	12607	break;
	12608	case 6:
	12609	if (memEQ(posixcc, "xdigit", 6))
	12610	namedclass = ANYOF_XDIGIT;
	12611	break;
	12612	}
	12613
	12614	if (namedclass == OOB_NAMEDCLASS)
	12615	vFAIL2utf8f(
	12616	"POSIX class [:%"UTF8f":] unknown",
	12617	UTF8fARG(UTF, t - s - 1, s + 1));
	12618
	12619	/* The #defines are structured so each complement is +1 to
	12620	* the normal one */
	12621	if (complement) {
	12622	namedclass++;
	12623	}
	12624	assert (posixcc[skip] == ':');
	12625	assert (posixcc[skip+1] == ']');
	12626	} else if (!SIZE_ONLY) {
	12627	/* [[=foo=]] and [[.foo.]] are still future. */
	12628
	12629	/* adjust RExC_parse so the warning shows after
	12630	the class closes */
	12631	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	12632	RExC_parse++;
	12633	vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	12634	}
	12635	} else {
	12636	/* Maternal grandfather:
	12637	* "[:" ending in ":" but not in ":]" */
	12638	if (strict) {
	12639	vFAIL("Unmatched '[' in POSIX class");
	12640	}
	12641
	12642	/* Grandfather lone [:, [=, [. */
	12643	RExC_parse = s;
	12644	}
	12645	}
	12646	}
	12647
	12648	return namedclass;
	12649	}
	12650
	12651	STATIC bool
	12652	S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
	12653	{
	12654	/* This applies some heuristics at the current parse position (which should
	12655	* be at a '[') to see if what follows might be intended to be a [:posix:]
	12656	* class. It returns true if it really is a posix class, of course, but it
	12657	* also can return true if it thinks that what was intended was a posix
	12658	* class that didn't quite make it.
	12659	*
	12660	* It will return true for
	12661	* [:alphanumerics:
	12662	* [:alphanumerics] (as long as the ] isn't followed immediately by a
	12663	* ')' indicating the end of the (?[
	12664	* [:any garbage including %^&$ punctuation:]
	12665	*
	12666	* This is designed to be called only from S_handle_regex_sets; it could be
	12667	* easily adapted to be called from the spot at the beginning of regclass()
	12668	* that checks to see in a normal bracketed class if the surrounding []
	12669	* have been omitted ([:word:] instead of [[:word:]]). But doing so would
	12670	* change long-standing behavior, so I (khw) didn't do that */
	12671	char* p = RExC_parse + 1;
	12672	char first_char = *p;
	12673
	12674	PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
	12675
	12676	assert(*(p - 1) == '[');
	12677
	12678	if (! POSIXCC(first_char)) {
	12679	return FALSE;
	12680	}
	12681
	12682	p++;
	12683	while (p < RExC_end && isWORDCHAR(*p)) p++;
	12684
	12685	if (p >= RExC_end) {
	12686	return FALSE;
	12687	}
	12688
	12689	if (p - RExC_parse > 2 /* Got at least 1 word character */
	12690	&& (*p == first_char
	12691	\|\| (p == ']' && p + 1 < RExC_end && (p + 1) != ')')))
	12692	{
	12693	return TRUE;
	12694	}
	12695
	12696	p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
	12697
	12698	return (p
	12699	&& p - RExC_parse > 2 /* [:] evaluates to colon;
	12700	[::] is a bad posix class. */
	12701	&& first_char == *(p - 1));
	12702	}
	12703
	12704	STATIC regnode *
	12705	S_handle_regex_sets(pTHX_ RExC_state_t pRExC_state, SV* return_invlist,
	12706	I32 *flagp, U32 depth,
	12707	char * const oregcomp_parse)
	12708	{
	12709	/* Handle the (?[...]) construct to do set operations */
	12710
	12711	U8 curchar;
	12712	UV start, end; /* End points of code point ranges */
	12713	SV* result_string;
	12714	char save_end, save_parse;
	12715	SV* final;
	12716	STRLEN len;
	12717	regnode* node;
	12718	AV* stack;
	12719	const bool save_fold = FOLD;
	12720
	12721	GET_RE_DEBUG_FLAGS_DECL;
	12722
	12723	PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
	12724
	12725	if (LOC) {
	12726	vFAIL("(?[...]) not valid in locale");
	12727	}
	12728	RExC_uni_semantics = 1;
	12729
	12730	/* This will return only an ANYOF regnode, or (unlikely) something smaller
	12731	* (such as EXACT). Thus we can skip most everything if just sizing. We
	12732	* call regclass to handle '[]' so as to not have to reinvent its parsing
	12733	* rules here (throwing away the size it computes each time). And, we exit
	12734	* upon an unescaped ']' that isn't one ending a regclass. To do both
	12735	* these things, we need to realize that something preceded by a backslash
	12736	* is escaped, so we have to keep track of backslashes */
	12737	if (SIZE_ONLY) {
	12738	UV depth = 0; /* how many nested (?[...]) constructs */
	12739
	12740	Perl_ck_warner_d(aTHX_
	12741	packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
	12742	"The regex_sets feature is experimental" REPORT_LOCATION,
	12743	UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
	12744	UTF8fARG(UTF,
	12745	RExC_end - RExC_start - (RExC_parse - RExC_precomp),
	12746	RExC_precomp + (RExC_parse - RExC_precomp)));
	12747
	12748	while (RExC_parse < RExC_end) {
	12749	SV* current = NULL;
	12750	RExC_parse = regpatws(pRExC_state, RExC_parse,
	12751	TRUE); /* means recognize comments */
	12752	switch (*RExC_parse) {
	12753	case '?':
	12754	if (RExC_parse[1] == '[') depth++, RExC_parse++;
	12755	/* FALLTHROUGH */
	12756	default:
	12757	break;
	12758	case '\\':
	12759	/* Skip the next byte (which could cause us to end up in
	12760	* the middle of a UTF-8 character, but since none of those
	12761	* are confusable with anything we currently handle in this
	12762	* switch (invariants all), it's safe. We'll just hit the
	12763	* default: case next time and keep on incrementing until
	12764	* we find one of the invariants we do handle. */
	12765	RExC_parse++;
	12766	break;
	12767	case '[':
	12768	{
	12769	/* If this looks like it is a [:posix:] class, leave the
	12770	* parse pointer at the '[' to fool regclass() into
	12771	* thinking it is part of a '[[:posix:]]'. That function
	12772	* will use strict checking to force a syntax error if it
	12773	* doesn't work out to a legitimate class */
	12774	bool is_posix_class
	12775	= could_it_be_a_POSIX_class(pRExC_state);
	12776	if (! is_posix_class) {
	12777	RExC_parse++;
	12778	}
	12779
	12780	/* regclass() can only return RESTART_UTF8 if multi-char
	12781	folds are allowed. */
	12782	if (!regclass(pRExC_state, flagp,depth+1,
	12783	is_posix_class, /* parse the whole char
	12784	class only if not a
	12785	posix class */
	12786	FALSE, /* don't allow multi-char folds */
	12787	TRUE, /* silence non-portable warnings. */
	12788	&current))
	12789	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	12790	(UV) *flagp);
	12791
	12792	/* function call leaves parse pointing to the ']', except
	12793	* if we faked it */
	12794	if (is_posix_class) {
	12795	RExC_parse--;
	12796	}
	12797
	12798	SvREFCNT_dec(current); /* In case it returned something */
	12799	break;
	12800	}
	12801
	12802	case ']':
	12803	if (depth--) break;
	12804	RExC_parse++;
	12805	if (RExC_parse < RExC_end
	12806	&& *RExC_parse == ')')
	12807	{
	12808	node = reganode(pRExC_state, ANYOF, 0);
	12809	RExC_size += ANYOF_SKIP;
	12810	nextchar(pRExC_state);
	12811	Set_Node_Length(node,
	12812	RExC_parse - oregcomp_parse + 1); /* MJD */
	12813	return node;
	12814	}
	12815	goto no_close;
	12816	}
	12817	RExC_parse++;
	12818	}
	12819
	12820	no_close:
	12821	FAIL("Syntax error in (?[...])");
	12822	}
	12823
	12824	/* Pass 2 only after this. Everything in this construct is a
	12825	* metacharacter. Operands begin with either a '\' (for an escape
	12826	* sequence), or a '[' for a bracketed character class. Any other
	12827	* character should be an operator, or parenthesis for grouping. Both
	12828	* types of operands are handled by calling regclass() to parse them. It
	12829	* is called with a parameter to indicate to return the computed inversion
	12830	* list. The parsing here is implemented via a stack. Each entry on the
	12831	* stack is a single character representing one of the operators, or the
	12832	* '('; or else a pointer to an operand inversion list. */
	12833
	12834	#define IS_OPERAND(a) (! SvIOK(a))
	12835
	12836	/* The stack starts empty. It is a syntax error if the first thing parsed
	12837	* is a binary operator; everything else is pushed on the stack. When an
	12838	* operand is parsed, the top of the stack is examined. If it is a binary
	12839	* operator, the item before it should be an operand, and both are replaced
	12840	* by the result of doing that operation on the new operand and the one on
	12841	* the stack. Thus a sequence of binary operands is reduced to a single
	12842	* one before the next one is parsed.
	12843	*
	12844	* A unary operator may immediately follow a binary in the input, for
	12845	* example
	12846	* [a] + ! [b]
	12847	* When an operand is parsed and the top of the stack is a unary operator,
	12848	* the operation is performed, and then the stack is rechecked to see if
	12849	* this new operand is part of a binary operation; if so, it is handled as
	12850	* above.
	12851	*
	12852	* A '(' is simply pushed on the stack; it is valid only if the stack is
	12853	* empty, or the top element of the stack is an operator or another '('
	12854	* (for which the parenthesized expression will become an operand). By the
	12855	* time the corresponding ')' is parsed everything in between should have
	12856	* been parsed and evaluated to a single operand (or else is a syntax
	12857	* error), and is handled as a regular operand */
	12858
	12859	sv_2mortal((SV *)(stack = newAV()));
	12860
	12861	while (RExC_parse < RExC_end) {
	12862	I32 top_index = av_tindex(stack);
	12863	SV** top_ptr;
	12864	SV* current = NULL;
	12865
	12866	/* Skip white space */
	12867	RExC_parse = regpatws(pRExC_state, RExC_parse,
	12868	TRUE /* means recognize comments */ );
	12869	if (RExC_parse >= RExC_end) {
	12870	Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
	12871	}
	12872	if ((curchar = UCHARAT(RExC_parse)) == ']') {
	12873	break;
	12874	}
	12875
	12876	switch (curchar) {
	12877
	12878	case '?':
	12879	if (av_tindex(stack) >= 0 /* This makes sure that we can
	12880	safely subtract 1 from
	12881	RExC_parse in the next clause.
	12882	If we have something on the
	12883	stack, we have parsed something
	12884	*/
	12885	&& UCHARAT(RExC_parse - 1) == '('
	12886	&& RExC_parse < RExC_end)
	12887	{
	12888	/* If is a '(?', could be an embedded '(?flags:(?[...])'.
	12889	* This happens when we have some thing like
	12890	*
	12891	* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
	12892	* ...
	12893	* qr/(?[ \p{Digit} & $thai_or_lao ])/;
	12894	*
	12895	* Here we would be handling the interpolated
	12896	* '$thai_or_lao'. We handle this by a recursive call to
	12897	* ourselves which returns the inversion list the
	12898	* interpolated expression evaluates to. We use the flags
	12899	* from the interpolated pattern. */
	12900	U32 save_flags = RExC_flags;
	12901	const char * const save_parse = ++RExC_parse;
	12902
	12903	parse_lparen_question_flags(pRExC_state);
	12904
	12905	if (RExC_parse == save_parse /* Makes sure there was at
	12906	least one flag (or this
	12907	embedding wasn't compiled)
	12908	*/
	12909	\|\| RExC_parse >= RExC_end - 4
	12910	\|\| UCHARAT(RExC_parse) != ':'
	12911	\|\| UCHARAT(++RExC_parse) != '('
	12912	\|\| UCHARAT(++RExC_parse) != '?'
	12913	\|\| UCHARAT(++RExC_parse) != '[')
	12914	{
	12915
	12916	/* In combination with the above, this moves the
	12917	* pointer to the point just after the first erroneous
	12918	* character (or if there are no flags, to where they
	12919	* should have been) */
	12920	if (RExC_parse >= RExC_end - 4) {
	12921	RExC_parse = RExC_end;
	12922	}
	12923	else if (RExC_parse != save_parse) {
	12924	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	12925	}
	12926	vFAIL("Expecting '(?flags:(?[...'");
	12927	}
	12928	RExC_parse++;
	12929	(void) handle_regex_sets(pRExC_state, &current, flagp,
	12930	depth+1, oregcomp_parse);
	12931
	12932	/* Here, 'current' contains the embedded expression's
	12933	* inversion list, and RExC_parse points to the trailing
	12934	* ']'; the next character should be the ')' which will be
	12935	* paired with the '(' that has been put on the stack, so
	12936	* the whole embedded expression reduces to '(operand)' */
	12937	RExC_parse++;
	12938
	12939	RExC_flags = save_flags;
	12940	goto handle_operand;
	12941	}
	12942	/* FALLTHROUGH */
	12943
	12944	default:
	12945	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	12946	vFAIL("Unexpected character");
	12947
	12948	case '\\':
	12949	/* regclass() can only return RESTART_UTF8 if multi-char
	12950	folds are allowed. */
	12951	if (!regclass(pRExC_state, flagp,depth+1,
	12952	TRUE, /* means parse just the next thing */
	12953	FALSE, /* don't allow multi-char folds */
	12954	FALSE, /* don't silence non-portable warnings. */
	12955	&current))
	12956	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	12957	(UV) *flagp);
	12958	/* regclass() will return with parsing just the \ sequence,
	12959	* leaving the parse pointer at the next thing to parse */
	12960	RExC_parse--;
	12961	goto handle_operand;
	12962
	12963	case '[': /* Is a bracketed character class */
	12964	{
	12965	bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
	12966
	12967	if (! is_posix_class) {
	12968	RExC_parse++;
	12969	}
	12970
	12971	/* regclass() can only return RESTART_UTF8 if multi-char
	12972	folds are allowed. */
	12973	if(!regclass(pRExC_state, flagp,depth+1,
	12974	is_posix_class, /* parse the whole char class
	12975	only if not a posix class */
	12976	FALSE, /* don't allow multi-char folds */
	12977	FALSE, /* don't silence non-portable warnings. */
	12978	&current))
	12979	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	12980	(UV) *flagp);
	12981	/* function call leaves parse pointing to the ']', except if we
	12982	* faked it */
	12983	if (is_posix_class) {
	12984	RExC_parse--;
	12985	}
	12986
	12987	goto handle_operand;
	12988	}
	12989
	12990	case '&':
	12991	case '\|':
	12992	case '+':
	12993	case '-':
	12994	case '^':
	12995	if (top_index < 0
	12996	\|\| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
	12997	\|\| ! IS_OPERAND(*top_ptr))
	12998	{
	12999	RExC_parse++;
	13000	vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
	13001	}
	13002	av_push(stack, newSVuv(curchar));
	13003	break;
	13004
	13005	case '!':
	13006	av_push(stack, newSVuv(curchar));
	13007	break;
	13008
	13009	case '(':
	13010	if (top_index >= 0) {
	13011	top_ptr = av_fetch(stack, top_index, FALSE);
	13012	assert(top_ptr);
	13013	if (IS_OPERAND(*top_ptr)) {
	13014	RExC_parse++;
	13015	vFAIL("Unexpected '(' with no preceding operator");
	13016	}
	13017	}
	13018	av_push(stack, newSVuv(curchar));
	13019	break;
	13020
	13021	case ')':
	13022	{
	13023	SV* lparen;
	13024	if (top_index < 1
	13025	\|\| ! (current = av_pop(stack))
	13026	\|\| ! IS_OPERAND(current)
	13027	\|\| ! (lparen = av_pop(stack))
	13028	\|\| IS_OPERAND(lparen)
	13029	\|\| SvUV(lparen) != '(')
	13030	{
	13031	SvREFCNT_dec(current);
	13032	RExC_parse++;
	13033	vFAIL("Unexpected ')'");
	13034	}
	13035	top_index -= 2;
	13036	SvREFCNT_dec_NN(lparen);
	13037
	13038	/* FALLTHROUGH */
	13039	}
	13040
	13041	handle_operand:
	13042
	13043	/* Here, we have an operand to process, in 'current' */
	13044
	13045	if (top_index < 0) { /* Just push if stack is empty */
	13046	av_push(stack, current);
	13047	}
	13048	else {
	13049	SV* top = av_pop(stack);
	13050	SV *prev = NULL;
	13051	char current_operator;
	13052
	13053	if (IS_OPERAND(top)) {
	13054	SvREFCNT_dec_NN(top);
	13055	SvREFCNT_dec_NN(current);
	13056	vFAIL("Operand with no preceding operator");
	13057	}
	13058	current_operator = (char) SvUV(top);
	13059	switch (current_operator) {
	13060	case '(': /* Push the '(' back on followed by the new
	13061	operand */
	13062	av_push(stack, top);
	13063	av_push(stack, current);
	13064	SvREFCNT_inc(top); /* Counters the '_dec' done
	13065	just after the 'break', so
	13066	it doesn't get wrongly freed
	13067	*/
	13068	break;
	13069
	13070	case '!':
	13071	_invlist_invert(current);
	13072
	13073	/* Unlike binary operators, the top of the stack,
	13074	* now that this unary one has been popped off, may
	13075	* legally be an operator, and we now have operand
	13076	* for it. */
	13077	top_index--;
	13078	SvREFCNT_dec_NN(top);
	13079	goto handle_operand;
	13080
	13081	case '&':
	13082	prev = av_pop(stack);
	13083	_invlist_intersection(prev,
	13084	current,
	13085	&current);
	13086	av_push(stack, current);
	13087	break;
	13088
	13089	case '\|':
	13090	case '+':
	13091	prev = av_pop(stack);
	13092	_invlist_union(prev, current, &current);
	13093	av_push(stack, current);
	13094	break;
	13095
	13096	case '-':
	13097	prev = av_pop(stack);;
	13098	_invlist_subtract(prev, current, &current);
	13099	av_push(stack, current);
	13100	break;
	13101
	13102	case '^': /* The union minus the intersection */
	13103	{
	13104	SV* i = NULL;
	13105	SV* u = NULL;
	13106	SV* element;
	13107
	13108	prev = av_pop(stack);
	13109	_invlist_union(prev, current, &u);
	13110	_invlist_intersection(prev, current, &i);
	13111	/* _invlist_subtract will overwrite current
	13112	without freeing what it already contains */
	13113	element = current;
	13114	_invlist_subtract(u, i, &current);
	13115	av_push(stack, current);
	13116	SvREFCNT_dec_NN(i);
	13117	SvREFCNT_dec_NN(u);
	13118	SvREFCNT_dec_NN(element);
	13119	break;
	13120	}
	13121
	13122	default:
	13123	Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
	13124	}
	13125	SvREFCNT_dec_NN(top);
	13126	SvREFCNT_dec(prev);
	13127	}
	13128	}
	13129
	13130	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	13131	}
	13132
	13133	if (av_tindex(stack) < 0 /* Was empty */
	13134	\|\| ((final = av_pop(stack)) == NULL)
	13135	\|\| ! IS_OPERAND(final)
	13136	\|\| av_tindex(stack) >= 0) /* More left on stack */
	13137	{
	13138	vFAIL("Incomplete expression within '(?[ ])'");
	13139	}
	13140
	13141	/* Here, 'final' is the resultant inversion list from evaluating the
	13142	* expression. Return it if so requested */
	13143	if (return_invlist) {
	13144	*return_invlist = final;
	13145	return END;
	13146	}
	13147
	13148	/* Otherwise generate a resultant node, based on 'final'. regclass() is
	13149	* expecting a string of ranges and individual code points */
	13150	invlist_iterinit(final);
	13151	result_string = newSVpvs("");
	13152	while (invlist_iternext(final, &start, &end)) {
	13153	if (start == end) {
	13154	Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
	13155	}
	13156	else {
	13157	Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
	13158	start, end);
	13159	}
	13160	}
	13161
	13162	save_parse = RExC_parse;
	13163	RExC_parse = SvPV(result_string, len);
	13164	save_end = RExC_end;
	13165	RExC_end = RExC_parse + len;
	13166
	13167	/* We turn off folding around the call, as the class we have constructed
	13168	* already has all folding taken into consideration, and we don't want
	13169	* regclass() to add to that */
	13170	RExC_flags &= ~RXf_PMf_FOLD;
	13171	/* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
	13172	*/
	13173	node = regclass(pRExC_state, flagp,depth+1,
	13174	FALSE, /* means parse the whole char class */
	13175	FALSE, /* don't allow multi-char folds */
	13176	TRUE, /* silence non-portable warnings. The above may very
	13177	well have generated non-portable code points, but
	13178	they're valid on this machine */
	13179	NULL);
	13180	if (!node)
	13181	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
	13182	PTR2UV(flagp));
	13183	if (save_fold) {
	13184	RExC_flags \|= RXf_PMf_FOLD;
	13185	}
	13186	RExC_parse = save_parse + 1;
	13187	RExC_end = save_end;
	13188	SvREFCNT_dec_NN(final);
	13189	SvREFCNT_dec_NN(result_string);
	13190
	13191	nextchar(pRExC_state);
	13192	Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
	13193	return node;
	13194	}
	13195	#undef IS_OPERAND
	13196
	13197	STATIC void
	13198	S_add_above_Latin1_folds(pTHX_ RExC_state_t pRExC_state, const U8 cp, SV* invlist)
	13199	{
	13200	/* This hard-codes the Latin1/above-Latin1 folding rules, so that an
	13201	* innocent-looking character class, like /[ks]/i won't have to go out to
	13202	* disk to find the possible matches.
	13203	*
	13204	* This should be called only for a Latin1-range code points, cp, which is
	13205	* known to be involved in a simple fold with other code points above
	13206	* Latin1. It would give false results if /aa has been specified.
	13207	* Multi-char folds are outside the scope of this, and must be handled
	13208	* specially.
	13209	*
	13210	* XXX It would be better to generate these via regen, in case a new
	13211	* version of the Unicode standard adds new mappings, though that is not
	13212	* really likely, and may be caught by the default: case of the switch
	13213	* below. */
	13214
	13215	PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
	13216
	13217	assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
	13218
	13219	switch (cp) {
	13220	case 'k':
	13221	case 'K':
	13222	*invlist =
	13223	add_cp_to_invlist(*invlist, KELVIN_SIGN);
	13224	break;
	13225	case 's':
	13226	case 'S':
	13227	invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_LONG_S);
	13228	break;
	13229	case MICRO_SIGN:
	13230	invlist = add_cp_to_invlist(invlist, GREEK_CAPITAL_LETTER_MU);
	13231	invlist = add_cp_to_invlist(invlist, GREEK_SMALL_LETTER_MU);
	13232	break;
	13233	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	13234	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	13235	invlist = add_cp_to_invlist(invlist, ANGSTROM_SIGN);
	13236	break;
	13237	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	13238	invlist = add_cp_to_invlist(invlist,
	13239	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	13240	break;
	13241	case LATIN_SMALL_LETTER_SHARP_S:
	13242	invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_SHARP_S);
	13243	break;
	13244	default:
	13245	/* Use deprecated warning to increase the chances of this being
	13246	* output */
	13247	ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
	13248	break;
	13249	}
	13250	}
	13251
	13252	/* The names of properties whose definitions are not known at compile time are
	13253	* stored in this SV, after a constant heading. So if the length has been
	13254	* changed since initialization, then there is a run-time definition. */
	13255	#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
	13256	(SvCUR(listsv) != initial_listsv_len)
	13257
	13258	STATIC regnode *
	13259	S_regclass(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth,
	13260	const bool stop_at_1, /* Just parse the next thing, don't
	13261	look for a full character class */
	13262	bool allow_multi_folds,
	13263	const bool silence_non_portable, /* Don't output warnings
	13264	about too large
	13265	characters */
	13266	SV** ret_invlist) /* Return an inversion list, not a node */
	13267	{
	13268	/* parse a bracketed class specification. Most of these will produce an
	13269	* ANYOF node; but something like [a] will produce an EXACT node; [aA], an
	13270	* EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
	13271	* under /i with multi-character folds: it will be rewritten following the
	13272	* paradigm of this example, where the <multi-fold>s are characters which
	13273	* fold to multiple character sequences:
	13274	* /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
	13275	* gets effectively rewritten as:
	13276	* /(?:\x{multi-fold1}\|\x{multi-fold2}\|[abcdefghi]/i
	13277	* reg() gets called (recursively) on the rewritten version, and this
	13278	* function will return what it constructs. (Actually the <multi-fold>s
	13279	* aren't physically removed from the [abcdefghi], it's just that they are
	13280	* ignored in the recursion by means of a flag:
	13281	* <RExC_in_multi_char_class>.)
	13282	*
	13283	* ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
	13284	* characters, with the corresponding bit set if that character is in the
	13285	* list. For characters above this, a range list or swash is used. There
	13286	* are extra bits for \w, etc. in locale ANYOFs, as what these match is not
	13287	* determinable at compile time
	13288	*
	13289	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
	13290	* to be restarted. This can only happen if ret_invlist is non-NULL.
	13291	*/
	13292
	13293	UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
	13294	IV range = 0;
	13295	UV value = OOB_UNICODE, save_value = OOB_UNICODE;
	13296	regnode *ret;
	13297	STRLEN numlen;
	13298	IV namedclass = OOB_NAMEDCLASS;
	13299	char *rangebegin = NULL;
	13300	bool need_class = 0;
	13301	SV *listsv = NULL;
	13302	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	13303	than just initialized. */
	13304	SV* properties = NULL; /* Code points that match \p{} \P{} */
	13305	SV* posixes = NULL; /* Code points that match classes like [:word:],
	13306	extended beyond the Latin1 range. These have to
	13307	be kept separate from other code points for much
	13308	of this function because their handling is
	13309	different under /i, and for most classes under
	13310	/d as well */
	13311	SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
	13312	separate for a while from the non-complemented
	13313	versions because of complications with /d
	13314	matching */
	13315	UV element_count = 0; /* Number of distinct elements in the class.
	13316	Optimizations may be possible if this is tiny */
	13317	AV * multi_char_matches = NULL; /* Code points that fold to more than one
	13318	character; used under /i */
	13319	UV n;
	13320	char * stop_ptr = RExC_end; /* where to stop parsing */
	13321	const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
	13322	space? */
	13323	const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
	13324
	13325	/* Unicode properties are stored in a swash; this holds the current one
	13326	* being parsed. If this swash is the only above-latin1 component of the
	13327	* character class, an optimization is to pass it directly on to the
	13328	* execution engine. Otherwise, it is set to NULL to indicate that there
	13329	* are other things in the class that have to be dealt with at execution
	13330	* time */
	13331	SV* swash = NULL; /* Code points that match \p{} \P{} */
	13332
	13333	/* Set if a component of this character class is user-defined; just passed
	13334	* on to the engine */
	13335	bool has_user_defined_property = FALSE;
	13336
	13337	/* inversion list of code points this node matches only when the target
	13338	* string is in UTF-8. (Because is under /d) */
	13339	SV* depends_list = NULL;
	13340
	13341	/* Inversion list of code points this node matches regardless of things
	13342	* like locale, folding, utf8ness of the target string */
	13343	SV* cp_list = NULL;
	13344
	13345	/* Like cp_list, but code points on this list need to be checked for things
	13346	* that fold to/from them under /i */
	13347	SV* cp_foldable_list = NULL;
	13348
	13349	/* Like cp_list, but code points on this list are valid only when the
	13350	* runtime locale is UTF-8 */
	13351	SV* only_utf8_locale_list = NULL;
	13352
	13353	#ifdef EBCDIC
	13354	/* In a range, counts how many 0-2 of the ends of it came from literals,
	13355	* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
	13356	UV literal_endpoint = 0;
	13357	#endif
	13358	bool invert = FALSE; /* Is this class to be complemented */
	13359
	13360	bool warn_super = ALWAYS_WARN_SUPER;
	13361
	13362	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	13363	case we need to change the emitted regop to an EXACT. */
	13364	const char * orig_parse = RExC_parse;
	13365	const SSize_t orig_size = RExC_size;
	13366	bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
	13367	GET_RE_DEBUG_FLAGS_DECL;
	13368
	13369	PERL_ARGS_ASSERT_REGCLASS;
	13370	#ifndef DEBUGGING
	13371	PERL_UNUSED_ARG(depth);
	13372	#endif
	13373
	13374	DEBUG_PARSE("clas");
	13375
	13376	/* Assume we are going to generate an ANYOF node. */
	13377	ret = reganode(pRExC_state, ANYOF, 0);
	13378
	13379	if (SIZE_ONLY) {
	13380	RExC_size += ANYOF_SKIP;
	13381	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	13382	}
	13383	else {
	13384	ANYOF_FLAGS(ret) = 0;
	13385
	13386	RExC_emit += ANYOF_SKIP;
	13387	listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
	13388	initial_listsv_len = SvCUR(listsv);
	13389	SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
	13390	}
	13391
	13392	if (skip_white) {
	13393	RExC_parse = regpatws(pRExC_state, RExC_parse,
	13394	FALSE /* means don't recognize comments */ );
	13395	}
	13396
	13397	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	13398	RExC_parse++;
	13399	invert = TRUE;
	13400	allow_multi_folds = FALSE;
	13401	RExC_naughty++;
	13402	if (skip_white) {
	13403	RExC_parse = regpatws(pRExC_state, RExC_parse,
	13404	FALSE /* means don't recognize comments */ );
	13405	}
	13406	}
	13407
	13408	/* Check that they didn't say [:posix:] instead of [[:posix:]] */
	13409	if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
	13410	const char *s = RExC_parse;
	13411	const char c = *s++;
	13412
	13413	while (isWORDCHAR(*s))
	13414	s++;
	13415	if (s && c == s && s[1] == ']') {
	13416	SAVEFREESV(RExC_rx_sv);
	13417	ckWARN3reg(s+2,
	13418	"POSIX syntax [%c %c] belongs inside character classes",
	13419	c, c);
	13420	(void)ReREFCNT_inc(RExC_rx_sv);
	13421	}
	13422	}
	13423
	13424	/* If the caller wants us to just parse a single element, accomplish this
	13425	* by faking the loop ending condition */
	13426	if (stop_at_1 && RExC_end > RExC_parse) {
	13427	stop_ptr = RExC_parse + 1;
	13428	}
	13429
	13430	/* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
	13431	if (UCHARAT(RExC_parse) == ']')
	13432	goto charclassloop;
	13433
	13434	parseit:
	13435	while (1) {
	13436	if (RExC_parse >= stop_ptr) {
	13437	break;
	13438	}
	13439
	13440	if (skip_white) {
	13441	RExC_parse = regpatws(pRExC_state, RExC_parse,
	13442	FALSE /* means don't recognize comments */ );
	13443	}
	13444
	13445	if (UCHARAT(RExC_parse) == ']') {
	13446	break;
	13447	}
	13448
	13449	charclassloop:
	13450
	13451	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	13452	save_value = value;
	13453	save_prevvalue = prevvalue;
	13454
	13455	if (!range) {
	13456	rangebegin = RExC_parse;
	13457	element_count++;
	13458	}
	13459	if (UTF) {
	13460	value = utf8n_to_uvchr((U8*)RExC_parse,
	13461	RExC_end - RExC_parse,
	13462	&numlen, UTF8_ALLOW_DEFAULT);
	13463	RExC_parse += numlen;
	13464	}
	13465	else
	13466	value = UCHARAT(RExC_parse++);
	13467
	13468	if (value == '['
	13469	&& RExC_parse < RExC_end
	13470	&& POSIXCC(UCHARAT(RExC_parse)))
	13471	{
	13472	namedclass = regpposixcc(pRExC_state, value, strict);
	13473	}
	13474	else if (value != '\\') {
	13475	#ifdef EBCDIC
	13476	literal_endpoint++;
	13477	#endif
	13478	}
	13479	else {
	13480	/* Is a backslash; get the code point of the char after it */
	13481	if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
	13482	value = utf8n_to_uvchr((U8*)RExC_parse,
	13483	RExC_end - RExC_parse,
	13484	&numlen, UTF8_ALLOW_DEFAULT);
	13485	RExC_parse += numlen;
	13486	}
	13487	else
	13488	value = UCHARAT(RExC_parse++);
	13489
	13490	/* Some compilers cannot handle switching on 64-bit integer
	13491	* values, therefore value cannot be an UV. Yes, this will
	13492	* be a problem later if we want switch on Unicode.
	13493	* A similar issue a little bit later when switching on
	13494	* namedclass. --jhi */
	13495
	13496	/* If the \ is escaping white space when white space is being
	13497	* skipped, it means that that white space is wanted literally, and
	13498	* is already in 'value'. Otherwise, need to translate the escape
	13499	* into what it signifies. */
	13500	if (! skip_white \|\| ! is_PATWS_cp(value)) switch ((I32)value) {
	13501
	13502	case 'w': namedclass = ANYOF_WORDCHAR; break;
	13503	case 'W': namedclass = ANYOF_NWORDCHAR; break;
	13504	case 's': namedclass = ANYOF_SPACE; break;
	13505	case 'S': namedclass = ANYOF_NSPACE; break;
	13506	case 'd': namedclass = ANYOF_DIGIT; break;
	13507	case 'D': namedclass = ANYOF_NDIGIT; break;
	13508	case 'v': namedclass = ANYOF_VERTWS; break;
	13509	case 'V': namedclass = ANYOF_NVERTWS; break;
	13510	case 'h': namedclass = ANYOF_HORIZWS; break;
	13511	case 'H': namedclass = ANYOF_NHORIZWS; break;
	13512	case 'N': /* Handle \N{NAME} in class */
	13513	{
	13514	/* We only pay attention to the first char of
	13515	multichar strings being returned. I kinda wonder
	13516	if this makes sense as it does change the behaviour
	13517	from earlier versions, OTOH that behaviour was broken
	13518	as well. */
	13519	if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
	13520	TRUE, /* => charclass */
	13521	strict))
	13522	{
	13523	if (*flagp & RESTART_UTF8)
	13524	FAIL("panic: grok_bslash_N set RESTART_UTF8");
	13525	goto parseit;
	13526	}
	13527	}
	13528	break;
	13529	case 'p':
	13530	case 'P':
	13531	{
	13532	char *e;
	13533
	13534	/* We will handle any undefined properties ourselves */
	13535	U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
	13536	/* And we actually would prefer to get
	13537	* the straight inversion list of the
	13538	* swash, since we will be accessing it
	13539	* anyway, to save a little time */
	13540	\|_CORE_SWASH_INIT_ACCEPT_INVLIST;
	13541
	13542	if (RExC_parse >= RExC_end)
	13543	vFAIL2("Empty \\%c{}", (U8)value);
	13544	if (*RExC_parse == '{') {
	13545	const U8 c = (U8)value;
	13546	e = strchr(RExC_parse++, '}');
	13547	if (!e)
	13548	vFAIL2("Missing right brace on \\%c{}", c);
	13549	while (isSPACE(*RExC_parse))
	13550	RExC_parse++;
	13551	if (e == RExC_parse)
	13552	vFAIL2("Empty \\%c{}", c);
	13553	n = e - RExC_parse;
	13554	while (isSPACE(*(RExC_parse + n - 1)))
	13555	n--;
	13556	}
	13557	else {
	13558	e = RExC_parse;
	13559	n = 1;
	13560	}
	13561	if (!SIZE_ONLY) {
	13562	SV* invlist;
	13563	char* name;
	13564
	13565	if (UCHARAT(RExC_parse) == '^') {
	13566	RExC_parse++;
	13567	n--;
	13568	/* toggle. (The rhs xor gets the single bit that
	13569	* differs between P and p; the other xor inverts just
	13570	* that bit) */
	13571	value ^= 'P' ^ 'p';
	13572
	13573	while (isSPACE(*RExC_parse)) {
	13574	RExC_parse++;
	13575	n--;
	13576	}
	13577	}
	13578	/* Try to get the definition of the property into
	13579	* <invlist>. If /i is in effect, the effective property
	13580	* will have its name be <__NAME_i>. The design is
	13581	* discussed in commit
	13582	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	13583	name = savepv(Perl_form(aTHX_
	13584	"%s%.*s%s\n",
	13585	(FOLD) ? "__" : "",
	13586	(int)n,
	13587	RExC_parse,
	13588	(FOLD) ? "_i" : ""
	13589	));
	13590
	13591	/* Look up the property name, and get its swash and
	13592	* inversion list, if the property is found */
	13593	if (swash) {
	13594	SvREFCNT_dec_NN(swash);
	13595	}
	13596	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	13597	1, /* binary */
	13598	0, /* not tr/// */
	13599	NULL, /* No inversion list */
	13600	&swash_init_flags
	13601	);
	13602	if (! swash \|\| ! (invlist = _get_swash_invlist(swash))) {
	13603	HV* curpkg = (IN_PERL_COMPILETIME)
	13604	? PL_curstash
	13605	: CopSTASH(PL_curcop);
	13606	if (swash) {
	13607	SvREFCNT_dec_NN(swash);
	13608	swash = NULL;
	13609	}
	13610
	13611	/* Here didn't find it. It could be a user-defined
	13612	* property that will be available at run-time. If we
	13613	* accept only compile-time properties, is an error;
	13614	* otherwise add it to the list for run-time look up */
	13615	if (ret_invlist) {
	13616	RExC_parse = e + 1;
	13617	vFAIL2utf8f(
	13618	"Property '%"UTF8f"' is unknown",
	13619	UTF8fARG(UTF, n, name));
	13620	}
	13621
	13622	/* If the property name doesn't already have a package
	13623	* name, add the current one to it so that it can be
	13624	* referred to outside it. [perl #121777] */
	13625	if (curpkg && ! instr(name, "::")) {
	13626	char* pkgname = HvNAME(curpkg);
	13627	if (strNE(pkgname, "main")) {
	13628	char* full_name = Perl_form(aTHX_
	13629	"%s::%s",
	13630	pkgname,
	13631	name);
	13632	n = strlen(full_name);
	13633	Safefree(name);
	13634	name = savepvn(full_name, n);
	13635	}
	13636	}
	13637	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
	13638	(value == 'p' ? '+' : '!'),
	13639	UTF8fARG(UTF, n, name));
	13640	has_user_defined_property = TRUE;
	13641
	13642	/* We don't know yet, so have to assume that the
	13643	* property could match something in the Latin1 range,
	13644	* hence something that isn't utf8. Note that this
	13645	* would cause things in <depends_list> to match
	13646	* inappropriately, except that any \p{}, including
	13647	* this one forces Unicode semantics, which means there
	13648	* is no <depends_list> */
	13649	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	13650	}
	13651	else {
	13652
	13653	/* Here, did get the swash and its inversion list. If
	13654	* the swash is from a user-defined property, then this
	13655	* whole character class should be regarded as such */
	13656	if (swash_init_flags
	13657	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
	13658	{
	13659	has_user_defined_property = TRUE;
	13660	}
	13661	else if
	13662	/* We warn on matching an above-Unicode code point
	13663	* if the match would return true, except don't
	13664	* warn for \p{All}, which has exactly one element
	13665	* = 0 */
	13666	(_invlist_contains_cp(invlist, 0x110000)
	13667	&& (! (_invlist_len(invlist) == 1
	13668	&& *invlist_array(invlist) == 0)))
	13669	{
	13670	warn_super = TRUE;
	13671	}
	13672
	13673
	13674	/* Invert if asking for the complement */
	13675	if (value == 'P') {
	13676	_invlist_union_complement_2nd(properties,
	13677	invlist,
	13678	&properties);
	13679
	13680	/* The swash can't be used as-is, because we've
	13681	* inverted things; delay removing it to here after
	13682	* have copied its invlist above */
	13683	SvREFCNT_dec_NN(swash);
	13684	swash = NULL;
	13685	}
	13686	else {
	13687	_invlist_union(properties, invlist, &properties);
	13688	}
	13689	}
	13690	Safefree(name);
	13691	}
	13692	RExC_parse = e + 1;
	13693	namedclass = ANYOF_UNIPROP; /* no official name, but it's
	13694	named */
	13695
	13696	/* \p means they want Unicode semantics */
	13697	RExC_uni_semantics = 1;
	13698	}
	13699	break;
	13700	case 'n': value = '\n'; break;
	13701	case 'r': value = '\r'; break;
	13702	case 't': value = '\t'; break;
	13703	case 'f': value = '\f'; break;
	13704	case 'b': value = '\b'; break;
	13705	case 'e': value = ESC_NATIVE; break;
	13706	case 'a': value = '\a'; break;
	13707	case 'o':
	13708	RExC_parse--; /* function expects to be pointed at the 'o' */
	13709	{
	13710	const char* error_msg;
	13711	bool valid = grok_bslash_o(&RExC_parse,
	13712	&value,
	13713	&error_msg,
	13714	SIZE_ONLY, /* warnings in pass
	13715	1 only */
	13716	strict,
	13717	silence_non_portable,
	13718	UTF);
	13719	if (! valid) {
	13720	vFAIL(error_msg);
	13721	}
	13722	}
	13723	if (PL_encoding && value < 0x100) {
	13724	goto recode_encoding;
	13725	}
	13726	break;
	13727	case 'x':
	13728	RExC_parse--; /* function expects to be pointed at the 'x' */
	13729	{
	13730	const char* error_msg;
	13731	bool valid = grok_bslash_x(&RExC_parse,
	13732	&value,
	13733	&error_msg,
	13734	TRUE, /* Output warnings */
	13735	strict,
	13736	silence_non_portable,
	13737	UTF);
	13738	if (! valid) {
	13739	vFAIL(error_msg);
	13740	}
	13741	}
	13742	if (PL_encoding && value < 0x100)
	13743	goto recode_encoding;
	13744	break;
	13745	case 'c':
	13746	value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
	13747	break;
	13748	case '0': case '1': case '2': case '3': case '4':
	13749	case '5': case '6': case '7':
	13750	{
	13751	/* Take 1-3 octal digits */
	13752	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	13753	numlen = (strict) ? 4 : 3;
	13754	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	13755	RExC_parse += numlen;
	13756	if (numlen != 3) {
	13757	if (strict) {
	13758	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	13759	vFAIL("Need exactly 3 octal digits");
	13760	}
	13761	else if (! SIZE_ONLY /* like \08, \178 */
	13762	&& numlen < 3
	13763	&& RExC_parse < RExC_end
	13764	&& isDIGIT(*RExC_parse)
	13765	&& ckWARN(WARN_REGEXP))
	13766	{
	13767	SAVEFREESV(RExC_rx_sv);
	13768	reg_warn_non_literal_string(
	13769	RExC_parse + 1,
	13770	form_short_octal_warning(RExC_parse, numlen));
	13771	(void)ReREFCNT_inc(RExC_rx_sv);
	13772	}
	13773	}
	13774	if (PL_encoding && value < 0x100)
	13775	goto recode_encoding;
	13776	break;
	13777	}
	13778	recode_encoding:
	13779	if (! RExC_override_recoding) {
	13780	SV* enc = PL_encoding;
	13781	value = reg_recode((const char)(U8)value, &enc);
	13782	if (!enc) {
	13783	if (strict) {
	13784	vFAIL("Invalid escape in the specified encoding");
	13785	}
	13786	else if (SIZE_ONLY) {
	13787	ckWARNreg(RExC_parse,
	13788	"Invalid escape in the specified encoding");
	13789	}
	13790	}
	13791	break;
	13792	}
	13793	default:
	13794	/* Allow \_ to not give an error */
	13795	if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
	13796	if (strict) {
	13797	vFAIL2("Unrecognized escape \\%c in character class",
	13798	(int)value);
	13799	}
	13800	else {
	13801	SAVEFREESV(RExC_rx_sv);
	13802	ckWARN2reg(RExC_parse,
	13803	"Unrecognized escape \\%c in character class passed through",
	13804	(int)value);
	13805	(void)ReREFCNT_inc(RExC_rx_sv);
	13806	}
	13807	}
	13808	break;
	13809	} /* End of switch on char following backslash */
	13810	} /* end of handling backslash escape sequences */
	13811
	13812	/* Here, we have the current token in 'value' */
	13813
	13814	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	13815	U8 classnum;
	13816
	13817	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	13818	* literal, as is the character that began the false range, i.e.
	13819	* the 'a' in the examples */
	13820	if (range) {
	13821	if (!SIZE_ONLY) {
	13822	const int w = (RExC_parse >= rangebegin)
	13823	? RExC_parse - rangebegin
	13824	: 0;
	13825	if (strict) {
	13826	vFAIL2utf8f(
	13827	"False [] range \"%"UTF8f"\"",
	13828	UTF8fARG(UTF, w, rangebegin));
	13829	}
	13830	else {
	13831	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
	13832	ckWARN2reg(RExC_parse,
	13833	"False [] range \"%"UTF8f"\"",
	13834	UTF8fARG(UTF, w, rangebegin));
	13835	(void)ReREFCNT_inc(RExC_rx_sv);
	13836	cp_list = add_cp_to_invlist(cp_list, '-');
	13837	cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
	13838	prevvalue);
	13839	}
	13840	}
	13841
	13842	range = 0; /* this was not a true range */
	13843	element_count += 2; /* So counts for three values */
	13844	}
	13845
	13846	classnum = namedclass_to_classnum(namedclass);
	13847
	13848	if (LOC && namedclass < ANYOF_POSIXL_MAX
	13849	#ifndef HAS_ISASCII
	13850	&& classnum != _CC_ASCII
	13851	#endif
	13852	) {
	13853	/* What the Posix classes (like \w, [:space:]) match in locale
	13854	* isn't knowable under locale until actual match time. Room
	13855	* must be reserved (one time per outer bracketed class) to
	13856	* store such classes. The space will contain a bit for each
	13857	* named class that is to be matched against. This isn't
	13858	* needed for \p{} and pseudo-classes, as they are not affected
	13859	* by locale, and hence are dealt with separately */
	13860	if (! need_class) {
	13861	need_class = 1;
	13862	if (SIZE_ONLY) {
	13863	RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
	13864	}
	13865	else {
	13866	RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
	13867	}
	13868	ANYOF_FLAGS(ret) \|= ANYOF_POSIXL;
	13869	ANYOF_POSIXL_ZERO(ret);
	13870	}
	13871
	13872	/* Coverity thinks it is possible for this to be negative; both
	13873	* jhi and khw think it's not, but be safer */
	13874	assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
	13875	\|\| (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
	13876
	13877	/* See if it already matches the complement of this POSIX
	13878	* class */
	13879	if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
	13880	&& ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
	13881	? -1
	13882	: 1)))
	13883	{
	13884	posixl_matches_all = TRUE;
	13885	break; /* No need to continue. Since it matches both
	13886	e.g., \w and \W, it matches everything, and the
	13887	bracketed class can be optimized into qr/./s */
	13888	}
	13889
	13890	/* Add this class to those that should be checked at runtime */
	13891	ANYOF_POSIXL_SET(ret, namedclass);
	13892
	13893	/* The above-Latin1 characters are not subject to locale rules.
	13894	* Just add them, in the second pass, to the
	13895	* unconditionally-matched list */
	13896	if (! SIZE_ONLY) {
	13897	SV* scratch_list = NULL;
	13898
	13899	/* Get the list of the above-Latin1 code points this
	13900	* matches */
	13901	_invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
	13902	PL_XPosix_ptrs[classnum],
	13903
	13904	/* Odd numbers are complements, like
	13905	* NDIGIT, NASCII, ... */
	13906	namedclass % 2 != 0,
	13907	&scratch_list);
	13908	/* Checking if 'cp_list' is NULL first saves an extra
	13909	* clone. Its reference count will be decremented at the
	13910	* next union, etc, or if this is the only instance, at the
	13911	* end of the routine */
	13912	if (! cp_list) {
	13913	cp_list = scratch_list;
	13914	}
	13915	else {
	13916	_invlist_union(cp_list, scratch_list, &cp_list);
	13917	SvREFCNT_dec_NN(scratch_list);
	13918	}
	13919	continue; /* Go get next character */
	13920	}
	13921	}
	13922	else if (! SIZE_ONLY) {
	13923
	13924	/* Here, not in pass1 (in that pass we skip calculating the
	13925	* contents of this class), and is /l, or is a POSIX class for
	13926	* which /l doesn't matter (or is a Unicode property, which is
	13927	* skipped here). */
	13928	if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
	13929	if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
	13930
	13931	/* Here, should be \h, \H, \v, or \V. None of /d, /i
	13932	* nor /l make a difference in what these match,
	13933	* therefore we just add what they match to cp_list. */
	13934	if (classnum != _CC_VERTSPACE) {
	13935	assert( namedclass == ANYOF_HORIZWS
	13936	\|\| namedclass == ANYOF_NHORIZWS);
	13937
	13938	/* It turns out that \h is just a synonym for
	13939	* XPosixBlank */
	13940	classnum = _CC_BLANK;
	13941	}
	13942
	13943	_invlist_union_maybe_complement_2nd(
	13944	cp_list,
	13945	PL_XPosix_ptrs[classnum],
	13946	namedclass % 2 != 0, /* Complement if odd
	13947	(NHORIZWS, NVERTWS)
	13948	*/
	13949	&cp_list);
	13950	}
	13951	}
	13952	else { /* Garden variety class. If is NASCII, NDIGIT, ...
	13953	complement and use nposixes */
	13954	SV** posixes_ptr = namedclass % 2 == 0
	13955	? &posixes
	13956	: &nposixes;
	13957	SV** source_ptr = &PL_XPosix_ptrs[classnum];
	13958	_invlist_union_maybe_complement_2nd(
	13959	*posixes_ptr,
	13960	*source_ptr,
	13961	namedclass % 2 != 0,
	13962	posixes_ptr);
	13963	}
	13964	continue; /* Go get next character */
	13965	}
	13966	} /* end of namedclass \blah */
	13967
	13968	/* Here, we have a single value. If 'range' is set, it is the ending
	13969	* of a range--check its validity. Later, we will handle each
	13970	* individual code point in the range. If 'range' isn't set, this
	13971	* could be the beginning of a range, so check for that by looking
	13972	* ahead to see if the next real character to be processed is the range
	13973	* indicator--the minus sign */
	13974
	13975	if (skip_white) {
	13976	RExC_parse = regpatws(pRExC_state, RExC_parse,
	13977	FALSE /* means don't recognize comments */ );
	13978	}
	13979
	13980	if (range) {
	13981	if (prevvalue > value) /* b-a */ {
	13982	const int w = RExC_parse - rangebegin;
	13983	vFAIL2utf8f(
	13984	"Invalid [] range \"%"UTF8f"\"",
	13985	UTF8fARG(UTF, w, rangebegin));
	13986	range = 0; /* not a valid range */
	13987	}
	13988	}
	13989	else {
	13990	prevvalue = value; /* save the beginning of the potential range */
	13991	if (! stop_at_1 /* Can't be a range if parsing just one thing */
	13992	&& *RExC_parse == '-')
	13993	{
	13994	char* next_char_ptr = RExC_parse + 1;
	13995	if (skip_white) { /* Get the next real char after the '-' */
	13996	next_char_ptr = regpatws(pRExC_state,
	13997	RExC_parse + 1,
	13998	FALSE); /* means don't recognize
	13999	comments */
	14000	}
	14001
	14002	/* If the '-' is at the end of the class (just before the ']',
	14003	* it is a literal minus; otherwise it is a range */
	14004	if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
	14005	RExC_parse = next_char_ptr;
	14006
	14007	/* a bad range like \w-, [:word:]- ? */
	14008	if (namedclass > OOB_NAMEDCLASS) {
	14009	if (strict \|\| ckWARN(WARN_REGEXP)) {
	14010	const int w =
	14011	RExC_parse >= rangebegin ?
	14012	RExC_parse - rangebegin : 0;
	14013	if (strict) {
	14014	vFAIL4("False [] range \"%.s\"",
	14015	w, w, rangebegin);
	14016	}
	14017	else {
	14018	vWARN4(RExC_parse,
	14019	"False [] range \"%.s\"",
	14020	w, w, rangebegin);
	14021	}
	14022	}
	14023	if (!SIZE_ONLY) {
	14024	cp_list = add_cp_to_invlist(cp_list, '-');
	14025	}
	14026	element_count++;
	14027	} else
	14028	range = 1; /* yeah, it's a range! */
	14029	continue; /* but do it the next time */
	14030	}
	14031	}
	14032	}
	14033
	14034	/* Here, <prevvalue> is the beginning of the range, if any; or <value>
	14035	* if not */
	14036
	14037	/* non-Latin1 code point implies unicode semantics. Must be set in
	14038	* pass1 so is there for the whole of pass 2 */
	14039	if (value > 255) {
	14040	RExC_uni_semantics = 1;
	14041	}
	14042
	14043	/* Ready to process either the single value, or the completed range.
	14044	* For single-valued non-inverted ranges, we consider the possibility
	14045	* of multi-char folds. (We made a conscious decision to not do this
	14046	* for the other cases because it can often lead to non-intuitive
	14047	* results. For example, you have the peculiar case that:
	14048	* "s s" =~ /^[^\xDF]+$/i => Y
	14049	* "ss" =~ /^[^\xDF]+$/i => N
	14050	*
	14051	* See [perl #89750] */
	14052	if (FOLD && allow_multi_folds && value == prevvalue) {
	14053	if (value == LATIN_SMALL_LETTER_SHARP_S
	14054	\|\| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
	14055	value)))
	14056	{
	14057	/* Here <value> is indeed a multi-char fold. Get what it is */
	14058
	14059	U8 foldbuf[UTF8_MAXBYTES_CASE];
	14060	STRLEN foldlen;
	14061
	14062	UV folded = _to_uni_fold_flags(
	14063	value,
	14064	foldbuf,
	14065	&foldlen,
	14066	FOLD_FLAGS_FULL \| (ASCII_FOLD_RESTRICTED
	14067	? FOLD_FLAGS_NOMIX_ASCII
	14068	: 0)
	14069	);
	14070
	14071	/* Here, <folded> should be the first character of the
	14072	* multi-char fold of <value>, with <foldbuf> containing the
	14073	* whole thing. But, if this fold is not allowed (because of
	14074	* the flags), <fold> will be the same as <value>, and should
	14075	* be processed like any other character, so skip the special
	14076	* handling */
	14077	if (folded != value) {
	14078
	14079	/* Skip if we are recursed, currently parsing the class
	14080	* again. Otherwise add this character to the list of
	14081	* multi-char folds. */
	14082	if (! RExC_in_multi_char_class) {
	14083	AV** this_array_ptr;
	14084	AV* this_array;
	14085	STRLEN cp_count = utf8_length(foldbuf,
	14086	foldbuf + foldlen);
	14087	SV* multi_fold = sv_2mortal(newSVpvs(""));
	14088
	14089	Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
	14090
	14091
	14092	if (! multi_char_matches) {
	14093	multi_char_matches = newAV();
	14094	}
	14095
	14096	/* <multi_char_matches> is actually an array of arrays.
	14097	* There will be one or two top-level elements: [2],
	14098	* and/or [3]. The [2] element is an array, each
	14099	* element thereof is a character which folds to TWO
	14100	* characters; [3] is for folds to THREE characters.
	14101	* (Unicode guarantees a maximum of 3 characters in any
	14102	* fold.) When we rewrite the character class below,
	14103	* we will do so such that the longest folds are
	14104	* written first, so that it prefers the longest
	14105	* matching strings first. This is done even if it
	14106	* turns out that any quantifier is non-greedy, out of
	14107	* programmer laziness. Tom Christiansen has agreed
	14108	* that this is ok. This makes the test for the
	14109	* ligature 'ffi' come before the test for 'ff' */
	14110	if (av_exists(multi_char_matches, cp_count)) {
	14111	this_array_ptr = (AV**) av_fetch(multi_char_matches,
	14112	cp_count, FALSE);
	14113	this_array = *this_array_ptr;
	14114	}
	14115	else {
	14116	this_array = newAV();
	14117	av_store(multi_char_matches, cp_count,
	14118	(SV*) this_array);
	14119	}
	14120	av_push(this_array, multi_fold);
	14121	}
	14122
	14123	/* This element should not be processed further in this
	14124	* class */
	14125	element_count--;
	14126	value = save_value;
	14127	prevvalue = save_prevvalue;
	14128	continue;
	14129	}
	14130	}
	14131	}
	14132
	14133	/* Deal with this element of the class */
	14134	if (! SIZE_ONLY) {
	14135	#ifndef EBCDIC
	14136	cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
	14137	prevvalue, value);
	14138	#else
	14139	SV* this_range = _new_invlist(1);
	14140	_append_range_to_invlist(this_range, prevvalue, value);
	14141
	14142	/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
	14143	* If this range was specified using something like 'i-j', we want
	14144	* to include only the 'i' and the 'j', and not anything in
	14145	* between, so exclude non-ASCII, non-alphabetics from it.
	14146	* However, if the range was specified with something like
	14147	* [\x89-\x91] or [\x89-j], all code points within it should be
	14148	* included. literal_endpoint==2 means both ends of the range used
	14149	* a literal character, not \x{foo} */
	14150	if (literal_endpoint == 2
	14151	&& ((prevvalue >= 'a' && value <= 'z')
	14152	\|\| (prevvalue >= 'A' && value <= 'Z')))
	14153	{
	14154	_invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
	14155	&this_range);
	14156
	14157	/* Since this above only contains ascii, the intersection of it
	14158	* with anything will still yield only ascii */
	14159	_invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
	14160	&this_range);
	14161	}
	14162	_invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
	14163	literal_endpoint = 0;
	14164	#endif
	14165	}
	14166
	14167	range = 0; /* this range (if it was one) is done now */
	14168	} /* End of loop through all the text within the brackets */
	14169
	14170	/* If anything in the class expands to more than one character, we have to
	14171	* deal with them by building up a substitute parse string, and recursively
	14172	* calling reg() on it, instead of proceeding */
	14173	if (multi_char_matches) {
	14174	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
	14175	I32 cp_count;
	14176	STRLEN len;
	14177	char *save_end = RExC_end;
	14178	char *save_parse = RExC_parse;
	14179	bool first_time = TRUE; /* First multi-char occurrence doesn't get
	14180	a "\|" */
	14181	I32 reg_flags;
	14182
	14183	assert(! invert);
	14184	#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
	14185	because too confusing */
	14186	if (invert) {
	14187	sv_catpv(substitute_parse, "(?:");
	14188	}
	14189	#endif
	14190
	14191	/* Look at the longest folds first */
	14192	for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
	14193
	14194	if (av_exists(multi_char_matches, cp_count)) {
	14195	AV** this_array_ptr;
	14196	SV* this_sequence;
	14197
	14198	this_array_ptr = (AV**) av_fetch(multi_char_matches,
	14199	cp_count, FALSE);
	14200	while ((this_sequence = av_pop(*this_array_ptr)) !=
	14201	&PL_sv_undef)
	14202	{
	14203	if (! first_time) {
	14204	sv_catpv(substitute_parse, "\|");
	14205	}
	14206	first_time = FALSE;
	14207
	14208	sv_catpv(substitute_parse, SvPVX(this_sequence));
	14209	}
	14210	}
	14211	}
	14212
	14213	/* If the character class contains anything else besides these
	14214	* multi-character folds, have to include it in recursive parsing */
	14215	if (element_count) {
	14216	sv_catpv(substitute_parse, "\|[");
	14217	sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
	14218	sv_catpv(substitute_parse, "]");
	14219	}
	14220
	14221	sv_catpv(substitute_parse, ")");
	14222	#if 0
	14223	if (invert) {
	14224	/* This is a way to get the parse to skip forward a whole named
	14225	* sequence instead of matching the 2nd character when it fails the
	14226	* first */
	14227	sv_catpv(substitute_parse, "(THEN)(SKIP)(*FAIL)\|.)");
	14228	}
	14229	#endif
	14230
	14231	RExC_parse = SvPV(substitute_parse, len);
	14232	RExC_end = RExC_parse + len;
	14233	RExC_in_multi_char_class = 1;
	14234	RExC_emit = (regnode *)orig_emit;
	14235
	14236	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
	14237
	14238	*flagp \|= reg_flags&(HASWIDTH\|SIMPLE\|SPSTART\|POSTPONED\|RESTART_UTF8);
	14239
	14240	RExC_parse = save_parse;
	14241	RExC_end = save_end;
	14242	RExC_in_multi_char_class = 0;
	14243	SvREFCNT_dec_NN(multi_char_matches);
	14244	return ret;
	14245	}
	14246
	14247	/* Here, we've gone through the entire class and dealt with multi-char
	14248	* folds. We are now in a position that we can do some checks to see if we
	14249	* can optimize this ANYOF node into a simpler one, even in Pass 1.
	14250	* Currently we only do two checks:
	14251	* 1) is in the unlikely event that the user has specified both, eg. \w and
	14252	* \W under /l, then the class matches everything. (This optimization
	14253	* is done only to make the optimizer code run later work.)
	14254	* 2) if the character class contains only a single element (including a
	14255	* single range), we see if there is an equivalent node for it.
	14256	* Other checks are possible */
	14257	if (! ret_invlist /* Can't optimize if returning the constructed
	14258	inversion list */
	14259	&& (UNLIKELY(posixl_matches_all) \|\| element_count == 1))
	14260	{
	14261	U8 op = END;
	14262	U8 arg = 0;
	14263
	14264	if (UNLIKELY(posixl_matches_all)) {
	14265	op = SANY;
	14266	}
	14267	else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
	14268	\w or [:digit:] or \p{foo}
	14269	*/
	14270
	14271	/* All named classes are mapped into POSIXish nodes, with its FLAG
	14272	* argument giving which class it is */
	14273	switch ((I32)namedclass) {
	14274	case ANYOF_UNIPROP:
	14275	break;
	14276
	14277	/* These don't depend on the charset modifiers. They always
	14278	* match under /u rules */
	14279	case ANYOF_NHORIZWS:
	14280	case ANYOF_HORIZWS:
	14281	namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
	14282	/* FALLTHROUGH */
	14283
	14284	case ANYOF_NVERTWS:
	14285	case ANYOF_VERTWS:
	14286	op = POSIXU;
	14287	goto join_posix;
	14288
	14289	/* The actual POSIXish node for all the rest depends on the
	14290	* charset modifier. The ones in the first set depend only on
	14291	* ASCII or, if available on this platform, locale */
	14292	case ANYOF_ASCII:
	14293	case ANYOF_NASCII:
	14294	#ifdef HAS_ISASCII
	14295	op = (LOC) ? POSIXL : POSIXA;
	14296	#else
	14297	op = POSIXA;
	14298	#endif
	14299	goto join_posix;
	14300
	14301	case ANYOF_NCASED:
	14302	case ANYOF_LOWER:
	14303	case ANYOF_NLOWER:
	14304	case ANYOF_UPPER:
	14305	case ANYOF_NUPPER:
	14306	/* under /a could be alpha */
	14307	if (FOLD) {
	14308	if (ASCII_RESTRICTED) {
	14309	namedclass = ANYOF_ALPHA + (namedclass % 2);
	14310	}
	14311	else if (! LOC) {
	14312	break;
	14313	}
	14314	}
	14315	/* FALLTHROUGH */
	14316
	14317	/* The rest have more possibilities depending on the charset.
	14318	* We take advantage of the enum ordering of the charset
	14319	* modifiers to get the exact node type, */
	14320	default:
	14321	op = POSIXD + get_regex_charset(RExC_flags);
	14322	if (op > POSIXA) { /* /aa is same as /a */
	14323	op = POSIXA;
	14324	}
	14325
	14326	join_posix:
	14327	/* The odd numbered ones are the complements of the
	14328	* next-lower even number one */
	14329	if (namedclass % 2 == 1) {
	14330	invert = ! invert;
	14331	namedclass--;
	14332	}
	14333	arg = namedclass_to_classnum(namedclass);
	14334	break;
	14335	}
	14336	}
	14337	else if (value == prevvalue) {
	14338
	14339	/* Here, the class consists of just a single code point */
	14340
	14341	if (invert) {
	14342	if (! LOC && value == '\n') {
	14343	op = REG_ANY; /* Optimize [^\n] */
	14344	*flagp \|= HASWIDTH\|SIMPLE;
	14345	RExC_naughty++;
	14346	}
	14347	}
	14348	else if (value < 256 \|\| UTF) {
	14349
	14350	/* Optimize a single value into an EXACTish node, but not if it
	14351	* would require converting the pattern to UTF-8. */
	14352	op = compute_EXACTish(pRExC_state);
	14353	}
	14354	} /* Otherwise is a range */
	14355	else if (! LOC) { /* locale could vary these */
	14356	if (prevvalue == '0') {
	14357	if (value == '9') {
	14358	arg = _CC_DIGIT;
	14359	op = POSIXA;
	14360	}
	14361	}
	14362	else if (prevvalue == 'A') {
	14363	if (value == 'Z'
	14364	#ifdef EBCDIC
	14365	&& literal_endpoint == 2
	14366	#endif
	14367	) {
	14368	arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
	14369	op = POSIXA;
	14370	}
	14371	}
	14372	else if (prevvalue == 'a') {
	14373	if (value == 'z'
	14374	#ifdef EBCDIC
	14375	&& literal_endpoint == 2
	14376	#endif
	14377	) {
	14378	arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
	14379	op = POSIXA;
	14380	}
	14381	}
	14382	}
	14383
	14384	/* Here, we have changed <op> away from its initial value iff we found
	14385	* an optimization */
	14386	if (op != END) {
	14387
	14388	/* Throw away this ANYOF regnode, and emit the calculated one,
	14389	* which should correspond to the beginning, not current, state of
	14390	* the parse */
	14391	const char * cur_parse = RExC_parse;
	14392	RExC_parse = (char *)orig_parse;
	14393	if ( SIZE_ONLY) {
	14394	if (! LOC) {
	14395
	14396	/* To get locale nodes to not use the full ANYOF size would
	14397	* require moving the code above that writes the portions
	14398	* of it that aren't in other nodes to after this point.
	14399	* e.g. ANYOF_POSIXL_SET */
	14400	RExC_size = orig_size;
	14401	}
	14402	}
	14403	else {
	14404	RExC_emit = (regnode *)orig_emit;
	14405	if (PL_regkind[op] == POSIXD) {
	14406	if (op == POSIXL) {
	14407	RExC_contains_locale = 1;
	14408	}
	14409	if (invert) {
	14410	op += NPOSIXD - POSIXD;
	14411	}
	14412	}
	14413	}
	14414
	14415	ret = reg_node(pRExC_state, op);
	14416
	14417	if (PL_regkind[op] == POSIXD \|\| PL_regkind[op] == NPOSIXD) {
	14418	if (! SIZE_ONLY) {
	14419	FLAGS(ret) = arg;
	14420	}
	14421	*flagp \|= HASWIDTH\|SIMPLE;
	14422	}
	14423	else if (PL_regkind[op] == EXACT) {
	14424	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
	14425	TRUE /* downgradable to EXACT */
	14426	);
	14427	}
	14428
	14429	RExC_parse = (char *) cur_parse;
	14430
	14431	SvREFCNT_dec(posixes);
	14432	SvREFCNT_dec(nposixes);
	14433	SvREFCNT_dec(cp_list);
	14434	SvREFCNT_dec(cp_foldable_list);
	14435	return ret;
	14436	}
	14437	}
	14438
	14439	if (SIZE_ONLY)
	14440	return ret;
	14441	/**** !SIZE_ONLY (Pass 2) AFTER HERE *******/
	14442
	14443	/* If folding, we calculate all characters that could fold to or from the
	14444	* ones already on the list */
	14445	if (cp_foldable_list) {
	14446	if (FOLD) {
	14447	UV start, end; /* End points of code point ranges */
	14448
	14449	SV* fold_intersection = NULL;
	14450	SV** use_list;
	14451
	14452	/* Our calculated list will be for Unicode rules. For locale
	14453	* matching, we have to keep a separate list that is consulted at
	14454	* runtime only when the locale indicates Unicode rules. For
	14455	* non-locale, we just use to the general list */
	14456	if (LOC) {
	14457	use_list = &only_utf8_locale_list;
	14458	}
	14459	else {
	14460	use_list = &cp_list;
	14461	}
	14462
	14463	/* Only the characters in this class that participate in folds need
	14464	* be checked. Get the intersection of this class and all the
	14465	* possible characters that are foldable. This can quickly narrow
	14466	* down a large class */
	14467	_invlist_intersection(PL_utf8_foldable, cp_foldable_list,
	14468	&fold_intersection);
	14469
	14470	/* The folds for all the Latin1 characters are hard-coded into this
	14471	* program, but we have to go out to disk to get the others. */
	14472	if (invlist_highest(cp_foldable_list) >= 256) {
	14473
	14474	/* This is a hash that for a particular fold gives all
	14475	* characters that are involved in it */
	14476	if (! PL_utf8_foldclosures) {
	14477	_load_PL_utf8_foldclosures();
	14478	}
	14479	}
	14480
	14481	/* Now look at the foldable characters in this class individually */
	14482	invlist_iterinit(fold_intersection);
	14483	while (invlist_iternext(fold_intersection, &start, &end)) {
	14484	UV j;
	14485
	14486	/* Look at every character in the range */
	14487	for (j = start; j <= end; j++) {
	14488	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	14489	STRLEN foldlen;
	14490	SV** listp;
	14491
	14492	if (j < 256) {
	14493
	14494	if (IS_IN_SOME_FOLD_L1(j)) {
	14495
	14496	/* ASCII is always matched; non-ASCII is matched
	14497	* only under Unicode rules (which could happen
	14498	* under /l if the locale is a UTF-8 one */
	14499	if (isASCII(j) \|\| ! DEPENDS_SEMANTICS) {
	14500	use_list = add_cp_to_invlist(use_list,
	14501	PL_fold_latin1[j]);
	14502	}
	14503	else {
	14504	depends_list =
	14505	add_cp_to_invlist(depends_list,
	14506	PL_fold_latin1[j]);
	14507	}
	14508	}
	14509
	14510	if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
	14511	&& (! isASCII(j) \|\| ! ASCII_FOLD_RESTRICTED))
	14512	{
	14513	add_above_Latin1_folds(pRExC_state,
	14514	(U8) j,
	14515	use_list);
	14516	}
	14517	continue;
	14518	}
	14519
	14520	/* Here is an above Latin1 character. We don't have the
	14521	* rules hard-coded for it. First, get its fold. This is
	14522	* the simple fold, as the multi-character folds have been
	14523	* handled earlier and separated out */
	14524	_to_uni_fold_flags(j, foldbuf, &foldlen,
	14525	(ASCII_FOLD_RESTRICTED)
	14526	? FOLD_FLAGS_NOMIX_ASCII
	14527	: 0);
	14528
	14529	/* Single character fold of above Latin1. Add everything in
	14530	* its fold closure to the list that this node should match.
	14531	* The fold closures data structure is a hash with the keys
	14532	* being the UTF-8 of every character that is folded to, like
	14533	* 'k', and the values each an array of all code points that
	14534	* fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
	14535	* Multi-character folds are not included */
	14536	if ((listp = hv_fetch(PL_utf8_foldclosures,
	14537	(char *) foldbuf, foldlen, FALSE)))
	14538	{
	14539	AV* list = (AV) listp;
	14540	IV k;
	14541	for (k = 0; k <= av_tindex(list); k++) {
	14542	SV** c_p = av_fetch(list, k, FALSE);
	14543	UV c;
	14544	assert(c_p);
	14545
	14546	c = SvUV(*c_p);
	14547
	14548	/* /aa doesn't allow folds between ASCII and non- */
	14549	if ((ASCII_FOLD_RESTRICTED
	14550	&& (isASCII(c) != isASCII(j))))
	14551	{
	14552	continue;
	14553	}
	14554
	14555	/* Folds under /l which cross the 255/256 boundary
	14556	* are added to a separate list. (These are valid
	14557	* only when the locale is UTF-8.) */
	14558	if (c < 256 && LOC) {
	14559	use_list = add_cp_to_invlist(use_list, c);
	14560	continue;
	14561	}
	14562
	14563	if (isASCII(c) \|\| c > 255 \|\| AT_LEAST_UNI_SEMANTICS)
	14564	{
	14565	cp_list = add_cp_to_invlist(cp_list, c);
	14566	}
	14567	else {
	14568	/* Similarly folds involving non-ascii Latin1
	14569	* characters under /d are added to their list */
	14570	depends_list = add_cp_to_invlist(depends_list,
	14571	c);
	14572	}
	14573	}
	14574	}
	14575	}
	14576	}
	14577	SvREFCNT_dec_NN(fold_intersection);
	14578	}
	14579
	14580	/* Now that we have finished adding all the folds, there is no reason
	14581	* to keep the foldable list separate */
	14582	_invlist_union(cp_list, cp_foldable_list, &cp_list);
	14583	SvREFCNT_dec_NN(cp_foldable_list);
	14584	}
	14585
	14586	/* And combine the result (if any) with any inversion list from posix
	14587	* classes. The lists are kept separate up to now because we don't want to
	14588	* fold the classes (folding of those is automatically handled by the swash
	14589	* fetching code) */
	14590	if (posixes \|\| nposixes) {
	14591	if (posixes && AT_LEAST_ASCII_RESTRICTED) {
	14592	/* Under /a and /aa, nothing above ASCII matches these */
	14593	_invlist_intersection(posixes,
	14594	PL_XPosix_ptrs[_CC_ASCII],
	14595	&posixes);
	14596	}
	14597	if (nposixes) {
	14598	if (DEPENDS_SEMANTICS) {
	14599	/* Under /d, everything in the upper half of the Latin1 range
	14600	* matches these complements */
	14601	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_NON_ASCII_ALL;
	14602	}
	14603	else if (AT_LEAST_ASCII_RESTRICTED) {
	14604	/* Under /a and /aa, everything above ASCII matches these
	14605	* complements */
	14606	_invlist_union_complement_2nd(nposixes,
	14607	PL_XPosix_ptrs[_CC_ASCII],
	14608	&nposixes);
	14609	}
	14610	if (posixes) {
	14611	_invlist_union(posixes, nposixes, &posixes);
	14612	SvREFCNT_dec_NN(nposixes);
	14613	}
	14614	else {
	14615	posixes = nposixes;
	14616	}
	14617	}
	14618	if (! DEPENDS_SEMANTICS) {
	14619	if (cp_list) {
	14620	_invlist_union(cp_list, posixes, &cp_list);
	14621	SvREFCNT_dec_NN(posixes);
	14622	}
	14623	else {
	14624	cp_list = posixes;
	14625	}
	14626	}
	14627	else {
	14628	/* Under /d, we put into a separate list the Latin1 things that
	14629	* match only when the target string is utf8 */
	14630	SV* nonascii_but_latin1_properties = NULL;
	14631	_invlist_intersection(posixes, PL_UpperLatin1,
	14632	&nonascii_but_latin1_properties);
	14633	_invlist_subtract(posixes, nonascii_but_latin1_properties,
	14634	&posixes);
	14635	if (cp_list) {
	14636	_invlist_union(cp_list, posixes, &cp_list);
	14637	SvREFCNT_dec_NN(posixes);
	14638	}
	14639	else {
	14640	cp_list = posixes;
	14641	}
	14642
	14643	if (depends_list) {
	14644	_invlist_union(depends_list, nonascii_but_latin1_properties,
	14645	&depends_list);
	14646	SvREFCNT_dec_NN(nonascii_but_latin1_properties);
	14647	}
	14648	else {
	14649	depends_list = nonascii_but_latin1_properties;
	14650	}
	14651	}
	14652	}
	14653
	14654	/* And combine the result (if any) with any inversion list from properties.
	14655	* The lists are kept separate up to now so that we can distinguish the two
	14656	* in regards to matching above-Unicode. A run-time warning is generated
	14657	* if a Unicode property is matched against a non-Unicode code point. But,
	14658	* we allow user-defined properties to match anything, without any warning,
	14659	* and we also suppress the warning if there is a portion of the character
	14660	* class that isn't a Unicode property, and which matches above Unicode, \W
	14661	* or [\x{110000}] for example.
	14662	* (Note that in this case, unlike the Posix one above, there is no
	14663	* <depends_list>, because having a Unicode property forces Unicode
	14664	* semantics */
	14665	if (properties) {
	14666	if (cp_list) {
	14667
	14668	/* If it matters to the final outcome, see if a non-property
	14669	* component of the class matches above Unicode. If so, the
	14670	* warning gets suppressed. This is true even if just a single
	14671	* such code point is specified, as though not strictly correct if
	14672	* another such code point is matched against, the fact that they
	14673	* are using above-Unicode code points indicates they should know
	14674	* the issues involved */
	14675	if (warn_super) {
	14676	warn_super = ! (invert
	14677	^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
	14678	}
	14679
	14680	_invlist_union(properties, cp_list, &cp_list);
	14681	SvREFCNT_dec_NN(properties);
	14682	}
	14683	else {
	14684	cp_list = properties;
	14685	}
	14686
	14687	if (warn_super) {
	14688	ANYOF_FLAGS(ret) \|= ANYOF_WARN_SUPER;
	14689	}
	14690	}
	14691
	14692	/* Here, we have calculated what code points should be in the character
	14693	* class.
	14694	*
	14695	* Now we can see about various optimizations. Fold calculation (which we
	14696	* did above) needs to take place before inversion. Otherwise /[^k]/i
	14697	* would invert to include K, which under /i would match k, which it
	14698	* shouldn't. Therefore we can't invert folded locale now, as it won't be
	14699	* folded until runtime */
	14700
	14701	/* If we didn't do folding, it's because some information isn't available
	14702	* until runtime; set the run-time fold flag for these. (We don't have to
	14703	* worry about properties folding, as that is taken care of by the swash
	14704	* fetching). We know to set the flag if we have a non-NULL list for UTF-8
	14705	* locales, or the class matches at least one 0-255 range code point */
	14706	if (LOC && FOLD) {
	14707	if (only_utf8_locale_list) {
	14708	ANYOF_FLAGS(ret) \|= ANYOF_LOC_FOLD;
	14709	}
	14710	else if (cp_list) { /* Look to see if there a 0-255 code point is in
	14711	the list */
	14712	UV start, end;
	14713	invlist_iterinit(cp_list);
	14714	if (invlist_iternext(cp_list, &start, &end) && start < 256) {
	14715	ANYOF_FLAGS(ret) \|= ANYOF_LOC_FOLD;
	14716	}
	14717	invlist_iterfinish(cp_list);
	14718	}
	14719	}
	14720
	14721	/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
	14722	* at compile time. Besides not inverting folded locale now, we can't
	14723	* invert if there are things such as \w, which aren't known until runtime
	14724	* */
	14725	if (cp_list
	14726	&& invert
	14727	&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
	14728	&& ! depends_list
	14729	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	14730	{
	14731	_invlist_invert(cp_list);
	14732
	14733	/* Any swash can't be used as-is, because we've inverted things */
	14734	if (swash) {
	14735	SvREFCNT_dec_NN(swash);
	14736	swash = NULL;
	14737	}
	14738
	14739	/* Clear the invert flag since have just done it here */
	14740	invert = FALSE;
	14741	}
	14742
	14743	if (ret_invlist) {
	14744	*ret_invlist = cp_list;
	14745	SvREFCNT_dec(swash);
	14746
	14747	/* Discard the generated node */
	14748	if (SIZE_ONLY) {
	14749	RExC_size = orig_size;
	14750	}
	14751	else {
	14752	RExC_emit = orig_emit;
	14753	}
	14754	return orig_emit;
	14755	}
	14756
	14757	/* Some character classes are equivalent to other nodes. Such nodes take
	14758	* up less room and generally fewer operations to execute than ANYOF nodes.
	14759	* Above, we checked for and optimized into some such equivalents for
	14760	* certain common classes that are easy to test. Getting to this point in
	14761	* the code means that the class didn't get optimized there. Since this
	14762	* code is only executed in Pass 2, it is too late to save space--it has
	14763	* been allocated in Pass 1, and currently isn't given back. But turning
	14764	* things into an EXACTish node can allow the optimizer to join it to any
	14765	* adjacent such nodes. And if the class is equivalent to things like /./,
	14766	* expensive run-time swashes can be avoided. Now that we have more
	14767	* complete information, we can find things necessarily missed by the
	14768	* earlier code. I (khw) am not sure how much to look for here. It would
	14769	* be easy, but perhaps too slow, to check any candidates against all the
	14770	* node types they could possibly match using _invlistEQ(). */
	14771
	14772	if (cp_list
	14773	&& ! invert
	14774	&& ! depends_list
	14775	&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
	14776	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	14777
	14778	/* We don't optimize if we are supposed to make sure all non-Unicode
	14779	* code points raise a warning, as only ANYOF nodes have this check.
	14780	* */
	14781	&& ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
	14782	{
	14783	UV start, end;
	14784	U8 op = END; /* The optimzation node-type */
	14785	const char * cur_parse= RExC_parse;
	14786
	14787	invlist_iterinit(cp_list);
	14788	if (! invlist_iternext(cp_list, &start, &end)) {
	14789
	14790	/* Here, the list is empty. This happens, for example, when a
	14791	* Unicode property is the only thing in the character class, and
	14792	* it doesn't match anything. (perluniprops.pod notes such
	14793	* properties) */
	14794	op = OPFAIL;
	14795	*flagp \|= HASWIDTH\|SIMPLE;
	14796	}
	14797	else if (start == end) { /* The range is a single code point */
	14798	if (! invlist_iternext(cp_list, &start, &end)
	14799
	14800	/* Don't do this optimization if it would require changing
	14801	* the pattern to UTF-8 */
	14802	&& (start < 256 \|\| UTF))
	14803	{
	14804	/* Here, the list contains a single code point. Can optimize
	14805	* into an EXACTish node */
	14806
	14807	value = start;
	14808
	14809	if (! FOLD) {
	14810	op = EXACT;
	14811	}
	14812	else if (LOC) {
	14813
	14814	/* A locale node under folding with one code point can be
	14815	* an EXACTFL, as its fold won't be calculated until
	14816	* runtime */
	14817	op = EXACTFL;
	14818	}
	14819	else {
	14820
	14821	/* Here, we are generally folding, but there is only one
	14822	* code point to match. If we have to, we use an EXACT
	14823	* node, but it would be better for joining with adjacent
	14824	* nodes in the optimization pass if we used the same
	14825	* EXACTFish node that any such are likely to be. We can
	14826	* do this iff the code point doesn't participate in any
	14827	* folds. For example, an EXACTF of a colon is the same as
	14828	* an EXACT one, since nothing folds to or from a colon. */
	14829	if (value < 256) {
	14830	if (IS_IN_SOME_FOLD_L1(value)) {
	14831	op = EXACT;
	14832	}
	14833	}
	14834	else {
	14835	if (_invlist_contains_cp(PL_utf8_foldable, value)) {
	14836	op = EXACT;
	14837	}
	14838	}
	14839
	14840	/* If we haven't found the node type, above, it means we
	14841	* can use the prevailing one */
	14842	if (op == END) {
	14843	op = compute_EXACTish(pRExC_state);
	14844	}
	14845	}
	14846	}
	14847	}
	14848	else if (start == 0) {
	14849	if (end == UV_MAX) {
	14850	op = SANY;
	14851	*flagp \|= HASWIDTH\|SIMPLE;
	14852	RExC_naughty++;
	14853	}
	14854	else if (end == '\n' - 1
	14855	&& invlist_iternext(cp_list, &start, &end)
	14856	&& start == '\n' + 1 && end == UV_MAX)
	14857	{
	14858	op = REG_ANY;
	14859	*flagp \|= HASWIDTH\|SIMPLE;
	14860	RExC_naughty++;
	14861	}
	14862	}
	14863	invlist_iterfinish(cp_list);
	14864
	14865	if (op != END) {
	14866	RExC_parse = (char *)orig_parse;
	14867	RExC_emit = (regnode *)orig_emit;
	14868
	14869	ret = reg_node(pRExC_state, op);
	14870
	14871	RExC_parse = (char *)cur_parse;
	14872
	14873	if (PL_regkind[op] == EXACT) {
	14874	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
	14875	TRUE /* downgradable to EXACT */
	14876	);
	14877	}
	14878
	14879	SvREFCNT_dec_NN(cp_list);
	14880	return ret;
	14881	}
	14882	}
	14883
	14884	/* Here, <cp_list> contains all the code points we can determine at
	14885	* compile time that match under all conditions. Go through it, and
	14886	* for things that belong in the bitmap, put them there, and delete from
	14887	* <cp_list>. While we are at it, see if everything above 255 is in the
	14888	* list, and if so, set a flag to speed up execution */
	14889
	14890	populate_ANYOF_from_invlist(ret, &cp_list);
	14891
	14892	if (invert) {
	14893	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	14894	}
	14895
	14896	/* Here, the bitmap has been populated with all the Latin1 code points that
	14897	* always match. Can now add to the overall list those that match only
	14898	* when the target string is UTF-8 (<depends_list>). */
	14899	if (depends_list) {
	14900	if (cp_list) {
	14901	_invlist_union(cp_list, depends_list, &cp_list);
	14902	SvREFCNT_dec_NN(depends_list);
	14903	}
	14904	else {
	14905	cp_list = depends_list;
	14906	}
	14907	ANYOF_FLAGS(ret) \|= ANYOF_UTF8;
	14908	}
	14909
	14910	/* If there is a swash and more than one element, we can't use the swash in
	14911	* the optimization below. */
	14912	if (swash && element_count > 1) {
	14913	SvREFCNT_dec_NN(swash);
	14914	swash = NULL;
	14915	}
	14916
	14917	/* Note that the optimization of using 'swash' if it is the only thing in
	14918	* the class doesn't have us change swash at all, so it can include things
	14919	* that are also in the bitmap; otherwise we have purposely deleted that
	14920	* duplicate information */
	14921	set_ANYOF_arg(pRExC_state, ret, cp_list,
	14922	(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	14923	? listsv : NULL,
	14924	only_utf8_locale_list,
	14925	swash, has_user_defined_property);
	14926
	14927	*flagp \|= HASWIDTH\|SIMPLE;
	14928
	14929	if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
	14930	RExC_contains_locale = 1;
	14931	}
	14932
	14933	return ret;
	14934	}
	14935
	14936	#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	14937
	14938	STATIC void
	14939	S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
	14940	regnode* const node,
	14941	SV* const cp_list,
	14942	SV* const runtime_defns,
	14943	SV* const only_utf8_locale_list,
	14944	SV* const swash,
	14945	const bool has_user_defined_property)
	14946	{
	14947	/* Sets the arg field of an ANYOF-type node 'node', using information about
	14948	* the node passed-in. If there is nothing outside the node's bitmap, the
	14949	* arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
	14950	* the count returned by add_data(), having allocated and stored an array,
	14951	* av, that that count references, as follows:
	14952	* av[0] stores the character class description in its textual form.
	14953	* This is used later (regexec.c:Perl_regclass_swash()) to
	14954	* initialize the appropriate swash, and is also useful for dumping
	14955	* the regnode. This is set to &PL_sv_undef if the textual
	14956	* description is not needed at run-time (as happens if the other
	14957	* elements completely define the class)
	14958	* av[1] if &PL_sv_undef, is a placeholder to later contain the swash
	14959	* computed from av[0]. But if no further computation need be done,
	14960	* the swash is stored here now (and av[0] is &PL_sv_undef).
	14961	* av[2] stores the inversion list of code points that match only if the
	14962	* current locale is UTF-8
	14963	* av[3] stores the cp_list inversion list for use in addition or instead
	14964	* of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
	14965	* (Otherwise everything needed is already in av[0] and av[1])
	14966	* av[4] is set if any component of the class is from a user-defined
	14967	* property; used only if av[3] exists */
	14968
	14969	UV n;
	14970
	14971	PERL_ARGS_ASSERT_SET_ANYOF_ARG;
	14972
	14973	if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
	14974	assert(! (ANYOF_FLAGS(node)
	14975	& (ANYOF_UTF8\|ANYOF_NONBITMAP_NON_UTF8)));
	14976	ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
	14977	}
	14978	else {
	14979	AV * const av = newAV();
	14980	SV *rv;
	14981
	14982	assert(ANYOF_FLAGS(node)
	14983	& (ANYOF_UTF8\|ANYOF_NONBITMAP_NON_UTF8\|ANYOF_LOC_FOLD));
	14984
	14985	av_store(av, 0, (runtime_defns)
	14986	? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
	14987	if (swash) {
	14988	assert(cp_list);
	14989	av_store(av, 1, swash);
	14990	SvREFCNT_dec_NN(cp_list);
	14991	}
	14992	else {
	14993	av_store(av, 1, &PL_sv_undef);
	14994	if (cp_list) {
	14995	av_store(av, 3, cp_list);
	14996	av_store(av, 4, newSVuv(has_user_defined_property));
	14997	}
	14998	}
	14999
	15000	if (only_utf8_locale_list) {
	15001	av_store(av, 2, only_utf8_locale_list);
	15002	}
	15003	else {
	15004	av_store(av, 2, &PL_sv_undef);
	15005	}
	15006
	15007	rv = newRV_noinc(MUTABLE_SV(av));
	15008	n = add_data(pRExC_state, STR_WITH_LEN("s"));
	15009	RExC_rxi->data->data[n] = (void*)rv;
	15010	ARG_SET(node, n);
	15011	}
	15012	}
	15013
	15014	#if !defined(PERL_IN_XSUB_RE) \|\| defined(PLUGGABLE_RE_EXTENSION)
	15015	SV *
	15016	Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
	15017	const regnode* node,
	15018	bool doinit,
	15019	SV** listsvp,
	15020	SV** only_utf8_locale_ptr,
	15021	SV* exclude_list)
	15022
	15023	{
	15024	/* For internal core use only.
	15025	* Returns the swash for the input 'node' in the regex 'prog'.
	15026	* If <doinit> is 'true', will attempt to create the swash if not already
	15027	* done.
	15028	* If <listsvp> is non-null, will return the printable contents of the
	15029	* swash. This can be used to get debugging information even before the
	15030	* swash exists, by calling this function with 'doinit' set to false, in
	15031	* which case the components that will be used to eventually create the
	15032	* swash are returned (in a printable form).
	15033	* If <exclude_list> is not NULL, it is an inversion list of things to
	15034	* exclude from what's returned in <listsvp>.
	15035	* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
	15036	* that, in spite of this function's name, the swash it returns may include
	15037	* the bitmap data as well */
	15038
	15039	SV *sw = NULL;
	15040	SV si = NULL; / Input swash initialization string */
	15041	SV* invlist = NULL;
	15042
	15043	RXi_GET_DECL(prog,progi);
	15044	const struct reg_data * const data = prog ? progi->data : NULL;
	15045
	15046	PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
	15047
	15048	assert(ANYOF_FLAGS(node)
	15049	& (ANYOF_UTF8\|ANYOF_NONBITMAP_NON_UTF8\|ANYOF_LOC_FOLD));
	15050
	15051	if (data && data->count) {
	15052	const U32 n = ARG(node);
	15053
	15054	if (data->what[n] == 's') {
	15055	SV * const rv = MUTABLE_SV(data->data[n]);
	15056	AV * const av = MUTABLE_AV(SvRV(rv));
	15057	SV **const ary = AvARRAY(av);
	15058	U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	15059
	15060	si = ary; / ary[0] = the string to initialize the swash with */
	15061
	15062	/* Elements 3 and 4 are either both present or both absent. [3] is
	15063	* any inversion list generated at compile time; [4] indicates if
	15064	* that inversion list has any user-defined properties in it. */
	15065	if (av_tindex(av) >= 2) {
	15066	if (only_utf8_locale_ptr
	15067	&& ary[2]
	15068	&& ary[2] != &PL_sv_undef)
	15069	{
	15070	*only_utf8_locale_ptr = ary[2];
	15071	}
	15072	else {
	15073	assert(only_utf8_locale_ptr);
	15074	*only_utf8_locale_ptr = NULL;
	15075	}
	15076
	15077	if (av_tindex(av) >= 3) {
	15078	invlist = ary[3];
	15079	if (SvUV(ary[4])) {
	15080	swash_init_flags \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	15081	}
	15082	}
	15083	else {
	15084	invlist = NULL;
	15085	}
	15086	}
	15087
	15088	/* Element [1] is reserved for the set-up swash. If already there,
	15089	* return it; if not, create it and store it there */
	15090	if (ary[1] && SvROK(ary[1])) {
	15091	sw = ary[1];
	15092	}
	15093	else if (doinit && ((si && si != &PL_sv_undef)
	15094	\|\| (invlist && invlist != &PL_sv_undef))) {
	15095	assert(si);
	15096	sw = _core_swash_init("utf8", /* the utf8 package */
	15097	"", /* nameless */
	15098	si,
	15099	1, /* binary */
	15100	0, /* not from tr/// */
	15101	invlist,
	15102	&swash_init_flags);
	15103	(void)av_store(av, 1, sw);
	15104	}
	15105	}
	15106	}
	15107
	15108	/* If requested, return a printable version of what this swash matches */
	15109	if (listsvp) {
	15110	SV* matches_string = newSVpvs("");
	15111
	15112	/* The swash should be used, if possible, to get the data, as it
	15113	* contains the resolved data. But this function can be called at
	15114	* compile-time, before everything gets resolved, in which case we
	15115	* return the currently best available information, which is the string
	15116	* that will eventually be used to do that resolving, 'si' */
	15117	if ((! sw \|\| (invlist = _get_swash_invlist(sw)) == NULL)
	15118	&& (si && si != &PL_sv_undef))
	15119	{
	15120	sv_catsv(matches_string, si);
	15121	}
	15122
	15123	/* Add the inversion list to whatever we have. This may have come from
	15124	* the swash, or from an input parameter */
	15125	if (invlist) {
	15126	if (exclude_list) {
	15127	SV* clone = invlist_clone(invlist);
	15128	_invlist_subtract(clone, exclude_list, &clone);
	15129	sv_catsv(matches_string, _invlist_contents(clone));
	15130	SvREFCNT_dec_NN(clone);
	15131	}
	15132	else {
	15133	sv_catsv(matches_string, _invlist_contents(invlist));
	15134	}
	15135	}
	15136	*listsvp = matches_string;
	15137	}
	15138
	15139	return sw;
	15140	}
	15141	#endif /* !defined(PERL_IN_XSUB_RE) \|\| defined(PLUGGABLE_RE_EXTENSION) */
	15142
	15143	/* reg_skipcomment()
	15144
	15145	Absorbs an /x style # comment from the input stream,
	15146	returning a pointer to the first character beyond the comment, or if the
	15147	comment terminates the pattern without anything following it, this returns
	15148	one past the final character of the pattern (in other words, RExC_end) and
	15149	sets the REG_RUN_ON_COMMENT_SEEN flag.
	15150
	15151	Note it's the callers responsibility to ensure that we are
	15152	actually in /x mode
	15153
	15154	*/
	15155
	15156	PERL_STATIC_INLINE char*
	15157	S_reg_skipcomment(RExC_state_t pRExC_state, char p)
	15158	{
	15159	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	15160
	15161	assert(*p == '#');
	15162
	15163	while (p < RExC_end) {
	15164	if (*(++p) == '\n') {
	15165	return p+1;
	15166	}
	15167	}
	15168
	15169	/* we ran off the end of the pattern without ending the comment, so we have
	15170	* to add an \n when wrapping */
	15171	RExC_seen \|= REG_RUN_ON_COMMENT_SEEN;
	15172	return p;
	15173	}
	15174
	15175	/* nextchar()
	15176
	15177	Advances the parse position, and optionally absorbs
	15178	"whitespace" from the inputstream.
	15179
	15180	Without /x "whitespace" means (?#...) style comments only,
	15181	with /x this means (?#...) and # comments and whitespace proper.
	15182
	15183	Returns the RExC_parse point from BEFORE the scan occurs.
	15184
	15185	This is the /x friendly way of saying RExC_parse++.
	15186	*/
	15187
	15188	STATIC char*
	15189	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	15190	{
	15191	char* const retval = RExC_parse++;
	15192
	15193	PERL_ARGS_ASSERT_NEXTCHAR;
	15194
	15195	for (;;) {
	15196	if (RExC_end - RExC_parse >= 3
	15197	&& *RExC_parse == '('
	15198	&& RExC_parse[1] == '?'
	15199	&& RExC_parse[2] == '#')
	15200	{
	15201	while (*RExC_parse != ')') {
	15202	if (RExC_parse == RExC_end)
	15203	FAIL("Sequence (?#... not terminated");
	15204	RExC_parse++;
	15205	}
	15206	RExC_parse++;
	15207	continue;
	15208	}
	15209	if (RExC_flags & RXf_PMf_EXTENDED) {
	15210	char * p = regpatws(pRExC_state, RExC_parse,
	15211	TRUE); /* means recognize comments */
	15212	if (p != RExC_parse) {
	15213	RExC_parse = p;
	15214	continue;
	15215	}
	15216	}
	15217	return retval;
	15218	}
	15219	}
	15220
	15221	/*
	15222	- reg_node - emit a node
	15223	*/
	15224	STATIC regnode * /* Location. */
	15225	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	15226	{
	15227	regnode *ptr;
	15228	regnode * const ret = RExC_emit;
	15229	GET_RE_DEBUG_FLAGS_DECL;
	15230
	15231	PERL_ARGS_ASSERT_REG_NODE;
	15232
	15233	if (SIZE_ONLY) {
	15234	SIZE_ALIGN(RExC_size);
	15235	RExC_size += 1;
	15236	return(ret);
	15237	}
	15238	if (RExC_emit >= RExC_emit_bound)
	15239	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	15240	op, (void)RExC_emit, (void)RExC_emit_bound);
	15241
	15242	NODE_ALIGN_FILL(ret);
	15243	ptr = ret;
	15244	FILL_ADVANCE_NODE(ptr, op);
	15245	#ifdef RE_TRACK_PATTERN_OFFSETS
	15246	if (RExC_offsets) { /* MJD */
	15247	MJD_OFFSET_DEBUG(
	15248	("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	15249	"reg_node", __LINE__,
	15250	PL_reg_name[op],
	15251	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	15252	? "Overwriting end of array!\n" : "OK",
	15253	(UV)(RExC_emit - RExC_emit_start),
	15254	(UV)(RExC_parse - RExC_start),
	15255	(UV)RExC_offsets[0]));
	15256	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	15257	}
	15258	#endif
	15259	RExC_emit = ptr;
	15260	return(ret);
	15261	}
	15262
	15263	/*
	15264	- reganode - emit a node with an argument
	15265	*/
	15266	STATIC regnode * /* Location. */
	15267	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	15268	{
	15269	regnode *ptr;
	15270	regnode * const ret = RExC_emit;
	15271	GET_RE_DEBUG_FLAGS_DECL;
	15272
	15273	PERL_ARGS_ASSERT_REGANODE;
	15274
	15275	if (SIZE_ONLY) {
	15276	SIZE_ALIGN(RExC_size);
	15277	RExC_size += 2;
	15278	/*
	15279	We can't do this:
	15280
	15281	assert(2==regarglen[op]+1);
	15282
	15283	Anything larger than this has to allocate the extra amount.
	15284	If we changed this to be:
	15285
	15286	RExC_size += (1 + regarglen[op]);
	15287
	15288	then it wouldn't matter. Its not clear what side effect
	15289	might come from that so its not done so far.
	15290	-- dmq
	15291	*/
	15292	return(ret);
	15293	}
	15294	if (RExC_emit >= RExC_emit_bound)
	15295	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	15296	op, (void)RExC_emit, (void)RExC_emit_bound);
	15297
	15298	NODE_ALIGN_FILL(ret);
	15299	ptr = ret;
	15300	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	15301	#ifdef RE_TRACK_PATTERN_OFFSETS
	15302	if (RExC_offsets) { /* MJD */
	15303	MJD_OFFSET_DEBUG(
	15304	("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	15305	"reganode",
	15306	__LINE__,
	15307	PL_reg_name[op],
	15308	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	15309	"Overwriting end of array!\n" : "OK",
	15310	(UV)(RExC_emit - RExC_emit_start),
	15311	(UV)(RExC_parse - RExC_start),
	15312	(UV)RExC_offsets[0]));
	15313	Set_Cur_Node_Offset;
	15314	}
	15315	#endif
	15316	RExC_emit = ptr;
	15317	return(ret);
	15318	}
	15319
	15320	/*
	15321	- reguni - emit (if appropriate) a Unicode character
	15322	*/
	15323	PERL_STATIC_INLINE STRLEN
	15324	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	15325	{
	15326	PERL_ARGS_ASSERT_REGUNI;
	15327
	15328	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	15329	}
	15330
	15331	/*
	15332	- reginsert - insert an operator in front of already-emitted operand
	15333	*
	15334	* Means relocating the operand.
	15335	*/
	15336	STATIC void
	15337	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	15338	{
	15339	regnode *src;
	15340	regnode *dst;
	15341	regnode *place;
	15342	const int offset = regarglen[(U8)op];
	15343	const int size = NODE_STEP_REGNODE + offset;
	15344	GET_RE_DEBUG_FLAGS_DECL;
	15345
	15346	PERL_ARGS_ASSERT_REGINSERT;
	15347	PERL_UNUSED_CONTEXT;
	15348	PERL_UNUSED_ARG(depth);
	15349	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	15350	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	15351	if (SIZE_ONLY) {
	15352	RExC_size += size;
	15353	return;
	15354	}
	15355
	15356	src = RExC_emit;
	15357	RExC_emit += size;
	15358	dst = RExC_emit;
	15359	if (RExC_open_parens) {
	15360	int paren;
	15361	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	15362	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	15363	if ( RExC_open_parens[paren] >= opnd ) {
	15364	/DEBUG_PARSE_FMT("open"," - %d",size);/
	15365	RExC_open_parens[paren] += size;
	15366	} else {
	15367	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	15368	}
	15369	if ( RExC_close_parens[paren] >= opnd ) {
	15370	/DEBUG_PARSE_FMT("close"," - %d",size);/
	15371	RExC_close_parens[paren] += size;
	15372	} else {
	15373	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	15374	}
	15375	}
	15376	}
	15377
	15378	while (src > opnd) {
	15379	StructCopy(--src, --dst, regnode);
	15380	#ifdef RE_TRACK_PATTERN_OFFSETS
	15381	if (RExC_offsets) { /* MJD 20010112 */
	15382	MJD_OFFSET_DEBUG(
	15383	("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	15384	"reg_insert",
	15385	__LINE__,
	15386	PL_reg_name[op],
	15387	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	15388	? "Overwriting end of array!\n" : "OK",
	15389	(UV)(src - RExC_emit_start),
	15390	(UV)(dst - RExC_emit_start),
	15391	(UV)RExC_offsets[0]));
	15392	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	15393	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	15394	}
	15395	#endif
	15396	}
	15397
	15398
	15399	place = opnd; /* Op node, where operand used to be. */
	15400	#ifdef RE_TRACK_PATTERN_OFFSETS
	15401	if (RExC_offsets) { /* MJD */
	15402	MJD_OFFSET_DEBUG(
	15403	("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	15404	"reginsert",
	15405	__LINE__,
	15406	PL_reg_name[op],
	15407	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	15408	? "Overwriting end of array!\n" : "OK",
	15409	(UV)(place - RExC_emit_start),
	15410	(UV)(RExC_parse - RExC_start),
	15411	(UV)RExC_offsets[0]));
	15412	Set_Node_Offset(place, RExC_parse);
	15413	Set_Node_Length(place, 1);
	15414	}
	15415	#endif
	15416	src = NEXTOPER(place);
	15417	FILL_ADVANCE_NODE(place, op);
	15418	Zero(src, offset, regnode);
	15419	}
	15420
	15421	/*
	15422	- regtail - set the next-pointer at the end of a node chain of p to val.
	15423	- SEE ALSO: regtail_study
	15424	*/
	15425	/* TODO: All three parms should be const */
	15426	STATIC void
	15427	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p,
	15428	const regnode *val,U32 depth)
	15429	{
	15430	regnode *scan;
	15431	GET_RE_DEBUG_FLAGS_DECL;
	15432
	15433	PERL_ARGS_ASSERT_REGTAIL;
	15434	#ifndef DEBUGGING
	15435	PERL_UNUSED_ARG(depth);
	15436	#endif
	15437
	15438	if (SIZE_ONLY)
	15439	return;
	15440
	15441	/* Find last node. */
	15442	scan = p;
	15443	for (;;) {
	15444	regnode * const temp = regnext(scan);
	15445	DEBUG_PARSE_r({
	15446	SV * const mysv=sv_newmortal();
	15447	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	15448	regprop(RExC_rx, mysv, scan, NULL);
	15449	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	15450	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	15451	(temp == NULL ? "->" : ""),
	15452	(temp == NULL ? PL_reg_name[OP(val)] : "")
	15453	);
	15454	});
	15455	if (temp == NULL)
	15456	break;
	15457	scan = temp;
	15458	}
	15459
	15460	if (reg_off_by_arg[OP(scan)]) {
	15461	ARG_SET(scan, val - scan);
	15462	}
	15463	else {
	15464	NEXT_OFF(scan) = val - scan;
	15465	}
	15466	}
	15467
	15468	#ifdef DEBUGGING
	15469	/*
	15470	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	15471	- Look for optimizable sequences at the same time.
	15472	- currently only looks for EXACT chains.
	15473
	15474	This is experimental code. The idea is to use this routine to perform
	15475	in place optimizations on branches and groups as they are constructed,
	15476	with the long term intention of removing optimization from study_chunk so
	15477	that it is purely analytical.
	15478
	15479	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	15480	to control which is which.
	15481
	15482	*/
	15483	/* TODO: All four parms should be const */
	15484
	15485	STATIC U8
	15486	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p,
	15487	const regnode *val,U32 depth)
	15488	{
	15489	regnode *scan;
	15490	U8 exact = PSEUDO;
	15491	#ifdef EXPERIMENTAL_INPLACESCAN
	15492	I32 min = 0;
	15493	#endif
	15494	GET_RE_DEBUG_FLAGS_DECL;
	15495
	15496	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	15497
	15498
	15499	if (SIZE_ONLY)
	15500	return exact;
	15501
	15502	/* Find last node. */
	15503
	15504	scan = p;
	15505	for (;;) {
	15506	regnode * const temp = regnext(scan);
	15507	#ifdef EXPERIMENTAL_INPLACESCAN
	15508	if (PL_regkind[OP(scan)] == EXACT) {
	15509	bool unfolded_multi_char; /* Unexamined in this routine */
	15510	if (join_exact(pRExC_state, scan, &min,
	15511	&unfolded_multi_char, 1, val, depth+1))
	15512	return EXACT;
	15513	}
	15514	#endif
	15515	if ( exact ) {
	15516	switch (OP(scan)) {
	15517	case EXACT:
	15518	case EXACTF:
	15519	case EXACTFA_NO_TRIE:
	15520	case EXACTFA:
	15521	case EXACTFU:
	15522	case EXACTFU_SS:
	15523	case EXACTFL:
	15524	if( exact == PSEUDO )
	15525	exact= OP(scan);
	15526	else if ( exact != OP(scan) )
	15527	exact= 0;
	15528	case NOTHING:
	15529	break;
	15530	default:
	15531	exact= 0;
	15532	}
	15533	}
	15534	DEBUG_PARSE_r({
	15535	SV * const mysv=sv_newmortal();
	15536	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	15537	regprop(RExC_rx, mysv, scan, NULL);
	15538	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	15539	SvPV_nolen_const(mysv),
	15540	REG_NODE_NUM(scan),
	15541	PL_reg_name[exact]);
	15542	});
	15543	if (temp == NULL)
	15544	break;
	15545	scan = temp;
	15546	}
	15547	DEBUG_PARSE_r({
	15548	SV * const mysv_val=sv_newmortal();
	15549	DEBUG_PARSE_MSG("");
	15550	regprop(RExC_rx, mysv_val, val, NULL);
	15551	PerlIO_printf(Perl_debug_log,
	15552	"~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	15553	SvPV_nolen_const(mysv_val),
	15554	(IV)REG_NODE_NUM(val),
	15555	(IV)(val - scan)
	15556	);
	15557	});
	15558	if (reg_off_by_arg[OP(scan)]) {
	15559	ARG_SET(scan, val - scan);
	15560	}
	15561	else {
	15562	NEXT_OFF(scan) = val - scan;
	15563	}
	15564
	15565	return exact;
	15566	}
	15567	#endif
	15568
	15569	/*
	15570	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	15571	*/
	15572	#ifdef DEBUGGING
	15573
	15574	static void
	15575	S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
	15576	{
	15577	int bit;
	15578	int set=0;
	15579
	15580	ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
	15581
	15582	for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
	15583	if (flags & (1<<bit)) {
	15584	if (!set++ && lead)
	15585	PerlIO_printf(Perl_debug_log, "%s",lead);
	15586	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
	15587	}
	15588	}
	15589	if (lead) {
	15590	if (set)
	15591	PerlIO_printf(Perl_debug_log, "\n");
	15592	else
	15593	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	15594	}
	15595	}
	15596
	15597	static void
	15598	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	15599	{
	15600	int bit;
	15601	int set=0;
	15602	regex_charset cs;
	15603
	15604	ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
	15605
	15606	for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
	15607	if (flags & (1<<bit)) {
	15608	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	15609	continue;
	15610	}
	15611	if (!set++ && lead)
	15612	PerlIO_printf(Perl_debug_log, "%s",lead);
	15613	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	15614	}
	15615	}
	15616	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	15617	if (!set++ && lead) {
	15618	PerlIO_printf(Perl_debug_log, "%s",lead);
	15619	}
	15620	switch (cs) {
	15621	case REGEX_UNICODE_CHARSET:
	15622	PerlIO_printf(Perl_debug_log, "UNICODE");
	15623	break;
	15624	case REGEX_LOCALE_CHARSET:
	15625	PerlIO_printf(Perl_debug_log, "LOCALE");
	15626	break;
	15627	case REGEX_ASCII_RESTRICTED_CHARSET:
	15628	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	15629	break;
	15630	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	15631	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	15632	break;
	15633	default:
	15634	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	15635	break;
	15636	}
	15637	}
	15638	if (lead) {
	15639	if (set)
	15640	PerlIO_printf(Perl_debug_log, "\n");
	15641	else
	15642	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	15643	}
	15644	}
	15645	#endif
	15646
	15647	void
	15648	Perl_regdump(pTHX_ const regexp *r)
	15649	{
	15650	#ifdef DEBUGGING
	15651	SV * const sv = sv_newmortal();
	15652	SV *dsv= sv_newmortal();
	15653	RXi_GET_DECL(r,ri);
	15654	GET_RE_DEBUG_FLAGS_DECL;
	15655
	15656	PERL_ARGS_ASSERT_REGDUMP;
	15657
	15658	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	15659
	15660	/* Header fields of interest. */
	15661	if (r->anchored_substr) {
	15662	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	15663	RE_SV_DUMPLEN(r->anchored_substr), 30);
	15664	PerlIO_printf(Perl_debug_log,
	15665	"anchored %s%s at %"IVdf" ",
	15666	s, RE_SV_TAIL(r->anchored_substr),
	15667	(IV)r->anchored_offset);
	15668	} else if (r->anchored_utf8) {
	15669	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	15670	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	15671	PerlIO_printf(Perl_debug_log,
	15672	"anchored utf8 %s%s at %"IVdf" ",
	15673	s, RE_SV_TAIL(r->anchored_utf8),
	15674	(IV)r->anchored_offset);
	15675	}
	15676	if (r->float_substr) {
	15677	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	15678	RE_SV_DUMPLEN(r->float_substr), 30);
	15679	PerlIO_printf(Perl_debug_log,
	15680	"floating %s%s at %"IVdf"..%"UVuf" ",
	15681	s, RE_SV_TAIL(r->float_substr),
	15682	(IV)r->float_min_offset, (UV)r->float_max_offset);
	15683	} else if (r->float_utf8) {
	15684	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	15685	RE_SV_DUMPLEN(r->float_utf8), 30);
	15686	PerlIO_printf(Perl_debug_log,
	15687	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	15688	s, RE_SV_TAIL(r->float_utf8),
	15689	(IV)r->float_min_offset, (UV)r->float_max_offset);
	15690	}
	15691	if (r->check_substr \|\| r->check_utf8)
	15692	PerlIO_printf(Perl_debug_log,
	15693	(const char *)
	15694	(r->check_substr == r->float_substr
	15695	&& r->check_utf8 == r->float_utf8
	15696	? "(checking floating" : "(checking anchored"));
	15697	if (r->intflags & PREGf_NOSCAN)
	15698	PerlIO_printf(Perl_debug_log, " noscan");
	15699	if (r->extflags & RXf_CHECK_ALL)
	15700	PerlIO_printf(Perl_debug_log, " isall");
	15701	if (r->check_substr \|\| r->check_utf8)
	15702	PerlIO_printf(Perl_debug_log, ") ");
	15703
	15704	if (ri->regstclass) {
	15705	regprop(r, sv, ri->regstclass, NULL);
	15706	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	15707	}
	15708	if (r->intflags & PREGf_ANCH) {
	15709	PerlIO_printf(Perl_debug_log, "anchored");
	15710	if (r->intflags & PREGf_ANCH_BOL)
	15711	PerlIO_printf(Perl_debug_log, "(BOL)");
	15712	if (r->intflags & PREGf_ANCH_MBOL)
	15713	PerlIO_printf(Perl_debug_log, "(MBOL)");
	15714	if (r->intflags & PREGf_ANCH_SBOL)
	15715	PerlIO_printf(Perl_debug_log, "(SBOL)");
	15716	if (r->intflags & PREGf_ANCH_GPOS)
	15717	PerlIO_printf(Perl_debug_log, "(GPOS)");
	15718	PerlIO_putc(Perl_debug_log, ' ');
	15719	}
	15720	if (r->intflags & PREGf_GPOS_SEEN)
	15721	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	15722	if (r->intflags & PREGf_SKIP)
	15723	PerlIO_printf(Perl_debug_log, "plus ");
	15724	if (r->intflags & PREGf_IMPLICIT)
	15725	PerlIO_printf(Perl_debug_log, "implicit ");
	15726	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	15727	if (r->extflags & RXf_EVAL_SEEN)
	15728	PerlIO_printf(Perl_debug_log, "with eval ");
	15729	PerlIO_printf(Perl_debug_log, "\n");
	15730	DEBUG_FLAGS_r({
	15731	regdump_extflags("r->extflags: ",r->extflags);
	15732	regdump_intflags("r->intflags: ",r->intflags);
	15733	});
	15734	#else
	15735	PERL_ARGS_ASSERT_REGDUMP;
	15736	PERL_UNUSED_CONTEXT;
	15737	PERL_UNUSED_ARG(r);
	15738	#endif /* DEBUGGING */
	15739	}
	15740
	15741	/*
	15742	- regprop - printable representation of opcode, with run time support
	15743	*/
	15744
	15745	void
	15746	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode o, const regmatch_info reginfo)
	15747	{
	15748	#ifdef DEBUGGING
	15749	int k;
	15750
	15751	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	15752	static const char * const anyofs[] = {
	15753	#if _CC_WORDCHAR != 0 \|\| _CC_DIGIT != 1 \|\| _CC_ALPHA != 2 \|\| _CC_LOWER != 3 \
	15754	\|\| _CC_UPPER != 4 \|\| _CC_PUNCT != 5 \|\| _CC_PRINT != 6 \
	15755	\|\| _CC_ALPHANUMERIC != 7 \|\| _CC_GRAPH != 8 \|\| _CC_CASED != 9 \
	15756	\|\| _CC_SPACE != 10 \|\| _CC_BLANK != 11 \|\| _CC_XDIGIT != 12 \
	15757	\|\| _CC_PSXSPC != 13 \|\| _CC_CNTRL != 14 \|\| _CC_ASCII != 15 \
	15758	\|\| _CC_VERTSPACE != 16
	15759	#error Need to adjust order of anyofs[]
	15760	#endif
	15761	"\\w",
	15762	"\\W",
	15763	"\\d",
	15764	"\\D",
	15765	"[:alpha:]",
	15766	"[:^alpha:]",
	15767	"[:lower:]",
	15768	"[:^lower:]",
	15769	"[:upper:]",
	15770	"[:^upper:]",
	15771	"[:punct:]",
	15772	"[:^punct:]",
	15773	"[:print:]",
	15774	"[:^print:]",
	15775	"[:alnum:]",
	15776	"[:^alnum:]",
	15777	"[:graph:]",
	15778	"[:^graph:]",
	15779	"[:cased:]",
	15780	"[:^cased:]",
	15781	"\\s",
	15782	"\\S",
	15783	"[:blank:]",
	15784	"[:^blank:]",
	15785	"[:xdigit:]",
	15786	"[:^xdigit:]",
	15787	"[:space:]",
	15788	"[:^space:]",
	15789	"[:cntrl:]",
	15790	"[:^cntrl:]",
	15791	"[:ascii:]",
	15792	"[:^ascii:]",
	15793	"\\v",
	15794	"\\V"
	15795	};
	15796	RXi_GET_DECL(prog,progi);
	15797	GET_RE_DEBUG_FLAGS_DECL;
	15798
	15799	PERL_ARGS_ASSERT_REGPROP;
	15800
	15801	sv_setpvs(sv, "");
	15802
	15803	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	15804	/* It would be nice to FAIL() here, but this may be called from
	15805	regexec.c, and it would be hard to supply pRExC_state. */
	15806	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
	15807	(int)OP(o), (int)REGNODE_MAX);
	15808	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	15809
	15810	k = PL_regkind[OP(o)];
	15811
	15812	if (k == EXACT) {
	15813	sv_catpvs(sv, " ");
	15814	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	15815	* is a crude hack but it may be the best for now since
	15816	* we have no flag "this EXACTish node was UTF-8"
	15817	* --jhi */
	15818	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	15819	PERL_PV_ESCAPE_UNI_DETECT \|
	15820	PERL_PV_ESCAPE_NONASCII \|
	15821	PERL_PV_PRETTY_ELLIPSES \|
	15822	PERL_PV_PRETTY_LTGT \|
	15823	PERL_PV_PRETTY_NOCLEAR
	15824	);
	15825	} else if (k == TRIE) {
	15826	/* print the details of the trie in dumpuntil instead, as
	15827	* progi->data isn't available here */
	15828	const char op = OP(o);
	15829	const U32 n = ARG(o);
	15830	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	15831	(reg_ac_data *)progi->data->data[n] :
	15832	NULL;
	15833	const reg_trie_data * const trie
	15834	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	15835
	15836	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	15837	DEBUG_TRIE_COMPILE_r(
	15838	Perl_sv_catpvf(aTHX_ sv,
	15839	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	15840	(UV)trie->startstate,
	15841	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	15842	(UV)trie->wordcount,
	15843	(UV)trie->minlen,
	15844	(UV)trie->maxlen,
	15845	(UV)TRIE_CHARCOUNT(trie),
	15846	(UV)trie->uniquecharcount
	15847	);
	15848	);
	15849	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	15850	sv_catpvs(sv, "[");
	15851	(void) put_charclass_bitmap_innards(sv,
	15852	(IS_ANYOF_TRIE(op))
	15853	? ANYOF_BITMAP(o)
	15854	: TRIE_BITMAP(trie),
	15855	NULL);
	15856	sv_catpvs(sv, "]");
	15857	}
	15858
	15859	} else if (k == CURLY) {
	15860	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	15861	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	15862	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	15863	}
	15864	else if (k == WHILEM && o->flags) /* Ordinal/of */
	15865	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	15866	else if (k == REF \|\| k == OPEN \|\| k == CLOSE
	15867	\|\| k == GROUPP \|\| OP(o)==ACCEPT)
	15868	{
	15869	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	15870	if ( RXp_PAREN_NAMES(prog) ) {
	15871	if ( k != REF \|\| (OP(o) < NREF)) {
	15872	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	15873	SV **name= av_fetch(list, ARG(o), 0 );
	15874	if (name)
	15875	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	15876	}
	15877	else {
	15878	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	15879	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	15880	I32 nums=(I32)SvPVX(sv_dat);
	15881	SV **name= av_fetch(list, nums[0], 0 );
	15882	I32 n;
	15883	if (name) {
	15884	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	15885	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	15886	(n ? "," : ""), (IV)nums[n]);
	15887	}
	15888	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	15889	}
	15890	}
	15891	}
	15892	if ( k == REF && reginfo) {
	15893	U32 n = ARG(o); /* which paren pair */
	15894	I32 ln = prog->offs[n].start;
	15895	if (prog->lastparen < n \|\| ln == -1)
	15896	Perl_sv_catpvf(aTHX_ sv, ": FAIL");
	15897	else if (ln == prog->offs[n].end)
	15898	Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
	15899	else {
	15900	const char *s = reginfo->strbeg + ln;
	15901	Perl_sv_catpvf(aTHX_ sv, ": ");
	15902	Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
	15903	PERL_PV_ESCAPE_UNI_DETECT\|PERL_PV_PRETTY_NOCLEAR\|PERL_PV_PRETTY_ELLIPSES\|PERL_PV_PRETTY_QUOTE );
	15904	}
	15905	}
	15906	} else if (k == GOSUB)
	15907	/* Paren and offset */
	15908	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
	15909	else if (k == VERB) {
	15910	if (!o->flags)
	15911	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	15912	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	15913	} else if (k == LOGICAL)
	15914	/* 2: embedded, otherwise 1 */
	15915	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
	15916	else if (k == ANYOF) {
	15917	const U8 flags = ANYOF_FLAGS(o);
	15918	int do_sep = 0;
	15919	SV* bitmap_invlist; /* Will hold what the bit map contains */
	15920
	15921
	15922	if (flags & ANYOF_LOCALE_FLAGS)
	15923	sv_catpvs(sv, "{loc}");
	15924	if (flags & ANYOF_LOC_FOLD)
	15925	sv_catpvs(sv, "{i}");
	15926	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	15927	if (flags & ANYOF_INVERT)
	15928	sv_catpvs(sv, "^");
	15929
	15930	/* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
	15931	* */
	15932	do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
	15933	&bitmap_invlist);
	15934
	15935	/* output any special charclass tests (used entirely under use
	15936	* locale) * */
	15937	if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
	15938	int i;
	15939	for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
	15940	if (ANYOF_POSIXL_TEST(o,i)) {
	15941	sv_catpv(sv, anyofs[i]);
	15942	do_sep = 1;
	15943	}
	15944	}
	15945	}
	15946
	15947	if ((flags & (ANYOF_ABOVE_LATIN1_ALL
	15948	\|ANYOF_UTF8
	15949	\|ANYOF_NONBITMAP_NON_UTF8
	15950	\|ANYOF_LOC_FOLD)))
	15951	{
	15952	if (do_sep) {
	15953	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
	15954	if (flags & ANYOF_INVERT)
	15955	/make sure the invert info is in each /
	15956	sv_catpvs(sv, "^");
	15957	}
	15958
	15959	if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
	15960	sv_catpvs(sv, "{non-utf8-latin1-all}");
	15961	}
	15962
	15963	/* output information about the unicode matching */
	15964	if (flags & ANYOF_ABOVE_LATIN1_ALL)
	15965	sv_catpvs(sv, "{unicode_all}");
	15966	else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
	15967	SV lv; / Set if there is something outside the bit map. */
	15968	bool byte_output = FALSE; /* If something in the bitmap has
	15969	been output */
	15970	SV *only_utf8_locale;
	15971
	15972	/* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
	15973	* is used to guarantee that nothing in the bitmap gets
	15974	* returned */
	15975	(void) _get_regclass_nonbitmap_data(prog, o, FALSE,
	15976	&lv, &only_utf8_locale,
	15977	bitmap_invlist);
	15978	if (lv && lv != &PL_sv_undef) {
	15979	char *s = savesvpv(lv);
	15980	char * const origs = s;
	15981
	15982	while (s && s != '\n')
	15983	s++;
	15984
	15985	if (*s == '\n') {
	15986	const char * const t = ++s;
	15987
	15988	if (flags & ANYOF_NONBITMAP_NON_UTF8) {
	15989	sv_catpvs(sv, "{outside bitmap}");
	15990	}
	15991	else {
	15992	sv_catpvs(sv, "{utf8}");
	15993	}
	15994
	15995	if (byte_output) {
	15996	sv_catpvs(sv, " ");
	15997	}
	15998
	15999	while (*s) {
	16000	if (*s == '\n') {
	16001
	16002	/* Truncate very long output */
	16003	if (s - origs > 256) {
	16004	Perl_sv_catpvf(aTHX_ sv,
	16005	"%.*s...",
	16006	(int) (s - origs - 1),
	16007	t);
	16008	goto out_dump;
	16009	}
	16010	*s = ' ';
	16011	}
	16012	else if (*s == '\t') {
	16013	*s = '-';
	16014	}
	16015	s++;
	16016	}
	16017	if (s[-1] == ' ')
	16018	s[-1] = 0;
	16019
	16020	sv_catpv(sv, t);
	16021	}
	16022
	16023	out_dump:
	16024
	16025	Safefree(origs);
	16026	SvREFCNT_dec_NN(lv);
	16027	}
	16028
	16029	if ((flags & ANYOF_LOC_FOLD)
	16030	&& only_utf8_locale
	16031	&& only_utf8_locale != &PL_sv_undef)
	16032	{
	16033	UV start, end;
	16034	int max_entries = 256;
	16035
	16036	sv_catpvs(sv, "{utf8 locale}");
	16037	invlist_iterinit(only_utf8_locale);
	16038	while (invlist_iternext(only_utf8_locale,
	16039	&start, &end)) {
	16040	put_range(sv, start, end, FALSE);
	16041	max_entries --;
	16042	if (max_entries < 0) {
	16043	sv_catpvs(sv, "...");
	16044	break;
	16045	}
	16046	}
	16047	invlist_iterfinish(only_utf8_locale);
	16048	}
	16049	}
	16050	}
	16051	SvREFCNT_dec(bitmap_invlist);
	16052
	16053
	16054	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	16055	}
	16056	else if (k == POSIXD \|\| k == NPOSIXD) {
	16057	U8 index = FLAGS(o) * 2;
	16058	if (index < C_ARRAY_LENGTH(anyofs)) {
	16059	if (*anyofs[index] != '[') {
	16060	sv_catpv(sv, "[");
	16061	}
	16062	sv_catpv(sv, anyofs[index]);
	16063	if (*anyofs[index] != '[') {
	16064	sv_catpv(sv, "]");
	16065	}
	16066	}
	16067	else {
	16068	Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
	16069	}
	16070	}
	16071	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	16072	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	16073	#else
	16074	PERL_UNUSED_CONTEXT;
	16075	PERL_UNUSED_ARG(sv);
	16076	PERL_UNUSED_ARG(o);
	16077	PERL_UNUSED_ARG(prog);
	16078	PERL_UNUSED_ARG(reginfo);
	16079	#endif /* DEBUGGING */
	16080	}
	16081
	16082
	16083
	16084	SV *
	16085	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	16086	{ /* Assume that RE_INTUIT is set */
	16087	struct regexp *const prog = ReANY(r);
	16088	GET_RE_DEBUG_FLAGS_DECL;
	16089
	16090	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	16091	PERL_UNUSED_CONTEXT;
	16092
	16093	DEBUG_COMPILE_r(
	16094	{
	16095	const char * const s = SvPV_nolen_const(prog->check_substr
	16096	? prog->check_substr : prog->check_utf8);
	16097
	16098	if (!PL_colorset) reginitcolors();
	16099	PerlIO_printf(Perl_debug_log,
	16100	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	16101	PL_colors[4],
	16102	prog->check_substr ? "" : "utf8 ",
	16103	PL_colors[5],PL_colors[0],
	16104	s,
	16105	PL_colors[1],
	16106	(strlen(s) > 60 ? "..." : ""));
	16107	} );
	16108
	16109	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	16110	}
	16111
	16112	/*
	16113	pregfree()
	16114
	16115	handles refcounting and freeing the perl core regexp structure. When
	16116	it is necessary to actually free the structure the first thing it
	16117	does is call the 'free' method of the regexp_engine associated to
	16118	the regexp, allowing the handling of the void *pprivate; member
	16119	first. (This routine is not overridable by extensions, which is why
	16120	the extensions free is called first.)
	16121
	16122	See regdupe and regdupe_internal if you change anything here.
	16123	*/
	16124	#ifndef PERL_IN_XSUB_RE
	16125	void
	16126	Perl_pregfree(pTHX_ REGEXP *r)
	16127	{
	16128	SvREFCNT_dec(r);
	16129	}
	16130
	16131	void
	16132	Perl_pregfree2(pTHX_ REGEXP *rx)
	16133	{
	16134	struct regexp *const r = ReANY(rx);
	16135	GET_RE_DEBUG_FLAGS_DECL;
	16136
	16137	PERL_ARGS_ASSERT_PREGFREE2;
	16138
	16139	if (r->mother_re) {
	16140	ReREFCNT_dec(r->mother_re);
	16141	} else {
	16142	CALLREGFREE_PVT(rx); /* free the private data */
	16143	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	16144	Safefree(r->xpv_len_u.xpvlenu_pv);
	16145	}
	16146	if (r->substrs) {
	16147	SvREFCNT_dec(r->anchored_substr);
	16148	SvREFCNT_dec(r->anchored_utf8);
	16149	SvREFCNT_dec(r->float_substr);
	16150	SvREFCNT_dec(r->float_utf8);
	16151	Safefree(r->substrs);
	16152	}
	16153	RX_MATCH_COPY_FREE(rx);
	16154	#ifdef PERL_ANY_COW
	16155	SvREFCNT_dec(r->saved_copy);
	16156	#endif
	16157	Safefree(r->offs);
	16158	SvREFCNT_dec(r->qr_anoncv);
	16159	rx->sv_u.svu_rx = 0;
	16160	}
	16161
	16162	/* reg_temp_copy()
	16163
	16164	This is a hacky workaround to the structural issue of match results
	16165	being stored in the regexp structure which is in turn stored in
	16166	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	16167	could be PL_curpm in multiple contexts, and could require multiple
	16168	result sets being associated with the pattern simultaneously, such
	16169	as when doing a recursive match with (??{$qr})
	16170
	16171	The solution is to make a lightweight copy of the regexp structure
	16172	when a qr// is returned from the code executed by (??{$qr}) this
	16173	lightweight copy doesn't actually own any of its data except for
	16174	the starp/end and the actual regexp structure itself.
	16175
	16176	*/
	16177
	16178
	16179	REGEXP *
	16180	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	16181	{
	16182	struct regexp *ret;
	16183	struct regexp *const r = ReANY(rx);
	16184	const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
	16185
	16186	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	16187
	16188	if (!ret_x)
	16189	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	16190	else {
	16191	SvOK_off((SV *)ret_x);
	16192	if (islv) {
	16193	/* For PVLVs, SvANY points to the xpvlv body while sv_u points
	16194	to the regexp. (For SVt_REGEXPs, sv_upgrade has already
	16195	made both spots point to the same regexp body.) */
	16196	REGEXP temp = (REGEXP )newSV_type(SVt_REGEXP);
	16197	assert(!SvPVX(ret_x));
	16198	ret_x->sv_u.svu_rx = temp->sv_any;
	16199	temp->sv_any = NULL;
	16200	SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) \| SVt_NULL;
	16201	SvREFCNT_dec_NN(temp);
	16202	/* SvCUR still resides in the xpvlv struct, so the regexp copy-
	16203	ing below will not set it. */
	16204	SvCUR_set(ret_x, SvCUR(rx));
	16205	}
	16206	}
	16207	/* This ensures that SvTHINKFIRST(sv) is true, and hence that
	16208	sv_force_normal(sv) is called. */
	16209	SvFAKE_on(ret_x);
	16210	ret = ReANY(ret_x);
	16211
	16212	SvFLAGS(ret_x) \|= SvUTF8(rx);
	16213	/* We share the same string buffer as the original regexp, on which we
	16214	hold a reference count, incremented when mother_re is set below.
	16215	The string pointer is copied here, being part of the regexp struct.
	16216	*/
	16217	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	16218	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	16219	if (r->offs) {
	16220	const I32 npar = r->nparens+1;
	16221	Newx(ret->offs, npar, regexp_paren_pair);
	16222	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	16223	}
	16224	if (r->substrs) {
	16225	Newx(ret->substrs, 1, struct reg_substr_data);
	16226	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	16227
	16228	SvREFCNT_inc_void(ret->anchored_substr);
	16229	SvREFCNT_inc_void(ret->anchored_utf8);
	16230	SvREFCNT_inc_void(ret->float_substr);
	16231	SvREFCNT_inc_void(ret->float_utf8);
	16232
	16233	/* check_substr and check_utf8, if non-NULL, point to either their
	16234	anchored or float namesakes, and don't hold a second reference. */
	16235	}
	16236	RX_MATCH_COPIED_off(ret_x);
	16237	#ifdef PERL_ANY_COW
	16238	ret->saved_copy = NULL;
	16239	#endif
	16240	ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
	16241	SvREFCNT_inc_void(ret->qr_anoncv);
	16242
	16243	return ret_x;
	16244	}
	16245	#endif
	16246
	16247	/* regfree_internal()
	16248
	16249	Free the private data in a regexp. This is overloadable by
	16250	extensions. Perl takes care of the regexp structure in pregfree(),
	16251	this covers the *pprivate pointer which technically perl doesn't
	16252	know about, however of course we have to handle the
	16253	regexp_internal structure when no extension is in use.
	16254
	16255	Note this is called before freeing anything in the regexp
	16256	structure.
	16257	*/
	16258
	16259	void
	16260	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	16261	{
	16262	struct regexp *const r = ReANY(rx);
	16263	RXi_GET_DECL(r,ri);
	16264	GET_RE_DEBUG_FLAGS_DECL;
	16265
	16266	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	16267
	16268	DEBUG_COMPILE_r({
	16269	if (!PL_colorset)
	16270	reginitcolors();
	16271	{
	16272	SV *dsv= sv_newmortal();
	16273	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	16274	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	16275	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	16276	PL_colors[4],PL_colors[5],s);
	16277	}
	16278	});
	16279	#ifdef RE_TRACK_PATTERN_OFFSETS
	16280	if (ri->u.offsets)
	16281	Safefree(ri->u.offsets); /* 20010421 MJD */
	16282	#endif
	16283	if (ri->code_blocks) {
	16284	int n;
	16285	for (n = 0; n < ri->num_code_blocks; n++)
	16286	SvREFCNT_dec(ri->code_blocks[n].src_regex);
	16287	Safefree(ri->code_blocks);
	16288	}
	16289
	16290	if (ri->data) {
	16291	int n = ri->data->count;
	16292
	16293	while (--n >= 0) {
	16294	/* If you add a ->what type here, update the comment in regcomp.h */
	16295	switch (ri->data->what[n]) {
	16296	case 'a':
	16297	case 'r':
	16298	case 's':
	16299	case 'S':
	16300	case 'u':
	16301	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	16302	break;
	16303	case 'f':
	16304	Safefree(ri->data->data[n]);
	16305	break;
	16306	case 'l':
	16307	case 'L':
	16308	break;
	16309	case 'T':
	16310	{ /* Aho Corasick add-on structure for a trie node.
	16311	Used in stclass optimization only */
	16312	U32 refcount;
	16313	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	16314	#ifdef USE_ITHREADS
	16315	dVAR;
	16316	#endif
	16317	OP_REFCNT_LOCK;
	16318	refcount = --aho->refcount;
	16319	OP_REFCNT_UNLOCK;
	16320	if ( !refcount ) {
	16321	PerlMemShared_free(aho->states);
	16322	PerlMemShared_free(aho->fail);
	16323	/* do this last!!!! */
	16324	PerlMemShared_free(ri->data->data[n]);
	16325	/* we should only ever get called once, so
	16326	* assert as much, and also guard the free
	16327	* which /might/ happen twice. At the least
	16328	* it will make code anlyzers happy and it
	16329	* doesn't cost much. - Yves */
	16330	assert(ri->regstclass);
	16331	if (ri->regstclass) {
	16332	PerlMemShared_free(ri->regstclass);
	16333	ri->regstclass = 0;
	16334	}
	16335	}
	16336	}
	16337	break;
	16338	case 't':
	16339	{
	16340	/* trie structure. */
	16341	U32 refcount;
	16342	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	16343	#ifdef USE_ITHREADS
	16344	dVAR;
	16345	#endif
	16346	OP_REFCNT_LOCK;
	16347	refcount = --trie->refcount;
	16348	OP_REFCNT_UNLOCK;
	16349	if ( !refcount ) {
	16350	PerlMemShared_free(trie->charmap);
	16351	PerlMemShared_free(trie->states);
	16352	PerlMemShared_free(trie->trans);
	16353	if (trie->bitmap)
	16354	PerlMemShared_free(trie->bitmap);
	16355	if (trie->jump)
	16356	PerlMemShared_free(trie->jump);
	16357	PerlMemShared_free(trie->wordinfo);
	16358	/* do this last!!!! */
	16359	PerlMemShared_free(ri->data->data[n]);
	16360	}
	16361	}
	16362	break;
	16363	default:
	16364	Perl_croak(aTHX_ "panic: regfree data code '%c'",
	16365	ri->data->what[n]);
	16366	}
	16367	}
	16368	Safefree(ri->data->what);
	16369	Safefree(ri->data);
	16370	}
	16371
	16372	Safefree(ri);
	16373	}
	16374
	16375	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	16376	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	16377	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	16378
	16379	/*
	16380	re_dup - duplicate a regexp.
	16381
	16382	This routine is expected to clone a given regexp structure. It is only
	16383	compiled under USE_ITHREADS.
	16384
	16385	After all of the core data stored in struct regexp is duplicated
	16386	the regexp_engine.dupe method is used to copy any private data
	16387	stored in the *pprivate pointer. This allows extensions to handle
	16388	any duplication it needs to do.
	16389
	16390	See pregfree() and regfree_internal() if you change anything here.
	16391	*/
	16392	#if defined(USE_ITHREADS)
	16393	#ifndef PERL_IN_XSUB_RE
	16394	void
	16395	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	16396	{
	16397	dVAR;
	16398	I32 npar;
	16399	const struct regexp *r = ReANY(sstr);
	16400	struct regexp *ret = ReANY(dstr);
	16401
	16402	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	16403
	16404	npar = r->nparens+1;
	16405	Newx(ret->offs, npar, regexp_paren_pair);
	16406	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	16407
	16408	if (ret->substrs) {
	16409	/* Do it this way to avoid reading from *r after the StructCopy().
	16410	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	16411	cache, it doesn't matter. */
	16412	const bool anchored = r->check_substr
	16413	? r->check_substr == r->anchored_substr
	16414	: r->check_utf8 == r->anchored_utf8;
	16415	Newx(ret->substrs, 1, struct reg_substr_data);
	16416	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	16417
	16418	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	16419	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	16420	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	16421	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	16422
	16423	/* check_substr and check_utf8, if non-NULL, point to either their
	16424	anchored or float namesakes, and don't hold a second reference. */
	16425
	16426	if (ret->check_substr) {
	16427	if (anchored) {
	16428	assert(r->check_utf8 == r->anchored_utf8);
	16429	ret->check_substr = ret->anchored_substr;
	16430	ret->check_utf8 = ret->anchored_utf8;
	16431	} else {
	16432	assert(r->check_substr == r->float_substr);
	16433	assert(r->check_utf8 == r->float_utf8);
	16434	ret->check_substr = ret->float_substr;
	16435	ret->check_utf8 = ret->float_utf8;
	16436	}
	16437	} else if (ret->check_utf8) {
	16438	if (anchored) {
	16439	ret->check_utf8 = ret->anchored_utf8;
	16440	} else {
	16441	ret->check_utf8 = ret->float_utf8;
	16442	}
	16443	}
	16444	}
	16445
	16446	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	16447	ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
	16448
	16449	if (ret->pprivate)
	16450	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	16451
	16452	if (RX_MATCH_COPIED(dstr))
	16453	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	16454	else
	16455	ret->subbeg = NULL;
	16456	#ifdef PERL_ANY_COW
	16457	ret->saved_copy = NULL;
	16458	#endif
	16459
	16460	/* Whether mother_re be set or no, we need to copy the string. We
	16461	cannot refrain from copying it when the storage points directly to
	16462	our mother regexp, because that's
	16463	1: a buffer in a different thread
	16464	2: something we no longer hold a reference on
	16465	so we need to copy it locally. */
	16466	RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
	16467	ret->mother_re = NULL;
	16468	}
	16469	#endif /* PERL_IN_XSUB_RE */
	16470
	16471	/*
	16472	regdupe_internal()
	16473
	16474	This is the internal complement to regdupe() which is used to copy
	16475	the structure pointed to by the *pprivate pointer in the regexp.
	16476	This is the core version of the extension overridable cloning hook.
	16477	The regexp structure being duplicated will be copied by perl prior
	16478	to this and will be provided as the regexp *r argument, however
	16479	with the /old/ structures pprivate pointer value. Thus this routine
	16480	may override any copying normally done by perl.
	16481
	16482	It returns a pointer to the new regexp_internal structure.
	16483	*/
	16484
	16485	void *
	16486	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	16487	{
	16488	dVAR;
	16489	struct regexp *const r = ReANY(rx);
	16490	regexp_internal *reti;
	16491	int len;
	16492	RXi_GET_DECL(r,ri);
	16493
	16494	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	16495
	16496	len = ProgLen(ri);
	16497
	16498	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
	16499	char, regexp_internal);
	16500	Copy(ri->program, reti->program, len+1, regnode);
	16501
	16502	reti->num_code_blocks = ri->num_code_blocks;
	16503	if (ri->code_blocks) {
	16504	int n;
	16505	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
	16506	struct reg_code_block);
	16507	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
	16508	struct reg_code_block);
	16509	for (n = 0; n < ri->num_code_blocks; n++)
	16510	reti->code_blocks[n].src_regex = (REGEXP*)
	16511	sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
	16512	}
	16513	else
	16514	reti->code_blocks = NULL;
	16515
	16516	reti->regstclass = NULL;
	16517
	16518	if (ri->data) {
	16519	struct reg_data *d;
	16520	const int count = ri->data->count;
	16521	int i;
	16522
	16523	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	16524	char, struct reg_data);
	16525	Newx(d->what, count, U8);
	16526
	16527	d->count = count;
	16528	for (i = 0; i < count; i++) {
	16529	d->what[i] = ri->data->what[i];
	16530	switch (d->what[i]) {
	16531	/* see also regcomp.h and regfree_internal() */
	16532	case 'a': /* actually an AV, but the dup function is identical. */
	16533	case 'r':
	16534	case 's':
	16535	case 'S':
	16536	case 'u': /* actually an HV, but the dup function is identical. */
	16537	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	16538	break;
	16539	case 'f':
	16540	/* This is cheating. */
	16541	Newx(d->data[i], 1, regnode_ssc);
	16542	StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
	16543	reti->regstclass = (regnode*)d->data[i];
	16544	break;
	16545	case 'T':
	16546	/* Trie stclasses are readonly and can thus be shared
	16547	* without duplication. We free the stclass in pregfree
	16548	* when the corresponding reg_ac_data struct is freed.
	16549	*/
	16550	reti->regstclass= ri->regstclass;
	16551	/* FALLTHROUGH */
	16552	case 't':
	16553	OP_REFCNT_LOCK;
	16554	((reg_trie_data*)ri->data->data[i])->refcount++;
	16555	OP_REFCNT_UNLOCK;
	16556	/* FALLTHROUGH */
	16557	case 'l':
	16558	case 'L':
	16559	d->data[i] = ri->data->data[i];
	16560	break;
	16561	default:
	16562	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
	16563	ri->data->what[i]);
	16564	}
	16565	}
	16566
	16567	reti->data = d;
	16568	}
	16569	else
	16570	reti->data = NULL;
	16571
	16572	reti->name_list_idx = ri->name_list_idx;
	16573
	16574	#ifdef RE_TRACK_PATTERN_OFFSETS
	16575	if (ri->u.offsets) {
	16576	Newx(reti->u.offsets, 2*len+1, U32);
	16577	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	16578	}
	16579	#else
	16580	SetProgLen(reti,len);
	16581	#endif
	16582
	16583	return (void*)reti;
	16584	}
	16585
	16586	#endif /* USE_ITHREADS */
	16587
	16588	#ifndef PERL_IN_XSUB_RE
	16589
	16590	/*
	16591	- regnext - dig the "next" pointer out of a node
	16592	*/
	16593	regnode *
	16594	Perl_regnext(pTHX_ regnode *p)
	16595	{
	16596	I32 offset;
	16597
	16598	if (!p)
	16599	return(NULL);
	16600
	16601	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	16602	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
	16603	(int)OP(p), (int)REGNODE_MAX);
	16604	}
	16605
	16606	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	16607	if (offset == 0)
	16608	return(NULL);
	16609
	16610	return(p+offset);
	16611	}
	16612	#endif
	16613
	16614	STATIC void
	16615	S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
	16616	{
	16617	va_list args;
	16618	STRLEN l1 = strlen(pat1);
	16619	STRLEN l2 = strlen(pat2);
	16620	char buf[512];
	16621	SV *msv;
	16622	const char *message;
	16623
	16624	PERL_ARGS_ASSERT_RE_CROAK2;
	16625
	16626	if (l1 > 510)
	16627	l1 = 510;
	16628	if (l1 + l2 > 510)
	16629	l2 = 510 - l1;
	16630	Copy(pat1, buf, l1 , char);
	16631	Copy(pat2, buf + l1, l2 , char);
	16632	buf[l1 + l2] = '\n';
	16633	buf[l1 + l2 + 1] = '\0';
	16634	va_start(args, pat2);
	16635	msv = vmess(buf, &args);
	16636	va_end(args);
	16637	message = SvPV_const(msv,l1);
	16638	if (l1 > 512)
	16639	l1 = 512;
	16640	Copy(message, buf, l1 , char);
	16641	/* l1-1 to avoid \n */
	16642	Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
	16643	}
	16644
	16645	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	16646
	16647	#ifndef PERL_IN_XSUB_RE
	16648	void
	16649	Perl_save_re_context(pTHX)
	16650	{
	16651	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	16652	if (PL_curpm) {
	16653	const REGEXP * const rx = PM_GETRE(PL_curpm);
	16654	if (rx) {
	16655	U32 i;
	16656	for (i = 1; i <= RX_NPARENS(rx); i++) {
	16657	char digits[TYPE_CHARS(long)];
	16658	const STRLEN len = my_snprintf(digits, sizeof(digits),
	16659	"%lu", (long)i);
	16660	GV const const gvp
	16661	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	16662
	16663	if (gvp) {
	16664	GV * const gv = *gvp;
	16665	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	16666	save_scalar(gv);
	16667	}
	16668	}
	16669	}
	16670	}
	16671	}
	16672	#endif
	16673
	16674	#ifdef DEBUGGING
	16675
	16676	/* Given that c is a control character, is it one for which we have a
	16677	* mnemonic? */
	16678	#define isMNEMONIC_CNTRL(c) ((isSPACE_A(c) && (c) != '\v') \
	16679	\|\| (c) == '\a' \
	16680	\|\| (c) == '\b' \
	16681	\|\| (c) == ESC_NATIVE)
	16682	/* Certain characters are output as a sequence with the first being a
	16683	* backslash. */
	16684	#define isBACKSLASHED_PUNCT(c) \
	16685	((c) == '-' \|\| (c) == ']' \|\| (c) == '\\' \|\| (c) == '^')
	16686
	16687	STATIC void
	16688	S_put_byte(pTHX_ SV *sv, int c)
	16689	{
	16690	PERL_ARGS_ASSERT_PUT_BYTE;
	16691
	16692	if (!isPRINT(c)) {
	16693	switch (c) {
	16694	case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
	16695	case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break;
	16696	case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break;
	16697	case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
	16698	case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
	16699	case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
	16700	case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
	16701	default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", c); break;
	16702	}
	16703	}
	16704	else {
	16705	const char string = c;
	16706	if (isBACKSLASHED_PUNCT(c))
	16707	sv_catpvs(sv, "\\");
	16708	sv_catpvn(sv, &string, 1);
	16709	}
	16710	}
	16711
	16712	#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
	16713
	16714	#ifndef MIN
	16715	#define MIN(a,b) ((a) < (b) ? (a) : (b))
	16716	#endif
	16717
	16718	STATIC void
	16719	S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
	16720	{
	16721	/* Appends to 'sv' a displayable version of the range of code points from
	16722	* 'start' to 'end'. It assumes that only ASCII printables are displayable
	16723	* as-is (though some of these will be escaped by put_byte()). */
	16724
	16725	const unsigned int min_range_count = 3;
	16726
	16727	assert(start <= end);
	16728
	16729	PERL_ARGS_ASSERT_PUT_RANGE;
	16730
	16731	while (start <= end) {
	16732	if (end - start < min_range_count) {
	16733
	16734	/* Individual chars in short ranges */
	16735	for (; start <= end; start++) {
	16736	put_byte(sv, start);
	16737	}
	16738	break;
	16739	}
	16740
	16741	/* If permitted by the input options, and there is a possibility that
	16742	* this range contains a printable literal, look to see if there is
	16743	* one. */
	16744	if (allow_literals && start <= MAX_PRINT_A) {
	16745
	16746	/* If the range begin isn't an ASCII printable, effectively split
	16747	* the range into two parts:
	16748	* 1) the portion before the first such printable,
	16749	* 2) the rest
	16750	* and output them separately. */
	16751	if (! isPRINT_A(start)) {
	16752	UV temp_end = start + 1;
	16753
	16754	/* There is no point looking beyond the final possible
	16755	* printable, in MAX_PRINT_A */
	16756	UV max = MIN(end, MAX_PRINT_A);
	16757
	16758	while (temp_end <= max && ! isPRINT_A(temp_end)) {
	16759	temp_end++;
	16760	}
	16761
	16762	/* Here, temp_end points to one beyond the first printable if
	16763	* found, or to one beyond 'max' if not. If none found, make
	16764	* sure that we use the entire range */
	16765	if (temp_end > MAX_PRINT_A) {
	16766	temp_end = end + 1;
	16767	}
	16768
	16769	/* Output the first part of the split range, the part that
	16770	* doesn't have printables, with no looking for literals
	16771	* (otherwise we would infinitely recurse) */
	16772	put_range(sv, start, temp_end - 1, FALSE);
	16773
	16774	/* The 2nd part of the range (if any) starts here. */
	16775	start = temp_end;
	16776
	16777	/* We continue instead of dropping down because even if the 2nd
	16778	* part is non-empty, it could be so short that we want to
	16779	* output it specially, as tested for at the top of this loop.
	16780	* */
	16781	continue;
	16782	}
	16783
	16784	/* Here, 'start' is a printable ASCII. If it is an alphanumeric,
	16785	* output a sub-range of just the digits or letters, then process
	16786	* the remaining portion as usual. */
	16787	if (isALPHANUMERIC_A(start)) {
	16788	UV mask = (isDIGIT_A(start))
	16789	? _CC_DIGIT
	16790	: isUPPER_A(start)
	16791	? _CC_UPPER
	16792	: _CC_LOWER;
	16793	UV temp_end = start + 1;
	16794
	16795	/* Find the end of the sub-range that includes just the
	16796	* characters in the same class as the first character in it */
	16797	while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
	16798	temp_end++;
	16799	}
	16800	temp_end--;
	16801
	16802	/* For short ranges, don't duplicate the code above to output
	16803	* them; just call recursively */
	16804	if (temp_end - start < min_range_count) {
	16805	put_range(sv, start, temp_end, FALSE);
	16806	}
	16807	else { /* Output as a range */
	16808	put_byte(sv, start);
	16809	sv_catpvs(sv, "-");
	16810	put_byte(sv, temp_end);
	16811	}
	16812	start = temp_end + 1;
	16813	continue;
	16814	}
	16815
	16816	/* We output any other printables as individual characters */
	16817	if (isPUNCT_A(start) \|\| isSPACE_A(start)) {
	16818	while (start <= end && (isPUNCT_A(start)
	16819	\|\| isSPACE_A(start)))
	16820	{
	16821	put_byte(sv, start);
	16822	start++;
	16823	}
	16824	continue;
	16825	}
	16826	} /* End of looking for literals */
	16827
	16828	/* Here is not to output as a literal. Some control characters have
	16829	* mnemonic names. Split off any of those at the beginning and end of
	16830	* the range to print mnemonically. It isn't possible for many of
	16831	* these to be in a row, so this won't overwhelm with output */
	16832	if (isMNEMONIC_CNTRL(start)) {
	16833	while (isMNEMONIC_CNTRL(start) && start <= end) {
	16834	put_byte(sv, start);
	16835	start++;
	16836	}
	16837	}
	16838	if (start < end && isMNEMONIC_CNTRL(end)) {
	16839
	16840	/* Here, the final character in the range has a mnemonic name.
	16841	* Work backwards from the end to find the final non-mnemonic */
	16842	UV temp_end = end - 1;
	16843	while (isMNEMONIC_CNTRL(temp_end)) {
	16844	temp_end--;
	16845	}
	16846
	16847	/* And separately output the range that doesn't have mnemonics */
	16848	put_range(sv, start, temp_end, FALSE);
	16849
	16850	/* Then output the mnemonic trailing controls */
	16851	start = temp_end + 1;
	16852	while (start <= end) {
	16853	put_byte(sv, start);
	16854	start++;
	16855	}
	16856	break;
	16857	}
	16858
	16859	/* As a final resort, output the range or subrange as hex. */
	16860	Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
	16861	start,
	16862	(end < NUM_ANYOF_CODE_POINTS)
	16863	? end
	16864	: NUM_ANYOF_CODE_POINTS - 1);
	16865	break;
	16866	}
	16867	}
	16868
	16869	STATIC bool
	16870	S_put_charclass_bitmap_innards(pTHX_ SV sv, char bitmap, SV** bitmap_invlist)
	16871	{
	16872	/* Appends to 'sv' a displayable version of the innards of the bracketed
	16873	* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
	16874	* output anything, and bitmap_invlist, if not NULL, will point to an
	16875	* inversion list of what is in the bit map */
	16876
	16877	int i;
	16878	UV start, end;
	16879	unsigned int punct_count = 0;
	16880	SV* invlist = NULL;
	16881	SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
	16882	bool allow_literals = TRUE;
	16883
	16884	PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
	16885
	16886	invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
	16887
	16888	/* Worst case is exactly every-other code point is in the list */
	16889	*invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
	16890
	16891	/* Convert the bit map to an inversion list, keeping track of how many
	16892	* ASCII puncts are set, including an extra amount for the backslashed
	16893	* ones. */
	16894	for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
	16895	if (BITMAP_TEST((U8 *) bitmap,i)) {
	16896	invlist_ptr = add_cp_to_invlist(invlist_ptr, i);
	16897	if (isPUNCT_A(i)) {
	16898	punct_count++;
	16899	if isBACKSLASHED_PUNCT(i) {
	16900	punct_count++;
	16901	}
	16902	}
	16903	}
	16904	}
	16905
	16906	/* Nothing to output */
	16907	if (_invlist_len(*invlist_ptr) == 0) {
	16908	SvREFCNT_dec(invlist);
	16909	return FALSE;
	16910	}
	16911
	16912	/* Generally, it is more readable if printable characters are output as
	16913	* literals, but if a range (nearly) spans all of them, it's best to output
	16914	* it as a single range. This code will use a single range if all but 2
	16915	* printables are in it */
	16916	invlist_iterinit(*invlist_ptr);
	16917	while (invlist_iternext(*invlist_ptr, &start, &end)) {
	16918
	16919	/* If range starts beyond final printable, it doesn't have any in it */
	16920	if (start > MAX_PRINT_A) {
	16921	break;
	16922	}
	16923
	16924	/* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
	16925	* all but two, the range must start and end no later than 2 from
	16926	* either end */
	16927	if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
	16928	if (end > MAX_PRINT_A) {
	16929	end = MAX_PRINT_A;
	16930	}
	16931	if (start < ' ') {
	16932	start = ' ';
	16933	}
	16934	if (end - start >= MAX_PRINT_A - ' ' - 2) {
	16935	allow_literals = FALSE;
	16936	}
	16937	break;
	16938	}
	16939	}
	16940	invlist_iterfinish(*invlist_ptr);
	16941
	16942	/* The legibility of the output depends mostly on how many punctuation
	16943	* characters are output. There are 32 possible ASCII ones, and some have
	16944	* an additional backslash, bringing it to currently 36, so if any more
	16945	* than 18 are to be output, we can instead output it as its complement,
	16946	* yielding fewer puncts, and making it more legible. But give some weight
	16947	* to the fact that outputting it as a complement is less legible than a
	16948	* straight output, so don't complement unless we are somewhat over the 18
	16949	* mark */
	16950	if (allow_literals && punct_count > 22) {
	16951	sv_catpvs(sv, "^");
	16952
	16953	/* Add everything remaining to the list, so when we invert it just
	16954	* below, it will be excluded */
	16955	invlist_ptr = _add_range_to_invlist(invlist_ptr,
	16956	NUM_ANYOF_CODE_POINTS, UV_MAX);
	16957	_invlist_invert(*invlist_ptr);
	16958	}
	16959
	16960	/* Here we have figured things out. Output each range */
	16961	invlist_iterinit(*invlist_ptr);
	16962	while (invlist_iternext(*invlist_ptr, &start, &end)) {
	16963	if (start >= NUM_ANYOF_CODE_POINTS) {
	16964	break;
	16965	}
	16966	put_range(sv, start, end, allow_literals);
	16967	}
	16968	invlist_iterfinish(*invlist_ptr);
	16969
	16970	return TRUE;
	16971	}
	16972
	16973	#define CLEAR_OPTSTART \
	16974	if (optstart) STMT_START { \
	16975	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
	16976	" (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	16977	optstart=NULL; \
	16978	} STMT_END
	16979
	16980	#define DUMPUNTIL(b,e) \
	16981	CLEAR_OPTSTART; \
	16982	node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	16983
	16984	STATIC const regnode *
	16985	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	16986	const regnode last, const regnode plast,
	16987	SV* sv, I32 indent, U32 depth)
	16988	{
	16989	U8 op = PSEUDO; /* Arbitrary non-END op. */
	16990	const regnode *next;
	16991	const regnode *optstart= NULL;
	16992
	16993	RXi_GET_DECL(r,ri);
	16994	GET_RE_DEBUG_FLAGS_DECL;
	16995
	16996	PERL_ARGS_ASSERT_DUMPUNTIL;
	16997
	16998	#ifdef DEBUG_DUMPUNTIL
	16999	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	17000	last ? last-start : 0,plast ? plast-start : 0);
	17001	#endif
	17002
	17003	if (plast && plast < last)
	17004	last= plast;
	17005
	17006	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	17007	assert(node);
	17008	/* While that wasn't END last time... */
	17009	NODE_ALIGN(node);
	17010	op = OP(node);
	17011	if (op == CLOSE \|\| op == WHILEM)
	17012	indent--;
	17013	next = regnext((regnode *)node);
	17014
	17015	/* Where, what. */
	17016	if (OP(node) == OPTIMIZED) {
	17017	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	17018	optstart = node;
	17019	else
	17020	goto after_print;
	17021	} else
	17022	CLEAR_OPTSTART;
	17023
	17024	regprop(r, sv, node, NULL);
	17025	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	17026	(int)(2*indent + 1), "", SvPVX_const(sv));
	17027
	17028	if (OP(node) != OPTIMIZED) {
	17029	if (next == NULL) /* Next ptr. */
	17030	PerlIO_printf(Perl_debug_log, " (0)");
	17031	else if (PL_regkind[(U8)op] == BRANCH
	17032	&& PL_regkind[OP(next)] != BRANCH )
	17033	PerlIO_printf(Perl_debug_log, " (FAIL)");
	17034	else
	17035	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	17036	(void)PerlIO_putc(Perl_debug_log, '\n');
	17037	}
	17038
	17039	after_print:
	17040	if (PL_regkind[(U8)op] == BRANCHJ) {
	17041	assert(next);
	17042	{
	17043	const regnode *nnode = (OP(next) == LONGJMP
	17044	? regnext((regnode *)next)
	17045	: next);
	17046	if (last && nnode > last)
	17047	nnode = last;
	17048	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	17049	}
	17050	}
	17051	else if (PL_regkind[(U8)op] == BRANCH) {
	17052	assert(next);
	17053	DUMPUNTIL(NEXTOPER(node), next);
	17054	}
	17055	else if ( PL_regkind[(U8)op] == TRIE ) {
	17056	const regnode *this_trie = node;
	17057	const char op = OP(node);
	17058	const U32 n = ARG(node);
	17059	const reg_ac_data * const ac = op>=AHOCORASICK ?
	17060	(reg_ac_data *)ri->data->data[n] :
	17061	NULL;
	17062	const reg_trie_data * const trie =
	17063	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	17064	#ifdef DEBUGGING
	17065	AV *const trie_words
	17066	= MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	17067	#endif
	17068	const regnode *nextbranch= NULL;
	17069	I32 word_idx;
	17070	sv_setpvs(sv, "");
	17071	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	17072	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	17073
	17074	PerlIO_printf(Perl_debug_log, "%*s%s ",
	17075	(int)(2*(indent+3)), "",
	17076	elem_ptr
	17077	? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
	17078	SvCUR(*elem_ptr), 60,
	17079	PL_colors[0], PL_colors[1],
	17080	(SvUTF8(*elem_ptr)
	17081	? PERL_PV_ESCAPE_UNI
	17082	: 0)
	17083	\| PERL_PV_PRETTY_ELLIPSES
	17084	\| PERL_PV_PRETTY_LTGT
	17085	)
	17086	: "???"
	17087	);
	17088	if (trie->jump) {
	17089	U16 dist= trie->jump[word_idx+1];
	17090	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	17091	(UV)((dist ? this_trie + dist : next) - start));
	17092	if (dist) {
	17093	if (!nextbranch)
	17094	nextbranch= this_trie + trie->jump[0];
	17095	DUMPUNTIL(this_trie + dist, nextbranch);
	17096	}
	17097	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	17098	nextbranch= regnext((regnode *)nextbranch);
	17099	} else {
	17100	PerlIO_printf(Perl_debug_log, "\n");
	17101	}
	17102	}
	17103	if (last && next > last)
	17104	node= last;
	17105	else
	17106	node= next;
	17107	}
	17108	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	17109	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	17110	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	17111	}
	17112	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	17113	assert(next);
	17114	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	17115	}
	17116	else if ( op == PLUS \|\| op == STAR) {
	17117	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	17118	}
	17119	else if (PL_regkind[(U8)op] == ANYOF) {
	17120	/* arglen 1 + class block */
	17121	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
	17122	? ANYOF_POSIXL_SKIP
	17123	: ANYOF_SKIP);
	17124	node = NEXTOPER(node);
	17125	}
	17126	else if (PL_regkind[(U8)op] == EXACT) {
	17127	/* Literal string, where present. */
	17128	node += NODE_SZ_STR(node) - 1;
	17129	node = NEXTOPER(node);
	17130	}
	17131	else {
	17132	node = NEXTOPER(node);
	17133	node += regarglen[(U8)op];
	17134	}
	17135	if (op == CURLYX \|\| op == OPEN)
	17136	indent++;
	17137	}
	17138	CLEAR_OPTSTART;
	17139	#ifdef DEBUG_DUMPUNTIL
	17140	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	17141	#endif
	17142	return node;
	17143	}
	17144
	17145	#endif /* DEBUGGING */
	17146
	17147	/*
	17148	* Local variables:
	17149	* c-indentation-style: bsd
	17150	* c-basic-offset: 4
	17151	* indent-tabs-mode: nil
	17152	* End:
	17153	*
	17154	* ex: set ts=8 sts=4 sw=4 et:
	17155	*/