perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* be confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89
	90	#ifdef op
	91	#undef op
	92	#endif /* op */
	93
	94	#ifdef MSDOS
	95	# if defined(BUGGY_MSC6)
	96	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	97	# pragma optimize("a",off)
	98	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	99	# pragma optimize("w",on )
	100	# endif /* BUGGY_MSC6 */
	101	#endif /* MSDOS */
	102
	/* Storage class used for this file's local functions.  Only defined here
	 * when nothing earlier has already defined STATIC. */
	#ifndef STATIC
	#define STATIC static
	#endif
	106
	107	typedef struct RExC_state_t {
	108	U32 flags; /* are we folding, multilining? */
	109	char precomp; / uncompiled string. */
	110	REGEXP rx_sv; / The SV that is the regexp. */
	111	regexp rx; / perl core regexp structure */
	112	regexp_internal rxi; / internal data for regexp object pprivate field */
	113	char start; / Start of input for compile */
	114	char end; / End of input for compile */
	115	char parse; / Input-scan pointer. */
	116	I32 whilem_seen; /* number of WHILEM in this expr */
	117	regnode emit_start; / Start of emitted-code area */
	118	regnode emit_bound; / First regnode outside of the allocated space */
	119	regnode emit; / Code-emit pointer; &regdummy = don't = compiling */
	120	I32 naughty; /* How bad is this pattern? */
	121	I32 sawback; /* Did we see \1, ...? */
	122	U32 seen;
	123	I32 size; /* Code size. */
	124	I32 npar; /* Capture buffer count, (OPEN). */
	125	I32 cpar; /* Capture buffer count, (CLOSE). */
	126	I32 nestroot; /* root parens we are in - used by accept */
	127	I32 extralen;
	128	I32 seen_zerolen;
	129	I32 seen_evals;
	130	regnode *open_parens; / pointers to open parens */
	131	regnode *close_parens; / pointers to close parens */
	132	regnode opend; / END node in program */
	133	I32 utf8; /* whether the pattern is utf8 or not */
	134	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	135	/* XXX use this for future optimisation of case
	136	* where pattern must be upgraded to utf8. */
	137	I32 uni_semantics; /* If a d charset modifier should use unicode
	138	rules, even if the pattern is not in
	139	utf8 */
	140	HV paren_names; / Paren names */
	141
	142	regnode *recurse; / Recurse regops */
	143	I32 recurse_count; /* Number of recurse regops */
	144	I32 in_lookbehind;
	145	I32 contains_locale;
	146	#if ADD_TO_REGEXEC
	147	char starttry; / -Dr: where regtry was called. */
	148	#define RExC_starttry (pRExC_state->starttry)
	149	#endif
	150	#ifdef DEBUGGING
	151	const char *lastparse;
	152	I32 lastnum;
	153	AV paren_name_list; / idx -> name */
	154	#define RExC_lastparse (pRExC_state->lastparse)
	155	#define RExC_lastnum (pRExC_state->lastnum)
	156	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	157	#endif
	158	} RExC_state_t;
	159
	/* Shorthand accessors for the members of the compile state.  Every macro
	 * expands to an lvalue reached through a variable named pRExC_state, so
	 * they can only be used where such a pointer is in scope. */
	#define RExC_flags (pRExC_state->flags)
	#define RExC_precomp (pRExC_state->precomp)
	#define RExC_rx_sv (pRExC_state->rx_sv)
	#define RExC_rx (pRExC_state->rx)
	#define RExC_rxi (pRExC_state->rxi)
	#define RExC_start (pRExC_state->start)
	#define RExC_end (pRExC_state->end)
	#define RExC_parse (pRExC_state->parse)
	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	#ifdef RE_TRACK_PATTERN_OFFSETS
	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	#endif
	#define RExC_emit (pRExC_state->emit)
	#define RExC_emit_start (pRExC_state->emit_start)
	#define RExC_emit_bound (pRExC_state->emit_bound)
	#define RExC_naughty (pRExC_state->naughty)
	#define RExC_sawback (pRExC_state->sawback)
	#define RExC_seen (pRExC_state->seen)
	#define RExC_size (pRExC_state->size)
	#define RExC_npar (pRExC_state->npar)
	#define RExC_nestroot (pRExC_state->nestroot)
	#define RExC_extralen (pRExC_state->extralen)
	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	#define RExC_seen_evals (pRExC_state->seen_evals)
	#define RExC_utf8 (pRExC_state->utf8)
	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	#define RExC_open_parens (pRExC_state->open_parens)
	#define RExC_close_parens (pRExC_state->close_parens)
	#define RExC_opend (pRExC_state->opend)
	#define RExC_paren_names (pRExC_state->paren_names)
	#define RExC_recurse (pRExC_state->recurse)
	#define RExC_recurse_count (pRExC_state->recurse_count)
	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	#define RExC_contains_locale (pRExC_state->contains_locale)
	195
	196
	/* True if the character c is one of the simple quantifiers '*', '+', '?'. */
	#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
	/* True if the text at pointer s starts a quantifier: '*', '+', '?', or a
	 * '{' for which regcurly(s) returns true.  s is evaluated more than once,
	 * so it must not have side effects. */
	#define ISMULT2(s) (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
	                    (*(s) == '{' && regcurly(s)))
	200
	#ifdef SPSTART
	#undef SPSTART /* dratted cpp namespace... */
	#endif
	/*
	 * Flags to be passed up and down.  Each non-zero value is a distinct bit,
	 * so the flags can be OR-ed together.
	 */
	#define WORST     0    /* Worst case. */
	#define HASWIDTH  0x01 /* Known to match non-null strings. */

	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	 * character, and if utf8, must be invariant.  Note that this is not the
	 * same thing as REGNODE_SIMPLE */
	#define SIMPLE    0x02
	#define SPSTART   0x04 /* Starts with * or +. */
	#define TRYAGAIN  0x08 /* Weeded out a declaration. */
	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	216
	/* Index of node pointer x relative to RExC_emit_start, or -1 when x is
	 * NULL.  x is evaluated twice. */
	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)

	/* whether trie related optimizations are enabled */
	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	#define TRIE_STUDY_OPT
	#define FULL_TRIE_STUDY
	#define TRIE_STCLASS
	#endif
	225
	226
	227
	/* Bit-vector helpers indexed by paren number: u8str is treated as an array
	 * of U8, bit (paren & 7) of byte (paren >> 3) belongs to that paren.
	 * PAREN_SET and PAREN_UNSET expand to assignment expressions that are not
	 * wrapped in parentheses, so use them as statements. */
	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	#define PBITVAL(paren) (1 << ((paren) & 7))
	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	233
	/* If not already in utf8, do a longjmp back to the beginning.
	 * REQUIRE_UTF8 does nothing when UTF is already true; otherwise it calls
	 * JMPENV_JUMP() with UTF8_LONGJMP as the value. */
	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	#define REQUIRE_UTF8 STMT_START { \
	        if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	    } STMT_END
	239
	240	/* About scan_data_t.
	241
	242	During optimisation we recurse through the regexp program performing
	243	various inplace (keyhole style) optimisations. In addition study_chunk
	244	and scan_commit populate this data structure with information about
	245	what strings MUST appear in the pattern. We look for the longest
	246	string that must appear at a fixed location, and we look for the
	247	longest string that may appear at a floating location. So for instance
	248	in the pattern:
	249
	250	/FOO[xX]A.*B[xX]BAR/
	251
	252	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	253	strings (because they follow a .* construct). study_chunk will identify
	254	both FOO and BAR as being the longest fixed and floating strings respectively.
	255
	256	The strings can be composites, for instance
	257
	258	/(f)(o)(o)/
	259
	260	will result in a composite fixed substring 'foo'.
	261
	262	For each string some basic information is maintained:
	263
	264	- offset or min_offset
	265	This is the position the string must appear at, or not before.
	266	It also implicitly (when combined with minlenp) tells us how many
	267	characters must match before the string we are searching for.
	268	Likewise when combined with minlenp and the length of the string it
	269	tells us how many characters must appear after the string we have
	270	found.
	271
	272	- max_offset
	273	Only used for floating strings. This is the rightmost point that
	274	the string can appear at. If set to I32 max it indicates that the
	275	string can occur infinitely far to the right.
	276
	277	- minlenp
	278	A pointer to the minimum length of the pattern that the string
	279	was found inside. This is important as in the case of positive
	280	lookahead or positive lookbehind we can have multiple patterns
	281	involved. Consider
	282
	283	/(?=FOO).*F/
	284
	285	The minimum length of the pattern overall is 3, the minimum length
	286	of the lookahead part is 3, but the minimum length of the part that
	287	will actually match is 1. So 'FOO's minimum length is 3, but the
	288	minimum length for the F is 1. This is important as the minimum length
	289	is used to determine offsets in front of and behind the string being
	290	looked for. Since strings can be composites this is the length of the
	291	pattern at the time it was committed with a scan_commit. Note that
	292	the length is calculated by study_chunk, so that the minimum lengths
	293	are not known until the full pattern has been compiled, thus the
	294	pointer to the value.
	295
	296	- lookbehind
	297
	298	In the case of lookbehind the string being searched for can be
	299	offset past the start point of the final matching string.
	300	If this value was just blithely removed from the min_offset it would
	301	invalidate some of the calculations for how many chars must match
	302	before or after (as they are derived from min_offset and minlen and
	303	the length of the string being searched for).
	304	When the final pattern is compiled and the data is moved from the
	305	scan_data_t structure into the regexp structure the information
	306	about lookbehind is factored in, with the information that would
	307	have been lost precalculated in the end_shift field for the
	308	associated string.
	309
	310	The fields pos_min and pos_delta are used to store the minimum offset
	311	and the delta to the maximum offset at the current point in the pattern.
	312
	313	*/
	314
	315	typedef struct scan_data_t {
	316	/I32 len_min; unused /
	317	/I32 len_delta; unused /
	318	I32 pos_min;
	319	I32 pos_delta;
	320	SV *last_found;
	321	I32 last_end; /* min value, <0 unless valid. */
	322	I32 last_start_min;
	323	I32 last_start_max;
	324	SV *longest; / Either &l_fixed, or &l_float. */
	325	SV longest_fixed; / longest fixed string found in pattern */
	326	I32 offset_fixed; /* offset where it starts */
	327	I32 minlen_fixed; / pointer to the minlen relevant to the string */
	328	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	329	SV longest_float; / longest floating string found in pattern */
	330	I32 offset_float_min; /* earliest point in string it can appear */
	331	I32 offset_float_max; /* latest point in string it can appear */
	332	I32 minlen_float; / pointer to the minlen relevant to the string */
	333	I32 lookbehind_float; /* is the position of the string modified by LB */
	334	I32 flags;
	335	I32 whilem_c;
	336	I32 *last_closep;
	337	struct regnode_charclass_class *start_class;
	338	} scan_data_t;
	339
	340	/*
	341	* Forward declarations for pregcomp()'s friends.
	342	*/
	343
	344	static const scan_data_t zero_scan_data =
	345	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	346
	/* Bits kept in scan_data_t.flags.  The two SF_BEFORE_* bits describe the
	 * string currently being collected; shifting them left by
	 * SF_FIX_SHIFT_EOL or SF_FL_SHIFT_EOL gives the corresponding bits for the
	 * longest fixed and longest floating string. */
	#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
	#define SF_BEFORE_SEOL 0x0001
	#define SF_BEFORE_MEOL 0x0002
	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)

	#ifdef NO_UNARY_PLUS
	# define SF_FIX_SHIFT_EOL (0+2)
	# define SF_FL_SHIFT_EOL (0+4)
	#else
	# define SF_FIX_SHIFT_EOL (+2)
	# define SF_FL_SHIFT_EOL (+4)
	#endif

	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)

	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	#define SF_IS_INF 0x0040
	#define SF_HAS_PAR 0x0080
	#define SF_IN_PAR 0x0100
	#define SF_HAS_EVAL 0x0200
	#define SCF_DO_SUBSTR 0x0400
	#define SCF_DO_STCLASS_AND 0x0800
	#define SCF_DO_STCLASS_OR 0x1000
	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
	#define SCF_WHILEM_VISITED_POS 0x2000

	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	#define SCF_SEEN_ACCEPT 0x8000
	378
	/* Predicates on the pattern being compiled.  All of them read the compile
	 * state through the RExC_* macros, so pRExC_state must be in scope. */
	#define UTF cBOOL(RExC_utf8)
	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)

	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)

	#define OOB_UNICODE 12345678
	#define OOB_NAMEDCLASS (-1)

	/* Length of sv, and distance from b to a, counted in characters when the
	 * pattern is utf8 and in bytes otherwise.  The arguments of CHR_DIST are
	 * parenthesised so that compound expressions such as (p, q + 1) subtract
	 * correctly. */
	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : ((a) - (b)))
	395
	396
	/* length of regex to show in messages that don't mark a position within */
	#define RegexLengthToShowInErrorMessages 127

	/*
	 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
	 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	 * op/pragma/warn/regcomp.
	 */
	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */

	/* printf-style tail appended to positional diagnostics.  It consumes three
	 * arguments: an int precision for "%.*s", the start of the pattern, and
	 * the text that follows the marked position. */
	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	409
	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	 * arg. Show regex, up to a maximum length. If it's too long, chop and add
	 * "...".
	 *
	 * 'code' is expanded inside a block where 'len' (number of pattern bytes
	 * to print) and 'ellipses' ("" or "...") are in scope.
	 */
	#define _FAIL(code) STMT_START { \
	    const char *ellipses = ""; \
	    IV len = RExC_end - RExC_precomp; \
	    \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    if (len > RegexLengthToShowInErrorMessages) { \
	        /* chop 10 shorter than the max, to ensure meaning of "..." */ \
	        len = RegexLengthToShowInErrorMessages - 10; \
	        ellipses = "..."; \
	    } \
	    code; \
	} STMT_END

	/* Croak with the message string msg followed by the (possibly chopped)
	 * pattern. */
	#define FAIL(msg) _FAIL( \
	    Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	        msg, (int)len, RExC_precomp, ellipses))

	/* Like FAIL, but msg must be a string literal containing one format
	 * directive, which consumes arg. */
	#define FAIL2(msg,arg) _FAIL( \
	    Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	        arg, (int)len, RExC_precomp, ellipses))
	436
	/*
	 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
	 * (RExC_parse) within the pattern text.
	 */
	#define Simple_vFAIL(m) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	        m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	 */
	#define vFAIL(m) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL(m); \
	} STMT_END

	/*
	 * Like Simple_vFAIL(), but accepts two arguments: m is passed to
	 * S_re_croak2() as the first format part and a1 is its argument.
	 */
	#define Simple_vFAIL2(m,a1) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	 */
	#define vFAIL2(m,a1) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL2(m, a1); \
	} STMT_END


	/*
	 * Like Simple_vFAIL(), but accepts three arguments.
	 */
	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	/*
	 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	 */
	#define vFAIL3(m,a1,a2) STMT_START { \
	    if (!SIZE_ONLY) \
	        SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	    Simple_vFAIL3(m, a1, a2); \
	} STMT_END

	/*
	 * Like Simple_vFAIL(), but accepts four arguments.
	 */
	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	    const IV offset = RExC_parse - RExC_precomp; \
	    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END
	500
	/* Warning helpers.  Each one reports message m (a string literal, joined
	 * with REPORT_LOCATION at compile time) and marks position loc, a pointer
	 * into the pattern text, in the output.  The ckWARN* forms call
	 * Perl_ck_warner()/Perl_ck_warner_d(); the vWARN* forms call Perl_warner()
	 * directly.  The digit in a name is one more than the number of extra
	 * format arguments (a1, a2, ...) consumed by m. */
	#define ckWARNreg(loc,m) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARNregdep(loc,m) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	        m REPORT_LOCATION, \
	        (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	        m REPORT_LOCATION, \
	        a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN2reg(loc, m, a1) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN3(loc, m, a1, a2) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END

	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	    const IV offset = (loc) - RExC_precomp; \
	    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	        a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	} STMT_END
	556
	557
	/* Store c through pointer s unless SIZE_ONLY is true.  s is evaluated
	 * exactly once on either path, which allows for side effects in s. */
	#define REGC(c,s) STMT_START { \
	    if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	} STMT_END
	562
	/* Macros for recording node offsets. 20001227 mjd@plover.com
	 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	 * element 2*n-1 of the array. Element #2n holds the byte length of node #n.
	 * Element 0 holds the number n.
	 * Position is 1 indexed.
	 *
	 * Without RE_TRACK_PATTERN_OFFSETS every recording macro expands to
	 * nothing and only the program length is kept (in ri->u.proglen).
	 */
	#ifndef RE_TRACK_PATTERN_OFFSETS
	#define Set_Node_Offset_To_R(node,byte)
	#define Set_Node_Offset(node,byte)
	#define Set_Cur_Node_Offset
	#define Set_Node_Length_To_R(node,len)
	#define Set_Node_Length(node,len)
	#define Set_Cur_Node_Length(len)
	#define Set_Node_Cur_Length(node)
	#define Node_Offset(n)
	#define Node_Length(n)
	#define Set_Node_Offset_Length(node,offset,len)
	#define ProgLen(ri) ri->u.proglen
	#define SetProgLen(ri,x) ri->u.proglen = x
	#else
	#define ProgLen(ri) ri->u.offsets[0]
	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	/* Record 'byte' as the offset of node number 'node'; croaks when the node
	 * number is negative.  Does nothing when SIZE_ONLY is true. */
	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	    if (! SIZE_ONLY) { \
	        MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	            __LINE__, (int)(node), (int)(byte))); \
	        if((node) < 0) { \
	            Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	        } else { \
	            RExC_offsets[2*(node)-1] = (byte); \
	        } \
	    } \
	} STMT_END

	/* Pointer-based wrappers: 'node' is a regnode pointer and 'byte' a pointer
	 * into the pattern; both are converted to indexes first. */
	#define Set_Node_Offset(node,byte) \
	    Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)

	/* Record 'len' as the length of node number 'node'; croaks when the node
	 * number is negative.  Does nothing when SIZE_ONLY is true. */
	#define Set_Node_Length_To_R(node,len) STMT_START { \
	    if (! SIZE_ONLY) { \
	        MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	            __LINE__, (int)(node), (int)(len))); \
	        if((node) < 0) { \
	            Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	        } else { \
	            RExC_offsets[2*(node)] = (len); \
	        } \
	    } \
	} STMT_END

	#define Set_Node_Length(node,len) \
	    Set_Node_Length_To_R((node)-RExC_emit_start, len)
	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	/* Uses a variable named parse_start that must exist in the caller. */
	#define Set_Node_Cur_Length(node) \
	    Set_Node_Length(node, RExC_parse - parse_start)

	/* Get offsets and lengths */
	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])

	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	    Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	    Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	} STMT_END
	#endif
	627
	/* Enable the in-place scan code only when the experimental regex
	 * optimisations are switched on. */
	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	#define EXPERIMENTAL_INPLACESCAN
	#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
	631
	/* Debug dump of a scan_data_t, indented by depth*2 columns and prefixed
	 * with the string literal 'str'.  Nothing is printed when 'data' is NULL.
	 * The expansion reads a variable named is_inf from the calling scope and
	 * already ends in ';'. */
	#define DEBUG_STUDYDATA(str,data,depth) \
	DEBUG_OPTIMISE_MORE_r(if(data){ \
	    PerlIO_printf(Perl_debug_log, \
	        "%*s" str "Pos:%"IVdf"/%"IVdf \
	        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	        (int)(depth)*2, "", \
	        (IV)((data)->pos_min), \
	        (IV)((data)->pos_delta), \
	        (UV)((data)->flags), \
	        (IV)((data)->whilem_c), \
	        (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	        is_inf ? "INF " : "" \
	    ); \
	    if ((data)->last_found) \
	        PerlIO_printf(Perl_debug_log, \
	            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	            " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	            SvPVX_const((data)->last_found), \
	            (IV)((data)->last_end), \
	            (IV)((data)->last_start_min), \
	            (IV)((data)->last_start_max), \
	            ((data)->longest && \
	             (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	            SvPVX_const((data)->longest_fixed), \
	            (IV)((data)->offset_fixed), \
	            ((data)->longest && \
	             (data)->longest==&((data)->longest_float)) ? "*" : "", \
	            SvPVX_const((data)->longest_float), \
	            (IV)((data)->offset_float_min), \
	            (IV)((data)->offset_float_max) \
	        ); \
	    PerlIO_printf(Perl_debug_log,"\n"); \
	});
	665
	666	static void clear_re(pTHX_ void *r);
	667
	668	/* Mark that we cannot extend a found fixed substring at this point.
	669	Update the longest found anchored substring and the longest found
	670	floating substrings if needed. */
	671
	672	STATIC void
	673	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data, I32 *minlenp, int is_inf)
	674	{
	675	const STRLEN l = CHR_SVLEN(data->last_found);
	676	const STRLEN old_l = CHR_SVLEN(*data->longest);
	677	GET_RE_DEBUG_FLAGS_DECL;
	678
	679	PERL_ARGS_ASSERT_SCAN_COMMIT;
	680
	681	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	682	SvSetMagicSV(*data->longest, data->last_found);
	683	if (*data->longest == data->longest_fixed) {
	684	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	685	if (data->flags & SF_BEFORE_EOL)
	686	data->flags
	687	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	688	else
	689	data->flags &= ~SF_FIX_BEFORE_EOL;
	690	data->minlen_fixed=minlenp;
	691	data->lookbehind_fixed=0;
	692	}
	693	else { /* data->longest == data->longest_float /
	694	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	695	data->offset_float_max = (l
	696	? data->last_start_max
	697	: data->pos_min + data->pos_delta);
	698	if (is_inf \|\| (U32)data->offset_float_max > (U32)I32_MAX)
	699	data->offset_float_max = I32_MAX;
	700	if (data->flags & SF_BEFORE_EOL)
	701	data->flags
	702	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	703	else
	704	data->flags &= ~SF_FL_BEFORE_EOL;
	705	data->minlen_float=minlenp;
	706	data->lookbehind_float=0;
	707	}
	708	}
	709	SvCUR_set(data->last_found, 0);
	710	{
	711	SV * const sv = data->last_found;
	712	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	713	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	714	if (mg)
	715	mg->mg_len = 0;
	716	}
	717	}
	718	data->last_end = -1;
	719	data->flags &= ~SF_BEFORE_EOL;
	720	DEBUG_STUDYDATA("commit: ",data,0);
	721	}
	722
	723	/* Can match anything (initialization) */
	724	STATIC void
	725	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	726	{
	727	PERL_ARGS_ASSERT_CL_ANYTHING;
	728
	729	ANYOF_BITMAP_SETALL(cl);
	730	cl->flags = ANYOF_CLASS\|ANYOF_EOS\|ANYOF_UNICODE_ALL
	731	\|ANYOF_LOC_NONBITMAP_FOLD\|ANYOF_NON_UTF8_LATIN1_ALL
	732	/* Even though no bitmap is in use here, we need to set
	733	* the flag below so an AND with a node that does have one
	734	* doesn't lose that one. The flag should get cleared if
	735	* the other one doesn't; and the code in regexec.c is
	736	* structured so this being set when not needed does no
	737	* harm. It seemed a little cleaner to set it here than do
	738	* a special case in cl_and() */
	739	\|ANYOF_NONBITMAP_NON_UTF8;
	740
	741	/* If any portion of the regex is to operate under locale rules,
	742	* initialization includes it. The reason this isn't done for all regexes
	743	* is that the optimizer was written under the assumption that locale was
	744	* all-or-nothing. Given the complexity and lack of documentation in the
	745	* optimizer, and that there are inadequate test cases for locale, so many
	746	* parts of it may not work properly, it is safest to avoid locale unless
	747	* necessary. */
	748	if (RExC_contains_locale) {
	749	ANYOF_CLASS_SETALL(cl); /* /l uses class */
	750	cl->flags \|= ANYOF_LOCALE;
	751	}
	752	else {
	753	ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	754	}
	755	}
	756
	757	/* Can match anything (initialization) */
	758	STATIC int
	759	S_cl_is_anything(const struct regnode_charclass_class *cl)
	760	{
	761	int value;
	762
	763	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	764
	765	for (value = 0; value <= ANYOF_MAX; value += 2)
	766	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	767	return 1;
	768	if (!(cl->flags & ANYOF_UNICODE_ALL))
	769	return 0;
	770	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	771	return 0;
	772	return 1;
	773	}
	774
	775	/* Can match anything (initialization) */
	776	STATIC void
	777	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	778	{
	779	PERL_ARGS_ASSERT_CL_INIT;
	780
	781	Zero(cl, 1, struct regnode_charclass_class);
	782	cl->type = ANYOF;
	783	cl_anything(pRExC_state, cl);
	784	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	785	}
	786
	/* These two functions currently do the exact same thing, so cl_init_zero
	 * is simply another name for S_cl_init. */
	#define cl_init_zero S_cl_init
	789
	790	/* 'AND' a given class with another one. Can create false positives. 'cl'
	791	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	792	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	793	STATIC void
	794	S_cl_and(struct regnode_charclass_class *cl,
	795	const struct regnode_charclass_class *and_with)
	796	{
	797	PERL_ARGS_ASSERT_CL_AND;
	798
	799	assert(and_with->type == ANYOF);
	800
	801	/* I (khw) am not sure all these restrictions are necessary XXX */
	802	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	803	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	804	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	805	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	806	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	807	int i;
	808
	809	if (and_with->flags & ANYOF_INVERT)
	810	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	811	cl->bitmap[i] &= ~and_with->bitmap[i];
	812	else
	813	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	814	cl->bitmap[i] &= and_with->bitmap[i];
	815	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	816
	817	if (and_with->flags & ANYOF_INVERT) {
	818
	819	/* Here, the and'ed node is inverted. Get the AND of the flags that
	820	* aren't affected by the inversion. Those that are affected are
	821	* handled individually below */
	822	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	823	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	824	cl->flags \|= affected_flags;
	825
	826	/* We currently don't know how to deal with things that aren't in the
	827	* bitmap, but we know that the intersection is no greater than what
	828	* is already in cl, so let there be false positives that get sorted
	829	* out after the synthetic start class succeeds, and the node is
	830	* matched for real. */
	831
	832	/* The inversion of these two flags indicate that the resulting
	833	* intersection doesn't have them */
	834	if (and_with->flags & ANYOF_UNICODE_ALL) {
	835	cl->flags &= ~ANYOF_UNICODE_ALL;
	836	}
	837	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	838	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	839	}
	840	}
	841	else { /* and'd node is not inverted */
	842	if (! ANYOF_NONBITMAP(and_with)) {
	843
	844	/* Here 'and_with' doesn't match anything outside the bitmap
	845	* (except possibly ANYOF_UNICODE_ALL), which means the
	846	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	847	* which case we don't know what the intersection is, but it's no
	848	* greater than what cl already has, so can just leave it alone,
	849	* with possible false positives */
	850	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	851	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	852	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	853	}
	854	}
	855	else if (! ANYOF_NONBITMAP(cl)) {
	856
	857	/* Here, 'and_with' does match something outside the bitmap, and cl
	858	* doesn't have a list of things to match outside the bitmap. If
	859	* cl can match all code points above 255, the intersection will
	860	* be those above-255 code points that 'and_with' matches. There
	861	* may be false positives from code points in 'and_with' that are
	862	* outside the bitmap but below 256, but those get sorted out
	863	* after the synthetic start class succeeds). If cl can't match
	864	* all Unicode code points, it means here that it can't match *
	865	* anything outside the bitmap, so we leave the bitmap empty */
	866	if (cl->flags & ANYOF_UNICODE_ALL) {
	867	ARG_SET(cl, ARG(and_with));
	868	}
	869	}
	870	else {
	871	/* Here, both 'and_with' and cl match something outside the
	872	* bitmap. Currently we do not do the intersection, so just match
	873	* whatever cl had at the beginning. */
	874	}
	875
	876
	877	/* Take the intersection of the two sets of flags */
	878	cl->flags &= and_with->flags;
	879	}
	880	}
	881
	882	/* 'OR' a given class with another one. Can create false positives. 'cl'
	883	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	884	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	885	STATIC void
	886	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	887	{
	888	PERL_ARGS_ASSERT_CL_OR;
	889
	890	if (or_with->flags & ANYOF_INVERT) {
	891
	892	/* Here, the or'd node is to be inverted. This means we take the
	893	* complement of everything not in the bitmap, but currently we don't
	894	* know what that is, so give up and match anything */
	895	if (ANYOF_NONBITMAP(or_with)) {
	896	cl_anything(pRExC_state, cl);
	897	}
	898	/* We do not use
	899	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	900	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	901	* which is wasteful if CL2 is small, but we ignore CL2:
	902	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	903	* XXXX Can we handle case-fold? Unclear:
	904	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	905	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	906	*/
	907	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	908	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	909	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	910	int i;
	911
	912	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	913	cl->bitmap[i] \|= ~or_with->bitmap[i];
	914	} /* XXXX: logic is complicated otherwise */
	915	else {
	916	cl_anything(pRExC_state, cl);
	917	}
	918
	919	/* And, we can just take the union of the flags that aren't affected
	920	* by the inversion */
	921	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	922
	923	/* For the remaining flags:
	924	ANYOF_UNICODE_ALL and inverted means to not match anything above
	925	255, which means that the union with cl should just be
	926	what cl has in it, so can ignore this flag
	927	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	928	is 127-255 to match them, but then invert that, so the
	929	union with cl should just be what cl has in it, so can
	930	ignore this flag
	931	*/
	932	} else { /* 'or_with' is not inverted */
	933	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	934	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	935	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	936	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	937	int i;
	938
	939	/* OR char bitmap and class bitmap separately */
	940	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	941	cl->bitmap[i] \|= or_with->bitmap[i];
	942	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	943	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	944	cl->classflags[i] \|= or_with->classflags[i];
	945	cl->flags \|= ANYOF_CLASS;
	946	}
	947	}
	948	else { /* XXXX: logic is complicated, leave it along for a moment. */
	949	cl_anything(pRExC_state, cl);
	950	}
	951
	952	if (ANYOF_NONBITMAP(or_with)) {
	953
	954	/* Use the added node's outside-the-bit-map match if there isn't a
	955	* conflict. If there is a conflict (both nodes match something
	956	* outside the bitmap, but what they match outside is not the same
	957	* pointer, and hence not easily compared until XXX we extend
	958	* inversion lists this far), give up and allow the start class to
	959	* match everything outside the bitmap. If that stuff is all above
	960	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	961	if (! ANYOF_NONBITMAP(cl)) {
	962	ARG_SET(cl, ARG(or_with));
	963	}
	964	else if (ARG(cl) != ARG(or_with)) {
	965
	966	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	967	cl_anything(pRExC_state, cl);
	968	}
	969	else {
	970	cl->flags \|= ANYOF_UNICODE_ALL;
	971	}
	972	}
	973
	974	/* Take the union */
	975	cl->flags \|= or_with->flags;
	976	}
	977	}
	978	}
	979
	/* Accessors for the per-state transition lists used while a trie is being
	 * built.  They all expect a variable named 'trie' to be in scope.
	 *
	 * Element 0 of each list is a header rather than a transition: its 'forid'
	 * field holds the index of the next free slot (CUR) and its 'newstate'
	 * field holds the allocated length (LEN).  Real entries start at index 1,
	 * so the number of entries in use is CUR - 1, or 0 when the state has no
	 * list at all. */
	#define TRIE_LIST_ITEM(state,idx) ( (trie->states[ (state) ].trans.list)[ (idx) ] )
	#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( (state), 0 ).forid )
	#define TRIE_LIST_LEN(state)  ( TRIE_LIST_ITEM( (state), 0 ).newstate )
	#define TRIE_LIST_USED(idx)   ( trie->states[ (idx) ].trans.list ? (TRIE_LIST_CUR( (idx) ) - 1) : 0 )
	984
	985
	986	#ifdef DEBUGGING
	987	/*
	988	dump_trie(trie,widecharmap,revcharmap)
	989	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	990	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	991
	992	These routines dump out a trie in a somewhat readable format.
	993	The _interim_ variants are used for debugging the interim
	994	tables that are used to generate the final compressed
	995	representation which is what dump_trie expects.
	996
	997	Part of the reason for their existence is to provide a form
	998	of documentation as to how the different representations function.
	999
	1000	*/
	1001
	1002	/*
	1003	Dumps the final compressed table form of the trie to Perl_debug_log.
	1004	Used for debugging make_trie().
	1005	*/
	1006
	1007	STATIC void
	1008	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1009	AV *revcharmap, U32 depth)
	1010	{
	1011	U32 state;
	1012	SV *sv=sv_newmortal();
	1013	int colwidth= widecharmap ? 6 : 4;
	1014	U16 word;
	1015	GET_RE_DEBUG_FLAGS_DECL;
	1016
	1017	PERL_ARGS_ASSERT_DUMP_TRIE;
	1018
	1019	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1020	(int)depth * 2 + 2,"",
	1021	"Match","Base","Ofs" );
	1022
	1023	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1024	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1025	if ( tmp ) {
	1026	PerlIO_printf( Perl_debug_log, "%*s",
	1027	colwidth,
	1028	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1029	PL_colors[0], PL_colors[1],
	1030	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1031	PERL_PV_ESCAPE_FIRSTCHAR
	1032	)
	1033	);
	1034	}
	1035	}
	1036	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1037	(int)depth * 2 + 2,"");
	1038
	1039	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1040	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1041	PerlIO_printf( Perl_debug_log, "\n");
	1042
	1043	for( state = 1 ; state < trie->statecount ; state++ ) {
	1044	const U32 base = trie->states[ state ].trans.base;
	1045
	1046	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1047
	1048	if ( trie->states[ state ].wordnum ) {
	1049	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1050	} else {
	1051	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1052	}
	1053
	1054	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1055
	1056	if ( base ) {
	1057	U32 ofs = 0;
	1058
	1059	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1060	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1061	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1062	ofs++;
	1063
	1064	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1065
	1066	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1067	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1068	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1069	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1070	{
	1071	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1072	colwidth,
	1073	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1074	} else {
	1075	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1076	}
	1077	}
	1078
	1079	PerlIO_printf( Perl_debug_log, "]");
	1080
	1081	}
	1082	PerlIO_printf( Perl_debug_log, "\n" );
	1083	}
	1084	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1085	for (word=1; word <= trie->wordcount; word++) {
	1086	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1087	(int)word, (int)(trie->wordinfo[word].prev),
	1088	(int)(trie->wordinfo[word].len));
	1089	}
	1090	PerlIO_printf(Perl_debug_log, "\n" );
	1091	}
	1092	/*
	1093	Dumps a fully constructed but uncompressed trie in list form.
	1094	List tries normally only are used for construction when the number of
	1095	possible chars (trie->uniquecharcount) is very high.
	1096	Used for debugging make_trie().
	1097	*/
	1098	STATIC void
	1099	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1100	HV widecharmap, AV revcharmap, U32 next_alloc,
	1101	U32 depth)
	1102	{
	1103	U32 state;
	1104	SV *sv=sv_newmortal();
	1105	int colwidth= widecharmap ? 6 : 4;
	1106	GET_RE_DEBUG_FLAGS_DECL;
	1107
	1108	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1109
	1110	/* print out the table precompression. */
	1111	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1112	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1113	"------:-----+-----------------\n" );
	1114
	1115	for( state=1 ; state < next_alloc ; state ++ ) {
	1116	U16 charid;
	1117
	1118	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1119	(int)depth * 2 + 2,"", (UV)state );
	1120	if ( ! trie->states[ state ].wordnum ) {
	1121	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1122	} else {
	1123	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1124	trie->states[ state ].wordnum
	1125	);
	1126	}
	1127	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1128	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1129	if ( tmp ) {
	1130	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1131	colwidth,
	1132	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1133	PL_colors[0], PL_colors[1],
	1134	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1135	PERL_PV_ESCAPE_FIRSTCHAR
	1136	) ,
	1137	TRIE_LIST_ITEM(state,charid).forid,
	1138	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1139	);
	1140	if (!(charid % 10))
	1141	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1142	(int)((depth * 2) + 14), "");
	1143	}
	1144	}
	1145	PerlIO_printf( Perl_debug_log, "\n");
	1146	}
	1147	}
	1148
	1149	/*
	1150	Dumps a fully constructed but uncompressed trie in table form.
	1151	This is the normal DFA style state transition table, with a few
	1152	twists to facilitate compression later.
	1153	Used for debugging make_trie().
	1154	*/
	1155	STATIC void
	1156	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1157	HV widecharmap, AV revcharmap, U32 next_alloc,
	1158	U32 depth)
	1159	{
	1160	U32 state;
	1161	U16 charid;
	1162	SV *sv=sv_newmortal();
	1163	int colwidth= widecharmap ? 6 : 4;
	1164	GET_RE_DEBUG_FLAGS_DECL;
	1165
	1166	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1167
	1168	/*
	1169	print out the table precompression so that we can do a visual check
	1170	that they are identical.
	1171	*/
	1172
	1173	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1174
	1175	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1176	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1177	if ( tmp ) {
	1178	PerlIO_printf( Perl_debug_log, "%*s",
	1179	colwidth,
	1180	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1181	PL_colors[0], PL_colors[1],
	1182	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1183	PERL_PV_ESCAPE_FIRSTCHAR
	1184	)
	1185	);
	1186	}
	1187	}
	1188
	1189	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1190
	1191	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1192	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1193	}
	1194
	1195	PerlIO_printf( Perl_debug_log, "\n" );
	1196
	1197	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1198
	1199	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1200	(int)depth * 2 + 2,"",
	1201	(UV)TRIE_NODENUM( state ) );
	1202
	1203	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1204	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1205	if (v)
	1206	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1207	else
	1208	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1209	}
	1210	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1211	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1212	} else {
	1213	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1214	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1215	}
	1216	}
	1217	}
	1218
	1219	#endif
	1220
	1221
	1222	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1223	startbranch: the first branch in the whole branch sequence
	1224	first : start branch of sequence of branch-exact nodes.
	1225	May be the same as startbranch
	1226	last : Thing following the last branch.
	1227	May be the same as tail.
	1228	tail : item following the branch sequence
	1229	word_count : number of words in the sequence
	1230	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1231	depth : indent depth
	1232
	1233	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1234
	1235	A trie is an N'ary tree where the branches are determined by digital
	1236	decomposition of the key. IE, at the root node you look up the 1st character and
	1237	follow that branch repeat until you find the end of the branches. Nodes can be
	1238	marked as "accepting" meaning they represent a complete word. Eg:
	1239
	1240	/he\|she\|his\|hers/
	1241
	1242	would convert into the following structure. Numbers represent states, letters
	1243	following numbers represent valid transitions on the letter from that state, if
	1244	the number is in square brackets it represents an accepting state, otherwise it
	1245	will be in parenthesis.
	1246
	1247	      +-h->+-e->[3]-+-r->(8)-+-s->[9]
	1248	      \|    \|
	1249	      \|   (2)
	1250	      \|    \|
	1251	     (1)   +-i->(6)-+-s->[7]
	1252	      \|
	1253	      +-s->(3)-+-h->(4)-+-e->[5]
	1254
	1255	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1256
	1257	This shows that when matching against the string 'hers' we will begin at state 1
	1258	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1259	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1260	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1261	single traverse. We store a mapping from accepting to state to which word was
	1262	matched, and then when we have multiple possibilities we try to complete the
	1263	rest of the regex in the order in which they occurred in the alternation.
	1264
	1265	The only prior NFA like behaviour that would be changed by the TRIE support is
	1266	the silent ignoring of duplicate alternations which are of the form:
	1267
	1268	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1269
	1270	Thus EVAL blocks following a trie may be called a different number of times with
	1271	and without the optimisation. With the optimisations dupes will be silently
	1272	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1273	the following demonstrates:
	1274
	1275	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1276
	1277	which prints out 'word' three times, but
	1278
	1279	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1280
	1281	which doesn't print it out at all. This is due to other optimisations kicking in.
	1282
	1283	Example of what happens on a structural level:
	1284
	1285	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1286
	1287	1: CURLYM[1] {1,32767}(18)
	1288	5: BRANCH(8)
	1289	6: EXACT <ac>(16)
	1290	8: BRANCH(11)
	1291	9: EXACT <ad>(16)
	1292	11: BRANCH(14)
	1293	12: EXACT <ab>(16)
	1294	16: SUCCEED(0)
	1295	17: NOTHING(18)
	1296	18: END(0)
	1297
	1298	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1299	and should turn into:
	1300
	1301	1: CURLYM[1] {1,32767}(18)
	1302	5: TRIE(16)
	1303	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1304	<ac>
	1305	<ad>
	1306	<ab>
	1307	16: SUCCEED(0)
	1308	17: NOTHING(18)
	1309	18: END(0)
	1310
	1311	Cases where tail != last would be like /(?:foo\|bar)baz/:
	1312
	1313	1: BRANCH(4)
	1314	2: EXACT <foo>(8)
	1315	4: BRANCH(7)
	1316	5: EXACT <bar>(8)
	1317	7: TAIL(8)
	1318	8: EXACT <baz>(10)
	1319	10: END(0)
	1320
	1321	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1322	and would end up looking like:
	1323
	1324	1: TRIE(8)
	1325	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1326	<foo>
	1327	<bar>
	1328	7: TAIL(8)
	1329	8: EXACT <baz>(10)
	1330	10: END(0)
	1331
	1332	d = uvuni_to_utf8_flags(d, uv, 0);
	1333
	1334	is the recommended Unicode-aware way of saying
	1335
	1336	*(d++) = uv;
	1337	*/
	1338
	/* Append to 'revcharmap' a new SV holding the character whose code point
	 * is in 'uvc'.  Expects 'uvc' and 'revcharmap' to be in scope at the point
	 * of use.
	 *
	 * Under UTF the character is stored UTF-8 encoded and the SV is flagged as
	 * UTF-8; the buffer is sized for the longest possible encoding and the
	 * whole code point is encoded, so that characters above 255 are recorded
	 * correctly.  Otherwise a single byte is stored. */
	#define TRIE_STORE_REVCHAR                                                 \
	    STMT_START {                                                           \
	        if (UTF) {                                                         \
	            SV *zlopp = newSV(UTF8_MAXLEN + 1);                            \
	            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
	            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc); \
	            /* the length is however many bytes the encoder wrote */      \
	            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
	            SvPOK_on(zlopp);                                               \
	            SvUTF8_on(zlopp);                                              \
	            av_push(revcharmap, zlopp);                                    \
	        } else {                                                           \
	            char ooooff = (char)uvc;                                       \
	            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
	        }                                                                  \
	    } STMT_END
	1354
	/* Fetch the next character of the word being scanned into 'uvc', and count
	 * it in 'wordlen'.  Works entirely on variables of the code that uses it:
	 * uc, uvc, len, wordlen, folder, foldlen, foldbuf, scan and uniflags.
	 *
	 *  - not UTF:           the byte at 'uc' is the character; len becomes 1.
	 *  - UTF, no folding:   decode one character from 'uc'; len is set by the
	 *                       decoder.
	 *  - UTF with folding:  if characters from an earlier fold are still
	 *                       pending (foldlen > 0), take the next one from
	 *                       'scan' and set len to 0; otherwise decode a
	 *                       character from 'uc', fold it into 'foldbuf', and
	 *                       leave 'scan'/'foldlen' describing what remains of
	 *                       the fold after its first character. */
	#define TRIE_READ_CHAR STMT_START {                                           \
	    wordlen++;                                                                \
	    if ( !UTF ) {                                                             \
	        uvc = (U32)*uc;                                                       \
	        len = 1;                                                              \
	    }                                                                         \
	    else if ( !folder ) {                                                     \
	        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
	    }                                                                         \
	    else if ( foldlen > 0 ) {                                                 \
	        uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );            \
	        foldlen -= len;                                                       \
	        scan += len;                                                          \
	        len = 0;                                                              \
	    }                                                                         \
	    else {                                                                    \
	        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
	        uvc = to_uni_fold( uvc, foldbuf, &foldlen );                          \
	        foldlen -= UNISKIP( uvc );                                            \
	        scan = foldbuf + UNISKIP( uvc );                                      \
	    }                                                                         \
	} STMT_END
	1378
	1379
	1380
	/* Append the transition (fid -> ns) to the transition list of 'state',
	 * doubling the list's allocation first if it is already full.  Expects
	 * 'trie' to be in scope, and the list to exist already. */
	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                          \
	    if ( TRIE_LIST_LEN( state ) <= TRIE_LIST_CUR( state ) ) {              \
	        const U32 doubled = TRIE_LIST_LEN( state ) *= 2;                   \
	        Renew( trie->states[ state ].trans.list, doubled,                  \
	               reg_trie_trans_le );                                        \
	    }                                                                      \
	    {                                                                      \
	        /* looked up only after any reallocation above */                 \
	        reg_trie_trans_le * const slot =                                   \
	            &TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) );              \
	        slot->forid = fid;                                                 \
	        slot->newstate = ns;                                               \
	    }                                                                      \
	    TRIE_LIST_CUR( state )++;                                              \
	} STMT_END
	1390
	/* Give 'state' a fresh, zero-filled transition list with room for four
	 * elements.  Element 0 is the header, so the allocated length is recorded
	 * as 4 and the next free slot as 1.  Expects 'trie' to be in scope. */
	#define TRIE_LIST_NEW(state) STMT_START {                                  \
	    Newxz( trie->states[ state ].trans.list, 4, reg_trie_trans_le );       \
	    TRIE_LIST_LEN( state ) = 4;                                            \
	    TRIE_LIST_CUR( state ) = 1;                                            \
	} STMT_END
	1397
	/* Record that the word just scanned ends in 'state'.  Works on variables
	 * of the code that uses it: trie, noper, cur, tail, convert, curword,
	 * wordlen, word_count, jumper, nextbranch and (when debugging) trie_words.
	 *
	 * It bumps 'curword', fills in that word's wordinfo entry, notes a jump
	 * target when the node after the word's text comes before 'tail', and
	 * either marks 'state' as accepting this word or, if the state already
	 * accepts an earlier word, links this one in as a duplicate of it. */
	#define TRIE_HANDLE_WORD(state) STMT_START {                                \
	    regnode * const noper_next = regnext( noper );                          \
	    const U16 dupe = trie->states[ state ].wordnum;                         \
	                                                                            \
	    DEBUG_r({                                                               \
	        /* keep the word's text so that dumps can show it */              \
	        SV * const tmp = (OP(noper) != NOTHING)                             \
	            ? newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF)             \
	            : newSVpvn_utf8( "", 0, UTF );                                  \
	        av_push( trie_words, tmp );                                         \
	    });                                                                     \
	                                                                            \
	    curword++;                                                              \
	    trie->wordinfo[curword].accept = state;                                 \
	    trie->wordinfo[curword].len    = wordlen;                               \
	    trie->wordinfo[curword].prev   = 0;                                     \
	                                                                            \
	    if ( noper_next < tail ) {                                              \
	        /* the jump table is only allocated once it is needed */          \
	        if (!trie->jump)                                                    \
	            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	        trie->jump[curword] = (U16)(noper_next - convert);                  \
	        if (!jumper)                                                        \
	            jumper = noper_next;                                            \
	        if (!nextbranch)                                                    \
	            nextbranch= regnext(cur);                                       \
	    }                                                                       \
	                                                                            \
	    if ( !dupe ) {                                                          \
	        /* first word to end in this state */                             \
	        trie->states[ state ].wordnum = curword;                            \
	    } else {                                                                \
	        /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
	        /* chain, so that when the bits of chain are later    */\
	        /* linked together, the dups appear in the chain      */\
	        trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev;           \
	        trie->wordinfo[dupe].prev = curword;                                \
	    }                                                                       \
	} STMT_END
	1438
	1439
	/* Look up, in the compressed transition table, the state reached from
	 * 'state' on character id 'charid', where 'base' is that state's base and
	 * 'ucharcount' the number of unique characters.  Expects 'trie' and
	 * 'ubound' to be in scope.
	 *
	 * The slot at base - ucharcount + charid is used only if it lies inside
	 * the table, its 'check' field names 'state', and its 'next' is non-zero.
	 * Otherwise the result is 'special' for state 1 and 0 for any other state.
	 *
	 * Every argument is parenthesized so that expression arguments expand
	 * correctly; arguments may still be evaluated more than once. */
	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)              \
	    ( ( (base) + (charid) >= (ucharcount)                                   \
	        && (base) + (charid) < ubound                                       \
	        && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
	        && trie->trans[ (base) - (ucharcount) + (charid) ].next )           \
	          ? trie->trans[ (base) - (ucharcount) + (charid) ].next            \
	          : ( (state) == 1 ? (special) : 0 )                                \
	    )
	1448
	1449	#define MADE_TRIE 1 /* NOTE(review): these three look like result codes for S_make_trie (which returns I32) -- confirm at its return statements */
	1450	#define MADE_JUMP_TRIE 2 /* the values 1, 2, 4 are distinct single bits */
	1451	#define MADE_EXACT_TRIE 4
	1452
	1453	STATIC I32
	1454	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1455	{
	1456	dVAR;
	1457	/* first pass, loop through and scan words */
	1458	reg_trie_data *trie;
	1459	HV *widecharmap = NULL;
	1460	AV *revcharmap = newAV();
	1461	regnode *cur;
	1462	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1463	STRLEN len = 0;
	1464	UV uvc = 0;
	1465	U16 curword = 0;
	1466	U32 next_alloc = 0;
	1467	regnode *jumper = NULL;
	1468	regnode *nextbranch = NULL;
	1469	regnode *convert = NULL;
	1470	U32 prev_states; / temp array mapping each state to previous one */
	1471	/* we just use folder as a flag in utf8 */
	1472	const U8 * folder = NULL;
	1473
	1474	#ifdef DEBUGGING
	1475	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1476	AV *trie_words = NULL;
	1477	/* along with revcharmap, this only used during construction but both are
	1478	* useful during debugging so we store them in the struct when debugging.
	1479	*/
	1480	#else
	1481	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1482	STRLEN trie_charcount=0;
	1483	#endif
	1484	SV *re_trie_maxbuff;
	1485	GET_RE_DEBUG_FLAGS_DECL;
	1486
	1487	PERL_ARGS_ASSERT_MAKE_TRIE;
	1488	#ifndef DEBUGGING
	1489	PERL_UNUSED_ARG(depth);
	1490	#endif
	1491
	1492	switch (flags) {
	1493	case EXACTFA:
	1494	case EXACTFU: folder = PL_fold_latin1; break;
	1495	case EXACTF: folder = PL_fold; break;
	1496	case EXACTFL: folder = PL_fold_locale; break;
	1497	}
	1498
	1499	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1500	trie->refcount = 1;
	1501	trie->startstate = 1;
	1502	trie->wordcount = word_count;
	1503	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1504	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1505	if (!(UTF && folder))
	1506	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1507	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1508	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1509
	1510	DEBUG_r({
	1511	trie_words = newAV();
	1512	});
	1513
	1514	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1515	if (!SvIOK(re_trie_maxbuff)) {
	1516	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1517	}
	1518	DEBUG_OPTIMISE_r({
	1519	PerlIO_printf( Perl_debug_log,
	1520	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1521	(int)depth * 2 + 2, "",
	1522	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1523	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1524	(int)depth);
	1525	});
	1526
	1527	/* Find the node we are going to overwrite */
	1528	if ( first == startbranch && OP( last ) != BRANCH ) {
	1529	/* whole branch chain */
	1530	convert = first;
	1531	} else {
	1532	/* branch sub-chain */
	1533	convert = NEXTOPER( first );
	1534	}
	1535
	1536	/* -- First loop and Setup --
	1537
	1538	We first traverse the branches and scan each word to determine if it
	1539	contains widechars, and how many unique chars there are, this is
	1540	important as we have to build a table with at least as many columns as we
	1541	have unique chars.
	1542
	1543	We use an array of integers to represent the character codes 0..255
	1544	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1545	native representation of the character value as the key and IV's for the
	1546	coded index.
	1547
	1548	TODO If we keep track of how many times each character is used we can
	1549	remap the columns so that the table compression later on is more
	1550	efficient in terms of memory by ensuring the most common value is in the
	1551	middle and the least common are on the outside. IMO this would be better
	1552	than a most to least common mapping as theres a decent chance the most
	1553	common letter will share a node with the least common, meaning the node
	1554	will not be compressible. With a middle is most common approach the worst
	1555	case is when we have the least common nodes twice.
	1556
	1557	*/
	1558
	1559	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1560	regnode * const noper = NEXTOPER( cur );
	1561	const U8 uc = (U8)STRING( noper );
	1562	const U8 * const e = uc + STR_LEN( noper );
	1563	STRLEN foldlen = 0;
	1564	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1565	const U8 scan = (U8)NULL;
	1566	U32 wordlen = 0; /* required init */
	1567	STRLEN chars = 0;
	1568	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1569
	1570	if (OP(noper) == NOTHING) {
	1571	trie->minlen= 0;
	1572	continue;
	1573	}
	1574	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1575	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1576	regardless of encoding */
	1577
	1578	for ( ; uc < e ; uc += len ) {
	1579	TRIE_CHARCOUNT(trie)++;
	1580	TRIE_READ_CHAR;
	1581	chars++;
	1582	if ( uvc < 256 ) {
	1583	if ( !trie->charmap[ uvc ] ) {
	1584	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1585	if ( folder )
	1586	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1587	TRIE_STORE_REVCHAR;
	1588	}
	1589	if ( set_bit ) {
	1590	/* store the codepoint in the bitmap, and its folded
	1591	* equivalent. */
	1592	TRIE_BITMAP_SET(trie,uvc);
	1593
	1594	/* store the folded codepoint */
	1595	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1596
	1597	if ( !UTF ) {
	1598	/* store first byte of utf8 representation of
	1599	variant codepoints */
	1600	if (! UNI_IS_INVARIANT(uvc)) {
	1601	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1602	}
	1603	}
	1604	set_bit = 0; /* We've done our bit :-) */
	1605	}
	1606	} else {
	1607	SV** svpp;
	1608	if ( !widecharmap )
	1609	widecharmap = newHV();
	1610
	1611	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1612
	1613	if ( !svpp )
	1614	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1615
	1616	if ( !SvTRUE( *svpp ) ) {
	1617	sv_setiv( *svpp, ++trie->uniquecharcount );
	1618	TRIE_STORE_REVCHAR;
	1619	}
	1620	}
	1621	}
	1622	if( cur == first ) {
	1623	trie->minlen=chars;
	1624	trie->maxlen=chars;
	1625	} else if (chars < trie->minlen) {
	1626	trie->minlen=chars;
	1627	} else if (chars > trie->maxlen) {
	1628	trie->maxlen=chars;
	1629	}
	1630
	1631	} /* end first pass */
	1632	DEBUG_TRIE_COMPILE_r(
	1633	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1634	(int)depth * 2 + 2,"",
	1635	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1636	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1637	(int)trie->minlen, (int)trie->maxlen )
	1638	);
	1639
	1640	/*
	1641	We now know what we are dealing with in terms of unique chars and
	1642	string sizes so we can calculate how much memory a naive
	1643	representation using a flat table will take. If it's over a reasonable
	1644	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1645	conservative but potentially much slower representation using an array
	1646	of lists.
	1647
	1648	At the end we convert both representations into the same compressed
	1649	form that will be used in regexec.c for matching with. The latter
	1650	is a form that cannot be used to construct with but has memory
	1651	properties similar to the list form and access properties similar
	1652	to the table form making it both suitable for fast searches and
	1653	small enough that its feasable to store for the duration of a program.
	1654
	1655	See the comment in the code where the compressed table is produced
	1656	inplace from the flat tabe representation for an explanation of how
	1657	the compression works.
	1658
	1659	*/
	1660
	1661
	1662	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1663	prev_states[1] = 0;
	1664
	1665	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1666	/*
	1667	Second Pass -- Array Of Lists Representation
	1668
	1669	Each state will be represented by a list of charid:state records
	1670	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1671	points of the allocated array. (See defines above).
	1672
	1673	We build the initial structure using the lists, and then convert
	1674	it into the compressed table form which allows faster lookups
	1675	(but cant be modified once converted).
	1676	*/
	1677
	1678	STRLEN transcount = 1;
	1679
	1680	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1681	"%*sCompiling trie using list compiler\n",
	1682	(int)depth * 2 + 2, ""));
	1683
	1684	trie->states = (reg_trie_state *)
	1685	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1686	sizeof(reg_trie_state) );
	1687	TRIE_LIST_NEW(1);
	1688	next_alloc = 2;
	1689
	1690	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1691
	1692	regnode * const noper = NEXTOPER( cur );
	1693	U8 uc = (U8)STRING( noper );
	1694	const U8 * const e = uc + STR_LEN( noper );
	1695	U32 state = 1; /* required init */
	1696	U16 charid = 0; /* sanity init */
	1697	U8 scan = (U8)NULL; /* sanity init */
	1698	STRLEN foldlen = 0; /* required init */
	1699	U32 wordlen = 0; /* required init */
	1700	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1701
	1702	if (OP(noper) != NOTHING) {
	1703	for ( ; uc < e ; uc += len ) {
	1704
	1705	TRIE_READ_CHAR;
	1706
	1707	if ( uvc < 256 ) {
	1708	charid = trie->charmap[ uvc ];
	1709	} else {
	1710	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1711	if ( !svpp ) {
	1712	charid = 0;
	1713	} else {
	1714	charid=(U16)SvIV( *svpp );
	1715	}
	1716	}
	1717	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1718	if ( charid ) {
	1719
	1720	U16 check;
	1721	U32 newstate = 0;
	1722
	1723	charid--;
	1724	if ( !trie->states[ state ].trans.list ) {
	1725	TRIE_LIST_NEW( state );
	1726	}
	1727	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1728	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1729	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1730	break;
	1731	}
	1732	}
	1733	if ( ! newstate ) {
	1734	newstate = next_alloc++;
	1735	prev_states[newstate] = state;
	1736	TRIE_LIST_PUSH( state, charid, newstate );
	1737	transcount++;
	1738	}
	1739	state = newstate;
	1740	} else {
	1741	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1742	}
	1743	}
	1744	}
	1745	TRIE_HANDLE_WORD(state);
	1746
	1747	} /* end second pass */
	1748
	1749	/* next alloc is the NEXT state to be allocated */
	1750	trie->statecount = next_alloc;
	1751	trie->states = (reg_trie_state *)
	1752	PerlMemShared_realloc( trie->states,
	1753	next_alloc
	1754	* sizeof(reg_trie_state) );
	1755
	1756	/* and now dump it out before we compress it */
	1757	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1758	revcharmap, next_alloc,
	1759	depth+1)
	1760	);
	1761
	1762	trie->trans = (reg_trie_trans *)
	1763	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1764	{
	1765	U32 state;
	1766	U32 tp = 0;
	1767	U32 zp = 0;
	1768
	1769
	1770	for( state=1 ; state < next_alloc ; state ++ ) {
	1771	U32 base=0;
	1772
	1773	/*
	1774	DEBUG_TRIE_COMPILE_MORE_r(
	1775	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1776	);
	1777	*/
	1778
	1779	if (trie->states[state].trans.list) {
	1780	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1781	U16 maxid=minid;
	1782	U16 idx;
	1783
	1784	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1785	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1786	if ( forid < minid ) {
	1787	minid=forid;
	1788	} else if ( forid > maxid ) {
	1789	maxid=forid;
	1790	}
	1791	}
	1792	if ( transcount < tp + maxid - minid + 1) {
	1793	transcount *= 2;
	1794	trie->trans = (reg_trie_trans *)
	1795	PerlMemShared_realloc( trie->trans,
	1796	transcount
	1797	* sizeof(reg_trie_trans) );
	1798	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1799	}
	1800	base = trie->uniquecharcount + tp - minid;
	1801	if ( maxid == minid ) {
	1802	U32 set = 0;
	1803	for ( ; zp < tp ; zp++ ) {
	1804	if ( ! trie->trans[ zp ].next ) {
	1805	base = trie->uniquecharcount + zp - minid;
	1806	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1807	trie->trans[ zp ].check = state;
	1808	set = 1;
	1809	break;
	1810	}
	1811	}
	1812	if ( !set ) {
	1813	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1814	trie->trans[ tp ].check = state;
	1815	tp++;
	1816	zp = tp;
	1817	}
	1818	} else {
	1819	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1820	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1821	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1822	trie->trans[ tid ].check = state;
	1823	}
	1824	tp += ( maxid - minid + 1 );
	1825	}
	1826	Safefree(trie->states[ state ].trans.list);
	1827	}
	1828	/*
	1829	DEBUG_TRIE_COMPILE_MORE_r(
	1830	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1831	);
	1832	*/
	1833	trie->states[ state ].trans.base=base;
	1834	}
	1835	trie->lasttrans = tp + 1;
	1836	}
	1837	} else {
	1838	/*
	1839	Second Pass -- Flat Table Representation.
	1840
	1841	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1842	We know that we will need Charcount+1 trans at most to store the data
	1843	(one row per char at worst case) So we preallocate both structures
	1844	assuming worst case.
	1845
	1846	We then construct the trie using only the .next slots of the entry
	1847	structs.
	1848
	1849	We use the .check field of the first entry of the node temporarily to
	1850	make compression both faster and easier by keeping track of how many non
	1851	zero fields are in the node.
	1852
	1853	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1854	transition.
	1855
	1856	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1857	number representing the first entry of the node, and state as a
	1858	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1859	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1860	are 2 entrys per node. eg:
	1861
	1862	A B A B
	1863	1. 2 4 1. 3 7
	1864	2. 0 3 3. 0 5
	1865	3. 0 0 5. 0 0
	1866	4. 0 0 7. 0 0
	1867
	1868	The table is internally in the right hand, idx form. However as we also
	1869	have to deal with the states array which is indexed by nodenum we have to
	1870	use TRIE_NODENUM() to convert.
	1871
	1872	*/
	1873	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1874	"%*sCompiling trie using table compiler\n",
	1875	(int)depth * 2 + 2, ""));
	1876
	1877	trie->trans = (reg_trie_trans *)
	1878	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1879	* trie->uniquecharcount + 1,
	1880	sizeof(reg_trie_trans) );
	1881	trie->states = (reg_trie_state *)
	1882	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1883	sizeof(reg_trie_state) );
	1884	next_alloc = trie->uniquecharcount + 1;
	1885
	1886
	1887	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1888
	1889	regnode * const noper = NEXTOPER( cur );
	1890	const U8 uc = (U8)STRING( noper );
	1891	const U8 * const e = uc + STR_LEN( noper );
	1892
	1893	U32 state = 1; /* required init */
	1894
	1895	U16 charid = 0; /* sanity init */
	1896	U32 accept_state = 0; /* sanity init */
	1897	U8 scan = (U8)NULL; /* sanity init */
	1898
	1899	STRLEN foldlen = 0; /* required init */
	1900	U32 wordlen = 0; /* required init */
	1901	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1902
	1903	if ( OP(noper) != NOTHING ) {
	1904	for ( ; uc < e ; uc += len ) {
	1905
	1906	TRIE_READ_CHAR;
	1907
	1908	if ( uvc < 256 ) {
	1909	charid = trie->charmap[ uvc ];
	1910	} else {
	1911	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1912	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1913	}
	1914	if ( charid ) {
	1915	charid--;
	1916	if ( !trie->trans[ state + charid ].next ) {
	1917	trie->trans[ state + charid ].next = next_alloc;
	1918	trie->trans[ state ].check++;
	1919	prev_states[TRIE_NODENUM(next_alloc)]
	1920	= TRIE_NODENUM(state);
	1921	next_alloc += trie->uniquecharcount;
	1922	}
	1923	state = trie->trans[ state + charid ].next;
	1924	} else {
	1925	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1926	}
	1927	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1928	}
	1929	}
	1930	accept_state = TRIE_NODENUM( state );
	1931	TRIE_HANDLE_WORD(accept_state);
	1932
	1933	} /* end second pass */
	1934
	1935	/* and now dump it out before we compress it */
	1936	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1937	revcharmap,
	1938	next_alloc, depth+1));
	1939
	1940	{
	1941	/*
	1942	* Inplace compress the table.*
	1943
	1944	For sparse data sets the table constructed by the trie algorithm will
	1945	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1946	(Note that leaf nodes will not contain any transitions.)
	1947
	1948	This algorithm compresses the tables by eliminating most such
	1949	transitions, at the cost of a modest bit of extra work during lookup:
	1950
	1951	- Each states[] entry contains a .base field which indicates the
	1952	index in the state[] array wheres its transition data is stored.
	1953
	1954	- If .base is 0 there are no valid transitions from that node.
	1955
	1956	- If .base is nonzero then charid is added to it to find an entry in
	1957	the trans array.
	1958
	1959	-If trans[states[state].base+charid].check!=state then the
	1960	transition is taken to be a 0/Fail transition. Thus if there are fail
	1961	transitions at the front of the node then the .base offset will point
	1962	somewhere inside the previous nodes data (or maybe even into a node
	1963	even earlier), but the .check field determines if the transition is
	1964	valid.
	1965
	1966	XXX - wrong maybe?
	1967	The following process inplace converts the table to the compressed
	1968	table: We first do not compress the root node 1,and mark all its
	1969	.check pointers as 1 and set its .base pointer as 1 as well. This
	1970	allows us to do a DFA construction from the compressed table later,
	1971	and ensures that any .base pointers we calculate later are greater
	1972	than 0.
	1973
	1974	- We set 'pos' to indicate the first entry of the second node.
	1975
	1976	- We then iterate over the columns of the node, finding the first and
	1977	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	1978	and set the .check pointers accordingly, and advance pos
	1979	appropriately and repreat for the next node. Note that when we copy
	1980	the next pointers we have to convert them from the original
	1981	NODEIDX form to NODENUM form as the former is not valid post
	1982	compression.
	1983
	1984	- If a node has no transitions used we mark its base as 0 and do not
	1985	advance the pos pointer.
	1986
	1987	- If a node only has one transition we use a second pointer into the
	1988	structure to fill in allocated fail transitions from other states.
	1989	This pointer is independent of the main pointer and scans forward
	1990	looking for null transitions that are allocated to a state. When it
	1991	finds one it writes the single transition into the "hole". If the
	1992	pointer doesnt find one the single transition is appended as normal.
	1993
	1994	- Once compressed we can Renew/realloc the structures to release the
	1995	excess space.
	1996
	1997	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	1998	specifically Fig 3.47 and the associated pseudocode.
	1999
	2000	demq
	2001	*/
	2002	const U32 laststate = TRIE_NODENUM( next_alloc );
	2003	U32 state, charid;
	2004	U32 pos = 0, zp=0;
	2005	trie->statecount = laststate;
	2006
	2007	for ( state = 1 ; state < laststate ; state++ ) {
	2008	U8 flag = 0;
	2009	const U32 stateidx = TRIE_NODEIDX( state );
	2010	const U32 o_used = trie->trans[ stateidx ].check;
	2011	U32 used = trie->trans[ stateidx ].check;
	2012	trie->trans[ stateidx ].check = 0;
	2013
	2014	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2015	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2016	if ( trie->trans[ stateidx + charid ].next ) {
	2017	if (o_used == 1) {
	2018	for ( ; zp < pos ; zp++ ) {
	2019	if ( ! trie->trans[ zp ].next ) {
	2020	break;
	2021	}
	2022	}
	2023	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2024	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2025	trie->trans[ zp ].check = state;
	2026	if ( ++zp > pos ) pos = zp;
	2027	break;
	2028	}
	2029	used--;
	2030	}
	2031	if ( !flag ) {
	2032	flag = 1;
	2033	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2034	}
	2035	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2036	trie->trans[ pos ].check = state;
	2037	pos++;
	2038	}
	2039	}
	2040	}
	2041	trie->lasttrans = pos + 1;
	2042	trie->states = (reg_trie_state *)
	2043	PerlMemShared_realloc( trie->states, laststate
	2044	* sizeof(reg_trie_state) );
	2045	DEBUG_TRIE_COMPILE_MORE_r(
	2046	PerlIO_printf( Perl_debug_log,
	2047	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2048	(int)depth * 2 + 2,"",
	2049	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2050	(IV)next_alloc,
	2051	(IV)pos,
	2052	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2053	);
	2054
	2055	} /* end table compress */
	2056	}
	2057	DEBUG_TRIE_COMPILE_MORE_r(
	2058	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2059	(int)depth * 2 + 2, "",
	2060	(UV)trie->statecount,
	2061	(UV)trie->lasttrans)
	2062	);
	2063	/* resize the trans array to remove unused space */
	2064	trie->trans = (reg_trie_trans *)
	2065	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2066	* sizeof(reg_trie_trans) );
	2067
	2068	{ /* Modify the program and insert the new TRIE node */
	2069	U8 nodetype =(U8)(flags & 0xFF);
	2070	char *str=NULL;
	2071
	2072	#ifdef DEBUGGING
	2073	regnode *optimize = NULL;
	2074	#ifdef RE_TRACK_PATTERN_OFFSETS
	2075
	2076	U32 mjd_offset = 0;
	2077	U32 mjd_nodelen = 0;
	2078	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2079	#endif /* DEBUGGING */
	2080	/*
	2081	This means we convert either the first branch or the first Exact,
	2082	depending on whether the thing following (in 'last') is a branch
	2083	or not and whther first is the startbranch (ie is it a sub part of
	2084	the alternation or is it the whole thing.)
	2085	Assuming its a sub part we convert the EXACT otherwise we convert
	2086	the whole branch sequence, including the first.
	2087	*/
	2088	/* Find the node we are going to overwrite */
	2089	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2090	/* branch sub-chain */
	2091	NEXT_OFF( first ) = (U16)(last - first);
	2092	#ifdef RE_TRACK_PATTERN_OFFSETS
	2093	DEBUG_r({
	2094	mjd_offset= Node_Offset((convert));
	2095	mjd_nodelen= Node_Length((convert));
	2096	});
	2097	#endif
	2098	/* whole branch chain */
	2099	}
	2100	#ifdef RE_TRACK_PATTERN_OFFSETS
	2101	else {
	2102	DEBUG_r({
	2103	const regnode *nop = NEXTOPER( convert );
	2104	mjd_offset= Node_Offset((nop));
	2105	mjd_nodelen= Node_Length((nop));
	2106	});
	2107	}
	2108	DEBUG_OPTIMISE_r(
	2109	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2110	(int)depth * 2 + 2, "",
	2111	(UV)mjd_offset, (UV)mjd_nodelen)
	2112	);
	2113	#endif
	2114	/* But first we check to see if there is a common prefix we can
	2115	split out as an EXACT and put in front of the TRIE node. */
	2116	trie->startstate= 1;
	2117	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2118	U32 state;
	2119	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2120	U32 ofs = 0;
	2121	I32 idx = -1;
	2122	U32 count = 0;
	2123	const U32 base = trie->states[ state ].trans.base;
	2124
	2125	if ( trie->states[state].wordnum )
	2126	count = 1;
	2127
	2128	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2129	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2130	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2131	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2132	{
	2133	if ( ++count > 1 ) {
	2134	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2135	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2136	if ( state == 1 ) break;
	2137	if ( count == 2 ) {
	2138	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2139	DEBUG_OPTIMISE_r(
	2140	PerlIO_printf(Perl_debug_log,
	2141	"%*sNew Start State=%"UVuf" Class: [",
	2142	(int)depth * 2 + 2, "",
	2143	(UV)state));
	2144	if (idx >= 0) {
	2145	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2146	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2147
	2148	TRIE_BITMAP_SET(trie,*ch);
	2149	if ( folder )
	2150	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2151	DEBUG_OPTIMISE_r(
	2152	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2153	);
	2154	}
	2155	}
	2156	TRIE_BITMAP_SET(trie,*ch);
	2157	if ( folder )
	2158	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2159	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2160	}
	2161	idx = ofs;
	2162	}
	2163	}
	2164	if ( count == 1 ) {
	2165	SV **tmp = av_fetch( revcharmap, idx, 0);
	2166	STRLEN len;
	2167	char ch = SvPV( tmp, len );
	2168	DEBUG_OPTIMISE_r({
	2169	SV *sv=sv_newmortal();
	2170	PerlIO_printf( Perl_debug_log,
	2171	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2172	(int)depth * 2 + 2, "",
	2173	(UV)state, (UV)idx,
	2174	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2175	PL_colors[0], PL_colors[1],
	2176	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2177	PERL_PV_ESCAPE_FIRSTCHAR
	2178	)
	2179	);
	2180	});
	2181	if ( state==1 ) {
	2182	OP( convert ) = nodetype;
	2183	str=STRING(convert);
	2184	STR_LEN(convert)=0;
	2185	}
	2186	STR_LEN(convert) += len;
	2187	while (len--)
	2188	str++ = ch++;
	2189	} else {
	2190	#ifdef DEBUGGING
	2191	if (state>1)
	2192	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2193	#endif
	2194	break;
	2195	}
	2196	}
	2197	trie->prefixlen = (state-1);
	2198	if (str) {
	2199	regnode *n = convert+NODE_SZ_STR(convert);
	2200	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2201	trie->startstate = state;
	2202	trie->minlen -= (state - 1);
	2203	trie->maxlen -= (state - 1);
	2204	#ifdef DEBUGGING
	2205	/* At least the UNICOS C compiler choked on this
	2206	* being argument to DEBUG_r(), so let's just have
	2207	* it right here. */
	2208	if (
	2209	#ifdef PERL_EXT_RE_BUILD
	2210	1
	2211	#else
	2212	DEBUG_r_TEST
	2213	#endif
	2214	) {
	2215	regnode *fix = convert;
	2216	U32 word = trie->wordcount;
	2217	mjd_nodelen++;
	2218	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2219	while( ++fix < n ) {
	2220	Set_Node_Offset_Length(fix, 0, 0);
	2221	}
	2222	while (word--) {
	2223	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2224	if (tmp) {
	2225	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2226	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2227	else
	2228	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2229	}
	2230	}
	2231	}
	2232	#endif
	2233	if (trie->maxlen) {
	2234	convert = n;
	2235	} else {
	2236	NEXT_OFF(convert) = (U16)(tail - convert);
	2237	DEBUG_r(optimize= n);
	2238	}
	2239	}
	2240	}
	2241	if (!jumper)
	2242	jumper = last;
	2243	if ( trie->maxlen ) {
	2244	NEXT_OFF( convert ) = (U16)(tail - convert);
	2245	ARG_SET( convert, data_slot );
	2246	/* Store the offset to the first unabsorbed branch in
	2247	jump[0], which is otherwise unused by the jump logic.
	2248	We use this when dumping a trie and during optimisation. */
	2249	if (trie->jump)
	2250	trie->jump[0] = (U16)(nextbranch - convert);
	2251
	2252	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2253	* and there is a bitmap
	2254	* and the first "jump target" node we found leaves enough room
	2255	* then convert the TRIE node into a TRIEC node, with the bitmap
	2256	* embedded inline in the opcode - this is hypothetically faster.
	2257	*/
	2258	if ( !trie->states[trie->startstate].wordnum
	2259	&& trie->bitmap
	2260	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2261	{
	2262	OP( convert ) = TRIEC;
	2263	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2264	PerlMemShared_free(trie->bitmap);
	2265	trie->bitmap= NULL;
	2266	} else
	2267	OP( convert ) = TRIE;
	2268
	2269	/* store the type in the flags */
	2270	convert->flags = nodetype;
	2271	DEBUG_r({
	2272	optimize = convert
	2273	+ NODE_STEP_REGNODE
	2274	+ regarglen[ OP( convert ) ];
	2275	});
	2276	/* XXX We really should free up the resource in trie now,
	2277	as we won't use them - (which resources?) dmq */
	2278	}
	2279	/* needed for dumping*/
	2280	DEBUG_r(if (optimize) {
	2281	regnode *opt = convert;
	2282
	2283	while ( ++opt < optimize) {
	2284	Set_Node_Offset_Length(opt,0,0);
	2285	}
	2286	/*
	2287	Try to clean up some of the debris left after the
	2288	optimisation.
	2289	*/
	2290	while( optimize < jumper ) {
	2291	mjd_nodelen += Node_Length((optimize));
	2292	OP( optimize ) = OPTIMIZED;
	2293	Set_Node_Offset_Length(optimize,0,0);
	2294	optimize++;
	2295	}
	2296	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2297	});
	2298	} /* end node insert */
	2299
	2300	/* Finish populating the prev field of the wordinfo array. Walk back
	2301	* from each accept state until we find another accept state, and if
	2302	* so, point the first word's .prev field at the second word. If the
	2303	* second already has a .prev field set, stop now. This will be the
	2304	* case either if we've already processed that word's accept state,
	2305	* or that state had multiple words, and the overspill words were
	2306	* already linked up earlier.
	2307	*/
	2308	{
	2309	U16 word;
	2310	U32 state;
	2311	U16 prev;
	2312
	2313	for (word=1; word <= trie->wordcount; word++) {
	2314	prev = 0;
	2315	if (trie->wordinfo[word].prev)
	2316	continue;
	2317	state = trie->wordinfo[word].accept;
	2318	while (state) {
	2319	state = prev_states[state];
	2320	if (!state)
	2321	break;
	2322	prev = trie->states[state].wordnum;
	2323	if (prev)
	2324	break;
	2325	}
	2326	trie->wordinfo[word].prev = prev;
	2327	}
	2328	Safefree(prev_states);
	2329	}
	2330
	2331
	2332	/* and now dump out the compressed format */
	2333	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2334
	2335	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2336	#ifdef DEBUGGING
	2337	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2338	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2339	#else
	2340	SvREFCNT_dec(revcharmap);
	2341	#endif
	2342	return trie->jump
	2343	? MADE_JUMP_TRIE
	2344	: trie->startstate>1
	2345	? MADE_EXACT_TRIE
	2346	: MADE_TRIE;
	2347	}
	2348
	2349	STATIC void
	2350	S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
	2351	{
	2352	    /* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2353
	2354	    This is basically the Aho-Corasick algorithm. It's from exercise 3.31 and 3.32 in the
	2355	    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2356	    ISBN 0-201-10088-6
	2357
	2358	    We find the fail state for each state in the trie, this state is the longest proper
	2359	    suffix of the current state's 'word' that is also a proper prefix of another word in our
	2360	    trie. State 1 represents the word '' and is thus the default fail state. This allows
	2361	    the DFA not to have to restart after it's tried and failed a word at a given point, it
	2362	    simply continues as though it had been matching the other word in the first place.
	2363	    Consider
	2364	    'abcdgu'=~/abcdefg|cdgu/
	2365	    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2366	    fail, which would bring us to the state representing 'd' in the second word where we would
	2367	    try 'g' and succeed, proceeding to match 'cdgu'.
	2368	    */
	2369	    /* add a fail transition. 'source' is the trie node (its ARG is the trie's data slot), 'stclass' is the node whose ARG is pointed at the new reg_ac_data slot, 'depth' is only used to indent debug output. */
	2370	    const U32 trie_offset = ARG(source);
	2371	    reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
	2372	    U32 *q; /* queue of state numbers still to be processed */
	2373	    const U32 ucharcount = trie->uniquecharcount;
	2374	    const U32 numstates = trie->statecount;
	2375	    const U32 ubound = trie->lasttrans + ucharcount; /* not referenced by name below; presumably consumed inside TRIE_TRANS_STATE -- confirm against the macro */
	2376	    U32 q_read = 0;
	2377	    U32 q_write = 0;
	2378	    U32 charid;
	2379	    U32 base = trie->states[ 1 ].trans.base;
	2380	    U32 *fail;
	2381	    reg_ac_data *aho;
	2382	    const U32 data_slot = add_data( pRExC_state, 1, "T" ); /* slot that receives the reg_ac_data built here */
	2383	    GET_RE_DEBUG_FLAGS_DECL;
	2384
	2385	    PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2386	#ifndef DEBUGGING
	2387	    PERL_UNUSED_ARG(depth);
	2388	#endif
	2389
	2390
	2391	    ARG_SET( stclass, data_slot );
	2392	    aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2393	    RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2394	    aho->trie=trie_offset;
	2395	    aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
	2396	    Copy( trie->states, aho->states, numstates, reg_trie_state ); /* aho works on its own copy; wordnum entries in it are modified below */
	2397	    Newxz( q, numstates, U32);
	2398	    aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2399	    aho->refcount = 1;
	2400	    fail = aho->fail;
	2401	    /* initialize fail[0..1] to be 1 so that we always have
	2402	    a valid final fail state */
	2403	    fail[ 0 ] = fail[ 1 ] = 1;
	2404	    /* seed the queue with every state reachable directly from state 1 */
	2405	    for ( charid = 0; charid < ucharcount ; charid++ ) {
	2406	        const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2407	        if ( newstate ) {
	2408	            q[ q_write ] = newstate;
	2409	            /* set to point at the root */
	2410	            fail[ q[ q_write++ ] ]=1;
	2411	        }
	2412	    }
	2413	    while ( q_read < q_write) { /* drain the queue in FIFO order; q is indexed modulo numstates */
	2414	        const U32 cur = q[ q_read++ % numstates ];
	2415	        base = trie->states[ cur ].trans.base;
	2416
	2417	        for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2418	            const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2419	            if (ch_state) {
	2420	                U32 fail_state = cur;
	2421	                U32 fail_base;
	2422	                do { /* follow fail links from cur until a state with a transition on charid is found */
	2423	                    fail_state = fail[ fail_state ];
	2424	                    fail_base = aho->states[ fail_state ].trans.base;
	2425	                } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2426
	2427	                fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2428	                fail[ ch_state ] = fail_state;
	2429	                if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2430	                { /* a state without a wordnum takes over the wordnum of its fail state */
	2431	                    aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2432	                }
	2433	                q[ q_write++ % numstates] = ch_state;
	2434	            }
	2435	        }
	2436	    }
	2437	    /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2438	    when we fail in state 1, this allows us to use the
	2439	    charclass scan to find a valid start char. This is based on the principle
	2440	    that there's a good chance the string being searched contains lots of stuff
	2441	    that can't be a start char.
	2442	    */
	2443	    fail[ 0 ] = fail[ 1 ] = 0;
	2444	    DEBUG_TRIE_COMPILE_r({
	2445	        PerlIO_printf(Perl_debug_log,
	2446	            "%*sStclass Failtable (%"UVuf" states): 0",
	2447	            (int)(depth * 2), "", (UV)numstates
	2448	        );
	2449	        for( q_read=1; q_read<numstates; q_read++ ) {
	2450	            PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2451	        }
	2452	        PerlIO_printf(Perl_debug_log, "\n");
	2453	    });
	2454	    Safefree(q);
	2455	    /*RExC_seen |= REG_SEEN_TRIEDFA;*/
	2456	}
	2457
	2458
	2459	/*
	2460	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2461	* These need to be revisited when a newer toolchain becomes available.
	2462	*/
	2463	#if defined(__sparc64__) && defined(__GNUC__)
	2464	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2465	# undef SPARC64_GCC_WORKAROUND
	2466	# define SPARC64_GCC_WORKAROUND 1
	2467	# endif
	2468	#endif
	2469	/* DEBUG_PEEP(str,scan,depth): under DEBUG_OPTIMISE_r, when scan is non-NULL, print the tag str, the node number and regprop() text of scan, and the node number of regnext(scan) (0 if there is none), indented by depth*2. Uses RExC_rx from the expansion site. */
	2470	#define DEBUG_PEEP(str,scan,depth) \
	2471	DEBUG_OPTIMISE_r({if (scan){ \
	2472	SV * const mysv=sv_newmortal(); \
	2473	regnode *Next = regnext(scan); \
	2474	regprop(RExC_rx, mysv, scan); \
	2475	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2476	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2477	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2478	}});
	2479
	2480
	2481
	2482
	2483	/* JOIN_EXACT(scan,min,flags): call join_exact() on scan only when its regkind is EXACT. Relies on pRExC_state and depth being in scope at the expansion site. NOTE: expands to a bare, unbraced 'if' statement, so take care when using it directly before an 'else'. */
	2484	#define JOIN_EXACT(scan,min,flags) \
	2485	if (PL_regkind[OP(scan)] == EXACT) \
	2486	join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
	2487
	2488	STATIC U32
	2489	S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
	2490	    /* Merge several consecutive EXACTish nodes into one. 'scan' absorbs the nodes that follow it; '*min' is lowered by 4 for each multi-char fold sequence found in the merged string (see below); 'flags' and 'val' are only used under EXPERIMENTAL_INPLACESCAN; 'depth' only indents debug output. Returns stopnow, which is only ever set under EXPERIMENTAL_INPLACESCAN. */
	2491	    regnode *n = regnext(scan); /* candidate node to skip or merge */
	2492	    U32 stringok = 1; /* cleared once a TAIL, or a node beyond 'next', is seen; no string merging after that */
	2493	    regnode *next = scan + NODE_SZ_STR(scan); /* first regnode past the most recently absorbed node */
	2494	    U32 merged = 0; /* number of merge attempts; only consulted by the debug output at the end */
	2495	    U32 stopnow = 0;
	2496	#ifdef DEBUGGING
	2497	    regnode *stop = scan; /* last regnode to be marked OPTIMIZED for dumping */
	2498	    GET_RE_DEBUG_FLAGS_DECL;
	2499	#else
	2500	    PERL_UNUSED_ARG(depth);
	2501	#endif
	2502
	2503	    PERL_ARGS_ASSERT_JOIN_EXACT;
	2504	#ifndef EXPERIMENTAL_INPLACESCAN
	2505	    PERL_UNUSED_ARG(flags);
	2506	    PERL_UNUSED_ARG(val);
	2507	#endif
	2508	    DEBUG_PEEP("join",scan,depth);
	2509
	2510	    /* Skip NOTHING, merge EXACT*. */
	2511	    while (n &&
	2512	           ( PL_regkind[OP(n)] == NOTHING ||
	2513	             (stringok && (OP(n) == OP(scan))))
	2514	           && NEXT_OFF(n)
	2515	           && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
	2516
	2517	        if (OP(n) == TAIL || n > next)
	2518	            stringok = 0;
	2519	        if (PL_regkind[OP(n)] == NOTHING) {
	2520	            DEBUG_PEEP("skip:",n,depth);
	2521	            NEXT_OFF(scan) += NEXT_OFF(n); /* make scan jump straight over the skipped node */
	2522	            next = n + NODE_STEP_REGNODE;
	2523	#ifdef DEBUGGING
	2524	            if (stringok)
	2525	                stop = n;
	2526	#endif
	2527	            n = regnext(n);
	2528	        }
	2529	        else if (stringok) {
	2530	            const unsigned int oldl = STR_LEN(scan);
	2531	            regnode * const nnext = regnext(n); /* fetched before n's contents are moved */
	2532
	2533	            DEBUG_PEEP("merg",n,depth);
	2534
	2535	            merged++;
	2536	            if (oldl + STR_LEN(n) > U8_MAX)
	2537	                break; /* combined length would exceed U8_MAX; leave n unmerged */
	2538	            NEXT_OFF(scan) += NEXT_OFF(n);
	2539	            STR_LEN(scan) += STR_LEN(n);
	2540	            next = n + NODE_SZ_STR(n);
	2541	            /* Now we can overwrite *n : */
	2542	            Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2543	#ifdef DEBUGGING
	2544	            stop = next - 1;
	2545	#endif
	2546	            n = nnext;
	2547	            if (stopnow) break;
	2548	        }
	2549
	2550	#ifdef EXPERIMENTAL_INPLACESCAN
	2551	        if (flags && !NEXT_OFF(n)) {
	2552	            DEBUG_PEEP("atch", val, depth);
	2553	            if (reg_off_by_arg[OP(n)]) {
	2554	                ARG_SET(n, val - n);
	2555	            }
	2556	            else {
	2557	                NEXT_OFF(n) = val - n;
	2558	            }
	2559	            stopnow = 1;
	2560	        }
	2561	#endif
	2562	    }
	2563	#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
	2564	#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
	2565	#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
	2566	#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
	2567
	2568	    if (UTF
	2569	        && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
	2570	        && ( STR_LEN(scan) >= 6 ) )
	2571	    {
	2572	        /*
	2573	        Two problematic code points in Unicode casefolding of EXACT nodes:
	2574
	2575	        U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2576	        U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2577
	2578	        which casefold to
	2579
	2580	        Unicode UTF-8
	2581
	2582	        U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2583	        U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2584
	2585	        This means that in case-insensitive matching (or "loose matching",
	2586	        as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
	2587	        length of the above casefolded versions) can match a target string
	2588	        of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
	2589	        This would rather mess up the minimum length computation.
	2590
	2591	        What we'll do is to look for the tail four bytes, and then peek
	2592	        at the preceding two bytes to see whether we need to decrease
	2593	        the minimum length by four (six minus two).
	2594
	2595	        Thanks to the design of UTF-8, there cannot be false matches:
	2596	        A sequence of valid UTF-8 bytes cannot be a subsequence of
	2597	        another valid sequence of UTF-8 bytes.
	2598
	2599	        */
	2600	        char * const s0 = STRING(scan), *s, *t;
	2601	        char * const s1 = s0 + STR_LEN(scan) - 1;
	2602	        char * const s2 = s1 - 4;
	2603	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2604	        const char t0[] = "\xaf\x49\xaf\x42";
	2605	#else
	2606	        const char t0[] = "\xcc\x88\xcc\x81";
	2607	#endif
	2608	        const char * const t1 = t0 + 3; /* NOTE(review): t0 + 3 rather than t0 + 4, paired with s1 being one short of the string end -- confirm against ninstr()'s end-pointer convention */
	2609
	2610	        for (s = s0 + 2;
	2611	             s < s2 && (t = ninstr(s, s1, t0, t1));
	2612	             s = t + 4) {
	2613	#ifdef EBCDIC
	2614	            if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
	2615	                ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
	2616	#else
	2617	            if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
	2618	                ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
	2619	#endif
	2620	                *min -= 4;
	2621	        }
	2622	    }
	2623
	2624	#ifdef DEBUGGING
	2625	    /* Allow dumping */
	2626	    n = scan + NODE_SZ_STR(scan);
	2627	    while (n <= stop) {
	2628	        if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
	2629	            OP(n) = OPTIMIZED;
	2630	            NEXT_OFF(n) = 0;
	2631	        }
	2632	        n++;
	2633	    }
	2634	#endif
	2635	    DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2636	    return stopnow;
	2637	}
	2638
	2639	/* REx optimizer. Converts nodes into quicker variants "in place".
	2640	Finds fixed substrings. */
	2641
	2642	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2643	to the position after last scanned or to NULL. */
	2644	/* INIT_AND_WITHP: asserts that and_withp is still NULL, allocates one struct regnode_charclass_class into it with Newx, and hands the pointer to SAVEFREEPV (presumably so it is released automatically on scope exit -- confirm). The last statement has no trailing semicolon; the expansion site supplies it. */
	2645	#define INIT_AND_WITHP \
	2646	assert(!and_withp); \
	2647	Newx(and_withp,1,struct regnode_charclass_class); \
	2648	SAVEFREEPV(and_withp)
	2649
	2650	/* this is a chain of data about sub patterns we are processing that
	2651	need to be handled separately/specially in study_chunk. It's so
	2652	we can simulate recursion without losing state. */
	2653	struct scan_frame;
	2654	typedef struct scan_frame {
	2655	    regnode *last; /* last node to process in this frame */
	2656	    regnode *next; /* next node to process when last is reached */
	2657	    struct scan_frame *prev; /* previous frame */
	2658	    I32 stop; /* what stopparen do we use */
	2659	} scan_frame;
	2660
	2661
		/* SCAN_COMMIT: shorthand for scan_commit() that passes the `is_inf`
		 * variable in scope at the expansion site as the fourth argument. */
	2662	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2663
		/* CASE_SYNST_FNC(nAmE): emit two `case` arms, one for the op `nAmE`
		 * and one for its complement `N<nAmE>`, so it must be expanded inside
		 * a switch.
		 *
		 * Each arm tests every value from 0 to 255 with is_<nAmE>_cp() and
		 * updates the bitmap of data->start_class:
		 *   - with SCF_DO_STCLASS_AND set in `flags`, it clears the bit of
		 *     every value the op cannot match (non-members for `nAmE`,
		 *     members for `N<nAmE>`);
		 *   - otherwise it sets the bit of every value the op can match.
		 *
		 * The expansion uses the variables `flags`, `value` and `data` from
		 * the expansion site.  It ends in a bare `break`, so the caller
		 * supplies the terminating ';'.
		 */
	2664	#define CASE_SYNST_FNC(nAmE) \
	2665	case nAmE: \
	2666	if (flags & SCF_DO_STCLASS_AND) { \
	2667	for (value = 0; value < 256; value++) \
	2668	if (!is_ ## nAmE ## _cp(value)) \
	2669	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2670	} \
	2671	else { \
	2672	for (value = 0; value < 256; value++) \
	2673	if (is_ ## nAmE ## _cp(value)) \
	2674	ANYOF_BITMAP_SET(data->start_class, value); \
	2675	} \
	2676	break; \
	2677	case N ## nAmE: \
	2678	if (flags & SCF_DO_STCLASS_AND) { \
	2679	for (value = 0; value < 256; value++) \
	2680	if (is_ ## nAmE ## _cp(value)) \
	2681	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2682	} \
	2683	else { \
	2684	for (value = 0; value < 256; value++) \
	2685	if (!is_ ## nAmE ## _cp(value)) \
	2686	ANYOF_BITMAP_SET(data->start_class, value); \
	2687	} \
	2688	break
	2689
	2690
	2691
	2692	STATIC I32
	2693	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2694	I32 minlenp, I32 deltap,
	2695	regnode *last,
	2696	scan_data_t *data,
	2697	I32 stopparen,
	2698	U8* recursed,
	2699	struct regnode_charclass_class *and_withp,
	2700	U32 flags, U32 depth)
	2701	/* scanp: Start here (read-write). */
	2702	/* deltap: Write maxlen-minlen here. */
	2703	/* last: Stop before this one. */
	2704	/* data: string data about the pattern */
	2705	/* stopparen: treat close N as END */
	2706	/* recursed: which subroutines have we recursed into */
	2707	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2708	{
	2709	dVAR;
	2710	I32 min = 0, pars = 0, code;
	2711	regnode scan = scanp, *next;
	2712	I32 delta = 0;
	2713	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2714	int is_inf_internal = 0; /* The studied chunk is infinite */
	2715	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2716	scan_data_t data_fake;
	2717	SV *re_trie_maxbuff = NULL;
	2718	regnode *first_non_open = scan;
	2719	I32 stopmin = I32_MAX;
	2720	scan_frame *frame = NULL;
	2721	GET_RE_DEBUG_FLAGS_DECL;
	2722
	2723	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2724
	2725	#ifdef DEBUGGING
	2726	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2727	#endif
	2728
	2729	if ( depth == 0 ) {
	2730	while (first_non_open && OP(first_non_open) == OPEN)
	2731	first_non_open=regnext(first_non_open);
	2732	}
	2733
	2734
	2735	fake_study_recurse:
	2736	while ( scan && OP(scan) != END && scan < last ){
	2737	/* Peephole optimizer: */
	2738	DEBUG_STUDYDATA("Peep:", data,depth);
	2739	DEBUG_PEEP("Peep",scan,depth);
	2740	JOIN_EXACT(scan,&min,0);
	2741
	2742	/* Follow the next-chain of the current node and optimize
	2743	away all the NOTHINGs from it. */
	2744	if (OP(scan) != CURLYX) {
	2745	const int max = (reg_off_by_arg[OP(scan)]
	2746	? I32_MAX
	2747	/* I32 may be smaller than U16 on CRAYs! */
	2748	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	2749	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	2750	int noff;
	2751	regnode *n = scan;
	2752
	2753	/* Skip NOTHING and LONGJMP. */
	2754	while ((n = regnext(n))
	2755	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	2756	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	2757	&& off + noff < max)
	2758	off += noff;
	2759	if (reg_off_by_arg[OP(scan)])
	2760	ARG(scan) = off;
	2761	else
	2762	NEXT_OFF(scan) = off;
	2763	}
	2764
	2765
	2766
	2767	/* The principal pseudo-switch. Cannot be a switch, since we
	2768	look into several different things. */
	2769	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	2770	\|\| OP(scan) == IFTHEN) {
	2771	next = regnext(scan);
	2772	code = OP(scan);
	2773	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	2774
	2775	if (OP(next) == code \|\| code == IFTHEN) {
	2776	/* NOTE - There is similar code to this block below for handling
	2777	TRIE nodes on a re-study. If you change stuff here check there
	2778	too. */
	2779	I32 max1 = 0, min1 = I32_MAX, num = 0;
	2780	struct regnode_charclass_class accum;
	2781	regnode * const startbranch=scan;
	2782
	2783	if (flags & SCF_DO_SUBSTR)
	2784	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	2785	if (flags & SCF_DO_STCLASS)
	2786	cl_init_zero(pRExC_state, &accum);
	2787
	2788	while (OP(scan) == code) {
	2789	I32 deltanext, minnext, f = 0, fake;
	2790	struct regnode_charclass_class this_class;
	2791
	2792	num++;
	2793	data_fake.flags = 0;
	2794	if (data) {
	2795	data_fake.whilem_c = data->whilem_c;
	2796	data_fake.last_closep = data->last_closep;
	2797	}
	2798	else
	2799	data_fake.last_closep = &fake;
	2800
	2801	data_fake.pos_delta = delta;
	2802	next = regnext(scan);
	2803	scan = NEXTOPER(scan);
	2804	if (code != BRANCH)
	2805	scan = NEXTOPER(scan);
	2806	if (flags & SCF_DO_STCLASS) {
	2807	cl_init(pRExC_state, &this_class);
	2808	data_fake.start_class = &this_class;
	2809	f = SCF_DO_STCLASS_AND;
	2810	}
	2811	if (flags & SCF_WHILEM_VISITED_POS)
	2812	f \|= SCF_WHILEM_VISITED_POS;
	2813
	2814	/* we suppose the run is continuous, last=next...*/
	2815	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	2816	next, &data_fake,
	2817	stopparen, recursed, NULL, f,depth+1);
	2818	if (min1 > minnext)
	2819	min1 = minnext;
	2820	if (max1 < minnext + deltanext)
	2821	max1 = minnext + deltanext;
	2822	if (deltanext == I32_MAX)
	2823	is_inf = is_inf_internal = 1;
	2824	scan = next;
	2825	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	2826	pars++;
	2827	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	2828	if ( stopmin > minnext)
	2829	stopmin = min + min1;
	2830	flags &= ~SCF_DO_SUBSTR;
	2831	if (data)
	2832	data->flags \|= SCF_SEEN_ACCEPT;
	2833	}
	2834	if (data) {
	2835	if (data_fake.flags & SF_HAS_EVAL)
	2836	data->flags \|= SF_HAS_EVAL;
	2837	data->whilem_c = data_fake.whilem_c;
	2838	}
	2839	if (flags & SCF_DO_STCLASS)
	2840	cl_or(pRExC_state, &accum, &this_class);
	2841	}
	2842	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	2843	min1 = 0;
	2844	if (flags & SCF_DO_SUBSTR) {
	2845	data->pos_min += min1;
	2846	data->pos_delta += max1 - min1;
	2847	if (max1 != min1 \|\| is_inf)
	2848	data->longest = &(data->longest_float);
	2849	}
	2850	min += min1;
	2851	delta += max1 - min1;
	2852	if (flags & SCF_DO_STCLASS_OR) {
	2853	cl_or(pRExC_state, data->start_class, &accum);
	2854	if (min1) {
	2855	cl_and(data->start_class, and_withp);
	2856	flags &= ~SCF_DO_STCLASS;
	2857	}
	2858	}
	2859	else if (flags & SCF_DO_STCLASS_AND) {
	2860	if (min1) {
	2861	cl_and(data->start_class, &accum);
	2862	flags &= ~SCF_DO_STCLASS;
	2863	}
	2864	else {
	2865	/* Switch to OR mode: cache the old value of
	2866	* data->start_class */
	2867	INIT_AND_WITHP;
	2868	StructCopy(data->start_class, and_withp,
	2869	struct regnode_charclass_class);
	2870	flags &= ~SCF_DO_STCLASS_AND;
	2871	StructCopy(&accum, data->start_class,
	2872	struct regnode_charclass_class);
	2873	flags \|= SCF_DO_STCLASS_OR;
	2874	data->start_class->flags \|= ANYOF_EOS;
	2875	}
	2876	}
	2877
	2878	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	2879	/* demq.
	2880
	2881	Assuming this was/is a branch we are dealing with: 'scan' now
	2882	points at the item that follows the branch sequence, whatever
	2883	it is. We now start at the beginning of the sequence and look
	2884	for subsequences of
	2885
	2886	BRANCH->EXACT=>x1
	2887	BRANCH->EXACT=>x2
	2888	tail
	2889
	2890	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	2891
	2892	If we can find such a subsequence we need to turn the first
	2893	element into a trie and then add the subsequent branch exact
	2894	strings to the trie.
	2895
	2896	We have two cases
	2897
	2898	1. patterns where the whole set of branches can be converted.
	2899
	2900	2. patterns where only a subset can be converted.
	2901
	2902	In case 1 we can replace the whole set with a single regop
	2903	for the trie. In case 2 we need to keep the start and end
	2904	branches so
	2905
	2906	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	2907	becomes BRANCH TRIE; BRANCH X;
	2908
	2909	There is an additional case, that being where there is a
	2910	common prefix, which gets split out into an EXACT like node
	2911	preceding the TRIE node.
	2912
	2913	If x(1..n)==tail then we can do a simple trie, if not we make
	2914	a "jump" trie, such that when we match the appropriate word
	2915	we "jump" to the appropriate tail node. Essentially we turn
	2916	a nested if into a case structure of sorts.
	2917
	2918	*/
	2919
	2920	int made=0;
	2921	if (!re_trie_maxbuff) {
	2922	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	2923	if (!SvIOK(re_trie_maxbuff))
	2924	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	2925	}
	2926	if ( SvIV(re_trie_maxbuff)>=0 ) {
	2927	regnode *cur;
	2928	regnode first = (regnode )NULL;
	2929	regnode last = (regnode )NULL;
	2930	regnode *tail = scan;
	2931	U8 optype = 0;
	2932	U32 count=0;
	2933
	2934	#ifdef DEBUGGING
	2935	SV * const mysv = sv_newmortal(); /* for dumping */
	2936	#endif
	2937	/* var tail is used because there may be a TAIL
	2938	regop in the way. Ie, the exacts will point to the
	2939	thing following the TAIL, but the last branch will
	2940	point at the TAIL. So we advance tail. If we
	2941	have nested (?:) we may have to move through several
	2942	tails.
	2943	*/
	2944
	2945	while ( OP( tail ) == TAIL ) {
	2946	/* this is the TAIL generated by (?:) */
	2947	tail = regnext( tail );
	2948	}
	2949
	2950
	2951	DEBUG_OPTIMISE_r({
	2952	regprop(RExC_rx, mysv, tail );
	2953	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	2954	(int)depth * 2 + 2, "",
	2955	"Looking for TRIE'able sequences. Tail node is: ",
	2956	SvPV_nolen_const( mysv )
	2957	);
	2958	});
	2959
	2960	/*
	2961
	2962	step through the branches, cur represents each
	2963	branch, noper is the first thing to be matched
	2964	as part of that branch and noper_next is the
	2965	regnext() of that node. if noper is an EXACT
	2966	and noper_next is the same as scan (our current
	2967	position in the regex) then the EXACT branch is
	2968	a possible optimization target. Once we have
	2969	two or more consecutive such branches we can
	2970	create a trie of the EXACT's contents and stich
	2971	it in place. If the sequence represents all of
	2972	the branches we eliminate the whole thing and
	2973	replace it with a single TRIE. If it is a
	2974	subsequence then we need to stitch it in. This
	2975	means the first branch has to remain, and needs
	2976	to be repointed at the item on the branch chain
	2977	following the last branch optimized. This could
	2978	be either a BRANCH, in which case the
	2979	subsequence is internal, or it could be the
	2980	item following the branch sequence in which
	2981	case the subsequence is at the end.
	2982
	2983	*/
	2984
	2985	/* dont use tail as the end marker for this traverse */
	2986	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	2987	regnode * const noper = NEXTOPER( cur );
	2988	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	2989	regnode * const noper_next = regnext( noper );
	2990	#endif
	2991
	2992	DEBUG_OPTIMISE_r({
	2993	regprop(RExC_rx, mysv, cur);
	2994	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	2995	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	2996
	2997	regprop(RExC_rx, mysv, noper);
	2998	PerlIO_printf( Perl_debug_log, " -> %s",
	2999	SvPV_nolen_const(mysv));
	3000
	3001	if ( noper_next ) {
	3002	regprop(RExC_rx, mysv, noper_next );
	3003	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3004	SvPV_nolen_const(mysv));
	3005	}
	3006	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	3007	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	3008	});
	3009	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	3010	: PL_regkind[ OP( noper ) ] == EXACT )
	3011	\|\| OP(noper) == NOTHING )
	3012	#ifdef NOJUMPTRIE
	3013	&& noper_next == tail
	3014	#endif
	3015	&& count < U16_MAX)
	3016	{
	3017	count++;
	3018	if ( !first \|\| optype == NOTHING ) {
	3019	if (!first) first = cur;
	3020	optype = OP( noper );
	3021	} else {
	3022	last = cur;
	3023	}
	3024	} else {
	3025	/*
	3026	Currently we do not believe that the trie logic can
	3027	handle case insensitive matching properly when the
	3028	pattern is not unicode (thus forcing unicode semantics).
	3029
	3030	If/when this is fixed the following define can be swapped
	3031	in below to fully enable trie logic.
	3032
	3033	XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
	3034	not /aa
	3035
	3036	#define TRIE_TYPE_IS_SAFE 1
	3037
	3038	*/
	3039	#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) \|\| optype==EXACT)
	3040
	3041	if ( last && TRIE_TYPE_IS_SAFE ) {
	3042	make_trie( pRExC_state,
	3043	startbranch, first, cur, tail, count,
	3044	optype, depth+1 );
	3045	}
	3046	if ( PL_regkind[ OP( noper ) ] == EXACT
	3047	#ifdef NOJUMPTRIE
	3048	&& noper_next == tail
	3049	#endif
	3050	){
	3051	count = 1;
	3052	first = cur;
	3053	optype = OP( noper );
	3054	} else {
	3055	count = 0;
	3056	first = NULL;
	3057	optype = 0;
	3058	}
	3059	last = NULL;
	3060	}
	3061	}
	3062	DEBUG_OPTIMISE_r({
	3063	regprop(RExC_rx, mysv, cur);
	3064	PerlIO_printf( Perl_debug_log,
	3065	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3066	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3067
	3068	});
	3069
	3070	if ( last && TRIE_TYPE_IS_SAFE ) {
	3071	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	3072	#ifdef TRIE_STUDY_OPT
	3073	if ( ((made == MADE_EXACT_TRIE &&
	3074	startbranch == first)
	3075	\|\| ( first_non_open == first )) &&
	3076	depth==0 ) {
	3077	flags \|= SCF_TRIE_RESTUDY;
	3078	if ( startbranch == first
	3079	&& scan == tail )
	3080	{
	3081	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3082	}
	3083	}
	3084	#endif
	3085	}
	3086	}
	3087
	3088	} /* do trie */
	3089
	3090	}
	3091	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3092	scan = NEXTOPER(NEXTOPER(scan));
	3093	} else /* single branch is optimized. */
	3094	scan = NEXTOPER(scan);
	3095	continue;
	3096	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3097	scan_frame *newframe = NULL;
	3098	I32 paren;
	3099	regnode *start;
	3100	regnode *end;
	3101
	3102	if (OP(scan) != SUSPEND) {
	3103	/* set the pointer */
	3104	if (OP(scan) == GOSUB) {
	3105	paren = ARG(scan);
	3106	RExC_recurse[ARG2L(scan)] = scan;
	3107	start = RExC_open_parens[paren-1];
	3108	end = RExC_close_parens[paren-1];
	3109	} else {
	3110	paren = 0;
	3111	start = RExC_rxi->program + 1;
	3112	end = RExC_opend;
	3113	}
	3114	if (!recursed) {
	3115	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3116	SAVEFREEPV(recursed);
	3117	}
	3118	if (!PAREN_TEST(recursed,paren+1)) {
	3119	PAREN_SET(recursed,paren+1);
	3120	Newx(newframe,1,scan_frame);
	3121	} else {
	3122	if (flags & SCF_DO_SUBSTR) {
	3123	SCAN_COMMIT(pRExC_state,data,minlenp);
	3124	data->longest = &(data->longest_float);
	3125	}
	3126	is_inf = is_inf_internal = 1;
	3127	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3128	cl_anything(pRExC_state, data->start_class);
	3129	flags &= ~SCF_DO_STCLASS;
	3130	}
	3131	} else {
	3132	Newx(newframe,1,scan_frame);
	3133	paren = stopparen;
	3134	start = scan+2;
	3135	end = regnext(scan);
	3136	}
	3137	if (newframe) {
	3138	assert(start);
	3139	assert(end);
	3140	SAVEFREEPV(newframe);
	3141	newframe->next = regnext(scan);
	3142	newframe->last = last;
	3143	newframe->stop = stopparen;
	3144	newframe->prev = frame;
	3145
	3146	frame = newframe;
	3147	scan = start;
	3148	stopparen = paren;
	3149	last = end;
	3150
	3151	continue;
	3152	}
	3153	}
	3154	else if (OP(scan) == EXACT) {
	3155	I32 l = STR_LEN(scan);
	3156	UV uc;
	3157	if (UTF) {
	3158	const U8 * const s = (U8*)STRING(scan);
	3159	l = utf8_length(s, s + l);
	3160	uc = utf8_to_uvchr(s, NULL);
	3161	} else {
	3162	uc = ((U8)STRING(scan));
	3163	}
	3164	min += l;
	3165	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3166	/* The code below prefers earlier match for fixed
	3167	offset, later match for variable offset. */
	3168	if (data->last_end == -1) { /* Update the start info. */
	3169	data->last_start_min = data->pos_min;
	3170	data->last_start_max = is_inf
	3171	? I32_MAX : data->pos_min + data->pos_delta;
	3172	}
	3173	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3174	if (UTF)
	3175	SvUTF8_on(data->last_found);
	3176	{
	3177	SV * const sv = data->last_found;
	3178	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3179	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3180	if (mg && mg->mg_len >= 0)
	3181	mg->mg_len += utf8_length((U8*)STRING(scan),
	3182	(U8*)STRING(scan)+STR_LEN(scan));
	3183	}
	3184	data->last_end = data->pos_min + l;
	3185	data->pos_min += l; /* As in the first entry. */
	3186	data->flags &= ~SF_BEFORE_EOL;
	3187	}
	3188	if (flags & SCF_DO_STCLASS_AND) {
	3189	/* Check whether it is compatible with what we know already! */
	3190	int compat = 1;
	3191
	3192
	3193	/* If compatible, we or it in below. It is compatible if is
	3194	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3195	* it's for a locale. Even if there isn't unicode semantics
	3196	* here, at runtime there may be because of matching against a
	3197	* utf8 string, so accept a possible false positive for
	3198	* latin1-range folds */
	3199	if (uc >= 0x100 \|\|
	3200	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3201	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3202	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3203	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3204	)
	3205	{
	3206	compat = 0;
	3207	}
	3208	ANYOF_CLASS_ZERO(data->start_class);
	3209	ANYOF_BITMAP_ZERO(data->start_class);
	3210	if (compat)
	3211	ANYOF_BITMAP_SET(data->start_class, uc);
	3212	else if (uc >= 0x100) {
	3213	int i;
	3214
	3215	/* Some Unicode code points fold to the Latin1 range; as
	3216	* XXX temporary code, instead of figuring out if this is
	3217	* one, just assume it is and set all the start class bits
	3218	* that could be some such above 255 code point's fold
	3219	* which will generate fals positives. As the code
	3220	* elsewhere that does compute the fold settles down, it
	3221	* can be extracted out and re-used here */
	3222	for (i = 0; i < 256; i++){
	3223	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3224	ANYOF_BITMAP_SET(data->start_class, i);
	3225	}
	3226	}
	3227	}
	3228	data->start_class->flags &= ~ANYOF_EOS;
	3229	if (uc < 0x100)
	3230	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3231	}
	3232	else if (flags & SCF_DO_STCLASS_OR) {
	3233	/* false positive possible if the class is case-folded */
	3234	if (uc < 0x100)
	3235	ANYOF_BITMAP_SET(data->start_class, uc);
	3236	else
	3237	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3238	data->start_class->flags &= ~ANYOF_EOS;
	3239	cl_and(data->start_class, and_withp);
	3240	}
	3241	flags &= ~SCF_DO_STCLASS;
	3242	}
	3243	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3244	I32 l = STR_LEN(scan);
	3245	UV uc = ((U8)STRING(scan));
	3246
	3247	/* Search for fixed substrings supports EXACT only. */
	3248	if (flags & SCF_DO_SUBSTR) {
	3249	assert(data);
	3250	SCAN_COMMIT(pRExC_state, data, minlenp);
	3251	}
	3252	if (UTF) {
	3253	const U8 * const s = (U8 *)STRING(scan);
	3254	l = utf8_length(s, s + l);
	3255	uc = utf8_to_uvchr(s, NULL);
	3256	}
	3257	min += l;
	3258	if (flags & SCF_DO_SUBSTR)
	3259	data->pos_min += l;
	3260	if (flags & SCF_DO_STCLASS_AND) {
	3261	/* Check whether it is compatible with what we know already! */
	3262	int compat = 1;
	3263	if (uc >= 0x100 \|\|
	3264	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3265	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3266	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3267	{
	3268	compat = 0;
	3269	}
	3270	ANYOF_CLASS_ZERO(data->start_class);
	3271	ANYOF_BITMAP_ZERO(data->start_class);
	3272	if (compat) {
	3273	ANYOF_BITMAP_SET(data->start_class, uc);
	3274	data->start_class->flags &= ~ANYOF_EOS;
	3275	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3276	if (OP(scan) == EXACTFL) {
	3277	/* XXX This set is probably no longer necessary, and
	3278	* probably wrong as LOCALE now is on in the initial
	3279	* state */
	3280	data->start_class->flags \|= ANYOF_LOCALE;
	3281	}
	3282	else {
	3283
	3284	/* Also set the other member of the fold pair. In case
	3285	* that unicode semantics is called for at runtime, use
	3286	* the full latin1 fold. (Can't do this for locale,
	3287	* because not known until runtime */
	3288	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3289	}
	3290	}
	3291	else if (uc >= 0x100) {
	3292	int i;
	3293	for (i = 0; i < 256; i++){
	3294	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3295	ANYOF_BITMAP_SET(data->start_class, i);
	3296	}
	3297	}
	3298	}
	3299	}
	3300	else if (flags & SCF_DO_STCLASS_OR) {
	3301	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3302	/* false positive possible if the class is case-folded.
	3303	Assume that the locale settings are the same... */
	3304	if (uc < 0x100) {
	3305	ANYOF_BITMAP_SET(data->start_class, uc);
	3306	if (OP(scan) != EXACTFL) {
	3307
	3308	/* And set the other member of the fold pair, but
	3309	* can't do that in locale because not known until
	3310	* run-time */
	3311	ANYOF_BITMAP_SET(data->start_class,
	3312	PL_fold_latin1[uc]);
	3313	}
	3314	}
	3315	data->start_class->flags &= ~ANYOF_EOS;
	3316	}
	3317	cl_and(data->start_class, and_withp);
	3318	}
	3319	flags &= ~SCF_DO_STCLASS;
	3320	}
	3321	else if (REGNODE_VARIES(OP(scan))) {
	3322	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3323	I32 f = flags, pos_before = 0;
	3324	regnode * const oscan = scan;
	3325	struct regnode_charclass_class this_class;
	3326	struct regnode_charclass_class *oclass = NULL;
	3327	I32 next_is_eval = 0;
	3328
	3329	switch (PL_regkind[OP(scan)]) {
	3330	case WHILEM: /* End of (?:...)* . */
	3331	scan = NEXTOPER(scan);
	3332	goto finish;
	3333	case PLUS:
	3334	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3335	next = NEXTOPER(scan);
	3336	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3337	mincount = 1;
	3338	maxcount = REG_INFTY;
	3339	next = regnext(scan);
	3340	scan = NEXTOPER(scan);
	3341	goto do_curly;
	3342	}
	3343	}
	3344	if (flags & SCF_DO_SUBSTR)
	3345	data->pos_min++;
	3346	min++;
	3347	/* Fall through. */
	3348	case STAR:
	3349	if (flags & SCF_DO_STCLASS) {
	3350	mincount = 0;
	3351	maxcount = REG_INFTY;
	3352	next = regnext(scan);
	3353	scan = NEXTOPER(scan);
	3354	goto do_curly;
	3355	}
	3356	is_inf = is_inf_internal = 1;
	3357	scan = regnext(scan);
	3358	if (flags & SCF_DO_SUBSTR) {
	3359	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3360	data->longest = &(data->longest_float);
	3361	}
	3362	goto optimize_curly_tail;
	3363	case CURLY:
	3364	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3365	&& (scan->flags == stopparen))
	3366	{
	3367	mincount = 1;
	3368	maxcount = 1;
	3369	} else {
	3370	mincount = ARG1(scan);
	3371	maxcount = ARG2(scan);
	3372	}
	3373	next = regnext(scan);
	3374	if (OP(scan) == CURLYX) {
	3375	I32 lp = (data ? *(data->last_closep) : 0);
	3376	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3377	}
	3378	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3379	next_is_eval = (OP(scan) == EVAL);
	3380	do_curly:
	3381	if (flags & SCF_DO_SUBSTR) {
	3382	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3383	pos_before = data->pos_min;
	3384	}
	3385	if (data) {
	3386	fl = data->flags;
	3387	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3388	if (is_inf)
	3389	data->flags \|= SF_IS_INF;
	3390	}
	3391	if (flags & SCF_DO_STCLASS) {
	3392	cl_init(pRExC_state, &this_class);
	3393	oclass = data->start_class;
	3394	data->start_class = &this_class;
	3395	f \|= SCF_DO_STCLASS_AND;
	3396	f &= ~SCF_DO_STCLASS_OR;
	3397	}
	3398	/* Exclude from super-linear cache processing any {n,m}
	3399	regops for which the combination of input pos and regex
	3400	pos is not enough information to determine if a match
	3401	will be possible.
	3402
	3403	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3404	regex pos at the \s*, the prospects for a match depend not
	3405	only on the input position but also on how many (bar\s*)
	3406	repeats into the {4,8} we are. */
	3407	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3408	f &= ~SCF_WHILEM_VISITED_POS;
	3409
	3410	/* This will finish on WHILEM, setting scan, or on NULL: */
	3411	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3412	last, data, stopparen, recursed, NULL,
	3413	(mincount == 0
	3414	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3415
	3416	if (flags & SCF_DO_STCLASS)
	3417	data->start_class = oclass;
	3418	if (mincount == 0 \|\| minnext == 0) {
	3419	if (flags & SCF_DO_STCLASS_OR) {
	3420	cl_or(pRExC_state, data->start_class, &this_class);
	3421	}
	3422	else if (flags & SCF_DO_STCLASS_AND) {
	3423	/* Switch to OR mode: cache the old value of
	3424	* data->start_class */
	3425	INIT_AND_WITHP;
	3426	StructCopy(data->start_class, and_withp,
	3427	struct regnode_charclass_class);
	3428	flags &= ~SCF_DO_STCLASS_AND;
	3429	StructCopy(&this_class, data->start_class,
	3430	struct regnode_charclass_class);
	3431	flags \|= SCF_DO_STCLASS_OR;
	3432	data->start_class->flags \|= ANYOF_EOS;
	3433	}
	3434	} else { /* Non-zero len */
	3435	if (flags & SCF_DO_STCLASS_OR) {
	3436	cl_or(pRExC_state, data->start_class, &this_class);
	3437	cl_and(data->start_class, and_withp);
	3438	}
	3439	else if (flags & SCF_DO_STCLASS_AND)
	3440	cl_and(data->start_class, &this_class);
	3441	flags &= ~SCF_DO_STCLASS;
	3442	}
	3443	if (!scan) /* It was not CURLYX, but CURLY. */
	3444	scan = next;
	3445	if ( /* ? quantifier ok, except for (?{ ... }) */
	3446	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3447	&& (minnext == 0) && (deltanext == 0)
	3448	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3449	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3450	{
	3451	ckWARNreg(RExC_parse,
	3452	"Quantifier unexpected on zero-length expression");
	3453	}
	3454
	3455	min += minnext * mincount;
	3456	is_inf_internal \|= ((maxcount == REG_INFTY
	3457	&& (minnext + deltanext) > 0)
	3458	\|\| deltanext == I32_MAX);
	3459	is_inf \|= is_inf_internal;
	3460	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3461
	3462	/* Try powerful optimization CURLYX => CURLYN. */
	3463	if ( OP(oscan) == CURLYX && data
	3464	&& data->flags & SF_IN_PAR
	3465	&& !(data->flags & SF_HAS_EVAL)
	3466	&& !deltanext && minnext == 1 ) {
	3467	/* Try to optimize to CURLYN. */
	3468	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3469	regnode * const nxt1 = nxt;
	3470	#ifdef DEBUGGING
	3471	regnode *nxt2;
	3472	#endif
	3473
	3474	/* Skip open. */
	3475	nxt = regnext(nxt);
	3476	if (!REGNODE_SIMPLE(OP(nxt))
	3477	&& !(PL_regkind[OP(nxt)] == EXACT
	3478	&& STR_LEN(nxt) == 1))
	3479	goto nogo;
	3480	#ifdef DEBUGGING
	3481	nxt2 = nxt;
	3482	#endif
	3483	nxt = regnext(nxt);
	3484	if (OP(nxt) != CLOSE)
	3485	goto nogo;
	3486	if (RExC_open_parens) {
	3487	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3488	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3489	}
	3490	/* Now we know that nxt2 is the only contents: */
	3491	oscan->flags = (U8)ARG(nxt);
	3492	OP(oscan) = CURLYN;
	3493	OP(nxt1) = NOTHING; /* was OPEN. */
	3494
	3495	#ifdef DEBUGGING
	3496	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3497	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3498	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3499	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3500	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3501	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3502	#endif
	3503	}
	3504	nogo:
	3505
	3506	/* Try optimization CURLYX => CURLYM. */
	3507	if ( OP(oscan) == CURLYX && data
	3508	&& !(data->flags & SF_HAS_PAR)
	3509	&& !(data->flags & SF_HAS_EVAL)
	3510	&& !deltanext /* atom is fixed width */
	3511	&& minnext != 0 /* CURLYM can't handle zero width */
	3512	) {
	3513	/* XXXX How to optimize if data == 0? */
	3514	/* Optimize to a simpler form. */
	3515	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3516	regnode *nxt2;
	3517
	3518	OP(oscan) = CURLYM;
	3519	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3520	&& (OP(nxt2) != WHILEM))
	3521	nxt = nxt2;
	3522	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3523	/* Need to optimize away parenths. */
	3524	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3525	/* Set the parenth number. */
	3526	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3527
	3528	oscan->flags = (U8)ARG(nxt);
	3529	if (RExC_open_parens) {
	3530	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3531	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3532	}
	3533	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3534	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3535
	3536	#ifdef DEBUGGING
	3537	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3538	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3539	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3540	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3541	#endif
	3542	#if 0
	3543	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3544	regnode *nnxt = regnext(nxt1);
	3545	if (nnxt == nxt) {
	3546	if (reg_off_by_arg[OP(nxt1)])
	3547	ARG_SET(nxt1, nxt2 - nxt1);
	3548	else if (nxt2 - nxt1 < U16_MAX)
	3549	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3550	else
	3551	OP(nxt) = NOTHING; /* Cannot beautify */
	3552	}
	3553	nxt1 = nnxt;
	3554	}
	3555	#endif
	3556	/* Optimize again: */
	3557	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3558	NULL, stopparen, recursed, NULL, 0,depth+1);
	3559	}
	3560	else
	3561	oscan->flags = 0;
	3562	}
	3563	else if ((OP(oscan) == CURLYX)
	3564	&& (flags & SCF_WHILEM_VISITED_POS)
	3565	/* See the comment on a similar expression above.
	3566	However, this time it's not a subexpression
	3567	we care about, but the expression itself. */
	3568	&& (maxcount == REG_INFTY)
	3569	&& data && ++data->whilem_c < 16) {
	3570	/* This stays as CURLYX, we can put the count/of pair. */
	3571	/* Find WHILEM (as in regexec.c) */
	3572	regnode *nxt = oscan + NEXT_OFF(oscan);
	3573
	3574	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3575	nxt += ARG(nxt);
	3576	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3577	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3578	}
	3579	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3580	pars++;
	3581	if (flags & SCF_DO_SUBSTR) {
	3582	SV *last_str = NULL;
	3583	int counted = mincount != 0;
	3584
	3585	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3586	#if defined(SPARC64_GCC_WORKAROUND)
	3587	I32 b = 0;
	3588	STRLEN l = 0;
	3589	const char *s = NULL;
	3590	I32 old = 0;
	3591
	3592	if (pos_before >= data->last_start_min)
	3593	b = pos_before;
	3594	else
	3595	b = data->last_start_min;
	3596
	3597	l = 0;
	3598	s = SvPV_const(data->last_found, l);
	3599	old = b - data->last_start_min;
	3600
	3601	#else
	3602	I32 b = pos_before >= data->last_start_min
	3603	? pos_before : data->last_start_min;
	3604	STRLEN l;
	3605	const char * const s = SvPV_const(data->last_found, l);
	3606	I32 old = b - data->last_start_min;
	3607	#endif
	3608
	3609	if (UTF)
	3610	old = utf8_hop((U8)s, old) - (U8)s;
	3611	l -= old;
	3612	/* Get the added string: */
	3613	last_str = newSVpvn_utf8(s + old, l, UTF);
	3614	if (deltanext == 0 && pos_before == b) {
	3615	/* What was added is a constant string */
	3616	if (mincount > 1) {
	3617	SvGROW(last_str, (mincount * l) + 1);
	3618	repeatcpy(SvPVX(last_str) + l,
	3619	SvPVX_const(last_str), l, mincount - 1);
	3620	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3621	/* Add additional parts. */
	3622	SvCUR_set(data->last_found,
	3623	SvCUR(data->last_found) - l);
	3624	sv_catsv(data->last_found, last_str);
	3625	{
	3626	SV * sv = data->last_found;
	3627	MAGIC *mg =
	3628	SvUTF8(sv) && SvMAGICAL(sv) ?
	3629	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3630	if (mg && mg->mg_len >= 0)
	3631	mg->mg_len += CHR_SVLEN(last_str) - l;
	3632	}
	3633	data->last_end += l * (mincount - 1);
	3634	}
	3635	} else {
	3636	/* start offset must point into the last copy */
	3637	data->last_start_min += minnext * (mincount - 1);
	3638	data->last_start_max += is_inf ? I32_MAX
	3639	: (maxcount - 1) * (minnext + data->pos_delta);
	3640	}
	3641	}
	3642	/* It is counted once already... */
	3643	data->pos_min += minnext * (mincount - counted);
	3644	data->pos_delta += - counted * deltanext +
	3645	(minnext + deltanext) * maxcount - minnext * mincount;
	3646	if (mincount != maxcount) {
	3647	/* Cannot extend fixed substrings found inside
	3648	the group. */
	3649	SCAN_COMMIT(pRExC_state,data,minlenp);
	3650	if (mincount && last_str) {
	3651	SV * const sv = data->last_found;
	3652	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3653	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3654
	3655	if (mg)
	3656	mg->mg_len = -1;
	3657	sv_setsv(sv, last_str);
	3658	data->last_end = data->pos_min;
	3659	data->last_start_min =
	3660	data->pos_min - CHR_SVLEN(last_str);
	3661	data->last_start_max = is_inf
	3662	? I32_MAX
	3663	: data->pos_min + data->pos_delta
	3664	- CHR_SVLEN(last_str);
	3665	}
	3666	data->longest = &(data->longest_float);
	3667	}
	3668	SvREFCNT_dec(last_str);
	3669	}
	3670	if (data && (fl & SF_HAS_EVAL))
	3671	data->flags \|= SF_HAS_EVAL;
	3672	optimize_curly_tail:
	3673	if (OP(oscan) != CURLYX) {
	3674	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3675	&& NEXT_OFF(next))
	3676	NEXT_OFF(oscan) += NEXT_OFF(next);
	3677	}
	3678	continue;
	3679	default: /* REF, ANYOFV, and CLUMP only? */
	3680	if (flags & SCF_DO_SUBSTR) {
	3681	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3682	data->longest = &(data->longest_float);
	3683	}
	3684	is_inf = is_inf_internal = 1;
	3685	if (flags & SCF_DO_STCLASS_OR)
	3686	cl_anything(pRExC_state, data->start_class);
	3687	flags &= ~SCF_DO_STCLASS;
	3688	break;
	3689	}
	3690	}
	3691	else if (OP(scan) == LNBREAK) {
	3692	if (flags & SCF_DO_STCLASS) {
	3693	int value = 0;
	3694	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3695	if (flags & SCF_DO_STCLASS_AND) {
	3696	for (value = 0; value < 256; value++)
	3697	if (!is_VERTWS_cp(value))
	3698	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3699	}
	3700	else {
	3701	for (value = 0; value < 256; value++)
	3702	if (is_VERTWS_cp(value))
	3703	ANYOF_BITMAP_SET(data->start_class, value);
	3704	}
	3705	if (flags & SCF_DO_STCLASS_OR)
	3706	cl_and(data->start_class, and_withp);
	3707	flags &= ~SCF_DO_STCLASS;
	3708	}
	3709	min += 1;
	3710	delta += 1;
	3711	if (flags & SCF_DO_SUBSTR) {
	3712	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3713	data->pos_min += 1;
	3714	data->pos_delta += 1;
	3715	data->longest = &(data->longest_float);
	3716	}
	3717	}
	3718	else if (OP(scan) == FOLDCHAR) {
	3719	int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
	3720	flags &= ~SCF_DO_STCLASS;
	3721	min += 1;
	3722	delta += d;
	3723	if (flags & SCF_DO_SUBSTR) {
	3724	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3725	data->pos_min += 1;
	3726	data->pos_delta += d;
	3727	data->longest = &(data->longest_float);
	3728	}
	3729	}
	3730	else if (REGNODE_SIMPLE(OP(scan))) {
	3731	int value = 0;
	3732
	3733	if (flags & SCF_DO_SUBSTR) {
	3734	SCAN_COMMIT(pRExC_state,data,minlenp);
	3735	data->pos_min++;
	3736	}
	3737	min++;
	3738	if (flags & SCF_DO_STCLASS) {
	3739	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3740
	3741	/* Some of the logic below assumes that switching
	3742	locale on will only add false positives. */
	3743	switch (PL_regkind[OP(scan)]) {
	3744	case SANY:
	3745	default:
	3746	do_default:
	3747	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	3748	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3749	cl_anything(pRExC_state, data->start_class);
	3750	break;
	3751	case REG_ANY:
	3752	if (OP(scan) == SANY)
	3753	goto do_default;
	3754	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	3755	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	3756	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	3757	cl_anything(pRExC_state, data->start_class);
	3758	}
	3759	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	3760	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	3761	break;
	3762	case ANYOF:
	3763	if (flags & SCF_DO_STCLASS_AND)
	3764	cl_and(data->start_class,
	3765	(struct regnode_charclass_class*)scan);
	3766	else
	3767	cl_or(pRExC_state, data->start_class,
	3768	(struct regnode_charclass_class*)scan);
	3769	break;
	3770	case ALNUM:
	3771	if (flags & SCF_DO_STCLASS_AND) {
	3772	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3773	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	3774	if (OP(scan) == ALNUMU) {
	3775	for (value = 0; value < 256; value++) {
	3776	if (!isWORDCHAR_L1(value)) {
	3777	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3778	}
	3779	}
	3780	} else {
	3781	for (value = 0; value < 256; value++) {
	3782	if (!isALNUM(value)) {
	3783	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3784	}
	3785	}
	3786	}
	3787	}
	3788	}
	3789	else {
	3790	if (data->start_class->flags & ANYOF_LOCALE)
	3791	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	3792
	3793	/* Even if under locale, set the bits for non-locale
	3794	* in case it isn't a true locale-node. This will
	3795	* create false positives if it truly is locale */
	3796	if (OP(scan) == ALNUMU) {
	3797	for (value = 0; value < 256; value++) {
	3798	if (isWORDCHAR_L1(value)) {
	3799	ANYOF_BITMAP_SET(data->start_class, value);
	3800	}
	3801	}
	3802	} else {
	3803	for (value = 0; value < 256; value++) {
	3804	if (isALNUM(value)) {
	3805	ANYOF_BITMAP_SET(data->start_class, value);
	3806	}
	3807	}
	3808	}
	3809	}
	3810	break;
	3811	case NALNUM:
	3812	if (flags & SCF_DO_STCLASS_AND) {
	3813	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3814	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	3815	if (OP(scan) == NALNUMU) {
	3816	for (value = 0; value < 256; value++) {
	3817	if (isWORDCHAR_L1(value)) {
	3818	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3819	}
	3820	}
	3821	} else {
	3822	for (value = 0; value < 256; value++) {
	3823	if (isALNUM(value)) {
	3824	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3825	}
	3826	}
	3827	}
	3828	}
	3829	}
	3830	else {
	3831	if (data->start_class->flags & ANYOF_LOCALE)
	3832	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	3833
	3834	/* Even if under locale, set the bits for non-locale in
	3835	* case it isn't a true locale-node. This will create
	3836	* false positives if it truly is locale */
	3837	if (OP(scan) == NALNUMU) {
	3838	for (value = 0; value < 256; value++) {
	3839	if (! isWORDCHAR_L1(value)) {
	3840	ANYOF_BITMAP_SET(data->start_class, value);
	3841	}
	3842	}
	3843	} else {
	3844	for (value = 0; value < 256; value++) {
	3845	if (! isALNUM(value)) {
	3846	ANYOF_BITMAP_SET(data->start_class, value);
	3847	}
	3848	}
	3849	}
	3850	}
	3851	break;
	3852	case SPACE:
	3853	if (flags & SCF_DO_STCLASS_AND) {
	3854	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3855	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	3856	if (OP(scan) == SPACEU) {
	3857	for (value = 0; value < 256; value++) {
	3858	if (!isSPACE_L1(value)) {
	3859	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3860	}
	3861	}
	3862	} else {
	3863	for (value = 0; value < 256; value++) {
	3864	if (!isSPACE(value)) {
	3865	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3866	}
	3867	}
	3868	}
	3869	}
	3870	}
	3871	else {
	3872	if (data->start_class->flags & ANYOF_LOCALE) {
	3873	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	3874	}
	3875	if (OP(scan) == SPACEU) {
	3876	for (value = 0; value < 256; value++) {
	3877	if (isSPACE_L1(value)) {
	3878	ANYOF_BITMAP_SET(data->start_class, value);
	3879	}
	3880	}
	3881	} else {
	3882	for (value = 0; value < 256; value++) {
	3883	if (isSPACE(value)) {
	3884	ANYOF_BITMAP_SET(data->start_class, value);
	3885	}
	3886	}
	3887	}
	3888	}
	3889	break;
	3890	case NSPACE:
	3891	if (flags & SCF_DO_STCLASS_AND) {
	3892	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3893	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	3894	if (OP(scan) == NSPACEU) {
	3895	for (value = 0; value < 256; value++) {
	3896	if (isSPACE_L1(value)) {
	3897	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3898	}
	3899	}
	3900	} else {
	3901	for (value = 0; value < 256; value++) {
	3902	if (isSPACE(value)) {
	3903	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3904	}
	3905	}
	3906	}
	3907	}
	3908	}
	3909	else {
	3910	if (data->start_class->flags & ANYOF_LOCALE)
	3911	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	3912	if (OP(scan) == NSPACEU) {
	3913	for (value = 0; value < 256; value++) {
	3914	if (!isSPACE_L1(value)) {
	3915	ANYOF_BITMAP_SET(data->start_class, value);
	3916	}
	3917	}
	3918	}
	3919	else {
	3920	for (value = 0; value < 256; value++) {
	3921	if (!isSPACE(value)) {
	3922	ANYOF_BITMAP_SET(data->start_class, value);
	3923	}
	3924	}
	3925	}
	3926	}
	3927	break;
	3928	case DIGIT:
	3929	if (flags & SCF_DO_STCLASS_AND) {
	3930	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3931	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	3932	for (value = 0; value < 256; value++)
	3933	if (!isDIGIT(value))
	3934	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3935	}
	3936	}
	3937	else {
	3938	if (data->start_class->flags & ANYOF_LOCALE)
	3939	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	3940	for (value = 0; value < 256; value++)
	3941	if (isDIGIT(value))
	3942	ANYOF_BITMAP_SET(data->start_class, value);
	3943	}
	3944	break;
	3945	case NDIGIT:
	3946	if (flags & SCF_DO_STCLASS_AND) {
	3947	if (!(data->start_class->flags & ANYOF_LOCALE))
	3948	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	3949	for (value = 0; value < 256; value++)
	3950	if (isDIGIT(value))
	3951	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3952	}
	3953	else {
	3954	if (data->start_class->flags & ANYOF_LOCALE)
	3955	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	3956	for (value = 0; value < 256; value++)
	3957	if (!isDIGIT(value))
	3958	ANYOF_BITMAP_SET(data->start_class, value);
	3959	}
	3960	break;
	3961	CASE_SYNST_FNC(VERTWS);
	3962	CASE_SYNST_FNC(HORIZWS);
	3963
	3964	}
	3965	if (flags & SCF_DO_STCLASS_OR)
	3966	cl_and(data->start_class, and_withp);
	3967	flags &= ~SCF_DO_STCLASS;
	3968	}
	3969	}
	3970	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	3971	data->flags \|= (OP(scan) == MEOL
	3972	? SF_BEFORE_MEOL
	3973	: SF_BEFORE_SEOL);
	3974	}
	3975	else if ( PL_regkind[OP(scan)] == BRANCHJ
	3976	/* Lookbehind, or need to calculate parens/evals/stclass: */
	3977	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	3978	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	3979	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	3980	\|\| OP(scan) == UNLESSM )
	3981	{
	3982	/* Negative Lookahead/lookbehind
	3983	In this case we can't do fixed string optimisation.
	3984	*/
	3985
	3986	I32 deltanext, minnext, fake = 0;
	3987	regnode *nscan;
	3988	struct regnode_charclass_class intrnl;
	3989	int f = 0;
	3990
	3991	data_fake.flags = 0;
	3992	if (data) {
	3993	data_fake.whilem_c = data->whilem_c;
	3994	data_fake.last_closep = data->last_closep;
	3995	}
	3996	else
	3997	data_fake.last_closep = &fake;
	3998	data_fake.pos_delta = delta;
	3999	if ( flags & SCF_DO_STCLASS && !scan->flags
	4000	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4001	cl_init(pRExC_state, &intrnl);
	4002	data_fake.start_class = &intrnl;
	4003	f \|= SCF_DO_STCLASS_AND;
	4004	}
	4005	if (flags & SCF_WHILEM_VISITED_POS)
	4006	f \|= SCF_WHILEM_VISITED_POS;
	4007	next = regnext(scan);
	4008	nscan = NEXTOPER(NEXTOPER(scan));
	4009	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4010	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4011	if (scan->flags) {
	4012	if (deltanext) {
	4013	FAIL("Variable length lookbehind not implemented");
	4014	}
	4015	else if (minnext > (I32)U8_MAX) {
	4016	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4017	}
	4018	scan->flags = (U8)minnext;
	4019	}
	4020	if (data) {
	4021	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4022	pars++;
	4023	if (data_fake.flags & SF_HAS_EVAL)
	4024	data->flags \|= SF_HAS_EVAL;
	4025	data->whilem_c = data_fake.whilem_c;
	4026	}
	4027	if (f & SCF_DO_STCLASS_AND) {
	4028	if (flags & SCF_DO_STCLASS_OR) {
	4029	/* OR before, AND after: ideally we would recurse with
	4030	* data_fake to get the AND applied by study of the
	4031	* remainder of the pattern, and then derecurse;
	4032	* * HACK * for now just treat as "no information".
	4033	* See [perl #56690].
	4034	*/
	4035	cl_init(pRExC_state, data->start_class);
	4036	} else {
	4037	/* AND before and after: combine and continue */
	4038	const int was = (data->start_class->flags & ANYOF_EOS);
	4039
	4040	cl_and(data->start_class, &intrnl);
	4041	if (was)
	4042	data->start_class->flags \|= ANYOF_EOS;
	4043	}
	4044	}
	4045	}
	4046	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4047	else {
	4048	/* Positive Lookahead/lookbehind
	4049	In this case we can do fixed string optimisation,
	4050	but we must be careful about it. Note in the case of
	4051	lookbehind the positions will be offset by the minimum
	4052	length of the pattern, something we won't know about
	4053	until after the recurse.
	4054	*/
	4055	I32 deltanext, fake = 0;
	4056	regnode *nscan;
	4057	struct regnode_charclass_class intrnl;
	4058	int f = 0;
	4059	/* We use SAVEFREEPV so that when the full compile
	4060	is finished perl will clean up the allocated
	4061	minlens when it's all done. This way we don't
	4062	have to worry about freeing them when we know
	4063	they wont be used, which would be a pain.
	4064	*/
	4065	I32 *minnextp;
	4066	Newx( minnextp, 1, I32 );
	4067	SAVEFREEPV(minnextp);
	4068
	4069	if (data) {
	4070	StructCopy(data, &data_fake, scan_data_t);
	4071	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4072	f \|= SCF_DO_SUBSTR;
	4073	if (scan->flags)
	4074	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4075	data_fake.last_found=newSVsv(data->last_found);
	4076	}
	4077	}
	4078	else
	4079	data_fake.last_closep = &fake;
	4080	data_fake.flags = 0;
	4081	data_fake.pos_delta = delta;
	4082	if (is_inf)
	4083	data_fake.flags \|= SF_IS_INF;
	4084	if ( flags & SCF_DO_STCLASS && !scan->flags
	4085	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4086	cl_init(pRExC_state, &intrnl);
	4087	data_fake.start_class = &intrnl;
	4088	f \|= SCF_DO_STCLASS_AND;
	4089	}
	4090	if (flags & SCF_WHILEM_VISITED_POS)
	4091	f \|= SCF_WHILEM_VISITED_POS;
	4092	next = regnext(scan);
	4093	nscan = NEXTOPER(NEXTOPER(scan));
	4094
	4095	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4096	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4097	if (scan->flags) {
	4098	if (deltanext) {
	4099	FAIL("Variable length lookbehind not implemented");
	4100	}
	4101	else if (*minnextp > (I32)U8_MAX) {
	4102	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4103	}
	4104	scan->flags = (U8)*minnextp;
	4105	}
	4106
	4107	*minnextp += min;
	4108
	4109	if (f & SCF_DO_STCLASS_AND) {
	4110	const int was = (data->start_class->flags & ANYOF_EOS);
	4111
	4112	cl_and(data->start_class, &intrnl);
	4113	if (was)
	4114	data->start_class->flags \|= ANYOF_EOS;
	4115	}
	4116	if (data) {
	4117	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4118	pars++;
	4119	if (data_fake.flags & SF_HAS_EVAL)
	4120	data->flags \|= SF_HAS_EVAL;
	4121	data->whilem_c = data_fake.whilem_c;
	4122	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4123	if (RExC_rx->minlen<*minnextp)
	4124	RExC_rx->minlen=*minnextp;
	4125	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4126	SvREFCNT_dec(data_fake.last_found);
	4127
	4128	if ( data_fake.minlen_fixed != minlenp )
	4129	{
	4130	data->offset_fixed= data_fake.offset_fixed;
	4131	data->minlen_fixed= data_fake.minlen_fixed;
	4132	data->lookbehind_fixed+= scan->flags;
	4133	}
	4134	if ( data_fake.minlen_float != minlenp )
	4135	{
	4136	data->minlen_float= data_fake.minlen_float;
	4137	data->offset_float_min=data_fake.offset_float_min;
	4138	data->offset_float_max=data_fake.offset_float_max;
	4139	data->lookbehind_float+= scan->flags;
	4140	}
	4141	}
	4142	}
	4143
	4144
	4145	}
	4146	#endif
	4147	}
	4148	else if (OP(scan) == OPEN) {
	4149	if (stopparen != (I32)ARG(scan))
	4150	pars++;
	4151	}
	4152	else if (OP(scan) == CLOSE) {
	4153	if (stopparen == (I32)ARG(scan)) {
	4154	break;
	4155	}
	4156	if ((I32)ARG(scan) == is_par) {
	4157	next = regnext(scan);
	4158
	4159	if ( next && (OP(next) != WHILEM) && next < last)
	4160	is_par = 0; /* Disable optimization */
	4161	}
	4162	if (data)
	4163	*(data->last_closep) = ARG(scan);
	4164	}
	4165	else if (OP(scan) == EVAL) {
	4166	if (data)
	4167	data->flags \|= SF_HAS_EVAL;
	4168	}
	4169	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4170	if (flags & SCF_DO_SUBSTR) {
	4171	SCAN_COMMIT(pRExC_state,data,minlenp);
	4172	flags &= ~SCF_DO_SUBSTR;
	4173	}
	4174	if (data && OP(scan)==ACCEPT) {
	4175	data->flags \|= SCF_SEEN_ACCEPT;
	4176	if (stopmin > min)
	4177	stopmin = min;
	4178	}
	4179	}
	4180	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4181	{
	4182	if (flags & SCF_DO_SUBSTR) {
	4183	SCAN_COMMIT(pRExC_state,data,minlenp);
	4184	data->longest = &(data->longest_float);
	4185	}
	4186	is_inf = is_inf_internal = 1;
	4187	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4188	cl_anything(pRExC_state, data->start_class);
	4189	flags &= ~SCF_DO_STCLASS;
	4190	}
	4191	else if (OP(scan) == GPOS) {
	4192	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4193	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4194	{
	4195	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4196	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4197	if (RExC_rx->gofs < (U32)min)
	4198	RExC_rx->gofs = min;
	4199	} else {
	4200	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4201	RExC_rx->gofs = 0;
	4202	}
	4203	}
	4204	#ifdef TRIE_STUDY_OPT
	4205	#ifdef FULL_TRIE_STUDY
	4206	else if (PL_regkind[OP(scan)] == TRIE) {
	4207	/* NOTE - There is similar code to this block above for handling
	4208	BRANCH nodes on the initial study. If you change stuff here
	4209	check there too. */
	4210	regnode *trie_node= scan;
	4211	regnode *tail= regnext(scan);
	4212	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4213	I32 max1 = 0, min1 = I32_MAX;
	4214	struct regnode_charclass_class accum;
	4215
	4216	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4217	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4218	if (flags & SCF_DO_STCLASS)
	4219	cl_init_zero(pRExC_state, &accum);
	4220
	4221	if (!trie->jump) {
	4222	min1= trie->minlen;
	4223	max1= trie->maxlen;
	4224	} else {
	4225	const regnode *nextbranch= NULL;
	4226	U32 word;
	4227
	4228	for ( word=1 ; word <= trie->wordcount ; word++)
	4229	{
	4230	I32 deltanext=0, minnext=0, f = 0, fake;
	4231	struct regnode_charclass_class this_class;
	4232
	4233	data_fake.flags = 0;
	4234	if (data) {
	4235	data_fake.whilem_c = data->whilem_c;
	4236	data_fake.last_closep = data->last_closep;
	4237	}
	4238	else
	4239	data_fake.last_closep = &fake;
	4240	data_fake.pos_delta = delta;
	4241	if (flags & SCF_DO_STCLASS) {
	4242	cl_init(pRExC_state, &this_class);
	4243	data_fake.start_class = &this_class;
	4244	f = SCF_DO_STCLASS_AND;
	4245	}
	4246	if (flags & SCF_WHILEM_VISITED_POS)
	4247	f \|= SCF_WHILEM_VISITED_POS;
	4248
	4249	if (trie->jump[word]) {
	4250	if (!nextbranch)
	4251	nextbranch = trie_node + trie->jump[0];
	4252	scan= trie_node + trie->jump[word];
	4253	/* We go from the jump point to the branch that follows
	4254	it. Note this means we need the vestigal unused branches
	4255	even though they arent otherwise used.
	4256	*/
	4257	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4258	&deltanext, (regnode *)nextbranch, &data_fake,
	4259	stopparen, recursed, NULL, f,depth+1);
	4260	}
	4261	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4262	nextbranch= regnext((regnode*)nextbranch);
	4263
	4264	if (min1 > (I32)(minnext + trie->minlen))
	4265	min1 = minnext + trie->minlen;
	4266	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4267	max1 = minnext + deltanext + trie->maxlen;
	4268	if (deltanext == I32_MAX)
	4269	is_inf = is_inf_internal = 1;
	4270
	4271	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4272	pars++;
	4273	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4274	if ( stopmin > min + min1)
	4275	stopmin = min + min1;
	4276	flags &= ~SCF_DO_SUBSTR;
	4277	if (data)
	4278	data->flags \|= SCF_SEEN_ACCEPT;
	4279	}
	4280	if (data) {
	4281	if (data_fake.flags & SF_HAS_EVAL)
	4282	data->flags \|= SF_HAS_EVAL;
	4283	data->whilem_c = data_fake.whilem_c;
	4284	}
	4285	if (flags & SCF_DO_STCLASS)
	4286	cl_or(pRExC_state, &accum, &this_class);
	4287	}
	4288	}
	4289	if (flags & SCF_DO_SUBSTR) {
	4290	data->pos_min += min1;
	4291	data->pos_delta += max1 - min1;
	4292	if (max1 != min1 \|\| is_inf)
	4293	data->longest = &(data->longest_float);
	4294	}
	4295	min += min1;
	4296	delta += max1 - min1;
	4297	if (flags & SCF_DO_STCLASS_OR) {
	4298	cl_or(pRExC_state, data->start_class, &accum);
	4299	if (min1) {
	4300	cl_and(data->start_class, and_withp);
	4301	flags &= ~SCF_DO_STCLASS;
	4302	}
	4303	}
	4304	else if (flags & SCF_DO_STCLASS_AND) {
	4305	if (min1) {
	4306	cl_and(data->start_class, &accum);
	4307	flags &= ~SCF_DO_STCLASS;
	4308	}
	4309	else {
	4310	/* Switch to OR mode: cache the old value of
	4311	* data->start_class */
	4312	INIT_AND_WITHP;
	4313	StructCopy(data->start_class, and_withp,
	4314	struct regnode_charclass_class);
	4315	flags &= ~SCF_DO_STCLASS_AND;
	4316	StructCopy(&accum, data->start_class,
	4317	struct regnode_charclass_class);
	4318	flags \|= SCF_DO_STCLASS_OR;
	4319	data->start_class->flags \|= ANYOF_EOS;
	4320	}
	4321	}
	4322	scan= tail;
	4323	continue;
	4324	}
	4325	#else
	4326	else if (PL_regkind[OP(scan)] == TRIE) {
	4327	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4328	U8*bang=NULL;
	4329
	4330	min += trie->minlen;
	4331	delta += (trie->maxlen - trie->minlen);
	4332	flags &= ~SCF_DO_STCLASS; /* xxx */
	4333	if (flags & SCF_DO_SUBSTR) {
	4334	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4335	data->pos_min += trie->minlen;
	4336	data->pos_delta += (trie->maxlen - trie->minlen);
	4337	if (trie->maxlen != trie->minlen)
	4338	data->longest = &(data->longest_float);
	4339	}
	4340	if (trie->jump) /* no more substrings -- for now /grr*/
	4341	flags &= ~SCF_DO_SUBSTR;
	4342	}
	4343	#endif /* old or new */
	4344	#endif /* TRIE_STUDY_OPT */
	4345
	4346	/* Else: zero-length, ignore. */
	4347	scan = regnext(scan);
	4348	}
	4349	if (frame) {
	4350	last = frame->last;
	4351	scan = frame->next;
	4352	stopparen = frame->stop;
	4353	frame = frame->prev;
	4354	goto fake_study_recurse;
	4355	}
	4356
	4357	finish:
	4358	assert(!frame);
	4359	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4360
	4361	*scanp = scan;
	4362	*deltap = is_inf_internal ? I32_MAX : delta;
	4363	if (flags & SCF_DO_SUBSTR && is_inf)
	4364	data->pos_delta = I32_MAX - data->pos_min;
	4365	if (is_par > (I32)U8_MAX)
	4366	is_par = 0;
	4367	if (is_par && pars==1 && data) {
	4368	data->flags \|= SF_IN_PAR;
	4369	data->flags &= ~SF_HAS_PAR;
	4370	}
	4371	else if (pars && data) {
	4372	data->flags \|= SF_HAS_PAR;
	4373	data->flags &= ~SF_IN_PAR;
	4374	}
	4375	if (flags & SCF_DO_STCLASS_OR)
	4376	cl_and(data->start_class, and_withp);
	4377	if (flags & SCF_TRIE_RESTUDY)
	4378	data->flags \|= SCF_TRIE_RESTUDY;
	4379
	4380	DEBUG_STUDYDATA("post-fin:",data,depth);
	4381
	4382	return min < stopmin ? min : stopmin;
	4383	}
	4384
	4385	STATIC U32
	4386	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4387	{
	4388	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4389
	4390	PERL_ARGS_ASSERT_ADD_DATA;
	4391
	4392	Renewc(RExC_rxi->data,
	4393	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4394	char, struct reg_data);
	4395	if(count)
	4396	Renew(RExC_rxi->data->what, count + n, U8);
	4397	else
	4398	Newx(RExC_rxi->data->what, n, U8);
	4399	RExC_rxi->data->count = count + n;
	4400	Copy(s, RExC_rxi->data->what + count, n, U8);
	4401	return count;
	4402	}
	4403
	4404	/* XXX: todo make this not included in a non debugging perl */
	4405	#ifndef PERL_IN_XSUB_RE
	4406	void
	4407	Perl_reginitcolors(pTHX)
	4408	{
	4409	dVAR;
	4410	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4411	if (s) {
	4412	char *t = savepv(s);
	4413	int i = 0;
	4414	PL_colors[0] = t;
	4415	while (++i < 6) {
	4416	t = strchr(t, '\t');
	4417	if (t) {
	4418	*t = '\0';
	4419	PL_colors[i] = ++t;
	4420	}
	4421	else
	4422	PL_colors[i] = t = (char *)"";
	4423	}
	4424	} else {
	4425	int i = 0;
	4426	while (i < 6)
	4427	PL_colors[i++] = (char *)"";
	4428	}
	4429	PL_colorset = 1;
	4430	}
	4431	#endif
	4432
	4433
	/*
	 * CHECK_RESTUDY_GOTO - jump to the reStudy label when the study pass set
	 * SCF_TRIE_RESTUDY in data.flags and no restudy has happened yet.
	 *
	 * Expands to a bare "if (...) goto reStudy" without a trailing semicolon,
	 * and relies on 'data', 'restudied' and the 'reStudy' label being in
	 * scope at the point of use.  'restudied' is incremented only when the
	 * flag is set, so the jump is taken at most once.  Without TRIE_STUDY_OPT
	 * the macro expands to nothing.
	 */
	#ifndef TRIE_STUDY_OPT
	#define CHECK_RESTUDY_GOTO
	#else
	#define CHECK_RESTUDY_GOTO \
	    if ((data.flags & SCF_TRIE_RESTUDY) && !restudied++) goto reStudy
	#endif
	4443
	4444	/*
	4445	- pregcomp - compile a regular expression into internal code
	4446	*
	4447	* We can't allocate space until we know how big the compiled form will be,
	4448	* but we can't compile it (and thus know how big it is) until we've got a
	4449	* place to put the code. So we cheat: we compile it twice, once with code
	4450	* generation turned off and size counting turned on, and once "for real".
	4451	* This also means that we don't allocate space until we are sure that the
	4452	* thing really will compile successfully, and we never have to move the
	4453	* code and thus invalidate pointers into it. (Note that it has to be in
	4454	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4455	*
	4456	* Beware that the optimization-preparation code in here knows about some
	4457	* of the structure of the compiled regexp. [I'll say.]
	4458	*/
	4459
	4460
	4461
	/*
	 * RE_ENGINE_PTR - address of the regexp_engine structure that regexps
	 * compiled by this file are tagged with.  When PERL_IN_XSUB_RE is defined
	 * it is my_reg_engine, declared here and defined elsewhere; otherwise it
	 * is PL_core_reg_engine.
	 */
	#ifdef PERL_IN_XSUB_RE
	extern const struct regexp_engine my_reg_engine;
	#define RE_ENGINE_PTR &my_reg_engine
	#else
	#define RE_ENGINE_PTR &PL_core_reg_engine
	#endif
	4468
	4469	#ifndef PERL_IN_XSUB_RE
	4470	REGEXP *
	4471	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4472	{
	4473	dVAR;
	4474	HV * const table = GvHV(PL_hintgv);
	4475
	4476	PERL_ARGS_ASSERT_PREGCOMP;
	4477
	4478	/* Dispatch a request to compile a regexp to correct
	4479	regexp engine. */
	4480	if (table) {
	4481	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4482	GET_RE_DEBUG_FLAGS_DECL;
	4483	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4484	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4485	DEBUG_COMPILE_r({
	4486	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4487	SvIV(*ptr));
	4488	});
	4489	return CALLREGCOMP_ENG(eng, pattern, flags);
	4490	}
	4491	}
	4492	return Perl_re_compile(aTHX_ pattern, flags);
	4493	}
	4494	#endif
	4495
	4496	REGEXP *
	4497	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4498	{
	4499	dVAR;
	4500	REGEXP *rx;
	4501	struct regexp *r;
	4502	register regexp_internal *ri;
	4503	STRLEN plen;
	4504	char *exp;
	4505	char* xend;
	4506	regnode *scan;
	4507	I32 flags;
	4508	I32 minlen = 0;
	4509	U32 pm_flags;
	4510
	4511	/* these are all flags - maybe they should be turned
	4512	* into a single int with different bit masks */
	4513	I32 sawlookahead = 0;
	4514	I32 sawplus = 0;
	4515	I32 sawopen = 0;
	4516	bool used_setjump = FALSE;
	4517	regex_charset initial_charset = get_regex_charset(orig_pm_flags);
	4518
	4519	U8 jump_ret = 0;
	4520	dJMPENV;
	4521	scan_data_t data;
	4522	RExC_state_t RExC_state;
	4523	RExC_state_t * const pRExC_state = &RExC_state;
	4524	#ifdef TRIE_STUDY_OPT
	4525	int restudied;
	4526	RExC_state_t copyRExC_state;
	4527	#endif
	4528	GET_RE_DEBUG_FLAGS_DECL;
	4529
	4530	PERL_ARGS_ASSERT_RE_COMPILE;
	4531
	4532	DEBUG_r(if (!PL_colorset) reginitcolors());
	4533
	4534	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4535	RExC_uni_semantics = 0;
	4536	RExC_contains_locale = 0;
	4537
	4538	/**************** LONG JUMP TARGET HERE*********************/
	4539	/* Longjmp back to here if have to switch in midstream to utf8 */
	4540	if (! RExC_orig_utf8) {
	4541	JMPENV_PUSH(jump_ret);
	4542	used_setjump = TRUE;
	4543	}
	4544
	4545	if (jump_ret == 0) { /* First time through */
	4546	exp = SvPV(pattern, plen);
	4547	xend = exp + plen;
	4548	/* ignore the utf8ness if the pattern is 0 length */
	4549	if (plen == 0) {
	4550	RExC_utf8 = RExC_orig_utf8 = 0;
	4551	}
	4552
	4553	DEBUG_COMPILE_r({
	4554	SV *dsv= sv_newmortal();
	4555	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4556	dsv, exp, plen, 60);
	4557	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4558	PL_colors[4],PL_colors[5],s);
	4559	});
	4560	}
	4561	else { /* longjumped back */
	4562	STRLEN len = plen;
	4563
	4564	/* If the cause for the longjmp was other than changing to utf8, pop
	4565	* our own setjmp, and longjmp to the correct handler */
	4566	if (jump_ret != UTF8_LONGJMP) {
	4567	JMPENV_POP;
	4568	JMPENV_JUMP(jump_ret);
	4569	}
	4570
	4571	GET_RE_DEBUG_FLAGS;
	4572
	4573	/* It's possible to write a regexp in ascii that represents Unicode
	4574	codepoints outside of the byte range, such as via \x{100}. If we
	4575	detect such a sequence we have to convert the entire pattern to utf8
	4576	and then recompile, as our sizing calculation will have been based
	4577	on 1 byte == 1 character, but we will need to use utf8 to encode
	4578	at least some part of the pattern, and therefore must convert the whole
	4579	thing.
	4580	-- dmq */
	4581	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4582	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4583	exp = (char)Perl_bytes_to_utf8(aTHX_ (U8)SvPV(pattern, plen), &len);
	4584	xend = exp + len;
	4585	RExC_orig_utf8 = RExC_utf8 = 1;
	4586	SAVEFREEPV(exp);
	4587	}
	4588
	4589	#ifdef TRIE_STUDY_OPT
	4590	restudied = 0;
	4591	#endif
	4592
	4593	pm_flags = orig_pm_flags;
	4594
	4595	if (initial_charset == REGEX_LOCALE_CHARSET) {
	4596	RExC_contains_locale = 1;
	4597	}
	4598	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	4599
	4600	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4601	* 'depends' charset specified, as it means unicode when utf8 */
	4602	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4603	}
	4604
	4605	RExC_precomp = exp;
	4606	RExC_flags = pm_flags;
	4607	RExC_sawback = 0;
	4608
	4609	RExC_seen = 0;
	4610	RExC_in_lookbehind = 0;
	4611	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4612	RExC_seen_evals = 0;
	4613	RExC_extralen = 0;
	4614
	4615	/* First pass: determine size, legality. */
	4616	RExC_parse = exp;
	4617	RExC_start = exp;
	4618	RExC_end = xend;
	4619	RExC_naughty = 0;
	4620	RExC_npar = 1;
	4621	RExC_nestroot = 0;
	4622	RExC_size = 0L;
	4623	RExC_emit = &PL_regdummy;
	4624	RExC_whilem_seen = 0;
	4625	RExC_open_parens = NULL;
	4626	RExC_close_parens = NULL;
	4627	RExC_opend = NULL;
	4628	RExC_paren_names = NULL;
	4629	#ifdef DEBUGGING
	4630	RExC_paren_name_list = NULL;
	4631	#endif
	4632	RExC_recurse = NULL;
	4633	RExC_recurse_count = 0;
	4634
	4635	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4636	* Clever compilers notice this and complain. --jhi */
	4637	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4638	#endif
	4639	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
	4640	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4641	RExC_precomp = NULL;
	4642	return(NULL);
	4643	}
	4644
	4645	/* Here, finished first pass. Get rid of any added setjmp */
	4646	if (used_setjump) {
	4647	JMPENV_POP;
	4648	}
	4649
	4650	DEBUG_PARSE_r({
	4651	PerlIO_printf(Perl_debug_log,
	4652	"Required size %"IVdf" nodes\n"
	4653	"Starting second pass (creation)\n",
	4654	(IV)RExC_size);
	4655	RExC_lastnum=0;
	4656	RExC_lastparse=NULL;
	4657	});
	4658
	4659	/* The first pass could have found things that force Unicode semantics */
	4660	if ((RExC_utf8 \|\| RExC_uni_semantics)
	4661	&& get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
	4662	{
	4663	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4664	}
	4665
	4666	/* Small enough for pointer-storage convention?
	4667	If extralen==0, this means that we will not need long jumps. */
	4668	if (RExC_size >= 0x10000L && RExC_extralen)
	4669	RExC_size += RExC_extralen;
	4670	else
	4671	RExC_extralen = 0;
	4672	if (RExC_whilem_seen > 15)
	4673	RExC_whilem_seen = 15;
	4674
	4675	/* Allocate space and zero-initialize. Note, the two step process
	4676	of zeroing when in debug mode, thus anything assigned has to
	4677	happen after that */
	4678	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	4679	r = (struct regexp*)SvANY(rx);
	4680	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	4681	char, regexp_internal);
	4682	if ( r == NULL \|\| ri == NULL )
	4683	FAIL("Regexp out of space");
	4684	#ifdef DEBUGGING
	4685	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	4686	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	4687	#else
	4688	/* bulk initialize base fields with 0. */
	4689	Zero(ri, sizeof(regexp_internal), char);
	4690	#endif
	4691
	4692	/* non-zero initialization begins here */
	4693	RXi_SET( r, ri );
	4694	r->engine= RE_ENGINE_PTR;
	4695	r->extflags = pm_flags;
	4696	{
	4697	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	4698	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	4699
	4700	/* The caret is output if there are any defaults: if not all the STD
	4701	* flags are set, or if no character set specifier is needed */
	4702	bool has_default =
	4703	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	4704	\|\| ! has_charset);
	4705	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	4706	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	4707	>> RXf_PMf_STD_PMMOD_SHIFT);
	4708	const char fptr = STD_PAT_MODS; /"msix"*/
	4709	char *p;
	4710	/* Allocate for the worst case, which is all the std flags are turned
	4711	* on. If more precision is desired, we could do a population count of
	4712	* the flags set. This could be done with a small lookup table, or by
	4713	* shifting, masking and adding, or even, when available, assembly
	4714	* language for a machine-language population count.
	4715	* We never output a minus, as all those are defaults, so are
	4716	* covered by the caret */
	4717	const STRLEN wraplen = plen + has_p + has_runon
	4718	+ has_default /* If needs a caret */
	4719
	4720	/* If needs a character set specifier */
	4721	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	4722	+ (sizeof(STD_PAT_MODS) - 1)
	4723	+ (sizeof("(?:)") - 1);
	4724
	4725	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	4726	SvPOK_on(rx);
	4727	SvFLAGS(rx) \|= SvUTF8(pattern);
	4728	p++='('; p++='?';
	4729
	4730	/* If a default, cover it using the caret */
	4731	if (has_default) {
	4732	*p++= DEFAULT_PAT_MOD;
	4733	}
	4734	if (has_charset) {
	4735	STRLEN len;
	4736	const char* const name = get_regex_charset_name(r->extflags, &len);
	4737	Copy(name, p, len, char);
	4738	p += len;
	4739	}
	4740	if (has_p)
	4741	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	4742	{
	4743	char ch;
	4744	while((ch = *fptr++)) {
	4745	if(reganch & 1)
	4746	*p++ = ch;
	4747	reganch >>= 1;
	4748	}
	4749	}
	4750
	4751	*p++ = ':';
	4752	Copy(RExC_precomp, p, plen, char);
	4753	assert ((RX_WRAPPED(rx) - p) < 16);
	4754	r->pre_prefix = p - RX_WRAPPED(rx);
	4755	p += plen;
	4756	if (has_runon)
	4757	*p++ = '\n';
	4758	*p++ = ')';
	4759	*p = 0;
	4760	SvCUR_set(rx, p - SvPVX_const(rx));
	4761	}
	4762
	4763	r->intflags = 0;
	4764	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	4765
	4766	if (RExC_seen & REG_SEEN_RECURSE) {
	4767	Newxz(RExC_open_parens, RExC_npar,regnode *);
	4768	SAVEFREEPV(RExC_open_parens);
	4769	Newxz(RExC_close_parens,RExC_npar,regnode *);
	4770	SAVEFREEPV(RExC_close_parens);
	4771	}
	4772
	4773	/* Useful during FAIL. */
	4774	#ifdef RE_TRACK_PATTERN_OFFSETS
	4775	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	4776	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	4777	"%s %"UVuf" bytes for offset annotations.\n",
	4778	ri->u.offsets ? "Got" : "Couldn't get",
	4779	(UV)((2RExC_size+1) sizeof(U32))));
	4780	#endif
	4781	SetProgLen(ri,RExC_size);
	4782	RExC_rx_sv = rx;
	4783	RExC_rx = r;
	4784	RExC_rxi = ri;
	4785
	4786	/* Second pass: emit code. */
	4787	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	4788	RExC_parse = exp;
	4789	RExC_end = xend;
	4790	RExC_naughty = 0;
	4791	RExC_npar = 1;
	4792	RExC_emit_start = ri->program;
	4793	RExC_emit = ri->program;
	4794	RExC_emit_bound = ri->program + RExC_size + 1;
	4795
	4796	/* Store the count of eval-groups for security checks: */
	4797	RExC_rx->seen_evals = RExC_seen_evals;
	4798	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	4799	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4800	ReREFCNT_dec(rx);
	4801	return(NULL);
	4802	}
	4803	/* XXXX To minimize changes to RE engine we always allocate
	4804	3-units-long substrs field. */
	4805	Newx(r->substrs, 1, struct reg_substr_data);
	4806	if (RExC_recurse_count) {
	4807	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	4808	SAVEFREEPV(RExC_recurse);
	4809	}
	4810
	4811	reStudy:
	4812	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	4813	Zero(r->substrs, 1, struct reg_substr_data);
	4814
	4815	#ifdef TRIE_STUDY_OPT
	4816	if (!restudied) {
	4817	StructCopy(&zero_scan_data, &data, scan_data_t);
	4818	copyRExC_state = RExC_state;
	4819	} else {
	4820	U32 seen=RExC_seen;
	4821	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	4822
	4823	RExC_state = copyRExC_state;
	4824	if (seen & REG_TOP_LEVEL_BRANCHES)
	4825	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	4826	else
	4827	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	4828	if (data.last_found) {
	4829	SvREFCNT_dec(data.longest_fixed);
	4830	SvREFCNT_dec(data.longest_float);
	4831	SvREFCNT_dec(data.last_found);
	4832	}
	4833	StructCopy(&zero_scan_data, &data, scan_data_t);
	4834	}
	4835	#else
	4836	StructCopy(&zero_scan_data, &data, scan_data_t);
	4837	#endif
	4838
	4839	/* Dig out information for optimizations. */
	4840	r->extflags = RExC_flags; /* was pm_op */
	4841	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	4842
	4843	if (UTF)
	4844	SvUTF8_on(rx); /* Unicode in it? */
	4845	ri->regstclass = NULL;
	4846	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	4847	r->intflags \|= PREGf_NAUGHTY;
	4848	scan = ri->program + 1; /* First BRANCH. */
	4849
	4850	/* testing for BRANCH here tells us whether there is "must appear"
	4851	data in the pattern. If there is then we can use it for optimisations */
	4852	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	4853	I32 fake;
	4854	STRLEN longest_float_length, longest_fixed_length;
	4855	struct regnode_charclass_class ch_class; /* pointed to by data */
	4856	int stclass_flag;
	4857	I32 last_close = 0; /* pointed to by data */
	4858	regnode *first= scan;
	4859	regnode *first_next= regnext(first);
	4860	/*
	4861	* Skip introductions and multiplicators >= 1
	4862	* so that we can extract the 'meat' of the pattern that must
	4863	* match in the large if() sequence following.
	4864	* NOTE that EXACT is NOT covered here, as it is normally
	4865	* picked up by the optimiser separately.
	4866	*
	4867	* This is unfortunate as the optimiser isnt handling lookahead
	4868	* properly currently.
	4869	*
	4870	*/
	4871	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	4872	/* An OR of one alternative - should not happen now. */
	4873	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	4874	/* for now we can't handle lookbehind IFMATCH*/
	4875	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	4876	(OP(first) == PLUS) \|\|
	4877	(OP(first) == MINMOD) \|\|
	4878	/* An {n,m} with n>0 */
	4879	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	4880	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	4881	{
	4882	/*
	4883	* the only op that could be a regnode is PLUS, all the rest
	4884	* will be regnode_1 or regnode_2.
	4885	*
	4886	*/
	4887	if (OP(first) == PLUS)
	4888	sawplus = 1;
	4889	else
	4890	first += regarglen[OP(first)];
	4891
	4892	first = NEXTOPER(first);
	4893	first_next= regnext(first);
	4894	}
	4895
	4896	/* Starting-point info. */
	4897	again:
	4898	DEBUG_PEEP("first:",first,0);
	4899	/* Ignore EXACT as we deal with it later. */
	4900	if (PL_regkind[OP(first)] == EXACT) {
	4901	if (OP(first) == EXACT)
	4902	NOOP; /* Empty, get anchored substr later. */
	4903	else
	4904	ri->regstclass = first;
	4905	}
	4906	#ifdef TRIE_STCLASS
	4907	else if (PL_regkind[OP(first)] == TRIE &&
	4908	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	4909	{
	4910	regnode *trie_op;
	4911	/* this can happen only on restudy */
	4912	if ( OP(first) == TRIE ) {
	4913	struct regnode_1 trieop = (struct regnode_1 )
	4914	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	4915	StructCopy(first,trieop,struct regnode_1);
	4916	trie_op=(regnode *)trieop;
	4917	} else {
	4918	struct regnode_charclass trieop = (struct regnode_charclass )
	4919	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	4920	StructCopy(first,trieop,struct regnode_charclass);
	4921	trie_op=(regnode *)trieop;
	4922	}
	4923	OP(trie_op)+=2;
	4924	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	4925	ri->regstclass = trie_op;
	4926	}
	4927	#endif
	4928	else if (REGNODE_SIMPLE(OP(first)))
	4929	ri->regstclass = first;
	4930	else if (PL_regkind[OP(first)] == BOUND \|\|
	4931	PL_regkind[OP(first)] == NBOUND)
	4932	ri->regstclass = first;
	4933	else if (PL_regkind[OP(first)] == BOL) {
	4934	r->extflags \|= (OP(first) == MBOL
	4935	? RXf_ANCH_MBOL
	4936	: (OP(first) == SBOL
	4937	? RXf_ANCH_SBOL
	4938	: RXf_ANCH_BOL));
	4939	first = NEXTOPER(first);
	4940	goto again;
	4941	}
	4942	else if (OP(first) == GPOS) {
	4943	r->extflags \|= RXf_ANCH_GPOS;
	4944	first = NEXTOPER(first);
	4945	goto again;
	4946	}
	4947	else if ((!sawopen \|\| !RExC_sawback) &&
	4948	(OP(first) == STAR &&
	4949	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	4950	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	4951	{
	4952	/* turn .* into ^.* with an implied $=1 /
	4953	const int type =
	4954	(OP(NEXTOPER(first)) == REG_ANY)
	4955	? RXf_ANCH_MBOL
	4956	: RXf_ANCH_SBOL;
	4957	r->extflags \|= type;
	4958	r->intflags \|= PREGf_IMPLICIT;
	4959	first = NEXTOPER(first);
	4960	goto again;
	4961	}
	4962	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	4963	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	4964	/* x+ must match at the 1st pos of run of x's */
	4965	r->intflags \|= PREGf_SKIP;
	4966
	4967	/* Scan is after the zeroth branch, first is atomic matcher. */
	4968	#ifdef TRIE_STUDY_OPT
	4969	DEBUG_PARSE_r(
	4970	if (!restudied)
	4971	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4972	(IV)(first - scan + 1))
	4973	);
	4974	#else
	4975	DEBUG_PARSE_r(
	4976	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4977	(IV)(first - scan + 1))
	4978	);
	4979	#endif
	4980
	4981
	4982	/*
	4983	* If there's something expensive in the r.e., find the
	4984	* longest literal string that must appear and make it the
	4985	* regmust. Resolve ties in favor of later strings, since
	4986	* the regstart check works with the beginning of the r.e.
	4987	* and avoiding duplication strengthens checking. Not a
	4988	* strong reason, but sufficient in the absence of others.
	4989	* [Now we resolve ties in favor of the earlier string if
	4990	* it happens that c_offset_min has been invalidated, since the
	4991	* earlier string may buy us something the later one won't.]
	4992	*/
	4993
	4994	data.longest_fixed = newSVpvs("");
	4995	data.longest_float = newSVpvs("");
	4996	data.last_found = newSVpvs("");
	4997	data.longest = &(data.longest_fixed);
	4998	first = scan;
	4999	if (!ri->regstclass) {
	5000	cl_init(pRExC_state, &ch_class);
	5001	data.start_class = &ch_class;
	5002	stclass_flag = SCF_DO_STCLASS_AND;
	5003	} else /* XXXX Check for BOUND? */
	5004	stclass_flag = 0;
	5005	data.last_closep = &last_close;
	5006
	5007	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	5008	&data, -1, NULL, NULL,
	5009	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	5010
	5011
	5012	CHECK_RESTUDY_GOTO;
	5013
	5014
	5015	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	5016	&& data.last_start_min == 0 && data.last_end > 0
	5017	&& !RExC_seen_zerolen
	5018	&& !(RExC_seen & REG_SEEN_VERBARG)
	5019	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	5020	r->extflags \|= RXf_CHECK_ALL;
	5021	scan_commit(pRExC_state, &data,&minlen,0);
	5022	SvREFCNT_dec(data.last_found);
	5023
	5024	/* Note that code very similar to this but for anchored string
	5025	follows immediately below, changes may need to be made to both.
	5026	Be careful.
	5027	*/
	5028	longest_float_length = CHR_SVLEN(data.longest_float);
	5029	if (longest_float_length
	5030	\|\| (data.flags & SF_FL_BEFORE_EOL
	5031	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5032	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5033	{
	5034	I32 t,ml;
	5035
	5036	if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	5037	&& data.offset_fixed == data.offset_float_min
	5038	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
	5039	goto remove_float; /* As in (a)+. */
	5040
	5041	/* copy the information about the longest float from the reg_scan_data
	5042	over to the program. */
	5043	if (SvUTF8(data.longest_float)) {
	5044	r->float_utf8 = data.longest_float;
	5045	r->float_substr = NULL;
	5046	} else {
	5047	r->float_substr = data.longest_float;
	5048	r->float_utf8 = NULL;
	5049	}
	5050	/* float_end_shift is how many chars that must be matched that
	5051	follow this item. We calculate it ahead of time as once the
	5052	lookbehind offset is added in we lose the ability to correctly
	5053	calculate it.*/
	5054	ml = data.minlen_float ? *(data.minlen_float)
	5055	: (I32)longest_float_length;
	5056	r->float_end_shift = ml - data.offset_float_min
	5057	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	5058	+ data.lookbehind_float;
	5059	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	5060	r->float_max_offset = data.offset_float_max;
	5061	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	5062	r->float_max_offset -= data.lookbehind_float;
	5063
	5064	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	5065	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5066	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5067	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	5068	}
	5069	else {
	5070	remove_float:
	5071	r->float_substr = r->float_utf8 = NULL;
	5072	SvREFCNT_dec(data.longest_float);
	5073	longest_float_length = 0;
	5074	}
	5075
	5076	/* Note that code very similar to this but for floating string
	5077	is immediately above, changes may need to be made to both.
	5078	Be careful.
	5079	*/
	5080	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	5081	if (longest_fixed_length
	5082	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	5083	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5084	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5085	{
	5086	I32 t,ml;
	5087
	5088	/* copy the information about the longest fixed
	5089	from the reg_scan_data over to the program. */
	5090	if (SvUTF8(data.longest_fixed)) {
	5091	r->anchored_utf8 = data.longest_fixed;
	5092	r->anchored_substr = NULL;
	5093	} else {
	5094	r->anchored_substr = data.longest_fixed;
	5095	r->anchored_utf8 = NULL;
	5096	}
	5097	/* fixed_end_shift is how many chars that must be matched that
	5098	follow this item. We calculate it ahead of time as once the
	5099	lookbehind offset is added in we lose the ability to correctly
	5100	calculate it.*/
	5101	ml = data.minlen_fixed ? *(data.minlen_fixed)
	5102	: (I32)longest_fixed_length;
	5103	r->anchored_end_shift = ml - data.offset_fixed
	5104	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	5105	+ data.lookbehind_fixed;
	5106	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	5107
	5108	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	5109	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5110	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5111	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	5112	}
	5113	else {
	5114	r->anchored_substr = r->anchored_utf8 = NULL;
	5115	SvREFCNT_dec(data.longest_fixed);
	5116	longest_fixed_length = 0;
	5117	}
	5118	if (ri->regstclass
	5119	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	5120	ri->regstclass = NULL;
	5121
	5122	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	5123	&& stclass_flag
	5124	&& !(data.start_class->flags & ANYOF_EOS)
	5125	&& !cl_is_anything(data.start_class))
	5126	{
	5127	const U32 n = add_data(pRExC_state, 1, "f");
	5128	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5129
	5130	Newx(RExC_rxi->data->data[n], 1,
	5131	struct regnode_charclass_class);
	5132	StructCopy(data.start_class,
	5133	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5134	struct regnode_charclass_class);
	5135	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5136	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5137	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	5138	regprop(r, sv, (regnode*)data.start_class);
	5139	PerlIO_printf(Perl_debug_log,
	5140	"synthetic stclass \"%s\".\n",
	5141	SvPVX_const(sv));});
	5142	}
	5143
	5144	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	5145	if (longest_fixed_length > longest_float_length) {
	5146	r->check_end_shift = r->anchored_end_shift;
	5147	r->check_substr = r->anchored_substr;
	5148	r->check_utf8 = r->anchored_utf8;
	5149	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	5150	if (r->extflags & RXf_ANCH_SINGLE)
	5151	r->extflags \|= RXf_NOSCAN;
	5152	}
	5153	else {
	5154	r->check_end_shift = r->float_end_shift;
	5155	r->check_substr = r->float_substr;
	5156	r->check_utf8 = r->float_utf8;
	5157	r->check_offset_min = r->float_min_offset;
	5158	r->check_offset_max = r->float_max_offset;
	5159	}
	5160	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5161	This should be changed ASAP! */
	5162	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5163	r->extflags \|= RXf_USE_INTUIT;
	5164	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5165	r->extflags \|= RXf_INTUIT_TAIL;
	5166	}
	5167	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5168	if ( (STRLEN)minlen < longest_float_length )
	5169	minlen= longest_float_length;
	5170	if ( (STRLEN)minlen < longest_fixed_length )
	5171	minlen= longest_fixed_length;
	5172	*/
	5173	}
	5174	else {
	5175	/* Several toplevels. Best we can is to set minlen. */
	5176	I32 fake;
	5177	struct regnode_charclass_class ch_class;
	5178	I32 last_close = 0;
	5179
	5180	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5181
	5182	scan = ri->program + 1;
	5183	cl_init(pRExC_state, &ch_class);
	5184	data.start_class = &ch_class;
	5185	data.last_closep = &last_close;
	5186
	5187
	5188	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5189	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5190
	5191	CHECK_RESTUDY_GOTO;
	5192
	5193	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5194	= r->float_substr = r->float_utf8 = NULL;
	5195
	5196	if (!(data.start_class->flags & ANYOF_EOS)
	5197	&& !cl_is_anything(data.start_class))
	5198	{
	5199	const U32 n = add_data(pRExC_state, 1, "f");
	5200	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5201
	5202	Newx(RExC_rxi->data->data[n], 1,
	5203	struct regnode_charclass_class);
	5204	StructCopy(data.start_class,
	5205	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5206	struct regnode_charclass_class);
	5207	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5208	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5209	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5210	regprop(r, sv, (regnode*)data.start_class);
	5211	PerlIO_printf(Perl_debug_log,
	5212	"synthetic stclass \"%s\".\n",
	5213	SvPVX_const(sv));});
	5214	}
	5215	}
	5216
	5217	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5218	the "real" pattern. */
	5219	DEBUG_OPTIMISE_r({
	5220	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5221	(IV)minlen, (IV)r->minlen);
	5222	});
	5223	r->minlenret = minlen;
	5224	if (r->minlen < minlen)
	5225	r->minlen = minlen;
	5226
	5227	if (RExC_seen & REG_SEEN_GPOS)
	5228	r->extflags \|= RXf_GPOS_SEEN;
	5229	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5230	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5231	if (RExC_seen & REG_SEEN_EVAL)
	5232	r->extflags \|= RXf_EVAL_SEEN;
	5233	if (RExC_seen & REG_SEEN_CANY)
	5234	r->extflags \|= RXf_CANY_SEEN;
	5235	if (RExC_seen & REG_SEEN_VERBARG)
	5236	r->intflags \|= PREGf_VERBARG_SEEN;
	5237	if (RExC_seen & REG_SEEN_CUTGROUP)
	5238	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5239	if (RExC_paren_names)
	5240	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5241	else
	5242	RXp_PAREN_NAMES(r) = NULL;
	5243
	5244	#ifdef STUPID_PATTERN_CHECKS
	5245	if (RX_PRELEN(rx) == 0)
	5246	r->extflags \|= RXf_NULL;
	5247	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5248	/* XXX: this should happen BEFORE we compile */
	5249	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5250	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5251	r->extflags \|= RXf_WHITE;
	5252	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5253	r->extflags \|= RXf_START_ONLY;
	5254	#else
	5255	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5256	/* XXX: this should happen BEFORE we compile */
	5257	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5258	else {
	5259	regnode *first = ri->program + 1;
	5260	U8 fop = OP(first);
	5261
	5262	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	5263	r->extflags \|= RXf_NULL;
	5264	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	5265	r->extflags \|= RXf_START_ONLY;
	5266	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	5267	&& OP(regnext(first)) == END)
	5268	r->extflags \|= RXf_WHITE;
	5269	}
	5270	#endif
	5271	#ifdef DEBUGGING
	5272	if (RExC_paren_names) {
	5273	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5274	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5275	} else
	5276	#endif
	5277	ri->name_list_idx = 0;
	5278
	5279	if (RExC_recurse_count) {
	5280	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5281	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5282	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5283	}
	5284	}
	5285	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5286	/* assume we don't need to swap parens around before we match */
	5287
	5288	DEBUG_DUMP_r({
	5289	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5290	regdump(r);
	5291	});
	5292	#ifdef RE_TRACK_PATTERN_OFFSETS
	5293	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5294	const U32 len = ri->u.offsets[0];
	5295	U32 i;
	5296	GET_RE_DEBUG_FLAGS_DECL;
	5297	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5298	for (i = 1; i <= len; i++) {
	5299	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5300	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5301	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5302	}
	5303	PerlIO_printf(Perl_debug_log, "\n");
	5304	});
	5305	#endif
	5306	return rx;
	5307	}
	5308
	5309	#undef RE_ENGINE_PTR
	5310
	5311
	5312	SV*
	5313	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5314	const U32 flags)
	5315	{
	5316	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5317
	5318	PERL_UNUSED_ARG(value);
	5319
	5320	if (flags & RXapif_FETCH) {
	5321	return reg_named_buff_fetch(rx, key, flags);
	5322	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	5323	Perl_croak_no_modify(aTHX);
	5324	return NULL;
	5325	} else if (flags & RXapif_EXISTS) {
	5326	return reg_named_buff_exists(rx, key, flags)
	5327	? &PL_sv_yes
	5328	: &PL_sv_no;
	5329	} else if (flags & RXapif_REGNAMES) {
	5330	return reg_named_buff_all(rx, flags);
	5331	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	5332	return reg_named_buff_scalar(rx, flags);
	5333	} else {
	5334	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5335	return NULL;
	5336	}
	5337	}
	5338
	5339	SV*
	5340	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5341	const U32 flags)
	5342	{
	5343	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5344	PERL_UNUSED_ARG(lastkey);
	5345
	5346	if (flags & RXapif_FIRSTKEY)
	5347	return reg_named_buff_firstkey(rx, flags);
	5348	else if (flags & RXapif_NEXTKEY)
	5349	return reg_named_buff_nextkey(rx, flags);
	5350	else {
	5351	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5352	return NULL;
	5353	}
	5354	}
	5355
	5356	SV*
	5357	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5358	const U32 flags)
	5359	{
	5360	AV *retarray = NULL;
	5361	SV *ret;
	5362	struct regexp const rx = (struct regexp )SvANY(r);
	5363
	5364	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5365
	5366	if (flags & RXapif_ALL)
	5367	retarray=newAV();
	5368
	5369	if (rx && RXp_PAREN_NAMES(rx)) {
	5370	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5371	if (he_str) {
	5372	IV i;
	5373	SV* sv_dat=HeVAL(he_str);
	5374	I32 nums=(I32)SvPVX(sv_dat);
	5375	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5376	if ((I32)(rx->nparens) >= nums[i]
	5377	&& rx->offs[nums[i]].start != -1
	5378	&& rx->offs[nums[i]].end != -1)
	5379	{
	5380	ret = newSVpvs("");
	5381	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5382	if (!retarray)
	5383	return ret;
	5384	} else {
	5385	ret = newSVsv(&PL_sv_undef);
	5386	}
	5387	if (retarray)
	5388	av_push(retarray, ret);
	5389	}
	5390	if (retarray)
	5391	return newRV_noinc(MUTABLE_SV(retarray));
	5392	}
	5393	}
	5394	return NULL;
	5395	}
	5396
	5397	bool
	5398	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5399	const U32 flags)
	5400	{
	5401	struct regexp const rx = (struct regexp )SvANY(r);
	5402
	5403	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5404
	5405	if (rx && RXp_PAREN_NAMES(rx)) {
	5406	if (flags & RXapif_ALL) {
	5407	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5408	} else {
	5409	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5410	if (sv) {
	5411	SvREFCNT_dec(sv);
	5412	return TRUE;
	5413	} else {
	5414	return FALSE;
	5415	}
	5416	}
	5417	} else {
	5418	return FALSE;
	5419	}
	5420	}
	5421
	5422	SV*
	5423	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5424	{
	5425	struct regexp const rx = (struct regexp )SvANY(r);
	5426
	5427	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5428
	5429	if ( rx && RXp_PAREN_NAMES(rx) ) {
	5430	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5431
	5432	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5433	} else {
	5434	return FALSE;
	5435	}
	5436	}
	5437
	5438	SV*
	5439	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5440	{
	5441	struct regexp const rx = (struct regexp )SvANY(r);
	5442	GET_RE_DEBUG_FLAGS_DECL;
	5443
	5444	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5445
	5446	if (rx && RXp_PAREN_NAMES(rx)) {
	5447	HV *hv = RXp_PAREN_NAMES(rx);
	5448	HE *temphe;
	5449	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5450	IV i;
	5451	IV parno = 0;
	5452	SV* sv_dat = HeVAL(temphe);
	5453	I32 nums = (I32)SvPVX(sv_dat);
	5454	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5455	if ((I32)(rx->lastparen) >= nums[i] &&
	5456	rx->offs[nums[i]].start != -1 &&
	5457	rx->offs[nums[i]].end != -1)
	5458	{
	5459	parno = nums[i];
	5460	break;
	5461	}
	5462	}
	5463	if (parno \|\| flags & RXapif_ALL) {
	5464	return newSVhek(HeKEY_hek(temphe));
	5465	}
	5466	}
	5467	}
	5468	return NULL;
	5469	}
	5470
	5471	SV*
	5472	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5473	{
	5474	SV *ret;
	5475	AV *av;
	5476	I32 length;
	5477	struct regexp const rx = (struct regexp )SvANY(r);
	5478
	5479	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5480
	5481	if (rx && RXp_PAREN_NAMES(rx)) {
	5482	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	5483	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5484	} else if (flags & RXapif_ONE) {
	5485	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	5486	av = MUTABLE_AV(SvRV(ret));
	5487	length = av_len(av);
	5488	SvREFCNT_dec(ret);
	5489	return newSViv(length + 1);
	5490	} else {
	5491	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5492	return NULL;
	5493	}
	5494	}
	5495	return &PL_sv_undef;
	5496	}
	5497
	5498	SV*
	5499	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5500	{
	5501	struct regexp const rx = (struct regexp )SvANY(r);
	5502	AV *av = newAV();
	5503
	5504	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5505
	5506	if (rx && RXp_PAREN_NAMES(rx)) {
	5507	HV *hv= RXp_PAREN_NAMES(rx);
	5508	HE *temphe;
	5509	(void)hv_iterinit(hv);
	5510	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5511	IV i;
	5512	IV parno = 0;
	5513	SV* sv_dat = HeVAL(temphe);
	5514	I32 nums = (I32)SvPVX(sv_dat);
	5515	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5516	if ((I32)(rx->lastparen) >= nums[i] &&
	5517	rx->offs[nums[i]].start != -1 &&
	5518	rx->offs[nums[i]].end != -1)
	5519	{
	5520	parno = nums[i];
	5521	break;
	5522	}
	5523	}
	5524	if (parno \|\| flags & RXapif_ALL) {
	5525	av_push(av, newSVhek(HeKEY_hek(temphe)));
	5526	}
	5527	}
	5528	}
	5529
	5530	return newRV_noinc(MUTABLE_SV(av));
	5531	}
	5532
	5533	void
	5534	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5535	SV * const sv)
	5536	{
	5537	struct regexp const rx = (struct regexp )SvANY(r);
	5538	char *s = NULL;
	5539	I32 i = 0;
	5540	I32 s1, t1;
	5541
	5542	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5543
	5544	if (!rx->subbeg) {
	5545	sv_setsv(sv,&PL_sv_undef);
	5546	return;
	5547	}
	5548	else
	5549	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5550	/* $` */
	5551	i = rx->offs[0].start;
	5552	s = rx->subbeg;
	5553	}
	5554	else
	5555	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5556	/* $' */
	5557	s = rx->subbeg + rx->offs[0].end;
	5558	i = rx->sublen - rx->offs[0].end;
	5559	}
	5560	else
	5561	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5562	(s1 = rx->offs[paren].start) != -1 &&
	5563	(t1 = rx->offs[paren].end) != -1)
	5564	{
	5565	/* $& $1 ... */
	5566	i = t1 - s1;
	5567	s = rx->subbeg + s1;
	5568	} else {
	5569	sv_setsv(sv,&PL_sv_undef);
	5570	return;
	5571	}
	5572	assert(rx->sublen >= (s - rx->subbeg) + i );
	5573	if (i >= 0) {
	5574	const int oldtainted = PL_tainted;
	5575	TAINT_NOT;
	5576	sv_setpvn(sv, s, i);
	5577	PL_tainted = oldtainted;
	5578	if ( (rx->extflags & RXf_CANY_SEEN)
	5579	? (RXp_MATCH_UTF8(rx)
	5580	&& (!i \|\| is_utf8_string((U8*)s, i)))
	5581	: (RXp_MATCH_UTF8(rx)) )
	5582	{
	5583	SvUTF8_on(sv);
	5584	}
	5585	else
	5586	SvUTF8_off(sv);
	5587	if (PL_tainting) {
	5588	if (RXp_MATCH_TAINTED(rx)) {
	5589	if (SvTYPE(sv) >= SVt_PVMG) {
	5590	MAGIC* const mg = SvMAGIC(sv);
	5591	MAGIC* mgt;
	5592	PL_tainted = 1;
	5593	SvMAGIC_set(sv, mg->mg_moremagic);
	5594	SvTAINT(sv);
	5595	if ((mgt = SvMAGIC(sv))) {
	5596	mg->mg_moremagic = mgt;
	5597	SvMAGIC_set(sv, mg);
	5598	}
	5599	} else {
	5600	PL_tainted = 1;
	5601	SvTAINT(sv);
	5602	}
	5603	} else
	5604	SvTAINTED_off(sv);
	5605	}
	5606	} else {
	5607	sv_setsv(sv,&PL_sv_undef);
	5608	return;
	5609	}
	5610	}
	5611
	5612	void
	5613	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5614	SV const * const value)
	5615	{
	5616	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5617
	5618	PERL_UNUSED_ARG(rx);
	5619	PERL_UNUSED_ARG(paren);
	5620	PERL_UNUSED_ARG(value);
	5621
	5622	if (!PL_localizing)
	5623	Perl_croak_no_modify(aTHX);
	5624	}
	5625
	5626	I32
	5627	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5628	const I32 paren)
	5629	{
	5630	struct regexp const rx = (struct regexp )SvANY(r);
	5631	I32 i;
	5632	I32 s1, t1;
	5633
	5634	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5635
	5636	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5637	switch (paren) {
	5638	/* $` / ${^PREMATCH} */
	5639	case RX_BUFF_IDX_PREMATCH:
	5640	if (rx->offs[0].start != -1) {
	5641	i = rx->offs[0].start;
	5642	if (i > 0) {
	5643	s1 = 0;
	5644	t1 = i;
	5645	goto getlen;
	5646	}
	5647	}
	5648	return 0;
	5649	/* $' / ${^POSTMATCH} */
	5650	case RX_BUFF_IDX_POSTMATCH:
	5651	if (rx->offs[0].end != -1) {
	5652	i = rx->sublen - rx->offs[0].end;
	5653	if (i > 0) {
	5654	s1 = rx->offs[0].end;
	5655	t1 = rx->sublen;
	5656	goto getlen;
	5657	}
	5658	}
	5659	return 0;
	5660	/* $& / ${^MATCH}, $1, $2, ... */
	5661	default:
	5662	if (paren <= (I32)rx->nparens &&
	5663	(s1 = rx->offs[paren].start) != -1 &&
	5664	(t1 = rx->offs[paren].end) != -1)
	5665	{
	5666	i = t1 - s1;
	5667	goto getlen;
	5668	} else {
	5669	if (ckWARN(WARN_UNINITIALIZED))
	5670	report_uninit((const SV *)sv);
	5671	return 0;
	5672	}
	5673	}
	5674	getlen:
	5675	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	5676	const char * const s = rx->subbeg + s1;
	5677	const U8 *ep;
	5678	STRLEN el;
	5679
	5680	i = t1 - s1;
	5681	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	5682	i = el;
	5683	}
	5684	return i;
	5685	}
	5686
	5687	SV*
	5688	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	5689	{
	5690	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	5691	PERL_UNUSED_ARG(rx);
	5692	if (0)
	5693	return NULL;
	5694	else
	5695	return newSVpvs("Regexp");
	5696	}
	5697
	5698	/* Scans the name of a named buffer from the pattern.
	5699	* If flags is REG_RSN_RETURN_NULL returns null.
	5700	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	5701	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	5702	* to the parsed name as looked up in the RExC_paren_names hash.
	5703	* If there is an error throws a vFAIL().. type exception.
	5704	*/
	5705
	5706	#define REG_RSN_RETURN_NULL 0
	5707	#define REG_RSN_RETURN_NAME 1
	5708	#define REG_RSN_RETURN_DATA 2
	5709
	5710	STATIC SV*
	5711	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	5712	{
	5713	char *name_start = RExC_parse;
	5714
	5715	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	5716
	5717	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	5718	/* skip IDFIRST by using do...while */
	5719	if (UTF)
	5720	do {
	5721	RExC_parse += UTF8SKIP(RExC_parse);
	5722	} while (isALNUM_utf8((U8*)RExC_parse));
	5723	else
	5724	do {
	5725	RExC_parse++;
	5726	} while (isALNUM(*RExC_parse));
	5727	}
	5728
	5729	if ( flags ) {
	5730	SV* sv_name
	5731	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	5732	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	5733	if ( flags == REG_RSN_RETURN_NAME)
	5734	return sv_name;
	5735	else if (flags==REG_RSN_RETURN_DATA) {
	5736	HE *he_str = NULL;
	5737	SV *sv_dat = NULL;
	5738	if ( ! sv_name ) /* should not happen*/
	5739	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	5740	if (RExC_paren_names)
	5741	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	5742	if ( he_str )
	5743	sv_dat = HeVAL(he_str);
	5744	if ( ! sv_dat )
	5745	vFAIL("Reference to nonexistent named group");
	5746	return sv_dat;
	5747	}
	5748	else {
	5749	Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
	5750	}
	5751	/* NOT REACHED */
	5752	}
	5753	return NULL;
	5754	}
	5755
	5756	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	5757	int rem=(int)(RExC_end - RExC_parse); \
	5758	int cut; \
	5759	int num; \
	5760	int iscut=0; \
	5761	if (rem>10) { \
	5762	rem=10; \
	5763	iscut=1; \
	5764	} \
	5765	cut=10-rem; \
	5766	if (RExC_lastparse!=RExC_parse) \
	5767	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	5768	rem, RExC_parse, \
	5769	cut + 4, \
	5770	iscut ? "..." : "<" \
	5771	); \
	5772	else \
	5773	PerlIO_printf(Perl_debug_log,"%16s",""); \
	5774	\
	5775	if (SIZE_ONLY) \
	5776	num = RExC_size + 1; \
	5777	else \
	5778	num=REG_NODE_NUM(RExC_emit); \
	5779	if (RExC_lastnum!=num) \
	5780	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	5781	else \
	5782	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	5783	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	5784	(int)((depth*2)), "", \
	5785	(funcname) \
	5786	); \
	5787	RExC_lastnum=num; \
	5788	RExC_lastparse=RExC_parse; \
	5789	})
	5790
	5791
	5792
	5793	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	5794	DEBUG_PARSE_MSG((funcname)); \
	5795	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	5796	})
	5797	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	5798	DEBUG_PARSE_MSG((funcname)); \
	5799	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	5800	})
	5801
	5802	/* This section of code defines the inversion list object and its methods. The
	5803	* interfaces are highly subject to change, so as much as possible is static to
	5804	* this file. An inversion list is here implemented as a malloc'd C array with
	5805	* some added info. More will be coming when functionality is added later.
	5806	*
	5807	* Some of the methods should always be private to the implementation, and some
	5808	* should eventually be made public */
	5809
	5810	#define INVLIST_INITIAL_LEN 10
	5811	#define INVLIST_ARRAY_KEY "array"
	5812	#define INVLIST_MAX_KEY "max"
	5813	#define INVLIST_LEN_KEY "len"
	5814
	5815	PERL_STATIC_INLINE UV*
	5816	S_invlist_array(pTHX_ HV* const invlist)
	5817	{
	5818	/* Returns the pointer to the inversion list's array. Every time the
	5819	* length changes, this needs to be called in case malloc or realloc moved
	5820	* it */
	5821
	5822	SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
	5823
	5824	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	5825
	5826	if (list_ptr == NULL) {
	5827	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
	5828	INVLIST_ARRAY_KEY);
	5829	}
	5830
	5831	return INT2PTR(UV , SvUV(list_ptr));
	5832	}
	5833
	5834	PERL_STATIC_INLINE void
	5835	S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
	5836	{
	5837	PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
	5838
	5839	/* Sets the array stored in the inversion list to the memory beginning with
	5840	* the parameter */
	5841
	5842	if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
	5843	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
	5844	INVLIST_ARRAY_KEY);
	5845	}
	5846	}
	5847
	5848	PERL_STATIC_INLINE UV
	5849	S_invlist_len(pTHX_ HV* const invlist)
	5850	{
	5851	/* Returns the current number of elements in the inversion list's array */
	5852
	5853	SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
	5854
	5855	PERL_ARGS_ASSERT_INVLIST_LEN;
	5856
	5857	if (len_ptr == NULL) {
	5858	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
	5859	INVLIST_LEN_KEY);
	5860	}
	5861
	5862	return SvUV(*len_ptr);
	5863	}
	5864
	5865	PERL_STATIC_INLINE UV
	5866	S_invlist_max(pTHX_ HV* const invlist)
	5867	{
	5868	/* Returns the maximum number of elements storable in the inversion list's
	5869	* array, without having to realloc() */
	5870
	5871	SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
	5872
	5873	PERL_ARGS_ASSERT_INVLIST_MAX;
	5874
	5875	if (max_ptr == NULL) {
	5876	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
	5877	INVLIST_MAX_KEY);
	5878	}
	5879
	5880	return SvUV(*max_ptr);
	5881	}
	5882
	5883	PERL_STATIC_INLINE void
	5884	S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
	5885	{
	5886	/* Sets the current number of elements stored in the inversion list */
	5887
	5888	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	5889
	5890	if (len != 0 && len > invlist_max(invlist)) {
	5891	Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
	5892	}
	5893
	5894	if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
	5895	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
	5896	INVLIST_LEN_KEY);
	5897	}
	5898	}
	5899
	5900	PERL_STATIC_INLINE void
	5901	S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
	5902	{
	5903
	5904	/* Sets the maximum number of elements storable in the inversion list
	5905	* without having to realloc() */
	5906
	5907	PERL_ARGS_ASSERT_INVLIST_SET_MAX;
	5908
	5909	if (max < invlist_len(invlist)) {
	5910	Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
	5911	}
	5912
	5913	if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
	5914	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
	5915	INVLIST_LEN_KEY);
	5916	}
	5917	}
	5918
	5919	#ifndef PERL_IN_XSUB_RE
	5920	HV*
	5921	Perl__new_invlist(pTHX_ IV initial_size)
	5922	{
	5923
	5924	/* Return a pointer to a newly constructed inversion list, with enough
	5925	* space to store 'initial_size' elements. If that number is negative, a
	5926	* system default is used instead */
	5927
	5928	HV* invlist = newHV();
	5929	UV* list;
	5930
	5931	if (initial_size < 0) {
	5932	initial_size = INVLIST_INITIAL_LEN;
	5933	}
	5934
	5935	/* Allocate the initial space */
	5936	Newx(list, initial_size, UV);
	5937	invlist_set_array(invlist, list);
	5938
	5939	/* set_len has to come before set_max, as the latter inspects the len */
	5940	invlist_set_len(invlist, 0);
	5941	invlist_set_max(invlist, initial_size);
	5942
	5943	return invlist;
	5944	}
	5945	#endif
	5946
	5947	PERL_STATIC_INLINE void
	5948	S_invlist_destroy(pTHX_ HV* const invlist)
	5949	{
	5950	/* Inversion list destructor */
	5951
	5952	SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
	5953
	5954	PERL_ARGS_ASSERT_INVLIST_DESTROY;
	5955
	5956	if (list_ptr != NULL) {
	5957	UV list = INT2PTR(UV , SvUV(list_ptr)); / PERL_POISON needs lvalue */
	5958	Safefree(list);
	5959	}
	5960	}
	5961
	5962	STATIC void
	5963	S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
	5964	{
	5965	/* Change the maximum size of an inversion list (up or down) */
	5966
	5967	UV* orig_array;
	5968	UV* array;
	5969	const UV old_max = invlist_max(invlist);
	5970
	5971	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	5972
	5973	if (old_max == new_max) { /* If a no-op */
	5974	return;
	5975	}
	5976
	5977	array = orig_array = invlist_array(invlist);
	5978	Renew(array, new_max, UV);
	5979
	5980	/* If the size change moved the list in memory, set the new one */
	5981	if (array != orig_array) {
	5982	invlist_set_array(invlist, array);
	5983	}
	5984
	5985	invlist_set_max(invlist, new_max);
	5986
	5987	}
	5988
	5989	PERL_STATIC_INLINE void
	5990	S_invlist_trim(pTHX_ HV* const invlist)
	5991	{
	5992	PERL_ARGS_ASSERT_INVLIST_TRIM;
	5993
	5994	/* Change the length of the inversion list to how many entries it currently
	5995	* has */
	5996
	5997	invlist_extend(invlist, invlist_len(invlist));
	5998	}
	5999
	6000	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	6001	* etc */
	6002
	6003	#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
	6004
	6005	#ifndef PERL_IN_XSUB_RE
	6006	void
	6007	Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
	6008	{
	6009	/* Subject to change or removal. Append the range from 'start' to 'end' at
	6010	* the end of the inversion list. The range must be above any existing
	6011	* ones. */
	6012
	6013	UV* array = invlist_array(invlist);
	6014	UV max = invlist_max(invlist);
	6015	UV len = invlist_len(invlist);
	6016
	6017	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	6018
	6019	if (len > 0) {
	6020
	6021	/* Here, the existing list is non-empty. The current max entry in the
	6022	* list is generally the first value not in the set, except when the
	6023	* set extends to the end of permissible values, in which case it is
	6024	* the first entry in that final set, and so this call is an attempt to
	6025	* append out-of-order */
	6026
	6027	UV final_element = len - 1;
	6028	if (array[final_element] > start
	6029	\|\| ELEMENT_IN_INVLIST_SET(final_element))
	6030	{
	6031	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
	6032	}
	6033
	6034	/* Here, it is a legal append. If the new range begins with the first
	6035	* value not in the set, it is extending the set, so the new first
	6036	* value not in the set is one greater than the newly extended range.
	6037	* */
	6038	if (array[final_element] == start) {
	6039	if (end != UV_MAX) {
	6040	array[final_element] = end + 1;
	6041	}
	6042	else {
	6043	/* But if the end is the maximum representable on the machine,
	6044	* just let the range that this would extend have no end */
	6045	invlist_set_len(invlist, len - 1);
	6046	}
	6047	return;
	6048	}
	6049	}
	6050
	6051	/* Here the new range doesn't extend any existing set. Add it */
	6052
	6053	len += 2; /* Includes an element each for the start and end of range */
	6054
	6055	/* If overflows the existing space, extend, which may cause the array to be
	6056	* moved */
	6057	if (max < len) {
	6058	invlist_extend(invlist, len);
	6059	array = invlist_array(invlist);
	6060	}
	6061
	6062	invlist_set_len(invlist, len);
	6063
	6064	/* The next item on the list starts the range, the one after that is
	6065	* one past the new range. */
	6066	array[len - 2] = start;
	6067	if (end != UV_MAX) {
	6068	array[len - 1] = end + 1;
	6069	}
	6070	else {
	6071	/* But if the end is the maximum representable on the machine, just let
	6072	* the range have no end */
	6073	invlist_set_len(invlist, len - 1);
	6074	}
	6075	}
	6076	#endif
	6077
	6078	STATIC HV*
	6079	S_invlist_union(pTHX_ HV* const a, HV* const b)
	6080	{
	6081	/* Return a new inversion list which is the union of two inversion lists.
	6082	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6083	* Richard Gillam, published by Addison-Wesley, and explained at some
	6084	* length there. The preface says to incorporate its examples into your
	6085	* code at your own risk.
	6086	*
	6087	* The algorithm is like a merge sort.
	6088	*
	6089	* XXX A potential performance improvement is to keep track as we go along
	6090	* if only one of the inputs contributes to the result, meaning the other
	6091	* is a subset of that one. In that case, we can skip the final copy and
	6092	* return the larger of the input lists */
	6093
	6094	UV* array_a = invlist_array(a); /* a's array */
	6095	UV* array_b = invlist_array(b);
	6096	UV len_a = invlist_len(a); /* length of a's array */
	6097	UV len_b = invlist_len(b);
	6098
	6099	HV* u; /* the resulting union */
	6100	UV* array_u;
	6101	UV len_u;
	6102
	6103	UV i_a = 0; /* current index into a's array */
	6104	UV i_b = 0;
	6105	UV i_u = 0;
	6106
	6107	/* running count, as explained in the algorithm source book; items are
	6108	* stopped accumulating and are output when the count changes to/from 0.
	6109	* The count is incremented when we start a range that's in the set, and
	6110	* decremented when we start a range that's not in the set. So its range
	6111	* is 0 to 2. Only when the count is zero is something not in the set.
	6112	*/
	6113	UV count = 0;
	6114
	6115	PERL_ARGS_ASSERT_INVLIST_UNION;
	6116
	6117	/* Size the union for the worst case: that the sets are completely
	6118	* disjoint */
	6119	u = _new_invlist(len_a + len_b);
	6120	array_u = invlist_array(u);
	6121
	6122	/* Go through each list item by item, stopping when exhausted one of
	6123	* them */
	6124	while (i_a < len_a && i_b < len_b) {
	6125	UV cp; /* The element to potentially add to the union's array */
	6126	bool cp_in_set; /* is it in the the input list's set or not */
	6127
	6128	/* We need to take one or the other of the two inputs for the union.
	6129	* Since we are merging two sorted lists, we take the smaller of the
	6130	* next items. In case of a tie, we take the one that is in its set
	6131	* first. If we took one not in the set first, it would decrement the
	6132	* count, possibly to 0 which would cause it to be output as ending the
	6133	* range, and the next time through we would take the same number, and
	6134	* output it again as beginning the next range. By doing it the
	6135	* opposite way, there is no possibility that the count will be
	6136	* momentarily decremented to 0, and thus the two adjoining ranges will
	6137	* be seamlessly merged. (In a tie and both are in the set or both not
	6138	* in the set, it doesn't matter which we take first.) */
	6139	if (array_a[i_a] < array_b[i_b]
	6140	\|\| (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
	6141	{
	6142	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6143	cp= array_a[i_a++];
	6144	}
	6145	else {
	6146	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6147	cp= array_b[i_b++];
	6148	}
	6149
	6150	/* Here, have chosen which of the two inputs to look at. Only output
	6151	* if the running count changes to/from 0, which marks the
	6152	* beginning/end of a range in that's in the set */
	6153	if (cp_in_set) {
	6154	if (count == 0) {
	6155	array_u[i_u++] = cp;
	6156	}
	6157	count++;
	6158	}
	6159	else {
	6160	count--;
	6161	if (count == 0) {
	6162	array_u[i_u++] = cp;
	6163	}
	6164	}
	6165	}
	6166
	6167	/* Here, we are finished going through at least one of the lists, which
	6168	* means there is something remaining in at most one. We check if the list
	6169	* that hasn't been exhausted is positioned such that we are in the middle
	6170	* of a range in its set or not. (We are in the set if the next item in
	6171	* the array marks the beginning of something not in the set) If in the
	6172	* set, we decrement 'count'; if 0, there is potentially more to output.
	6173	* There are four cases:
	6174	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	6175	* in the union is entirely from the non-exhausted set.
	6176	* 2) Both were in their sets, count is 2. Nothing further should
	6177	* be output, as everything that remains will be in the exhausted
	6178	* list's set, hence in the union; decrementing to 1 but not 0 insures
	6179	* that
	6180	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	6181	* Nothing further should be output because the union includes
	6182	* everything from the exhausted set. Not decrementing insures that.
	6183	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	6184	* decrementing to 0 insures that we look at the remainder of the
	6185	* non-exhausted set */
	6186	if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
	6187	\|\| (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
	6188	{
	6189	count--;
	6190	}
	6191
	6192	/* The final length is what we've output so far, plus what else is about to
	6193	* be output. (If 'count' is non-zero, then the input list we exhausted
	6194	* has everything remaining up to the machine's limit in its set, and hence
	6195	* in the union, so there will be no further output. */
	6196	len_u = i_u;
	6197	if (count == 0) {
	6198	/* At most one of the subexpressions will be non-zero */
	6199	len_u += (len_a - i_a) + (len_b - i_b);
	6200	}
	6201
	6202	/* Set result to final length, which can change the pointer to array_u, so
	6203	* re-find it */
	6204	if (len_u != invlist_len(u)) {
	6205	invlist_set_len(u, len_u);
	6206	invlist_trim(u);
	6207	array_u = invlist_array(u);
	6208	}
	6209
	6210	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	6211	* the other) ended with everything above it not in its set. That means
	6212	* that the remaining part of the union is precisely the same as the
	6213	* non-exhausted list, so can just copy it unchanged. (If both list were
	6214	* exhausted at the same time, then the operations below will be both 0.)
	6215	*/
	6216	if (count == 0) {
	6217	IV copy_count; /* At most one will have a non-zero copy count */
	6218	if ((copy_count = len_a - i_a) > 0) {
	6219	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	6220	}
	6221	else if ((copy_count = len_b - i_b) > 0) {
	6222	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	6223	}
	6224	}
	6225
	6226	return u;
	6227	}
	6228
	6229	STATIC HV*
	6230	S_invlist_intersection(pTHX_ HV* const a, HV* const b)
	6231	{
	6232	/* Return the intersection of two inversion lists. The basis for this
	6233	* comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
	6234	* by Addison-Wesley, and explained at some length there. The preface says
	6235	* to incorporate its examples into your code at your own risk.
	6236	*
	6237	* The algorithm is like a merge sort, and is essentially the same as the
	6238	* union above
	6239	*/
	6240
	6241	UV* array_a = invlist_array(a); /* a's array */
	6242	UV* array_b = invlist_array(b);
	6243	UV len_a = invlist_len(a); /* length of a's array */
	6244	UV len_b = invlist_len(b);
	6245
	6246	HV* r; /* the resulting intersection */
	6247	UV* array_r;
	6248	UV len_r;
	6249
	6250	UV i_a = 0; /* current index into a's array */
	6251	UV i_b = 0;
	6252	UV i_r = 0;
	6253
	6254	/* running count, as explained in the algorithm source book; items are
	6255	* stopped accumulating and are output when the count changes to/from 2.
	6256	* The count is incremented when we start a range that's in the set, and
	6257	* decremented when we start a range that's not in the set. So its range
	6258	* is 0 to 2. Only when the count is 2 is something in the intersection.
	6259	*/
	6260	UV count = 0;
	6261
	6262	PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
	6263
	6264	/* Size the intersection for the worst case: that the intersection ends up
	6265	* fragmenting everything to be completely disjoint */
	6266	r= _new_invlist(len_a + len_b);
	6267	array_r = invlist_array(r);
	6268
	6269	/* Go through each list item by item, stopping when exhausted one of
	6270	* them */
	6271	while (i_a < len_a && i_b < len_b) {
	6272	UV cp; /* The element to potentially add to the intersection's
	6273	array */
	6274	bool cp_in_set; /* Is it in the input list's set or not */
	6275
	6276	/* We need to take one or the other of the two inputs for the union.
	6277	* Since we are merging two sorted lists, we take the smaller of the
	6278	* next items. In case of a tie, we take the one that is not in its
	6279	* set first (a difference from the union algorithm). If we took one
	6280	* in the set first, it would increment the count, possibly to 2 which
	6281	* would cause it to be output as starting a range in the intersection,
	6282	* and the next time through we would take that same number, and output
	6283	* it again as ending the set. By doing it the opposite of this, we
	6284	* there is no possibility that the count will be momentarily
	6285	* incremented to 2. (In a tie and both are in the set or both not in
	6286	* the set, it doesn't matter which we take first.) */
	6287	if (array_a[i_a] < array_b[i_b]
	6288	\|\| (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
	6289	{
	6290	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6291	cp= array_a[i_a++];
	6292	}
	6293	else {
	6294	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6295	cp= array_b[i_b++];
	6296	}
	6297
	6298	/* Here, have chosen which of the two inputs to look at. Only output
	6299	* if the running count changes to/from 2, which marks the
	6300	* beginning/end of a range that's in the intersection */
	6301	if (cp_in_set) {
	6302	count++;
	6303	if (count == 2) {
	6304	array_r[i_r++] = cp;
	6305	}
	6306	}
	6307	else {
	6308	if (count == 2) {
	6309	array_r[i_r++] = cp;
	6310	}
	6311	count--;
	6312	}
	6313	}
	6314
	6315	/* Here, we are finished going through at least one of the sets, which
	6316	* means there is something remaining in at most one. See the comments in
	6317	* the union code */
	6318	if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
	6319	\|\| (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
	6320	{
	6321	count--;
	6322	}
	6323
	6324	/* The final length is what we've output so far plus what else is in the
	6325	* intersection. Only one of the subexpressions below will be non-zero */
	6326	len_r = i_r;
	6327	if (count == 2) {
	6328	len_r += (len_a - i_a) + (len_b - i_b);
	6329	}
	6330
	6331	/* Set result to final length, which can change the pointer to array_r, so
	6332	* re-find it */
	6333	if (len_r != invlist_len(r)) {
	6334	invlist_set_len(r, len_r);
	6335	invlist_trim(r);
	6336	array_r = invlist_array(r);
	6337	}
	6338
	6339	/* Finish outputting any remaining */
	6340	if (count == 2) { /* Only one of will have a non-zero copy count */
	6341	IV copy_count;
	6342	if ((copy_count = len_a - i_a) > 0) {
	6343	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	6344	}
	6345	else if ((copy_count = len_b - i_b) > 0) {
	6346	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	6347	}
	6348	}
	6349
	6350	return r;
	6351	}
	6352
	6353	STATIC HV*
	6354	S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
	6355	{
	6356	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	6357	* set. A pointer to the inversion list is returned. This may actually be
	6358	* a new list, in which case the passed in one has been destroyed. The
	6359	* passed in inversion list can be NULL, in which case a new one is created
	6360	* with just the one range in it */
	6361
	6362	HV* range_invlist;
	6363	HV* added_invlist;
	6364	UV len;
	6365
	6366	if (invlist == NULL) {
	6367	invlist = _new_invlist(2);
	6368	len = 0;
	6369	}
	6370	else {
	6371	len = invlist_len(invlist);
	6372	}
	6373
	6374	/* If comes after the final entry, can just append it to the end */
	6375	if (len == 0
	6376	\|\| start >= invlist_array(invlist)
	6377	[invlist_len(invlist) - 1])
	6378	{
	6379	_append_range_to_invlist(invlist, start, end);
	6380	return invlist;
	6381	}
	6382
	6383	/* Here, can't just append things, create and return a new inversion list
	6384	* which is the union of this range and the existing inversion list */
	6385	range_invlist = _new_invlist(2);
	6386	_append_range_to_invlist(range_invlist, start, end);
	6387
	6388	added_invlist = invlist_union(invlist, range_invlist);
	6389
	6390	/* The passed in list can be freed, as well as our temporary */
	6391	invlist_destroy(range_invlist);
	6392	if (invlist != added_invlist) {
	6393	invlist_destroy(invlist);
	6394	}
	6395
	6396	return added_invlist;
	6397	}
	6398
	6399	PERL_STATIC_INLINE HV*
	6400	S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
	6401	return add_range_to_invlist(invlist, cp, cp);
	6402	}
	6403
	6404	/* End of inversion list object */
	6405
	6406	/*
	6407	- reg - regular expression, i.e. main body or parenthesized thing
	6408	*
	6409	* Caller must absorb opening parenthesis.
	6410	*
	6411	* Combining parenthesis handling with the base level of regular expression
	6412	* is a trifle forced, but the need to tie the tails of the branches to what
	6413	* follows makes it hard to avoid.
	6414	*/
	6415	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	6416	#ifdef DEBUGGING
	6417	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	6418	#else
	6419	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	6420	#endif
	6421
	6422	STATIC regnode *
	6423	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	6424	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	6425	{
	6426	dVAR;
	6427	register regnode ret; / Will be the head of the group. */
	6428	register regnode *br;
	6429	register regnode *lastbr;
	6430	register regnode *ender = NULL;
	6431	register I32 parno = 0;
	6432	I32 flags;
	6433	U32 oregflags = RExC_flags;
	6434	bool have_branch = 0;
	6435	bool is_open = 0;
	6436	I32 freeze_paren = 0;
	6437	I32 after_freeze = 0;
	6438
	6439	/* for (?g), (?gc), and (?o) warnings; warning
	6440	about (?c) will warn about (?g) -- japhy */
	6441
	6442	#define WASTED_O 0x01
	6443	#define WASTED_G 0x02
	6444	#define WASTED_C 0x04
	6445	#define WASTED_GC (0x02\|0x04)
	6446	I32 wastedflags = 0x00;
	6447
	6448	char * parse_start = RExC_parse; /* MJD */
	6449	char * const oregcomp_parse = RExC_parse;
	6450
	6451	GET_RE_DEBUG_FLAGS_DECL;
	6452
	6453	PERL_ARGS_ASSERT_REG;
	6454	DEBUG_PARSE("reg ");
	6455
	6456	flagp = 0; / Tentatively. */
	6457
	6458
	6459	/* Make an OPEN node, if parenthesized. */
	6460	if (paren) {
	6461	if ( RExC_parse == '') { /* (VERB:ARG) /
	6462	char *start_verb = RExC_parse;
	6463	STRLEN verb_len = 0;
	6464	char *start_arg = NULL;
	6465	unsigned char op = 0;
	6466	int argok = 1;
	6467	int internal_argval = 0; /* internal_argval is only useful if !argok */
	6468	while ( RExC_parse && RExC_parse != ')' ) {
	6469	if ( *RExC_parse == ':' ) {
	6470	start_arg = RExC_parse + 1;
	6471	break;
	6472	}
	6473	RExC_parse++;
	6474	}
	6475	++start_verb;
	6476	verb_len = RExC_parse - start_verb;
	6477	if ( start_arg ) {
	6478	RExC_parse++;
	6479	while ( RExC_parse && RExC_parse != ')' )
	6480	RExC_parse++;
	6481	if ( *RExC_parse != ')' )
	6482	vFAIL("Unterminated verb pattern argument");
	6483	if ( RExC_parse == start_arg )
	6484	start_arg = NULL;
	6485	} else {
	6486	if ( *RExC_parse != ')' )
	6487	vFAIL("Unterminated verb pattern");
	6488	}
	6489
	6490	switch ( *start_verb ) {
	6491	case 'A': /* (ACCEPT) /
	6492	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	6493	op = ACCEPT;
	6494	internal_argval = RExC_nestroot;
	6495	}
	6496	break;
	6497	case 'C': /* (COMMIT) /
	6498	if ( memEQs(start_verb,verb_len,"COMMIT") )
	6499	op = COMMIT;
	6500	break;
	6501	case 'F': /* (FAIL) /
	6502	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	6503	op = OPFAIL;
	6504	argok = 0;
	6505	}
	6506	break;
	6507	case ':': /* (:NAME) /
	6508	case 'M': /* (MARK:NAME) /
	6509	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	6510	op = MARKPOINT;
	6511	argok = -1;
	6512	}
	6513	break;
	6514	case 'P': /* (PRUNE) /
	6515	if ( memEQs(start_verb,verb_len,"PRUNE") )
	6516	op = PRUNE;
	6517	break;
	6518	case 'S': /* (SKIP) /
	6519	if ( memEQs(start_verb,verb_len,"SKIP") )
	6520	op = SKIP;
	6521	break;
	6522	case 'T': /* (THEN) /
	6523	/* [19:06] <TimToady> :: is then */
	6524	if ( memEQs(start_verb,verb_len,"THEN") ) {
	6525	op = CUTGROUP;
	6526	RExC_seen \|= REG_SEEN_CUTGROUP;
	6527	}
	6528	break;
	6529	}
	6530	if ( ! op ) {
	6531	RExC_parse++;
	6532	vFAIL3("Unknown verb pattern '%.*s'",
	6533	verb_len, start_verb);
	6534	}
	6535	if ( argok ) {
	6536	if ( start_arg && internal_argval ) {
	6537	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6538	verb_len, start_verb);
	6539	} else if ( argok < 0 && !start_arg ) {
	6540	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	6541	verb_len, start_verb);
	6542	} else {
	6543	ret = reganode(pRExC_state, op, internal_argval);
	6544	if ( ! internal_argval && ! SIZE_ONLY ) {
	6545	if (start_arg) {
	6546	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	6547	ARG(ret) = add_data( pRExC_state, 1, "S" );
	6548	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	6549	ret->flags = 0;
	6550	} else {
	6551	ret->flags = 1;
	6552	}
	6553	}
	6554	}
	6555	if (!internal_argval)
	6556	RExC_seen \|= REG_SEEN_VERBARG;
	6557	} else if ( start_arg ) {
	6558	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6559	verb_len, start_verb);
	6560	} else {
	6561	ret = reg_node(pRExC_state, op);
	6562	}
	6563	nextchar(pRExC_state);
	6564	return ret;
	6565	} else
	6566	if (RExC_parse == '?') { / (?...) */
	6567	bool is_logical = 0;
	6568	const char * const seqstart = RExC_parse;
	6569	bool has_use_defaults = FALSE;
	6570
	6571	RExC_parse++;
	6572	paren = *RExC_parse++;
	6573	ret = NULL; /* For look-ahead/behind. */
	6574	switch (paren) {
	6575
	6576	case 'P': /* (?P...) variants for those used to PCRE/Python */
	6577	paren = *RExC_parse++;
	6578	if ( paren == '<') /* (?P<...>) named capture */
	6579	goto named_capture;
	6580	else if (paren == '>') { /* (?P>name) named recursion */
	6581	goto named_recursion;
	6582	}
	6583	else if (paren == '=') { /* (?P=...) named backref */
	6584	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	6585	you change this make sure you change that */
	6586	char* name_start = RExC_parse;
	6587	U32 num = 0;
	6588	SV *sv_dat = reg_scan_name(pRExC_state,
	6589	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6590	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	6591	vFAIL2("Sequence %.3s... not terminated",parse_start);
	6592
	6593	if (!SIZE_ONLY) {
	6594	num = add_data( pRExC_state, 1, "S" );
	6595	RExC_rxi->data->data[num]=(void*)sv_dat;
	6596	SvREFCNT_inc_simple_void(sv_dat);
	6597	}
	6598	RExC_sawback = 1;
	6599	ret = reganode(pRExC_state,
	6600	((! FOLD)
	6601	? NREF
	6602	: (MORE_ASCII_RESTRICTED)
	6603	? NREFFA
	6604	: (AT_LEAST_UNI_SEMANTICS)
	6605	? NREFFU
	6606	: (LOC)
	6607	? NREFFL
	6608	: NREFF),
	6609	num);
	6610	*flagp \|= HASWIDTH;
	6611
	6612	Set_Node_Offset(ret, parse_start+1);
	6613	Set_Node_Cur_Length(ret); /* MJD */
	6614
	6615	nextchar(pRExC_state);
	6616	return ret;
	6617	}
	6618	RExC_parse++;
	6619	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6620	/NOTREACHED/
	6621	case '<': /* (?<...) */
	6622	if (*RExC_parse == '!')
	6623	paren = ',';
	6624	else if (*RExC_parse != '=')
	6625	named_capture:
	6626	{ /* (?<...>) */
	6627	char *name_start;
	6628	SV *svname;
	6629	paren= '>';
	6630	case '\'': /* (?'...') */
	6631	name_start= RExC_parse;
	6632	svname = reg_scan_name(pRExC_state,
	6633	SIZE_ONLY ? /* reverse test from the others */
	6634	REG_RSN_RETURN_NAME :
	6635	REG_RSN_RETURN_NULL);
	6636	if (RExC_parse == name_start) {
	6637	RExC_parse++;
	6638	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6639	/NOTREACHED/
	6640	}
	6641	if (*RExC_parse != paren)
	6642	vFAIL2("Sequence (?%c... not terminated",
	6643	paren=='>' ? '<' : paren);
	6644	if (SIZE_ONLY) {
	6645	HE *he_str;
	6646	SV *sv_dat = NULL;
	6647	if (!svname) /* shouldn't happen */
	6648	Perl_croak(aTHX_
	6649	"panic: reg_scan_name returned NULL");
	6650	if (!RExC_paren_names) {
	6651	RExC_paren_names= newHV();
	6652	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	6653	#ifdef DEBUGGING
	6654	RExC_paren_name_list= newAV();
	6655	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	6656	#endif
	6657	}
	6658	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	6659	if ( he_str )
	6660	sv_dat = HeVAL(he_str);
	6661	if ( ! sv_dat ) {
	6662	/* croak baby croak */
	6663	Perl_croak(aTHX_
	6664	"panic: paren_name hash element allocation failed");
	6665	} else if ( SvPOK(sv_dat) ) {
	6666	/* (?\|...) can mean we have dupes so scan to check
	6667	its already been stored. Maybe a flag indicating
	6668	we are inside such a construct would be useful,
	6669	but the arrays are likely to be quite small, so
	6670	for now we punt -- dmq */
	6671	IV count = SvIV(sv_dat);
	6672	I32 pv = (I32)SvPVX(sv_dat);
	6673	IV i;
	6674	for ( i = 0 ; i < count ; i++ ) {
	6675	if ( pv[i] == RExC_npar ) {
	6676	count = 0;
	6677	break;
	6678	}
	6679	}
	6680	if ( count ) {
	6681	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	6682	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	6683	pv[count] = RExC_npar;
	6684	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	6685	}
	6686	} else {
	6687	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	6688	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	6689	SvIOK_on(sv_dat);
	6690	SvIV_set(sv_dat, 1);
	6691	}
	6692	#ifdef DEBUGGING
	6693	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	6694	SvREFCNT_dec(svname);
	6695	#endif
	6696
	6697	/sv_dump(sv_dat);/
	6698	}
	6699	nextchar(pRExC_state);
	6700	paren = 1;
	6701	goto capturing_parens;
	6702	}
	6703	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	6704	RExC_in_lookbehind++;
	6705	RExC_parse++;
	6706	case '=': /* (?=...) */
	6707	RExC_seen_zerolen++;
	6708	break;
	6709	case '!': /* (?!...) */
	6710	RExC_seen_zerolen++;
	6711	if (*RExC_parse == ')') {
	6712	ret=reg_node(pRExC_state, OPFAIL);
	6713	nextchar(pRExC_state);
	6714	return ret;
	6715	}
	6716	break;
	6717	case '\|': /* (?\|...) */
	6718	/* branch reset, behave like a (?:...) except that
	6719	buffers in alternations share the same numbers */
	6720	paren = ':';
	6721	after_freeze = freeze_paren = RExC_npar;
	6722	break;
	6723	case ':': /* (?:...) */
	6724	case '>': /* (?>...) */
	6725	break;
	6726	case '$': /* (?$...) */
	6727	case '@': /* (?@...) */
	6728	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	6729	break;
	6730	case '#': /* (?#...) */
	6731	while (RExC_parse && RExC_parse != ')')
	6732	RExC_parse++;
	6733	if (*RExC_parse != ')')
	6734	FAIL("Sequence (?#... not terminated");
	6735	nextchar(pRExC_state);
	6736	*flagp = TRYAGAIN;
	6737	return NULL;
	6738	case '0' : /* (?0) */
	6739	case 'R' : /* (?R) */
	6740	if (*RExC_parse != ')')
	6741	FAIL("Sequence (?R) not terminated");
	6742	ret = reg_node(pRExC_state, GOSTART);
	6743	*flagp \|= POSTPONED;
	6744	nextchar(pRExC_state);
	6745	return ret;
	6746	/notreached/
	6747	{ /* named and numeric backreferences */
	6748	I32 num;
	6749	case '&': /* (?&NAME) */
	6750	parse_start = RExC_parse - 1;
	6751	named_recursion:
	6752	{
	6753	SV *sv_dat = reg_scan_name(pRExC_state,
	6754	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6755	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	6756	}
	6757	goto gen_recurse_regop;
	6758	/* NOT REACHED */
	6759	case '+':
	6760	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	6761	RExC_parse++;
	6762	vFAIL("Illegal pattern");
	6763	}
	6764	goto parse_recursion;
	6765	/* NOT REACHED*/
	6766	case '-': /* (?-1) */
	6767	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	6768	RExC_parse--; /* rewind to let it be handled later */
	6769	goto parse_flags;
	6770	}
	6771	/FALLTHROUGH /
	6772	case '1': case '2': case '3': case '4': /* (?1) */
	6773	case '5': case '6': case '7': case '8': case '9':
	6774	RExC_parse--;
	6775	parse_recursion:
	6776	num = atoi(RExC_parse);
	6777	parse_start = RExC_parse - 1; /* MJD */
	6778	if (*RExC_parse == '-')
	6779	RExC_parse++;
	6780	while (isDIGIT(*RExC_parse))
	6781	RExC_parse++;
	6782	if (*RExC_parse!=')')
	6783	vFAIL("Expecting close bracket");
	6784
	6785	gen_recurse_regop:
	6786	if ( paren == '-' ) {
	6787	/*
	6788	Diagram of capture buffer numbering.
	6789	Top line is the normal capture buffer numbers
	6790	Bottom line is the negative indexing as from
	6791	the X (the (?-2))
	6792
	6793	+ 1 2 3 4 5 X 6 7
	6794	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	6795	- 5 4 3 2 1 X x x
	6796
	6797	*/
	6798	num = RExC_npar + num;
	6799	if (num < 1) {
	6800	RExC_parse++;
	6801	vFAIL("Reference to nonexistent group");
	6802	}
	6803	} else if ( paren == '+' ) {
	6804	num = RExC_npar + num - 1;
	6805	}
	6806
	6807	ret = reganode(pRExC_state, GOSUB, num);
	6808	if (!SIZE_ONLY) {
	6809	if (num > (I32)RExC_rx->nparens) {
	6810	RExC_parse++;
	6811	vFAIL("Reference to nonexistent group");
	6812	}
	6813	ARG2L_SET( ret, RExC_recurse_count++);
	6814	RExC_emit++;
	6815	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	6816	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	6817	} else {
	6818	RExC_size++;
	6819	}
	6820	RExC_seen \|= REG_SEEN_RECURSE;
	6821	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	6822	Set_Node_Offset(ret, parse_start); /* MJD */
	6823
	6824	*flagp \|= POSTPONED;
	6825	nextchar(pRExC_state);
	6826	return ret;
	6827	} /* named and numeric backreferences */
	6828	/* NOT REACHED */
	6829
	6830	case '?': /* (??...) */
	6831	is_logical = 1;
	6832	if (*RExC_parse != '{') {
	6833	RExC_parse++;
	6834	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6835	/NOTREACHED/
	6836	}
	6837	*flagp \|= POSTPONED;
	6838	paren = *RExC_parse++;
	6839	/* FALL THROUGH */
	6840	case '{': /* (?{...}) */
	6841	{
	6842	I32 count = 1;
	6843	U32 n = 0;
	6844	char c;
	6845	char *s = RExC_parse;
	6846
	6847	RExC_seen_zerolen++;
	6848	RExC_seen \|= REG_SEEN_EVAL;
	6849	while (count && (c = *RExC_parse)) {
	6850	if (c == '\\') {
	6851	if (RExC_parse[1])
	6852	RExC_parse++;
	6853	}
	6854	else if (c == '{')
	6855	count++;
	6856	else if (c == '}')
	6857	count--;
	6858	RExC_parse++;
	6859	}
	6860	if (*RExC_parse != ')') {
	6861	RExC_parse = s;
	6862	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	6863	}
	6864	if (!SIZE_ONLY) {
	6865	PAD *pad;
	6866	OP_4tree sop, rop;
	6867	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	6868
	6869	ENTER;
	6870	Perl_save_re_context(aTHX);
	6871	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	6872	sop->op_private \|= OPpREFCOUNTED;
	6873	/* re_dup will OpREFCNT_inc */
	6874	OpREFCNT_set(sop, 1);
	6875	LEAVE;
	6876
	6877	n = add_data(pRExC_state, 3, "nop");
	6878	RExC_rxi->data->data[n] = (void*)rop;
	6879	RExC_rxi->data->data[n+1] = (void*)sop;
	6880	RExC_rxi->data->data[n+2] = (void*)pad;
	6881	SvREFCNT_dec(sv);
	6882	}
	6883	else { /* First pass */
	6884	if (PL_reginterp_cnt < ++RExC_seen_evals
	6885	&& IN_PERL_RUNTIME)
	6886	/* No compiled RE interpolated, has runtime
	6887	components ===> unsafe. */
	6888	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	6889	if (PL_tainting && PL_tainted)
	6890	FAIL("Eval-group in insecure regular expression");
	6891	#if PERL_VERSION > 8
	6892	if (IN_PERL_COMPILETIME)
	6893	PL_cv_has_eval = 1;
	6894	#endif
	6895	}
	6896
	6897	nextchar(pRExC_state);
	6898	if (is_logical) {
	6899	ret = reg_node(pRExC_state, LOGICAL);
	6900	if (!SIZE_ONLY)
	6901	ret->flags = 2;
	6902	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	6903	/* deal with the length of this later - MJD */
	6904	return ret;
	6905	}
	6906	ret = reganode(pRExC_state, EVAL, n);
	6907	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	6908	Set_Node_Offset(ret, parse_start);
	6909	return ret;
	6910	}
	6911	case '(': /* (?(?{...})...) and (?(?=...)...) */
	6912	{
	6913	int is_define= 0;
	6914	if (RExC_parse[0] == '?') { /* (?(?...)) */
	6915	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	6916	\|\| RExC_parse[1] == '<'
	6917	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	6918	I32 flag;
	6919
	6920	ret = reg_node(pRExC_state, LOGICAL);
	6921	if (!SIZE_ONLY)
	6922	ret->flags = 1;
	6923	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	6924	goto insert_if;
	6925	}
	6926	}
	6927	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	6928	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	6929	{
	6930	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	6931	char *name_start= RExC_parse++;
	6932	U32 num = 0;
	6933	SV *sv_dat=reg_scan_name(pRExC_state,
	6934	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6935	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	6936	vFAIL2("Sequence (?(%c... not terminated",
	6937	(ch == '>' ? '<' : ch));
	6938	RExC_parse++;
	6939	if (!SIZE_ONLY) {
	6940	num = add_data( pRExC_state, 1, "S" );
	6941	RExC_rxi->data->data[num]=(void*)sv_dat;
	6942	SvREFCNT_inc_simple_void(sv_dat);
	6943	}
	6944	ret = reganode(pRExC_state,NGROUPP,num);
	6945	goto insert_if_check_paren;
	6946	}
	6947	else if (RExC_parse[0] == 'D' &&
	6948	RExC_parse[1] == 'E' &&
	6949	RExC_parse[2] == 'F' &&
	6950	RExC_parse[3] == 'I' &&
	6951	RExC_parse[4] == 'N' &&
	6952	RExC_parse[5] == 'E')
	6953	{
	6954	ret = reganode(pRExC_state,DEFINEP,0);
	6955	RExC_parse +=6 ;
	6956	is_define = 1;
	6957	goto insert_if_check_paren;
	6958	}
	6959	else if (RExC_parse[0] == 'R') {
	6960	RExC_parse++;
	6961	parno = 0;
	6962	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6963	parno = atoi(RExC_parse++);
	6964	while (isDIGIT(*RExC_parse))
	6965	RExC_parse++;
	6966	} else if (RExC_parse[0] == '&') {
	6967	SV *sv_dat;
	6968	RExC_parse++;
	6969	sv_dat = reg_scan_name(pRExC_state,
	6970	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6971	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	6972	}
	6973	ret = reganode(pRExC_state,INSUBP,parno);
	6974	goto insert_if_check_paren;
	6975	}
	6976	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6977	/* (?(1)...) */
	6978	char c;
	6979	parno = atoi(RExC_parse++);
	6980
	6981	while (isDIGIT(*RExC_parse))
	6982	RExC_parse++;
	6983	ret = reganode(pRExC_state, GROUPP, parno);
	6984
	6985	insert_if_check_paren:
	6986	if ((c = *nextchar(pRExC_state)) != ')')
	6987	vFAIL("Switch condition not recognized");
	6988	insert_if:
	6989	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	6990	br = regbranch(pRExC_state, &flags, 1,depth+1);
	6991	if (br == NULL)
	6992	br = reganode(pRExC_state, LONGJMP, 0);
	6993	else
	6994	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	6995	c = *nextchar(pRExC_state);
	6996	if (flags&HASWIDTH)
	6997	*flagp \|= HASWIDTH;
	6998	if (c == '\|') {
	6999	if (is_define)
	7000	vFAIL("(?(DEFINE)....) does not allow branches");
	7001	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	7002	regbranch(pRExC_state, &flags, 1,depth+1);
	7003	REGTAIL(pRExC_state, ret, lastbr);
	7004	if (flags&HASWIDTH)
	7005	*flagp \|= HASWIDTH;
	7006	c = *nextchar(pRExC_state);
	7007	}
	7008	else
	7009	lastbr = NULL;
	7010	if (c != ')')
	7011	vFAIL("Switch (?(condition)... contains too many branches");
	7012	ender = reg_node(pRExC_state, TAIL);
	7013	REGTAIL(pRExC_state, br, ender);
	7014	if (lastbr) {
	7015	REGTAIL(pRExC_state, lastbr, ender);
	7016	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	7017	}
	7018	else
	7019	REGTAIL(pRExC_state, ret, ender);
	7020	RExC_size++; /* XXX WHY do we need this?!!
	7021	For large programs it seems to be required
	7022	but I can't figure out why. -- dmq*/
	7023	return ret;
	7024	}
	7025	else {
	7026	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	7027	}
	7028	}
	7029	case 0:
	7030	RExC_parse--; /* for vFAIL to print correctly */
	7031	vFAIL("Sequence (? incomplete");
	7032	break;
	7033	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	7034	that follow */
	7035	has_use_defaults = TRUE;
	7036	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	7037	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	7038	? REGEX_UNICODE_CHARSET
	7039	: REGEX_DEPENDS_CHARSET);
	7040	goto parse_flags;
	7041	default:
	7042	--RExC_parse;
	7043	parse_flags: /* (?i) */
	7044	{
	7045	U32 posflags = 0, negflags = 0;
	7046	U32 *flagsp = &posflags;
	7047	bool has_charset_modifier = 0;
	7048	regex_charset cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7049	? REGEX_UNICODE_CHARSET
	7050	: REGEX_DEPENDS_CHARSET;
	7051
	7052	while (*RExC_parse) {
	7053	/* && strchr("iogcmsx", RExC_parse) /
	7054	/* (?g), (?gc) and (?o) are useless here
	7055	and must be globally applied -- japhy */
	7056	switch (*RExC_parse) {
	7057	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	7058	case LOCALE_PAT_MOD:
	7059	if (has_charset_modifier \|\| flagsp == &negflags) {
	7060	goto fail_modifiers;
	7061	}
	7062	cs = REGEX_LOCALE_CHARSET;
	7063	has_charset_modifier = 1;
	7064	RExC_contains_locale = 1;
	7065	break;
	7066	case UNICODE_PAT_MOD:
	7067	if (has_charset_modifier \|\| flagsp == &negflags) {
	7068	goto fail_modifiers;
	7069	}
	7070	cs = REGEX_UNICODE_CHARSET;
	7071	has_charset_modifier = 1;
	7072	break;
	7073	case ASCII_RESTRICT_PAT_MOD:
	7074	if (has_charset_modifier \|\| flagsp == &negflags) {
	7075	goto fail_modifiers;
	7076	}
	7077	if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
	7078	/* Doubled modifier implies more restricted */
	7079	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	7080	RExC_parse++;
	7081	}
	7082	else {
	7083	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	7084	}
	7085	has_charset_modifier = 1;
	7086	break;
	7087	case DEPENDS_PAT_MOD:
	7088	if (has_use_defaults
	7089	\|\| has_charset_modifier
	7090	\|\| flagsp == &negflags)
	7091	{
	7092	goto fail_modifiers;
	7093	}
	7094
	7095	/* The dual charset means unicode semantics if the
	7096	* pattern (or target, not known until runtime) are
	7097	* utf8, or something in the pattern indicates unicode
	7098	* semantics */
	7099	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7100	? REGEX_UNICODE_CHARSET
	7101	: REGEX_DEPENDS_CHARSET;
	7102	has_charset_modifier = 1;
	7103	break;
	7104	case ONCE_PAT_MOD: /* 'o' */
	7105	case GLOBAL_PAT_MOD: /* 'g' */
	7106	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7107	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	7108	if (! (wastedflags & wflagbit) ) {
	7109	wastedflags \|= wflagbit;
	7110	vWARN5(
	7111	RExC_parse + 1,
	7112	"Useless (%s%c) - %suse /%c modifier",
	7113	flagsp == &negflags ? "?-" : "?",
	7114	*RExC_parse,
	7115	flagsp == &negflags ? "don't " : "",
	7116	*RExC_parse
	7117	);
	7118	}
	7119	}
	7120	break;
	7121
	7122	case CONTINUE_PAT_MOD: /* 'c' */
	7123	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7124	if (! (wastedflags & WASTED_C) ) {
	7125	wastedflags \|= WASTED_GC;
	7126	vWARN3(
	7127	RExC_parse + 1,
	7128	"Useless (%sc) - %suse /gc modifier",
	7129	flagsp == &negflags ? "?-" : "?",
	7130	flagsp == &negflags ? "don't " : ""
	7131	);
	7132	}
	7133	}
	7134	break;
	7135	case KEEPCOPY_PAT_MOD: /* 'p' */
	7136	if (flagsp == &negflags) {
	7137	if (SIZE_ONLY)
	7138	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	7139	} else {
	7140	*flagsp \|= RXf_PMf_KEEPCOPY;
	7141	}
	7142	break;
	7143	case '-':
	7144	/* A flag is a default iff it is following a minus, so
	7145	* if there is a minus, it means will be trying to
	7146	* re-specify a default which is an error */
	7147	if (has_use_defaults \|\| flagsp == &negflags) {
	7148	fail_modifiers:
	7149	RExC_parse++;
	7150	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7151	/NOTREACHED/
	7152	}
	7153	flagsp = &negflags;
	7154	wastedflags = 0; /* reset so (?g-c) warns twice */
	7155	break;
	7156	case ':':
	7157	paren = ':';
	7158	/FALLTHROUGH/
	7159	case ')':
	7160	RExC_flags \|= posflags;
	7161	RExC_flags &= ~negflags;
	7162	set_regex_charset(&RExC_flags, cs);
	7163	if (paren != ':') {
	7164	oregflags \|= posflags;
	7165	oregflags &= ~negflags;
	7166	set_regex_charset(&oregflags, cs);
	7167	}
	7168	nextchar(pRExC_state);
	7169	if (paren != ':') {
	7170	*flagp = TRYAGAIN;
	7171	return NULL;
	7172	} else {
	7173	ret = NULL;
	7174	goto parse_rest;
	7175	}
	7176	/NOTREACHED/
	7177	default:
	7178	RExC_parse++;
	7179	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7180	/NOTREACHED/
	7181	}
	7182	++RExC_parse;
	7183	}
	7184	}} /* one for the default block, one for the switch */
	7185	}
	7186	else { /* (...) */
	7187	capturing_parens:
	7188	parno = RExC_npar;
	7189	RExC_npar++;
	7190
	7191	ret = reganode(pRExC_state, OPEN, parno);
	7192	if (!SIZE_ONLY ){
	7193	if (!RExC_nestroot)
	7194	RExC_nestroot = parno;
	7195	if (RExC_seen & REG_SEEN_RECURSE
	7196	&& !RExC_open_parens[parno-1])
	7197	{
	7198	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7199	"Setting open paren #%"IVdf" to %d\n",
	7200	(IV)parno, REG_NODE_NUM(ret)));
	7201	RExC_open_parens[parno-1]= ret;
	7202	}
	7203	}
	7204	Set_Node_Length(ret, 1); /* MJD */
	7205	Set_Node_Offset(ret, RExC_parse); /* MJD */
	7206	is_open = 1;
	7207	}
	7208	}
	7209	else /* ! paren */
	7210	ret = NULL;
	7211
	7212	parse_rest:
	7213	/* Pick up the branches, linking them together. */
	7214	parse_start = RExC_parse; /* MJD */
	7215	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7216
	7217	/* branch_len = (paren != 0); */
	7218
	7219	if (br == NULL)
	7220	return(NULL);
	7221	if (*RExC_parse == '\|') {
	7222	if (!SIZE_ONLY && RExC_extralen) {
	7223	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	7224	}
	7225	else { /* MJD */
	7226	reginsert(pRExC_state, BRANCH, br, depth+1);
	7227	Set_Node_Length(br, paren != 0);
	7228	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	7229	}
	7230	have_branch = 1;
	7231	if (SIZE_ONLY)
	7232	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	7233	}
	7234	else if (paren == ':') {
	7235	*flagp \|= flags&SIMPLE;
	7236	}
	7237	if (is_open) { /* Starts with OPEN. */
	7238	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	7239	}
	7240	else if (paren != '?') /* Not Conditional */
	7241	ret = br;
	7242	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7243	lastbr = br;
	7244	while (*RExC_parse == '\|') {
	7245	if (!SIZE_ONLY && RExC_extralen) {
	7246	ender = reganode(pRExC_state, LONGJMP,0);
	7247	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	7248	}
	7249	if (SIZE_ONLY)
	7250	RExC_extralen += 2; /* Account for LONGJMP. */
	7251	nextchar(pRExC_state);
	7252	if (freeze_paren) {
	7253	if (RExC_npar > after_freeze)
	7254	after_freeze = RExC_npar;
	7255	RExC_npar = freeze_paren;
	7256	}
	7257	br = regbranch(pRExC_state, &flags, 0, depth+1);
	7258
	7259	if (br == NULL)
	7260	return(NULL);
	7261	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	7262	lastbr = br;
	7263	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7264	}
	7265
	7266	if (have_branch \|\| paren != ':') {
	7267	/* Make a closing node, and hook it on the end. */
	7268	switch (paren) {
	7269	case ':':
	7270	ender = reg_node(pRExC_state, TAIL);
	7271	break;
	7272	case 1:
	7273	ender = reganode(pRExC_state, CLOSE, parno);
	7274	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	7275	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7276	"Setting close paren #%"IVdf" to %d\n",
	7277	(IV)parno, REG_NODE_NUM(ender)));
	7278	RExC_close_parens[parno-1]= ender;
	7279	if (RExC_nestroot == parno)
	7280	RExC_nestroot = 0;
	7281	}
	7282	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	7283	Set_Node_Length(ender,1); /* MJD */
	7284	break;
	7285	case '<':
	7286	case ',':
	7287	case '=':
	7288	case '!':
	7289	*flagp &= ~HASWIDTH;
	7290	/* FALL THROUGH */
	7291	case '>':
	7292	ender = reg_node(pRExC_state, SUCCEED);
	7293	break;
	7294	case 0:
	7295	ender = reg_node(pRExC_state, END);
	7296	if (!SIZE_ONLY) {
	7297	assert(!RExC_opend); /* there can only be one! */
	7298	RExC_opend = ender;
	7299	}
	7300	break;
	7301	}
	7302	REGTAIL(pRExC_state, lastbr, ender);
	7303
	7304	if (have_branch && !SIZE_ONLY) {
	7305	if (depth==1)
	7306	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	7307
	7308	/* Hook the tails of the branches to the closing node. */
	7309	for (br = ret; br; br = regnext(br)) {
	7310	const U8 op = PL_regkind[OP(br)];
	7311	if (op == BRANCH) {
	7312	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	7313	}
	7314	else if (op == BRANCHJ) {
	7315	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	7316	}
	7317	}
	7318	}
	7319	}
	7320
	7321	{
	7322	const char *p;
	7323	static const char parens[] = "=!<,>";
	7324
	7325	if (paren && (p = strchr(parens, paren))) {
	7326	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	7327	int flag = (p - parens) > 1;
	7328
	7329	if (paren == '>')
	7330	node = SUSPEND, flag = 0;
	7331	reginsert(pRExC_state, node,ret, depth+1);
	7332	Set_Node_Cur_Length(ret);
	7333	Set_Node_Offset(ret, parse_start + 1);
	7334	ret->flags = flag;
	7335	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	7336	}
	7337	}
	7338
	7339	/* Check for proper termination. */
	7340	if (paren) {
	7341	RExC_flags = oregflags;
	7342	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	7343	RExC_parse = oregcomp_parse;
	7344	vFAIL("Unmatched (");
	7345	}
	7346	}
	7347	else if (!paren && RExC_parse < RExC_end) {
	7348	if (*RExC_parse == ')') {
	7349	RExC_parse++;
	7350	vFAIL("Unmatched )");
	7351	}
	7352	else
	7353	FAIL("Junk on end of regexp"); /* "Can't happen". */
	7354	/* NOTREACHED */
	7355	}
	7356
	7357	if (RExC_in_lookbehind) {
	7358	RExC_in_lookbehind--;
	7359	}
	7360	if (after_freeze > RExC_npar)
	7361	RExC_npar = after_freeze;
	7362	return(ret);
	7363	}
	7364
	7365	/*
	7366	- regbranch - one alternative of an \| operator
	7367	*
	7368	* Implements the concatenation operator.
	7369	*/
	7370	STATIC regnode *
	7371	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	7372	{
	7373	dVAR;
	7374	register regnode *ret;
	7375	register regnode *chain = NULL;
	7376	register regnode *latest;
	7377	I32 flags = 0, c = 0;
	7378	GET_RE_DEBUG_FLAGS_DECL;
	7379
	7380	PERL_ARGS_ASSERT_REGBRANCH;
	7381
	7382	DEBUG_PARSE("brnc");
	7383
	7384	if (first)
	7385	ret = NULL;
	7386	else {
	7387	if (!SIZE_ONLY && RExC_extralen)
	7388	ret = reganode(pRExC_state, BRANCHJ,0);
	7389	else {
	7390	ret = reg_node(pRExC_state, BRANCH);
	7391	Set_Node_Length(ret, 1);
	7392	}
	7393	}
	7394
	7395	if (!first && SIZE_ONLY)
	7396	RExC_extralen += 1; /* BRANCHJ */
	7397
	7398	flagp = WORST; / Tentatively. */
	7399
	7400	RExC_parse--;
	7401	nextchar(pRExC_state);
	7402	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	7403	flags &= ~TRYAGAIN;
	7404	latest = regpiece(pRExC_state, &flags,depth+1);
	7405	if (latest == NULL) {
	7406	if (flags & TRYAGAIN)
	7407	continue;
	7408	return(NULL);
	7409	}
	7410	else if (ret == NULL)
	7411	ret = latest;
	7412	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	7413	if (chain == NULL) /* First piece. */
	7414	*flagp \|= flags&SPSTART;
	7415	else {
	7416	RExC_naughty++;
	7417	REGTAIL(pRExC_state, chain, latest);
	7418	}
	7419	chain = latest;
	7420	c++;
	7421	}
	7422	if (chain == NULL) { /* Loop ran zero times. */
	7423	chain = reg_node(pRExC_state, NOTHING);
	7424	if (ret == NULL)
	7425	ret = chain;
	7426	}
	7427	if (c == 1) {
	7428	*flagp \|= flags&SIMPLE;
	7429	}
	7430
	7431	return ret;
	7432	}
	7433
	7434	/*
	7435	- regpiece - something followed by possible [*+?]
	7436	*
	7437	* Note that the branching code sequences used for ? and the general cases
	7438	* of * and + are somewhat optimized: they use the same NOTHING node as
	7439	* both the endmarker for their branch list and the body of the last branch.
	7440	* It might seem that this node could be dispensed with entirely, but the
	7441	* endmarker role is not redundant.
	7442	*/
	7443	STATIC regnode *
	7444	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	7445	{
	7446	dVAR;
	7447	register regnode *ret;
	7448	register char op;
	7449	register char *next;
	7450	I32 flags;
	7451	const char * const origparse = RExC_parse;
	7452	I32 min;
	7453	I32 max = REG_INFTY;
	7454	char *parse_start;
	7455	const char *maxpos = NULL;
	7456	GET_RE_DEBUG_FLAGS_DECL;
	7457
	7458	PERL_ARGS_ASSERT_REGPIECE;
	7459
	7460	DEBUG_PARSE("piec");
	7461
	7462	ret = regatom(pRExC_state, &flags,depth+1);
	7463	if (ret == NULL) {
	7464	if (flags & TRYAGAIN)
	7465	*flagp \|= TRYAGAIN;
	7466	return(NULL);
	7467	}
	7468
	7469	op = *RExC_parse;
	7470
	7471	if (op == '{' && regcurly(RExC_parse)) {
	7472	maxpos = NULL;
	7473	parse_start = RExC_parse; /* MJD */
	7474	next = RExC_parse + 1;
	7475	while (isDIGIT(next) \|\| next == ',') {
	7476	if (*next == ',') {
	7477	if (maxpos)
	7478	break;
	7479	else
	7480	maxpos = next;
	7481	}
	7482	next++;
	7483	}
	7484	if (next == '}') { / got one */
	7485	if (!maxpos)
	7486	maxpos = next;
	7487	RExC_parse++;
	7488	min = atoi(RExC_parse);
	7489	if (*maxpos == ',')
	7490	maxpos++;
	7491	else
	7492	maxpos = RExC_parse;
	7493	max = atoi(maxpos);
	7494	if (!max && *maxpos != '0')
	7495	max = REG_INFTY; /* meaning "infinity" */
	7496	else if (max >= REG_INFTY)
	7497	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	7498	RExC_parse = next;
	7499	nextchar(pRExC_state);
	7500
	7501	do_curly:
	7502	if ((flags&SIMPLE)) {
	7503	RExC_naughty += 2 + RExC_naughty / 2;
	7504	reginsert(pRExC_state, CURLY, ret, depth+1);
	7505	Set_Node_Offset(ret, parse_start+1); /* MJD */
	7506	Set_Node_Cur_Length(ret);
	7507	}
	7508	else {
	7509	regnode * const w = reg_node(pRExC_state, WHILEM);
	7510
	7511	w->flags = 0;
	7512	REGTAIL(pRExC_state, ret, w);
	7513	if (!SIZE_ONLY && RExC_extralen) {
	7514	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	7515	reginsert(pRExC_state, NOTHING,ret, depth+1);
	7516	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	7517	}
	7518	reginsert(pRExC_state, CURLYX,ret, depth+1);
	7519	/* MJD hk */
	7520	Set_Node_Offset(ret, parse_start+1);
	7521	Set_Node_Length(ret,
	7522	op == '{' ? (RExC_parse - parse_start) : 1);
	7523
	7524	if (!SIZE_ONLY && RExC_extralen)
	7525	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	7526	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	7527	if (SIZE_ONLY)
	7528	RExC_whilem_seen++, RExC_extralen += 3;
	7529	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	7530	}
	7531	ret->flags = 0;
	7532
	7533	if (min > 0)
	7534	*flagp = WORST;
	7535	if (max > 0)
	7536	*flagp \|= HASWIDTH;
	7537	if (max < min)
	7538	vFAIL("Can't do {n,m} with n > m");
	7539	if (!SIZE_ONLY) {
	7540	ARG1_SET(ret, (U16)min);
	7541	ARG2_SET(ret, (U16)max);
	7542	}
	7543
	7544	goto nest_check;
	7545	}
	7546	}
	7547
	7548	if (!ISMULT1(op)) {
	7549	*flagp = flags;
	7550	return(ret);
	7551	}
	7552
	7553	#if 0 /* Now runtime fix should be reliable. */
	7554
	7555	/* if this is reinstated, don't forget to put this back into perldiag:
	7556
	7557	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	7558
	7559	(F) The part of the regexp subject to either the * or + quantifier
	7560	could match an empty string. The {#} shows in the regular
	7561	expression about where the problem was discovered.
	7562
	7563	*/
	7564
	7565	if (!(flags&HASWIDTH) && op != '?')
	7566	vFAIL("Regexp *+ operand could be empty");
	7567	#endif
	7568
	7569	parse_start = RExC_parse;
	7570	nextchar(pRExC_state);
	7571
	7572	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	7573
	7574	if (op == '*' && (flags&SIMPLE)) {
	7575	reginsert(pRExC_state, STAR, ret, depth+1);
	7576	ret->flags = 0;
	7577	RExC_naughty += 4;
	7578	}
	7579	else if (op == '*') {
	7580	min = 0;
	7581	goto do_curly;
	7582	}
	7583	else if (op == '+' && (flags&SIMPLE)) {
	7584	reginsert(pRExC_state, PLUS, ret, depth+1);
	7585	ret->flags = 0;
	7586	RExC_naughty += 3;
	7587	}
	7588	else if (op == '+') {
	7589	min = 1;
	7590	goto do_curly;
	7591	}
	7592	else if (op == '?') {
	7593	min = 0; max = 1;
	7594	goto do_curly;
	7595	}
	7596	nest_check:
	7597	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	7598	ckWARN3reg(RExC_parse,
	7599	"%.*s matches null string many times",
	7600	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	7601	origparse);
	7602	}
	7603
	7604	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	7605	nextchar(pRExC_state);
	7606	reginsert(pRExC_state, MINMOD, ret, depth+1);
	7607	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	7608	}
	7609	#ifndef REG_ALLOW_MINMOD_SUSPEND
	7610	else
	7611	#endif
	7612	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	7613	regnode *ender;
	7614	nextchar(pRExC_state);
	7615	ender = reg_node(pRExC_state, SUCCEED);
	7616	REGTAIL(pRExC_state, ret, ender);
	7617	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	7618	ret->flags = 0;
	7619	ender = reg_node(pRExC_state, TAIL);
	7620	REGTAIL(pRExC_state, ret, ender);
	7621	/ret= ender;/
	7622	}
	7623
	7624	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	7625	RExC_parse++;
	7626	vFAIL("Nested quantifiers");
	7627	}
	7628
	7629	return(ret);
	7630	}
	7631
	7632
	7633	/* reg_namedseq(pRExC_state,UVp)
	7634
	7635	This is expected to be called by a parser routine that has
	7636	recognized '\N' and needs to handle the rest. RExC_parse is
	7637	expected to point at the first char following the N at the time
	7638	of the call.
	7639
	7640	The \N may be inside (indicated by valuep not being NULL) or outside a
	7641	character class.
	7642
	7643	\N may begin either a named sequence, or if outside a character class, mean
	7644	to match a non-newline. For non single-quoted regexes, the tokenizer has
	7645	attempted to decide which, and in the case of a named sequence converted it
	7646	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	7647	where c1... are the characters in the sequence. For single-quoted regexes,
	7648	the tokenizer passes the \N sequence through unchanged; this code will not
	7649	attempt to determine this nor expand those. The net effect is that if the
	7650	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	7651	signals that this \N occurrence means to match a non-newline.
	7652
	7653	Only the \N{U+...} form should occur in a character class, for the same
	7654	reason that '.' inside a character class means to just match a period: it
	7655	just doesn't make sense.
	7656
	7657	If valuep is non-null then it is assumed that we are parsing inside
	7658	of a charclass definition and the first codepoint in the resolved
	7659	string is returned via *valuep and the routine will return NULL.
	7660	In this mode if a multichar string is returned from the charnames
	7661	handler, a warning will be issued, and only the first char in the
	7662	sequence will be examined. If the string returned is zero length
	7663	then the value of *valuep is undefined and NON-NULL will
	7664	be returned to indicate failure. (This will NOT be a valid pointer
	7665	to a regnode.)
	7666
	7667	If valuep is null then it is assumed that we are parsing normal text and a
	7668	new EXACT node is inserted into the program containing the resolved string,
	7669	and a pointer to the new node is returned. But if the string is zero length
	7670	a NOTHING node is emitted instead.
	7671
	7672	On success RExC_parse is set to the char following the endbrace.
	7673	Parsing failures will generate a fatal error via vFAIL(...)
	7674	*/
	7675	STATIC regnode *
	7676	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp)
	7677	{
	7678	char * endbrace; /* '}' following the name */
	7679	regnode *ret = NULL;
	7680	#ifdef DEBUGGING
	7681	char* parse_start = RExC_parse - 2; /* points to the '\N' */
	7682	#endif
	7683	char* p;
	7684
	7685	GET_RE_DEBUG_FLAGS_DECL;
	7686
	7687	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	7688
	7689	GET_RE_DEBUG_FLAGS;
	7690
	7691	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	7692	* modifier. The other meaning does not */
	7693	p = (RExC_flags & RXf_PMf_EXTENDED)
	7694	? regwhite( pRExC_state, RExC_parse )
	7695	: RExC_parse;
	7696
	7697	/* Disambiguate between \N meaning a named character versus \N meaning
	7698	* [^\n]. The former is assumed when it can't be the latter. */
	7699	if (*p != '{' \|\| regcurly(p)) {
	7700	RExC_parse = p;
	7701	if (valuep) {
	7702	/* no bare \N in a charclass */
	7703	vFAIL("\\N in a character class must be a named character: \\N{...}");
	7704	}
	7705	nextchar(pRExC_state);
	7706	ret = reg_node(pRExC_state, REG_ANY);
	7707	*flagp \|= HASWIDTH\|SIMPLE;
	7708	RExC_naughty++;
	7709	RExC_parse--;
	7710	Set_Node_Length(ret, 1); /* MJD */
	7711	return ret;
	7712	}
	7713
	7714	/* Here, we have decided it should be a named sequence */
	7715
	7716	/* The test above made sure that the next real character is a '{', but
	7717	* under the /x modifier, it could be separated by space (or a comment and
	7718	* \n) and this is not allowed (for consistency with \x{...} and the
	7719	* tokenizer handling of \N{NAME}). */
	7720	if (*RExC_parse != '{') {
	7721	vFAIL("Missing braces on \\N{}");
	7722	}
	7723
	7724	RExC_parse++; /* Skip past the '{' */
	7725
	7726	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	7727	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	7728	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	7729	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	7730	{
	7731	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	7732	vFAIL("\\N{NAME} must be resolved by the lexer");
	7733	}
	7734
	7735	if (endbrace == RExC_parse) { /* empty: \N{} */
	7736	if (! valuep) {
	7737	RExC_parse = endbrace + 1;
	7738	return reg_node(pRExC_state,NOTHING);
	7739	}
	7740
	7741	if (SIZE_ONLY) {
	7742	ckWARNreg(RExC_parse,
	7743	"Ignoring zero length \\N{} in character class"
	7744	);
	7745	RExC_parse = endbrace + 1;
	7746	}
	7747	*valuep = 0;
	7748	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	7749	}
	7750
	7751	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	7752	RExC_parse += 2; /* Skip past the 'U+' */
	7753
	7754	if (valuep) { /* In a bracketed char class */
	7755	/* We only pay attention to the first char of
	7756	multichar strings being returned. I kinda wonder
	7757	if this makes sense as it does change the behaviour
	7758	from earlier versions, OTOH that behaviour was broken
	7759	as well. XXX Solution is to recharacterize as
	7760	[rest-of-class]\|multi1\|multi2... */
	7761
	7762	STRLEN length_of_hex;
	7763	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	7764	\| PERL_SCAN_DISALLOW_PREFIX
	7765	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	7766
	7767	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	7768	if (endchar < endbrace) {
	7769	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	7770	}
	7771
	7772	length_of_hex = (STRLEN)(endchar - RExC_parse);
	7773	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	7774
	7775	/* The tokenizer should have guaranteed validity, but it's possible to
	7776	* bypass it by using single quoting, so check */
	7777	if (length_of_hex == 0
	7778	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	7779	{
	7780	RExC_parse += length_of_hex; /* Includes all the valid */
	7781	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	7782	? UTF8SKIP(RExC_parse)
	7783	: 1;
	7784	/* Guard against malformed utf8 */
	7785	if (RExC_parse >= endchar) RExC_parse = endchar;
	7786	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7787	}
	7788
	7789	RExC_parse = endbrace + 1;
	7790	if (endchar == endbrace) return NULL;
	7791
	7792	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	7793	}
	7794	else { /* Not a char class */
	7795	char s; / String to put in generated EXACT node */
	7796	STRLEN len = 0; /* Its current byte length */
	7797	char endchar; / Points to '.' or '}' ending cur char in the input
	7798	stream */
	7799	ret = reg_node(pRExC_state,
	7800	(U8) ((! FOLD) ? EXACT
	7801	: (LOC)
	7802	? EXACTFL
	7803	: (MORE_ASCII_RESTRICTED)
	7804	? EXACTFA
	7805	: (AT_LEAST_UNI_SEMANTICS)
	7806	? EXACTFU
	7807	: EXACTF));
	7808	s= STRING(ret);
	7809
	7810	/* Exact nodes can hold only a U8 length's of text = 255. Loop through
	7811	* the input which is of the form now 'c1.c2.c3...}' until find the
	7812	* ending brace or exceed length 255. The characters that exceed this
	7813	* limit are dropped. The limit could be relaxed should it become
	7814	* desirable by reparsing this as (?:\N{NAME}), so could generate
	7815	* multiple EXACT nodes, as is done for just regular input. But this
	7816	* is primarily a named character, and not intended to be a huge long
	7817	* string, so 255 bytes should be good enough */
	7818	while (1) {
	7819	STRLEN length_of_hex;
	7820	I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
	7821	\| PERL_SCAN_DISALLOW_PREFIX
	7822	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	7823	UV cp; /* Ord of current character */
	7824	bool use_this_char_fold = FOLD;
	7825
	7826	/* Code points are separated by dots. If none, there is only one
	7827	* code point, and is terminated by the brace */
	7828	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	7829
	7830	/* The values are Unicode even on EBCDIC machines */
	7831	length_of_hex = (STRLEN)(endchar - RExC_parse);
	7832	cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
	7833	if ( length_of_hex == 0
	7834	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	7835	{
	7836	RExC_parse += length_of_hex; /* Includes all the valid */
	7837	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	7838	? UTF8SKIP(RExC_parse)
	7839	: 1;
	7840	/* Guard against malformed utf8 */
	7841	if (RExC_parse >= endchar) RExC_parse = endchar;
	7842	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7843	}
	7844
	7845	/* XXX ? Change to ANYOF node
	7846	if (FOLD
	7847	&& (cp > 255 \|\| (! MORE_ASCII_RESTRICTED && ! LOC))
	7848	&& is_TRICKYFOLD_cp(cp))
	7849	{
	7850	}
	7851	*/
	7852
	7853	/* Under /aa, we can't mix ASCII with non- in a fold. If we are
	7854	* folding, and the source isn't ASCII, look through all the
	7855	* characters it folds to. If any one of them is ASCII, forbid
	7856	* this fold. (cp is uni, so the 127 below is correct even for
	7857	* EBCDIC). Similarly under locale rules, we don't mix under 256
	7858	* with above 255. XXX It really doesn't make sense to have \N{}
	7859	* which means a Unicode rules under locale. I (khw) think this
	7860	* should be warned about, but the counter argument is that people
	7861	* who have programmed around Perl's earlier lack of specifying the
	7862	* rules and used \N{} to force Unicode things in a local
	7863	* environment shouldn't get suddenly a warning */
	7864	if (use_this_char_fold) {
	7865	if (LOC && cp < 256) { /* Fold not known until run-time */
	7866	use_this_char_fold = FALSE;
	7867	}
	7868	else if ((cp > 127 && MORE_ASCII_RESTRICTED)
	7869	\|\| (cp > 255 && LOC))
	7870	{
	7871	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	7872	U8* s = tmpbuf;
	7873	U8* e;
	7874	STRLEN foldlen;
	7875
	7876	(void) toFOLD_uni(cp, tmpbuf, &foldlen);
	7877	e = s + foldlen;
	7878
	7879	while (s < e) {
	7880	if (isASCII(*s)
	7881	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	7882	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	7883	{
	7884	use_this_char_fold = FALSE;
	7885	break;
	7886	}
	7887	s += UTF8SKIP(s);
	7888	}
	7889	}
	7890	}
	7891
	7892	if (! use_this_char_fold) { /* Not folding, just append to the
	7893	string */
	7894	STRLEN unilen;
	7895
	7896	/* Quit before adding this character if would exceed limit */
	7897	if (len + UNISKIP(cp) > U8_MAX) break;
	7898
	7899	unilen = reguni(pRExC_state, cp, s);
	7900	if (unilen > 0) {
	7901	s += unilen;
	7902	len += unilen;
	7903	}
	7904	} else { /* Folding, output the folded equivalent */
	7905	STRLEN foldlen,numlen;
	7906	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	7907	cp = toFOLD_uni(cp, tmpbuf, &foldlen);
	7908
	7909	/* Quit before exceeding size limit */
	7910	if (len + foldlen > U8_MAX) break;
	7911
	7912	for (foldbuf = tmpbuf;
	7913	foldlen;
	7914	foldlen -= numlen)
	7915	{
	7916	cp = utf8_to_uvchr(foldbuf, &numlen);
	7917	if (numlen > 0) {
	7918	const STRLEN unilen = reguni(pRExC_state, cp, s);
	7919	s += unilen;
	7920	len += unilen;
	7921	/* In EBCDIC the numlen and unilen can differ. */
	7922	foldbuf += numlen;
	7923	if (numlen >= foldlen)
	7924	break;
	7925	}
	7926	else
	7927	break; /* "Can't happen." */
	7928	}
	7929	}
	7930
	7931	/* Point to the beginning of the next character in the sequence. */
	7932	RExC_parse = endchar + 1;
	7933
	7934	/* Quit if no more characters */
	7935	if (RExC_parse >= endbrace) break;
	7936	}
	7937
	7938
	7939	if (SIZE_ONLY) {
	7940	if (RExC_parse < endbrace) {
	7941	ckWARNreg(RExC_parse - 1,
	7942	"Using just the first characters returned by \\N{}");
	7943	}
	7944
	7945	RExC_size += STR_SZ(len);
	7946	} else {
	7947	STR_LEN(ret) = len;
	7948	RExC_emit += STR_SZ(len);
	7949	}
	7950
	7951	RExC_parse = endbrace + 1;
	7952
	7953	flagp \|= HASWIDTH; / Not SIMPLE, as that causes the engine to fail
	7954	with malformed in t/re/pat_advanced.t */
	7955	RExC_parse --;
	7956	Set_Node_Cur_Length(ret); /* MJD */
	7957	nextchar(pRExC_state);
	7958	}
	7959
	7960	return ret;
	7961	}
	7962
	7963
	7964	/*
	7965	* reg_recode
	7966	*
	7967	* It returns the code point in utf8 for the value in *encp.
	7968	* value: a code value in the source encoding
	7969	* encp: a pointer to an Encode object
	7970	*
	7971	* If the result from Encode is not a single character,
	7972	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	7973	*/
	7974	STATIC UV
	7975	S_reg_recode(pTHX_ const char value, SV **encp)
	7976	{
	7977	STRLEN numlen = 1;
	7978	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	7979	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	7980	const STRLEN newlen = SvCUR(sv);
	7981	UV uv = UNICODE_REPLACEMENT;
	7982
	7983	PERL_ARGS_ASSERT_REG_RECODE;
	7984
	7985	if (newlen)
	7986	uv = SvUTF8(sv)
	7987	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	7988	: (U8)s;
	7989
	7990	if (!newlen \|\| numlen != newlen) {
	7991	uv = UNICODE_REPLACEMENT;
	7992	*encp = NULL;
	7993	}
	7994	return uv;
	7995	}
	7996
	7997
	7998	/*
	7999	- regatom - the lowest level
	8000
	8001	Try to identify anything special at the start of the pattern. If there
	8002	is, then handle it as required. This may involve generating a single regop,
	8003	such as for an assertion; or it may involve recursing, such as to
	8004	handle a () structure.
	8005
	8006	If the string doesn't start with something special then we gobble up
	8007	as much literal text as we can.
	8008
	8009	Once we have been able to handle whatever type of thing started the
	8010	sequence, we return.
	8011
	8012	Note: we have to be careful with escapes, as they can be both literal
	8013	and special, and in the case of \10 and friends can be either, depending
	8014	on context. Specifically there are two separate switches for handling
	8015	escape sequences, with the one for handling literal escapes requiring
	8016	a dummy entry for all of the special escapes that are actually handled
	8017	by the other.
	8018	*/
	8019
	8020	STATIC regnode *
	8021	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	8022	{
	8023	dVAR;
	8024	register regnode *ret = NULL;
	8025	I32 flags;
	8026	char *parse_start = RExC_parse;
	8027	U8 op;
	8028	GET_RE_DEBUG_FLAGS_DECL;
	8029	DEBUG_PARSE("atom");
	8030	flagp = WORST; / Tentatively. */
	8031
	8032	PERL_ARGS_ASSERT_REGATOM;
	8033
	8034	tryagain:
	8035	switch ((U8)*RExC_parse) {
	8036	case '^':
	8037	RExC_seen_zerolen++;
	8038	nextchar(pRExC_state);
	8039	if (RExC_flags & RXf_PMf_MULTILINE)
	8040	ret = reg_node(pRExC_state, MBOL);
	8041	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8042	ret = reg_node(pRExC_state, SBOL);
	8043	else
	8044	ret = reg_node(pRExC_state, BOL);
	8045	Set_Node_Length(ret, 1); /* MJD */
	8046	break;
	8047	case '$':
	8048	nextchar(pRExC_state);
	8049	if (*RExC_parse)
	8050	RExC_seen_zerolen++;
	8051	if (RExC_flags & RXf_PMf_MULTILINE)
	8052	ret = reg_node(pRExC_state, MEOL);
	8053	else if (RExC_flags & RXf_PMf_SINGLELINE)
	8054	ret = reg_node(pRExC_state, SEOL);
	8055	else
	8056	ret = reg_node(pRExC_state, EOL);
	8057	Set_Node_Length(ret, 1); /* MJD */
	8058	break;
	8059	case '.':
	8060	nextchar(pRExC_state);
	8061	if (RExC_flags & RXf_PMf_SINGLELINE)
	8062	ret = reg_node(pRExC_state, SANY);
	8063	else
	8064	ret = reg_node(pRExC_state, REG_ANY);
	8065	*flagp \|= HASWIDTH\|SIMPLE;
	8066	RExC_naughty++;
	8067	Set_Node_Length(ret, 1); /* MJD */
	8068	break;
	8069	case '[':
	8070	{
	8071	char * const oregcomp_parse = ++RExC_parse;
	8072	ret = regclass(pRExC_state,depth+1);
	8073	if (*RExC_parse != ']') {
	8074	RExC_parse = oregcomp_parse;
	8075	vFAIL("Unmatched [");
	8076	}
	8077	nextchar(pRExC_state);
	8078	*flagp \|= HASWIDTH\|SIMPLE;
	8079	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	8080	break;
	8081	}
	8082	case '(':
	8083	nextchar(pRExC_state);
	8084	ret = reg(pRExC_state, 1, &flags,depth+1);
	8085	if (ret == NULL) {
	8086	if (flags & TRYAGAIN) {
	8087	if (RExC_parse == RExC_end) {
	8088	/* Make parent create an empty node if needed. */
	8089	*flagp \|= TRYAGAIN;
	8090	return(NULL);
	8091	}
	8092	goto tryagain;
	8093	}
	8094	return(NULL);
	8095	}
	8096	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	8097	break;
	8098	case '\|':
	8099	case ')':
	8100	if (flags & TRYAGAIN) {
	8101	*flagp \|= TRYAGAIN;
	8102	return NULL;
	8103	}
	8104	vFAIL("Internal urp");
	8105	/* Supposed to be caught earlier. */
	8106	break;
	8107	case '{':
	8108	if (!regcurly(RExC_parse)) {
	8109	RExC_parse++;
	8110	goto defchar;
	8111	}
	8112	/* FALL THROUGH */
	8113	case '?':
	8114	case '+':
	8115	case '*':
	8116	RExC_parse++;
	8117	vFAIL("Quantifier follows nothing");
	8118	break;
	8119	case LATIN_SMALL_LETTER_SHARP_S:
	8120	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	8121	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	8122	#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
	8123	#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
	8124	case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
	8125	#endif
	8126	do_foldchar:
	8127	if (!LOC && FOLD) {
	8128	U32 len,cp;
	8129	len=0; /* silence a spurious compiler warning */
	8130	if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
	8131	flagp \|= HASWIDTH; / could be SIMPLE too, but needs a handler in regexec.regrepeat */
	8132	RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
	8133	ret = reganode(pRExC_state, FOLDCHAR, cp);
	8134	Set_Node_Length(ret, 1); /* MJD */
	8135	nextchar(pRExC_state); /* kill whitespace under /x */
	8136	return ret;
	8137	}
	8138	}
	8139	goto outer_default;
	8140	case '\\':
	8141	/* Special Escapes
	8142
	8143	This switch handles escape sequences that resolve to some kind
	8144	of special regop and not to literal text. Escape sequences that
	8145	resolve to literal text are handled below in the switch marked
	8146	"Literal Escapes".
	8147
	8148	Every entry in this switch must have a corresponding entry
	8149	in the literal escape switch. However, the opposite is not
	8150	required, as the default for this switch is to jump to the
	8151	literal text handling code.
	8152	*/
	8153	switch ((U8)*++RExC_parse) {
	8154	case LATIN_SMALL_LETTER_SHARP_S:
	8155	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	8156	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	8157	goto do_foldchar;
	8158	/* Special Escapes */
	8159	case 'A':
	8160	RExC_seen_zerolen++;
	8161	ret = reg_node(pRExC_state, SBOL);
	8162	*flagp \|= SIMPLE;
	8163	goto finish_meta_pat;
	8164	case 'G':
	8165	ret = reg_node(pRExC_state, GPOS);
	8166	RExC_seen \|= REG_SEEN_GPOS;
	8167	*flagp \|= SIMPLE;
	8168	goto finish_meta_pat;
	8169	case 'K':
	8170	RExC_seen_zerolen++;
	8171	ret = reg_node(pRExC_state, KEEPS);
	8172	*flagp \|= SIMPLE;
	8173	/* XXX:dmq : disabling in-place substitution seems to
	8174	* be necessary here to avoid cases of memory corruption, as
	8175	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	8176	*/
	8177	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8178	goto finish_meta_pat;
	8179	case 'Z':
	8180	ret = reg_node(pRExC_state, SEOL);
	8181	*flagp \|= SIMPLE;
	8182	RExC_seen_zerolen++; /* Do not optimize RE away */
	8183	goto finish_meta_pat;
	8184	case 'z':
	8185	ret = reg_node(pRExC_state, EOS);
	8186	*flagp \|= SIMPLE;
	8187	RExC_seen_zerolen++; /* Do not optimize RE away */
	8188	goto finish_meta_pat;
	8189	case 'C':
	8190	ret = reg_node(pRExC_state, CANY);
	8191	RExC_seen \|= REG_SEEN_CANY;
	8192	*flagp \|= HASWIDTH\|SIMPLE;
	8193	goto finish_meta_pat;
	8194	case 'X':
	8195	ret = reg_node(pRExC_state, CLUMP);
	8196	*flagp \|= HASWIDTH;
	8197	goto finish_meta_pat;
	8198	case 'w':
	8199	switch (get_regex_charset(RExC_flags)) {
	8200	case REGEX_LOCALE_CHARSET:
	8201	op = ALNUML;
	8202	break;
	8203	case REGEX_UNICODE_CHARSET:
	8204	op = ALNUMU;
	8205	break;
	8206	case REGEX_ASCII_RESTRICTED_CHARSET:
	8207	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8208	op = ALNUMA;
	8209	break;
	8210	case REGEX_DEPENDS_CHARSET:
	8211	op = ALNUM;
	8212	break;
	8213	default:
	8214	goto bad_charset;
	8215	}
	8216	ret = reg_node(pRExC_state, op);
	8217	*flagp \|= HASWIDTH\|SIMPLE;
	8218	goto finish_meta_pat;
	8219	case 'W':
	8220	switch (get_regex_charset(RExC_flags)) {
	8221	case REGEX_LOCALE_CHARSET:
	8222	op = NALNUML;
	8223	break;
	8224	case REGEX_UNICODE_CHARSET:
	8225	op = NALNUMU;
	8226	break;
	8227	case REGEX_ASCII_RESTRICTED_CHARSET:
	8228	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8229	op = NALNUMA;
	8230	break;
	8231	case REGEX_DEPENDS_CHARSET:
	8232	op = NALNUM;
	8233	break;
	8234	default:
	8235	goto bad_charset;
	8236	}
	8237	ret = reg_node(pRExC_state, op);
	8238	*flagp \|= HASWIDTH\|SIMPLE;
	8239	goto finish_meta_pat;
	8240	case 'b':
	8241	RExC_seen_zerolen++;
	8242	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8243	switch (get_regex_charset(RExC_flags)) {
	8244	case REGEX_LOCALE_CHARSET:
	8245	op = BOUNDL;
	8246	break;
	8247	case REGEX_UNICODE_CHARSET:
	8248	op = BOUNDU;
	8249	break;
	8250	case REGEX_ASCII_RESTRICTED_CHARSET:
	8251	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8252	op = BOUNDA;
	8253	break;
	8254	case REGEX_DEPENDS_CHARSET:
	8255	op = BOUND;
	8256	break;
	8257	default:
	8258	goto bad_charset;
	8259	}
	8260	ret = reg_node(pRExC_state, op);
	8261	FLAGS(ret) = get_regex_charset(RExC_flags);
	8262	*flagp \|= SIMPLE;
	8263	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8264	ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
	8265	}
	8266	goto finish_meta_pat;
	8267	case 'B':
	8268	RExC_seen_zerolen++;
	8269	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8270	switch (get_regex_charset(RExC_flags)) {
	8271	case REGEX_LOCALE_CHARSET:
	8272	op = NBOUNDL;
	8273	break;
	8274	case REGEX_UNICODE_CHARSET:
	8275	op = NBOUNDU;
	8276	break;
	8277	case REGEX_ASCII_RESTRICTED_CHARSET:
	8278	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8279	op = NBOUNDA;
	8280	break;
	8281	case REGEX_DEPENDS_CHARSET:
	8282	op = NBOUND;
	8283	break;
	8284	default:
	8285	goto bad_charset;
	8286	}
	8287	ret = reg_node(pRExC_state, op);
	8288	FLAGS(ret) = get_regex_charset(RExC_flags);
	8289	*flagp \|= SIMPLE;
	8290	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8291	ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
	8292	}
	8293	goto finish_meta_pat;
	8294	case 's':
	8295	switch (get_regex_charset(RExC_flags)) {
	8296	case REGEX_LOCALE_CHARSET:
	8297	op = SPACEL;
	8298	break;
	8299	case REGEX_UNICODE_CHARSET:
	8300	op = SPACEU;
	8301	break;
	8302	case REGEX_ASCII_RESTRICTED_CHARSET:
	8303	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8304	op = SPACEA;
	8305	break;
	8306	case REGEX_DEPENDS_CHARSET:
	8307	op = SPACE;
	8308	break;
	8309	default:
	8310	goto bad_charset;
	8311	}
	8312	ret = reg_node(pRExC_state, op);
	8313	*flagp \|= HASWIDTH\|SIMPLE;
	8314	goto finish_meta_pat;
	8315	case 'S':
	8316	switch (get_regex_charset(RExC_flags)) {
	8317	case REGEX_LOCALE_CHARSET:
	8318	op = NSPACEL;
	8319	break;
	8320	case REGEX_UNICODE_CHARSET:
	8321	op = NSPACEU;
	8322	break;
	8323	case REGEX_ASCII_RESTRICTED_CHARSET:
	8324	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8325	op = NSPACEA;
	8326	break;
	8327	case REGEX_DEPENDS_CHARSET:
	8328	op = NSPACE;
	8329	break;
	8330	default:
	8331	goto bad_charset;
	8332	}
	8333	ret = reg_node(pRExC_state, op);
	8334	*flagp \|= HASWIDTH\|SIMPLE;
	8335	goto finish_meta_pat;
	8336	case 'd':
	8337	switch (get_regex_charset(RExC_flags)) {
	8338	case REGEX_LOCALE_CHARSET:
	8339	op = DIGITL;
	8340	break;
	8341	case REGEX_ASCII_RESTRICTED_CHARSET:
	8342	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8343	op = DIGITA;
	8344	break;
	8345	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8346	case REGEX_UNICODE_CHARSET:
	8347	op = DIGIT;
	8348	break;
	8349	default:
	8350	goto bad_charset;
	8351	}
	8352	ret = reg_node(pRExC_state, op);
	8353	*flagp \|= HASWIDTH\|SIMPLE;
	8354	goto finish_meta_pat;
	8355	case 'D':
	8356	switch (get_regex_charset(RExC_flags)) {
	8357	case REGEX_LOCALE_CHARSET:
	8358	op = NDIGITL;
	8359	break;
	8360	case REGEX_ASCII_RESTRICTED_CHARSET:
	8361	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8362	op = NDIGITA;
	8363	break;
	8364	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8365	case REGEX_UNICODE_CHARSET:
	8366	op = NDIGIT;
	8367	break;
	8368	default:
	8369	goto bad_charset;
	8370	}
	8371	ret = reg_node(pRExC_state, op);
	8372	*flagp \|= HASWIDTH\|SIMPLE;
	8373	goto finish_meta_pat;
	8374	case 'R':
	8375	ret = reg_node(pRExC_state, LNBREAK);
	8376	*flagp \|= HASWIDTH\|SIMPLE;
	8377	goto finish_meta_pat;
	8378	case 'h':
	8379	ret = reg_node(pRExC_state, HORIZWS);
	8380	*flagp \|= HASWIDTH\|SIMPLE;
	8381	goto finish_meta_pat;
	8382	case 'H':
	8383	ret = reg_node(pRExC_state, NHORIZWS);
	8384	*flagp \|= HASWIDTH\|SIMPLE;
	8385	goto finish_meta_pat;
	8386	case 'v':
	8387	ret = reg_node(pRExC_state, VERTWS);
	8388	*flagp \|= HASWIDTH\|SIMPLE;
	8389	goto finish_meta_pat;
	8390	case 'V':
	8391	ret = reg_node(pRExC_state, NVERTWS);
	8392	*flagp \|= HASWIDTH\|SIMPLE;
	8393	finish_meta_pat:
	8394	nextchar(pRExC_state);
	8395	Set_Node_Length(ret, 2); /* MJD */
	8396	break;
	8397	case 'p':
	8398	case 'P':
	8399	{
	8400	char* const oldregxend = RExC_end;
	8401	#ifdef DEBUGGING
	8402	char* parse_start = RExC_parse - 2;
	8403	#endif
	8404
	8405	if (RExC_parse[1] == '{') {
	8406	/* a lovely hack--pretend we saw [\pX] instead */
	8407	RExC_end = strchr(RExC_parse, '}');
	8408	if (!RExC_end) {
	8409	const U8 c = (U8)*RExC_parse;
	8410	RExC_parse += 2;
	8411	RExC_end = oldregxend;
	8412	vFAIL2("Missing right brace on \\%c{}", c);
	8413	}
	8414	RExC_end++;
	8415	}
	8416	else {
	8417	RExC_end = RExC_parse + 2;
	8418	if (RExC_end > oldregxend)
	8419	RExC_end = oldregxend;
	8420	}
	8421	RExC_parse--;
	8422
	8423	ret = regclass(pRExC_state,depth+1);
	8424
	8425	RExC_end = oldregxend;
	8426	RExC_parse--;
	8427
	8428	Set_Node_Offset(ret, parse_start + 2);
	8429	Set_Node_Cur_Length(ret);
	8430	nextchar(pRExC_state);
	8431	*flagp \|= HASWIDTH\|SIMPLE;
	8432	}
	8433	break;
	8434	case 'N':
	8435	/* Handle \N and \N{NAME} here and not below because it can be
	8436	multicharacter. join_exact() will join them up later on.
	8437	Also this makes sure that things like /\N{BLAH}+/ and
	8438	\N{BLAH} being multi char Just Happen. dmq*/
	8439	++RExC_parse;
	8440	ret= reg_namedseq(pRExC_state, NULL, flagp);
	8441	break;
	8442	case 'k': /* Handle \k<NAME> and \k'NAME' */
	8443	parse_named_seq:
	8444	{
	8445	char ch= RExC_parse[1];
	8446	if (ch != '<' && ch != '\'' && ch != '{') {
	8447	RExC_parse++;
	8448	vFAIL2("Sequence %.2s... not terminated",parse_start);
	8449	} else {
	8450	/* this pretty much dupes the code for (?P=...) in reg(), if
	8451	you change this make sure you change that */
	8452	char* name_start = (RExC_parse += 2);
	8453	U32 num = 0;
	8454	SV *sv_dat = reg_scan_name(pRExC_state,
	8455	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8456	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	8457	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	8458	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8459
	8460	if (!SIZE_ONLY) {
	8461	num = add_data( pRExC_state, 1, "S" );
	8462	RExC_rxi->data->data[num]=(void*)sv_dat;
	8463	SvREFCNT_inc_simple_void(sv_dat);
	8464	}
	8465
	8466	RExC_sawback = 1;
	8467	ret = reganode(pRExC_state,
	8468	((! FOLD)
	8469	? NREF
	8470	: (MORE_ASCII_RESTRICTED)
	8471	? NREFFA
	8472	: (AT_LEAST_UNI_SEMANTICS)
	8473	? NREFFU
	8474	: (LOC)
	8475	? NREFFL
	8476	: NREFF),
	8477	num);
	8478	*flagp \|= HASWIDTH;
	8479
	8480	/* override incorrect value set in reganode MJD */
	8481	Set_Node_Offset(ret, parse_start+1);
	8482	Set_Node_Cur_Length(ret); /* MJD */
	8483	nextchar(pRExC_state);
	8484
	8485	}
	8486	break;
	8487	}
	8488	case 'g':
	8489	case '1': case '2': case '3': case '4':
	8490	case '5': case '6': case '7': case '8': case '9':
	8491	{
	8492	I32 num;
	8493	bool isg = *RExC_parse == 'g';
	8494	bool isrel = 0;
	8495	bool hasbrace = 0;
	8496	if (isg) {
	8497	RExC_parse++;
	8498	if (*RExC_parse == '{') {
	8499	RExC_parse++;
	8500	hasbrace = 1;
	8501	}
	8502	if (*RExC_parse == '-') {
	8503	RExC_parse++;
	8504	isrel = 1;
	8505	}
	8506	if (hasbrace && !isDIGIT(*RExC_parse)) {
	8507	if (isrel) RExC_parse--;
	8508	RExC_parse -= 2;
	8509	goto parse_named_seq;
	8510	} }
	8511	num = atoi(RExC_parse);
	8512	if (isg && num == 0)
	8513	vFAIL("Reference to invalid group 0");
	8514	if (isrel) {
	8515	num = RExC_npar - num;
	8516	if (num < 1)
	8517	vFAIL("Reference to nonexistent or unclosed group");
	8518	}
	8519	if (!isg && num > 9 && num >= RExC_npar)
	8520	goto defchar;
	8521	else {
	8522	char * const parse_start = RExC_parse - 1; /* MJD */
	8523	while (isDIGIT(*RExC_parse))
	8524	RExC_parse++;
	8525	if (parse_start == RExC_parse - 1)
	8526	vFAIL("Unterminated \\g... pattern");
	8527	if (hasbrace) {
	8528	if (*RExC_parse != '}')
	8529	vFAIL("Unterminated \\g{...} pattern");
	8530	RExC_parse++;
	8531	}
	8532	if (!SIZE_ONLY) {
	8533	if (num > (I32)RExC_rx->nparens)
	8534	vFAIL("Reference to nonexistent group");
	8535	}
	8536	RExC_sawback = 1;
	8537	ret = reganode(pRExC_state,
	8538	((! FOLD)
	8539	? REF
	8540	: (MORE_ASCII_RESTRICTED)
	8541	? REFFA
	8542	: (AT_LEAST_UNI_SEMANTICS)
	8543	? REFFU
	8544	: (LOC)
	8545	? REFFL
	8546	: REFF),
	8547	num);
	8548	*flagp \|= HASWIDTH;
	8549
	8550	/* override incorrect value set in reganode MJD */
	8551	Set_Node_Offset(ret, parse_start+1);
	8552	Set_Node_Cur_Length(ret); /* MJD */
	8553	RExC_parse--;
	8554	nextchar(pRExC_state);
	8555	}
	8556	}
	8557	break;
	8558	case '\0':
	8559	if (RExC_parse >= RExC_end)
	8560	FAIL("Trailing \\");
	8561	/* FALL THROUGH */
	8562	default:
	8563	/* Do not generate "unrecognized" warnings here, we fall
	8564	back into the quick-grab loop below */
	8565	parse_start--;
	8566	goto defchar;
	8567	}
	8568	break;
	8569
	8570	case '#':
	8571	if (RExC_flags & RXf_PMf_EXTENDED) {
	8572	if ( reg_skipcomment( pRExC_state ) )
	8573	goto tryagain;
	8574	}
	8575	/* FALL THROUGH */
	8576
	8577	default:
	8578	outer_default:{
	8579	register STRLEN len;
	8580	register UV ender;
	8581	register char *p;
	8582	char *s;
	8583	STRLEN foldlen;
	8584	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	8585	regnode * orig_emit;
	8586
	8587	parse_start = RExC_parse - 1;
	8588
	8589	RExC_parse++;
	8590
	8591	defchar:
	8592	ender = 0;
	8593	orig_emit = RExC_emit; /* Save the original output node position in
	8594	case we need to output a different node
	8595	type */
	8596	ret = reg_node(pRExC_state,
	8597	(U8) ((! FOLD) ? EXACT
	8598	: (LOC)
	8599	? EXACTFL
	8600	: (MORE_ASCII_RESTRICTED)
	8601	? EXACTFA
	8602	: (AT_LEAST_UNI_SEMANTICS)
	8603	? EXACTFU
	8604	: EXACTF)
	8605	);
	8606	s = STRING(ret);
	8607	for (len = 0, p = RExC_parse - 1;
	8608	len < 127 && p < RExC_end;
	8609	len++)
	8610	{
	8611	char * const oldp = p;
	8612
	8613	if (RExC_flags & RXf_PMf_EXTENDED)
	8614	p = regwhite( pRExC_state, p );
	8615	switch ((U8)*p) {
	8616	case LATIN_SMALL_LETTER_SHARP_S:
	8617	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	8618	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	8619	if (LOC \|\| !FOLD \|\| !is_TRICKYFOLD_safe(p,RExC_end,UTF))
	8620	goto normal_default;
	8621	case '^':
	8622	case '$':
	8623	case '.':
	8624	case '[':
	8625	case '(':
	8626	case ')':
	8627	case '\|':
	8628	goto loopdone;
	8629	case '\\':
	8630	/* Literal Escapes Switch
	8631
	8632	This switch is meant to handle escape sequences that
	8633	resolve to a literal character.
	8634
	8635	Every escape sequence that represents something
	8636	else, like an assertion or a char class, is handled
	8637	in the switch marked 'Special Escapes' above in this
	8638	routine, but also has an entry here as anything that
	8639	isn't explicitly mentioned here will be treated as
	8640	an unescaped equivalent literal.
	8641	*/
	8642
	8643	switch ((U8)*++p) {
	8644	/* These are all the special escapes. */
	8645	case LATIN_SMALL_LETTER_SHARP_S:
	8646	case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
	8647	case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
	8648	if (LOC \|\| !FOLD \|\| !is_TRICKYFOLD_safe(p,RExC_end,UTF))
	8649	goto normal_default;
	8650	case 'A': /* Start assertion */
	8651	case 'b': case 'B': /* Word-boundary assertion*/
	8652	case 'C': /* Single char !DANGEROUS! */
	8653	case 'd': case 'D': /* digit class */
	8654	case 'g': case 'G': /* generic-backref, pos assertion */
	8655	case 'h': case 'H': /* HORIZWS */
	8656	case 'k': case 'K': /* named backref, keep marker */
	8657	case 'N': /* named char sequence */
	8658	case 'p': case 'P': /* Unicode property */
	8659	case 'R': /* LNBREAK */
	8660	case 's': case 'S': /* space class */
	8661	case 'v': case 'V': /* VERTWS */
	8662	case 'w': case 'W': /* word class */
	8663	case 'X': /* eXtended Unicode "combining character sequence" */
	8664	case 'z': case 'Z': /* End of line/string assertion */
	8665	--p;
	8666	goto loopdone;
	8667
	8668	/* Anything after here is an escape that resolves to a
	8669	literal. (Except digits, which may or may not)
	8670	*/
	8671	case 'n':
	8672	ender = '\n';
	8673	p++;
	8674	break;
	8675	case 'r':
	8676	ender = '\r';
	8677	p++;
	8678	break;
	8679	case 't':
	8680	ender = '\t';
	8681	p++;
	8682	break;
	8683	case 'f':
	8684	ender = '\f';
	8685	p++;
	8686	break;
	8687	case 'e':
	8688	ender = ASCII_TO_NATIVE('\033');
	8689	p++;
	8690	break;
	8691	case 'a':
	8692	ender = ASCII_TO_NATIVE('\007');
	8693	p++;
	8694	break;
	8695	case 'o':
	8696	{
	8697	STRLEN brace_len = len;
	8698	UV result;
	8699	const char* error_msg;
	8700
	8701	bool valid = grok_bslash_o(p,
	8702	&result,
	8703	&brace_len,
	8704	&error_msg,
	8705	1);
	8706	p += brace_len;
	8707	if (! valid) {
	8708	RExC_parse = p; /* going to die anyway; point
	8709	to exact spot of failure */
	8710	vFAIL(error_msg);
	8711	}
	8712	else
	8713	{
	8714	ender = result;
	8715	}
	8716	if (PL_encoding && ender < 0x100) {
	8717	goto recode_encoding;
	8718	}
	8719	if (ender > 0xff) {
	8720	REQUIRE_UTF8;
	8721	}
	8722	break;
	8723	}
	8724	case 'x':
	8725	if (*++p == '{') {
	8726	char* const e = strchr(p, '}');
	8727
	8728	if (!e) {
	8729	RExC_parse = p + 1;
	8730	vFAIL("Missing right brace on \\x{}");
	8731	}
	8732	else {
	8733	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8734	\| PERL_SCAN_DISALLOW_PREFIX;
	8735	STRLEN numlen = e - p - 1;
	8736	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	8737	if (ender > 0xff)
	8738	REQUIRE_UTF8;
	8739	p = e + 1;
	8740	}
	8741	}
	8742	else {
	8743	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	8744	STRLEN numlen = 2;
	8745	ender = grok_hex(p, &numlen, &flags, NULL);
	8746	p += numlen;
	8747	}
	8748	if (PL_encoding && ender < 0x100)
	8749	goto recode_encoding;
	8750	break;
	8751	case 'c':
	8752	p++;
	8753	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	8754	break;
	8755	case '0': case '1': case '2': case '3':case '4':
	8756	case '5': case '6': case '7': case '8':case '9':
	8757	if (*p == '0' \|\|
	8758	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	8759	{
	8760	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	8761	STRLEN numlen = 3;
	8762	ender = grok_oct(p, &numlen, &flags, NULL);
	8763	if (ender > 0xff) {
	8764	REQUIRE_UTF8;
	8765	}
	8766	p += numlen;
	8767	}
	8768	else {
	8769	--p;
	8770	goto loopdone;
	8771	}
	8772	if (PL_encoding && ender < 0x100)
	8773	goto recode_encoding;
	8774	break;
	8775	recode_encoding:
	8776	{
	8777	SV* enc = PL_encoding;
	8778	ender = reg_recode((const char)(U8)ender, &enc);
	8779	if (!enc && SIZE_ONLY)
	8780	ckWARNreg(p, "Invalid escape in the specified encoding");
	8781	REQUIRE_UTF8;
	8782	}
	8783	break;
	8784	case '\0':
	8785	if (p >= RExC_end)
	8786	FAIL("Trailing \\");
	8787	/* FALL THROUGH */
	8788	default:
	8789	if (!SIZE_ONLY&& isALPHA(*p)) {
	8790	/* Include any { following the alpha to emphasize
	8791	* that it could be part of an escape at some point
	8792	* in the future */
	8793	int len = (*(p + 1) == '{') ? 2 : 1;
	8794	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	8795	}
	8796	goto normal_default;
	8797	}
	8798	break;
	8799	default:
	8800	normal_default:
	8801	if (UTF8_IS_START(*p) && UTF) {
	8802	STRLEN numlen;
	8803	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	8804	&numlen, UTF8_ALLOW_DEFAULT);
	8805	p += numlen;
	8806	}
	8807	else
	8808	ender = (U8) *p++;
	8809	break;
	8810	} /* End of switch on the literal */
	8811
	8812	/* Certain characters are problematic because their folded
	8813	* length is so different from their original length that it
	8814	* isn't handleable by the optimizer. They are therefore not
	8815	* placed in an EXACTish node; and are here handled specially.
	8816	* (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
	8817	* putting it in a special node keeps regexec from having to
	8818	* deal with a non-utf8 multi-char fold */
	8819	if (FOLD
	8820	&& (ender > 255 \|\| (! MORE_ASCII_RESTRICTED && ! LOC))
	8821	&& is_TRICKYFOLD_cp(ender))
	8822	{
	8823	/* If is in middle of outputting characters into an
	8824	* EXACTish node, go output what we have so far, and
	8825	* position the parse so that this will be called again
	8826	* immediately */
	8827	if (len) {
	8828	p = oldp;
	8829	goto loopdone;
	8830	}
	8831	else {
	8832
	8833	/* Here we are ready to output our tricky fold
	8834	* character. What's done is to pretend it's in a
	8835	* [bracketed] class, and let the code that deals with
	8836	* those handle it, as that code has all the
	8837	* intelligence necessary. First save the current
	8838	* parse state, get rid of the already allocated EXACT
	8839	* node that the ANYOFV node will replace, and point
	8840	* the parse to a buffer which we fill with the
	8841	* character we want the regclass code to think is
	8842	* being parsed */
	8843	char* const oldregxend = RExC_end;
	8844	char tmpbuf[2];
	8845	RExC_emit = orig_emit;
	8846	RExC_parse = tmpbuf;
	8847	if (UTF) {
	8848	tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
	8849	tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
	8850	RExC_end = RExC_parse + 2;
	8851	}
	8852	else {
	8853	tmpbuf[0] = (char) ender;
	8854	RExC_end = RExC_parse + 1;
	8855	}
	8856
	8857	ret = regclass(pRExC_state,depth+1);
	8858
	8859	/* Here, have parsed the buffer. Reset the parse to
	8860	* the actual input, and return */
	8861	RExC_end = oldregxend;
	8862	RExC_parse = p - 1;
	8863
	8864	Set_Node_Offset(ret, RExC_parse);
	8865	Set_Node_Cur_Length(ret);
	8866	nextchar(pRExC_state);
	8867	*flagp \|= HASWIDTH\|SIMPLE;
	8868	return ret;
	8869	}
	8870	}
	8871
	8872	if ( RExC_flags & RXf_PMf_EXTENDED)
	8873	p = regwhite( pRExC_state, p );
	8874	if (UTF && FOLD) {
	8875	/* Prime the casefolded buffer. Locale rules, which apply
	8876	* only to code points < 256, aren't known until execution,
	8877	* so for them, just output the original character using
	8878	* utf8 */
	8879	if (LOC && ender < 256) {
	8880	if (UNI_IS_INVARIANT(ender)) {
	8881	*tmpbuf = (U8) ender;
	8882	foldlen = 1;
	8883	} else {
	8884	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	8885	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	8886	foldlen = 2;
	8887	}
	8888	}
	8889	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	8890	*/
	8891	ender = toLOWER(ender);
	8892	*tmpbuf = (U8) ender;
	8893	foldlen = 1;
	8894	}
	8895	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	8896
	8897	/* Locale and /aa require more selectivity about the
	8898	* fold, so are handled below. Otherwise, here, just
	8899	* use the fold */
	8900	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	8901	}
	8902	else {
	8903	/* Under locale rules or /aa we are not to mix,
	8904	* respectively, ords < 256 or ASCII with non-. So
	8905	* reject folds that mix them, using only the
	8906	* non-folded code point. So do the fold to a
	8907	* temporary, and inspect each character in it. */
	8908	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	8909	U8* s = trialbuf;
	8910	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	8911	U8* e = s + foldlen;
	8912	bool fold_ok = TRUE;
	8913
	8914	while (s < e) {
	8915	if (isASCII(*s)
	8916	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	8917	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	8918	{
	8919	fold_ok = FALSE;
	8920	break;
	8921	}
	8922	s += UTF8SKIP(s);
	8923	}
	8924	if (fold_ok) {
	8925	Copy(trialbuf, tmpbuf, foldlen, U8);
	8926	ender = tmpender;
	8927	}
	8928	else {
	8929	uvuni_to_utf8(tmpbuf, ender);
	8930	foldlen = UNISKIP(ender);
	8931	}
	8932	}
	8933	}
	8934	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	8935	if (len)
	8936	p = oldp;
	8937	else if (UTF) {
	8938	if (FOLD) {
	8939	/* Emit all the Unicode characters. */
	8940	STRLEN numlen;
	8941	for (foldbuf = tmpbuf;
	8942	foldlen;
	8943	foldlen -= numlen) {
	8944	ender = utf8_to_uvchr(foldbuf, &numlen);
	8945	if (numlen > 0) {
	8946	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8947	s += unilen;
	8948	len += unilen;
	8949	/* In EBCDIC the numlen
	8950	* and unilen can differ. */
	8951	foldbuf += numlen;
	8952	if (numlen >= foldlen)
	8953	break;
	8954	}
	8955	else
	8956	break; /* "Can't happen." */
	8957	}
	8958	}
	8959	else {
	8960	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8961	if (unilen > 0) {
	8962	s += unilen;
	8963	len += unilen;
	8964	}
	8965	}
	8966	}
	8967	else {
	8968	len++;
	8969	REGC((char)ender, s++);
	8970	}
	8971	break;
	8972	}
	8973	if (UTF) {
	8974	if (FOLD) {
	8975	/* Emit all the Unicode characters. */
	8976	STRLEN numlen;
	8977	for (foldbuf = tmpbuf;
	8978	foldlen;
	8979	foldlen -= numlen) {
	8980	ender = utf8_to_uvchr(foldbuf, &numlen);
	8981	if (numlen > 0) {
	8982	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8983	len += unilen;
	8984	s += unilen;
	8985	/* In EBCDIC the numlen
	8986	* and unilen can differ. */
	8987	foldbuf += numlen;
	8988	if (numlen >= foldlen)
	8989	break;
	8990	}
	8991	else
	8992	break;
	8993	}
	8994	}
	8995	else {
	8996	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8997	if (unilen > 0) {
	8998	s += unilen;
	8999	len += unilen;
	9000	}
	9001	}
	9002	len--;
	9003	}
	9004	else
	9005	REGC((char)ender, s++);
	9006	}
	9007	loopdone: /* Jumped to when encounters something that shouldn't be in
	9008	the node */
	9009	RExC_parse = p - 1;
	9010	Set_Node_Cur_Length(ret); /* MJD */
	9011	nextchar(pRExC_state);
	9012	{
	9013	/* len is STRLEN which is unsigned, need to copy to signed */
	9014	IV iv = len;
	9015	if (iv < 0)
	9016	vFAIL("Internal disaster");
	9017	}
	9018	if (len > 0)
	9019	*flagp \|= HASWIDTH;
	9020	if (len == 1 && UNI_IS_INVARIANT(ender))
	9021	*flagp \|= SIMPLE;
	9022
	9023	if (SIZE_ONLY)
	9024	RExC_size += STR_SZ(len);
	9025	else {
	9026	STR_LEN(ret) = len;
	9027	RExC_emit += STR_SZ(len);
	9028	}
	9029	}
	9030	break;
	9031	}
	9032
	9033	return(ret);
	9034
	9035	/* Jumped to when an unrecognized character set is encountered */
	9036	bad_charset:
	9037	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	9038	return(NULL);
	9039	}
	9040
	9041	STATIC char *
	9042	S_regwhite( RExC_state_t pRExC_state, char p )
	9043	{
	9044	const char *e = RExC_end;
	9045
	9046	PERL_ARGS_ASSERT_REGWHITE;
	9047
	9048	while (p < e) {
	9049	if (isSPACE(*p))
	9050	++p;
	9051	else if (*p == '#') {
	9052	bool ended = 0;
	9053	do {
	9054	if (*p++ == '\n') {
	9055	ended = 1;
	9056	break;
	9057	}
	9058	} while (p < e);
	9059	if (!ended)
	9060	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	9061	}
	9062	else
	9063	break;
	9064	}
	9065	return p;
	9066	}
	9067
	9068	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	9069	Character classes ([:foo:]) can also be negated ([:^foo:]).
	9070	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	9071	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	9072	but trigger failures because they are currently unimplemented. */
	9073
	/* Classify the character that follows a '[' inside a bracketed class:
	 *   POSIXCC_DONE    ':'         -- [:foo:], which is implemented
	 *   POSIXCC_NOTYET  '=' or '.'  -- [=foo=] and [.foo.], not yet implemented
	 *   POSIXCC         any of the three */
	#define POSIXCC_DONE(c) ((c) == ':')
	#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
	#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
	9077
	9078	STATIC I32
	9079	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	9080	{
	9081	dVAR;
	9082	I32 namedclass = OOB_NAMEDCLASS;
	9083
	9084	PERL_ARGS_ASSERT_REGPPOSIXCC;
	9085
	9086	if (value == '[' && RExC_parse + 1 < RExC_end &&
	9087	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	9088	POSIXCC(UCHARAT(RExC_parse))) {
	9089	const char c = UCHARAT(RExC_parse);
	9090	char* const s = RExC_parse++;
	9091
	9092	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	9093	RExC_parse++;
	9094	if (RExC_parse == RExC_end)
	9095	/* Grandfather lone [:, [=, [. */
	9096	RExC_parse = s;
	9097	else {
	9098	const char* const t = RExC_parse++; /* skip over the c */
	9099	assert(*t == c);
	9100
	9101	if (UCHARAT(RExC_parse) == ']') {
	9102	const char *posixcc = s + 1;
	9103	RExC_parse++; /* skip over the ending ] */
	9104
	9105	if (*s == ':') {
	9106	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	9107	const I32 skip = t - posixcc;
	9108
	9109	/* Initially switch on the length of the name. */
	9110	switch (skip) {
	9111	case 4:
	9112	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	9113	namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	9114	break;
	9115	case 5:
	9116	/* Names all of length 5. */
	9117	/* alnum alpha ascii blank cntrl digit graph lower
	9118	print punct space upper */
	9119	/* Offset 4 gives the best switch position. */
	9120	switch (posixcc[4]) {
	9121	case 'a':
	9122	if (memEQ(posixcc, "alph", 4)) /* alpha */
	9123	namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	9124	break;
	9125	case 'e':
	9126	if (memEQ(posixcc, "spac", 4)) /* space */
	9127	namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	9128	break;
	9129	case 'h':
	9130	if (memEQ(posixcc, "grap", 4)) /* graph */
	9131	namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	9132	break;
	9133	case 'i':
	9134	if (memEQ(posixcc, "asci", 4)) /* ascii */
	9135	namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	9136	break;
	9137	case 'k':
	9138	if (memEQ(posixcc, "blan", 4)) /* blank */
	9139	namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	9140	break;
	9141	case 'l':
	9142	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	9143	namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	9144	break;
	9145	case 'm':
	9146	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	9147	namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	9148	break;
	9149	case 'r':
	9150	if (memEQ(posixcc, "lowe", 4)) /* lower */
	9151	namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	9152	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	9153	namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	9154	break;
	9155	case 't':
	9156	if (memEQ(posixcc, "digi", 4)) /* digit */
	9157	namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	9158	else if (memEQ(posixcc, "prin", 4)) /* print */
	9159	namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	9160	else if (memEQ(posixcc, "punc", 4)) /* punct */
	9161	namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	9162	break;
	9163	}
	9164	break;
	9165	case 6:
	9166	if (memEQ(posixcc, "xdigit", 6))
	9167	namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	9168	break;
	9169	}
	9170
	9171	if (namedclass == OOB_NAMEDCLASS)
	9172	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	9173	t - s - 1, s + 1);
	9174	assert (posixcc[skip] == ':');
	9175	assert (posixcc[skip+1] == ']');
	9176	} else if (!SIZE_ONLY) {
	9177	/* [[=foo=]] and [[.foo.]] are still future. */
	9178
	9179	/* adjust RExC_parse so the warning shows after
	9180	the class closes */
	9181	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	9182	RExC_parse++;
	9183	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9184	}
	9185	} else {
	9186	/* Maternal grandfather:
	9187	* "[:" ending in ":" but not in ":]" */
	9188	RExC_parse = s;
	9189	}
	9190	}
	9191	}
	9192
	9193	return namedclass;
	9194	}
	9195
	9196	STATIC void
	9197	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	9198	{
	9199	dVAR;
	9200
	9201	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	9202
	9203	if (POSIXCC(UCHARAT(RExC_parse))) {
	9204	const char *s = RExC_parse;
	9205	const char c = *s++;
	9206
	9207	while (isALNUM(*s))
	9208	s++;
	9209	if (s && c == s && s[1] == ']') {
	9210	ckWARN3reg(s+2,
	9211	"POSIX syntax [%c %c] belongs inside character classes",
	9212	c, c);
	9213
	9214	/* [[=foo=]] and [[.foo.]] are still future. */
	9215	if (POSIXCC_NOTYET(c)) {
	9216	/* adjust RExC_parse so the error shows after
	9217	the class closes */
	9218	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	9219	NOOP;
	9220	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9221	}
	9222	}
	9223	}
	9224	}
	9225
	/* No locale test, and always Unicode semantics.
	 *
	 * Expands to the bodies of two consecutive switch cases, ANYOF_<NAME> and
	 * ANYOF_N<NAME>.  The expansion begins with a bare "ANYOF_<NAME>:" and ends
	 * with a 'break' that has no semicolon, so it is presumably meant to be
	 * written as "case _C_C_T_NOLOC_(...);".
	 *
	 * For every 'value' from 0 through 255 for which TEST is true (false in the
	 * N<NAME> case), set_regclass_bit() is called and its result added to
	 * 'stored'.  Afterwards 'yesno' is set to '+' ('!' in the N<NAME> case) and
	 * 'what' to WORD.  TEST is an expression that is expected to look at
	 * 'value'. */
	#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
	ANYOF_##NAME: \
	    for (value = 0; value < 256; value++) { \
	        if (TEST) { \
	            stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    what = WORD; \
	    yesno = '+'; \
	    break; \
	case ANYOF_N##NAME: \
	    for (value = 0; value < 256; value++) { \
	        if (!TEST) { \
	            stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    what = WORD; \
	    yesno = '!'; \
	    break
	9242
	/* Like the above, but there are differences if we are in uni-8-bit or not, so
	 * there are two tests passed in, to use depending on that. There aren't any
	 * cases where the label is different from the name, so no need for that
	 * parameter.
	 *
	 * Expands to the bodies of two consecutive switch cases, ANYOF_<NAME> and
	 * ANYOF_N<NAME>:
	 *   - under LOC, only the class bit is recorded with ANYOF_CLASS_SET();
	 *   - under UNI_SEMANTICS, TEST_8 is applied to each value 0..255;
	 *   - otherwise TEST_7 is applied to UNI_TO_NATIVE(value) for 0..127, and
	 *     the complemented case additionally deals with 128..255 as explained
	 *     in the comment inside the macro.
	 * Matching values are added with set_regclass_bit(), whose result is
	 * accumulated in 'stored'.  'yesno' is set to '+' or '!' and 'what' to
	 * WORD. */
	#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
	ANYOF_##NAME: \
	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
	    else if (UNI_SEMANTICS) { \
	        for (value = 0; value < 256; value++) { \
	            if (TEST_8(value)) stored += \
	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    else { \
	        for (value = 0; value < 128; value++) { \
	            if (TEST_7(UNI_TO_NATIVE(value))) stored += \
	                set_regclass_bit(pRExC_state, ret, \
	                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    yesno = '+'; \
	    what = WORD; \
	    break; \
	case ANYOF_N##NAME: \
	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
	    else if (UNI_SEMANTICS) { \
	        for (value = 0; value < 256; value++) { \
	            if (! TEST_8(value)) stored += \
	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	        } \
	    } \
	    else { \
	        for (value = 0; value < 128; value++) { \
	            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
	                pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	        } \
	        if (AT_LEAST_ASCII_RESTRICTED) { \
	            for (value = 128; value < 256; value++) { \
	                stored += set_regclass_bit( \
	                    pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	            } \
	            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
	        } \
	        else { \
	            /* For a non-utf8 target string with DEPENDS semantics, all above \
	             * ASCII Latin1 code points match the complement of any of the \
	             * classes. But in utf8, they have their Unicode semantics, so \
	             * can't just set them in the bitmap, or else regexec.c will think \
	             * they matched when they shouldn't. */ \
	            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
	        } \
	    } \
	    yesno = '!'; \
	    what = WORD; \
	    break
	9298
	9299	STATIC U8
	9300	S_set_regclass_bit_fold(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, HV invlist_ptr, AV alternate_ptr)
	9301	{
	9302
	9303	/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	9304	* Locale folding is done at run-time, so this function should not be
	9305	* called for nodes that are for locales.
	9306	*
	9307	* This function sets the bit corresponding to the fold of the input
	9308	* 'value', if not already set. The fold of 'f' is 'F', and the fold of
	9309	* 'F' is 'f'.
	9310	*
	9311	* It also knows about the characters that are in the bitmap that have
	9312	* folds that are matchable only outside it, and sets the appropriate lists
	9313	* and flags.
	9314	*
	9315	* It returns the number of bits that actually changed from 0 to 1 */
	9316
	9317	U8 stored = 0;
	9318	U8 fold;
	9319
	9320	PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
	9321
	9322	fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	9323	: PL_fold[value];
	9324
	9325	/* It assumes the bit for 'value' has already been set */
	9326	if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	9327	ANYOF_BITMAP_SET(node, fold);
	9328	stored++;
	9329	}
	9330	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) \|\| ! MORE_ASCII_RESTRICTED)) {
	9331	/* Certain Latin1 characters have matches outside the bitmap. To get
	9332	* here, 'value' is one of those characters. None of these matches is
	9333	* valid for ASCII characters under /aa, which have been excluded by
	9334	* the 'if' above. The matches fall into three categories:
	9335	* 1) They are singly folded-to or -from an above 255 character, as
	9336	* LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
	9337	* WITH DIAERESIS;
	9338	* 2) They are part of a multi-char fold with another character in the
	9339	* bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
	9340	* 3) They are part of a multi-char fold with a character not in the
	9341	* bitmap, such as various ligatures.
	9342	* We aren't dealing fully with multi-char folds, except we do deal
	9343	* with the pattern containing a character that has a multi-char fold
	9344	* (not so much the inverse).
	9345	* For types 1) and 3), the matches only happen when the target string
	9346	* is utf8; that's not true for 2), and we set a flag for it.
	9347	*
	9348	* The code below adds to the passed in inversion list the single fold
	9349	* closures for 'value'. The values are hard-coded here so that an
	9350	* innocent-looking character class, like /[ks]/i won't have to go out
	9351	* to disk to find the possible matches. XXX It would be better to
	9352	* generate these via regen, in case a new version of the Unicode
	9353	* standard adds new mappings, though that is not really likely. */
	9354	switch (value) {
	9355	case 'k':
	9356	case 'K':
	9357	/* KELVIN SIGN */
	9358	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212A);
	9359	break;
	9360	case 's':
	9361	case 'S':
	9362	/* LATIN SMALL LETTER LONG S */
	9363	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x017F);
	9364	break;
	9365	case MICRO_SIGN:
	9366	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9367	GREEK_SMALL_LETTER_MU);
	9368	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9369	GREEK_CAPITAL_LETTER_MU);
	9370	break;
	9371	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	9372	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	9373	/* ANGSTROM SIGN */
	9374	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x212B);
	9375	if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
	9376	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9377	PL_fold_latin1[value]);
	9378	}
	9379	break;
	9380	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	9381	invlist_ptr = add_cp_to_invlist(invlist_ptr,
	9382	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	9383	break;
	9384	case LATIN_SMALL_LETTER_SHARP_S:
	9385	/* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
	9386	invlist_ptr = add_cp_to_invlist(invlist_ptr, 0x1E9E);
	9387
	9388	/* Under /a, /d, and /u, this can match the two chars "ss" */
	9389	if (! MORE_ASCII_RESTRICTED) {
	9390	add_alternate(alternate_ptr, (U8 *) "ss", 2);
	9391
	9392	/* And under /u or /a, it can match even if the target is
	9393	* not utf8 */
	9394	if (AT_LEAST_UNI_SEMANTICS) {
	9395	ANYOF_FLAGS(node) \|= ANYOF_NONBITMAP_NON_UTF8;
	9396	}
	9397	}
	9398	break;
	9399	case 'F': case 'f':
	9400	case 'I': case 'i':
	9401	case 'L': case 'l':
	9402	case 'T': case 't':
	9403	/* These all are targets of multi-character folds, which can
	9404	* occur with only non-Latin1 characters in the fold, so they
	9405	* can match if the target string isn't UTF-8 */
	9406	ANYOF_FLAGS(node) \|= ANYOF_NONBITMAP_NON_UTF8;
	9407	break;
	9408	case 'A': case 'a':
	9409	case 'H': case 'h':
	9410	case 'J': case 'j':
	9411	case 'N': case 'n':
	9412	case 'W': case 'w':
	9413	case 'Y': case 'y':
	9414	/* These all are targets of multi-character folds, which occur
	9415	* only with a non-Latin1 character as part of the fold, so
	9416	* they can't match unless the target string is in UTF-8, so no
	9417	* action here is necessary */
	9418	break;
	9419	default:
	9420	/* Use deprecated warning to increase the chances of this
	9421	* being output */
	9422	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
	9423	break;
	9424	}
	9425	}
	9426	else if (DEPENDS_SEMANTICS
	9427	&& ! isASCII(value)
	9428	&& PL_fold_latin1[value] != value)
	9429	{
	9430	/* Under DEPENDS rules, non-ASCII Latin1 characters match their
	9431	* folds only when the target string is in UTF-8. We add the fold
	9432	* here to the list of things to match outside the bitmap, which
	9433	* won't be looked at unless it is UTF8 (or else if something else
	9434	* says to look even if not utf8, but those things better not happen
	9435	* under DEPENDS semantics. */
	9436	invlist_ptr = add_cp_to_invlist(invlist_ptr, PL_fold_latin1[value]);
	9437	}
	9438
	9439	return stored;
	9440	}
	9441
	9442
	9443	PERL_STATIC_INLINE U8
	9444	S_set_regclass_bit(pTHX_ RExC_state_t pRExC_state, regnode node, const U8 value, HV invlist_ptr, AV alternate_ptr)
	9445	{
	9446	/* This inline function sets a bit in the bitmap if not already set, and if
	9447	* appropriate, its fold, returning the number of bits that actually
	9448	* changed from 0 to 1 */
	9449
	9450	U8 stored;
	9451
	9452	PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
	9453
	9454	if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	9455	return 0;
	9456	}
	9457
	9458	ANYOF_BITMAP_SET(node, value);
	9459	stored = 1;
	9460
	9461	if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	9462	stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
	9463	}
	9464
	9465	return stored;
	9466	}
	9467
	9468	STATIC void
	9469	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	9470	{
	9471	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	9472	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	9473	* the multi-character folds of characters in the node */
	9474	SV *sv;
	9475
	9476	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	9477
	9478	if (! *alternate_ptr) {
	9479	*alternate_ptr = newAV();
	9480	}
	9481	sv = newSVpvn_utf8((char*)string, len, TRUE);
	9482	av_push(*alternate_ptr, sv);
	9483	return;
	9484	}
	9485
	9486	/*
	9487	parse a class specification and produce either an ANYOF node that
	9488	matches the pattern or perhaps will be optimized into an EXACTish node
	9489	instead. The node contains a bit map for the first 256 characters, with the
	9490	corresponding bit set if that character is in the list. For characters
	9491	above 255, a range list is used */
	9492
	9493	STATIC regnode *
	9494	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	9495	{
	9496	dVAR;
	9497	register UV nextvalue;
	9498	register IV prevvalue = OOB_UNICODE;
	9499	register IV range = 0;
	9500	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	9501	register regnode *ret;
	9502	STRLEN numlen;
	9503	IV namedclass;
	9504	char *rangebegin = NULL;
	9505	bool need_class = 0;
	9506	SV *listsv = NULL;
	9507	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	9508	than just initialized. */
	9509	UV n;
	9510
	9511	/* code points this node matches that can't be stored in the bitmap */
	9512	HV* nonbitmap = NULL;
	9513
	9514	/* The items that are to match that aren't stored in the bitmap, but are a
	9515	* result of things that are stored there. This is the fold closure of
	9516	* such a character, either because it has DEPENDS semantics and shouldn't
	9517	* be matched unless the target string is utf8, or is a code point that is
	9518	* too large for the bit map, as for example, the fold of the MICRO SIGN is
	9519	* above 255. This all is solely for performance reasons. By having this
	9520	* code know the outside-the-bitmap folds that the bitmapped characters are
	9521	* involved with, we don't have to go out to disk to find the list of
	9522	* matches, unless the character class includes code points that aren't
	9523	* storable in the bit map. That means that a character class with an 's'
	9524	* in it, for example, doesn't need to go out to disk to find everything
	9525	* that matches. A 2nd list is used so that the 'nonbitmap' list is kept
	9526	* empty unless there is something whose fold we don't know about, and will
	9527	* have to go out to the disk to find. */
	9528	HV* l1_fold_invlist = NULL;
	9529
	9530	/* List of multi-character folds that are matched by this node */
	9531	AV* unicode_alternate = NULL;
	9532	#ifdef EBCDIC
	9533	UV literal_endpoint = 0;
	9534	#endif
	9535	UV stored = 0; /* how many chars stored in the bitmap */
	9536
	9537	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	9538	case we need to change the emitted regop to an EXACT. */
	9539	const char * orig_parse = RExC_parse;
	9540	GET_RE_DEBUG_FLAGS_DECL;
	9541
	9542	PERL_ARGS_ASSERT_REGCLASS;
	9543	#ifndef DEBUGGING
	9544	PERL_UNUSED_ARG(depth);
	9545	#endif
	9546
	9547	DEBUG_PARSE("clas");
	9548
	9549	/* Assume we are going to generate an ANYOF node. */
	9550	ret = reganode(pRExC_state, ANYOF, 0);
	9551
	9552
	9553	if (!SIZE_ONLY) {
	9554	ANYOF_FLAGS(ret) = 0;
	9555	}
	9556
	9557	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	9558	RExC_naughty++;
	9559	RExC_parse++;
	9560	if (!SIZE_ONLY)
	9561	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	9562	}
	9563
	9564	if (SIZE_ONLY) {
	9565	RExC_size += ANYOF_SKIP;
	9566	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	9567	}
	9568	else {
	9569	RExC_emit += ANYOF_SKIP;
	9570	if (LOC) {
	9571	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	9572	}
	9573	ANYOF_BITMAP_ZERO(ret);
	9574	listsv = newSVpvs("# comment\n");
	9575	initial_listsv_len = SvCUR(listsv);
	9576	}
	9577
	9578	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	9579
	9580	if (!SIZE_ONLY && POSIXCC(nextvalue))
	9581	checkposixcc(pRExC_state);
	9582
	9583	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	9584	if (UCHARAT(RExC_parse) == ']')
	9585	goto charclassloop;
	9586
	9587	parseit:
	9588	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	9589
	9590	charclassloop:
	9591
	9592	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	9593
	9594	if (!range)
	9595	rangebegin = RExC_parse;
	9596	if (UTF) {
	9597	value = utf8n_to_uvchr((U8*)RExC_parse,
	9598	RExC_end - RExC_parse,
	9599	&numlen, UTF8_ALLOW_DEFAULT);
	9600	RExC_parse += numlen;
	9601	}
	9602	else
	9603	value = UCHARAT(RExC_parse++);
	9604
	9605	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	9606	if (value == '[' && POSIXCC(nextvalue))
	9607	namedclass = regpposixcc(pRExC_state, value);
	9608	else if (value == '\\') {
	9609	if (UTF) {
	9610	value = utf8n_to_uvchr((U8*)RExC_parse,
	9611	RExC_end - RExC_parse,
	9612	&numlen, UTF8_ALLOW_DEFAULT);
	9613	RExC_parse += numlen;
	9614	}
	9615	else
	9616	value = UCHARAT(RExC_parse++);
	9617	/* Some compilers cannot handle switching on 64-bit integer
	9618	* values, therefore value cannot be an UV. Yes, this will
	9619	* be a problem later if we want switch on Unicode.
	9620	* A similar issue a little bit later when switching on
	9621	* namedclass. --jhi */
	9622	switch ((I32)value) {
	9623	case 'w': namedclass = ANYOF_ALNUM; break;
	9624	case 'W': namedclass = ANYOF_NALNUM; break;
	9625	case 's': namedclass = ANYOF_SPACE; break;
	9626	case 'S': namedclass = ANYOF_NSPACE; break;
	9627	case 'd': namedclass = ANYOF_DIGIT; break;
	9628	case 'D': namedclass = ANYOF_NDIGIT; break;
	9629	case 'v': namedclass = ANYOF_VERTWS; break;
	9630	case 'V': namedclass = ANYOF_NVERTWS; break;
	9631	case 'h': namedclass = ANYOF_HORIZWS; break;
	9632	case 'H': namedclass = ANYOF_NHORIZWS; break;
	9633	case 'N': /* Handle \N{NAME} in class */
	9634	{
	9635	/* We only pay attention to the first char of
	9636	multichar strings being returned. I kinda wonder
	9637	if this makes sense as it does change the behaviour
	9638	from earlier versions, OTOH that behaviour was broken
	9639	as well. */
	9640	UV v; /* value is register so we cant & it /grrr */
	9641	if (reg_namedseq(pRExC_state, &v, NULL)) {
	9642	goto parseit;
	9643	}
	9644	value= v;
	9645	}
	9646	break;
	9647	case 'p':
	9648	case 'P':
	9649	{
	9650	char *e;
	9651	if (RExC_parse >= RExC_end)
	9652	vFAIL2("Empty \\%c{}", (U8)value);
	9653	if (*RExC_parse == '{') {
	9654	const U8 c = (U8)value;
	9655	e = strchr(RExC_parse++, '}');
	9656	if (!e)
	9657	vFAIL2("Missing right brace on \\%c{}", c);
	9658	while (isSPACE(UCHARAT(RExC_parse)))
	9659	RExC_parse++;
	9660	if (e == RExC_parse)
	9661	vFAIL2("Empty \\%c{}", c);
	9662	n = e - RExC_parse;
	9663	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	9664	n--;
	9665	}
	9666	else {
	9667	e = RExC_parse;
	9668	n = 1;
	9669	}
	9670	if (!SIZE_ONLY) {
	9671	if (UCHARAT(RExC_parse) == '^') {
	9672	RExC_parse++;
	9673	n--;
	9674	value = value == 'p' ? 'P' : 'p'; /* toggle */
	9675	while (isSPACE(UCHARAT(RExC_parse))) {
	9676	RExC_parse++;
	9677	n--;
	9678	}
	9679	}
	9680
	9681	/* Add the property name to the list. If /i matching, give
	9682	* a different name which consists of the normal name
	9683	* sandwiched between two underscores and '_i'. The design
	9684	* is discussed in the commit message for this. */
	9685	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
	9686	(value=='p' ? '+' : '!'),
	9687	(FOLD) ? "__" : "",
	9688	(int)n,
	9689	RExC_parse,
	9690	(FOLD) ? "_i" : ""
	9691	);
	9692	}
	9693	RExC_parse = e + 1;
	9694
	9695	/* The \p could match something in the Latin1 range, hence
	9696	* something that isn't utf8 */
	9697	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	9698	namedclass = ANYOF_MAX; /* no official name, but it's named */
	9699
	9700	/* \p means they want Unicode semantics */
	9701	RExC_uni_semantics = 1;
	9702	}
	9703	break;
	9704	case 'n': value = '\n'; break;
	9705	case 'r': value = '\r'; break;
	9706	case 't': value = '\t'; break;
	9707	case 'f': value = '\f'; break;
	9708	case 'b': value = '\b'; break;
	9709	case 'e': value = ASCII_TO_NATIVE('\033');break;
	9710	case 'a': value = ASCII_TO_NATIVE('\007');break;
	9711	case 'o':
	9712	RExC_parse--; /* function expects to be pointed at the 'o' */
	9713	{
	9714	const char* error_msg;
	9715	bool valid = grok_bslash_o(RExC_parse,
	9716	&value,
	9717	&numlen,
	9718	&error_msg,
	9719	SIZE_ONLY);
	9720	RExC_parse += numlen;
	9721	if (! valid) {
	9722	vFAIL(error_msg);
	9723	}
	9724	}
	9725	if (PL_encoding && value < 0x100) {
	9726	goto recode_encoding;
	9727	}
	9728	break;
	9729	case 'x':
	9730	if (*RExC_parse == '{') {
	9731	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	9732	\| PERL_SCAN_DISALLOW_PREFIX;
	9733	char * const e = strchr(RExC_parse++, '}');
	9734	if (!e)
	9735	vFAIL("Missing right brace on \\x{}");
	9736
	9737	numlen = e - RExC_parse;
	9738	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	9739	RExC_parse = e + 1;
	9740	}
	9741	else {
	9742	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	9743	numlen = 2;
	9744	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	9745	RExC_parse += numlen;
	9746	}
	9747	if (PL_encoding && value < 0x100)
	9748	goto recode_encoding;
	9749	break;
	9750	case 'c':
	9751	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	9752	break;
	9753	case '0': case '1': case '2': case '3': case '4':
	9754	case '5': case '6': case '7':
	9755	{
	9756	/* Take 1-3 octal digits */
	9757	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	9758	numlen = 3;
	9759	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	9760	RExC_parse += numlen;
	9761	if (PL_encoding && value < 0x100)
	9762	goto recode_encoding;
	9763	break;
	9764	}
	9765	recode_encoding:
	9766	{
	9767	SV* enc = PL_encoding;
	9768	value = reg_recode((const char)(U8)value, &enc);
	9769	if (!enc && SIZE_ONLY)
	9770	ckWARNreg(RExC_parse,
	9771	"Invalid escape in the specified encoding");
	9772	break;
	9773	}
	9774	default:
	9775	/* Allow \_ to not give an error */
	9776	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	9777	ckWARN2reg(RExC_parse,
	9778	"Unrecognized escape \\%c in character class passed through",
	9779	(int)value);
	9780	}
	9781	break;
	9782	}
	9783	} /* end of \blah */
	9784	#ifdef EBCDIC
	9785	else
	9786	literal_endpoint++;
	9787	#endif
	9788
	9789	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	9790
	9791	/* What matches in a locale is not known until runtime, so need to
	9792	* (one time per class) allocate extra space to pass to regexec.
	9793	* The space will contain a bit for each named class that is to be
	9794	* matched against. This isn't needed for \p{} and pseudo-classes,
	9795	* as they are not affected by locale, and hence are dealt with
	9796	* separately */
	9797	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	9798	need_class = 1;
	9799	if (SIZE_ONLY) {
	9800	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	9801	}
	9802	else {
	9803	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	9804	ANYOF_CLASS_ZERO(ret);
	9805	}
	9806	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	9807	}
	9808
	9809	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	9810	* literal, as is the character that began the false range, i.e.
	9811	* the 'a' in the examples */
	9812	if (range) {
	9813	if (!SIZE_ONLY) {
	9814	const int w =
	9815	RExC_parse >= rangebegin ?
	9816	RExC_parse - rangebegin : 0;
	9817	ckWARN4reg(RExC_parse,
	9818	"False [] range \"%.s\"",
	9819	w, w, rangebegin);
	9820
	9821	stored +=
	9822	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	9823	if (prevvalue < 256) {
	9824	stored +=
	9825	set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
	9826	}
	9827	else {
	9828	nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
	9829	}
	9830	}
	9831
	9832	range = 0; /* this was not a true range */
	9833	}
	9834
	9835
	9836
	9837	if (!SIZE_ONLY) {
	9838	const char *what = NULL;
	9839	char yesno = 0;
	9840
	9841	/* Possible truncation here but in some 64-bit environments
	9842	* the compiler gets heartburn about switch on 64-bit values.
	9843	* A similar issue a little earlier when switching on value.
	9844	* --jhi */
	9845	switch ((I32)namedclass) {
	9846
	9847	case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
	9848	case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
	9849	case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
	9850	case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
	9851	case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
	9852	case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
	9853	case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
	9854	case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
	9855	case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
	9856	case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
	9857	/* \s, \w match all unicode if utf8. */
	9858	case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
	9859	case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
	9860	case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
	9861	case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
	9862	case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
	9863	case ANYOF_ASCII:
	9864	if (LOC)
	9865	ANYOF_CLASS_SET(ret, ANYOF_ASCII);
	9866	else {
	9867	for (value = 0; value < 128; value++)
	9868	stored +=
	9869	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	9870	}
	9871	yesno = '+';
	9872	what = NULL; /* Doesn't match outside ascii, so
	9873	don't want to add +utf8:: */
	9874	break;
	9875	case ANYOF_NASCII:
	9876	if (LOC)
	9877	ANYOF_CLASS_SET(ret, ANYOF_NASCII);
	9878	else {
	9879	for (value = 128; value < 256; value++)
	9880	stored +=
	9881	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	9882	}
	9883	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	9884	yesno = '!';
	9885	what = "ASCII";
	9886	break;
	9887	case ANYOF_DIGIT:
	9888	if (LOC)
	9889	ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
	9890	else {
	9891	/* consecutive digits assumed */
	9892	for (value = '0'; value <= '9'; value++)
	9893	stored +=
	9894	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9895	}
	9896	yesno = '+';
	9897	what = "Digit";
	9898	break;
	9899	case ANYOF_NDIGIT:
	9900	if (LOC)
	9901	ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
	9902	else {
	9903	/* consecutive digits assumed */
	9904	for (value = 0; value < '0'; value++)
	9905	stored +=
	9906	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9907	for (value = '9' + 1; value < 256; value++)
	9908	stored +=
	9909	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9910	}
	9911	yesno = '!';
	9912	what = "Digit";
	9913	if (AT_LEAST_ASCII_RESTRICTED ) {
	9914	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	9915	}
	9916	break;
	9917	case ANYOF_MAX:
	9918	/* this is to handle \p and \P */
	9919	break;
	9920	default:
	9921	vFAIL("Invalid [::] class");
	9922	break;
	9923	}
	9924	if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
	9925	/* Strings such as "+utf8::isWord\n" */
	9926	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
	9927	}
	9928
	9929	continue;
	9930	}
	9931	} /* end of namedclass \blah */
	9932
	9933	if (range) {
	9934	if (prevvalue > (IV)value) /* b-a */ {
	9935	const int w = RExC_parse - rangebegin;
	9936	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	9937	range = 0; /* not a valid range */
	9938	}
	9939	}
	9940	else {
	9941	prevvalue = value; /* save the beginning of the range */
	9942	if (RExC_parse+1 < RExC_end
	9943	&& *RExC_parse == '-'
	9944	&& RExC_parse[1] != ']')
	9945	{
	9946	RExC_parse++;
	9947
	9948	/* a bad range like \w-, [:word:]- ? */
	9949	if (namedclass > OOB_NAMEDCLASS) {
	9950	if (ckWARN(WARN_REGEXP)) {
	9951	const int w =
	9952	RExC_parse >= rangebegin ?
	9953	RExC_parse - rangebegin : 0;
	9954	vWARN4(RExC_parse,
	9955	"False [] range \"%.s\"",
	9956	w, w, rangebegin);
	9957	}
	9958	if (!SIZE_ONLY)
	9959	stored +=
	9960	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	9961	} else
	9962	range = 1; /* yeah, it's a range! */
	9963	continue; /* but do it the next time */
	9964	}
	9965	}
	9966
	9967	/* non-Latin1 code point implies unicode semantics. Must be set in
	9968	* pass1 so is there for the whole of pass 2 */
	9969	if (value > 255) {
	9970	RExC_uni_semantics = 1;
	9971	}
	9972
	9973	/* now is the next time */
	9974	if (!SIZE_ONLY) {
	9975	if (prevvalue < 256) {
	9976	const IV ceilvalue = value < 256 ? value : 255;
	9977	IV i;
	9978	#ifdef EBCDIC
	9979	/* In EBCDIC [\x89-\x91] should include
	9980	* the \x8e but [i-j] should not. */
	9981	if (literal_endpoint == 2 &&
	9982	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	9983	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	9984	{
	9985	if (isLOWER(prevvalue)) {
	9986	for (i = prevvalue; i <= ceilvalue; i++)
	9987	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	9988	stored +=
	9989	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	9990	}
	9991	} else {
	9992	for (i = prevvalue; i <= ceilvalue; i++)
	9993	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	9994	stored +=
	9995	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	9996	}
	9997	}
	9998	}
	9999	else
	10000	#endif
	10001	for (i = prevvalue; i <= ceilvalue; i++) {
	10002	stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10003	}
	10004	}
	10005	if (value > 255) {
	10006	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	10007	const UV natvalue = NATIVE_TO_UNI(value);
	10008	nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
	10009	}
	10010	#ifdef EBCDIC
	10011	literal_endpoint = 0;
	10012	#endif
	10013	}
	10014
	10015	range = 0; /* this range (if it was one) is done now */
	10016	}
	10017
	10018
	10019
	10020	if (SIZE_ONLY)
	10021	return ret;
	10022	/**** !SIZE_ONLY AFTER HERE *******/
	10023
	10024	/* If folding and there are code points above 255, we calculate all
	10025	* characters that could fold to or from the ones already on the list */
	10026	if (FOLD && nonbitmap) {
	10027	UV i;
	10028
	10029	HV* fold_intersection;
	10030	UV* fold_list;
	10031
	10032	/* This is a list of all the characters that participate in folds
	10033	* (except marks, etc in multi-char folds */
	10034	if (! PL_utf8_foldable) {
	10035	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	10036	PL_utf8_foldable = _swash_to_invlist(swash);
	10037	}
	10038
	10039	/* This is a hash that for a particular fold gives all characters
	10040	* that are involved in it */
	10041	if (! PL_utf8_foldclosures) {
	10042
	10043	/* If we were unable to find any folds, then we likely won't be
	10044	* able to find the closures. So just create an empty list.
	10045	* Folding will effectively be restricted to the non-Unicode rules
	10046	* hard-coded into Perl. (This case happens legitimately during
	10047	* compilation of Perl itself before the Unicode tables are
	10048	* generated) */
	10049	if (invlist_len(PL_utf8_foldable) == 0) {
	10050	PL_utf8_foldclosures = _new_invlist(0);
	10051	} else {
	10052	/* If the folds haven't been read in, call a fold function
	10053	* to force that */
	10054	if (! PL_utf8_tofold) {
	10055	U8 dummy[UTF8_MAXBYTES+1];
	10056	STRLEN dummy_len;
	10057	to_utf8_fold((U8*) "A", dummy, &dummy_len);
	10058	}
	10059	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	10060	}
	10061	}
	10062
	10063	/* Only the characters in this class that participate in folds need
	10064	* be checked. Get the intersection of this class and all the
	10065	* possible characters that are foldable. This can quickly narrow
	10066	* down a large class */
	10067	fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
	10068
	10069	/* Now look at the foldable characters in this class individually */
	10070	fold_list = invlist_array(fold_intersection);
	10071	for (i = 0; i < invlist_len(fold_intersection); i++) {
	10072	UV j;
	10073
	10074	/* The next entry is the beginning of the range that is in the
	10075	* class */
	10076	UV start = fold_list[i++];
	10077
	10078
	10079	/* The next entry is the beginning of the next range, which
	10080	* isn't in the class, so the end of the current range is one
	10081	* less than that */
	10082	UV end = fold_list[i] - 1;
	10083
	10084	/* Look at every character in the range */
	10085	for (j = start; j <= end; j++) {
	10086
	10087	/* Get its fold */
	10088	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10089	STRLEN foldlen;
	10090	const UV f = to_uni_fold(j, foldbuf, &foldlen);
	10091
	10092	if (foldlen > (STRLEN)UNISKIP(f)) {
	10093
	10094	/* Any multicharacter foldings (disallowed in
	10095	* lookbehind patterns) require the following
	10096	* transform: [ABCDEF] -> (?:[ABCabcDEFd]\|pq\|rst) where
	10097	* E folds into "pq" and F folds into "rst", all other
	10098	* characters fold to single characters. We save away
	10099	* these multicharacter foldings, to be later saved as
	10100	* part of the additional "s" data. */
	10101	if (! RExC_in_lookbehind) {
	10102	U8* loc = foldbuf;
	10103	U8* e = foldbuf + foldlen;
	10104
	10105	/* If any of the folded characters of this are in
	10106	* the Latin1 range, tell the regex engine that
	10107	* this can match a non-utf8 target string. The
	10108	* only multi-byte fold whose source is in the
	10109	* Latin1 range (U+00DF) applies only when the
	10110	* target string is utf8, or under unicode rules */
	10111	if (j > 255 \|\| AT_LEAST_UNI_SEMANTICS) {
	10112	while (loc < e) {
	10113
	10114	/* Can't mix ascii with non- under /aa */
	10115	if (MORE_ASCII_RESTRICTED
	10116	&& (isASCII(*loc) != isASCII(j)))
	10117	{
	10118	goto end_multi_fold;
	10119	}
	10120	if (UTF8_IS_INVARIANT(*loc)
	10121	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	10122	{
	10123	/* Can't mix above and below 256 under
	10124	* LOC */
	10125	if (LOC) {
	10126	goto end_multi_fold;
	10127	}
	10128	ANYOF_FLAGS(ret)
	10129	\|= ANYOF_NONBITMAP_NON_UTF8;
	10130	break;
	10131	}
	10132	loc += UTF8SKIP(loc);
	10133	}
	10134	}
	10135
	10136	add_alternate(&unicode_alternate, foldbuf, foldlen);
	10137	end_multi_fold: ;
	10138	}
	10139	}
	10140	else {
	10141	/* Single character fold. Add everything in its fold
	10142	* closure to the list that this node should match */
	10143	SV** listp;
	10144
	10145	/* The fold closures data structure is a hash with the
	10146	* keys being every character that is folded to, like
	10147	* 'k', and the values each an array of everything that
	10148	* folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	10149	if ((listp = hv_fetch(PL_utf8_foldclosures,
	10150	(char *) foldbuf, foldlen, FALSE)))
	10151	{
	10152	AV* list = (AV) listp;
	10153	IV k;
	10154	for (k = 0; k <= av_len(list); k++) {
	10155	SV** c_p = av_fetch(list, k, FALSE);
	10156	UV c;
	10157	if (c_p == NULL) {
	10158	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	10159	}
	10160	c = SvUV(*c_p);
	10161
	10162	/* /aa doesn't allow folds between ASCII and
	10163	* non-; /l doesn't allow them between above
	10164	* and below 256 */
	10165	if ((MORE_ASCII_RESTRICTED
	10166	&& (isASCII(c) != isASCII(j)))
	10167	\|\| (LOC && ((c < 256) != (j < 256))))
	10168	{
	10169	continue;
	10170	}
	10171
	10172	if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
	10173	stored += set_regclass_bit(pRExC_state,
	10174	ret,
	10175	(U8) c,
	10176	&l1_fold_invlist, &unicode_alternate);
	10177	}
	10178	/* It may be that the code point is already
	10179	* in this range or already in the bitmap,
	10180	* in which case we need do nothing */
	10181	else if ((c < start \|\| c > end)
	10182	&& (c > 255
	10183	\|\| ! ANYOF_BITMAP_TEST(ret, c)))
	10184	{
	10185	nonbitmap = add_cp_to_invlist(nonbitmap, c);
	10186	}
	10187	}
	10188	}
	10189	}
	10190	}
	10191	}
	10192	invlist_destroy(fold_intersection);
	10193	}
	10194
	10195	/* Combine the two lists into one. */
	10196	if (l1_fold_invlist) {
	10197	if (nonbitmap) {
	10198	nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
	10199	}
	10200	else {
	10201	nonbitmap = l1_fold_invlist;
	10202	}
	10203	}
	10204
	10205	/* Here, we have calculated what code points should be in the character
	10206	* class. Now we can see about various optimizations. Fold calculation
	10207	* needs to take place before inversion. Otherwise /[^k]/i would invert to
	10208	* include K, which under /i would match k. */
	10209
	10210	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	10211	* set the FOLD flag yet, so this does optimize those. It doesn't
	10212	* optimize locale. Doing so perhaps could be done as long as there is
	10213	* nothing like \w in it; some thought also would have to be given to the
	10214	* interaction with above 0x100 chars */
	10215	if (! LOC
	10216	&& (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
	10217	&& ! unicode_alternate
	10218	&& ! nonbitmap
	10219	&& SvCUR(listsv) == initial_listsv_len)
	10220	{
	10221	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
	10222	ANYOF_BITMAP(ret)[value] ^= 0xFF;
	10223	stored = 256 - stored;
	10224
	10225	/* The inversion means that everything above 255 is matched; and at the
	10226	* same time we clear the invert flag */
	10227	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
	10228	}
	10229
	10230	/* Folding in the bitmap is taken care of above, but not for locale (for
	10231	* which we have to wait to see what folding is in effect at runtime), and
	10232	* for things not in the bitmap. Set run-time fold flag for these */
	10233	if (FOLD && (LOC \|\| nonbitmap \|\| unicode_alternate)) {
	10234	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	10235	}
	10236
	10237	/* A single character class can be "optimized" into an EXACTish node.
	10238	* Note that since we don't currently count how many characters there are
	10239	* outside the bitmap, we are XXX missing optimization possibilities for
	10240	* them. This optimization can't happen unless this is a truly single
	10241	* character class, which means that it can't be an inversion into a
	10242	* many-character class, and there must be no possibility of there being
	10243	* things outside the bitmap. 'stored' (only) for locales doesn't include
	10244	* \w, etc, so have to make a special test that they aren't present
	10245	*
	10246	* Similarly A 2-character class of the very special form like [bB] can be
	10247	* optimized into an EXACTFish node, but only for non-locales, and for
	10248	* characters which only have the two folds; so things like 'fF' and 'Ii'
	10249	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	10250	* FI'. */
	10251	if (! nonbitmap
	10252	&& ! unicode_alternate
	10253	&& SvCUR(listsv) == initial_listsv_len
	10254	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	10255	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10256	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	10257	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10258	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	10259	/* If the latest code point has a fold whose
	10260	* bit is set, it must be the only other one */
	10261	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	10262	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	10263	{
	10264	/* Note that the information needed to decide to do this optimization
	10265	* is not currently available until the 2nd pass, and that the actually
	10266	* used EXACTish node takes less space than the calculated ANYOF node,
	10267	* and hence the amount of space calculated in the first pass is larger
	10268	* than actually used, so this optimization doesn't gain us any space.
	10269	* But an EXACT node is faster than an ANYOF node, and can be combined
	10270	* with any adjacent EXACT nodes later by the optimizer for further
	10271	* gains. The speed of executing an EXACTF is similar to an ANYOF
	10272	* node, so the optimization advantage comes from the ability to join
	10273	* it to adjacent EXACT nodes */
	10274
	10275	const char * cur_parse= RExC_parse;
	10276	U8 op;
	10277	RExC_emit = (regnode *)orig_emit;
	10278	RExC_parse = (char *)orig_parse;
	10279
	10280	if (stored == 1) {
	10281
	10282	/* A locale node with one point can be folded; all the other cases
	10283	* with folding will have two points, since we calculate them above
	10284	*/
	10285	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	10286	op = EXACTFL;
	10287	}
	10288	else {
	10289	op = EXACT;
	10290	}
	10291	} /* else 2 chars in the bit map: the folds of each other */
	10292	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	10293
	10294	/* To join adjacent nodes, they must be the exact EXACTish type.
	10295	* Try to use the most likely type, by using EXACTFU if the regex
	10296	* calls for them, or is required because the character is
	10297	* non-ASCII */
	10298	op = EXACTFU;
	10299	}
	10300	else { /* Otherwise, more likely to be EXACTF type */
	10301	op = EXACTF;
	10302	}
	10303
	10304	ret = reg_node(pRExC_state, op);
	10305	RExC_parse = (char *)cur_parse;
	10306	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	10307	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	10308	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	10309	STR_LEN(ret)= 2;
	10310	RExC_emit += STR_SZ(2);
	10311	}
	10312	else {
	10313	*STRING(ret)= (char)value;
	10314	STR_LEN(ret)= 1;
	10315	RExC_emit += STR_SZ(1);
	10316	}
	10317	SvREFCNT_dec(listsv);
	10318	return ret;
	10319	}
	10320
	10321	if (nonbitmap) {
	10322	UV* nonbitmap_array = invlist_array(nonbitmap);
	10323	UV nonbitmap_len = invlist_len(nonbitmap);
	10324	UV i;
	10325
	10326	/* Here have the full list of items to match that aren't in the
	10327	* bitmap. Convert to the structure that the rest of the code is
	10328	* expecting. XXX That rest of the code should convert to this
	10329	* structure */
	10330	for (i = 0; i < nonbitmap_len; i++) {
	10331
	10332	/* The next entry is the beginning of the range that is in the
	10333	* class */
	10334	UV start = nonbitmap_array[i++];
	10335	UV end;
	10336
	10337	/* The next entry is the beginning of the next range, which isn't
	10338	* in the class, so the end of the current range is one less than
	10339	* that. But if there is no next range, it means that the range
	10340	* begun by 'start' extends to infinity, which for this platform
	10341	* ends at UV_MAX */
	10342	if (i == nonbitmap_len) {
	10343	end = UV_MAX;
	10344	}
	10345	else {
	10346	end = nonbitmap_array[i] - 1;
	10347	}
	10348
	10349	if (start == end) {
	10350	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
	10351	}
	10352	else {
	10353	/* The \t sets the whole range */
	10354	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
	10355	/* XXX EBCDIC */
	10356	start, end);
	10357	}
	10358	}
	10359	invlist_destroy(nonbitmap);
	10360	}
	10361
	10362	if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
	10363	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	10364	SvREFCNT_dec(listsv);
	10365	SvREFCNT_dec(unicode_alternate);
	10366	}
	10367	else {
	10368
	10369	AV * const av = newAV();
	10370	SV *rv;
	10371	/* The 0th element stores the character class description
	10372	* in its textual form: used later (regexec.c:Perl_regclass_swash())
	10373	* to initialize the appropriate swash (which gets stored in
	10374	* the 1st element), and also useful for dumping the regnode.
	10375	* The 2nd element stores the multicharacter foldings,
	10376	* used later (regexec.c:S_reginclass()). */
	10377	av_store(av, 0, listsv);
	10378	av_store(av, 1, NULL);
	10379	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	10380	if (unicode_alternate) { /* This node is variable length */
	10381	OP(ret) = ANYOFV;
	10382	}
	10383	rv = newRV_noinc(MUTABLE_SV(av));
	10384	n = add_data(pRExC_state, 1, "s");
	10385	RExC_rxi->data->data[n] = (void*)rv;
	10386	ARG_SET(ret, n);
	10387	}
	10388	return ret;
	10389	}
	10390	#undef _C_C_T_
	10391
	10392
	10393	/* reg_skipcomment()
	10394
	10395	Absorbs an /x style # comments from the input stream.
	10396	Returns true if there is more text remaining in the stream.
	10397	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	10398	terminates the pattern without including a newline.
	10399
	10400	Note its the callers responsibility to ensure that we are
	10401	actually in /x mode
	10402
	10403	*/
	10404
	10405	STATIC bool
	10406	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	10407	{
	10408	bool ended = 0;
	10409
	10410	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	10411
	10412	while (RExC_parse < RExC_end)
	10413	if (*RExC_parse++ == '\n') {
	10414	ended = 1;
	10415	break;
	10416	}
	10417	if (!ended) {
	10418	/* we ran off the end of the pattern without ending
	10419	the comment, so we have to add an \n when wrapping */
	10420	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	10421	return 0;
	10422	} else
	10423	return 1;
	10424	}
	10425
	10426	/* nextchar()
	10427
	10428	Advances the parse position, and optionally absorbs
	10429	"whitespace" from the inputstream.
	10430
	10431	Without /x "whitespace" means (?#...) style comments only,
	10432	with /x this means (?#...) and # comments and whitespace proper.
	10433
	10434	Returns the RExC_parse point from BEFORE the scan occurs.
	10435
	10436	This is the /x friendly way of saying RExC_parse++.
	10437	*/
	10438
	10439	STATIC char*
	10440	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	10441	{
	10442	char* const retval = RExC_parse++;
	10443
	10444	PERL_ARGS_ASSERT_NEXTCHAR;
	10445
	10446	for (;;) {
	10447	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
	10448	RExC_parse[2] == '#') {
	10449	while (*RExC_parse != ')') {
	10450	if (RExC_parse == RExC_end)
	10451	FAIL("Sequence (?#... not terminated");
	10452	RExC_parse++;
	10453	}
	10454	RExC_parse++;
	10455	continue;
	10456	}
	10457	if (RExC_flags & RXf_PMf_EXTENDED) {
	10458	if (isSPACE(*RExC_parse)) {
	10459	RExC_parse++;
	10460	continue;
	10461	}
	10462	else if (*RExC_parse == '#') {
	10463	if ( reg_skipcomment( pRExC_state ) )
	10464	continue;
	10465	}
	10466	}
	10467	return retval;
	10468	}
	10469	}
	10470
	10471	/*
	10472	- reg_node - emit a node
	10473	*/
	10474	STATIC regnode * /* Location. */
	10475	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	10476	{
	10477	dVAR;
	10478	register regnode *ptr;
	10479	regnode * const ret = RExC_emit;
	10480	GET_RE_DEBUG_FLAGS_DECL;
	10481
	10482	PERL_ARGS_ASSERT_REG_NODE;
	10483
	10484	if (SIZE_ONLY) {
	10485	SIZE_ALIGN(RExC_size);
	10486	RExC_size += 1;
	10487	return(ret);
	10488	}
	10489	if (RExC_emit >= RExC_emit_bound)
	10490	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10491
	10492	NODE_ALIGN_FILL(ret);
	10493	ptr = ret;
	10494	FILL_ADVANCE_NODE(ptr, op);
	10495	#ifdef RE_TRACK_PATTERN_OFFSETS
	10496	if (RExC_offsets) { /* MJD */
	10497	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	10498	"reg_node", __LINE__,
	10499	PL_reg_name[op],
	10500	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	10501	? "Overwriting end of array!\n" : "OK",
	10502	(UV)(RExC_emit - RExC_emit_start),
	10503	(UV)(RExC_parse - RExC_start),
	10504	(UV)RExC_offsets[0]));
	10505	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	10506	}
	10507	#endif
	10508	RExC_emit = ptr;
	10509	return(ret);
	10510	}
	10511
	10512	/*
	10513	- reganode - emit a node with an argument
	10514	*/
	10515	STATIC regnode * /* Location. */
	10516	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	10517	{
	10518	dVAR;
	10519	register regnode *ptr;
	10520	regnode * const ret = RExC_emit;
	10521	GET_RE_DEBUG_FLAGS_DECL;
	10522
	10523	PERL_ARGS_ASSERT_REGANODE;
	10524
	10525	if (SIZE_ONLY) {
	10526	SIZE_ALIGN(RExC_size);
	10527	RExC_size += 2;
	10528	/*
	10529	We can't do this:
	10530
	10531	assert(2==regarglen[op]+1);
	10532
	10533	Anything larger than this has to allocate the extra amount.
	10534	If we changed this to be:
	10535
	10536	RExC_size += (1 + regarglen[op]);
	10537
	10538	then it wouldn't matter. Its not clear what side effect
	10539	might come from that so its not done so far.
	10540	-- dmq
	10541	*/
	10542	return(ret);
	10543	}
	10544	if (RExC_emit >= RExC_emit_bound)
	10545	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10546
	10547	NODE_ALIGN_FILL(ret);
	10548	ptr = ret;
	10549	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	10550	#ifdef RE_TRACK_PATTERN_OFFSETS
	10551	if (RExC_offsets) { /* MJD */
	10552	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	10553	"reganode",
	10554	__LINE__,
	10555	PL_reg_name[op],
	10556	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	10557	"Overwriting end of array!\n" : "OK",
	10558	(UV)(RExC_emit - RExC_emit_start),
	10559	(UV)(RExC_parse - RExC_start),
	10560	(UV)RExC_offsets[0]));
	10561	Set_Cur_Node_Offset;
	10562	}
	10563	#endif
	10564	RExC_emit = ptr;
	10565	return(ret);
	10566	}
	10567
	10568	/*
	10569	- reguni - emit (if appropriate) a Unicode character
	10570	*/
	10571	STATIC STRLEN
	10572	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	10573	{
	10574	dVAR;
	10575
	10576	PERL_ARGS_ASSERT_REGUNI;
	10577
	10578	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	10579	}
	10580
	10581	/*
	10582	- reginsert - insert an operator in front of already-emitted operand
	10583	*
	10584	* Means relocating the operand.
	10585	*/
	10586	STATIC void
	10587	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	10588	{
	10589	dVAR;
	10590	register regnode *src;
	10591	register regnode *dst;
	10592	register regnode *place;
	10593	const int offset = regarglen[(U8)op];
	10594	const int size = NODE_STEP_REGNODE + offset;
	10595	GET_RE_DEBUG_FLAGS_DECL;
	10596
	10597	PERL_ARGS_ASSERT_REGINSERT;
	10598	PERL_UNUSED_ARG(depth);
	10599	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	10600	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	10601	if (SIZE_ONLY) {
	10602	RExC_size += size;
	10603	return;
	10604	}
	10605
	10606	src = RExC_emit;
	10607	RExC_emit += size;
	10608	dst = RExC_emit;
	10609	if (RExC_open_parens) {
	10610	int paren;
	10611	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	10612	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	10613	if ( RExC_open_parens[paren] >= opnd ) {
	10614	/DEBUG_PARSE_FMT("open"," - %d",size);/
	10615	RExC_open_parens[paren] += size;
	10616	} else {
	10617	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	10618	}
	10619	if ( RExC_close_parens[paren] >= opnd ) {
	10620	/DEBUG_PARSE_FMT("close"," - %d",size);/
	10621	RExC_close_parens[paren] += size;
	10622	} else {
	10623	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	10624	}
	10625	}
	10626	}
	10627
	10628	while (src > opnd) {
	10629	StructCopy(--src, --dst, regnode);
	10630	#ifdef RE_TRACK_PATTERN_OFFSETS
	10631	if (RExC_offsets) { /* MJD 20010112 */
	10632	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	10633	"reg_insert",
	10634	__LINE__,
	10635	PL_reg_name[op],
	10636	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	10637	? "Overwriting end of array!\n" : "OK",
	10638	(UV)(src - RExC_emit_start),
	10639	(UV)(dst - RExC_emit_start),
	10640	(UV)RExC_offsets[0]));
	10641	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	10642	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	10643	}
	10644	#endif
	10645	}
	10646
	10647
	10648	place = opnd; /* Op node, where operand used to be. */
	10649	#ifdef RE_TRACK_PATTERN_OFFSETS
	10650	if (RExC_offsets) { /* MJD */
	10651	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	10652	"reginsert",
	10653	__LINE__,
	10654	PL_reg_name[op],
	10655	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	10656	? "Overwriting end of array!\n" : "OK",
	10657	(UV)(place - RExC_emit_start),
	10658	(UV)(RExC_parse - RExC_start),
	10659	(UV)RExC_offsets[0]));
	10660	Set_Node_Offset(place, RExC_parse);
	10661	Set_Node_Length(place, 1);
	10662	}
	10663	#endif
	10664	src = NEXTOPER(place);
	10665	FILL_ADVANCE_NODE(place, op);
	10666	Zero(src, offset, regnode);
	10667	}
	10668
	10669	/*
	10670	- regtail - set the next-pointer at the end of a node chain of p to val.
	10671	- SEE ALSO: regtail_study
	10672	*/
	10673	/* TODO: All three parms should be const */
	10674	STATIC void
	10675	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	10676	{
	10677	dVAR;
	10678	register regnode *scan;
	10679	GET_RE_DEBUG_FLAGS_DECL;
	10680
	10681	PERL_ARGS_ASSERT_REGTAIL;
	10682	#ifndef DEBUGGING
	10683	PERL_UNUSED_ARG(depth);
	10684	#endif
	10685
	10686	if (SIZE_ONLY)
	10687	return;
	10688
	10689	/* Find last node. */
	10690	scan = p;
	10691	for (;;) {
	10692	regnode * const temp = regnext(scan);
	10693	DEBUG_PARSE_r({
	10694	SV * const mysv=sv_newmortal();
	10695	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	10696	regprop(RExC_rx, mysv, scan);
	10697	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	10698	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	10699	(temp == NULL ? "->" : ""),
	10700	(temp == NULL ? PL_reg_name[OP(val)] : "")
	10701	);
	10702	});
	10703	if (temp == NULL)
	10704	break;
	10705	scan = temp;
	10706	}
	10707
	10708	if (reg_off_by_arg[OP(scan)]) {
	10709	ARG_SET(scan, val - scan);
	10710	}
	10711	else {
	10712	NEXT_OFF(scan) = val - scan;
	10713	}
	10714	}
	10715
	10716	#ifdef DEBUGGING
	10717	/*
	10718	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	10719	- Look for optimizable sequences at the same time.
	10720	- currently only looks for EXACT chains.
	10721
	10722	This is experimental code. The idea is to use this routine to perform
	10723	in place optimizations on branches and groups as they are constructed,
	10724	with the long term intention of removing optimization from study_chunk so
	10725	that it is purely analytical.
	10726
	10727	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	10728	to control which is which.
	10729
	10730	*/
	10731	/* TODO: All four parms should be const */
	10732
	10733	STATIC U8
	10734	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	10735	{
	10736	dVAR;
	10737	register regnode *scan;
	10738	U8 exact = PSEUDO;
	10739	#ifdef EXPERIMENTAL_INPLACESCAN
	10740	I32 min = 0;
	10741	#endif
	10742	GET_RE_DEBUG_FLAGS_DECL;
	10743
	10744	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	10745
	10746
	10747	if (SIZE_ONLY)
	10748	return exact;
	10749
	10750	/* Find last node. */
	10751
	10752	scan = p;
	10753	for (;;) {
	10754	regnode * const temp = regnext(scan);
	10755	#ifdef EXPERIMENTAL_INPLACESCAN
	10756	if (PL_regkind[OP(scan)] == EXACT)
	10757	if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
	10758	return EXACT;
	10759	#endif
	10760	if ( exact ) {
	10761	switch (OP(scan)) {
	10762	case EXACT:
	10763	case EXACTF:
	10764	case EXACTFA:
	10765	case EXACTFU:
	10766	case EXACTFL:
	10767	if( exact == PSEUDO )
	10768	exact= OP(scan);
	10769	else if ( exact != OP(scan) )
	10770	exact= 0;
	10771	case NOTHING:
	10772	break;
	10773	default:
	10774	exact= 0;
	10775	}
	10776	}
	10777	DEBUG_PARSE_r({
	10778	SV * const mysv=sv_newmortal();
	10779	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	10780	regprop(RExC_rx, mysv, scan);
	10781	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	10782	SvPV_nolen_const(mysv),
	10783	REG_NODE_NUM(scan),
	10784	PL_reg_name[exact]);
	10785	});
	10786	if (temp == NULL)
	10787	break;
	10788	scan = temp;
	10789	}
	10790	DEBUG_PARSE_r({
	10791	SV * const mysv_val=sv_newmortal();
	10792	DEBUG_PARSE_MSG("");
	10793	regprop(RExC_rx, mysv_val, val);
	10794	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	10795	SvPV_nolen_const(mysv_val),
	10796	(IV)REG_NODE_NUM(val),
	10797	(IV)(val - scan)
	10798	);
	10799	});
	10800	if (reg_off_by_arg[OP(scan)]) {
	10801	ARG_SET(scan, val - scan);
	10802	}
	10803	else {
	10804	NEXT_OFF(scan) = val - scan;
	10805	}
	10806
	10807	return exact;
	10808	}
	10809	#endif
	10810
	10811	/*
	10812	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	10813	*/
	10814	#ifdef DEBUGGING
	10815	static void
	10816	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	10817	{
	10818	int bit;
	10819	int set=0;
	10820	regex_charset cs;
	10821
	10822	for (bit=0; bit<32; bit++) {
	10823	if (flags & (1<<bit)) {
	10824	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	10825	continue;
	10826	}
	10827	if (!set++ && lead)
	10828	PerlIO_printf(Perl_debug_log, "%s",lead);
	10829	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	10830	}
	10831	}
	10832	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	10833	if (!set++ && lead) {
	10834	PerlIO_printf(Perl_debug_log, "%s",lead);
	10835	}
	10836	switch (cs) {
	10837	case REGEX_UNICODE_CHARSET:
	10838	PerlIO_printf(Perl_debug_log, "UNICODE");
	10839	break;
	10840	case REGEX_LOCALE_CHARSET:
	10841	PerlIO_printf(Perl_debug_log, "LOCALE");
	10842	break;
	10843	case REGEX_ASCII_RESTRICTED_CHARSET:
	10844	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	10845	break;
	10846	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	10847	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	10848	break;
	10849	default:
	10850	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	10851	break;
	10852	}
	10853	}
	10854	if (lead) {
	10855	if (set)
	10856	PerlIO_printf(Perl_debug_log, "\n");
	10857	else
	10858	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	10859	}
	10860	}
	10861	#endif
	10862
	10863	void
	10864	Perl_regdump(pTHX_ const regexp *r)
	10865	{
	10866	#ifdef DEBUGGING
	10867	dVAR;
	10868	SV * const sv = sv_newmortal();
	10869	SV *dsv= sv_newmortal();
	10870	RXi_GET_DECL(r,ri);
	10871	GET_RE_DEBUG_FLAGS_DECL;
	10872
	10873	PERL_ARGS_ASSERT_REGDUMP;
	10874
	10875	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	10876
	10877	/* Header fields of interest. */
	10878	if (r->anchored_substr) {
	10879	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	10880	RE_SV_DUMPLEN(r->anchored_substr), 30);
	10881	PerlIO_printf(Perl_debug_log,
	10882	"anchored %s%s at %"IVdf" ",
	10883	s, RE_SV_TAIL(r->anchored_substr),
	10884	(IV)r->anchored_offset);
	10885	} else if (r->anchored_utf8) {
	10886	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	10887	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	10888	PerlIO_printf(Perl_debug_log,
	10889	"anchored utf8 %s%s at %"IVdf" ",
	10890	s, RE_SV_TAIL(r->anchored_utf8),
	10891	(IV)r->anchored_offset);
	10892	}
	10893	if (r->float_substr) {
	10894	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	10895	RE_SV_DUMPLEN(r->float_substr), 30);
	10896	PerlIO_printf(Perl_debug_log,
	10897	"floating %s%s at %"IVdf"..%"UVuf" ",
	10898	s, RE_SV_TAIL(r->float_substr),
	10899	(IV)r->float_min_offset, (UV)r->float_max_offset);
	10900	} else if (r->float_utf8) {
	10901	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	10902	RE_SV_DUMPLEN(r->float_utf8), 30);
	10903	PerlIO_printf(Perl_debug_log,
	10904	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	10905	s, RE_SV_TAIL(r->float_utf8),
	10906	(IV)r->float_min_offset, (UV)r->float_max_offset);
	10907	}
	10908	if (r->check_substr \|\| r->check_utf8)
	10909	PerlIO_printf(Perl_debug_log,
	10910	(const char *)
	10911	(r->check_substr == r->float_substr
	10912	&& r->check_utf8 == r->float_utf8
	10913	? "(checking floating" : "(checking anchored"));
	10914	if (r->extflags & RXf_NOSCAN)
	10915	PerlIO_printf(Perl_debug_log, " noscan");
	10916	if (r->extflags & RXf_CHECK_ALL)
	10917	PerlIO_printf(Perl_debug_log, " isall");
	10918	if (r->check_substr \|\| r->check_utf8)
	10919	PerlIO_printf(Perl_debug_log, ") ");
	10920
	10921	if (ri->regstclass) {
	10922	regprop(r, sv, ri->regstclass);
	10923	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	10924	}
	10925	if (r->extflags & RXf_ANCH) {
	10926	PerlIO_printf(Perl_debug_log, "anchored");
	10927	if (r->extflags & RXf_ANCH_BOL)
	10928	PerlIO_printf(Perl_debug_log, "(BOL)");
	10929	if (r->extflags & RXf_ANCH_MBOL)
	10930	PerlIO_printf(Perl_debug_log, "(MBOL)");
	10931	if (r->extflags & RXf_ANCH_SBOL)
	10932	PerlIO_printf(Perl_debug_log, "(SBOL)");
	10933	if (r->extflags & RXf_ANCH_GPOS)
	10934	PerlIO_printf(Perl_debug_log, "(GPOS)");
	10935	PerlIO_putc(Perl_debug_log, ' ');
	10936	}
	10937	if (r->extflags & RXf_GPOS_SEEN)
	10938	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	10939	if (r->intflags & PREGf_SKIP)
	10940	PerlIO_printf(Perl_debug_log, "plus ");
	10941	if (r->intflags & PREGf_IMPLICIT)
	10942	PerlIO_printf(Perl_debug_log, "implicit ");
	10943	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	10944	if (r->extflags & RXf_EVAL_SEEN)
	10945	PerlIO_printf(Perl_debug_log, "with eval ");
	10946	PerlIO_printf(Perl_debug_log, "\n");
	10947	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	10948	#else
	10949	PERL_ARGS_ASSERT_REGDUMP;
	10950	PERL_UNUSED_CONTEXT;
	10951	PERL_UNUSED_ARG(r);
	10952	#endif /* DEBUGGING */
	10953	}
	10954
	10955	/*
	10956	- regprop - printable representation of opcode
	10957	*/
	10958	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	10959	STMT_START { \
	10960	if (do_sep) { \
	10961	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	10962	if (flags & ANYOF_INVERT) \
	10963	/make sure the invert info is in each / \
	10964	sv_catpvs(sv, "^"); \
	10965	do_sep = 0; \
	10966	} \
	10967	} STMT_END
	10968
	10969	void
	10970	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	10971	{
	10972	#ifdef DEBUGGING
	10973	dVAR;
	10974	register int k;
	10975	RXi_GET_DECL(prog,progi);
	10976	GET_RE_DEBUG_FLAGS_DECL;
	10977
	10978	PERL_ARGS_ASSERT_REGPROP;
	10979
	10980	sv_setpvs(sv, "");
	10981
	10982	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	10983	/* It would be nice to FAIL() here, but this may be called from
	10984	regexec.c, and it would be hard to supply pRExC_state. */
	10985	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	10986	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	10987
	10988	k = PL_regkind[OP(o)];
	10989
	10990	if (k == EXACT) {
	10991	sv_catpvs(sv, " ");
	10992	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	10993	* is a crude hack but it may be the best for now since
	10994	* we have no flag "this EXACTish node was UTF-8"
	10995	* --jhi */
	10996	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	10997	PERL_PV_ESCAPE_UNI_DETECT \|
	10998	PERL_PV_ESCAPE_NONASCII \|
	10999	PERL_PV_PRETTY_ELLIPSES \|
	11000	PERL_PV_PRETTY_LTGT \|
	11001	PERL_PV_PRETTY_NOCLEAR
	11002	);
	11003	} else if (k == TRIE) {
	11004	/* print the details of the trie in dumpuntil instead, as
	11005	* progi->data isn't available here */
	11006	const char op = OP(o);
	11007	const U32 n = ARG(o);
	11008	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	11009	(reg_ac_data *)progi->data->data[n] :
	11010	NULL;
	11011	const reg_trie_data * const trie
	11012	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	11013
	11014	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	11015	DEBUG_TRIE_COMPILE_r(
	11016	Perl_sv_catpvf(aTHX_ sv,
	11017	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	11018	(UV)trie->startstate,
	11019	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	11020	(UV)trie->wordcount,
	11021	(UV)trie->minlen,
	11022	(UV)trie->maxlen,
	11023	(UV)TRIE_CHARCOUNT(trie),
	11024	(UV)trie->uniquecharcount
	11025	)
	11026	);
	11027	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	11028	int i;
	11029	int rangestart = -1;
	11030	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	11031	sv_catpvs(sv, "[");
	11032	for (i = 0; i <= 256; i++) {
	11033	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	11034	if (rangestart == -1)
	11035	rangestart = i;
	11036	} else if (rangestart != -1) {
	11037	if (i <= rangestart + 3)
	11038	for (; rangestart < i; rangestart++)
	11039	put_byte(sv, rangestart);
	11040	else {
	11041	put_byte(sv, rangestart);
	11042	sv_catpvs(sv, "-");
	11043	put_byte(sv, i - 1);
	11044	}
	11045	rangestart = -1;
	11046	}
	11047	}
	11048	sv_catpvs(sv, "]");
	11049	}
	11050
	11051	} else if (k == CURLY) {
	11052	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	11053	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	11054	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	11055	}
	11056	else if (k == WHILEM && o->flags) /* Ordinal/of */
	11057	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	11058	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	11059	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	11060	if ( RXp_PAREN_NAMES(prog) ) {
	11061	if ( k != REF \|\| (OP(o) < NREF)) {
	11062	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	11063	SV **name= av_fetch(list, ARG(o), 0 );
	11064	if (name)
	11065	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11066	}
	11067	else {
	11068	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	11069	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	11070	I32 nums=(I32)SvPVX(sv_dat);
	11071	SV **name= av_fetch(list, nums[0], 0 );
	11072	I32 n;
	11073	if (name) {
	11074	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	11075	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	11076	(n ? "," : ""), (IV)nums[n]);
	11077	}
	11078	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11079	}
	11080	}
	11081	}
	11082	} else if (k == GOSUB)
	11083	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	11084	else if (k == VERB) {
	11085	if (!o->flags)
	11086	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	11087	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	11088	} else if (k == LOGICAL)
	11089	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	11090	else if (k == FOLDCHAR)
	11091	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
	11092	else if (k == ANYOF) {
	11093	int i, rangestart = -1;
	11094	const U8 flags = ANYOF_FLAGS(o);
	11095	int do_sep = 0;
	11096
	11097	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	11098	static const char * const anyofs[] = {
	11099	"\\w",
	11100	"\\W",
	11101	"\\s",
	11102	"\\S",
	11103	"\\d",
	11104	"\\D",
	11105	"[:alnum:]",
	11106	"[:^alnum:]",
	11107	"[:alpha:]",
	11108	"[:^alpha:]",
	11109	"[:ascii:]",
	11110	"[:^ascii:]",
	11111	"[:cntrl:]",
	11112	"[:^cntrl:]",
	11113	"[:graph:]",
	11114	"[:^graph:]",
	11115	"[:lower:]",
	11116	"[:^lower:]",
	11117	"[:print:]",
	11118	"[:^print:]",
	11119	"[:punct:]",
	11120	"[:^punct:]",
	11121	"[:upper:]",
	11122	"[:^upper:]",
	11123	"[:xdigit:]",
	11124	"[:^xdigit:]",
	11125	"[:space:]",
	11126	"[:^space:]",
	11127	"[:blank:]",
	11128	"[:^blank:]"
	11129	};
	11130
	11131	if (flags & ANYOF_LOCALE)
	11132	sv_catpvs(sv, "{loc}");
	11133	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	11134	sv_catpvs(sv, "{i}");
	11135	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	11136	if (flags & ANYOF_INVERT)
	11137	sv_catpvs(sv, "^");
	11138
	11139	/* output what the standard cp 0-255 bitmap matches */
	11140	for (i = 0; i <= 256; i++) {
	11141	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	11142	if (rangestart == -1)
	11143	rangestart = i;
	11144	} else if (rangestart != -1) {
	11145	if (i <= rangestart + 3)
	11146	for (; rangestart < i; rangestart++)
	11147	put_byte(sv, rangestart);
	11148	else {
	11149	put_byte(sv, rangestart);
	11150	sv_catpvs(sv, "-");
	11151	put_byte(sv, i - 1);
	11152	}
	11153	do_sep = 1;
	11154	rangestart = -1;
	11155	}
	11156	}
	11157
	11158	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11159	/* output any special charclass tests (used entirely under use locale) */
	11160	if (ANYOF_CLASS_TEST_ANY_SET(o))
	11161	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	11162	if (ANYOF_CLASS_TEST(o,i)) {
	11163	sv_catpv(sv, anyofs[i]);
	11164	do_sep = 1;
	11165	}
	11166
	11167	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11168
	11169	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	11170	sv_catpvs(sv, "{non-utf8-latin1-all}");
	11171	}
	11172
	11173	/* output information about the unicode matching */
	11174	if (flags & ANYOF_UNICODE_ALL)
	11175	sv_catpvs(sv, "{unicode_all}");
	11176	else if (ANYOF_NONBITMAP(o))
	11177	sv_catpvs(sv, "{unicode}");
	11178	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	11179	sv_catpvs(sv, "{outside bitmap}");
	11180
	11181	if (ANYOF_NONBITMAP(o)) {
	11182	SV *lv;
	11183	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	11184
	11185	if (lv) {
	11186	if (sw) {
	11187	U8 s[UTF8_MAXBYTES_CASE+1];
	11188
	11189	for (i = 0; i <= 256; i++) { /* just the first 256 */
	11190	uvchr_to_utf8(s, i);
	11191
	11192	if (i < 256 && swash_fetch(sw, s, TRUE)) {
	11193	if (rangestart == -1)
	11194	rangestart = i;
	11195	} else if (rangestart != -1) {
	11196	if (i <= rangestart + 3)
	11197	for (; rangestart < i; rangestart++) {
	11198	const U8 * const e = uvchr_to_utf8(s,rangestart);
	11199	U8 *p;
	11200	for(p = s; p < e; p++)
	11201	put_byte(sv, *p);
	11202	}
	11203	else {
	11204	const U8 *e = uvchr_to_utf8(s,rangestart);
	11205	U8 *p;
	11206	for (p = s; p < e; p++)
	11207	put_byte(sv, *p);
	11208	sv_catpvs(sv, "-");
	11209	e = uvchr_to_utf8(s, i-1);
	11210	for (p = s; p < e; p++)
	11211	put_byte(sv, *p);
	11212	}
	11213	rangestart = -1;
	11214	}
	11215	}
	11216
	11217	sv_catpvs(sv, "..."); /* et cetera */
	11218	}
	11219
	11220	{
	11221	char *s = savesvpv(lv);
	11222	char * const origs = s;
	11223
	11224	while (s && s != '\n')
	11225	s++;
	11226
	11227	if (*s == '\n') {
	11228	const char * const t = ++s;
	11229
	11230	while (*s) {
	11231	if (*s == '\n')
	11232	*s = ' ';
	11233	s++;
	11234	}
	11235	if (s[-1] == ' ')
	11236	s[-1] = 0;
	11237
	11238	sv_catpv(sv, t);
	11239	}
	11240
	11241	Safefree(origs);
	11242	}
	11243	}
	11244	}
	11245
	11246	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	11247	}
	11248	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	11249	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	11250	#else
	11251	PERL_UNUSED_CONTEXT;
	11252	PERL_UNUSED_ARG(sv);
	11253	PERL_UNUSED_ARG(o);
	11254	PERL_UNUSED_ARG(prog);
	11255	#endif /* DEBUGGING */
	11256	}
	11257
	11258	SV *
	11259	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	11260	{ /* Assume that RE_INTUIT is set */
	11261	dVAR;
	11262	struct regexp const prog = (struct regexp )SvANY(r);
	11263	GET_RE_DEBUG_FLAGS_DECL;
	11264
	11265	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	11266	PERL_UNUSED_CONTEXT;
	11267
	11268	DEBUG_COMPILE_r(
	11269	{
	11270	const char * const s = SvPV_nolen_const(prog->check_substr
	11271	? prog->check_substr : prog->check_utf8);
	11272
	11273	if (!PL_colorset) reginitcolors();
	11274	PerlIO_printf(Perl_debug_log,
	11275	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	11276	PL_colors[4],
	11277	prog->check_substr ? "" : "utf8 ",
	11278	PL_colors[5],PL_colors[0],
	11279	s,
	11280	PL_colors[1],
	11281	(strlen(s) > 60 ? "..." : ""));
	11282	} );
	11283
	11284	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	11285	}
	11286
	11287	/*
	11288	pregfree()
	11289
	11290	handles refcounting and freeing the perl core regexp structure. When
	11291	it is necessary to actually free the structure the first thing it
	11292	does is call the 'free' method of the regexp_engine associated to
	11293	the regexp, allowing the handling of the void *pprivate; member
	11294	first. (This routine is not overridable by extensions, which is why
	11295	the extensions free is called first.)
	11296
	11297	See regdupe and regdupe_internal if you change anything here.
	11298	*/
	11299	#ifndef PERL_IN_XSUB_RE
	11300	void
	11301	Perl_pregfree(pTHX_ REGEXP *r)
	11302	{
	11303	SvREFCNT_dec(r);
	11304	}
	11305
	11306	void
	11307	Perl_pregfree2(pTHX_ REGEXP *rx)
	11308	{
	11309	dVAR;
	11310	struct regexp const r = (struct regexp )SvANY(rx);
	11311	GET_RE_DEBUG_FLAGS_DECL;
	11312
	11313	PERL_ARGS_ASSERT_PREGFREE2;
	11314
	11315	if (r->mother_re) {
	11316	ReREFCNT_dec(r->mother_re);
	11317	} else {
	11318	CALLREGFREE_PVT(rx); /* free the private data */
	11319	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	11320	}
	11321	if (r->substrs) {
	11322	SvREFCNT_dec(r->anchored_substr);
	11323	SvREFCNT_dec(r->anchored_utf8);
	11324	SvREFCNT_dec(r->float_substr);
	11325	SvREFCNT_dec(r->float_utf8);
	11326	Safefree(r->substrs);
	11327	}
	11328	RX_MATCH_COPY_FREE(rx);
	11329	#ifdef PERL_OLD_COPY_ON_WRITE
	11330	SvREFCNT_dec(r->saved_copy);
	11331	#endif
	11332	Safefree(r->offs);
	11333	}
	11334
	11335	/* reg_temp_copy()
	11336
	11337	This is a hacky workaround to the structural issue of match results
	11338	being stored in the regexp structure which is in turn stored in
	11339	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	11340	could be PL_curpm in multiple contexts, and could require multiple
	11341	result sets being associated with the pattern simultaneously, such
	11342	as when doing a recursive match with (??{$qr})
	11343
	11344	The solution is to make a lightweight copy of the regexp structure
	11345	when a qr// is returned from the code executed by (??{$qr}) this
	11346	lightweight copy doesn't actually own any of its data except for
	11347	the startp/end and the actual regexp structure itself.
	11348
	11349	*/
	11350
	11351
	11352	REGEXP *
	11353	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	11354	{
	11355	struct regexp *ret;
	11356	struct regexp const r = (struct regexp )SvANY(rx);
	11357	register const I32 npar = r->nparens+1;
	11358
	11359	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	11360
	11361	if (!ret_x)
	11362	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	11363	ret = (struct regexp *)SvANY(ret_x);
	11364
	11365	(void)ReREFCNT_inc(rx);
	11366	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	11367	by pointing directly at the buffer, but flagging that the allocated
	11368	space in the copy is zero. As we've just done a struct copy, it's now
	11369	a case of zero-ing that, rather than copying the current length. */
	11370	SvPV_set(ret_x, RX_WRAPPED(rx));
	11371	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	11372	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	11373	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	11374	SvLEN_set(ret_x, 0);
	11375	SvSTASH_set(ret_x, NULL);
	11376	SvMAGIC_set(ret_x, NULL);
	11377	Newx(ret->offs, npar, regexp_paren_pair);
	11378	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11379	if (r->substrs) {
	11380	Newx(ret->substrs, 1, struct reg_substr_data);
	11381	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	11382
	11383	SvREFCNT_inc_void(ret->anchored_substr);
	11384	SvREFCNT_inc_void(ret->anchored_utf8);
	11385	SvREFCNT_inc_void(ret->float_substr);
	11386	SvREFCNT_inc_void(ret->float_utf8);
	11387
	11388	/* check_substr and check_utf8, if non-NULL, point to either their
	11389	anchored or float namesakes, and don't hold a second reference. */
	11390	}
	11391	RX_MATCH_COPIED_off(ret_x);
	11392	#ifdef PERL_OLD_COPY_ON_WRITE
	11393	ret->saved_copy = NULL;
	11394	#endif
	11395	ret->mother_re = rx;
	11396
	11397	return ret_x;
	11398	}
	11399	#endif
	11400
	11401	/* regfree_internal()
	11402
	11403	Free the private data in a regexp. This is overloadable by
	11404	extensions. Perl takes care of the regexp structure in pregfree(),
	11405	this covers the *pprivate pointer which technically perl doesn't
	11406	know about, however of course we have to handle the
	11407	regexp_internal structure when no extension is in use.
	11408
	11409	Note this is called before freeing anything in the regexp
	11410	structure.
	11411	*/
	11412
	11413	void
	11414	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	11415	{
	11416	dVAR;
	11417	struct regexp const r = (struct regexp )SvANY(rx);
	11418	RXi_GET_DECL(r,ri);
	11419	GET_RE_DEBUG_FLAGS_DECL;
	11420
	11421	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	11422
	11423	DEBUG_COMPILE_r({
	11424	if (!PL_colorset)
	11425	reginitcolors();
	11426	{
	11427	SV *dsv= sv_newmortal();
	11428	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	11429	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	11430	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	11431	PL_colors[4],PL_colors[5],s);
	11432	}
	11433	});
	11434	#ifdef RE_TRACK_PATTERN_OFFSETS
	11435	if (ri->u.offsets)
	11436	Safefree(ri->u.offsets); /* 20010421 MJD */
	11437	#endif
	11438	if (ri->data) {
	11439	int n = ri->data->count;
	11440	PAD* new_comppad = NULL;
	11441	PAD* old_comppad;
	11442	PADOFFSET refcnt;
	11443
	11444	while (--n >= 0) {
	11445	/* If you add a ->what type here, update the comment in regcomp.h */
	11446	switch (ri->data->what[n]) {
	11447	case 'a':
	11448	case 's':
	11449	case 'S':
	11450	case 'u':
	11451	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	11452	break;
	11453	case 'f':
	11454	Safefree(ri->data->data[n]);
	11455	break;
	11456	case 'p':
	11457	new_comppad = MUTABLE_AV(ri->data->data[n]);
	11458	break;
	11459	case 'o':
	11460	if (new_comppad == NULL)
	11461	Perl_croak(aTHX_ "panic: pregfree comppad");
	11462	PAD_SAVE_LOCAL(old_comppad,
	11463	/* Watch out for global destruction's random ordering. */
	11464	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	11465	);
	11466	OP_REFCNT_LOCK;
	11467	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	11468	OP_REFCNT_UNLOCK;
	11469	if (!refcnt)
	11470	op_free((OP_4tree*)ri->data->data[n]);
	11471
	11472	PAD_RESTORE_LOCAL(old_comppad);
	11473	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	11474	new_comppad = NULL;
	11475	break;
	11476	case 'n':
	11477	break;
	11478	case 'T':
	11479	{ /* Aho Corasick add-on structure for a trie node.
	11480	Used in stclass optimization only */
	11481	U32 refcount;
	11482	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	11483	OP_REFCNT_LOCK;
	11484	refcount = --aho->refcount;
	11485	OP_REFCNT_UNLOCK;
	11486	if ( !refcount ) {
	11487	PerlMemShared_free(aho->states);
	11488	PerlMemShared_free(aho->fail);
	11489	/* do this last!!!! */
	11490	PerlMemShared_free(ri->data->data[n]);
	11491	PerlMemShared_free(ri->regstclass);
	11492	}
	11493	}
	11494	break;
	11495	case 't':
	11496	{
	11497	/* trie structure. */
	11498	U32 refcount;
	11499	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	11500	OP_REFCNT_LOCK;
	11501	refcount = --trie->refcount;
	11502	OP_REFCNT_UNLOCK;
	11503	if ( !refcount ) {
	11504	PerlMemShared_free(trie->charmap);
	11505	PerlMemShared_free(trie->states);
	11506	PerlMemShared_free(trie->trans);
	11507	if (trie->bitmap)
	11508	PerlMemShared_free(trie->bitmap);
	11509	if (trie->jump)
	11510	PerlMemShared_free(trie->jump);
	11511	PerlMemShared_free(trie->wordinfo);
	11512	/* do this last!!!! */
	11513	PerlMemShared_free(ri->data->data[n]);
	11514	}
	11515	}
	11516	break;
	11517	default:
	11518	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	11519	}
	11520	}
	11521	Safefree(ri->data->what);
	11522	Safefree(ri->data);
	11523	}
	11524
	11525	Safefree(ri);
	11526	}
	11527
	11528	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	11529	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	11530	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	11531
	11532	/*
	11533	re_dup - duplicate a regexp.
	11534
	11535	This routine is expected to clone a given regexp structure. It is only
	11536	compiled under USE_ITHREADS.
	11537
	11538	After all of the core data stored in struct regexp is duplicated
	11539	the regexp_engine.dupe method is used to copy any private data
	11540	stored in the *pprivate pointer. This allows extensions to handle
	11541	any duplication it needs to do.
	11542
	11543	See pregfree() and regfree_internal() if you change anything here.
	11544	*/
	11545	#if defined(USE_ITHREADS)
	11546	#ifndef PERL_IN_XSUB_RE
	11547	void
	11548	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	11549	{
	11550	dVAR;
	11551	I32 npar;
	11552	const struct regexp r = (const struct regexp )SvANY(sstr);
	11553	struct regexp ret = (struct regexp )SvANY(dstr);
	11554
	11555	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	11556
	11557	npar = r->nparens+1;
	11558	Newx(ret->offs, npar, regexp_paren_pair);
	11559	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11560	if(ret->swap) {
	11561	/* no need to copy these */
	11562	Newx(ret->swap, npar, regexp_paren_pair);
	11563	}
	11564
	11565	if (ret->substrs) {
	11566	/* Do it this way to avoid reading from *r after the StructCopy().
	11567	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	11568	cache, it doesn't matter. */
	11569	const bool anchored = r->check_substr
	11570	? r->check_substr == r->anchored_substr
	11571	: r->check_utf8 == r->anchored_utf8;
	11572	Newx(ret->substrs, 1, struct reg_substr_data);
	11573	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	11574
	11575	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	11576	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	11577	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	11578	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	11579
	11580	/* check_substr and check_utf8, if non-NULL, point to either their
	11581	anchored or float namesakes, and don't hold a second reference. */
	11582
	11583	if (ret->check_substr) {
	11584	if (anchored) {
	11585	assert(r->check_utf8 == r->anchored_utf8);
	11586	ret->check_substr = ret->anchored_substr;
	11587	ret->check_utf8 = ret->anchored_utf8;
	11588	} else {
	11589	assert(r->check_substr == r->float_substr);
	11590	assert(r->check_utf8 == r->float_utf8);
	11591	ret->check_substr = ret->float_substr;
	11592	ret->check_utf8 = ret->float_utf8;
	11593	}
	11594	} else if (ret->check_utf8) {
	11595	if (anchored) {
	11596	ret->check_utf8 = ret->anchored_utf8;
	11597	} else {
	11598	ret->check_utf8 = ret->float_utf8;
	11599	}
	11600	}
	11601	}
	11602
	11603	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	11604
	11605	if (ret->pprivate)
	11606	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	11607
	11608	if (RX_MATCH_COPIED(dstr))
	11609	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	11610	else
	11611	ret->subbeg = NULL;
	11612	#ifdef PERL_OLD_COPY_ON_WRITE
	11613	ret->saved_copy = NULL;
	11614	#endif
	11615
	11616	if (ret->mother_re) {
	11617	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	11618	/* Our storage points directly to our mother regexp, but that's
	11619	1: a buffer in a different thread
	11620	2: something we no longer hold a reference on
	11621	so we need to copy it locally. */
	11622	/* Note we need to sue SvCUR() on our mother_re, because it, in
	11623	turn, may well be pointing to its own mother_re. */
	11624	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	11625	SvCUR(ret->mother_re)+1));
	11626	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	11627	}
	11628	ret->mother_re = NULL;
	11629	}
	11630	ret->gofs = 0;
	11631	}
	11632	#endif /* PERL_IN_XSUB_RE */
	11633
	11634	/*
	11635	regdupe_internal()
	11636
	11637	This is the internal complement to regdupe() which is used to copy
	11638	the structure pointed to by the *pprivate pointer in the regexp.
	11639	This is the core version of the extension overridable cloning hook.
	11640	The regexp structure being duplicated will be copied by perl prior
	11641	to this and will be provided as the regexp *r argument, however
	11642	with the /old/ structures pprivate pointer value. Thus this routine
	11643	may override any copying normally done by perl.
	11644
	11645	It returns a pointer to the new regexp_internal structure.
	11646	*/
	11647
	11648	void *
	11649	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	11650	{
	11651	dVAR;
	11652	struct regexp const r = (struct regexp )SvANY(rx);
	11653	regexp_internal *reti;
	11654	int len, npar;
	11655	RXi_GET_DECL(r,ri);
	11656
	11657	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	11658
	11659	npar = r->nparens+1;
	11660	len = ProgLen(ri);
	11661
	11662	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	11663	Copy(ri->program, reti->program, len+1, regnode);
	11664
	11665
	11666	reti->regstclass = NULL;
	11667
	11668	if (ri->data) {
	11669	struct reg_data *d;
	11670	const int count = ri->data->count;
	11671	int i;
	11672
	11673	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	11674	char, struct reg_data);
	11675	Newx(d->what, count, U8);
	11676
	11677	d->count = count;
	11678	for (i = 0; i < count; i++) {
	11679	d->what[i] = ri->data->what[i];
	11680	switch (d->what[i]) {
	11681	/* legal options are one of: sSfpontTua
	11682	see also regcomp.h and pregfree() */
	11683	case 'a': /* actually an AV, but the dup function is identical. */
	11684	case 's':
	11685	case 'S':
	11686	case 'p': /* actually an AV, but the dup function is identical. */
	11687	case 'u': /* actually an HV, but the dup function is identical. */
	11688	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	11689	break;
	11690	case 'f':
	11691	/* This is cheating. */
	11692	Newx(d->data[i], 1, struct regnode_charclass_class);
	11693	StructCopy(ri->data->data[i], d->data[i],
	11694	struct regnode_charclass_class);
	11695	reti->regstclass = (regnode*)d->data[i];
	11696	break;
	11697	case 'o':
	11698	/* Compiled op trees are readonly and in shared memory,
	11699	and can thus be shared without duplication. */
	11700	OP_REFCNT_LOCK;
	11701	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	11702	OP_REFCNT_UNLOCK;
	11703	break;
	11704	case 'T':
	11705	/* Trie stclasses are readonly and can thus be shared
	11706	* without duplication. We free the stclass in pregfree
	11707	* when the corresponding reg_ac_data struct is freed.
	11708	*/
	11709	reti->regstclass= ri->regstclass;
	11710	/* Fall through */
	11711	case 't':
	11712	OP_REFCNT_LOCK;
	11713	((reg_trie_data*)ri->data->data[i])->refcount++;
	11714	OP_REFCNT_UNLOCK;
	11715	/* Fall through */
	11716	case 'n':
	11717	d->data[i] = ri->data->data[i];
	11718	break;
	11719	default:
	11720	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	11721	}
	11722	}
	11723
	11724	reti->data = d;
	11725	}
	11726	else
	11727	reti->data = NULL;
	11728
	11729	reti->name_list_idx = ri->name_list_idx;
	11730
	11731	#ifdef RE_TRACK_PATTERN_OFFSETS
	11732	if (ri->u.offsets) {
	11733	Newx(reti->u.offsets, 2*len+1, U32);
	11734	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	11735	}
	11736	#else
	11737	SetProgLen(reti,len);
	11738	#endif
	11739
	11740	return (void*)reti;
	11741	}
	11742
	11743	#endif /* USE_ITHREADS */
	11744
	11745	#ifndef PERL_IN_XSUB_RE
	11746
	11747	/*
	11748	- regnext - dig the "next" pointer out of a node
	11749	*/
	11750	regnode *
	11751	Perl_regnext(pTHX_ register regnode *p)
	11752	{
	11753	dVAR;
	11754	register I32 offset;
	11755
	11756	if (!p)
	11757	return(NULL);
	11758
	11759	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	11760	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	11761	}
	11762
	11763	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	11764	if (offset == 0)
	11765	return(NULL);
	11766
	11767	return(p+offset);
	11768	}
	11769	#endif
	11770
	11771	STATIC void
	11772	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	11773	{
	11774	va_list args;
	11775	STRLEN l1 = strlen(pat1);
	11776	STRLEN l2 = strlen(pat2);
	11777	char buf[512];
	11778	SV *msv;
	11779	const char *message;
	11780
	11781	PERL_ARGS_ASSERT_RE_CROAK2;
	11782
	11783	if (l1 > 510)
	11784	l1 = 510;
	11785	if (l1 + l2 > 510)
	11786	l2 = 510 - l1;
	11787	Copy(pat1, buf, l1 , char);
	11788	Copy(pat2, buf + l1, l2 , char);
	11789	buf[l1 + l2] = '\n';
	11790	buf[l1 + l2 + 1] = '\0';
	11791	#ifdef I_STDARG
	11792	/* ANSI variant takes additional second argument */
	11793	va_start(args, pat2);
	11794	#else
	11795	va_start(args);
	11796	#endif
	11797	msv = vmess(buf, &args);
	11798	va_end(args);
	11799	message = SvPV_const(msv,l1);
	11800	if (l1 > 512)
	11801	l1 = 512;
	11802	Copy(message, buf, l1 , char);
	11803	buf[l1-1] = '\0'; /* Overwrite \n */
	11804	Perl_croak(aTHX_ "%s", buf);
	11805	}
	11806
	11807	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	11808
	11809	#ifndef PERL_IN_XSUB_RE
	11810	void
	11811	Perl_save_re_context(pTHX)
	11812	{
	11813	dVAR;
	11814
	11815	struct re_save_state *state;
	11816
	11817	SAVEVPTR(PL_curcop);
	11818	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	11819
	11820	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	11821	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	11822	SSPUSHUV(SAVEt_RE_STATE);
	11823
	11824	Copy(&PL_reg_state, state, 1, struct re_save_state);
	11825
	11826	PL_reg_start_tmp = 0;
	11827	PL_reg_start_tmpl = 0;
	11828	PL_reg_oldsaved = NULL;
	11829	PL_reg_oldsavedlen = 0;
	11830	PL_reg_maxiter = 0;
	11831	PL_reg_leftiter = 0;
	11832	PL_reg_poscache = NULL;
	11833	PL_reg_poscache_size = 0;
	11834	#ifdef PERL_OLD_COPY_ON_WRITE
	11835	PL_nrs = NULL;
	11836	#endif
	11837
	11838	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	11839	if (PL_curpm) {
	11840	const REGEXP * const rx = PM_GETRE(PL_curpm);
	11841	if (rx) {
	11842	U32 i;
	11843	for (i = 1; i <= RX_NPARENS(rx); i++) {
	11844	char digits[TYPE_CHARS(long)];
	11845	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	11846	GV const const gvp
	11847	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	11848
	11849	if (gvp) {
	11850	GV * const gv = *gvp;
	11851	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	11852	save_scalar(gv);
	11853	}
	11854	}
	11855	}
	11856	}
	11857	}
	11858	#endif
	11859
	11860	static void
	11861	clear_re(pTHX_ void *r)
	11862	{
	11863	dVAR;
	11864	ReREFCNT_dec((REGEXP *)r);
	11865	}
	11866
	11867	#ifdef DEBUGGING
	11868
	11869	STATIC void
	11870	S_put_byte(pTHX_ SV *sv, int c)
	11871	{
	11872	PERL_ARGS_ASSERT_PUT_BYTE;
	11873
	11874	/* Our definition of isPRINT() ignores locales, so only bytes that are
	11875	not part of UTF-8 are considered printable. I assume that the same
	11876	holds for UTF-EBCDIC.
	11877	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	11878	which Wikipedia says:
	11879
	11880	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	11881	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	11882	identical, to the ASCII delete (DEL) or rubout control character.
	11883	) So the old condition can be simplified to !isPRINT(c) */
	11884	if (!isPRINT(c)) {
	11885	if (c < 256) {
	11886	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	11887	}
	11888	else {
	11889	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	11890	}
	11891	}
	11892	else {
	11893	const char string = c;
	11894	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	11895	sv_catpvs(sv, "\\");
	11896	sv_catpvn(sv, &string, 1);
	11897	}
	11898	}
	11899
	11900
	11901	#define CLEAR_OPTSTART \
	11902	if (optstart) STMT_START { \
	11903	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	11904	optstart=NULL; \
	11905	} STMT_END
	11906
	11907	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	11908
	11909	STATIC const regnode *
	11910	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	11911	const regnode last, const regnode plast,
	11912	SV* sv, I32 indent, U32 depth)
	11913	{
	11914	dVAR;
	11915	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	11916	register const regnode *next;
	11917	const regnode *optstart= NULL;
	11918
	11919	RXi_GET_DECL(r,ri);
	11920	GET_RE_DEBUG_FLAGS_DECL;
	11921
	11922	PERL_ARGS_ASSERT_DUMPUNTIL;
	11923
	11924	#ifdef DEBUG_DUMPUNTIL
	11925	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	11926	last ? last-start : 0,plast ? plast-start : 0);
	11927	#endif
	11928
	11929	if (plast && plast < last)
	11930	last= plast;
	11931
	11932	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	11933	/* While that wasn't END last time... */
	11934	NODE_ALIGN(node);
	11935	op = OP(node);
	11936	if (op == CLOSE \|\| op == WHILEM)
	11937	indent--;
	11938	next = regnext((regnode *)node);
	11939
	11940	/* Where, what. */
	11941	if (OP(node) == OPTIMIZED) {
	11942	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	11943	optstart = node;
	11944	else
	11945	goto after_print;
	11946	} else
	11947	CLEAR_OPTSTART;
	11948
	11949	regprop(r, sv, node);
	11950	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	11951	(int)(2*indent + 1), "", SvPVX_const(sv));
	11952
	11953	if (OP(node) != OPTIMIZED) {
	11954	if (next == NULL) /* Next ptr. */
	11955	PerlIO_printf(Perl_debug_log, " (0)");
	11956	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	11957	PerlIO_printf(Perl_debug_log, " (FAIL)");
	11958	else
	11959	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	11960	(void)PerlIO_putc(Perl_debug_log, '\n');
	11961	}
	11962
	11963	after_print:
	11964	if (PL_regkind[(U8)op] == BRANCHJ) {
	11965	assert(next);
	11966	{
	11967	register const regnode *nnode = (OP(next) == LONGJMP
	11968	? regnext((regnode *)next)
	11969	: next);
	11970	if (last && nnode > last)
	11971	nnode = last;
	11972	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	11973	}
	11974	}
	11975	else if (PL_regkind[(U8)op] == BRANCH) {
	11976	assert(next);
	11977	DUMPUNTIL(NEXTOPER(node), next);
	11978	}
	11979	else if ( PL_regkind[(U8)op] == TRIE ) {
	11980	const regnode *this_trie = node;
	11981	const char op = OP(node);
	11982	const U32 n = ARG(node);
	11983	const reg_ac_data * const ac = op>=AHOCORASICK ?
	11984	(reg_ac_data *)ri->data->data[n] :
	11985	NULL;
	11986	const reg_trie_data * const trie =
	11987	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	11988	#ifdef DEBUGGING
	11989	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	11990	#endif
	11991	const regnode *nextbranch= NULL;
	11992	I32 word_idx;
	11993	sv_setpvs(sv, "");
	11994	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	11995	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	11996
	11997	PerlIO_printf(Perl_debug_log, "%*s%s ",
	11998	(int)(2*(indent+3)), "",
	11999	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	12000	PL_colors[0], PL_colors[1],
	12001	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	12002	PERL_PV_PRETTY_ELLIPSES \|
	12003	PERL_PV_PRETTY_LTGT
	12004	)
	12005	: "???"
	12006	);
	12007	if (trie->jump) {
	12008	U16 dist= trie->jump[word_idx+1];
	12009	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	12010	(UV)((dist ? this_trie + dist : next) - start));
	12011	if (dist) {
	12012	if (!nextbranch)
	12013	nextbranch= this_trie + trie->jump[0];
	12014	DUMPUNTIL(this_trie + dist, nextbranch);
	12015	}
	12016	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	12017	nextbranch= regnext((regnode *)nextbranch);
	12018	} else {
	12019	PerlIO_printf(Perl_debug_log, "\n");
	12020	}
	12021	}
	12022	if (last && next > last)
	12023	node= last;
	12024	else
	12025	node= next;
	12026	}
	12027	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	12028	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	12029	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	12030	}
	12031	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	12032	assert(next);
	12033	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	12034	}
	12035	else if ( op == PLUS \|\| op == STAR) {
	12036	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	12037	}
	12038	else if (PL_regkind[(U8)op] == ANYOF) {
	12039	/* arglen 1 + class block */
	12040	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	12041	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	12042	node = NEXTOPER(node);
	12043	}
	12044	else if (PL_regkind[(U8)op] == EXACT) {
	12045	/* Literal string, where present. */
	12046	node += NODE_SZ_STR(node) - 1;
	12047	node = NEXTOPER(node);
	12048	}
	12049	else {
	12050	node = NEXTOPER(node);
	12051	node += regarglen[(U8)op];
	12052	}
	12053	if (op == CURLYX \|\| op == OPEN)
	12054	indent++;
	12055	}
	12056	CLEAR_OPTSTART;
	12057	#ifdef DEBUG_DUMPUNTIL
	12058	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	12059	#endif
	12060	return node;
	12061	}
	12062
	12063	#endif /* DEBUGGING */
	12064
	12065	/*
	12066	* Local variables:
	12067	* c-indentation-style: bsd
	12068	* c-basic-offset: 4
	12069	* indent-tabs-mode: t
	12070	* End:
	12071	*
	12072	* ex: set ts=8 sts=4 sw=4 noet:
	12073	*/