perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* be confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	#else
	85	# include "regcomp.h"
	86	#endif
	87
	88	#include "dquote_static.c"
	89
	90	#ifdef op
	91	#undef op
	92	#endif /* op */
	93
	94	#ifdef MSDOS
	95	# if defined(BUGGY_MSC6)
	96	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	97	# pragma optimize("a",off)
	98	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	99	# pragma optimize("w",on )
	100	# endif /* BUGGY_MSC6 */
	101	#endif /* MSDOS */
	102
	103	#ifndef STATIC
	104	#define STATIC static
	105	#endif
	106	/* State carried through the compilation of one regex: input pointers, emit pointers and counters. */
	107	typedef struct RExC_state_t {
	108	    U32 flags; /* are we folding, multilining? */
	109	    char *precomp; /* uncompiled string. */
	110	    REGEXP *rx_sv; /* The SV that is the regexp. */
	111	    regexp *rx; /* perl core regexp structure */
	112	    regexp_internal *rxi; /* internal data for regexp object pprivate field */
	113	    char *start; /* Start of input for compile */
	114	    char *end; /* End of input for compile */
	115	    char *parse; /* Input-scan pointer. */
	116	    I32 whilem_seen; /* number of WHILEM in this expr */
	117	    regnode *emit_start; /* Start of emitted-code area */
	118	    regnode *emit_bound; /* First regnode outside of the allocated space */
	119	    regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
	120	    I32 naughty; /* How bad is this pattern? */
	121	    I32 sawback; /* Did we see \1, ...? */
	122	    U32 seen;
	123	    I32 size; /* Code size. */
	124	    I32 npar; /* Capture buffer count, (OPEN). */
	125	    I32 cpar; /* Capture buffer count, (CLOSE). */
	126	    I32 nestroot; /* root parens we are in - used by accept */
	127	    I32 extralen;
	128	    I32 seen_zerolen;
	129	    I32 seen_evals;
	130	    regnode **open_parens; /* pointers to open parens */
	131	    regnode **close_parens; /* pointers to close parens */
	132	    regnode *opend; /* END node in program */
	133	    I32 utf8; /* whether the pattern is utf8 or not */
	134	    I32 orig_utf8; /* whether the pattern was originally in utf8 */
	135	    /* XXX use this for future optimisation of case
	136	    * where pattern must be upgraded to utf8. */
	137	    I32 uni_semantics; /* If a d charset modifier should use unicode
	138	    rules, even if the pattern is not in
	139	    utf8 */
	140	    HV *paren_names; /* Paren names */
	141	
	142	    regnode **recurse; /* Recurse regops */
	143	    I32 recurse_count; /* Number of recurse regops */
	144	    I32 in_lookbehind;
	145	    I32 contains_locale;
	146	    I32 override_recoding;
	147	#if ADD_TO_REGEXEC
	148	    char *starttry; /* -Dr: where regtry was called. */
	149	#define RExC_starttry (pRExC_state->starttry)
	150	#endif
	151	#ifdef DEBUGGING
	152	    const char *lastparse;
	153	    I32 lastnum;
	154	    AV *paren_name_list; /* idx -> name */
	155	#define RExC_lastparse (pRExC_state->lastparse)
	156	#define RExC_lastnum (pRExC_state->lastnum)
	157	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	158	#endif
	159	} RExC_state_t;
	160	/* Shorthand for RExC_state_t members; each expands through a pointer that must be named pRExC_state at the point of use. */
	161	#define RExC_flags (pRExC_state->flags)
	162	#define RExC_precomp (pRExC_state->precomp)
	163	#define RExC_rx_sv (pRExC_state->rx_sv)
	164	#define RExC_rx (pRExC_state->rx)
	165	#define RExC_rxi (pRExC_state->rxi)
	166	#define RExC_start (pRExC_state->start)
	167	#define RExC_end (pRExC_state->end)
	168	#define RExC_parse (pRExC_state->parse)
	169	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	170	#ifdef RE_TRACK_PATTERN_OFFSETS
	171	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others: reached through rxi, not a direct member */
	172	#endif
	173	#define RExC_emit (pRExC_state->emit)
	174	#define RExC_emit_start (pRExC_state->emit_start)
	175	#define RExC_emit_bound (pRExC_state->emit_bound)
	176	#define RExC_naughty (pRExC_state->naughty)
	177	#define RExC_sawback (pRExC_state->sawback)
	178	#define RExC_seen (pRExC_state->seen)
	179	#define RExC_size (pRExC_state->size)
	180	#define RExC_npar (pRExC_state->npar)
	181	#define RExC_nestroot (pRExC_state->nestroot)
	182	#define RExC_extralen (pRExC_state->extralen)
	183	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	184	#define RExC_seen_evals (pRExC_state->seen_evals)
	185	#define RExC_utf8 (pRExC_state->utf8)
	186	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	187	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	188	#define RExC_open_parens (pRExC_state->open_parens)
	189	#define RExC_close_parens (pRExC_state->close_parens)
	190	#define RExC_opend (pRExC_state->opend)
	191	#define RExC_paren_names (pRExC_state->paren_names)
	192	#define RExC_recurse (pRExC_state->recurse)
	193	#define RExC_recurse_count (pRExC_state->recurse_count)
	194	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	195	#define RExC_contains_locale (pRExC_state->contains_locale)
	196	#define RExC_override_recoding (pRExC_state->override_recoding)
	197
	198	/* ISMULT1(c): char c is '*', '+' or '?'. ISMULT2(s): same test on *s, and also true for '{' when regcurly(s) is true. */
	199	#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
	200	#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
	201	((*s) == '{' && regcurly(s)))
	202
	203	#ifdef SPSTART
	204	#undef SPSTART /* dratted cpp namespace... */
	205	#endif
	206	/*
	207	* Flags to be passed up and down.
	208	*/
	209	#define WORST 0 /* Worst case. */
	210	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	211
	212	/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
	213	* character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
	214	#define SIMPLE 0x02
	215	#define SPSTART 0x04 /* Starts with * or +. */
	216	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	217	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	218	/* Index of node x counted from RExC_emit_start, or -1 when x is NULL */
	219	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	220
	221	/* whether trie related optimizations are enabled */
	222	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	223	#define TRIE_STUDY_OPT
	224	#define FULL_TRIE_STUDY
	225	#define TRIE_STCLASS
	226	#endif
	227
	228
	229	/* Bit-vector helpers over a U8 array: bit 'paren' lives in byte (paren >> 3), at bit position (paren & 7) */
	230	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	231	#define PBITVAL(paren) (1 << ((paren) & 7))
	232	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	233	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
	234	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	235
	236	/* If not already in utf8, do a longjmp back to the beginning */
	237	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	238	#define REQUIRE_UTF8 STMT_START { \
	239	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	240	} STMT_END
	241
	242	/* About scan_data_t.
	243
	244	During optimisation we recurse through the regexp program performing
	245	various inplace (keyhole style) optimisations. In addition study_chunk
	246	and scan_commit populate this data structure with information about
	247	what strings MUST appear in the pattern. We look for the longest
	248	string that must appear at a fixed location, and we look for the
	249	longest string that may appear at a floating location. So for instance
	250	in the pattern:
	251
	252	/FOO[xX]A.*B[xX]BAR/
	253
	254	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	255	strings (because they follow a .* construct). study_chunk will identify
	256	both FOO and BAR as being the longest fixed and floating strings respectively.
	257
	258	The strings can be composites, for instance
	259
	260	/(f)(o)(o)/
	261
	262	will result in a composite fixed substring 'foo'.
	263
	264	For each string some basic information is maintained:
	265
	266	- offset or min_offset
	267	This is the position the string must appear at, or not before.
	268	It also implicitly (when combined with minlenp) tells us how many
	269	characters must match before the string we are searching for.
	270	Likewise when combined with minlenp and the length of the string it
	271	tells us how many characters must appear after the string we have
	272	found.
	273
	274	- max_offset
	275	Only used for floating strings. This is the rightmost point that
	276	the string can appear at. If set to I32 max it indicates that the
	277	string can occur infinitely far to the right.
	278
	279	- minlenp
	280	A pointer to the minimum length of the pattern that the string
	281	was found inside. This is important as in the case of positive
	282	lookahead or positive lookbehind we can have multiple patterns
	283	involved. Consider
	284
	285	/(?=FOO).*F/
	286
	287	The minimum length of the pattern overall is 3, the minimum length
	288	of the lookahead part is 3, but the minimum length of the part that
	289	will actually match is 1. So 'FOO's minimum length is 3, but the
	290	minimum length for the F is 1. This is important as the minimum length
	291	is used to determine offsets in front of and behind the string being
	292	looked for. Since strings can be composites this is the length of the
	293	pattern at the time it was committed with a scan_commit. Note that
	294	the length is calculated by study_chunk, so that the minimum lengths
	295	are not known until the full pattern has been compiled, thus the
	296	pointer to the value.
	297
	298	- lookbehind
	299
	300	In the case of lookbehind the string being searched for can be
	301	offset past the start point of the final matching string.
	302	If this value was just blithely removed from the min_offset it would
	303	invalidate some of the calculations for how many chars must match
	304	before or after (as they are derived from min_offset and minlen and
	305	the length of the string being searched for).
	306	When the final pattern is compiled and the data is moved from the
	307	scan_data_t structure into the regexp structure the information
	308	about lookbehind is factored in, with the information that would
	309	have been lost precalculated in the end_shift field for the
	310	associated string.
	311
	312	The fields pos_min and pos_delta are used to store the minimum offset
	313	and the delta to the maximum offset at the current point in the pattern.
	314
	315	*/
	316
	317	typedef struct scan_data_t {
	318	    /*I32 len_min; unused */
	319	    /*I32 len_delta; unused */
	320	    I32 pos_min;
	321	    I32 pos_delta;
	322	    SV *last_found;
	323	    I32 last_end; /* min value, <0 unless valid. */
	324	    I32 last_start_min;
	325	    I32 last_start_max;
	326	    SV **longest; /* Either &l_fixed, or &l_float. */
	327	    SV *longest_fixed; /* longest fixed string found in pattern */
	328	    I32 offset_fixed; /* offset where it starts */
	329	    I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
	330	    I32 lookbehind_fixed; /* is the position of the string modified by LB */
	331	    SV *longest_float; /* longest floating string found in pattern */
	332	    I32 offset_float_min; /* earliest point in string it can appear */
	333	    I32 offset_float_max; /* latest point in string it can appear */
	334	    I32 *minlen_float; /* pointer to the minlen relevant to the string */
	335	    I32 lookbehind_float; /* is the position of the string modified by LB */
	336	    I32 flags;
	337	    I32 whilem_c;
	338	    I32 *last_closep;
	339	    struct regnode_charclass_class *start_class;
	340	} scan_data_t;
	341
	342	/*
	343	* Forward declarations for pregcomp()'s friends.
	344	*/
	345	/* All-zero scan_data_t value; the 20 initializers correspond to the 20 members of the struct */
	346	static const scan_data_t zero_scan_data =
	347	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	348	/* Flag bits; the SF_* ones are stored in scan_data_t.flags. The FIX/FL variants are the plain EOL bits shifted left by 2 and 4. */
	349	#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
	350	#define SF_BEFORE_SEOL 0x0001
	351	#define SF_BEFORE_MEOL 0x0002
	352	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
	353	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
	354	
	355	#ifdef NO_UNARY_PLUS
	356	# define SF_FIX_SHIFT_EOL (0+2)
	357	# define SF_FL_SHIFT_EOL (0+4)
	358	#else
	359	# define SF_FIX_SHIFT_EOL (+2)
	360	# define SF_FL_SHIFT_EOL (+4)
	361	#endif
	362	
	363	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	364	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	365	
	366	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	367	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	368	#define SF_IS_INF 0x0040
	369	#define SF_HAS_PAR 0x0080
	370	#define SF_IN_PAR 0x0100
	371	#define SF_HAS_EVAL 0x0200
	372	#define SCF_DO_SUBSTR 0x0400
	373	#define SCF_DO_STCLASS_AND 0x0800
	374	#define SCF_DO_STCLASS_OR 0x1000
	375	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
	376	#define SCF_WHILEM_VISITED_POS 0x2000
	377	
	378	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	379	#define SCF_SEEN_ACCEPT 0x8000
	380
	381	#define UTF cBOOL(RExC_utf8)
	382	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	383	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	384	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	385	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	386	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	387	#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	388	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	389
	390	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	391
	392	#define OOB_UNICODE 12345678
	393	#define OOB_NAMEDCLASS -1
	394	/* Length of sv and distance a - b: via sv_len_utf8()/utf8_distance() when UTF, else SvCUR()/plain subtraction. Arguments are parenthesised so expressions are safe. */
	395	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	396	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
	397
	398
	399	/* length of regex to show in messages that don't mark a position within */
	400	#define RegexLengthToShowInErrorMessages 127
	401
	402	/*
	403	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	404	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	405	* op/pragma/warn/regcomp.
	406	*/
	407	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	408	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	409
	410	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	411
	412	/*
	413	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	414	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	415	* "...".
	416	*/
	417	#define _FAIL(code) STMT_START { \
	418	const char *ellipses = ""; \
	419	IV len = RExC_end - RExC_precomp; \
	420	\
	421	if (!SIZE_ONLY) \
	422	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	423	if (len > RegexLengthToShowInErrorMessages) { \
	424	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	425	len = RegexLengthToShowInErrorMessages - 10; \
	426	ellipses = "..."; \
	427	} \
	428	code; \
	429	} STMT_END
	430
	431	#define FAIL(msg) _FAIL( \
	432	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	433	msg, (int)len, RExC_precomp, ellipses))
	434
	435	#define FAIL2(msg,arg) _FAIL( \
	436	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	437	arg, (int)len, RExC_precomp, ellipses))
	438
	439	/*
	440	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	441	*/
	442	#define Simple_vFAIL(m) STMT_START { \
	443	const IV offset = RExC_parse - RExC_precomp; \
	444	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	445	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	446	} STMT_END
	447
	448	/*
	449	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	450	*/
	451	#define vFAIL(m) STMT_START { \
	452	if (!SIZE_ONLY) \
	453	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	454	Simple_vFAIL(m); \
	455	} STMT_END
	456
	457	/*
	458	* Like Simple_vFAIL(), but accepts two arguments.
	459	*/
	460	#define Simple_vFAIL2(m,a1) STMT_START { \
	461	const IV offset = RExC_parse - RExC_precomp; \
	462	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	463	(int)offset, RExC_precomp, RExC_precomp + offset); \
	464	} STMT_END
	465
	466	/*
	467	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	468	*/
	469	#define vFAIL2(m,a1) STMT_START { \
	470	if (!SIZE_ONLY) \
	471	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	472	Simple_vFAIL2(m, a1); \
	473	} STMT_END
	474
	475
	476	/*
	477	* Like Simple_vFAIL(), but accepts three arguments.
	478	*/
	479	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	480	const IV offset = RExC_parse - RExC_precomp; \
	481	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	482	(int)offset, RExC_precomp, RExC_precomp + offset); \
	483	} STMT_END
	484
	485	/*
	486	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	487	*/
	488	#define vFAIL3(m,a1,a2) STMT_START { \
	489	if (!SIZE_ONLY) \
	490	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	491	Simple_vFAIL3(m, a1, a2); \
	492	} STMT_END
	493
	494	/*
	495	* Like Simple_vFAIL(), but accepts four arguments.
	496	*/
	497	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	498	const IV offset = RExC_parse - RExC_precomp; \
	499	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	500	(int)offset, RExC_precomp, RExC_precomp + offset); \
	501	} STMT_END
	502
	503	#define ckWARNreg(loc,m) STMT_START { \
	504	const IV offset = loc - RExC_precomp; \
	505	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	506	(int)offset, RExC_precomp, RExC_precomp + offset); \
	507	} STMT_END
	508
	509	#define ckWARNregdep(loc,m) STMT_START { \
	510	const IV offset = loc - RExC_precomp; \
	511	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	512	m REPORT_LOCATION, \
	513	(int)offset, RExC_precomp, RExC_precomp + offset); \
	514	} STMT_END
	515
	516	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	517	const IV offset = loc - RExC_precomp; \
	518	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	519	m REPORT_LOCATION, \
	520	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	521	} STMT_END
	522
	523	#define ckWARN2reg(loc, m, a1) STMT_START { \
	524	const IV offset = loc - RExC_precomp; \
	525	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	526	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	527	} STMT_END
	528
	529	#define vWARN3(loc, m, a1, a2) STMT_START { \
	530	const IV offset = loc - RExC_precomp; \
	531	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	532	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	533	} STMT_END
	534
	535	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	536	const IV offset = loc - RExC_precomp; \
	537	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	538	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	539	} STMT_END
	540
	541	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	542	const IV offset = loc - RExC_precomp; \
	543	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	544	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	545	} STMT_END
	546
	547	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	548	const IV offset = loc - RExC_precomp; \
	549	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	550	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	551	} STMT_END
	552
	553	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	554	const IV offset = loc - RExC_precomp; \
	555	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	556	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	557	} STMT_END
	558
	559
	560	/* Allow for side effects in s */
	561	#define REGC(c,s) STMT_START { \
	562	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	563	} STMT_END
	564
	565	/* Macros for recording node offsets. 20001227 mjd@plover.com
	566	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	567	* element 2*n-1 of the array. Element #2n holds the byte length of node #n.
	568	* Element 0 holds the number n.
	569	* Position is 1 indexed.
	570	*/
	571	#ifndef RE_TRACK_PATTERN_OFFSETS
	572	#define Set_Node_Offset_To_R(node,byte)
	573	#define Set_Node_Offset(node,byte)
	574	#define Set_Cur_Node_Offset
	575	#define Set_Node_Length_To_R(node,len)
	576	#define Set_Node_Length(node,len)
	577	#define Set_Node_Cur_Length(node)
	578	#define Node_Offset(n)
	579	#define Node_Length(n)
	580	#define Set_Node_Offset_Length(node,offset,len)
	581	#define ProgLen(ri) ri->u.proglen
	582	#define SetProgLen(ri,x) ri->u.proglen = x
	583	#else
	584	#define ProgLen(ri) ri->u.offsets[0]
	585	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	586	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	587	if (! SIZE_ONLY) { \
	588	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	589	__LINE__, (int)(node), (int)(byte))); \
	590	if((node) < 0) { \
	591	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	592	} else { \
	593	RExC_offsets[2*(node)-1] = (byte); \
	594	} \
	595	} \
	596	} STMT_END
	597
	598	#define Set_Node_Offset(node,byte) \
	599	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	600	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	601
	602	#define Set_Node_Length_To_R(node,len) STMT_START { \
	603	if (! SIZE_ONLY) { \
	604	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	605	__LINE__, (int)(node), (int)(len))); \
	606	if((node) < 0) { \
	607	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	608	} else { \
	609	RExC_offsets[2*(node)] = (len); \
	610	} \
	611	} \
	612	} STMT_END
	613
	614	#define Set_Node_Length(node,len) \
	615	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	616	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
	617	#define Set_Node_Cur_Length(node) \
	618	Set_Node_Length(node, RExC_parse - parse_start)
	619
	620	/* Get offsets and lengths */
	621	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	622	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	623
	624	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	625	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	626	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	627	} STMT_END
	628	#endif
	629
	630	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	631	#define EXPERIMENTAL_INPLACESCAN
	632	#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
	633
	634	#define DEBUG_STUDYDATA(str,data,depth) \
	635	DEBUG_OPTIMISE_MORE_r(if(data){ \
	636	PerlIO_printf(Perl_debug_log, \
	637	"%*s" str "Pos:%"IVdf"/%"IVdf \
	638	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	639	(int)(depth)*2, "", \
	640	(IV)((data)->pos_min), \
	641	(IV)((data)->pos_delta), \
	642	(UV)((data)->flags), \
	643	(IV)((data)->whilem_c), \
	644	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	645	is_inf ? "INF " : "" \
	646	); \
	647	if ((data)->last_found) \
	648	PerlIO_printf(Perl_debug_log, \
	649	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	650	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	651	SvPVX_const((data)->last_found), \
	652	(IV)((data)->last_end), \
	653	(IV)((data)->last_start_min), \
	654	(IV)((data)->last_start_max), \
	655	((data)->longest && \
	656	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	657	SvPVX_const((data)->longest_fixed), \
	658	(IV)((data)->offset_fixed), \
	659	((data)->longest && \
	660	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	661	SvPVX_const((data)->longest_float), \
	662	(IV)((data)->offset_float_min), \
	663	(IV)((data)->offset_float_max) \
	664	); \
	665	PerlIO_printf(Perl_debug_log,"\n"); \
	666	});
	667
	668	static void clear_re(pTHX_ void *r);
	669
	670	/* Mark that we cannot extend a found fixed substring at this point.
	671	Update the longest found anchored substring and the longest found
	672	floating substrings if needed, recording 'minlenp' with the winner; then empty data->last_found and clear SF_BEFORE_EOL. */
	673	/* 'data->longest' selects which of the two is updated; a true 'is_inf' forces offset_float_max to I32_MAX. */
	674	STATIC void
	675	S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
	676	{
	677	    const STRLEN l = CHR_SVLEN(data->last_found);
	678	    const STRLEN old_l = CHR_SVLEN(*data->longest);
	679	    GET_RE_DEBUG_FLAGS_DECL;
	680	
	681	    PERL_ARGS_ASSERT_SCAN_COMMIT;
	682	
	683	    if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { /* longer, or equal length but before EOL */
	684	        SvSetMagicSV(*data->longest, data->last_found);
	685	        if (*data->longest == data->longest_fixed) {
	686	            data->offset_fixed = l ? data->last_start_min : data->pos_min;
	687	            if (data->flags & SF_BEFORE_EOL)
	688	                data->flags
	689	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); /* copy EOL bits into the FIX positions */
	690	            else
	691	                data->flags &= ~SF_FIX_BEFORE_EOL;
	692	            data->minlen_fixed=minlenp;
	693	            data->lookbehind_fixed=0;
	694	        }
	695	        else { /* *data->longest == data->longest_float */
	696	            data->offset_float_min = l ? data->last_start_min : data->pos_min;
	697	            data->offset_float_max = (l
	698	                                      ? data->last_start_max
	699	                                      : data->pos_min + data->pos_delta);
	700	            if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) /* clamp to I32_MAX */
	701	                data->offset_float_max = I32_MAX;
	702	            if (data->flags & SF_BEFORE_EOL)
	703	                data->flags
	704	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); /* copy EOL bits into the FL positions */
	705	            else
	706	                data->flags &= ~SF_FL_BEFORE_EOL;
	707	            data->minlen_float=minlenp;
	708	            data->lookbehind_float=0;
	709	        }
	710	    }
	711	    SvCUR_set(data->last_found, 0); /* empty last_found */
	712	    {
	713	        SV * const sv = data->last_found;
	714	        if (SvUTF8(sv) && SvMAGICAL(sv)) {
	715	            MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	716	            if (mg)
	717	                mg->mg_len = 0; /* reset mg_len of the utf8 magic as well */
	718	        }
	719	    }
	720	    data->last_end = -1;
	721	    data->flags &= ~SF_BEFORE_EOL;
	722	    DEBUG_STUDYDATA("commit: ",data,0);
	723	}
	724
	725	/* Can match anything (initialization): sets every bitmap bit of 'cl' and overwrites cl->flags; class bits depend on RExC_contains_locale */
	726	STATIC void
	727	S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	728	{
	729	    PERL_ARGS_ASSERT_CL_ANYTHING;
	730	
	731	    ANYOF_BITMAP_SETALL(cl);
	732	    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
	733	                |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
	734	
	735	    /* If any portion of the regex is to operate under locale rules,
	736	    * initialization includes it. The reason this isn't done for all regexes
	737	    * is that the optimizer was written under the assumption that locale was
	738	    * all-or-nothing. Given the complexity and lack of documentation in the
	739	    * optimizer, and that there are inadequate test cases for locale, so many
	740	    * parts of it may not work properly, it is safest to avoid locale unless
	741	    * necessary. */
	742	    if (RExC_contains_locale) {
	743	        ANYOF_CLASS_SETALL(cl); /* /l uses class */
	744	        cl->flags |= ANYOF_LOCALE;
	745	    }
	746	    else {
	747	        ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	748	    }
	749	}
	750
	751	/* Returns 1 when 'cl' can match anything, 0 otherwise. */
	752	STATIC int
	753	S_cl_is_anything(const struct regnode_charclass_class *cl)
	754	{
	755	    int class_num = 0;
	756	
	757	    PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	758	
	759	    /* An even-numbered class set together with the odd one after it suffices (presumably a class and its negation - confirm) */
	760	    while (class_num <= ANYOF_MAX) {
	761	        if (ANYOF_CLASS_TEST(cl, class_num) && ANYOF_CLASS_TEST(cl, class_num + 1))
	762	            return 1;
	763	        class_num += 2;
	764	    }
	765	    /* Otherwise both ANYOF_UNICODE_ALL and a completely set bitmap are required */
	766	    return ((cl->flags & ANYOF_UNICODE_ALL) && ANYOF_BITMAP_TESTALLSET((const void*)cl)) ? 1 : 0;
	767	}
	768
	769	/* Zero 'cl', make it an ANYOF node filled in by cl_anything(), and set its argument to ANYOF_NONBITMAP_EMPTY */
	770	STATIC void
	771	S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	772	{
	773	    PERL_ARGS_ASSERT_CL_INIT;
	774	
	775	    Zero(cl, 1, struct regnode_charclass_class); /* clear the whole node first */
	776	    cl->type = ANYOF;
	777	    cl_anything(pRExC_state, cl);
	778	    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	779	}
	780
	781	/* cl_init_zero is currently just another name for S_cl_init: the two do the exact same thing */
	782	#define cl_init_zero S_cl_init
	783
	784	/* 'AND' a given class with another one. Can create false positives. 'cl'
	785	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	786	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	787	STATIC void
	788	S_cl_and(struct regnode_charclass_class *cl,
	789	const struct regnode_charclass_class *and_with)
	790	{
	791	PERL_ARGS_ASSERT_CL_AND;
	792
	793	assert(and_with->type == ANYOF);
	794
	795	/* I (khw) am not sure all these restrictions are necessary XXX */
	796	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	797	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	798	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	799	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	800	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	801	int i;
	802
	803	if (and_with->flags & ANYOF_INVERT)
	804	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	805	cl->bitmap[i] &= ~and_with->bitmap[i];
	806	else
	807	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	808	cl->bitmap[i] &= and_with->bitmap[i];
	809	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	810
	811	if (and_with->flags & ANYOF_INVERT) {
	812
	813	/* Here, the and'ed node is inverted. Get the AND of the flags that
	814	* aren't affected by the inversion. Those that are affected are
	815	* handled individually below */
	816	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	817	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	818	cl->flags \|= affected_flags;
	819
	820	/* We currently don't know how to deal with things that aren't in the
	821	* bitmap, but we know that the intersection is no greater than what
	822	* is already in cl, so let there be false positives that get sorted
	823	* out after the synthetic start class succeeds, and the node is
	824	* matched for real. */
	825
	826	/* The inversion of these two flags indicate that the resulting
	827	* intersection doesn't have them */
	828	if (and_with->flags & ANYOF_UNICODE_ALL) {
	829	cl->flags &= ~ANYOF_UNICODE_ALL;
	830	}
	831	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	832	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	833	}
	834	}
	835	else { /* and'd node is not inverted */
	836	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	837
	838	if (! ANYOF_NONBITMAP(and_with)) {
	839
	840	/* Here 'and_with' doesn't match anything outside the bitmap
	841	* (except possibly ANYOF_UNICODE_ALL), which means the
	842	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	843	* which case we don't know what the intersection is, but it's no
	844	* greater than what cl already has, so can just leave it alone,
	845	* with possible false positives */
	846	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	847	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	848	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	849	}
	850	}
	851	else if (! ANYOF_NONBITMAP(cl)) {
	852
	853	/* Here, 'and_with' does match something outside the bitmap, and cl
	854	* doesn't have a list of things to match outside the bitmap. If
	855	* cl can match all code points above 255, the intersection will
	856	* be those above-255 code points that 'and_with' matches. If cl
	857	* can't match all Unicode code points, it means that it can't
	858	* match anything outside the bitmap (since the 'if' that got us
	859	* into this block tested for that), so we leave the bitmap empty.
	860	*/
	861	if (cl->flags & ANYOF_UNICODE_ALL) {
	862	ARG_SET(cl, ARG(and_with));
	863
	864	/* and_with's ARG may match things that don't require UTF8.
	865	* And now cl's will too, in spite of this being an 'and'. See
	866	* the comments below about the kludge */
	867	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	868	}
	869	}
	870	else {
	871	/* Here, both 'and_with' and cl match something outside the
	872	* bitmap. Currently we do not do the intersection, so just match
	873	* whatever cl had at the beginning. */
	874	}
	875
	876
	877	/* Take the intersection of the two sets of flags. However, the
	878	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	879	* kludge around the fact that this flag is not treated like the others
	880	* which are initialized in cl_anything(). The way the optimizer works
	881	* is that the synthetic start class (SSC) is initialized to match
	882	* anything, and then the first time a real node is encountered, its
	883	* values are AND'd with the SSC's with the result being the values of
	884	* the real node. However, there are paths through the optimizer where
	885	* the AND never gets called, so those initialized bits are set
	886	* inappropriately, which is not usually a big deal, as they just cause
	887	* false positives in the SSC, which will just mean a probably
	888	* imperceptible slow down in execution. However this bit has a
	889	* higher false positive consequence in that it can cause utf8.pm,
	890	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	891	* bigger slowdown and also causes significant extra memory to be used.
	892	* In order to prevent this, the code now takes a different tack. The
	893	* bit isn't set unless some part of the regular expression needs it,
	894	* but once set it won't get cleared. This means that these extra
	895	* modules won't get loaded unless there was some path through the
	896	* pattern that would have required them anyway, and so any false
	897	* positives that occur by not ANDing them out when they could be
	898	* aren't as severe as they would be if we treated this bit like all
	899	* the others */
	900	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	901	& ANYOF_NONBITMAP_NON_UTF8;
	902	cl->flags &= and_with->flags;
	903	cl->flags \|= outside_bitmap_but_not_utf8;
	904	}
	905	}
	906
	907	/* 'OR' a given class with another one. Can create false positives. 'cl'
	908	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	909	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	910	STATIC void
	911	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	912	{
	913	PERL_ARGS_ASSERT_CL_OR;
	914
	915	if (or_with->flags & ANYOF_INVERT) {
	916
	917	/* Here, the or'd node is to be inverted. This means we take the
	918	* complement of everything not in the bitmap, but currently we don't
	919	* know what that is, so give up and match anything */
	920	if (ANYOF_NONBITMAP(or_with)) {
	921	cl_anything(pRExC_state, cl);
	922	}
	923	/* We do not use
	924	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	925	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	926	* which is wasteful if CL2 is small, but we ignore CL2:
	927	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	928	* XXXX Can we handle case-fold? Unclear:
	929	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	930	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	931	*/
	932	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	933	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	934	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	935	int i;
	936
	937	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	938	cl->bitmap[i] \|= ~or_with->bitmap[i];
	939	} /* XXXX: logic is complicated otherwise */
	940	else {
	941	cl_anything(pRExC_state, cl);
	942	}
	943
	944	/* And, we can just take the union of the flags that aren't affected
	945	* by the inversion */
	946	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	947
	948	/* For the remaining flags:
	949	ANYOF_UNICODE_ALL and inverted means to not match anything above
	950	255, which means that the union with cl should just be
	951	what cl has in it, so can ignore this flag
	952	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	953	is 127-255 to match them, but then invert that, so the
	954	union with cl should just be what cl has in it, so can
	955	ignore this flag
	956	*/
	957	} else { /* 'or_with' is not inverted */
	958	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	959	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	960	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	961	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	962	int i;
	963
	964	/* OR char bitmap and class bitmap separately */
	965	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	966	cl->bitmap[i] \|= or_with->bitmap[i];
	967	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	968	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	969	cl->classflags[i] \|= or_with->classflags[i];
	970	cl->flags \|= ANYOF_CLASS;
	971	}
	972	}
	973	else { /* XXXX: logic is complicated, leave it along for a moment. */
	974	cl_anything(pRExC_state, cl);
	975	}
	976
	977	if (ANYOF_NONBITMAP(or_with)) {
	978
	979	/* Use the added node's outside-the-bit-map match if there isn't a
	980	* conflict. If there is a conflict (both nodes match something
	981	* outside the bitmap, but what they match outside is not the same
	982	* pointer, and hence not easily compared until XXX we extend
	983	* inversion lists this far), give up and allow the start class to
	984	* match everything outside the bitmap. If that stuff is all above
	985	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	986	if (! ANYOF_NONBITMAP(cl)) {
	987	ARG_SET(cl, ARG(or_with));
	988	}
	989	else if (ARG(cl) != ARG(or_with)) {
	990
	991	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	992	cl_anything(pRExC_state, cl);
	993	}
	994	else {
	995	cl->flags \|= ANYOF_UNICODE_ALL;
	996	}
	997	}
	998	}
	999
	1000	/* Take the union */
	1001	cl->flags \|= or_with->flags;
	1002	}
	1003	}
	1004
	/* Accessors for the per-state transition list, trie->states[s].trans.list,
	 * of the 'trie' variable in scope at the expansion site.  Element 0 of a
	 * list is bookkeeping: its 'forid' field is the CUR value and its
	 * 'newstate' field is the LEN value.  Real entries start at index 1. */
	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	/* Number of real entries in the list of state 'idx', or 0 when that state
	 * has no list.  The NULL test must look at the same state as the count
	 * does, so it uses the macro parameter rather than whatever variable named
	 * 'state' happens to be in scope. */
	#define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1009
	1010
	1011	#ifdef DEBUGGING
	1012	/*
	1013	dump_trie(trie,widecharmap,revcharmap)
	1014	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1015	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1016
	1017	These routines dump out a trie in a somewhat readable format.
	1018	The _interim_ variants are used for debugging the interim
	1019	tables that are used to generate the final compressed
	1020	representation which is what dump_trie expects.
	1021
	1022	Part of the reason for their existence is to provide a form
	1023	of documentation as to how the different representations function.
	1024
	1025	*/
	1026
	1027	/*
	1028	Dumps the final compressed table form of the trie to Perl_debug_log.
	1029	Used for debugging make_trie().
	1030	*/
	1031
	1032	STATIC void
	1033	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1034	AV *revcharmap, U32 depth)
	1035	{
	1036	U32 state;
	1037	SV *sv=sv_newmortal();
	1038	int colwidth= widecharmap ? 6 : 4;
	1039	U16 word;
	1040	GET_RE_DEBUG_FLAGS_DECL;
	1041
	1042	PERL_ARGS_ASSERT_DUMP_TRIE;
	1043
	1044	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1045	(int)depth * 2 + 2,"",
	1046	"Match","Base","Ofs" );
	1047
	1048	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1049	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1050	if ( tmp ) {
	1051	PerlIO_printf( Perl_debug_log, "%*s",
	1052	colwidth,
	1053	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1054	PL_colors[0], PL_colors[1],
	1055	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1056	PERL_PV_ESCAPE_FIRSTCHAR
	1057	)
	1058	);
	1059	}
	1060	}
	1061	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1062	(int)depth * 2 + 2,"");
	1063
	1064	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1065	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1066	PerlIO_printf( Perl_debug_log, "\n");
	1067
	1068	for( state = 1 ; state < trie->statecount ; state++ ) {
	1069	const U32 base = trie->states[ state ].trans.base;
	1070
	1071	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1072
	1073	if ( trie->states[ state ].wordnum ) {
	1074	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1075	} else {
	1076	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1077	}
	1078
	1079	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1080
	1081	if ( base ) {
	1082	U32 ofs = 0;
	1083
	1084	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1085	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1086	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1087	ofs++;
	1088
	1089	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1090
	1091	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1092	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1093	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1094	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1095	{
	1096	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1097	colwidth,
	1098	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1099	} else {
	1100	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1101	}
	1102	}
	1103
	1104	PerlIO_printf( Perl_debug_log, "]");
	1105
	1106	}
	1107	PerlIO_printf( Perl_debug_log, "\n" );
	1108	}
	1109	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1110	for (word=1; word <= trie->wordcount; word++) {
	1111	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1112	(int)word, (int)(trie->wordinfo[word].prev),
	1113	(int)(trie->wordinfo[word].len));
	1114	}
	1115	PerlIO_printf(Perl_debug_log, "\n" );
	1116	}
	1117	/*
	1118	Dumps a fully constructed but uncompressed trie in list form.
	1119	List tries normally only are used for construction when the number of
	1120	possible chars (trie->uniquecharcount) is very high.
	1121	Used for debugging make_trie().
	1122	*/
	1123	STATIC void
	1124	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1125	HV widecharmap, AV revcharmap, U32 next_alloc,
	1126	U32 depth)
	1127	{
	1128	U32 state;
	1129	SV *sv=sv_newmortal();
	1130	int colwidth= widecharmap ? 6 : 4;
	1131	GET_RE_DEBUG_FLAGS_DECL;
	1132
	1133	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1134
	1135	/* print out the table precompression. */
	1136	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1137	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1138	"------:-----+-----------------\n" );
	1139
	1140	for( state=1 ; state < next_alloc ; state ++ ) {
	1141	U16 charid;
	1142
	1143	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1144	(int)depth * 2 + 2,"", (UV)state );
	1145	if ( ! trie->states[ state ].wordnum ) {
	1146	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1147	} else {
	1148	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1149	trie->states[ state ].wordnum
	1150	);
	1151	}
	1152	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1153	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1154	if ( tmp ) {
	1155	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1156	colwidth,
	1157	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1158	PL_colors[0], PL_colors[1],
	1159	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1160	PERL_PV_ESCAPE_FIRSTCHAR
	1161	) ,
	1162	TRIE_LIST_ITEM(state,charid).forid,
	1163	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1164	);
	1165	if (!(charid % 10))
	1166	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1167	(int)((depth * 2) + 14), "");
	1168	}
	1169	}
	1170	PerlIO_printf( Perl_debug_log, "\n");
	1171	}
	1172	}
	1173
	1174	/*
	1175	Dumps a fully constructed but uncompressed trie in table form.
	1176	This is the normal DFA style state transition table, with a few
	1177	twists to facilitate compression later.
	1178	Used for debugging make_trie().
	1179	*/
	1180	STATIC void
	1181	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1182	HV widecharmap, AV revcharmap, U32 next_alloc,
	1183	U32 depth)
	1184	{
	1185	U32 state;
	1186	U16 charid;
	1187	SV *sv=sv_newmortal();
	1188	int colwidth= widecharmap ? 6 : 4;
	1189	GET_RE_DEBUG_FLAGS_DECL;
	1190
	1191	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1192
	1193	/*
	1194	print out the table precompression so that we can do a visual check
	1195	that they are identical.
	1196	*/
	1197
	1198	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1199
	1200	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1201	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1202	if ( tmp ) {
	1203	PerlIO_printf( Perl_debug_log, "%*s",
	1204	colwidth,
	1205	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1206	PL_colors[0], PL_colors[1],
	1207	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1208	PERL_PV_ESCAPE_FIRSTCHAR
	1209	)
	1210	);
	1211	}
	1212	}
	1213
	1214	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1215
	1216	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1217	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1218	}
	1219
	1220	PerlIO_printf( Perl_debug_log, "\n" );
	1221
	1222	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1223
	1224	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1225	(int)depth * 2 + 2,"",
	1226	(UV)TRIE_NODENUM( state ) );
	1227
	1228	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1229	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1230	if (v)
	1231	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1232	else
	1233	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1234	}
	1235	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1236	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1237	} else {
	1238	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1239	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1240	}
	1241	}
	1242	}
	1243
	1244	#endif
	1245
	1246
	1247	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1248	startbranch: the first branch in the whole branch sequence
	1249	first : start branch of sequence of branch-exact nodes.
	1250	May be the same as startbranch
	1251	last : Thing following the last branch.
	1252	May be the same as tail.
	1253	tail : item following the branch sequence
	1254	count : words in the sequence
	1255	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1256	depth : indent depth
	1257
	1258	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1259
	1260	A trie is an N'ary tree where the branches are determined by digital
	1261	decomposition of the key. IE, at the root node you look up the 1st character and
	1262	follow that branch repeat until you find the end of the branches. Nodes can be
	1263	marked as "accepting" meaning they represent a complete word. Eg:
	1264
	1265	/he\|she\|his\|hers/
	1266
	1267	would convert into the following structure. Numbers represent states, letters
	1268	following numbers represent valid transitions on the letter from that state, if
	1269	the number is in square brackets it represents an accepting state, otherwise it
	1270	will be in parenthesis.
	1271
	1272	      +-h->+-e->[3]-+-r->(8)-+-s->[9]
	1273	      |    |
	1274	      |   (2)
	1275	      |    |
	1276	     (1)   +-i->(6)-+-s->[7]
	1277	      |
	1278	      +-s->(3)-+-h->(4)-+-e->[5]
	1279
	1280	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1281
	1282	This shows that when matching against the string 'hers' we will begin at state 1
	1283	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1284	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1285	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1286	single traverse. We store a mapping from accepting to state to which word was
	1287	matched, and then when we have multiple possibilities we try to complete the
	1288	rest of the regex in the order in which they occurred in the alternation.
	1289
	1290	The only prior NFA like behaviour that would be changed by the TRIE support is
	1291	the silent ignoring of duplicate alternations which are of the form:
	1292
	1293	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1294
	1295	Thus EVAL blocks following a trie may be called a different number of times with
	1296	and without the optimisation. With the optimisations dupes will be silently
	1297	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1298	the following demonstrates:
	1299
	1300	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1301
	1302	which prints out 'word' three times, but
	1303
	1304	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1305
	1306	which doesn't print it out at all. This is due to other optimisations kicking in.
	1307
	1308	Example of what happens on a structural level:
	1309
	1310	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1311
	1312	1: CURLYM[1] {1,32767}(18)
	1313	5: BRANCH(8)
	1314	6: EXACT <ac>(16)
	1315	8: BRANCH(11)
	1316	9: EXACT <ad>(16)
	1317	11: BRANCH(14)
	1318	12: EXACT <ab>(16)
	1319	16: SUCCEED(0)
	1320	17: NOTHING(18)
	1321	18: END(0)
	1322
	1323	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1324	and should turn into:
	1325
	1326	1: CURLYM[1] {1,32767}(18)
	1327	5: TRIE(16)
	1328	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1329	<ac>
	1330	<ad>
	1331	<ab>
	1332	16: SUCCEED(0)
	1333	17: NOTHING(18)
	1334	18: END(0)
	1335
	1336	Cases where tail != last would be like /(?:foo|bar)baz/:
	1337
	1338	1: BRANCH(4)
	1339	2: EXACT <foo>(8)
	1340	4: BRANCH(7)
	1341	5: EXACT <bar>(8)
	1342	7: TAIL(8)
	1343	8: EXACT <baz>(10)
	1344	10: END(0)
	1345
	1346	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1347	and would end up looking like:
	1348
	1349	1: TRIE(8)
	1350	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1351	<foo>
	1352	<bar>
	1353	7: TAIL(8)
	1354	8: EXACT <baz>(10)
	1355	10: END(0)
	1356
	1357	d = uvuni_to_utf8_flags(d, uv, 0);
	1358
	1359	is the recommended Unicode-aware way of saying
	1360
	1361	*(d++) = uv;
	1362	*/
	1363
	/* Append to 'revcharmap' a new SV holding the character currently in 'uvc'
	 * (both names are taken from the expansion site).  Under UTF the SV holds
	 * the bytes written by uvuni_to_utf8() and is flagged as UTF-8; otherwise
	 * it holds the single byte (char)uvc.
	 * NOTE(review): the UTF branch encodes only 'uvc & 0xFF' into an SV created
	 * with newSV(2), so a code point above 0xFF is not stored in full - confirm
	 * whether that is intended before relying on revcharmap for wide chars. */
	#define TRIE_STORE_REVCHAR                                                 \
	    STMT_START {                                                           \
	        if (UTF) {                                                         \
	            SV *zlopp = newSV(2);                                          \
	            /* write straight into the SV's string buffer */               \
	            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
	            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
	            /* 'kapow' is the end of what was written, so the */           \
	            /* difference is the string length */                          \
	            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
	            SvPOK_on(zlopp);                                               \
	            SvUTF8_on(zlopp);                                              \
	            av_push(revcharmap, zlopp);                                    \
	        } else {                                                           \
	            char ooooff = (char)uvc;                                       \
	            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
	        }                                                                  \
	    } STMT_END
	1379
	/* Fetch the next character of the word being scanned into 'uvc', counting
	 * it in 'wordlen'.  All variables come from the expansion site.  'len' is
	 * left holding the value the caller's loop adds to 'uc':
	 *   - not UTF:            the byte at 'uc' is the character, len = 1
	 *   - UTF, no folding:    decode one character from 'uc'
	 *   - UTF, folding, with part of an earlier fold still pending
	 *     (foldlen > 0):      decode from 'scan' and set len = 0
	 *   - UTF, folding, nothing pending: decode from 'uc', fold the result
	 *     into 'foldbuf', and point 'scan' past the first folded character */
	#define TRIE_READ_CHAR STMT_START {                                           \
	    wordlen++;                                                                \
	    if ( !UTF ) {                                                             \
	        uvc = (U32)*uc;                                                       \
	        len = 1;                                                              \
	    }                                                                         \
	    else if ( !folder ) {                                                     \
	        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
	    }                                                                         \
	    else if ( foldlen > 0 ) {                                                 \
	        uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );            \
	        foldlen -= len;                                                       \
	        scan += len;                                                          \
	        len = 0;                                                              \
	    }                                                                         \
	    else {                                                                    \
	        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
	        uvc = to_uni_fold( uvc, foldbuf, &foldlen );                          \
	        foldlen -= UNISKIP( uvc );                                            \
	        scan = foldbuf + UNISKIP( uvc );                                      \
	    }                                                                         \
	} STMT_END
	1403
	1404
	1405
	/* Append the transition (fid -> ns) to the list of 'state'.  When the list
	 * has no free slot left (CUR has reached LEN), LEN is doubled and the list
	 * is reallocated to the new length first.  The new entry is written at
	 * index CUR, and CUR is then incremented. */
	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                       \
	    if ( ! ( TRIE_LIST_CUR( state ) < TRIE_LIST_LEN( state ) ) ) {      \
	        U32 grown_len = ( TRIE_LIST_LEN( state ) *= 2 );                \
	        Renew( trie->states[ state ].trans.list, grown_len,             \
	               reg_trie_trans_le );                                     \
	    }                                                                   \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;        \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;      \
	    TRIE_LIST_CUR( state )++;                                           \
	} STMT_END
	1415
	/* Give 'state' a fresh, zero-filled transition list with room for four
	 * elements.  Element 0 is the bookkeeping slot, so CUR (the next index to
	 * write) starts at 1 and LEN records the allocated size of 4. */
	#define TRIE_LIST_NEW(state) STMT_START {                               \
	    Newxz( trie->states[ state ].trans.list, 4, reg_trie_trans_le );    \
	    TRIE_LIST_LEN( state ) = 4;                                         \
	    TRIE_LIST_CUR( state ) = 1;                                         \
	} STMT_END
	1422
	/* Register the word that has just been scanned as ending in 'state'.
	 * Uses noper, cur, tail, convert, curword, wordlen, word_count, jumper,
	 * nextbranch, trie and (when debugging) trie_words from the expansion site.
	 *  - bumps 'curword' and fills in trie->wordinfo[curword]
	 *  - when the node after 'noper' lies before 'tail', records a jump offset
	 *    for the word (allocating trie->jump on first use) and remembers the
	 *    first such node in 'jumper' and the first following branch in
	 *    'nextbranch'
	 *  - when 'state' already accepts a word, the new word is chained in via
	 *    wordinfo[].prev; otherwise it becomes the state's wordnum */
	#define TRIE_HANDLE_WORD(state) STMT_START {                                  \
	    const U16 earlier_word = trie->states[ state ].wordnum;                   \
	    regnode * const after_noper = regnext( noper );                           \
	                                                                              \
	    DEBUG_r({                                                                 \
	        /* keep the text of the word around for the dump routines */          \
	        SV * const word_sv = (OP(noper) != NOTHING)                           \
	            ? newSVpvn_utf8( STRING(noper), STR_LEN(noper), UTF )             \
	            : newSVpvn_utf8( "", 0, UTF );                                    \
	        av_push( trie_words, word_sv );                                       \
	    });                                                                       \
	                                                                              \
	    curword++;                                                                \
	    trie->wordinfo[curword].accept = state;                                   \
	    trie->wordinfo[curword].len    = wordlen;                                 \
	    trie->wordinfo[curword].prev   = 0;                                       \
	                                                                              \
	    if ( after_noper < tail ) {                                               \
	        if ( trie->jump == NULL ) {                                           \
	            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	        }                                                                     \
	        trie->jump[curword] = (U16)(after_noper - convert);                   \
	        if ( jumper == NULL ) {                                               \
	            jumper = after_noper;                                             \
	        }                                                                     \
	        if ( nextbranch == NULL ) {                                           \
	            nextbranch = regnext(cur);                                        \
	        }                                                                     \
	    }                                                                         \
	                                                                              \
	    if ( !earlier_word ) {                                                    \
	        /* first word to end in this state */                                 \
	        trie->states[ state ].wordnum = curword;                              \
	    } else {                                                                  \
	        /* A duplicate: splice it into the wordinfo[].prev chain now, */      \
	        /* so that when the pieces of the chain are linked together   */      \
	        /* later the duplicates are part of it                        */      \
	        trie->wordinfo[curword].prev = trie->wordinfo[earlier_word].prev;     \
	        trie->wordinfo[earlier_word].prev = curword;                          \
	    }                                                                         \
	} STMT_END
	1463
	1464
	/* Look up the transition from 'state' on character id 'charid' in the
	 * compressed table trie->trans[], where 'base' is the state's base offset
	 * and 'ucharcount' the number of unique characters.  'trie' and 'ubound'
	 * are taken from the expansion site.
	 * Yields the slot's 'next' value when the slot index is in range, the slot
	 * is owned by 'state' (its 'check' equals 'state') and 'next' is non-zero.
	 * Otherwise yields 'special' for state 1 and 0 for any other state.
	 * Every argument is parenthesized so that expression arguments keep their
	 * meaning inside 'base - ucharcount + charid'; arguments are evaluated
	 * more than once, so they must be free of side effects. */
	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)              \
	     ( ( (base) + (charid) >= (ucharcount)                                  \
	         && (base) + (charid) < ubound                                      \
	         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
	         && trie->trans[ (base) - (ucharcount) + (charid) ].next )          \
	           ? trie->trans[ (base) - (ucharcount) + (charid) ].next           \
	           : ( (state) == 1 ? (special) : 0 )                               \
	      )
	1473
	1474	#define MADE_TRIE 1 /* the three MADE_* values are distinct single bits (1, 2, 4) */
	1475	#define MADE_JUMP_TRIE 2
	1476	#define MADE_EXACT_TRIE 4 /* NOTE(review): presumably result flags of make_trie() - confirm at its return sites */
	1477
	1478	STATIC I32
	1479	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1480	{
	1481	dVAR;
	1482	/* first pass, loop through and scan words */
	1483	reg_trie_data *trie;
	1484	HV *widecharmap = NULL;
	1485	AV *revcharmap = newAV();
	1486	regnode *cur;
	1487	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1488	STRLEN len = 0;
	1489	UV uvc = 0;
	1490	U16 curword = 0;
	1491	U32 next_alloc = 0;
	1492	regnode *jumper = NULL;
	1493	regnode *nextbranch = NULL;
	1494	regnode *convert = NULL;
	1495	U32 prev_states; / temp array mapping each state to previous one */
	1496	/* we just use folder as a flag in utf8 */
	1497	const U8 * folder = NULL;
	1498
	1499	#ifdef DEBUGGING
	1500	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1501	AV *trie_words = NULL;
	1502	/* along with revcharmap, this only used during construction but both are
	1503	* useful during debugging so we store them in the struct when debugging.
	1504	*/
	1505	#else
	1506	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1507	STRLEN trie_charcount=0;
	1508	#endif
	1509	SV *re_trie_maxbuff;
	1510	GET_RE_DEBUG_FLAGS_DECL;
	1511
	1512	PERL_ARGS_ASSERT_MAKE_TRIE;
	1513	#ifndef DEBUGGING
	1514	PERL_UNUSED_ARG(depth);
	1515	#endif
	1516
	1517	switch (flags) {
	1518	case EXACTFA:
	1519	case EXACTFU: folder = PL_fold_latin1; break;
	1520	case EXACTF: folder = PL_fold; break;
	1521	case EXACTFL: folder = PL_fold_locale; break;
	1522	}
	1523
	1524	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1525	trie->refcount = 1;
	1526	trie->startstate = 1;
	1527	trie->wordcount = word_count;
	1528	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1529	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1530	if (!(UTF && folder))
	1531	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1532	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1533	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1534
	1535	DEBUG_r({
	1536	trie_words = newAV();
	1537	});
	1538
	1539	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1540	if (!SvIOK(re_trie_maxbuff)) {
	1541	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1542	}
	1543	DEBUG_OPTIMISE_r({
	1544	PerlIO_printf( Perl_debug_log,
	1545	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1546	(int)depth * 2 + 2, "",
	1547	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1548	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1549	(int)depth);
	1550	});
	1551
	1552	/* Find the node we are going to overwrite */
	1553	if ( first == startbranch && OP( last ) != BRANCH ) {
	1554	/* whole branch chain */
	1555	convert = first;
	1556	} else {
	1557	/* branch sub-chain */
	1558	convert = NEXTOPER( first );
	1559	}
	1560
	1561	/* -- First loop and Setup --
	1562
	1563	We first traverse the branches and scan each word to determine if it
	1564	contains widechars, and how many unique chars there are, this is
	1565	important as we have to build a table with at least as many columns as we
	1566	have unique chars.
	1567
	1568	We use an array of integers to represent the character codes 0..255
	1569	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1570	native representation of the character value as the key and IV's for the
	1571	coded index.
	1572
	1573	TODO If we keep track of how many times each character is used we can
	1574	remap the columns so that the table compression later on is more
	1575	efficient in terms of memory by ensuring the most common value is in the
	1576	middle and the least common are on the outside. IMO this would be better
	1577	than a most to least common mapping as theres a decent chance the most
	1578	common letter will share a node with the least common, meaning the node
	1579	will not be compressible. With a middle is most common approach the worst
	1580	case is when we have the least common nodes twice.
	1581
	1582	*/
	1583
	1584	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1585	regnode * const noper = NEXTOPER( cur );
	1586	const U8 uc = (U8)STRING( noper );
	1587	const U8 * const e = uc + STR_LEN( noper );
	1588	STRLEN foldlen = 0;
	1589	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1590	const U8 scan = (U8)NULL;
	1591	U32 wordlen = 0; /* required init */
	1592	STRLEN chars = 0;
	1593	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1594
	1595	if (OP(noper) == NOTHING) {
	1596	trie->minlen= 0;
	1597	continue;
	1598	}
	1599	if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
	1600	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1601	regardless of encoding */
	1602
	1603	for ( ; uc < e ; uc += len ) {
	1604	TRIE_CHARCOUNT(trie)++;
	1605	TRIE_READ_CHAR;
	1606	chars++;
	1607	if ( uvc < 256 ) {
	1608	if ( !trie->charmap[ uvc ] ) {
	1609	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1610	if ( folder )
	1611	trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
	1612	TRIE_STORE_REVCHAR;
	1613	}
	1614	if ( set_bit ) {
	1615	/* store the codepoint in the bitmap, and its folded
	1616	* equivalent. */
	1617	TRIE_BITMAP_SET(trie,uvc);
	1618
	1619	/* store the folded codepoint */
	1620	if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
	1621
	1622	if ( !UTF ) {
	1623	/* store first byte of utf8 representation of
	1624	variant codepoints */
	1625	if (! UNI_IS_INVARIANT(uvc)) {
	1626	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1627	}
	1628	}
	1629	set_bit = 0; /* We've done our bit :-) */
	1630	}
	1631	} else {
	1632	SV** svpp;
	1633	if ( !widecharmap )
	1634	widecharmap = newHV();
	1635
	1636	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1637
	1638	if ( !svpp )
	1639	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1640
	1641	if ( !SvTRUE( *svpp ) ) {
	1642	sv_setiv( *svpp, ++trie->uniquecharcount );
	1643	TRIE_STORE_REVCHAR;
	1644	}
	1645	}
	1646	}
	1647	if( cur == first ) {
	1648	trie->minlen=chars;
	1649	trie->maxlen=chars;
	1650	} else if (chars < trie->minlen) {
	1651	trie->minlen=chars;
	1652	} else if (chars > trie->maxlen) {
	1653	trie->maxlen=chars;
	1654	}
	1655
	1656	} /* end first pass */
	1657	DEBUG_TRIE_COMPILE_r(
	1658	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1659	(int)depth * 2 + 2,"",
	1660	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1661	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1662	(int)trie->minlen, (int)trie->maxlen )
	1663	);
	1664
	1665	/*
	1666	We now know what we are dealing with in terms of unique chars and
	1667	string sizes so we can calculate how much memory a naive
	1668	representation using a flat table will take. If it's over a reasonable
	1669	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1670	conservative but potentially much slower representation using an array
	1671	of lists.
	1672
	1673	At the end we convert both representations into the same compressed
	1674	form that will be used in regexec.c for matching with. The latter
	1675	is a form that cannot be used to construct with but has memory
	1676	properties similar to the list form and access properties similar
	1677	to the table form making it both suitable for fast searches and
	1678	small enough that its feasable to store for the duration of a program.
	1679
	1680	See the comment in the code where the compressed table is produced
	1681	inplace from the flat tabe representation for an explanation of how
	1682	the compression works.
	1683
	1684	*/
	1685
	1686
	1687	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1688	prev_states[1] = 0;
	1689
	1690	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1691	/*
	1692	Second Pass -- Array Of Lists Representation
	1693
	1694	Each state will be represented by a list of charid:state records
	1695	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1696	points of the allocated array. (See defines above).
	1697
	1698	We build the initial structure using the lists, and then convert
	1699	it into the compressed table form which allows faster lookups
	1700	(but cant be modified once converted).
	1701	*/
	1702
	1703	STRLEN transcount = 1;
	1704
	1705	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1706	"%*sCompiling trie using list compiler\n",
	1707	(int)depth * 2 + 2, ""));
	1708
	1709	trie->states = (reg_trie_state *)
	1710	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1711	sizeof(reg_trie_state) );
	1712	TRIE_LIST_NEW(1);
	1713	next_alloc = 2;
	1714
	1715	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1716
	1717	regnode * const noper = NEXTOPER( cur );
	1718	U8 uc = (U8)STRING( noper );
	1719	const U8 * const e = uc + STR_LEN( noper );
	1720	U32 state = 1; /* required init */
	1721	U16 charid = 0; /* sanity init */
	1722	U8 scan = (U8)NULL; /* sanity init */
	1723	STRLEN foldlen = 0; /* required init */
	1724	U32 wordlen = 0; /* required init */
	1725	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1726
	1727	if (OP(noper) != NOTHING) {
	1728	for ( ; uc < e ; uc += len ) {
	1729
	1730	TRIE_READ_CHAR;
	1731
	1732	if ( uvc < 256 ) {
	1733	charid = trie->charmap[ uvc ];
	1734	} else {
	1735	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1736	if ( !svpp ) {
	1737	charid = 0;
	1738	} else {
	1739	charid=(U16)SvIV( *svpp );
	1740	}
	1741	}
	1742	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1743	if ( charid ) {
	1744
	1745	U16 check;
	1746	U32 newstate = 0;
	1747
	1748	charid--;
	1749	if ( !trie->states[ state ].trans.list ) {
	1750	TRIE_LIST_NEW( state );
	1751	}
	1752	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1753	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1754	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1755	break;
	1756	}
	1757	}
	1758	if ( ! newstate ) {
	1759	newstate = next_alloc++;
	1760	prev_states[newstate] = state;
	1761	TRIE_LIST_PUSH( state, charid, newstate );
	1762	transcount++;
	1763	}
	1764	state = newstate;
	1765	} else {
	1766	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1767	}
	1768	}
	1769	}
	1770	TRIE_HANDLE_WORD(state);
	1771
	1772	} /* end second pass */
	1773
	1774	/* next alloc is the NEXT state to be allocated */
	1775	trie->statecount = next_alloc;
	1776	trie->states = (reg_trie_state *)
	1777	PerlMemShared_realloc( trie->states,
	1778	next_alloc
	1779	* sizeof(reg_trie_state) );
	1780
	1781	/* and now dump it out before we compress it */
	1782	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1783	revcharmap, next_alloc,
	1784	depth+1)
	1785	);
	1786
	1787	trie->trans = (reg_trie_trans *)
	1788	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1789	{
	1790	U32 state;
	1791	U32 tp = 0;
	1792	U32 zp = 0;
	1793
	1794
	1795	for( state=1 ; state < next_alloc ; state ++ ) {
	1796	U32 base=0;
	1797
	1798	/*
	1799	DEBUG_TRIE_COMPILE_MORE_r(
	1800	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1801	);
	1802	*/
	1803
	1804	if (trie->states[state].trans.list) {
	1805	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1806	U16 maxid=minid;
	1807	U16 idx;
	1808
	1809	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1810	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1811	if ( forid < minid ) {
	1812	minid=forid;
	1813	} else if ( forid > maxid ) {
	1814	maxid=forid;
	1815	}
	1816	}
	1817	if ( transcount < tp + maxid - minid + 1) {
	1818	transcount *= 2;
	1819	trie->trans = (reg_trie_trans *)
	1820	PerlMemShared_realloc( trie->trans,
	1821	transcount
	1822	* sizeof(reg_trie_trans) );
	1823	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1824	}
	1825	base = trie->uniquecharcount + tp - minid;
	1826	if ( maxid == minid ) {
	1827	U32 set = 0;
	1828	for ( ; zp < tp ; zp++ ) {
	1829	if ( ! trie->trans[ zp ].next ) {
	1830	base = trie->uniquecharcount + zp - minid;
	1831	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1832	trie->trans[ zp ].check = state;
	1833	set = 1;
	1834	break;
	1835	}
	1836	}
	1837	if ( !set ) {
	1838	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1839	trie->trans[ tp ].check = state;
	1840	tp++;
	1841	zp = tp;
	1842	}
	1843	} else {
	1844	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1845	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1846	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1847	trie->trans[ tid ].check = state;
	1848	}
	1849	tp += ( maxid - minid + 1 );
	1850	}
	1851	Safefree(trie->states[ state ].trans.list);
	1852	}
	1853	/*
	1854	DEBUG_TRIE_COMPILE_MORE_r(
	1855	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1856	);
	1857	*/
	1858	trie->states[ state ].trans.base=base;
	1859	}
	1860	trie->lasttrans = tp + 1;
	1861	}
	1862	} else {
	1863	/*
	1864	Second Pass -- Flat Table Representation.
	1865
	1866	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1867	We know that we will need Charcount+1 trans at most to store the data
	1868	(one row per char at worst case) So we preallocate both structures
	1869	assuming worst case.
	1870
	1871	We then construct the trie using only the .next slots of the entry
	1872	structs.
	1873
	1874	We use the .check field of the first entry of the node temporarily to
	1875	make compression both faster and easier by keeping track of how many non
	1876	zero fields are in the node.
	1877
	1878	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1879	transition.
	1880
	1881	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1882	number representing the first entry of the node, and state as a
	1883	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1884	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1885	are 2 entrys per node. eg:
	1886
	1887	A B A B
	1888	1. 2 4 1. 3 7
	1889	2. 0 3 3. 0 5
	1890	3. 0 0 5. 0 0
	1891	4. 0 0 7. 0 0
	1892
	1893	The table is internally in the right hand, idx form. However as we also
	1894	have to deal with the states array which is indexed by nodenum we have to
	1895	use TRIE_NODENUM() to convert.
	1896
	1897	*/
	1898	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1899	"%*sCompiling trie using table compiler\n",
	1900	(int)depth * 2 + 2, ""));
	1901
	1902	trie->trans = (reg_trie_trans *)
	1903	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1904	* trie->uniquecharcount + 1,
	1905	sizeof(reg_trie_trans) );
	1906	trie->states = (reg_trie_state *)
	1907	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1908	sizeof(reg_trie_state) );
	1909	next_alloc = trie->uniquecharcount + 1;
	1910
	1911
	1912	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1913
	1914	regnode * const noper = NEXTOPER( cur );
	1915	const U8 uc = (U8)STRING( noper );
	1916	const U8 * const e = uc + STR_LEN( noper );
	1917
	1918	U32 state = 1; /* required init */
	1919
	1920	U16 charid = 0; /* sanity init */
	1921	U32 accept_state = 0; /* sanity init */
	1922	U8 scan = (U8)NULL; /* sanity init */
	1923
	1924	STRLEN foldlen = 0; /* required init */
	1925	U32 wordlen = 0; /* required init */
	1926	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1927
	1928	if ( OP(noper) != NOTHING ) {
	1929	for ( ; uc < e ; uc += len ) {
	1930
	1931	TRIE_READ_CHAR;
	1932
	1933	if ( uvc < 256 ) {
	1934	charid = trie->charmap[ uvc ];
	1935	} else {
	1936	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1937	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1938	}
	1939	if ( charid ) {
	1940	charid--;
	1941	if ( !trie->trans[ state + charid ].next ) {
	1942	trie->trans[ state + charid ].next = next_alloc;
	1943	trie->trans[ state ].check++;
	1944	prev_states[TRIE_NODENUM(next_alloc)]
	1945	= TRIE_NODENUM(state);
	1946	next_alloc += trie->uniquecharcount;
	1947	}
	1948	state = trie->trans[ state + charid ].next;
	1949	} else {
	1950	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1951	}
	1952	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1953	}
	1954	}
	1955	accept_state = TRIE_NODENUM( state );
	1956	TRIE_HANDLE_WORD(accept_state);
	1957
	1958	} /* end second pass */
	1959
	1960	/* and now dump it out before we compress it */
	1961	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1962	revcharmap,
	1963	next_alloc, depth+1));
	1964
	1965	{
	1966	/*
	1967	* Inplace compress the table.*
	1968
	1969	For sparse data sets the table constructed by the trie algorithm will
	1970	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1971	(Note that leaf nodes will not contain any transitions.)
	1972
	1973	This algorithm compresses the tables by eliminating most such
	1974	transitions, at the cost of a modest bit of extra work during lookup:
	1975
	1976	- Each states[] entry contains a .base field which indicates the
	1977	index in the state[] array wheres its transition data is stored.
	1978
	1979	- If .base is 0 there are no valid transitions from that node.
	1980
	1981	- If .base is nonzero then charid is added to it to find an entry in
	1982	the trans array.
	1983
	1984	-If trans[states[state].base+charid].check!=state then the
	1985	transition is taken to be a 0/Fail transition. Thus if there are fail
	1986	transitions at the front of the node then the .base offset will point
	1987	somewhere inside the previous nodes data (or maybe even into a node
	1988	even earlier), but the .check field determines if the transition is
	1989	valid.
	1990
	1991	XXX - wrong maybe?
	1992	The following process inplace converts the table to the compressed
	1993	table: We first do not compress the root node 1,and mark all its
	1994	.check pointers as 1 and set its .base pointer as 1 as well. This
	1995	allows us to do a DFA construction from the compressed table later,
	1996	and ensures that any .base pointers we calculate later are greater
	1997	than 0.
	1998
	1999	- We set 'pos' to indicate the first entry of the second node.
	2000
	2001	- We then iterate over the columns of the node, finding the first and
	2002	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2003	and set the .check pointers accordingly, and advance pos
	2004	appropriately and repeat for the next node. Note that when we copy
	2005	the next pointers we have to convert them from the original
	2006	NODEIDX form to NODENUM form as the former is not valid post
	2007	compression.
	2008
	2009	- If a node has no transitions used we mark its base as 0 and do not
	2010	advance the pos pointer.
	2011
	2012	- If a node only has one transition we use a second pointer into the
	2013	structure to fill in allocated fail transitions from other states.
	2014	This pointer is independent of the main pointer and scans forward
	2015	looking for null transitions that are allocated to a state. When it
	2016	finds one it writes the single transition into the "hole". If the
	2017	pointer doesnt find one the single transition is appended as normal.
	2018
	2019	- Once compressed we can Renew/realloc the structures to release the
	2020	excess space.
	2021
	2022	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2023	specifically Fig 3.47 and the associated pseudocode.
	2024
	2025	demq
	2026	*/
	2027	const U32 laststate = TRIE_NODENUM( next_alloc );
	2028	U32 state, charid;
	2029	U32 pos = 0, zp=0;
	2030	trie->statecount = laststate;
	2031
	2032	for ( state = 1 ; state < laststate ; state++ ) {
	2033	U8 flag = 0;
	2034	const U32 stateidx = TRIE_NODEIDX( state );
	2035	const U32 o_used = trie->trans[ stateidx ].check;
	2036	U32 used = trie->trans[ stateidx ].check;
	2037	trie->trans[ stateidx ].check = 0;
	2038
	2039	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2040	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2041	if ( trie->trans[ stateidx + charid ].next ) {
	2042	if (o_used == 1) {
	2043	for ( ; zp < pos ; zp++ ) {
	2044	if ( ! trie->trans[ zp ].next ) {
	2045	break;
	2046	}
	2047	}
	2048	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2049	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2050	trie->trans[ zp ].check = state;
	2051	if ( ++zp > pos ) pos = zp;
	2052	break;
	2053	}
	2054	used--;
	2055	}
	2056	if ( !flag ) {
	2057	flag = 1;
	2058	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2059	}
	2060	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2061	trie->trans[ pos ].check = state;
	2062	pos++;
	2063	}
	2064	}
	2065	}
	2066	trie->lasttrans = pos + 1;
	2067	trie->states = (reg_trie_state *)
	2068	PerlMemShared_realloc( trie->states, laststate
	2069	* sizeof(reg_trie_state) );
	2070	DEBUG_TRIE_COMPILE_MORE_r(
	2071	PerlIO_printf( Perl_debug_log,
	2072	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2073	(int)depth * 2 + 2,"",
	2074	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2075	(IV)next_alloc,
	2076	(IV)pos,
	2077	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2078	);
	2079
	2080	} /* end table compress */
	2081	}
	2082	DEBUG_TRIE_COMPILE_MORE_r(
	2083	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2084	(int)depth * 2 + 2, "",
	2085	(UV)trie->statecount,
	2086	(UV)trie->lasttrans)
	2087	);
	2088	/* resize the trans array to remove unused space */
	2089	trie->trans = (reg_trie_trans *)
	2090	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2091	* sizeof(reg_trie_trans) );
	2092
	2093	{ /* Modify the program and insert the new TRIE node */
	2094	U8 nodetype =(U8)(flags & 0xFF);
	2095	char *str=NULL;
	2096
	2097	#ifdef DEBUGGING
	2098	regnode *optimize = NULL;
	2099	#ifdef RE_TRACK_PATTERN_OFFSETS
	2100
	2101	U32 mjd_offset = 0;
	2102	U32 mjd_nodelen = 0;
	2103	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2104	#endif /* DEBUGGING */
	2105	/*
	2106	This means we convert either the first branch or the first Exact,
	2107	depending on whether the thing following (in 'last') is a branch
	2108	or not and whether first is the startbranch (ie is it a sub part of
	2109	the alternation or is it the whole thing.)
	2110	Assuming its a sub part we convert the EXACT otherwise we convert
	2111	the whole branch sequence, including the first.
	2112	*/
	2113	/* Find the node we are going to overwrite */
	2114	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2115	/* branch sub-chain */
	2116	NEXT_OFF( first ) = (U16)(last - first);
	2117	#ifdef RE_TRACK_PATTERN_OFFSETS
	2118	DEBUG_r({
	2119	mjd_offset= Node_Offset((convert));
	2120	mjd_nodelen= Node_Length((convert));
	2121	});
	2122	#endif
	2123	/* whole branch chain */
	2124	}
	2125	#ifdef RE_TRACK_PATTERN_OFFSETS
	2126	else {
	2127	DEBUG_r({
	2128	const regnode *nop = NEXTOPER( convert );
	2129	mjd_offset= Node_Offset((nop));
	2130	mjd_nodelen= Node_Length((nop));
	2131	});
	2132	}
	2133	DEBUG_OPTIMISE_r(
	2134	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2135	(int)depth * 2 + 2, "",
	2136	(UV)mjd_offset, (UV)mjd_nodelen)
	2137	);
	2138	#endif
	2139	/* But first we check to see if there is a common prefix we can
	2140	split out as an EXACT and put in front of the TRIE node. */
	2141	trie->startstate= 1;
	2142	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2143	U32 state;
	2144	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2145	U32 ofs = 0;
	2146	I32 idx = -1;
	2147	U32 count = 0;
	2148	const U32 base = trie->states[ state ].trans.base;
	2149
	2150	if ( trie->states[state].wordnum )
	2151	count = 1;
	2152
	2153	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2154	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2155	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2156	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2157	{
	2158	if ( ++count > 1 ) {
	2159	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2160	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2161	if ( state == 1 ) break;
	2162	if ( count == 2 ) {
	2163	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2164	DEBUG_OPTIMISE_r(
	2165	PerlIO_printf(Perl_debug_log,
	2166	"%*sNew Start State=%"UVuf" Class: [",
	2167	(int)depth * 2 + 2, "",
	2168	(UV)state));
	2169	if (idx >= 0) {
	2170	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2171	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2172
	2173	TRIE_BITMAP_SET(trie,*ch);
	2174	if ( folder )
	2175	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2176	DEBUG_OPTIMISE_r(
	2177	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2178	);
	2179	}
	2180	}
	2181	TRIE_BITMAP_SET(trie,*ch);
	2182	if ( folder )
	2183	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2184	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2185	}
	2186	idx = ofs;
	2187	}
	2188	}
	2189	if ( count == 1 ) {
	2190	SV **tmp = av_fetch( revcharmap, idx, 0);
	2191	STRLEN len;
	2192	char ch = SvPV( tmp, len );
	2193	DEBUG_OPTIMISE_r({
	2194	SV *sv=sv_newmortal();
	2195	PerlIO_printf( Perl_debug_log,
	2196	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2197	(int)depth * 2 + 2, "",
	2198	(UV)state, (UV)idx,
	2199	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2200	PL_colors[0], PL_colors[1],
	2201	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2202	PERL_PV_ESCAPE_FIRSTCHAR
	2203	)
	2204	);
	2205	});
	2206	if ( state==1 ) {
	2207	OP( convert ) = nodetype;
	2208	str=STRING(convert);
	2209	STR_LEN(convert)=0;
	2210	}
	2211	STR_LEN(convert) += len;
	2212	while (len--)
	2213	str++ = ch++;
	2214	} else {
	2215	#ifdef DEBUGGING
	2216	if (state>1)
	2217	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2218	#endif
	2219	break;
	2220	}
	2221	}
	2222	trie->prefixlen = (state-1);
	2223	if (str) {
	2224	regnode *n = convert+NODE_SZ_STR(convert);
	2225	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2226	trie->startstate = state;
	2227	trie->minlen -= (state - 1);
	2228	trie->maxlen -= (state - 1);
	2229	#ifdef DEBUGGING
	2230	/* At least the UNICOS C compiler choked on this
	2231	* being argument to DEBUG_r(), so let's just have
	2232	* it right here. */
	2233	if (
	2234	#ifdef PERL_EXT_RE_BUILD
	2235	1
	2236	#else
	2237	DEBUG_r_TEST
	2238	#endif
	2239	) {
	2240	regnode *fix = convert;
	2241	U32 word = trie->wordcount;
	2242	mjd_nodelen++;
	2243	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2244	while( ++fix < n ) {
	2245	Set_Node_Offset_Length(fix, 0, 0);
	2246	}
	2247	while (word--) {
	2248	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2249	if (tmp) {
	2250	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2251	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2252	else
	2253	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2254	}
	2255	}
	2256	}
	2257	#endif
	2258	if (trie->maxlen) {
	2259	convert = n;
	2260	} else {
	2261	NEXT_OFF(convert) = (U16)(tail - convert);
	2262	DEBUG_r(optimize= n);
	2263	}
	2264	}
	2265	}
	2266	if (!jumper)
	2267	jumper = last;
	2268	if ( trie->maxlen ) {
	2269	NEXT_OFF( convert ) = (U16)(tail - convert);
	2270	ARG_SET( convert, data_slot );
	2271	/* Store the offset to the first unabsorbed branch in
	2272	jump[0], which is otherwise unused by the jump logic.
	2273	We use this when dumping a trie and during optimisation. */
	2274	if (trie->jump)
	2275	trie->jump[0] = (U16)(nextbranch - convert);
	2276
	2277	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2278	* and there is a bitmap
	2279	* and the first "jump target" node we found leaves enough room
	2280	* then convert the TRIE node into a TRIEC node, with the bitmap
	2281	* embedded inline in the opcode - this is hypothetically faster.
	2282	*/
	2283	if ( !trie->states[trie->startstate].wordnum
	2284	&& trie->bitmap
	2285	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2286	{
	2287	OP( convert ) = TRIEC;
	2288	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2289	PerlMemShared_free(trie->bitmap);
	2290	trie->bitmap= NULL;
	2291	} else
	2292	OP( convert ) = TRIE;
	2293
	2294	/* store the type in the flags */
	2295	convert->flags = nodetype;
	2296	DEBUG_r({
	2297	optimize = convert
	2298	+ NODE_STEP_REGNODE
	2299	+ regarglen[ OP( convert ) ];
	2300	});
	2301	/* XXX We really should free up the resource in trie now,
	2302	as we won't use them - (which resources?) dmq */
	2303	}
	2304	/* needed for dumping*/
	2305	DEBUG_r(if (optimize) {
	2306	regnode *opt = convert;
	2307
	2308	while ( ++opt < optimize) {
	2309	Set_Node_Offset_Length(opt,0,0);
	2310	}
	2311	/*
	2312	Try to clean up some of the debris left after the
	2313	optimisation.
	2314	*/
	2315	while( optimize < jumper ) {
	2316	mjd_nodelen += Node_Length((optimize));
	2317	OP( optimize ) = OPTIMIZED;
	2318	Set_Node_Offset_Length(optimize,0,0);
	2319	optimize++;
	2320	}
	2321	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2322	});
	2323	} /* end node insert */
	2324
	2325	/* Finish populating the prev field of the wordinfo array. Walk back
	2326	* from each accept state until we find another accept state, and if
	2327	* so, point the first word's .prev field at the second word. If the
	2328	* second already has a .prev field set, stop now. This will be the
	2329	* case either if we've already processed that word's accept state,
	2330	* or that state had multiple words, and the overspill words were
	2331	* already linked up earlier.
	2332	*/
	2333	{
	2334	U16 word;
	2335	U32 state;
	2336	U16 prev;
	2337
	2338	for (word=1; word <= trie->wordcount; word++) {
	2339	prev = 0;
	2340	if (trie->wordinfo[word].prev)
	2341	continue;
	2342	state = trie->wordinfo[word].accept;
	2343	while (state) {
	2344	state = prev_states[state];
	2345	if (!state)
	2346	break;
	2347	prev = trie->states[state].wordnum;
	2348	if (prev)
	2349	break;
	2350	}
	2351	trie->wordinfo[word].prev = prev;
	2352	}
	2353	Safefree(prev_states);
	2354	}
	2355
	2356
	2357	/* and now dump out the compressed format */
	2358	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2359
	2360	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2361	#ifdef DEBUGGING
	2362	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2363	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2364	#else
	2365	SvREFCNT_dec(revcharmap);
	2366	#endif
	2367	return trie->jump
	2368	? MADE_JUMP_TRIE
	2369	: trie->startstate>1
	2370	? MADE_EXACT_TRIE
	2371	: MADE_TRIE;
	2372	}
	2373
	2374	STATIC void
	2375	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2376	{
	2377	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2378
	2379	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2380	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2381	ISBN 0-201-10088-6
	2382
	2383	We find the fail state for each state in the trie, this state is the longest proper
	2384	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2385	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2386	the DFA not to have to restart after its tried and failed a word at a given point, it
	2387	simply continues as though it had been matching the other word in the first place.
	2388	Consider
	2389	'abcdgu'=~/abcdefg\|cdgu/
	2390	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2391	fail, which would bring us to the state representing 'd' in the second word where we would
	2392	try 'g' and succeed, proceeding to match 'cdgu'.
	2393	*/
	2394	/* add a fail transition */
	2395	const U32 trie_offset = ARG(source);
	2396	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2397	U32 *q;
	2398	const U32 ucharcount = trie->uniquecharcount;
	2399	const U32 numstates = trie->statecount;
	2400	const U32 ubound = trie->lasttrans + ucharcount;
	2401	U32 q_read = 0;
	2402	U32 q_write = 0;
	2403	U32 charid;
	2404	U32 base = trie->states[ 1 ].trans.base;
	2405	U32 *fail;
	2406	reg_ac_data *aho;
	2407	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2408	GET_RE_DEBUG_FLAGS_DECL;
	2409
	2410	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2411	#ifndef DEBUGGING
	2412	PERL_UNUSED_ARG(depth);
	2413	#endif
	2414
	2415
	2416	ARG_SET( stclass, data_slot );
	2417	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2418	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2419	aho->trie=trie_offset;
	2420	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2421	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2422	Newxz( q, numstates, U32);
	2423	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2424	aho->refcount = 1;
	2425	fail = aho->fail;
	2426	/* initialize fail[0..1] to be 1 so that we always have
	2427	a valid final fail state */
	2428	fail[ 0 ] = fail[ 1 ] = 1;
	2429
	2430	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2431	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2432	if ( newstate ) {
	2433	q[ q_write ] = newstate;
	2434	/* set to point at the root */
	2435	fail[ q[ q_write++ ] ]=1;
	2436	}
	2437	}
	2438	while ( q_read < q_write) {
	2439	const U32 cur = q[ q_read++ % numstates ];
	2440	base = trie->states[ cur ].trans.base;
	2441
	2442	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2443	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2444	if (ch_state) {
	2445	U32 fail_state = cur;
	2446	U32 fail_base;
	2447	do {
	2448	fail_state = fail[ fail_state ];
	2449	fail_base = aho->states[ fail_state ].trans.base;
	2450	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2451
	2452	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2453	fail[ ch_state ] = fail_state;
	2454	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2455	{
	2456	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2457	}
	2458	q[ q_write++ % numstates] = ch_state;
	2459	}
	2460	}
	2461	}
	2462	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2463	when we fail in state 1, this allows us to use the
	2464	charclass scan to find a valid start char. This is based on the principle
	2465	that theres a good chance the string being searched contains lots of stuff
	2466	that cant be a start char.
	2467	*/
	2468	fail[ 0 ] = fail[ 1 ] = 0;
	2469	DEBUG_TRIE_COMPILE_r({
	2470	PerlIO_printf(Perl_debug_log,
	2471	"%*sStclass Failtable (%"UVuf" states): 0",
	2472	(int)(depth * 2), "", (UV)numstates
	2473	);
	2474	for( q_read=1; q_read<numstates; q_read++ ) {
	2475	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2476	}
	2477	PerlIO_printf(Perl_debug_log, "\n");
	2478	});
	2479	Safefree(q);
	2480	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2481	}
	2482
	2483
	/*
	 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	 * These need to be revisited when a newer toolchain becomes available.
	 *
	 * SPARC64_GCC_WORKAROUND is (re)defined to 1 only when building for
	 * sparc64 with a GCC older than 2.96.
	 */
	#if defined(__sparc64__) && defined(__GNUC__)
	#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	#       undef  SPARC64_GCC_WORKAROUND
	#       define SPARC64_GCC_WORKAROUND 1
	#   endif
	#endif
	2494
	/*
	 * DEBUG_PEEP(str, scan, depth) - debug trace of one regnode.
	 *
	 * Wrapped in DEBUG_OPTIMISE_r().  When scan is non-NULL it prints, indented
	 * by depth*2 columns: the literal label str, the node number of scan, the
	 * regprop() description of scan, and the node number of regnext(scan)
	 * (0 when there is no next node).
	 *
	 * str must be a string literal (it is pasted into the format string).
	 * The expansion already ends in ';'.
	 */
	#define DEBUG_PEEP(str,scan,depth) \
	    DEBUG_OPTIMISE_r({if (scan){ \
	       SV * const mysv=sv_newmortal(); \
	       regnode *Next = regnext(scan); \
	       regprop(RExC_rx, mysv, scan); \
	       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	           Next ? (REG_NODE_NUM(Next)) : 0 ); \
	   }});
	2504
	2505
	2506
	2507
	2508
	/*
	 * JOIN_EXACT(scan, min, flags) - call join_exact() on scan, but only when
	 * the opcode of scan is of kind EXACT.
	 *
	 * Uses pRExC_state and depth from the expansion site.  The expansion is a
	 * bare `if` statement, so it must not be followed directly by an `else`.
	 */
	#define JOIN_EXACT(scan,min,flags) \
	    if (PL_regkind[OP(scan)] == EXACT) \
	        join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
	2512
	2513	STATIC U32
	2514	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, I32 min, U32 flags,regnode val, U32 depth) {
	2515	/* Merge several consecutive EXACTish nodes into one. */
	2516	regnode *n = regnext(scan);
	2517	U32 stringok = 1;
	2518	regnode *next = scan + NODE_SZ_STR(scan);
	2519	U32 merged = 0;
	2520	U32 stopnow = 0;
	2521	#ifdef DEBUGGING
	2522	regnode *stop = scan;
	2523	GET_RE_DEBUG_FLAGS_DECL;
	2524	#else
	2525	PERL_UNUSED_ARG(depth);
	2526	#endif
	2527
	2528	PERL_ARGS_ASSERT_JOIN_EXACT;
	2529	#ifndef EXPERIMENTAL_INPLACESCAN
	2530	PERL_UNUSED_ARG(flags);
	2531	PERL_UNUSED_ARG(val);
	2532	#endif
	2533	DEBUG_PEEP("join",scan,depth);
	2534
	2535	/* Skip NOTHING, merge EXACT. /
	2536	while (n &&
	2537	( PL_regkind[OP(n)] == NOTHING \|\|
	2538	(stringok && (OP(n) == OP(scan))))
	2539	&& NEXT_OFF(n)
	2540	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
	2541
	2542	if (OP(n) == TAIL \|\| n > next)
	2543	stringok = 0;
	2544	if (PL_regkind[OP(n)] == NOTHING) {
	2545	DEBUG_PEEP("skip:",n,depth);
	2546	NEXT_OFF(scan) += NEXT_OFF(n);
	2547	next = n + NODE_STEP_REGNODE;
	2548	#ifdef DEBUGGING
	2549	if (stringok)
	2550	stop = n;
	2551	#endif
	2552	n = regnext(n);
	2553	}
	2554	else if (stringok) {
	2555	const unsigned int oldl = STR_LEN(scan);
	2556	regnode * const nnext = regnext(n);
	2557
	2558	DEBUG_PEEP("merg",n,depth);
	2559
	2560	merged++;
	2561	if (oldl + STR_LEN(n) > U8_MAX)
	2562	break;
	2563	NEXT_OFF(scan) += NEXT_OFF(n);
	2564	STR_LEN(scan) += STR_LEN(n);
	2565	next = n + NODE_SZ_STR(n);
	2566	/* Now we can overwrite n : /
	2567	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2568	#ifdef DEBUGGING
	2569	stop = next - 1;
	2570	#endif
	2571	n = nnext;
	2572	if (stopnow) break;
	2573	}
	2574
	2575	#ifdef EXPERIMENTAL_INPLACESCAN
	2576	if (flags && !NEXT_OFF(n)) {
	2577	DEBUG_PEEP("atch", val, depth);
	2578	if (reg_off_by_arg[OP(n)]) {
	2579	ARG_SET(n, val - n);
	2580	}
	2581	else {
	2582	NEXT_OFF(n) = val - n;
	2583	}
	2584	stopnow = 1;
	2585	}
	2586	#endif
	2587	}
	2588	#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
	2589	#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
	2590	#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
	2591	#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
	2592
	2593	if (UTF
	2594	&& ( OP(scan) == EXACTF \|\| OP(scan) == EXACTFU \|\| OP(scan) == EXACTFA)
	2595	&& ( STR_LEN(scan) >= 6 ) )
	2596	{
	2597	/*
	2598	Two problematic code points in Unicode casefolding of EXACT nodes:
	2599
	2600	U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2601	U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2602
	2603	which casefold to
	2604
	2605	Unicode UTF-8
	2606
	2607	U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2608	U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2609
	2610	This means that in case-insensitive matching (or "loose matching",
	2611	as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
	2612	length of the above casefolded versions) can match a target string
	2613	of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
	2614	This would rather mess up the minimum length computation.
	2615
	2616	What we'll do is to look for the tail four bytes, and then peek
	2617	at the preceding two bytes to see whether we need to decrease
	2618	the minimum length by four (six minus two).
	2619
	2620	Thanks to the design of UTF-8, there cannot be false matches:
	2621	A sequence of valid UTF-8 bytes cannot be a subsequence of
	2622	another valid sequence of UTF-8 bytes.
	2623
	2624	*/
	2625	char * const s0 = STRING(scan), s, t;
	2626	char * const s1 = s0 + STR_LEN(scan) - 1;
	2627	char * const s2 = s1 - 4;
	2628	#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
	2629	const char t0[] = "\xaf\x49\xaf\x42";
	2630	#else
	2631	const char t0[] = "\xcc\x88\xcc\x81";
	2632	#endif
	2633	const char * const t1 = t0 + 3;
	2634
	2635	for (s = s0 + 2;
	2636	s < s2 && (t = ninstr(s, s1, t0, t1));
	2637	s = t + 4) {
	2638	#ifdef EBCDIC
	2639	if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) \|\|
	2640	((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
	2641	#else
	2642	if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) \|\|
	2643	((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
	2644	#endif
	2645	*min -= 4;
	2646	}
	2647	}
	2648
	2649	#ifdef DEBUGGING
	2650	/* Allow dumping */
	2651	n = scan + NODE_SZ_STR(scan);
	2652	while (n <= stop) {
	2653	if (PL_regkind[OP(n)] != NOTHING \|\| OP(n) == NOTHING) {
	2654	OP(n) = OPTIMIZED;
	2655	NEXT_OFF(n) = 0;
	2656	}
	2657	n++;
	2658	}
	2659	#endif
	2660	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2661	return stopnow;
	2662	}
	2663
	2664	/* REx optimizer. Converts nodes into quicker variants "in place".
	2665	Finds fixed substrings. */
	2666
	2667	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2668	to the position after last scanned or to NULL. */
	2669
	/* Allocate the buffer that 'and_withp' points at.
	 *
	 * Expects a pointer variable named 'and_withp' in the calling scope and
	 * asserts that it is still NULL.  One struct regnode_charclass_class is
	 * allocated with Newx() and then handed to SAVEFREEPV() (presumably so
	 * that it is released automatically when the save stack unwinds --
	 * confirm against scope.h).
	 *
	 * The body is wrapped in do { } while (0) so that all three statements
	 * stay together when the macro is the body of an unbraced if/else. */
	#define INIT_AND_WITHP \
	    do { \
	        assert(!and_withp); \
	        Newx(and_withp,1,struct regnode_charclass_class); \
	        SAVEFREEPV(and_withp); \
	    } while (0)
	2674
	2675	/* this is a chain of data about sub patterns we are processing that
	2676	need to be handled separately/specially in study_chunk. Its so
	2677	we can simulate recursion without losing state. */
	2678	struct scan_frame;
	2679	typedef struct scan_frame {
	2680	regnode last; / last node to process in this frame */
	2681	regnode next; / next node to process when last is reached */
	2682	struct scan_frame prev; /previous frame*/
	2683	I32 stop; /* what stopparen do we use */
	2684	} scan_frame;
	2685
	2686
	/* Shorthand for scan_commit() that forwards its three arguments unchanged
	 * and supplies the 'is_inf' variable of the calling scope as the fourth. */
	#define SCAN_COMMIT(state, sdata, minlen) \
	    scan_commit(state, sdata, minlen, is_inf)
	2688
	/* Emit the two switch cases for a character class OPNAME and its
	 * complement N<OPNAME>, updating the bitmap of data->start_class for the
	 * code points 0..255 according to is_<OPNAME>_cp():
	 *
	 *   - with SCF_DO_STCLASS_AND set in 'flags', bits of code points that
	 *     do not belong to the (possibly complemented) class are cleared;
	 *   - otherwise, bits of code points that do belong to it are set.
	 *
	 * Relies on 'flags', 'value' and 'data' from the calling scope.  The
	 * expansion ends in a 'break' without a semicolon, so the caller
	 * supplies it. */
	#define CASE_SYNST_FNC(OPNAME) \
	    case OPNAME: \
	        for (value = 0; value < 256; value++) { \
	            if (flags & SCF_DO_STCLASS_AND) { \
	                if (!is_ ## OPNAME ## _cp(value)) { \
	                    ANYOF_BITMAP_CLEAR(data->start_class, value); \
	                } \
	            } \
	            else if (is_ ## OPNAME ## _cp(value)) { \
	                ANYOF_BITMAP_SET(data->start_class, value); \
	            } \
	        } \
	        break; \
	    case N ## OPNAME: \
	        for (value = 0; value < 256; value++) { \
	            if (flags & SCF_DO_STCLASS_AND) { \
	                if (is_ ## OPNAME ## _cp(value)) { \
	                    ANYOF_BITMAP_CLEAR(data->start_class, value); \
	                } \
	            } \
	            else if (!is_ ## OPNAME ## _cp(value)) { \
	                ANYOF_BITMAP_SET(data->start_class, value); \
	            } \
	        } \
	        break
	2714
	2715
	2716
	2717	STATIC I32
	2718	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	2719	I32 minlenp, I32 deltap,
	2720	regnode *last,
	2721	scan_data_t *data,
	2722	I32 stopparen,
	2723	U8* recursed,
	2724	struct regnode_charclass_class *and_withp,
	2725	U32 flags, U32 depth)
	2726	/* scanp: Start here (read-write). */
	2727	/* deltap: Write maxlen-minlen here. */
	2728	/* last: Stop before this one. */
	2729	/* data: string data about the pattern */
	2730	/* stopparen: treat close N as END */
	2731	/* recursed: which subroutines have we recursed into */
	2732	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	2733	{
	2734	dVAR;
	2735	I32 min = 0, pars = 0, code;
	2736	regnode scan = scanp, *next;
	2737	I32 delta = 0;
	2738	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	2739	int is_inf_internal = 0; /* The studied chunk is infinite */
	2740	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	2741	scan_data_t data_fake;
	2742	SV *re_trie_maxbuff = NULL;
	2743	regnode *first_non_open = scan;
	2744	I32 stopmin = I32_MAX;
	2745	scan_frame *frame = NULL;
	2746	GET_RE_DEBUG_FLAGS_DECL;
	2747
	2748	PERL_ARGS_ASSERT_STUDY_CHUNK;
	2749
	2750	#ifdef DEBUGGING
	2751	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	2752	#endif
	2753
	2754	if ( depth == 0 ) {
	2755	while (first_non_open && OP(first_non_open) == OPEN)
	2756	first_non_open=regnext(first_non_open);
	2757	}
	2758
	2759
	2760	fake_study_recurse:
	2761	while ( scan && OP(scan) != END && scan < last ){
	2762	/* Peephole optimizer: */
	2763	DEBUG_STUDYDATA("Peep:", data,depth);
	2764	DEBUG_PEEP("Peep",scan,depth);
	2765	JOIN_EXACT(scan,&min,0);
	2766
	2767	/* Follow the next-chain of the current node and optimize
	2768	away all the NOTHINGs from it. */
	2769	if (OP(scan) != CURLYX) {
	2770	const int max = (reg_off_by_arg[OP(scan)]
	2771	? I32_MAX
	2772	/* I32 may be smaller than U16 on CRAYs! */
	2773	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	2774	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	2775	int noff;
	2776	regnode *n = scan;
	2777
	2778	/* Skip NOTHING and LONGJMP. */
	2779	while ((n = regnext(n))
	2780	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	2781	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	2782	&& off + noff < max)
	2783	off += noff;
	2784	if (reg_off_by_arg[OP(scan)])
	2785	ARG(scan) = off;
	2786	else
	2787	NEXT_OFF(scan) = off;
	2788	}
	2789
	2790
	2791
	2792	/* The principal pseudo-switch. Cannot be a switch, since we
	2793	look into several different things. */
	2794	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	2795	\|\| OP(scan) == IFTHEN) {
	2796	next = regnext(scan);
	2797	code = OP(scan);
	2798	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	2799
	2800	if (OP(next) == code \|\| code == IFTHEN) {
	2801	/* NOTE - There is similar code to this block below for handling
	2802	TRIE nodes on a re-study. If you change stuff here check there
	2803	too. */
	2804	I32 max1 = 0, min1 = I32_MAX, num = 0;
	2805	struct regnode_charclass_class accum;
	2806	regnode * const startbranch=scan;
	2807
	2808	if (flags & SCF_DO_SUBSTR)
	2809	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	2810	if (flags & SCF_DO_STCLASS)
	2811	cl_init_zero(pRExC_state, &accum);
	2812
	2813	while (OP(scan) == code) {
	2814	I32 deltanext, minnext, f = 0, fake;
	2815	struct regnode_charclass_class this_class;
	2816
	2817	num++;
	2818	data_fake.flags = 0;
	2819	if (data) {
	2820	data_fake.whilem_c = data->whilem_c;
	2821	data_fake.last_closep = data->last_closep;
	2822	}
	2823	else
	2824	data_fake.last_closep = &fake;
	2825
	2826	data_fake.pos_delta = delta;
	2827	next = regnext(scan);
	2828	scan = NEXTOPER(scan);
	2829	if (code != BRANCH)
	2830	scan = NEXTOPER(scan);
	2831	if (flags & SCF_DO_STCLASS) {
	2832	cl_init(pRExC_state, &this_class);
	2833	data_fake.start_class = &this_class;
	2834	f = SCF_DO_STCLASS_AND;
	2835	}
	2836	if (flags & SCF_WHILEM_VISITED_POS)
	2837	f \|= SCF_WHILEM_VISITED_POS;
	2838
	2839	/* we suppose the run is continuous, last=next...*/
	2840	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	2841	next, &data_fake,
	2842	stopparen, recursed, NULL, f,depth+1);
	2843	if (min1 > minnext)
	2844	min1 = minnext;
	2845	if (max1 < minnext + deltanext)
	2846	max1 = minnext + deltanext;
	2847	if (deltanext == I32_MAX)
	2848	is_inf = is_inf_internal = 1;
	2849	scan = next;
	2850	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	2851	pars++;
	2852	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	2853	if ( stopmin > minnext)
	2854	stopmin = min + min1;
	2855	flags &= ~SCF_DO_SUBSTR;
	2856	if (data)
	2857	data->flags \|= SCF_SEEN_ACCEPT;
	2858	}
	2859	if (data) {
	2860	if (data_fake.flags & SF_HAS_EVAL)
	2861	data->flags \|= SF_HAS_EVAL;
	2862	data->whilem_c = data_fake.whilem_c;
	2863	}
	2864	if (flags & SCF_DO_STCLASS)
	2865	cl_or(pRExC_state, &accum, &this_class);
	2866	}
	2867	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	2868	min1 = 0;
	2869	if (flags & SCF_DO_SUBSTR) {
	2870	data->pos_min += min1;
	2871	data->pos_delta += max1 - min1;
	2872	if (max1 != min1 \|\| is_inf)
	2873	data->longest = &(data->longest_float);
	2874	}
	2875	min += min1;
	2876	delta += max1 - min1;
	2877	if (flags & SCF_DO_STCLASS_OR) {
	2878	cl_or(pRExC_state, data->start_class, &accum);
	2879	if (min1) {
	2880	cl_and(data->start_class, and_withp);
	2881	flags &= ~SCF_DO_STCLASS;
	2882	}
	2883	}
	2884	else if (flags & SCF_DO_STCLASS_AND) {
	2885	if (min1) {
	2886	cl_and(data->start_class, &accum);
	2887	flags &= ~SCF_DO_STCLASS;
	2888	}
	2889	else {
	2890	/* Switch to OR mode: cache the old value of
	2891	* data->start_class */
	2892	INIT_AND_WITHP;
	2893	StructCopy(data->start_class, and_withp,
	2894	struct regnode_charclass_class);
	2895	flags &= ~SCF_DO_STCLASS_AND;
	2896	StructCopy(&accum, data->start_class,
	2897	struct regnode_charclass_class);
	2898	flags \|= SCF_DO_STCLASS_OR;
	2899	data->start_class->flags \|= ANYOF_EOS;
	2900	}
	2901	}
	2902
	2903	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	2904	/* demq.
	2905
	2906	Assuming this was/is a branch we are dealing with: 'scan' now
	2907	points at the item that follows the branch sequence, whatever
	2908	it is. We now start at the beginning of the sequence and look
	2909	for subsequences of
	2910
	2911	BRANCH->EXACT=>x1
	2912	BRANCH->EXACT=>x2
	2913	tail
	2914
	2915	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	2916
	2917	If we can find such a subsequence we need to turn the first
	2918	element into a trie and then add the subsequent branch exact
	2919	strings to the trie.
	2920
	2921	We have two cases
	2922
	2923	1. patterns where the whole set of branches can be converted.
	2924
	2925	2. patterns where only a subset can be converted.
	2926
	2927	In case 1 we can replace the whole set with a single regop
	2928	for the trie. In case 2 we need to keep the start and end
	2929	branches so
	2930
	2931	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	2932	becomes BRANCH TRIE; BRANCH X;
	2933
	2934	There is an additional case, that being where there is a
	2935	common prefix, which gets split out into an EXACT like node
	2936	preceding the TRIE node.
	2937
	2938	If x(1..n)==tail then we can do a simple trie, if not we make
	2939	a "jump" trie, such that when we match the appropriate word
	2940	we "jump" to the appropriate tail node. Essentially we turn
	2941	a nested if into a case structure of sorts.
	2942
	2943	*/
	2944
	2945	int made=0;
	2946	if (!re_trie_maxbuff) {
	2947	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	2948	if (!SvIOK(re_trie_maxbuff))
	2949	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	2950	}
	2951	if ( SvIV(re_trie_maxbuff)>=0 ) {
	2952	regnode *cur;
	2953	regnode first = (regnode )NULL;
	2954	regnode last = (regnode )NULL;
	2955	regnode *tail = scan;
	2956	U8 optype = 0;
	2957	U32 count=0;
	2958
	2959	#ifdef DEBUGGING
	2960	SV * const mysv = sv_newmortal(); /* for dumping */
	2961	#endif
	2962	/* var tail is used because there may be a TAIL
	2963	regop in the way. Ie, the exacts will point to the
	2964	thing following the TAIL, but the last branch will
	2965	point at the TAIL. So we advance tail. If we
	2966	have nested (?:) we may have to move through several
	2967	tails.
	2968	*/
	2969
	2970	while ( OP( tail ) == TAIL ) {
	2971	/* this is the TAIL generated by (?:) */
	2972	tail = regnext( tail );
	2973	}
	2974
	2975
	2976	DEBUG_OPTIMISE_r({
	2977	regprop(RExC_rx, mysv, tail );
	2978	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	2979	(int)depth * 2 + 2, "",
	2980	"Looking for TRIE'able sequences. Tail node is: ",
	2981	SvPV_nolen_const( mysv )
	2982	);
	2983	});
	2984
	2985	/*
	2986
	2987	step through the branches, cur represents each
	2988	branch, noper is the first thing to be matched
	2989	as part of that branch and noper_next is the
	2990	regnext() of that node. if noper is an EXACT
	2991	and noper_next is the same as scan (our current
	2992	position in the regex) then the EXACT branch is
	2993	a possible optimization target. Once we have
	2994	two or more consecutive such branches we can
	2995	create a trie of the EXACT's contents and stich
	2996	it in place. If the sequence represents all of
	2997	the branches we eliminate the whole thing and
	2998	replace it with a single TRIE. If it is a
	2999	subsequence then we need to stitch it in. This
	3000	means the first branch has to remain, and needs
	3001	to be repointed at the item on the branch chain
	3002	following the last branch optimized. This could
	3003	be either a BRANCH, in which case the
	3004	subsequence is internal, or it could be the
	3005	item following the branch sequence in which
	3006	case the subsequence is at the end.
	3007
	3008	*/
	3009
	3010	/* dont use tail as the end marker for this traverse */
	3011	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3012	regnode * const noper = NEXTOPER( cur );
	3013	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3014	regnode * const noper_next = regnext( noper );
	3015	#endif
	3016
	3017	DEBUG_OPTIMISE_r({
	3018	regprop(RExC_rx, mysv, cur);
	3019	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3020	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3021
	3022	regprop(RExC_rx, mysv, noper);
	3023	PerlIO_printf( Perl_debug_log, " -> %s",
	3024	SvPV_nolen_const(mysv));
	3025
	3026	if ( noper_next ) {
	3027	regprop(RExC_rx, mysv, noper_next );
	3028	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3029	SvPV_nolen_const(mysv));
	3030	}
	3031	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
	3032	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
	3033	});
	3034	if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
	3035	: PL_regkind[ OP( noper ) ] == EXACT )
	3036	\|\| OP(noper) == NOTHING )
	3037	#ifdef NOJUMPTRIE
	3038	&& noper_next == tail
	3039	#endif
	3040	&& count < U16_MAX)
	3041	{
	3042	count++;
	3043	if ( !first \|\| optype == NOTHING ) {
	3044	if (!first) first = cur;
	3045	optype = OP( noper );
	3046	} else {
	3047	last = cur;
	3048	}
	3049	} else {
	3050	/*
	3051	Currently the trie logic handles case insensitive matching properly only
	3052	when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
	3053	semantics).
	3054
	3055	If/when this is fixed the following define can be swapped
	3056	in below to fully enable trie logic.
	3057
	3058	#define TRIE_TYPE_IS_SAFE 1
	3059
	3060	*/
	3061	#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) \|\| optype==EXACT)
	3062
	3063	if ( last && TRIE_TYPE_IS_SAFE ) {
	3064	make_trie( pRExC_state,
	3065	startbranch, first, cur, tail, count,
	3066	optype, depth+1 );
	3067	}
	3068	if ( PL_regkind[ OP( noper ) ] == EXACT
	3069	#ifdef NOJUMPTRIE
	3070	&& noper_next == tail
	3071	#endif
	3072	){
	3073	count = 1;
	3074	first = cur;
	3075	optype = OP( noper );
	3076	} else {
	3077	count = 0;
	3078	first = NULL;
	3079	optype = 0;
	3080	}
	3081	last = NULL;
	3082	}
	3083	}
	3084	DEBUG_OPTIMISE_r({
	3085	regprop(RExC_rx, mysv, cur);
	3086	PerlIO_printf( Perl_debug_log,
	3087	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3088	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3089
	3090	});
	3091
	3092	if ( last && TRIE_TYPE_IS_SAFE ) {
	3093	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
	3094	#ifdef TRIE_STUDY_OPT
	3095	if ( ((made == MADE_EXACT_TRIE &&
	3096	startbranch == first)
	3097	\|\| ( first_non_open == first )) &&
	3098	depth==0 ) {
	3099	flags \|= SCF_TRIE_RESTUDY;
	3100	if ( startbranch == first
	3101	&& scan == tail )
	3102	{
	3103	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3104	}
	3105	}
	3106	#endif
	3107	}
	3108	}
	3109
	3110	} /* do trie */
	3111
	3112	}
	3113	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3114	scan = NEXTOPER(NEXTOPER(scan));
	3115	} else /* single branch is optimized. */
	3116	scan = NEXTOPER(scan);
	3117	continue;
	3118	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3119	scan_frame *newframe = NULL;
	3120	I32 paren;
	3121	regnode *start;
	3122	regnode *end;
	3123
	3124	if (OP(scan) != SUSPEND) {
	3125	/* set the pointer */
	3126	if (OP(scan) == GOSUB) {
	3127	paren = ARG(scan);
	3128	RExC_recurse[ARG2L(scan)] = scan;
	3129	start = RExC_open_parens[paren-1];
	3130	end = RExC_close_parens[paren-1];
	3131	} else {
	3132	paren = 0;
	3133	start = RExC_rxi->program + 1;
	3134	end = RExC_opend;
	3135	}
	3136	if (!recursed) {
	3137	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3138	SAVEFREEPV(recursed);
	3139	}
	3140	if (!PAREN_TEST(recursed,paren+1)) {
	3141	PAREN_SET(recursed,paren+1);
	3142	Newx(newframe,1,scan_frame);
	3143	} else {
	3144	if (flags & SCF_DO_SUBSTR) {
	3145	SCAN_COMMIT(pRExC_state,data,minlenp);
	3146	data->longest = &(data->longest_float);
	3147	}
	3148	is_inf = is_inf_internal = 1;
	3149	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3150	cl_anything(pRExC_state, data->start_class);
	3151	flags &= ~SCF_DO_STCLASS;
	3152	}
	3153	} else {
	3154	Newx(newframe,1,scan_frame);
	3155	paren = stopparen;
	3156	start = scan+2;
	3157	end = regnext(scan);
	3158	}
	3159	if (newframe) {
	3160	assert(start);
	3161	assert(end);
	3162	SAVEFREEPV(newframe);
	3163	newframe->next = regnext(scan);
	3164	newframe->last = last;
	3165	newframe->stop = stopparen;
	3166	newframe->prev = frame;
	3167
	3168	frame = newframe;
	3169	scan = start;
	3170	stopparen = paren;
	3171	last = end;
	3172
	3173	continue;
	3174	}
	3175	}
	3176	else if (OP(scan) == EXACT) {
	3177	I32 l = STR_LEN(scan);
	3178	UV uc;
	3179	if (UTF) {
	3180	const U8 * const s = (U8*)STRING(scan);
	3181	l = utf8_length(s, s + l);
	3182	uc = utf8_to_uvchr(s, NULL);
	3183	} else {
	3184	uc = ((U8)STRING(scan));
	3185	}
	3186	min += l;
	3187	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3188	/* The code below prefers earlier match for fixed
	3189	offset, later match for variable offset. */
	3190	if (data->last_end == -1) { /* Update the start info. */
	3191	data->last_start_min = data->pos_min;
	3192	data->last_start_max = is_inf
	3193	? I32_MAX : data->pos_min + data->pos_delta;
	3194	}
	3195	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3196	if (UTF)
	3197	SvUTF8_on(data->last_found);
	3198	{
	3199	SV * const sv = data->last_found;
	3200	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3201	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3202	if (mg && mg->mg_len >= 0)
	3203	mg->mg_len += utf8_length((U8*)STRING(scan),
	3204	(U8*)STRING(scan)+STR_LEN(scan));
	3205	}
	3206	data->last_end = data->pos_min + l;
	3207	data->pos_min += l; /* As in the first entry. */
	3208	data->flags &= ~SF_BEFORE_EOL;
	3209	}
	3210	if (flags & SCF_DO_STCLASS_AND) {
	3211	/* Check whether it is compatible with what we know already! */
	3212	int compat = 1;
	3213
	3214
	3215	/* If compatible, we or it in below. It is compatible if is
	3216	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3217	* it's for a locale. Even if there isn't unicode semantics
	3218	* here, at runtime there may be because of matching against a
	3219	* utf8 string, so accept a possible false positive for
	3220	* latin1-range folds */
	3221	if (uc >= 0x100 \|\|
	3222	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3223	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3224	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3225	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3226	)
	3227	{
	3228	compat = 0;
	3229	}
	3230	ANYOF_CLASS_ZERO(data->start_class);
	3231	ANYOF_BITMAP_ZERO(data->start_class);
	3232	if (compat)
	3233	ANYOF_BITMAP_SET(data->start_class, uc);
	3234	else if (uc >= 0x100) {
	3235	int i;
	3236
	3237	/* Some Unicode code points fold to the Latin1 range; as
	3238	* XXX temporary code, instead of figuring out if this is
	3239	* one, just assume it is and set all the start class bits
	3240	* that could be some such above 255 code point's fold
	3241	* which will generate fals positives. As the code
	3242	* elsewhere that does compute the fold settles down, it
	3243	* can be extracted out and re-used here */
	3244	for (i = 0; i < 256; i++){
	3245	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3246	ANYOF_BITMAP_SET(data->start_class, i);
	3247	}
	3248	}
	3249	}
	3250	data->start_class->flags &= ~ANYOF_EOS;
	3251	if (uc < 0x100)
	3252	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3253	}
	3254	else if (flags & SCF_DO_STCLASS_OR) {
	3255	/* false positive possible if the class is case-folded */
	3256	if (uc < 0x100)
	3257	ANYOF_BITMAP_SET(data->start_class, uc);
	3258	else
	3259	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3260	data->start_class->flags &= ~ANYOF_EOS;
	3261	cl_and(data->start_class, and_withp);
	3262	}
	3263	flags &= ~SCF_DO_STCLASS;
	3264	}
	3265	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3266	I32 l = STR_LEN(scan);
	3267	UV uc = ((U8)STRING(scan));
	3268
	3269	/* Search for fixed substrings supports EXACT only. */
	3270	if (flags & SCF_DO_SUBSTR) {
	3271	assert(data);
	3272	SCAN_COMMIT(pRExC_state, data, minlenp);
	3273	}
	3274	if (UTF) {
	3275	const U8 * const s = (U8 *)STRING(scan);
	3276	l = utf8_length(s, s + l);
	3277	uc = utf8_to_uvchr(s, NULL);
	3278	}
	3279	min += l;
	3280	if (flags & SCF_DO_SUBSTR)
	3281	data->pos_min += l;
	3282	if (flags & SCF_DO_STCLASS_AND) {
	3283	/* Check whether it is compatible with what we know already! */
	3284	int compat = 1;
	3285	if (uc >= 0x100 \|\|
	3286	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3287	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3288	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3289	{
	3290	compat = 0;
	3291	}
	3292	ANYOF_CLASS_ZERO(data->start_class);
	3293	ANYOF_BITMAP_ZERO(data->start_class);
	3294	if (compat) {
	3295	ANYOF_BITMAP_SET(data->start_class, uc);
	3296	data->start_class->flags &= ~ANYOF_EOS;
	3297	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3298	if (OP(scan) == EXACTFL) {
	3299	/* XXX This set is probably no longer necessary, and
	3300	* probably wrong as LOCALE now is on in the initial
	3301	* state */
	3302	data->start_class->flags \|= ANYOF_LOCALE;
	3303	}
	3304	else {
	3305
	3306	/* Also set the other member of the fold pair. In case
	3307	* that unicode semantics is called for at runtime, use
	3308	* the full latin1 fold. (Can't do this for locale,
	3309	* because not known until runtime */
	3310	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3311	}
	3312	}
	3313	else if (uc >= 0x100) {
	3314	int i;
	3315	for (i = 0; i < 256; i++){
	3316	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3317	ANYOF_BITMAP_SET(data->start_class, i);
	3318	}
	3319	}
	3320	}
	3321	}
	3322	else if (flags & SCF_DO_STCLASS_OR) {
	3323	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3324	/* false positive possible if the class is case-folded.
	3325	Assume that the locale settings are the same... */
	3326	if (uc < 0x100) {
	3327	ANYOF_BITMAP_SET(data->start_class, uc);
	3328	if (OP(scan) != EXACTFL) {
	3329
	3330	/* And set the other member of the fold pair, but
	3331	* can't do that in locale because not known until
	3332	* run-time */
	3333	ANYOF_BITMAP_SET(data->start_class,
	3334	PL_fold_latin1[uc]);
	3335	}
	3336	}
	3337	data->start_class->flags &= ~ANYOF_EOS;
	3338	}
	3339	cl_and(data->start_class, and_withp);
	3340	}
	3341	flags &= ~SCF_DO_STCLASS;
	3342	}
	3343	else if (REGNODE_VARIES(OP(scan))) {
	3344	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3345	I32 f = flags, pos_before = 0;
	3346	regnode * const oscan = scan;
	3347	struct regnode_charclass_class this_class;
	3348	struct regnode_charclass_class *oclass = NULL;
	3349	I32 next_is_eval = 0;
	3350
	3351	switch (PL_regkind[OP(scan)]) {
	3352	case WHILEM: /* End of (?:...)* . */
	3353	scan = NEXTOPER(scan);
	3354	goto finish;
	3355	case PLUS:
	3356	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3357	next = NEXTOPER(scan);
	3358	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3359	mincount = 1;
	3360	maxcount = REG_INFTY;
	3361	next = regnext(scan);
	3362	scan = NEXTOPER(scan);
	3363	goto do_curly;
	3364	}
	3365	}
	3366	if (flags & SCF_DO_SUBSTR)
	3367	data->pos_min++;
	3368	min++;
	3369	/* Fall through. */
	3370	case STAR:
	3371	if (flags & SCF_DO_STCLASS) {
	3372	mincount = 0;
	3373	maxcount = REG_INFTY;
	3374	next = regnext(scan);
	3375	scan = NEXTOPER(scan);
	3376	goto do_curly;
	3377	}
	3378	is_inf = is_inf_internal = 1;
	3379	scan = regnext(scan);
	3380	if (flags & SCF_DO_SUBSTR) {
	3381	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3382	data->longest = &(data->longest_float);
	3383	}
	3384	goto optimize_curly_tail;
	3385	case CURLY:
	3386	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3387	&& (scan->flags == stopparen))
	3388	{
	3389	mincount = 1;
	3390	maxcount = 1;
	3391	} else {
	3392	mincount = ARG1(scan);
	3393	maxcount = ARG2(scan);
	3394	}
	3395	next = regnext(scan);
	3396	if (OP(scan) == CURLYX) {
	3397	I32 lp = (data ? *(data->last_closep) : 0);
	3398	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3399	}
	3400	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3401	next_is_eval = (OP(scan) == EVAL);
	3402	do_curly:
	3403	if (flags & SCF_DO_SUBSTR) {
	3404	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3405	pos_before = data->pos_min;
	3406	}
	3407	if (data) {
	3408	fl = data->flags;
	3409	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3410	if (is_inf)
	3411	data->flags \|= SF_IS_INF;
	3412	}
	3413	if (flags & SCF_DO_STCLASS) {
	3414	cl_init(pRExC_state, &this_class);
	3415	oclass = data->start_class;
	3416	data->start_class = &this_class;
	3417	f \|= SCF_DO_STCLASS_AND;
	3418	f &= ~SCF_DO_STCLASS_OR;
	3419	}
	3420	/* Exclude from super-linear cache processing any {n,m}
	3421	regops for which the combination of input pos and regex
	3422	pos is not enough information to determine if a match
	3423	will be possible.
	3424
	3425	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3426	regex pos at the \s*, the prospects for a match depend not
	3427	only on the input position but also on how many (bar\s*)
	3428	repeats into the {4,8} we are. */
	3429	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3430	f &= ~SCF_WHILEM_VISITED_POS;
	3431
	3432	/* This will finish on WHILEM, setting scan, or on NULL: */
	3433	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3434	last, data, stopparen, recursed, NULL,
	3435	(mincount == 0
	3436	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3437
	3438	if (flags & SCF_DO_STCLASS)
	3439	data->start_class = oclass;
	3440	if (mincount == 0 \|\| minnext == 0) {
	3441	if (flags & SCF_DO_STCLASS_OR) {
	3442	cl_or(pRExC_state, data->start_class, &this_class);
	3443	}
	3444	else if (flags & SCF_DO_STCLASS_AND) {
	3445	/* Switch to OR mode: cache the old value of
	3446	* data->start_class */
	3447	INIT_AND_WITHP;
	3448	StructCopy(data->start_class, and_withp,
	3449	struct regnode_charclass_class);
	3450	flags &= ~SCF_DO_STCLASS_AND;
	3451	StructCopy(&this_class, data->start_class,
	3452	struct regnode_charclass_class);
	3453	flags \|= SCF_DO_STCLASS_OR;
	3454	data->start_class->flags \|= ANYOF_EOS;
	3455	}
	3456	} else { /* Non-zero len */
	3457	if (flags & SCF_DO_STCLASS_OR) {
	3458	cl_or(pRExC_state, data->start_class, &this_class);
	3459	cl_and(data->start_class, and_withp);
	3460	}
	3461	else if (flags & SCF_DO_STCLASS_AND)
	3462	cl_and(data->start_class, &this_class);
	3463	flags &= ~SCF_DO_STCLASS;
	3464	}
	3465	if (!scan) /* It was not CURLYX, but CURLY. */
	3466	scan = next;
	3467	if ( /* ? quantifier ok, except for (?{ ... }) */
	3468	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3469	&& (minnext == 0) && (deltanext == 0)
	3470	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3471	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3472	{
	3473	ckWARNreg(RExC_parse,
	3474	"Quantifier unexpected on zero-length expression");
	3475	}
	3476
	3477	min += minnext * mincount;
	3478	is_inf_internal \|= ((maxcount == REG_INFTY
	3479	&& (minnext + deltanext) > 0)
	3480	\|\| deltanext == I32_MAX);
	3481	is_inf \|= is_inf_internal;
	3482	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3483
	3484	/* Try powerful optimization CURLYX => CURLYN. */
	3485	if ( OP(oscan) == CURLYX && data
	3486	&& data->flags & SF_IN_PAR
	3487	&& !(data->flags & SF_HAS_EVAL)
	3488	&& !deltanext && minnext == 1 ) {
	3489	/* Try to optimize to CURLYN. */
	3490	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3491	regnode * const nxt1 = nxt;
	3492	#ifdef DEBUGGING
	3493	regnode *nxt2;
	3494	#endif
	3495
	3496	/* Skip open. */
	3497	nxt = regnext(nxt);
	3498	if (!REGNODE_SIMPLE(OP(nxt))
	3499	&& !(PL_regkind[OP(nxt)] == EXACT
	3500	&& STR_LEN(nxt) == 1))
	3501	goto nogo;
	3502	#ifdef DEBUGGING
	3503	nxt2 = nxt;
	3504	#endif
	3505	nxt = regnext(nxt);
	3506	if (OP(nxt) != CLOSE)
	3507	goto nogo;
	3508	if (RExC_open_parens) {
	3509	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3510	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3511	}
	3512	/* Now we know that nxt2 is the only contents: */
	3513	oscan->flags = (U8)ARG(nxt);
	3514	OP(oscan) = CURLYN;
	3515	OP(nxt1) = NOTHING; /* was OPEN. */
	3516
	3517	#ifdef DEBUGGING
	3518	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3519	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3520	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3521	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3522	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3523	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3524	#endif
	3525	}
	3526	nogo:
	3527
	3528	/* Try optimization CURLYX => CURLYM. */
	3529	if ( OP(oscan) == CURLYX && data
	3530	&& !(data->flags & SF_HAS_PAR)
	3531	&& !(data->flags & SF_HAS_EVAL)
	3532	&& !deltanext /* atom is fixed width */
	3533	&& minnext != 0 /* CURLYM can't handle zero width */
	3534	) {
	3535	/* XXXX How to optimize if data == 0? */
	3536	/* Optimize to a simpler form. */
	3537	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3538	regnode *nxt2;
	3539
	3540	OP(oscan) = CURLYM;
	3541	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3542	&& (OP(nxt2) != WHILEM))
	3543	nxt = nxt2;
	3544	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3545	/* Need to optimize away parenths. */
	3546	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3547	/* Set the parenth number. */
	3548	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3549
	3550	oscan->flags = (U8)ARG(nxt);
	3551	if (RExC_open_parens) {
	3552	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3553	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3554	}
	3555	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3556	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3557
	3558	#ifdef DEBUGGING
	3559	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3560	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3561	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3562	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3563	#endif
	3564	#if 0
	3565	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3566	regnode *nnxt = regnext(nxt1);
	3567	if (nnxt == nxt) {
	3568	if (reg_off_by_arg[OP(nxt1)])
	3569	ARG_SET(nxt1, nxt2 - nxt1);
	3570	else if (nxt2 - nxt1 < U16_MAX)
	3571	NEXT_OFF(nxt1) = nxt2 - nxt1;
	3572	else
	3573	OP(nxt) = NOTHING; /* Cannot beautify */
	3574	}
	3575	nxt1 = nnxt;
	3576	}
	3577	#endif
	3578	/* Optimize again: */
	3579	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	3580	NULL, stopparen, recursed, NULL, 0,depth+1);
	3581	}
	3582	else
	3583	oscan->flags = 0;
	3584	}
	3585	else if ((OP(oscan) == CURLYX)
	3586	&& (flags & SCF_WHILEM_VISITED_POS)
	3587	/* See the comment on a similar expression above.
	3588	However, this time it's not a subexpression
	3589	we care about, but the expression itself. */
	3590	&& (maxcount == REG_INFTY)
	3591	&& data && ++data->whilem_c < 16) {
	3592	/* This stays as CURLYX, we can put the count/of pair. */
	3593	/* Find WHILEM (as in regexec.c) */
	3594	regnode *nxt = oscan + NEXT_OFF(oscan);
	3595
	3596	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	3597	nxt += ARG(nxt);
	3598	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	3599	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	3600	}
	3601	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	3602	pars++;
	3603	if (flags & SCF_DO_SUBSTR) {
	3604	SV *last_str = NULL;
	3605	int counted = mincount != 0;
	3606
	3607	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	3608	#if defined(SPARC64_GCC_WORKAROUND)
	3609	I32 b = 0;
	3610	STRLEN l = 0;
	3611	const char *s = NULL;
	3612	I32 old = 0;
	3613
	3614	if (pos_before >= data->last_start_min)
	3615	b = pos_before;
	3616	else
	3617	b = data->last_start_min;
	3618
	3619	l = 0;
	3620	s = SvPV_const(data->last_found, l);
	3621	old = b - data->last_start_min;
	3622
	3623	#else
	3624	I32 b = pos_before >= data->last_start_min
	3625	? pos_before : data->last_start_min;
	3626	STRLEN l;
	3627	const char * const s = SvPV_const(data->last_found, l);
	3628	I32 old = b - data->last_start_min;
	3629	#endif
	3630
	3631	if (UTF)
	3632	old = utf8_hop((U8)s, old) - (U8)s;
	3633	l -= old;
	3634	/* Get the added string: */
	3635	last_str = newSVpvn_utf8(s + old, l, UTF);
	3636	if (deltanext == 0 && pos_before == b) {
	3637	/* What was added is a constant string */
	3638	if (mincount > 1) {
	3639	SvGROW(last_str, (mincount * l) + 1);
	3640	repeatcpy(SvPVX(last_str) + l,
	3641	SvPVX_const(last_str), l, mincount - 1);
	3642	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	3643	/* Add additional parts. */
	3644	SvCUR_set(data->last_found,
	3645	SvCUR(data->last_found) - l);
	3646	sv_catsv(data->last_found, last_str);
	3647	{
	3648	SV * sv = data->last_found;
	3649	MAGIC *mg =
	3650	SvUTF8(sv) && SvMAGICAL(sv) ?
	3651	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3652	if (mg && mg->mg_len >= 0)
	3653	mg->mg_len += CHR_SVLEN(last_str) - l;
	3654	}
	3655	data->last_end += l * (mincount - 1);
	3656	}
	3657	} else {
	3658	/* start offset must point into the last copy */
	3659	data->last_start_min += minnext * (mincount - 1);
	3660	data->last_start_max += is_inf ? I32_MAX
	3661	: (maxcount - 1) * (minnext + data->pos_delta);
	3662	}
	3663	}
	3664	/* It is counted once already... */
	3665	data->pos_min += minnext * (mincount - counted);
	3666	data->pos_delta += - counted * deltanext +
	3667	(minnext + deltanext) * maxcount - minnext * mincount;
	3668	if (mincount != maxcount) {
	3669	/* Cannot extend fixed substrings found inside
	3670	the group. */
	3671	SCAN_COMMIT(pRExC_state,data,minlenp);
	3672	if (mincount && last_str) {
	3673	SV * const sv = data->last_found;
	3674	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3675	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3676
	3677	if (mg)
	3678	mg->mg_len = -1;
	3679	sv_setsv(sv, last_str);
	3680	data->last_end = data->pos_min;
	3681	data->last_start_min =
	3682	data->pos_min - CHR_SVLEN(last_str);
	3683	data->last_start_max = is_inf
	3684	? I32_MAX
	3685	: data->pos_min + data->pos_delta
	3686	- CHR_SVLEN(last_str);
	3687	}
	3688	data->longest = &(data->longest_float);
	3689	}
	3690	SvREFCNT_dec(last_str);
	3691	}
	3692	if (data && (fl & SF_HAS_EVAL))
	3693	data->flags \|= SF_HAS_EVAL;
	3694	optimize_curly_tail:
	3695	if (OP(oscan) != CURLYX) {
	3696	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	3697	&& NEXT_OFF(next))
	3698	NEXT_OFF(oscan) += NEXT_OFF(next);
	3699	}
	3700	continue;
	3701	default: /* REF, ANYOFV, and CLUMP only? */
	3702	if (flags & SCF_DO_SUBSTR) {
	3703	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3704	data->longest = &(data->longest_float);
	3705	}
	3706	is_inf = is_inf_internal = 1;
	3707	if (flags & SCF_DO_STCLASS_OR)
	3708	cl_anything(pRExC_state, data->start_class);
	3709	flags &= ~SCF_DO_STCLASS;
	3710	break;
	3711	}
	3712	}
	3713	else if (OP(scan) == LNBREAK) {
	3714	if (flags & SCF_DO_STCLASS) {
	3715	int value = 0;
	3716	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3717	if (flags & SCF_DO_STCLASS_AND) {
	3718	for (value = 0; value < 256; value++)
	3719	if (!is_VERTWS_cp(value))
	3720	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3721	}
	3722	else {
	3723	for (value = 0; value < 256; value++)
	3724	if (is_VERTWS_cp(value))
	3725	ANYOF_BITMAP_SET(data->start_class, value);
	3726	}
	3727	if (flags & SCF_DO_STCLASS_OR)
	3728	cl_and(data->start_class, and_withp);
	3729	flags &= ~SCF_DO_STCLASS;
	3730	}
	3731	min += 1;
	3732	delta += 1;
	3733	if (flags & SCF_DO_SUBSTR) {
	3734	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3735	data->pos_min += 1;
	3736	data->pos_delta += 1;
	3737	data->longest = &(data->longest_float);
	3738	}
	3739	}
	3740	else if (OP(scan) == FOLDCHAR) {
	3741	int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
	3742	flags &= ~SCF_DO_STCLASS;
	3743	min += 1;
	3744	delta += d;
	3745	if (flags & SCF_DO_SUBSTR) {
	3746	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	3747	data->pos_min += 1;
	3748	data->pos_delta += d;
	3749	data->longest = &(data->longest_float);
	3750	}
	3751	}
	3752	else if (REGNODE_SIMPLE(OP(scan))) {
	3753	int value = 0;
	3754
	3755	if (flags & SCF_DO_SUBSTR) {
	3756	SCAN_COMMIT(pRExC_state,data,minlenp);
	3757	data->pos_min++;
	3758	}
	3759	min++;
	3760	if (flags & SCF_DO_STCLASS) {
	3761	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	3762
	3763	/* Some of the logic below assumes that switching
	3764	locale on will only add false positives. */
	3765	switch (PL_regkind[OP(scan)]) {
	3766	case SANY:
	3767	default:
	3768	do_default:
	3769	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	3770	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3771	cl_anything(pRExC_state, data->start_class);
	3772	break;
	3773	case REG_ANY:
	3774	if (OP(scan) == SANY)
	3775	goto do_default;
	3776	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	3777	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	3778	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	3779	cl_anything(pRExC_state, data->start_class);
	3780	}
	3781	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	3782	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	3783	break;
	3784	case ANYOF:
	3785	if (flags & SCF_DO_STCLASS_AND)
	3786	cl_and(data->start_class,
	3787	(struct regnode_charclass_class*)scan);
	3788	else
	3789	cl_or(pRExC_state, data->start_class,
	3790	(struct regnode_charclass_class*)scan);
	3791	break;
	3792	case ALNUM:
	3793	if (flags & SCF_DO_STCLASS_AND) {
	3794	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3795	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	3796	if (OP(scan) == ALNUMU) {
	3797	for (value = 0; value < 256; value++) {
	3798	if (!isWORDCHAR_L1(value)) {
	3799	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3800	}
	3801	}
	3802	} else {
	3803	for (value = 0; value < 256; value++) {
	3804	if (!isALNUM(value)) {
	3805	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3806	}
	3807	}
	3808	}
	3809	}
	3810	}
	3811	else {
	3812	if (data->start_class->flags & ANYOF_LOCALE)
	3813	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	3814
	3815	/* Even if under locale, set the bits for non-locale
	3816	* in case it isn't a true locale-node. This will
	3817	* create false positives if it truly is locale */
	3818	if (OP(scan) == ALNUMU) {
	3819	for (value = 0; value < 256; value++) {
	3820	if (isWORDCHAR_L1(value)) {
	3821	ANYOF_BITMAP_SET(data->start_class, value);
	3822	}
	3823	}
	3824	} else {
	3825	for (value = 0; value < 256; value++) {
	3826	if (isALNUM(value)) {
	3827	ANYOF_BITMAP_SET(data->start_class, value);
	3828	}
	3829	}
	3830	}
	3831	}
	3832	break;
	3833	case NALNUM:
	3834	if (flags & SCF_DO_STCLASS_AND) {
	3835	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3836	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	3837	if (OP(scan) == NALNUMU) {
	3838	for (value = 0; value < 256; value++) {
	3839	if (isWORDCHAR_L1(value)) {
	3840	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3841	}
	3842	}
	3843	} else {
	3844	for (value = 0; value < 256; value++) {
	3845	if (isALNUM(value)) {
	3846	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3847	}
	3848	}
	3849	}
	3850	}
	3851	}
	3852	else {
	3853	if (data->start_class->flags & ANYOF_LOCALE)
	3854	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	3855
	3856	/* Even if under locale, set the bits for non-locale in
	3857	* case it isn't a true locale-node. This will create
	3858	* false positives if it truly is locale */
	3859	if (OP(scan) == NALNUMU) {
	3860	for (value = 0; value < 256; value++) {
	3861	if (! isWORDCHAR_L1(value)) {
	3862	ANYOF_BITMAP_SET(data->start_class, value);
	3863	}
	3864	}
	3865	} else {
	3866	for (value = 0; value < 256; value++) {
	3867	if (! isALNUM(value)) {
	3868	ANYOF_BITMAP_SET(data->start_class, value);
	3869	}
	3870	}
	3871	}
	3872	}
	3873	break;
	3874	case SPACE:
	3875	if (flags & SCF_DO_STCLASS_AND) {
	3876	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3877	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	3878	if (OP(scan) == SPACEU) {
	3879	for (value = 0; value < 256; value++) {
	3880	if (!isSPACE_L1(value)) {
	3881	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3882	}
	3883	}
	3884	} else {
	3885	for (value = 0; value < 256; value++) {
	3886	if (!isSPACE(value)) {
	3887	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3888	}
	3889	}
	3890	}
	3891	}
	3892	}
	3893	else {
	3894	if (data->start_class->flags & ANYOF_LOCALE) {
	3895	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	3896	}
	3897	if (OP(scan) == SPACEU) {
	3898	for (value = 0; value < 256; value++) {
	3899	if (isSPACE_L1(value)) {
	3900	ANYOF_BITMAP_SET(data->start_class, value);
	3901	}
	3902	}
	3903	} else {
	3904	for (value = 0; value < 256; value++) {
	3905	if (isSPACE(value)) {
	3906	ANYOF_BITMAP_SET(data->start_class, value);
	3907	}
	3908	}
	3909	}
	3910	}
	3911	break;
	3912	case NSPACE:
	3913	if (flags & SCF_DO_STCLASS_AND) {
	3914	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3915	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	3916	if (OP(scan) == NSPACEU) {
	3917	for (value = 0; value < 256; value++) {
	3918	if (isSPACE_L1(value)) {
	3919	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3920	}
	3921	}
	3922	} else {
	3923	for (value = 0; value < 256; value++) {
	3924	if (isSPACE(value)) {
	3925	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3926	}
	3927	}
	3928	}
	3929	}
	3930	}
	3931	else {
	3932	if (data->start_class->flags & ANYOF_LOCALE)
	3933	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	3934	if (OP(scan) == NSPACEU) {
	3935	for (value = 0; value < 256; value++) {
	3936	if (!isSPACE_L1(value)) {
	3937	ANYOF_BITMAP_SET(data->start_class, value);
	3938	}
	3939	}
	3940	}
	3941	else {
	3942	for (value = 0; value < 256; value++) {
	3943	if (!isSPACE(value)) {
	3944	ANYOF_BITMAP_SET(data->start_class, value);
	3945	}
	3946	}
	3947	}
	3948	}
	3949	break;
	3950	case DIGIT:
	3951	if (flags & SCF_DO_STCLASS_AND) {
	3952	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	3953	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	3954	for (value = 0; value < 256; value++)
	3955	if (!isDIGIT(value))
	3956	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3957	}
	3958	}
	3959	else {
	3960	if (data->start_class->flags & ANYOF_LOCALE)
	3961	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	3962	for (value = 0; value < 256; value++)
	3963	if (isDIGIT(value))
	3964	ANYOF_BITMAP_SET(data->start_class, value);
	3965	}
	3966	break;
	3967	case NDIGIT:
	3968	if (flags & SCF_DO_STCLASS_AND) {
	3969	if (!(data->start_class->flags & ANYOF_LOCALE))
	3970	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	3971	for (value = 0; value < 256; value++)
	3972	if (isDIGIT(value))
	3973	ANYOF_BITMAP_CLEAR(data->start_class, value);
	3974	}
	3975	else {
	3976	if (data->start_class->flags & ANYOF_LOCALE)
	3977	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	3978	for (value = 0; value < 256; value++)
	3979	if (!isDIGIT(value))
	3980	ANYOF_BITMAP_SET(data->start_class, value);
	3981	}
	3982	break;
	3983	CASE_SYNST_FNC(VERTWS);
	3984	CASE_SYNST_FNC(HORIZWS);
	3985
	3986	}
	3987	if (flags & SCF_DO_STCLASS_OR)
	3988	cl_and(data->start_class, and_withp);
	3989	flags &= ~SCF_DO_STCLASS;
	3990	}
	3991	}
	3992	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	3993	data->flags \|= (OP(scan) == MEOL
	3994	? SF_BEFORE_MEOL
	3995	: SF_BEFORE_SEOL);
	3996	}
	3997	else if ( PL_regkind[OP(scan)] == BRANCHJ
	3998	/* Lookbehind, or need to calculate parens/evals/stclass: */
	3999	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4000	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4001	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4002	\|\| OP(scan) == UNLESSM )
	4003	{
	4004	/* Negative Lookahead/lookbehind
	4005	In this case we can't do fixed string optimisation.
	4006	*/
	4007
	4008	I32 deltanext, minnext, fake = 0;
	4009	regnode *nscan;
	4010	struct regnode_charclass_class intrnl;
	4011	int f = 0;
	4012
	4013	data_fake.flags = 0;
	4014	if (data) {
	4015	data_fake.whilem_c = data->whilem_c;
	4016	data_fake.last_closep = data->last_closep;
	4017	}
	4018	else
	4019	data_fake.last_closep = &fake;
	4020	data_fake.pos_delta = delta;
	4021	if ( flags & SCF_DO_STCLASS && !scan->flags
	4022	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4023	cl_init(pRExC_state, &intrnl);
	4024	data_fake.start_class = &intrnl;
	4025	f \|= SCF_DO_STCLASS_AND;
	4026	}
	4027	if (flags & SCF_WHILEM_VISITED_POS)
	4028	f \|= SCF_WHILEM_VISITED_POS;
	4029	next = regnext(scan);
	4030	nscan = NEXTOPER(NEXTOPER(scan));
	4031	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4032	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4033	if (scan->flags) {
	4034	if (deltanext) {
	4035	FAIL("Variable length lookbehind not implemented");
	4036	}
	4037	else if (minnext > (I32)U8_MAX) {
	4038	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4039	}
	4040	scan->flags = (U8)minnext;
	4041	}
	4042	if (data) {
	4043	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4044	pars++;
	4045	if (data_fake.flags & SF_HAS_EVAL)
	4046	data->flags \|= SF_HAS_EVAL;
	4047	data->whilem_c = data_fake.whilem_c;
	4048	}
	4049	if (f & SCF_DO_STCLASS_AND) {
	4050	if (flags & SCF_DO_STCLASS_OR) {
	4051	/* OR before, AND after: ideally we would recurse with
	4052	* data_fake to get the AND applied by study of the
	4053	* remainder of the pattern, and then derecurse;
	4054	* * HACK * for now just treat as "no information".
	4055	* See [perl #56690].
	4056	*/
	4057	cl_init(pRExC_state, data->start_class);
	4058	} else {
	4059	/* AND before and after: combine and continue */
	4060	const int was = (data->start_class->flags & ANYOF_EOS);
	4061
	4062	cl_and(data->start_class, &intrnl);
	4063	if (was)
	4064	data->start_class->flags \|= ANYOF_EOS;
	4065	}
	4066	}
	4067	}
	4068	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4069	else {
	4070	/* Positive Lookahead/lookbehind
	4071	In this case we can do fixed string optimisation,
	4072	but we must be careful about it. Note in the case of
	4073	lookbehind the positions will be offset by the minimum
	4074	length of the pattern, something we won't know about
	4075	until after the recurse.
	4076	*/
	4077	I32 deltanext, fake = 0;
	4078	regnode *nscan;
	4079	struct regnode_charclass_class intrnl;
	4080	int f = 0;
	4081	/* We use SAVEFREEPV so that when the full compile
	4082	is finished perl will clean up the allocated
	4083	minlens when it's all done. This way we don't
	4084	have to worry about freeing them when we know
	4085	they wont be used, which would be a pain.
	4086	*/
	4087	I32 *minnextp;
	4088	Newx( minnextp, 1, I32 );
	4089	SAVEFREEPV(minnextp);
	4090
	4091	if (data) {
	4092	StructCopy(data, &data_fake, scan_data_t);
	4093	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4094	f \|= SCF_DO_SUBSTR;
	4095	if (scan->flags)
	4096	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4097	data_fake.last_found=newSVsv(data->last_found);
	4098	}
	4099	}
	4100	else
	4101	data_fake.last_closep = &fake;
	4102	data_fake.flags = 0;
	4103	data_fake.pos_delta = delta;
	4104	if (is_inf)
	4105	data_fake.flags \|= SF_IS_INF;
	4106	if ( flags & SCF_DO_STCLASS && !scan->flags
	4107	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4108	cl_init(pRExC_state, &intrnl);
	4109	data_fake.start_class = &intrnl;
	4110	f \|= SCF_DO_STCLASS_AND;
	4111	}
	4112	if (flags & SCF_WHILEM_VISITED_POS)
	4113	f \|= SCF_WHILEM_VISITED_POS;
	4114	next = regnext(scan);
	4115	nscan = NEXTOPER(NEXTOPER(scan));
	4116
	4117	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4118	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4119	if (scan->flags) {
	4120	if (deltanext) {
	4121	FAIL("Variable length lookbehind not implemented");
	4122	}
	4123	else if (*minnextp > (I32)U8_MAX) {
	4124	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4125	}
	4126	scan->flags = (U8)*minnextp;
	4127	}
	4128
	4129	*minnextp += min;
	4130
	4131	if (f & SCF_DO_STCLASS_AND) {
	4132	const int was = (data->start_class->flags & ANYOF_EOS);
	4133
	4134	cl_and(data->start_class, &intrnl);
	4135	if (was)
	4136	data->start_class->flags \|= ANYOF_EOS;
	4137	}
	4138	if (data) {
	4139	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4140	pars++;
	4141	if (data_fake.flags & SF_HAS_EVAL)
	4142	data->flags \|= SF_HAS_EVAL;
	4143	data->whilem_c = data_fake.whilem_c;
	4144	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4145	if (RExC_rx->minlen<*minnextp)
	4146	RExC_rx->minlen=*minnextp;
	4147	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4148	SvREFCNT_dec(data_fake.last_found);
	4149
	4150	if ( data_fake.minlen_fixed != minlenp )
	4151	{
	4152	data->offset_fixed= data_fake.offset_fixed;
	4153	data->minlen_fixed= data_fake.minlen_fixed;
	4154	data->lookbehind_fixed+= scan->flags;
	4155	}
	4156	if ( data_fake.minlen_float != minlenp )
	4157	{
	4158	data->minlen_float= data_fake.minlen_float;
	4159	data->offset_float_min=data_fake.offset_float_min;
	4160	data->offset_float_max=data_fake.offset_float_max;
	4161	data->lookbehind_float+= scan->flags;
	4162	}
	4163	}
	4164	}
	4165
	4166
	4167	}
	4168	#endif
	4169	}
	4170	else if (OP(scan) == OPEN) {
	4171	if (stopparen != (I32)ARG(scan))
	4172	pars++;
	4173	}
	4174	else if (OP(scan) == CLOSE) {
	4175	if (stopparen == (I32)ARG(scan)) {
	4176	break;
	4177	}
	4178	if ((I32)ARG(scan) == is_par) {
	4179	next = regnext(scan);
	4180
	4181	if ( next && (OP(next) != WHILEM) && next < last)
	4182	is_par = 0; /* Disable optimization */
	4183	}
	4184	if (data)
	4185	*(data->last_closep) = ARG(scan);
	4186	}
	4187	else if (OP(scan) == EVAL) {
	4188	if (data)
	4189	data->flags \|= SF_HAS_EVAL;
	4190	}
	4191	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4192	if (flags & SCF_DO_SUBSTR) {
	4193	SCAN_COMMIT(pRExC_state,data,minlenp);
	4194	flags &= ~SCF_DO_SUBSTR;
	4195	}
	4196	if (data && OP(scan)==ACCEPT) {
	4197	data->flags \|= SCF_SEEN_ACCEPT;
	4198	if (stopmin > min)
	4199	stopmin = min;
	4200	}
	4201	}
	4202	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4203	{
	4204	if (flags & SCF_DO_SUBSTR) {
	4205	SCAN_COMMIT(pRExC_state,data,minlenp);
	4206	data->longest = &(data->longest_float);
	4207	}
	4208	is_inf = is_inf_internal = 1;
	4209	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4210	cl_anything(pRExC_state, data->start_class);
	4211	flags &= ~SCF_DO_STCLASS;
	4212	}
	4213	else if (OP(scan) == GPOS) {
	4214	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4215	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4216	{
	4217	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4218	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4219	if (RExC_rx->gofs < (U32)min)
	4220	RExC_rx->gofs = min;
	4221	} else {
	4222	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4223	RExC_rx->gofs = 0;
	4224	}
	4225	}
	4226	#ifdef TRIE_STUDY_OPT
	4227	#ifdef FULL_TRIE_STUDY
	4228	else if (PL_regkind[OP(scan)] == TRIE) {
	4229	/* NOTE - There is similar code to this block above for handling
	4230	BRANCH nodes on the initial study. If you change stuff here
	4231	check there too. */
	4232	regnode *trie_node= scan;
	4233	regnode *tail= regnext(scan);
	4234	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4235	I32 max1 = 0, min1 = I32_MAX;
	4236	struct regnode_charclass_class accum;
	4237
	4238	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4239	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4240	if (flags & SCF_DO_STCLASS)
	4241	cl_init_zero(pRExC_state, &accum);
	4242
	4243	if (!trie->jump) {
	4244	min1= trie->minlen;
	4245	max1= trie->maxlen;
	4246	} else {
	4247	const regnode *nextbranch= NULL;
	4248	U32 word;
	4249
	4250	for ( word=1 ; word <= trie->wordcount ; word++)
	4251	{
	4252	I32 deltanext=0, minnext=0, f = 0, fake;
	4253	struct regnode_charclass_class this_class;
	4254
	4255	data_fake.flags = 0;
	4256	if (data) {
	4257	data_fake.whilem_c = data->whilem_c;
	4258	data_fake.last_closep = data->last_closep;
	4259	}
	4260	else
	4261	data_fake.last_closep = &fake;
	4262	data_fake.pos_delta = delta;
	4263	if (flags & SCF_DO_STCLASS) {
	4264	cl_init(pRExC_state, &this_class);
	4265	data_fake.start_class = &this_class;
	4266	f = SCF_DO_STCLASS_AND;
	4267	}
	4268	if (flags & SCF_WHILEM_VISITED_POS)
	4269	f \|= SCF_WHILEM_VISITED_POS;
	4270
	4271	if (trie->jump[word]) {
	4272	if (!nextbranch)
	4273	nextbranch = trie_node + trie->jump[0];
	4274	scan= trie_node + trie->jump[word];
	4275	/* We go from the jump point to the branch that follows
	4276	it. Note this means we need the vestigal unused branches
	4277	even though they arent otherwise used.
	4278	*/
	4279	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4280	&deltanext, (regnode *)nextbranch, &data_fake,
	4281	stopparen, recursed, NULL, f,depth+1);
	4282	}
	4283	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4284	nextbranch= regnext((regnode*)nextbranch);
	4285
	4286	if (min1 > (I32)(minnext + trie->minlen))
	4287	min1 = minnext + trie->minlen;
	4288	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4289	max1 = minnext + deltanext + trie->maxlen;
	4290	if (deltanext == I32_MAX)
	4291	is_inf = is_inf_internal = 1;
	4292
	4293	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4294	pars++;
	4295	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4296	if ( stopmin > min + min1)
	4297	stopmin = min + min1;
	4298	flags &= ~SCF_DO_SUBSTR;
	4299	if (data)
	4300	data->flags \|= SCF_SEEN_ACCEPT;
	4301	}
	4302	if (data) {
	4303	if (data_fake.flags & SF_HAS_EVAL)
	4304	data->flags \|= SF_HAS_EVAL;
	4305	data->whilem_c = data_fake.whilem_c;
	4306	}
	4307	if (flags & SCF_DO_STCLASS)
	4308	cl_or(pRExC_state, &accum, &this_class);
	4309	}
	4310	}
	4311	if (flags & SCF_DO_SUBSTR) {
	4312	data->pos_min += min1;
	4313	data->pos_delta += max1 - min1;
	4314	if (max1 != min1 \|\| is_inf)
	4315	data->longest = &(data->longest_float);
	4316	}
	4317	min += min1;
	4318	delta += max1 - min1;
	4319	if (flags & SCF_DO_STCLASS_OR) {
	4320	cl_or(pRExC_state, data->start_class, &accum);
	4321	if (min1) {
	4322	cl_and(data->start_class, and_withp);
	4323	flags &= ~SCF_DO_STCLASS;
	4324	}
	4325	}
	4326	else if (flags & SCF_DO_STCLASS_AND) {
	4327	if (min1) {
	4328	cl_and(data->start_class, &accum);
	4329	flags &= ~SCF_DO_STCLASS;
	4330	}
	4331	else {
	4332	/* Switch to OR mode: cache the old value of
	4333	* data->start_class */
	4334	INIT_AND_WITHP;
	4335	StructCopy(data->start_class, and_withp,
	4336	struct regnode_charclass_class);
	4337	flags &= ~SCF_DO_STCLASS_AND;
	4338	StructCopy(&accum, data->start_class,
	4339	struct regnode_charclass_class);
	4340	flags \|= SCF_DO_STCLASS_OR;
	4341	data->start_class->flags \|= ANYOF_EOS;
	4342	}
	4343	}
	4344	scan= tail;
	4345	continue;
	4346	}
	4347	#else
	4348	else if (PL_regkind[OP(scan)] == TRIE) {
	4349	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4350	U8*bang=NULL;
	4351
	4352	min += trie->minlen;
	4353	delta += (trie->maxlen - trie->minlen);
	4354	flags &= ~SCF_DO_STCLASS; /* xxx */
	4355	if (flags & SCF_DO_SUBSTR) {
	4356	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4357	data->pos_min += trie->minlen;
	4358	data->pos_delta += (trie->maxlen - trie->minlen);
	4359	if (trie->maxlen != trie->minlen)
	4360	data->longest = &(data->longest_float);
	4361	}
	4362	if (trie->jump) /* no more substrings -- for now /grr*/
	4363	flags &= ~SCF_DO_SUBSTR;
	4364	}
	4365	#endif /* old or new */
	4366	#endif /* TRIE_STUDY_OPT */
	4367
	4368	/* Else: zero-length, ignore. */
	4369	scan = regnext(scan);
	4370	}
	4371	if (frame) {
	4372	last = frame->last;
	4373	scan = frame->next;
	4374	stopparen = frame->stop;
	4375	frame = frame->prev;
	4376	goto fake_study_recurse;
	4377	}
	4378
	4379	finish:
	4380	assert(!frame);
	4381	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4382
	4383	*scanp = scan;
	4384	*deltap = is_inf_internal ? I32_MAX : delta;
	4385	if (flags & SCF_DO_SUBSTR && is_inf)
	4386	data->pos_delta = I32_MAX - data->pos_min;
	4387	if (is_par > (I32)U8_MAX)
	4388	is_par = 0;
	4389	if (is_par && pars==1 && data) {
	4390	data->flags \|= SF_IN_PAR;
	4391	data->flags &= ~SF_HAS_PAR;
	4392	}
	4393	else if (pars && data) {
	4394	data->flags \|= SF_HAS_PAR;
	4395	data->flags &= ~SF_IN_PAR;
	4396	}
	4397	if (flags & SCF_DO_STCLASS_OR)
	4398	cl_and(data->start_class, and_withp);
	4399	if (flags & SCF_TRIE_RESTUDY)
	4400	data->flags \|= SCF_TRIE_RESTUDY;
	4401
	4402	DEBUG_STUDYDATA("post-fin:",data,depth);
	4403
	4404	return min < stopmin ? min : stopmin;
	4405	}
	4406
	4407	STATIC U32
	4408	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4409	{
	4410	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4411
	4412	PERL_ARGS_ASSERT_ADD_DATA;
	4413
	4414	Renewc(RExC_rxi->data,
	4415	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4416	char, struct reg_data);
	4417	if(count)
	4418	Renew(RExC_rxi->data->what, count + n, U8);
	4419	else
	4420	Newx(RExC_rxi->data->what, n, U8);
	4421	RExC_rxi->data->count = count + n;
	4422	Copy(s, RExC_rxi->data->what + count, n, U8);
	4423	return count;
	4424	}
	4425
/*XXX: todo make this not included in a non debugging perl */
	4427	#ifndef PERL_IN_XSUB_RE
	4428	void
	4429	Perl_reginitcolors(pTHX)
	4430	{
	4431	dVAR;
	4432	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4433	if (s) {
	4434	char *t = savepv(s);
	4435	int i = 0;
	4436	PL_colors[0] = t;
	4437	while (++i < 6) {
	4438	t = strchr(t, '\t');
	4439	if (t) {
	4440	*t = '\0';
	4441	PL_colors[i] = ++t;
	4442	}
	4443	else
	4444	PL_colors[i] = t = (char *)"";
	4445	}
	4446	} else {
	4447	int i = 0;
	4448	while (i < 6)
	4449	PL_colors[i++] = (char *)"";
	4450	}
	4451	PL_colorset = 1;
	4452	}
	4453	#endif
	4454
	4455
	4456	#ifdef TRIE_STUDY_OPT
	4457	#define CHECK_RESTUDY_GOTO \
	4458	if ( \
	4459	(data.flags & SCF_TRIE_RESTUDY) \
	4460	&& ! restudied++ \
	4461	) goto reStudy
	4462	#else
	4463	#define CHECK_RESTUDY_GOTO
	4464	#endif
	4465
	4466	/*
	4467	- pregcomp - compile a regular expression into internal code
	4468	*
	4469	* We can't allocate space until we know how big the compiled form will be,
	4470	* but we can't compile it (and thus know how big it is) until we've got a
	4471	* place to put the code. So we cheat: we compile it twice, once with code
	4472	* generation turned off and size counting turned on, and once "for real".
	4473	* This also means that we don't allocate space until we are sure that the
	4474	* thing really will compile successfully, and we never have to move the
	4475	* code and thus invalidate pointers into it. (Note that it has to be in
	4476	* one piece because free() must be able to free it all.) [NB: not true in perl]
	4477	*
	4478	* Beware that the optimization-preparation code in here knows about some
	4479	* of the structure of the compiled regexp. [I'll say.]
	4480	*/
	4481
	4482
	4483
	4484	#ifndef PERL_IN_XSUB_RE
	4485	#define RE_ENGINE_PTR &PL_core_reg_engine
	4486	#else
	4487	extern const struct regexp_engine my_reg_engine;
	4488	#define RE_ENGINE_PTR &my_reg_engine
	4489	#endif
	4490
	4491	#ifndef PERL_IN_XSUB_RE
	4492	REGEXP *
	4493	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4494	{
	4495	dVAR;
	4496	HV * const table = GvHV(PL_hintgv);
	4497
	4498	PERL_ARGS_ASSERT_PREGCOMP;
	4499
	4500	/* Dispatch a request to compile a regexp to correct
	4501	regexp engine. */
	4502	if (table) {
	4503	SV **ptr= hv_fetchs(table, "regcomp", FALSE);
	4504	GET_RE_DEBUG_FLAGS_DECL;
	4505	if (ptr && SvIOK(ptr) && SvIV(ptr)) {
	4506	const regexp_engine eng=INT2PTR(regexp_engine,SvIV(*ptr));
	4507	DEBUG_COMPILE_r({
	4508	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4509	SvIV(*ptr));
	4510	});
	4511	return CALLREGCOMP_ENG(eng, pattern, flags);
	4512	}
	4513	}
	4514	return Perl_re_compile(aTHX_ pattern, flags);
	4515	}
	4516	#endif
	4517
	4518	REGEXP *
	4519	Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
	4520	{
	4521	dVAR;
	4522	REGEXP *rx;
	4523	struct regexp *r;
	4524	register regexp_internal *ri;
	4525	STRLEN plen;
	4526	char *exp;
	4527	char* xend;
	4528	regnode *scan;
	4529	I32 flags;
	4530	I32 minlen = 0;
	4531	U32 pm_flags;
	4532
	4533	/* these are all flags - maybe they should be turned
	4534	* into a single int with different bit masks */
	4535	I32 sawlookahead = 0;
	4536	I32 sawplus = 0;
	4537	I32 sawopen = 0;
	4538	bool used_setjump = FALSE;
	4539	regex_charset initial_charset = get_regex_charset(orig_pm_flags);
	4540
	4541	U8 jump_ret = 0;
	4542	dJMPENV;
	4543	scan_data_t data;
	4544	RExC_state_t RExC_state;
	4545	RExC_state_t * const pRExC_state = &RExC_state;
	4546	#ifdef TRIE_STUDY_OPT
	4547	int restudied;
	4548	RExC_state_t copyRExC_state;
	4549	#endif
	4550	GET_RE_DEBUG_FLAGS_DECL;
	4551
	4552	PERL_ARGS_ASSERT_RE_COMPILE;
	4553
	4554	DEBUG_r(if (!PL_colorset) reginitcolors());
	4555
	4556	RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
	4557	RExC_uni_semantics = 0;
	4558	RExC_contains_locale = 0;
	4559
	4560	/**************** LONG JUMP TARGET HERE*********************/
	4561	/* Longjmp back to here if have to switch in midstream to utf8 */
	4562	if (! RExC_orig_utf8) {
	4563	JMPENV_PUSH(jump_ret);
	4564	used_setjump = TRUE;
	4565	}
	4566
	4567	if (jump_ret == 0) { /* First time through */
	4568	exp = SvPV(pattern, plen);
	4569	xend = exp + plen;
	4570	/* ignore the utf8ness if the pattern is 0 length */
	4571	if (plen == 0) {
	4572	RExC_utf8 = RExC_orig_utf8 = 0;
	4573	}
	4574
	4575	DEBUG_COMPILE_r({
	4576	SV *dsv= sv_newmortal();
	4577	RE_PV_QUOTED_DECL(s, RExC_utf8,
	4578	dsv, exp, plen, 60);
	4579	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	4580	PL_colors[4],PL_colors[5],s);
	4581	});
	4582	}
	4583	else { /* longjumped back */
	4584	STRLEN len = plen;
	4585
	4586	/* If the cause for the longjmp was other than changing to utf8, pop
	4587	* our own setjmp, and longjmp to the correct handler */
	4588	if (jump_ret != UTF8_LONGJMP) {
	4589	JMPENV_POP;
	4590	JMPENV_JUMP(jump_ret);
	4591	}
	4592
	4593	GET_RE_DEBUG_FLAGS;
	4594
	4595	/* It's possible to write a regexp in ascii that represents Unicode
	4596	codepoints outside of the byte range, such as via \x{100}. If we
	4597	detect such a sequence we have to convert the entire pattern to utf8
	4598	and then recompile, as our sizing calculation will have been based
	4599	on 1 byte == 1 character, but we will need to use utf8 to encode
	4600	at least some part of the pattern, and therefore must convert the whole
	4601	thing.
	4602	-- dmq */
	4603	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4604	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4605	exp = (char)Perl_bytes_to_utf8(aTHX_ (U8)SvPV(pattern, plen), &len);
	4606	xend = exp + len;
	4607	RExC_orig_utf8 = RExC_utf8 = 1;
	4608	SAVEFREEPV(exp);
	4609	}
	4610
	4611	#ifdef TRIE_STUDY_OPT
	4612	restudied = 0;
	4613	#endif
	4614
	4615	pm_flags = orig_pm_flags;
	4616
	4617	if (initial_charset == REGEX_LOCALE_CHARSET) {
	4618	RExC_contains_locale = 1;
	4619	}
	4620	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	4621
	4622	/* Set to use unicode semantics if the pattern is in utf8 and has the
	4623	* 'depends' charset specified, as it means unicode when utf8 */
	4624	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4625	}
	4626
	4627	RExC_precomp = exp;
	4628	RExC_flags = pm_flags;
	4629	RExC_sawback = 0;
	4630
	4631	RExC_seen = 0;
	4632	RExC_in_lookbehind = 0;
	4633	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	4634	RExC_seen_evals = 0;
	4635	RExC_extralen = 0;
	4636	RExC_override_recoding = 0;
	4637
	4638	/* First pass: determine size, legality. */
	4639	RExC_parse = exp;
	4640	RExC_start = exp;
	4641	RExC_end = xend;
	4642	RExC_naughty = 0;
	4643	RExC_npar = 1;
	4644	RExC_nestroot = 0;
	4645	RExC_size = 0L;
	4646	RExC_emit = &PL_regdummy;
	4647	RExC_whilem_seen = 0;
	4648	RExC_open_parens = NULL;
	4649	RExC_close_parens = NULL;
	4650	RExC_opend = NULL;
	4651	RExC_paren_names = NULL;
	4652	#ifdef DEBUGGING
	4653	RExC_paren_name_list = NULL;
	4654	#endif
	4655	RExC_recurse = NULL;
	4656	RExC_recurse_count = 0;
	4657
	4658	#if 0 /* REGC() is (currently) a NOP at the first pass.
	4659	* Clever compilers notice this and complain. --jhi */
	4660	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	4661	#endif
	4662	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
	4663	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4664	RExC_precomp = NULL;
	4665	return(NULL);
	4666	}
	4667
	4668	/* Here, finished first pass. Get rid of any added setjmp */
	4669	if (used_setjump) {
	4670	JMPENV_POP;
	4671	}
	4672
	4673	DEBUG_PARSE_r({
	4674	PerlIO_printf(Perl_debug_log,
	4675	"Required size %"IVdf" nodes\n"
	4676	"Starting second pass (creation)\n",
	4677	(IV)RExC_size);
	4678	RExC_lastnum=0;
	4679	RExC_lastparse=NULL;
	4680	});
	4681
	4682	/* The first pass could have found things that force Unicode semantics */
	4683	if ((RExC_utf8 \|\| RExC_uni_semantics)
	4684	&& get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
	4685	{
	4686	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
	4687	}
	4688
	4689	/* Small enough for pointer-storage convention?
	4690	If extralen==0, this means that we will not need long jumps. */
	4691	if (RExC_size >= 0x10000L && RExC_extralen)
	4692	RExC_size += RExC_extralen;
	4693	else
	4694	RExC_extralen = 0;
	4695	if (RExC_whilem_seen > 15)
	4696	RExC_whilem_seen = 15;
	4697
	4698	/* Allocate space and zero-initialize. Note, the two step process
	4699	of zeroing when in debug mode, thus anything assigned has to
	4700	happen after that */
	4701	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	4702	r = (struct regexp*)SvANY(rx);
	4703	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	4704	char, regexp_internal);
	4705	if ( r == NULL \|\| ri == NULL )
	4706	FAIL("Regexp out of space");
	4707	#ifdef DEBUGGING
	4708	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	4709	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	4710	#else
	4711	/* bulk initialize base fields with 0. */
	4712	Zero(ri, sizeof(regexp_internal), char);
	4713	#endif
	4714
	4715	/* non-zero initialization begins here */
	4716	RXi_SET( r, ri );
	4717	r->engine= RE_ENGINE_PTR;
	4718	r->extflags = pm_flags;
	4719	{
	4720	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	4721	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	4722
	4723	/* The caret is output if there are any defaults: if not all the STD
	4724	* flags are set, or if no character set specifier is needed */
	4725	bool has_default =
	4726	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	4727	\|\| ! has_charset);
	4728	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	4729	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	4730	>> RXf_PMf_STD_PMMOD_SHIFT);
	4731	const char fptr = STD_PAT_MODS; /"msix"*/
	4732	char *p;
	4733	/* Allocate for the worst case, which is all the std flags are turned
	4734	* on. If more precision is desired, we could do a population count of
	4735	* the flags set. This could be done with a small lookup table, or by
	4736	* shifting, masking and adding, or even, when available, assembly
	4737	* language for a machine-language population count.
	4738	* We never output a minus, as all those are defaults, so are
	4739	* covered by the caret */
	4740	const STRLEN wraplen = plen + has_p + has_runon
	4741	+ has_default /* If needs a caret */
	4742
	4743	/* If needs a character set specifier */
	4744	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	4745	+ (sizeof(STD_PAT_MODS) - 1)
	4746	+ (sizeof("(?:)") - 1);
	4747
	4748	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	4749	SvPOK_on(rx);
	4750	SvFLAGS(rx) \|= SvUTF8(pattern);
	4751	p++='('; p++='?';
	4752
	4753	/* If a default, cover it using the caret */
	4754	if (has_default) {
	4755	*p++= DEFAULT_PAT_MOD;
	4756	}
	4757	if (has_charset) {
	4758	STRLEN len;
	4759	const char* const name = get_regex_charset_name(r->extflags, &len);
	4760	Copy(name, p, len, char);
	4761	p += len;
	4762	}
	4763	if (has_p)
	4764	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	4765	{
	4766	char ch;
	4767	while((ch = *fptr++)) {
	4768	if(reganch & 1)
	4769	*p++ = ch;
	4770	reganch >>= 1;
	4771	}
	4772	}
	4773
	4774	*p++ = ':';
	4775	Copy(RExC_precomp, p, plen, char);
	4776	assert ((RX_WRAPPED(rx) - p) < 16);
	4777	r->pre_prefix = p - RX_WRAPPED(rx);
	4778	p += plen;
	4779	if (has_runon)
	4780	*p++ = '\n';
	4781	*p++ = ')';
	4782	*p = 0;
	4783	SvCUR_set(rx, p - SvPVX_const(rx));
	4784	}
	4785
	4786	r->intflags = 0;
	4787	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	4788
	4789	if (RExC_seen & REG_SEEN_RECURSE) {
	4790	Newxz(RExC_open_parens, RExC_npar,regnode *);
	4791	SAVEFREEPV(RExC_open_parens);
	4792	Newxz(RExC_close_parens,RExC_npar,regnode *);
	4793	SAVEFREEPV(RExC_close_parens);
	4794	}
	4795
	4796	/* Useful during FAIL. */
	4797	#ifdef RE_TRACK_PATTERN_OFFSETS
	4798	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	4799	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	4800	"%s %"UVuf" bytes for offset annotations.\n",
	4801	ri->u.offsets ? "Got" : "Couldn't get",
	4802	(UV)((2RExC_size+1) sizeof(U32))));
	4803	#endif
	4804	SetProgLen(ri,RExC_size);
	4805	RExC_rx_sv = rx;
	4806	RExC_rx = r;
	4807	RExC_rxi = ri;
	4808
	4809	/* Second pass: emit code. */
	4810	RExC_flags = pm_flags; /* don't let top level (?i) bleed */
	4811	RExC_parse = exp;
	4812	RExC_end = xend;
	4813	RExC_naughty = 0;
	4814	RExC_npar = 1;
	4815	RExC_emit_start = ri->program;
	4816	RExC_emit = ri->program;
	4817	RExC_emit_bound = ri->program + RExC_size + 1;
	4818
	4819	/* Store the count of eval-groups for security checks: */
	4820	RExC_rx->seen_evals = RExC_seen_evals;
	4821	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	4822	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	4823	ReREFCNT_dec(rx);
	4824	return(NULL);
	4825	}
	4826	/* XXXX To minimize changes to RE engine we always allocate
	4827	3-units-long substrs field. */
	4828	Newx(r->substrs, 1, struct reg_substr_data);
	4829	if (RExC_recurse_count) {
	4830	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	4831	SAVEFREEPV(RExC_recurse);
	4832	}
	4833
	4834	reStudy:
	4835	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	4836	Zero(r->substrs, 1, struct reg_substr_data);
	4837
	4838	#ifdef TRIE_STUDY_OPT
	4839	if (!restudied) {
	4840	StructCopy(&zero_scan_data, &data, scan_data_t);
	4841	copyRExC_state = RExC_state;
	4842	} else {
	4843	U32 seen=RExC_seen;
	4844	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	4845
	4846	RExC_state = copyRExC_state;
	4847	if (seen & REG_TOP_LEVEL_BRANCHES)
	4848	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	4849	else
	4850	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	4851	if (data.last_found) {
	4852	SvREFCNT_dec(data.longest_fixed);
	4853	SvREFCNT_dec(data.longest_float);
	4854	SvREFCNT_dec(data.last_found);
	4855	}
	4856	StructCopy(&zero_scan_data, &data, scan_data_t);
	4857	}
	4858	#else
	4859	StructCopy(&zero_scan_data, &data, scan_data_t);
	4860	#endif
	4861
	4862	/* Dig out information for optimizations. */
	4863	r->extflags = RExC_flags; /* was pm_op */
	4864	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	4865
	4866	if (UTF)
	4867	SvUTF8_on(rx); /* Unicode in it? */
	4868	ri->regstclass = NULL;
	4869	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	4870	r->intflags \|= PREGf_NAUGHTY;
	4871	scan = ri->program + 1; /* First BRANCH. */
	4872
	4873	/* testing for BRANCH here tells us whether there is "must appear"
	4874	data in the pattern. If there is then we can use it for optimisations */
	4875	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	4876	I32 fake;
	4877	STRLEN longest_float_length, longest_fixed_length;
	4878	struct regnode_charclass_class ch_class; /* pointed to by data */
	4879	int stclass_flag;
	4880	I32 last_close = 0; /* pointed to by data */
	4881	regnode *first= scan;
	4882	regnode *first_next= regnext(first);
	4883	/*
	4884	* Skip introductions and multiplicators >= 1
	4885	* so that we can extract the 'meat' of the pattern that must
	4886	* match in the large if() sequence following.
	4887	* NOTE that EXACT is NOT covered here, as it is normally
	4888	* picked up by the optimiser separately.
	4889	*
	4890	* This is unfortunate as the optimiser isnt handling lookahead
	4891	* properly currently.
	4892	*
	4893	*/
	4894	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	4895	/* An OR of one alternative - should not happen now. */
	4896	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	4897	/* for now we can't handle lookbehind IFMATCH*/
	4898	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	4899	(OP(first) == PLUS) \|\|
	4900	(OP(first) == MINMOD) \|\|
	4901	/* An {n,m} with n>0 */
	4902	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	4903	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	4904	{
	4905	/*
	4906	* the only op that could be a regnode is PLUS, all the rest
	4907	* will be regnode_1 or regnode_2.
	4908	*
	4909	*/
	4910	if (OP(first) == PLUS)
	4911	sawplus = 1;
	4912	else
	4913	first += regarglen[OP(first)];
	4914
	4915	first = NEXTOPER(first);
	4916	first_next= regnext(first);
	4917	}
	4918
	4919	/* Starting-point info. */
	4920	again:
	4921	DEBUG_PEEP("first:",first,0);
	4922	/* Ignore EXACT as we deal with it later. */
	4923	if (PL_regkind[OP(first)] == EXACT) {
	4924	if (OP(first) == EXACT)
	4925	NOOP; /* Empty, get anchored substr later. */
	4926	else
	4927	ri->regstclass = first;
	4928	}
	4929	#ifdef TRIE_STCLASS
	4930	else if (PL_regkind[OP(first)] == TRIE &&
	4931	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	4932	{
	4933	regnode *trie_op;
	4934	/* this can happen only on restudy */
	4935	if ( OP(first) == TRIE ) {
	4936	struct regnode_1 trieop = (struct regnode_1 )
	4937	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	4938	StructCopy(first,trieop,struct regnode_1);
	4939	trie_op=(regnode *)trieop;
	4940	} else {
	4941	struct regnode_charclass trieop = (struct regnode_charclass )
	4942	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	4943	StructCopy(first,trieop,struct regnode_charclass);
	4944	trie_op=(regnode *)trieop;
	4945	}
	4946	OP(trie_op)+=2;
	4947	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	4948	ri->regstclass = trie_op;
	4949	}
	4950	#endif
	4951	else if (REGNODE_SIMPLE(OP(first)))
	4952	ri->regstclass = first;
	4953	else if (PL_regkind[OP(first)] == BOUND \|\|
	4954	PL_regkind[OP(first)] == NBOUND)
	4955	ri->regstclass = first;
	4956	else if (PL_regkind[OP(first)] == BOL) {
	4957	r->extflags \|= (OP(first) == MBOL
	4958	? RXf_ANCH_MBOL
	4959	: (OP(first) == SBOL
	4960	? RXf_ANCH_SBOL
	4961	: RXf_ANCH_BOL));
	4962	first = NEXTOPER(first);
	4963	goto again;
	4964	}
	4965	else if (OP(first) == GPOS) {
	4966	r->extflags \|= RXf_ANCH_GPOS;
	4967	first = NEXTOPER(first);
	4968	goto again;
	4969	}
	4970	else if ((!sawopen \|\| !RExC_sawback) &&
	4971	(OP(first) == STAR &&
	4972	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	4973	!(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
	4974	{
	4975	/* turn .* into ^.* with an implied $=1 /
	4976	const int type =
	4977	(OP(NEXTOPER(first)) == REG_ANY)
	4978	? RXf_ANCH_MBOL
	4979	: RXf_ANCH_SBOL;
	4980	r->extflags \|= type;
	4981	r->intflags \|= PREGf_IMPLICIT;
	4982	first = NEXTOPER(first);
	4983	goto again;
	4984	}
	4985	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	4986	&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
	4987	/* x+ must match at the 1st pos of run of x's */
	4988	r->intflags \|= PREGf_SKIP;
	4989
	4990	/* Scan is after the zeroth branch, first is atomic matcher. */
	4991	#ifdef TRIE_STUDY_OPT
	4992	DEBUG_PARSE_r(
	4993	if (!restudied)
	4994	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	4995	(IV)(first - scan + 1))
	4996	);
	4997	#else
	4998	DEBUG_PARSE_r(
	4999	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	5000	(IV)(first - scan + 1))
	5001	);
	5002	#endif
	5003
	5004
	5005	/*
	5006	* If there's something expensive in the r.e., find the
	5007	* longest literal string that must appear and make it the
	5008	* regmust. Resolve ties in favor of later strings, since
	5009	* the regstart check works with the beginning of the r.e.
	5010	* and avoiding duplication strengthens checking. Not a
	5011	* strong reason, but sufficient in the absence of others.
	5012	* [Now we resolve ties in favor of the earlier string if
	5013	* it happens that c_offset_min has been invalidated, since the
	5014	* earlier string may buy us something the later one won't.]
	5015	*/
	5016
	5017	data.longest_fixed = newSVpvs("");
	5018	data.longest_float = newSVpvs("");
	5019	data.last_found = newSVpvs("");
	5020	data.longest = &(data.longest_fixed);
	5021	first = scan;
	5022	if (!ri->regstclass) {
	5023	cl_init(pRExC_state, &ch_class);
	5024	data.start_class = &ch_class;
	5025	stclass_flag = SCF_DO_STCLASS_AND;
	5026	} else /* XXXX Check for BOUND? */
	5027	stclass_flag = 0;
	5028	data.last_closep = &last_close;
	5029
	5030	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	5031	&data, -1, NULL, NULL,
	5032	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	5033
	5034
	5035	CHECK_RESTUDY_GOTO;
	5036
	5037
	5038	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	5039	&& data.last_start_min == 0 && data.last_end > 0
	5040	&& !RExC_seen_zerolen
	5041	&& !(RExC_seen & REG_SEEN_VERBARG)
	5042	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	5043	r->extflags \|= RXf_CHECK_ALL;
	5044	scan_commit(pRExC_state, &data,&minlen,0);
	5045	SvREFCNT_dec(data.last_found);
	5046
	5047	/* Note that code very similar to this but for anchored string
	5048	follows immediately below, changes may need to be made to both.
	5049	Be careful.
	5050	*/
	5051	longest_float_length = CHR_SVLEN(data.longest_float);
	5052	if (longest_float_length
	5053	\|\| (data.flags & SF_FL_BEFORE_EOL
	5054	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5055	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5056	{
	5057	I32 t,ml;
	5058
	5059	if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	5060	&& data.offset_fixed == data.offset_float_min
	5061	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
	5062	goto remove_float; /* As in (a)+. */
	5063
	5064	/* copy the information about the longest float from the reg_scan_data
	5065	over to the program. */
	5066	if (SvUTF8(data.longest_float)) {
	5067	r->float_utf8 = data.longest_float;
	5068	r->float_substr = NULL;
	5069	} else {
	5070	r->float_substr = data.longest_float;
	5071	r->float_utf8 = NULL;
	5072	}
	5073	/* float_end_shift is how many chars that must be matched that
	5074	follow this item. We calculate it ahead of time as once the
	5075	lookbehind offset is added in we lose the ability to correctly
	5076	calculate it.*/
	5077	ml = data.minlen_float ? *(data.minlen_float)
	5078	: (I32)longest_float_length;
	5079	r->float_end_shift = ml - data.offset_float_min
	5080	- longest_float_length + (SvTAIL(data.longest_float) != 0)
	5081	+ data.lookbehind_float;
	5082	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	5083	r->float_max_offset = data.offset_float_max;
	5084	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	5085	r->float_max_offset -= data.lookbehind_float;
	5086
	5087	t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
	5088	&& (!(data.flags & SF_FL_BEFORE_MEOL)
	5089	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5090	fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
	5091	}
	5092	else {
	5093	remove_float:
	5094	r->float_substr = r->float_utf8 = NULL;
	5095	SvREFCNT_dec(data.longest_float);
	5096	longest_float_length = 0;
	5097	}
	5098
	5099	/* Note that code very similar to this but for floating string
	5100	is immediately above, changes may need to be made to both.
	5101	Be careful.
	5102	*/
	5103	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	5104	if (longest_fixed_length
	5105	\|\| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
	5106	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5107	\|\| (RExC_flags & RXf_PMf_MULTILINE))))
	5108	{
	5109	I32 t,ml;
	5110
	5111	/* copy the information about the longest fixed
	5112	from the reg_scan_data over to the program. */
	5113	if (SvUTF8(data.longest_fixed)) {
	5114	r->anchored_utf8 = data.longest_fixed;
	5115	r->anchored_substr = NULL;
	5116	} else {
	5117	r->anchored_substr = data.longest_fixed;
	5118	r->anchored_utf8 = NULL;
	5119	}
	5120	/* fixed_end_shift is how many chars that must be matched that
	5121	follow this item. We calculate it ahead of time as once the
	5122	lookbehind offset is added in we lose the ability to correctly
	5123	calculate it.*/
	5124	ml = data.minlen_fixed ? *(data.minlen_fixed)
	5125	: (I32)longest_fixed_length;
	5126	r->anchored_end_shift = ml - data.offset_fixed
	5127	- longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
	5128	+ data.lookbehind_fixed;
	5129	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	5130
	5131	t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
	5132	&& (!(data.flags & SF_FIX_BEFORE_MEOL)
	5133	\|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5134	fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
	5135	}
	5136	else {
	5137	r->anchored_substr = r->anchored_utf8 = NULL;
	5138	SvREFCNT_dec(data.longest_fixed);
	5139	longest_fixed_length = 0;
	5140	}
	5141	if (ri->regstclass
	5142	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	5143	ri->regstclass = NULL;
	5144
	5145	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	5146	&& stclass_flag
	5147	&& !(data.start_class->flags & ANYOF_EOS)
	5148	&& !cl_is_anything(data.start_class))
	5149	{
	5150	const U32 n = add_data(pRExC_state, 1, "f");
	5151	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5152
	5153	Newx(RExC_rxi->data->data[n], 1,
	5154	struct regnode_charclass_class);
	5155	StructCopy(data.start_class,
	5156	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5157	struct regnode_charclass_class);
	5158	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5159	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5160	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	5161	regprop(r, sv, (regnode*)data.start_class);
	5162	PerlIO_printf(Perl_debug_log,
	5163	"synthetic stclass \"%s\".\n",
	5164	SvPVX_const(sv));});
	5165	}
	5166
	5167	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	5168	if (longest_fixed_length > longest_float_length) {
	5169	r->check_end_shift = r->anchored_end_shift;
	5170	r->check_substr = r->anchored_substr;
	5171	r->check_utf8 = r->anchored_utf8;
	5172	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	5173	if (r->extflags & RXf_ANCH_SINGLE)
	5174	r->extflags \|= RXf_NOSCAN;
	5175	}
	5176	else {
	5177	r->check_end_shift = r->float_end_shift;
	5178	r->check_substr = r->float_substr;
	5179	r->check_utf8 = r->float_utf8;
	5180	r->check_offset_min = r->float_min_offset;
	5181	r->check_offset_max = r->float_max_offset;
	5182	}
	5183	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	5184	This should be changed ASAP! */
	5185	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	5186	r->extflags \|= RXf_USE_INTUIT;
	5187	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	5188	r->extflags \|= RXf_INTUIT_TAIL;
	5189	}
	5190	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	5191	if ( (STRLEN)minlen < longest_float_length )
	5192	minlen= longest_float_length;
	5193	if ( (STRLEN)minlen < longest_fixed_length )
	5194	minlen= longest_fixed_length;
	5195	*/
	5196	}
	5197	else {
	5198	/* Several toplevels. Best we can is to set minlen. */
	5199	I32 fake;
	5200	struct regnode_charclass_class ch_class;
	5201	I32 last_close = 0;
	5202
	5203	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	5204
	5205	scan = ri->program + 1;
	5206	cl_init(pRExC_state, &ch_class);
	5207	data.start_class = &ch_class;
	5208	data.last_closep = &last_close;
	5209
	5210
	5211	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	5212	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	5213
	5214	CHECK_RESTUDY_GOTO;
	5215
	5216	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	5217	= r->float_substr = r->float_utf8 = NULL;
	5218
	5219	if (!(data.start_class->flags & ANYOF_EOS)
	5220	&& !cl_is_anything(data.start_class))
	5221	{
	5222	const U32 n = add_data(pRExC_state, 1, "f");
	5223	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	5224
	5225	Newx(RExC_rxi->data->data[n], 1,
	5226	struct regnode_charclass_class);
	5227	StructCopy(data.start_class,
	5228	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	5229	struct regnode_charclass_class);
	5230	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	5231	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	5232	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	5233	regprop(r, sv, (regnode*)data.start_class);
	5234	PerlIO_printf(Perl_debug_log,
	5235	"synthetic stclass \"%s\".\n",
	5236	SvPVX_const(sv));});
	5237	}
	5238	}
	5239
	5240	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	5241	the "real" pattern. */
	5242	DEBUG_OPTIMISE_r({
	5243	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	5244	(IV)minlen, (IV)r->minlen);
	5245	});
	5246	r->minlenret = minlen;
	5247	if (r->minlen < minlen)
	5248	r->minlen = minlen;
	5249
	5250	if (RExC_seen & REG_SEEN_GPOS)
	5251	r->extflags \|= RXf_GPOS_SEEN;
	5252	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	5253	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	5254	if (RExC_seen & REG_SEEN_EVAL)
	5255	r->extflags \|= RXf_EVAL_SEEN;
	5256	if (RExC_seen & REG_SEEN_CANY)
	5257	r->extflags \|= RXf_CANY_SEEN;
	5258	if (RExC_seen & REG_SEEN_VERBARG)
	5259	r->intflags \|= PREGf_VERBARG_SEEN;
	5260	if (RExC_seen & REG_SEEN_CUTGROUP)
	5261	r->intflags \|= PREGf_CUTGROUP_SEEN;
	5262	if (RExC_paren_names)
	5263	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	5264	else
	5265	RXp_PAREN_NAMES(r) = NULL;
	5266
	5267	#ifdef STUPID_PATTERN_CHECKS
	5268	if (RX_PRELEN(rx) == 0)
	5269	r->extflags \|= RXf_NULL;
	5270	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5271	/* XXX: this should happen BEFORE we compile */
	5272	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5273	else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	5274	r->extflags \|= RXf_WHITE;
	5275	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	5276	r->extflags \|= RXf_START_ONLY;
	5277	#else
	5278	if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
	5279	/* XXX: this should happen BEFORE we compile */
	5280	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	5281	else {
	5282	regnode *first = ri->program + 1;
	5283	U8 fop = OP(first);
	5284
	5285	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	5286	r->extflags \|= RXf_NULL;
	5287	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	5288	r->extflags \|= RXf_START_ONLY;
	5289	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	5290	&& OP(regnext(first)) == END)
	5291	r->extflags \|= RXf_WHITE;
	5292	}
	5293	#endif
	5294	#ifdef DEBUGGING
	5295	if (RExC_paren_names) {
	5296	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	5297	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	5298	} else
	5299	#endif
	5300	ri->name_list_idx = 0;
	5301
	5302	if (RExC_recurse_count) {
	5303	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	5304	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	5305	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	5306	}
	5307	}
	5308	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	5309	/* assume we don't need to swap parens around before we match */
	5310
	5311	DEBUG_DUMP_r({
	5312	PerlIO_printf(Perl_debug_log,"Final program:\n");
	5313	regdump(r);
	5314	});
	5315	#ifdef RE_TRACK_PATTERN_OFFSETS
	5316	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	5317	const U32 len = ri->u.offsets[0];
	5318	U32 i;
	5319	GET_RE_DEBUG_FLAGS_DECL;
	5320	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	5321	for (i = 1; i <= len; i++) {
	5322	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	5323	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	5324	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	5325	}
	5326	PerlIO_printf(Perl_debug_log, "\n");
	5327	});
	5328	#endif
	5329	return rx;
	5330	}
	5331
	5332	#undef RE_ENGINE_PTR
	5333
	5334
	5335	SV*
	5336	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	5337	const U32 flags)
	5338	{
	5339	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	5340
	5341	PERL_UNUSED_ARG(value);
	5342
	5343	if (flags & RXapif_FETCH) {
	5344	return reg_named_buff_fetch(rx, key, flags);
	5345	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	5346	Perl_croak_no_modify(aTHX);
	5347	return NULL;
	5348	} else if (flags & RXapif_EXISTS) {
	5349	return reg_named_buff_exists(rx, key, flags)
	5350	? &PL_sv_yes
	5351	: &PL_sv_no;
	5352	} else if (flags & RXapif_REGNAMES) {
	5353	return reg_named_buff_all(rx, flags);
	5354	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	5355	return reg_named_buff_scalar(rx, flags);
	5356	} else {
	5357	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	5358	return NULL;
	5359	}
	5360	}
	5361
	5362	SV*
	5363	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	5364	const U32 flags)
	5365	{
	5366	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	5367	PERL_UNUSED_ARG(lastkey);
	5368
	5369	if (flags & RXapif_FIRSTKEY)
	5370	return reg_named_buff_firstkey(rx, flags);
	5371	else if (flags & RXapif_NEXTKEY)
	5372	return reg_named_buff_nextkey(rx, flags);
	5373	else {
	5374	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	5375	return NULL;
	5376	}
	5377	}
	5378
	5379	SV*
	5380	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	5381	const U32 flags)
	5382	{
	5383	AV *retarray = NULL;
	5384	SV *ret;
	5385	struct regexp const rx = (struct regexp )SvANY(r);
	5386
	5387	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	5388
	5389	if (flags & RXapif_ALL)
	5390	retarray=newAV();
	5391
	5392	if (rx && RXp_PAREN_NAMES(rx)) {
	5393	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	5394	if (he_str) {
	5395	IV i;
	5396	SV* sv_dat=HeVAL(he_str);
	5397	I32 nums=(I32)SvPVX(sv_dat);
	5398	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	5399	if ((I32)(rx->nparens) >= nums[i]
	5400	&& rx->offs[nums[i]].start != -1
	5401	&& rx->offs[nums[i]].end != -1)
	5402	{
	5403	ret = newSVpvs("");
	5404	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	5405	if (!retarray)
	5406	return ret;
	5407	} else {
	5408	ret = newSVsv(&PL_sv_undef);
	5409	}
	5410	if (retarray)
	5411	av_push(retarray, ret);
	5412	}
	5413	if (retarray)
	5414	return newRV_noinc(MUTABLE_SV(retarray));
	5415	}
	5416	}
	5417	return NULL;
	5418	}
	5419
	5420	bool
	5421	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	5422	const U32 flags)
	5423	{
	5424	struct regexp const rx = (struct regexp )SvANY(r);
	5425
	5426	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	5427
	5428	if (rx && RXp_PAREN_NAMES(rx)) {
	5429	if (flags & RXapif_ALL) {
	5430	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	5431	} else {
	5432	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	5433	if (sv) {
	5434	SvREFCNT_dec(sv);
	5435	return TRUE;
	5436	} else {
	5437	return FALSE;
	5438	}
	5439	}
	5440	} else {
	5441	return FALSE;
	5442	}
	5443	}
	5444
	5445	SV*
	5446	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	5447	{
	5448	struct regexp const rx = (struct regexp )SvANY(r);
	5449
	5450	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	5451
	5452	if ( rx && RXp_PAREN_NAMES(rx) ) {
	5453	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	5454
	5455	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	5456	} else {
	5457	return FALSE;
	5458	}
	5459	}
	5460
	5461	SV*
	5462	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	5463	{
	5464	struct regexp const rx = (struct regexp )SvANY(r);
	5465	GET_RE_DEBUG_FLAGS_DECL;
	5466
	5467	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	5468
	5469	if (rx && RXp_PAREN_NAMES(rx)) {
	5470	HV *hv = RXp_PAREN_NAMES(rx);
	5471	HE *temphe;
	5472	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5473	IV i;
	5474	IV parno = 0;
	5475	SV* sv_dat = HeVAL(temphe);
	5476	I32 nums = (I32)SvPVX(sv_dat);
	5477	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5478	if ((I32)(rx->lastparen) >= nums[i] &&
	5479	rx->offs[nums[i]].start != -1 &&
	5480	rx->offs[nums[i]].end != -1)
	5481	{
	5482	parno = nums[i];
	5483	break;
	5484	}
	5485	}
	5486	if (parno \|\| flags & RXapif_ALL) {
	5487	return newSVhek(HeKEY_hek(temphe));
	5488	}
	5489	}
	5490	}
	5491	return NULL;
	5492	}
	5493
	5494	SV*
	5495	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	5496	{
	5497	SV *ret;
	5498	AV *av;
	5499	I32 length;
	5500	struct regexp const rx = (struct regexp )SvANY(r);
	5501
	5502	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	5503
	5504	if (rx && RXp_PAREN_NAMES(rx)) {
	5505	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	5506	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	5507	} else if (flags & RXapif_ONE) {
	5508	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	5509	av = MUTABLE_AV(SvRV(ret));
	5510	length = av_len(av);
	5511	SvREFCNT_dec(ret);
	5512	return newSViv(length + 1);
	5513	} else {
	5514	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	5515	return NULL;
	5516	}
	5517	}
	5518	return &PL_sv_undef;
	5519	}
	5520
	5521	SV*
	5522	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	5523	{
	5524	struct regexp const rx = (struct regexp )SvANY(r);
	5525	AV *av = newAV();
	5526
	5527	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	5528
	5529	if (rx && RXp_PAREN_NAMES(rx)) {
	5530	HV *hv= RXp_PAREN_NAMES(rx);
	5531	HE *temphe;
	5532	(void)hv_iterinit(hv);
	5533	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	5534	IV i;
	5535	IV parno = 0;
	5536	SV* sv_dat = HeVAL(temphe);
	5537	I32 nums = (I32)SvPVX(sv_dat);
	5538	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	5539	if ((I32)(rx->lastparen) >= nums[i] &&
	5540	rx->offs[nums[i]].start != -1 &&
	5541	rx->offs[nums[i]].end != -1)
	5542	{
	5543	parno = nums[i];
	5544	break;
	5545	}
	5546	}
	5547	if (parno \|\| flags & RXapif_ALL) {
	5548	av_push(av, newSVhek(HeKEY_hek(temphe)));
	5549	}
	5550	}
	5551	}
	5552
	5553	return newRV_noinc(MUTABLE_SV(av));
	5554	}
	5555
	5556	void
	5557	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	5558	SV * const sv)
	5559	{
	5560	struct regexp const rx = (struct regexp )SvANY(r);
	5561	char *s = NULL;
	5562	I32 i = 0;
	5563	I32 s1, t1;
	5564
	5565	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	5566
	5567	if (!rx->subbeg) {
	5568	sv_setsv(sv,&PL_sv_undef);
	5569	return;
	5570	}
	5571	else
	5572	if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
	5573	/* $` */
	5574	i = rx->offs[0].start;
	5575	s = rx->subbeg;
	5576	}
	5577	else
	5578	if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
	5579	/* $' */
	5580	s = rx->subbeg + rx->offs[0].end;
	5581	i = rx->sublen - rx->offs[0].end;
	5582	}
	5583	else
	5584	if ( 0 <= paren && paren <= (I32)rx->nparens &&
	5585	(s1 = rx->offs[paren].start) != -1 &&
	5586	(t1 = rx->offs[paren].end) != -1)
	5587	{
	5588	/* $& $1 ... */
	5589	i = t1 - s1;
	5590	s = rx->subbeg + s1;
	5591	} else {
	5592	sv_setsv(sv,&PL_sv_undef);
	5593	return;
	5594	}
	5595	assert(rx->sublen >= (s - rx->subbeg) + i );
	5596	if (i >= 0) {
	5597	const int oldtainted = PL_tainted;
	5598	TAINT_NOT;
	5599	sv_setpvn(sv, s, i);
	5600	PL_tainted = oldtainted;
	5601	if ( (rx->extflags & RXf_CANY_SEEN)
	5602	? (RXp_MATCH_UTF8(rx)
	5603	&& (!i \|\| is_utf8_string((U8*)s, i)))
	5604	: (RXp_MATCH_UTF8(rx)) )
	5605	{
	5606	SvUTF8_on(sv);
	5607	}
	5608	else
	5609	SvUTF8_off(sv);
	5610	if (PL_tainting) {
	5611	if (RXp_MATCH_TAINTED(rx)) {
	5612	if (SvTYPE(sv) >= SVt_PVMG) {
	5613	MAGIC* const mg = SvMAGIC(sv);
	5614	MAGIC* mgt;
	5615	PL_tainted = 1;
	5616	SvMAGIC_set(sv, mg->mg_moremagic);
	5617	SvTAINT(sv);
	5618	if ((mgt = SvMAGIC(sv))) {
	5619	mg->mg_moremagic = mgt;
	5620	SvMAGIC_set(sv, mg);
	5621	}
	5622	} else {
	5623	PL_tainted = 1;
	5624	SvTAINT(sv);
	5625	}
	5626	} else
	5627	SvTAINTED_off(sv);
	5628	}
	5629	} else {
	5630	sv_setsv(sv,&PL_sv_undef);
	5631	return;
	5632	}
	5633	}
	5634
	5635	void
	5636	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	5637	SV const * const value)
	5638	{
	5639	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	5640
	5641	PERL_UNUSED_ARG(rx);
	5642	PERL_UNUSED_ARG(paren);
	5643	PERL_UNUSED_ARG(value);
	5644
	5645	if (!PL_localizing)
	5646	Perl_croak_no_modify(aTHX);
	5647	}
	5648
	5649	I32
	5650	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	5651	const I32 paren)
	5652	{
	5653	struct regexp const rx = (struct regexp )SvANY(r);
	5654	I32 i;
	5655	I32 s1, t1;
	5656
	5657	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	5658
	5659	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	5660	switch (paren) {
	5661	/* $` / ${^PREMATCH} */
	5662	case RX_BUFF_IDX_PREMATCH:
	5663	if (rx->offs[0].start != -1) {
	5664	i = rx->offs[0].start;
	5665	if (i > 0) {
	5666	s1 = 0;
	5667	t1 = i;
	5668	goto getlen;
	5669	}
	5670	}
	5671	return 0;
	5672	/* $' / ${^POSTMATCH} */
	5673	case RX_BUFF_IDX_POSTMATCH:
	5674	if (rx->offs[0].end != -1) {
	5675	i = rx->sublen - rx->offs[0].end;
	5676	if (i > 0) {
	5677	s1 = rx->offs[0].end;
	5678	t1 = rx->sublen;
	5679	goto getlen;
	5680	}
	5681	}
	5682	return 0;
	5683	/* $& / ${^MATCH}, $1, $2, ... */
	5684	default:
	5685	if (paren <= (I32)rx->nparens &&
	5686	(s1 = rx->offs[paren].start) != -1 &&
	5687	(t1 = rx->offs[paren].end) != -1)
	5688	{
	5689	i = t1 - s1;
	5690	goto getlen;
	5691	} else {
	5692	if (ckWARN(WARN_UNINITIALIZED))
	5693	report_uninit((const SV *)sv);
	5694	return 0;
	5695	}
	5696	}
	5697	getlen:
	5698	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	5699	const char * const s = rx->subbeg + s1;
	5700	const U8 *ep;
	5701	STRLEN el;
	5702
	5703	i = t1 - s1;
	5704	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	5705	i = el;
	5706	}
	5707	return i;
	5708	}
	5709
	5710	SV*
	5711	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	5712	{
	5713	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	5714	PERL_UNUSED_ARG(rx);
	5715	if (0)
	5716	return NULL;
	5717	else
	5718	return newSVpvs("Regexp");
	5719	}
	5720
	5721	/* Scans the name of a named buffer from the pattern.
	5722	* If flags is REG_RSN_RETURN_NULL returns null.
	5723	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	5724	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	5725	* to the parsed name as looked up in the RExC_paren_names hash.
	5726	* If there is an error throws a vFAIL().. type exception.
	5727	*/
	5728
	5729	#define REG_RSN_RETURN_NULL 0
	5730	#define REG_RSN_RETURN_NAME 1
	5731	#define REG_RSN_RETURN_DATA 2
	5732
	5733	STATIC SV*
	5734	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	5735	{
	5736	char *name_start = RExC_parse;
	5737
	5738	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	5739
	5740	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	5741	/* skip IDFIRST by using do...while */
	5742	if (UTF)
	5743	do {
	5744	RExC_parse += UTF8SKIP(RExC_parse);
	5745	} while (isALNUM_utf8((U8*)RExC_parse));
	5746	else
	5747	do {
	5748	RExC_parse++;
	5749	} while (isALNUM(*RExC_parse));
	5750	}
	5751
	5752	if ( flags ) {
	5753	SV* sv_name
	5754	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	5755	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	5756	if ( flags == REG_RSN_RETURN_NAME)
	5757	return sv_name;
	5758	else if (flags==REG_RSN_RETURN_DATA) {
	5759	HE *he_str = NULL;
	5760	SV *sv_dat = NULL;
	5761	if ( ! sv_name ) /* should not happen*/
	5762	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	5763	if (RExC_paren_names)
	5764	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	5765	if ( he_str )
	5766	sv_dat = HeVAL(he_str);
	5767	if ( ! sv_dat )
	5768	vFAIL("Reference to nonexistent named group");
	5769	return sv_dat;
	5770	}
	5771	else {
	5772	Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
	5773	}
	5774	/* NOT REACHED */
	5775	}
	5776	return NULL;
	5777	}
	5778
	/* Parse-trace helpers, all wrapped in DEBUG_PARSE_r().
	 *
	 * DEBUG_PARSE_MSG(funcname) prints one trace prefix: up to 10 bytes of the
	 * remaining pattern (followed by "..." if it was cut, "<" otherwise) when
	 * the parse position changed since the last call, the current node number
	 * when that changed, and then 'funcname' indented by twice 'depth'.  It
	 * expects 'depth' and pRExC_state to be in scope at the point of use.
	 *
	 * DEBUG_PARSE(funcname) prints the prefix followed by a newline.
	 * DEBUG_PARSE_FMT(funcname,fmt,args) prints the prefix followed by 'fmt'
	 * formatted with the single argument 'args', and a newline. */
	#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
	    int rem=(int)(RExC_end - RExC_parse);                       \
	    int cut;                                                    \
	    int num;                                                    \
	    int iscut=0;                                                \
	    if (rem>10) {                                               \
	        rem=10;                                                 \
	        iscut=1;                                                \
	    }                                                           \
	    cut=10-rem;                                                 \
	    if (RExC_lastparse!=RExC_parse)                             \
	        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
	            rem, RExC_parse,                                    \
	            cut + 4,                                            \
	            iscut ? "..." : "<"                                 \
	        );                                                      \
	    else                                                        \
	        PerlIO_printf(Perl_debug_log,"%16s","");                \
	                                                                \
	    if (SIZE_ONLY)                                              \
	       num = RExC_size + 1;                                     \
	    else                                                        \
	       num=REG_NODE_NUM(RExC_emit);                             \
	    if (RExC_lastnum!=num)                                      \
	       PerlIO_printf(Perl_debug_log,"|%4d",num);                \
	    else                                                        \
	       PerlIO_printf(Perl_debug_log,"|%4s","");                 \
	    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
	        (int)((depth*2)), "",                                   \
	        (funcname)                                              \
	    );                                                          \
	    RExC_lastnum=num;                                           \
	    RExC_lastparse=RExC_parse;                                  \
	})



	#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
	    DEBUG_PARSE_MSG((funcname));                            \
	    PerlIO_printf(Perl_debug_log,"%4s","\n");               \
	})
	#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
	    DEBUG_PARSE_MSG((funcname));                            \
	    PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
	})
	5824
	5825	/* This section of code defines the inversion list object and its methods. The
	5826	* interfaces are highly subject to change, so as much as possible is static to
	5827	* this file. An inversion list is here implemented as a malloc'd C array with
	5828	* some added info. More will be coming when functionality is added later.
	5829	*
	5830	* It is currently implemented as an HV to the outside world, but is actually
	5831	* an SV pointing to an array of UVs that the SV thinks are bytes. This allows
	5832	* us to have an array of UV whose memory management is automatically handled
	5833	* by the existing facilities for SV's.
	5834	*
	5835	* Some of the methods should always be private to the implementation, and some
	5836	* should eventually be made public */
	5837
	5838	#define INVLIST_INITIAL_LEN 10
	5839
	5840	PERL_STATIC_INLINE UV*
	5841	S_invlist_array(pTHX_ HV* const invlist)
	5842	{
	5843	/* Returns the pointer to the inversion list's array. Every time the
	5844	* length changes, this needs to be called in case malloc or realloc moved
	5845	* it */
	5846
	5847	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	5848
	5849	return (UV *) SvPVX(invlist);
	5850	}
	5851
	5852	PERL_STATIC_INLINE UV
	5853	S_invlist_len(pTHX_ HV* const invlist)
	5854	{
	5855	/* Returns the current number of elements in the inversion list's array */
	5856
	5857	PERL_ARGS_ASSERT_INVLIST_LEN;
	5858
	5859	return SvCUR(invlist) / sizeof(UV);
	5860	}
	5861
	5862	PERL_STATIC_INLINE UV
	5863	S_invlist_max(pTHX_ HV* const invlist)
	5864	{
	5865	/* Returns the maximum number of elements storable in the inversion list's
	5866	* array, without having to realloc() */
	5867
	5868	PERL_ARGS_ASSERT_INVLIST_MAX;
	5869
	5870	return SvLEN(invlist) / sizeof(UV);
	5871	}
	5872
	5873	PERL_STATIC_INLINE void
	5874	S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
	5875	{
	5876	/* Sets the current number of elements stored in the inversion list */
	5877
	5878	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	5879
	5880	SvCUR_set(invlist, len * sizeof(UV));
	5881	}
	5882
	5883	PERL_STATIC_INLINE void
	5884	S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
	5885	{
	5886
	5887	/* Sets the maximum number of elements storable in the inversion list
	5888	* without having to realloc() */
	5889
	5890	PERL_ARGS_ASSERT_INVLIST_SET_MAX;
	5891
	5892	if (max < invlist_len(invlist)) {
	5893	Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", invlist_max(invlist), invlist_len(invlist));
	5894	}
	5895
	5896	SvLEN_set(invlist, max * sizeof(UV));
	5897	}
	5898
	5899	#ifndef PERL_IN_XSUB_RE
	5900	HV*
	5901	Perl__new_invlist(pTHX_ IV initial_size)
	5902	{
	5903
	5904	/* Return a pointer to a newly constructed inversion list, with enough
	5905	* space to store 'initial_size' elements. If that number is negative, a
	5906	* system default is used instead */
	5907
	5908	if (initial_size < 0) {
	5909	initial_size = INVLIST_INITIAL_LEN;
	5910	}
	5911
	5912	/* Allocate the initial space */
	5913	return (HV ) newSV(initial_size sizeof(UV));
	5914	}
	5915	#endif
	5916
	5917	PERL_STATIC_INLINE void
	5918	S_invlist_destroy(pTHX_ HV* const invlist)
	5919	{
	5920	/* Inversion list destructor */
	5921
	5922	PERL_ARGS_ASSERT_INVLIST_DESTROY;
	5923
	5924	SvREFCNT_dec(invlist);
	5925	}
	5926
	5927	STATIC void
	5928	S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
	5929	{
	5930	/* Grow the maximum size of an inversion list */
	5931
	5932	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	5933
	5934	SvGROW((SV )invlist, new_max sizeof(UV));
	5935	}
	5936
	5937	PERL_STATIC_INLINE void
	5938	S_invlist_trim(pTHX_ HV* const invlist)
	5939	{
	5940	PERL_ARGS_ASSERT_INVLIST_TRIM;
	5941
	5942	/* Change the length of the inversion list to how many entries it currently
	5943	* has */
	5944
	5945	SvPV_shrink_to_cur((SV *) invlist);
	5946	}
	5947
	/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
	 * etc.  PREV_ELEMENT_IN_INVLIST_SET(i) is the negation: it is true when the
	 * index 'i' is odd, i.e. when the element before it has an even index.  Both
	 * expansions are fully parenthesized so they can be used safely inside
	 * larger expressions. */

	#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
	#define PREV_ELEMENT_IN_INVLIST_SET(i) (! ELEMENT_IN_INVLIST_SET(i))
	5953
	5954	#ifndef PERL_IN_XSUB_RE
	5955	void
	5956	Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
	5957	{
	5958	/* Subject to change or removal. Append the range from 'start' to 'end' at
	5959	* the end of the inversion list. The range must be above any existing
	5960	* ones. */
	5961
	5962	UV* array = invlist_array(invlist);
	5963	UV max = invlist_max(invlist);
	5964	UV len = invlist_len(invlist);
	5965
	5966	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	5967
	5968	if (len > 0) {
	5969
	5970	/* Here, the existing list is non-empty. The current max entry in the
	5971	* list is generally the first value not in the set, except when the
	5972	* set extends to the end of permissible values, in which case it is
	5973	* the first entry in that final set, and so this call is an attempt to
	5974	* append out-of-order */
	5975
	5976	UV final_element = len - 1;
	5977	if (array[final_element] > start
	5978	\|\| ELEMENT_IN_INVLIST_SET(final_element))
	5979	{
	5980	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
	5981	}
	5982
	5983	/* Here, it is a legal append. If the new range begins with the first
	5984	* value not in the set, it is extending the set, so the new first
	5985	* value not in the set is one greater than the newly extended range.
	5986	* */
	5987	if (array[final_element] == start) {
	5988	if (end != UV_MAX) {
	5989	array[final_element] = end + 1;
	5990	}
	5991	else {
	5992	/* But if the end is the maximum representable on the machine,
	5993	* just let the range that this would extend have no end */
	5994	invlist_set_len(invlist, len - 1);
	5995	}
	5996	return;
	5997	}
	5998	}
	5999
	6000	/* Here the new range doesn't extend any existing set. Add it */
	6001
	6002	len += 2; /* Includes an element each for the start and end of range */
	6003
	6004	/* If overflows the existing space, extend, which may cause the array to be
	6005	* moved */
	6006	if (max < len) {
	6007	invlist_extend(invlist, len);
	6008	array = invlist_array(invlist);
	6009	}
	6010
	6011	invlist_set_len(invlist, len);
	6012
	6013	/* The next item on the list starts the range, the one after that is
	6014	* one past the new range. */
	6015	array[len - 2] = start;
	6016	if (end != UV_MAX) {
	6017	array[len - 1] = end + 1;
	6018	}
	6019	else {
	6020	/* But if the end is the maximum representable on the machine, just let
	6021	* the range have no end */
	6022	invlist_set_len(invlist, len - 1);
	6023	}
	6024	}
	6025	#endif
	6026
	6027	STATIC HV*
	6028	S_invlist_union(pTHX_ HV* const a, HV* const b)
	6029	{
	6030	/* Return a new inversion list which is the union of two inversion lists.
	6031	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	6032	* Richard Gillam, published by Addison-Wesley, and explained at some
	6033	* length there. The preface says to incorporate its examples into your
	6034	* code at your own risk.
	6035	*
	6036	* The algorithm is like a merge sort.
	6037	*
	6038	* XXX A potential performance improvement is to keep track as we go along
	6039	* if only one of the inputs contributes to the result, meaning the other
	6040	* is a subset of that one. In that case, we can skip the final copy and
	6041	* return the larger of the input lists */
	6042
	6043	UV* array_a = invlist_array(a); /* a's array */
	6044	UV* array_b = invlist_array(b);
	6045	UV len_a = invlist_len(a); /* length of a's array */
	6046	UV len_b = invlist_len(b);
	6047
	6048	HV* u; /* the resulting union */
	6049	UV* array_u;
	6050	UV len_u;
	6051
	6052	UV i_a = 0; /* current index into a's array */
	6053	UV i_b = 0;
	6054	UV i_u = 0;
	6055
	6056	/* running count, as explained in the algorithm source book; items are
	6057	* stopped accumulating and are output when the count changes to/from 0.
	6058	* The count is incremented when we start a range that's in the set, and
	6059	* decremented when we start a range that's not in the set. So its range
	6060	* is 0 to 2. Only when the count is zero is something not in the set.
	6061	*/
	6062	UV count = 0;
	6063
	6064	PERL_ARGS_ASSERT_INVLIST_UNION;
	6065
	6066	/* Size the union for the worst case: that the sets are completely
	6067	* disjoint */
	6068	u = _new_invlist(len_a + len_b);
	6069	array_u = invlist_array(u);
	6070
	6071	/* Go through each list item by item, stopping when exhausted one of
	6072	* them */
	6073	while (i_a < len_a && i_b < len_b) {
	6074	UV cp; /* The element to potentially add to the union's array */
	6075	bool cp_in_set; /* is it in the the input list's set or not */
	6076
	6077	/* We need to take one or the other of the two inputs for the union.
	6078	* Since we are merging two sorted lists, we take the smaller of the
	6079	* next items. In case of a tie, we take the one that is in its set
	6080	* first. If we took one not in the set first, it would decrement the
	6081	* count, possibly to 0 which would cause it to be output as ending the
	6082	* range, and the next time through we would take the same number, and
	6083	* output it again as beginning the next range. By doing it the
	6084	* opposite way, there is no possibility that the count will be
	6085	* momentarily decremented to 0, and thus the two adjoining ranges will
	6086	* be seamlessly merged. (In a tie and both are in the set or both not
	6087	* in the set, it doesn't matter which we take first.) */
	6088	if (array_a[i_a] < array_b[i_b]
	6089	\|\| (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
	6090	{
	6091	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6092	cp= array_a[i_a++];
	6093	}
	6094	else {
	6095	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6096	cp= array_b[i_b++];
	6097	}
	6098
	6099	/* Here, have chosen which of the two inputs to look at. Only output
	6100	* if the running count changes to/from 0, which marks the
	6101	* beginning/end of a range in that's in the set */
	6102	if (cp_in_set) {
	6103	if (count == 0) {
	6104	array_u[i_u++] = cp;
	6105	}
	6106	count++;
	6107	}
	6108	else {
	6109	count--;
	6110	if (count == 0) {
	6111	array_u[i_u++] = cp;
	6112	}
	6113	}
	6114	}
	6115
	6116	/* Here, we are finished going through at least one of the lists, which
	6117	* means there is something remaining in at most one. We check if the list
	6118	* that hasn't been exhausted is positioned such that we are in the middle
	6119	* of a range in its set or not. (i_a and i_b point to the element beyond
	6120	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	6121	* is potentially more to output.
	6122	* There are four cases:
	6123	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	6124	* in the union is entirely from the non-exhausted set.
	6125	* 2) Both were in their sets, count is 2. Nothing further should
	6126	* be output, as everything that remains will be in the exhausted
	6127	* list's set, hence in the union; decrementing to 1 but not 0 insures
	6128	* that
	6129	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	6130	* Nothing further should be output because the union includes
	6131	* everything from the exhausted set. Not decrementing ensures that.
	6132	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	6133	* decrementing to 0 insures that we look at the remainder of the
	6134	* non-exhausted set */
	6135	if ((i_a != len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
	6136	\|\| (i_b != len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
	6137	{
	6138	count--;
	6139	}
	6140
	6141	/* The final length is what we've output so far, plus what else is about to
	6142	* be output. (If 'count' is non-zero, then the input list we exhausted
	6143	* has everything remaining up to the machine's limit in its set, and hence
	6144	* in the union, so there will be no further output. */
	6145	len_u = i_u;
	6146	if (count == 0) {
	6147	/* At most one of the subexpressions will be non-zero */
	6148	len_u += (len_a - i_a) + (len_b - i_b);
	6149	}
	6150
	6151	/* Set result to final length, which can change the pointer to array_u, so
	6152	* re-find it */
	6153	if (len_u != invlist_len(u)) {
	6154	invlist_set_len(u, len_u);
	6155	invlist_trim(u);
	6156	array_u = invlist_array(u);
	6157	}
	6158
	6159	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	6160	* the other) ended with everything above it not in its set. That means
	6161	* that the remaining part of the union is precisely the same as the
	6162	* non-exhausted list, so can just copy it unchanged. (If both list were
	6163	* exhausted at the same time, then the operations below will be both 0.)
	6164	*/
	6165	if (count == 0) {
	6166	IV copy_count; /* At most one will have a non-zero copy count */
	6167	if ((copy_count = len_a - i_a) > 0) {
	6168	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	6169	}
	6170	else if ((copy_count = len_b - i_b) > 0) {
	6171	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	6172	}
	6173	}
	6174
	6175	return u;
	6176	}
	6177
	6178	STATIC HV*
	6179	S_invlist_intersection(pTHX_ HV* const a, HV* const b)
	6180	{
	6181	/* Return the intersection of two inversion lists. The basis for this
	6182	* comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
	6183	* by Addison-Wesley, and explained at some length there. The preface says
	6184	* to incorporate its examples into your code at your own risk. In fact,
	6185	* it had bugs
	6186	*
	6187	* The algorithm is like a merge sort, and is essentially the same as the
	6188	* union above
	6189	*/
	6190
	6191	UV* array_a = invlist_array(a); /* a's array */
	6192	UV* array_b = invlist_array(b);
	6193	UV len_a = invlist_len(a); /* length of a's array */
	6194	UV len_b = invlist_len(b);
	6195
	6196	HV* r; /* the resulting intersection */
	6197	UV* array_r;
	6198	UV len_r;
	6199
	6200	UV i_a = 0; /* current index into a's array */
	6201	UV i_b = 0;
	6202	UV i_r = 0;
	6203
	6204	/* running count, as explained in the algorithm source book; items are
	6205	* stopped accumulating and are output when the count changes to/from 2.
	6206	* The count is incremented when we start a range that's in the set, and
	6207	* decremented when we start a range that's not in the set. So its range
	6208	* is 0 to 2. Only when the count is 2 is something in the intersection.
	6209	*/
	6210	UV count = 0;
	6211
	6212	PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
	6213
	6214	/* Size the intersection for the worst case: that the intersection ends up
	6215	* fragmenting everything to be completely disjoint */
	6216	r= _new_invlist(len_a + len_b);
	6217	array_r = invlist_array(r);
	6218
	6219	/* Go through each list item by item, stopping when exhausted one of
	6220	* them */
	6221	while (i_a < len_a && i_b < len_b) {
	6222	UV cp; /* The element to potentially add to the intersection's
	6223	array */
	6224	bool cp_in_set; /* Is it in the input list's set or not */
	6225
	6226	/* We need to take one or the other of the two inputs for the
	6227	* intersection. Since we are merging two sorted lists, we take the
	6228	* smaller of the next items. In case of a tie, we take the one that
	6229	* is not in its set first (a difference from the union algorithm). If
	6230	* we took one in the set first, it would increment the count, possibly
	6231	* to 2 which would cause it to be output as starting a range in the
	6232	* intersection, and the next time through we would take that same
	6233	* number, and output it again as ending the set. By doing it the
	6234	* opposite of this, there is no possibility that the count will be
	6235	* momentarily incremented to 2. (In a tie and both are in the set or
	6236	* both not in the set, it doesn't matter which we take first.) */
	6237	if (array_a[i_a] < array_b[i_b]
	6238	\|\| (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
	6239	{
	6240	cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
	6241	cp= array_a[i_a++];
	6242	}
	6243	else {
	6244	cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
	6245	cp= array_b[i_b++];
	6246	}
	6247
	6248	/* Here, have chosen which of the two inputs to look at. Only output
	6249	* if the running count changes to/from 2, which marks the
	6250	* beginning/end of a range that's in the intersection */
	6251	if (cp_in_set) {
	6252	count++;
	6253	if (count == 2) {
	6254	array_r[i_r++] = cp;
	6255	}
	6256	}
	6257	else {
	6258	if (count == 2) {
	6259	array_r[i_r++] = cp;
	6260	}
	6261	count--;
	6262	}
	6263	}
	6264
	6265	/* Here, we are finished going through at least one of the lists, which
	6266	* means there is something remaining in at most one. We check if the list
	6267	* that has been exhausted is positioned such that we are in the middle
	6268	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	6269	* the ones we care about.) There are four cases:
	6270	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	6271	* nothing left in the intersection.
	6272	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	6273	* above 2. What should be output is exactly that which is in the
	6274	* non-exhausted set, as everything it has is also in the intersection
	6275	* set, and everything it doesn't have can't be in the intersection
	6276	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	6277	* gets incremented to 2. Like the previous case, the intersection is
	6278	* everything that remains in the non-exhausted set.
	6279	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	6280	* remains 1. And the intersection has nothing more. */
	6281	if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
	6282	\|\| (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
	6283	{
	6284	count++;
	6285	}
	6286
	6287	/* The final length is what we've output so far plus what else is in the
	6288	* intersection. At most one of the subexpressions below will be non-zero */
	6289	len_r = i_r;
	6290	if (count >= 2) {
	6291	len_r += (len_a - i_a) + (len_b - i_b);
	6292	}
	6293
	6294	/* Set result to final length, which can change the pointer to array_r, so
	6295	* re-find it */
	6296	if (len_r != invlist_len(r)) {
	6297	invlist_set_len(r, len_r);
	6298	invlist_trim(r);
	6299	array_r = invlist_array(r);
	6300	}
	6301
	6302	/* Finish outputting any remaining */
	6303	if (count >= 2) { /* At most one will have a non-zero copy count */
	6304	IV copy_count;
	6305	if ((copy_count = len_a - i_a) > 0) {
	6306	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	6307	}
	6308	else if ((copy_count = len_b - i_b) > 0) {
	6309	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	6310	}
	6311	}
	6312
	6313	return r;
	6314	}
	6315
	6316	STATIC HV*
	6317	S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
	6318	{
	6319	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	6320	* set. A pointer to the inversion list is returned. This may actually be
	6321	* a new list, in which case the passed in one has been destroyed. The
	6322	* passed in inversion list can be NULL, in which case a new one is created
	6323	* with just the one range in it */
	6324
	6325	HV* range_invlist;
	6326	HV* added_invlist;
	6327	UV len;
	6328
	6329	if (invlist == NULL) {
	6330	invlist = _new_invlist(2);
	6331	len = 0;
	6332	}
	6333	else {
	6334	len = invlist_len(invlist);
	6335	}
	6336
	6337	/* If comes after the final entry, can just append it to the end */
	6338	if (len == 0
	6339	\|\| start >= invlist_array(invlist)
	6340	[invlist_len(invlist) - 1])
	6341	{
	6342	_append_range_to_invlist(invlist, start, end);
	6343	return invlist;
	6344	}
	6345
	6346	/* Here, can't just append things, create and return a new inversion list
	6347	* which is the union of this range and the existing inversion list */
	6348	range_invlist = _new_invlist(2);
	6349	_append_range_to_invlist(range_invlist, start, end);
	6350
	6351	added_invlist = invlist_union(invlist, range_invlist);
	6352
	6353	/* The passed in list can be freed, as well as our temporary */
	6354	invlist_destroy(range_invlist);
	6355	if (invlist != added_invlist) {
	6356	invlist_destroy(invlist);
	6357	}
	6358
	6359	return added_invlist;
	6360	}
	6361
	6362	PERL_STATIC_INLINE HV*
	6363	S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
	6364	return add_range_to_invlist(invlist, cp, cp);
	6365	}
	6366
	6367	/* End of inversion list object */
	6368
	6369	/*
	6370	- reg - regular expression, i.e. main body or parenthesized thing
	6371	*
	6372	* Caller must absorb opening parenthesis.
	6373	*
	6374	* Combining parenthesis handling with the base level of regular expression
	6375	* is a trifle forced, but the need to tie the tails of the branches to what
	6376	* follows makes it hard to avoid.
	6377	*/
	6378	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	6379	#ifdef DEBUGGING
	6380	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	6381	#else
	6382	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	6383	#endif
	6384
	6385	STATIC regnode *
	6386	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	6387	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
	6388	{
	6389	dVAR;
	6390	register regnode ret; / Will be the head of the group. */
	6391	register regnode *br;
	6392	register regnode *lastbr;
	6393	register regnode *ender = NULL;
	6394	register I32 parno = 0;
	6395	I32 flags;
	6396	U32 oregflags = RExC_flags;
	6397	bool have_branch = 0;
	6398	bool is_open = 0;
	6399	I32 freeze_paren = 0;
	6400	I32 after_freeze = 0;
	6401
	6402	/* for (?g), (?gc), and (?o) warnings; warning
	6403	about (?c) will warn about (?g) -- japhy */
	6404
	6405	#define WASTED_O 0x01
	6406	#define WASTED_G 0x02
	6407	#define WASTED_C 0x04
	6408	#define WASTED_GC (0x02\|0x04)
	6409	I32 wastedflags = 0x00;
	6410
	6411	char * parse_start = RExC_parse; /* MJD */
	6412	char * const oregcomp_parse = RExC_parse;
	6413
	6414	GET_RE_DEBUG_FLAGS_DECL;
	6415
	6416	PERL_ARGS_ASSERT_REG;
	6417	DEBUG_PARSE("reg ");
	6418
	6419	flagp = 0; / Tentatively. */
	6420
	6421
	6422	/* Make an OPEN node, if parenthesized. */
	6423	if (paren) {
	6424	if ( RExC_parse == '') { /* (VERB:ARG) /
	6425	char *start_verb = RExC_parse;
	6426	STRLEN verb_len = 0;
	6427	char *start_arg = NULL;
	6428	unsigned char op = 0;
	6429	int argok = 1;
	6430	int internal_argval = 0; /* internal_argval is only useful if !argok */
	6431	while ( RExC_parse && RExC_parse != ')' ) {
	6432	if ( *RExC_parse == ':' ) {
	6433	start_arg = RExC_parse + 1;
	6434	break;
	6435	}
	6436	RExC_parse++;
	6437	}
	6438	++start_verb;
	6439	verb_len = RExC_parse - start_verb;
	6440	if ( start_arg ) {
	6441	RExC_parse++;
	6442	while ( RExC_parse && RExC_parse != ')' )
	6443	RExC_parse++;
	6444	if ( *RExC_parse != ')' )
	6445	vFAIL("Unterminated verb pattern argument");
	6446	if ( RExC_parse == start_arg )
	6447	start_arg = NULL;
	6448	} else {
	6449	if ( *RExC_parse != ')' )
	6450	vFAIL("Unterminated verb pattern");
	6451	}
	6452
	6453	switch ( *start_verb ) {
	6454	case 'A': /* (ACCEPT) /
	6455	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	6456	op = ACCEPT;
	6457	internal_argval = RExC_nestroot;
	6458	}
	6459	break;
	6460	case 'C': /* (COMMIT) /
	6461	if ( memEQs(start_verb,verb_len,"COMMIT") )
	6462	op = COMMIT;
	6463	break;
	6464	case 'F': /* (FAIL) /
	6465	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	6466	op = OPFAIL;
	6467	argok = 0;
	6468	}
	6469	break;
	6470	case ':': /* (:NAME) /
	6471	case 'M': /* (MARK:NAME) /
	6472	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	6473	op = MARKPOINT;
	6474	argok = -1;
	6475	}
	6476	break;
	6477	case 'P': /* (PRUNE) /
	6478	if ( memEQs(start_verb,verb_len,"PRUNE") )
	6479	op = PRUNE;
	6480	break;
	6481	case 'S': /* (SKIP) /
	6482	if ( memEQs(start_verb,verb_len,"SKIP") )
	6483	op = SKIP;
	6484	break;
	6485	case 'T': /* (THEN) /
	6486	/* [19:06] <TimToady> :: is then */
	6487	if ( memEQs(start_verb,verb_len,"THEN") ) {
	6488	op = CUTGROUP;
	6489	RExC_seen \|= REG_SEEN_CUTGROUP;
	6490	}
	6491	break;
	6492	}
	6493	if ( ! op ) {
	6494	RExC_parse++;
	6495	vFAIL3("Unknown verb pattern '%.*s'",
	6496	verb_len, start_verb);
	6497	}
	6498	if ( argok ) {
	6499	if ( start_arg && internal_argval ) {
	6500	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6501	verb_len, start_verb);
	6502	} else if ( argok < 0 && !start_arg ) {
	6503	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	6504	verb_len, start_verb);
	6505	} else {
	6506	ret = reganode(pRExC_state, op, internal_argval);
	6507	if ( ! internal_argval && ! SIZE_ONLY ) {
	6508	if (start_arg) {
	6509	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	6510	ARG(ret) = add_data( pRExC_state, 1, "S" );
	6511	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	6512	ret->flags = 0;
	6513	} else {
	6514	ret->flags = 1;
	6515	}
	6516	}
	6517	}
	6518	if (!internal_argval)
	6519	RExC_seen \|= REG_SEEN_VERBARG;
	6520	} else if ( start_arg ) {
	6521	vFAIL3("Verb pattern '%.*s' may not have an argument",
	6522	verb_len, start_verb);
	6523	} else {
	6524	ret = reg_node(pRExC_state, op);
	6525	}
	6526	nextchar(pRExC_state);
	6527	return ret;
	6528	} else
	6529	if (RExC_parse == '?') { / (?...) */
	6530	bool is_logical = 0;
	6531	const char * const seqstart = RExC_parse;
	6532	bool has_use_defaults = FALSE;
	6533
	6534	RExC_parse++;
	6535	paren = *RExC_parse++;
	6536	ret = NULL; /* For look-ahead/behind. */
	6537	switch (paren) {
	6538
	6539	case 'P': /* (?P...) variants for those used to PCRE/Python */
	6540	paren = *RExC_parse++;
	6541	if ( paren == '<') /* (?P<...>) named capture */
	6542	goto named_capture;
	6543	else if (paren == '>') { /* (?P>name) named recursion */
	6544	goto named_recursion;
	6545	}
	6546	else if (paren == '=') { /* (?P=...) named backref */
	6547	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	6548	you change this make sure you change that */
	6549	char* name_start = RExC_parse;
	6550	U32 num = 0;
	6551	SV *sv_dat = reg_scan_name(pRExC_state,
	6552	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6553	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	6554	vFAIL2("Sequence %.3s... not terminated",parse_start);
	6555
	6556	if (!SIZE_ONLY) {
	6557	num = add_data( pRExC_state, 1, "S" );
	6558	RExC_rxi->data->data[num]=(void*)sv_dat;
	6559	SvREFCNT_inc_simple_void(sv_dat);
	6560	}
	6561	RExC_sawback = 1;
	6562	ret = reganode(pRExC_state,
	6563	((! FOLD)
	6564	? NREF
	6565	: (MORE_ASCII_RESTRICTED)
	6566	? NREFFA
	6567	: (AT_LEAST_UNI_SEMANTICS)
	6568	? NREFFU
	6569	: (LOC)
	6570	? NREFFL
	6571	: NREFF),
	6572	num);
	6573	*flagp \|= HASWIDTH;
	6574
	6575	Set_Node_Offset(ret, parse_start+1);
	6576	Set_Node_Cur_Length(ret); /* MJD */
	6577
	6578	nextchar(pRExC_state);
	6579	return ret;
	6580	}
	6581	RExC_parse++;
	6582	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6583	/NOTREACHED/
	6584	case '<': /* (?<...) */
	6585	if (*RExC_parse == '!')
	6586	paren = ',';
	6587	else if (*RExC_parse != '=')
	6588	named_capture:
	6589	{ /* (?<...>) */
	6590	char *name_start;
	6591	SV *svname;
	6592	paren= '>';
	6593	case '\'': /* (?'...') */
	6594	name_start= RExC_parse;
	6595	svname = reg_scan_name(pRExC_state,
	6596	SIZE_ONLY ? /* reverse test from the others */
	6597	REG_RSN_RETURN_NAME :
	6598	REG_RSN_RETURN_NULL);
	6599	if (RExC_parse == name_start) {
	6600	RExC_parse++;
	6601	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6602	/NOTREACHED/
	6603	}
	6604	if (*RExC_parse != paren)
	6605	vFAIL2("Sequence (?%c... not terminated",
	6606	paren=='>' ? '<' : paren);
	6607	if (SIZE_ONLY) {
	6608	HE *he_str;
	6609	SV *sv_dat = NULL;
	6610	if (!svname) /* shouldn't happen */
	6611	Perl_croak(aTHX_
	6612	"panic: reg_scan_name returned NULL");
	6613	if (!RExC_paren_names) {
	6614	RExC_paren_names= newHV();
	6615	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	6616	#ifdef DEBUGGING
	6617	RExC_paren_name_list= newAV();
	6618	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	6619	#endif
	6620	}
	6621	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	6622	if ( he_str )
	6623	sv_dat = HeVAL(he_str);
	6624	if ( ! sv_dat ) {
	6625	/* croak baby croak */
	6626	Perl_croak(aTHX_
	6627	"panic: paren_name hash element allocation failed");
	6628	} else if ( SvPOK(sv_dat) ) {
	6629	/* (?\|...) can mean we have dupes so scan to check
	6630	its already been stored. Maybe a flag indicating
	6631	we are inside such a construct would be useful,
	6632	but the arrays are likely to be quite small, so
	6633	for now we punt -- dmq */
	6634	IV count = SvIV(sv_dat);
	6635	I32 pv = (I32)SvPVX(sv_dat);
	6636	IV i;
	6637	for ( i = 0 ; i < count ; i++ ) {
	6638	if ( pv[i] == RExC_npar ) {
	6639	count = 0;
	6640	break;
	6641	}
	6642	}
	6643	if ( count ) {
	6644	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	6645	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	6646	pv[count] = RExC_npar;
	6647	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	6648	}
	6649	} else {
	6650	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	6651	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	6652	SvIOK_on(sv_dat);
	6653	SvIV_set(sv_dat, 1);
	6654	}
	6655	#ifdef DEBUGGING
	6656	/* Yes this does cause a memory leak in debugging Perls */
	6657	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	6658	SvREFCNT_dec(svname);
	6659	#endif
	6660
	6661	/sv_dump(sv_dat);/
	6662	}
	6663	nextchar(pRExC_state);
	6664	paren = 1;
	6665	goto capturing_parens;
	6666	}
	6667	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	6668	RExC_in_lookbehind++;
	6669	RExC_parse++;
	6670	case '=': /* (?=...) */
	6671	RExC_seen_zerolen++;
	6672	break;
	6673	case '!': /* (?!...) */
	6674	RExC_seen_zerolen++;
	6675	if (*RExC_parse == ')') {
	6676	ret=reg_node(pRExC_state, OPFAIL);
	6677	nextchar(pRExC_state);
	6678	return ret;
	6679	}
	6680	break;
	6681	case '\|': /* (?\|...) */
	6682	/* branch reset, behave like a (?:...) except that
	6683	buffers in alternations share the same numbers */
	6684	paren = ':';
	6685	after_freeze = freeze_paren = RExC_npar;
	6686	break;
	6687	case ':': /* (?:...) */
	6688	case '>': /* (?>...) */
	6689	break;
	6690	case '$': /* (?$...) */
	6691	case '@': /* (?@...) */
	6692	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	6693	break;
	6694	case '#': /* (?#...) */
	6695	while (RExC_parse && RExC_parse != ')')
	6696	RExC_parse++;
	6697	if (*RExC_parse != ')')
	6698	FAIL("Sequence (?#... not terminated");
	6699	nextchar(pRExC_state);
	6700	*flagp = TRYAGAIN;
	6701	return NULL;
	6702	case '0' : /* (?0) */
	6703	case 'R' : /* (?R) */
	6704	if (*RExC_parse != ')')
	6705	FAIL("Sequence (?R) not terminated");
	6706	ret = reg_node(pRExC_state, GOSTART);
	6707	*flagp \|= POSTPONED;
	6708	nextchar(pRExC_state);
	6709	return ret;
	6710	/notreached/
	6711	{ /* named and numeric backreferences */
	6712	I32 num;
	6713	case '&': /* (?&NAME) */
	6714	parse_start = RExC_parse - 1;
	6715	named_recursion:
	6716	{
	6717	SV *sv_dat = reg_scan_name(pRExC_state,
	6718	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6719	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	6720	}
	6721	goto gen_recurse_regop;
	6722	/* NOT REACHED */
	6723	case '+':
	6724	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	6725	RExC_parse++;
	6726	vFAIL("Illegal pattern");
	6727	}
	6728	goto parse_recursion;
	6729	/* NOT REACHED*/
	6730	case '-': /* (?-1) */
	6731	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	6732	RExC_parse--; /* rewind to let it be handled later */
	6733	goto parse_flags;
	6734	}
	6735	/FALLTHROUGH /
	6736	case '1': case '2': case '3': case '4': /* (?1) */
	6737	case '5': case '6': case '7': case '8': case '9':
	6738	RExC_parse--;
	6739	parse_recursion:
	6740	num = atoi(RExC_parse);
	6741	parse_start = RExC_parse - 1; /* MJD */
	6742	if (*RExC_parse == '-')
	6743	RExC_parse++;
	6744	while (isDIGIT(*RExC_parse))
	6745	RExC_parse++;
	6746	if (*RExC_parse!=')')
	6747	vFAIL("Expecting close bracket");
	6748
	6749	gen_recurse_regop:
	6750	if ( paren == '-' ) {
	6751	/*
	6752	Diagram of capture buffer numbering.
	6753	Top line is the normal capture buffer numbers
	6754	Bottom line is the negative indexing as from
	6755	the X (the (?-2))
	6756
	6757	+ 1 2 3 4 5 X 6 7
	6758	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	6759	- 5 4 3 2 1 X x x
	6760
	6761	*/
	6762	num = RExC_npar + num;
	6763	if (num < 1) {
	6764	RExC_parse++;
	6765	vFAIL("Reference to nonexistent group");
	6766	}
	6767	} else if ( paren == '+' ) {
	6768	num = RExC_npar + num - 1;
	6769	}
	6770
	6771	ret = reganode(pRExC_state, GOSUB, num);
	6772	if (!SIZE_ONLY) {
	6773	if (num > (I32)RExC_rx->nparens) {
	6774	RExC_parse++;
	6775	vFAIL("Reference to nonexistent group");
	6776	}
	6777	ARG2L_SET( ret, RExC_recurse_count++);
	6778	RExC_emit++;
	6779	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	6780	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	6781	} else {
	6782	RExC_size++;
	6783	}
	6784	RExC_seen \|= REG_SEEN_RECURSE;
	6785	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	6786	Set_Node_Offset(ret, parse_start); /* MJD */
	6787
	6788	*flagp \|= POSTPONED;
	6789	nextchar(pRExC_state);
	6790	return ret;
	6791	} /* named and numeric backreferences */
	6792	/* NOT REACHED */
	6793
	6794	case '?': /* (??...) */
	6795	is_logical = 1;
	6796	if (*RExC_parse != '{') {
	6797	RExC_parse++;
	6798	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	6799	/NOTREACHED/
	6800	}
	6801	*flagp \|= POSTPONED;
	6802	paren = *RExC_parse++;
	6803	/* FALL THROUGH */
	6804	case '{': /* (?{...}) */
	6805	{
	6806	I32 count = 1;
	6807	U32 n = 0;
	6808	char c;
	6809	char *s = RExC_parse;
	6810
	6811	RExC_seen_zerolen++;
	6812	RExC_seen \|= REG_SEEN_EVAL;
	6813	while (count && (c = *RExC_parse)) {
	6814	if (c == '\\') {
	6815	if (RExC_parse[1])
	6816	RExC_parse++;
	6817	}
	6818	else if (c == '{')
	6819	count++;
	6820	else if (c == '}')
	6821	count--;
	6822	RExC_parse++;
	6823	}
	6824	if (*RExC_parse != ')') {
	6825	RExC_parse = s;
	6826	vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
	6827	}
	6828	if (!SIZE_ONLY) {
	6829	PAD *pad;
	6830	OP_4tree sop, rop;
	6831	SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
	6832
	6833	ENTER;
	6834	Perl_save_re_context(aTHX);
	6835	rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
	6836	sop->op_private \|= OPpREFCOUNTED;
	6837	/* re_dup will OpREFCNT_inc */
	6838	OpREFCNT_set(sop, 1);
	6839	LEAVE;
	6840
	6841	n = add_data(pRExC_state, 3, "nop");
	6842	RExC_rxi->data->data[n] = (void*)rop;
	6843	RExC_rxi->data->data[n+1] = (void*)sop;
	6844	RExC_rxi->data->data[n+2] = (void*)pad;
	6845	SvREFCNT_dec(sv);
	6846	}
	6847	else { /* First pass */
	6848	if (PL_reginterp_cnt < ++RExC_seen_evals
	6849	&& IN_PERL_RUNTIME)
	6850	/* No compiled RE interpolated, has runtime
	6851	components ===> unsafe. */
	6852	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	6853	if (PL_tainting && PL_tainted)
	6854	FAIL("Eval-group in insecure regular expression");
	6855	#if PERL_VERSION > 8
	6856	if (IN_PERL_COMPILETIME)
	6857	PL_cv_has_eval = 1;
	6858	#endif
	6859	}
	6860
	6861	nextchar(pRExC_state);
	6862	if (is_logical) {
	6863	ret = reg_node(pRExC_state, LOGICAL);
	6864	if (!SIZE_ONLY)
	6865	ret->flags = 2;
	6866	REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
	6867	/* deal with the length of this later - MJD */
	6868	return ret;
	6869	}
	6870	ret = reganode(pRExC_state, EVAL, n);
	6871	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	6872	Set_Node_Offset(ret, parse_start);
	6873	return ret;
	6874	}
	6875	case '(': /* (?(?{...})...) and (?(?=...)...) */
	6876	{
	6877	int is_define= 0;
	6878	if (RExC_parse[0] == '?') { /* (?(?...)) */
	6879	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	6880	\|\| RExC_parse[1] == '<'
	6881	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	6882	I32 flag;
	6883
	6884	ret = reg_node(pRExC_state, LOGICAL);
	6885	if (!SIZE_ONLY)
	6886	ret->flags = 1;
	6887	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	6888	goto insert_if;
	6889	}
	6890	}
	6891	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	6892	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	6893	{
	6894	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	6895	char *name_start= RExC_parse++;
	6896	U32 num = 0;
	6897	SV *sv_dat=reg_scan_name(pRExC_state,
	6898	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6899	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	6900	vFAIL2("Sequence (?(%c... not terminated",
	6901	(ch == '>' ? '<' : ch));
	6902	RExC_parse++;
	6903	if (!SIZE_ONLY) {
	6904	num = add_data( pRExC_state, 1, "S" );
	6905	RExC_rxi->data->data[num]=(void*)sv_dat;
	6906	SvREFCNT_inc_simple_void(sv_dat);
	6907	}
	6908	ret = reganode(pRExC_state,NGROUPP,num);
	6909	goto insert_if_check_paren;
	6910	}
	6911	else if (RExC_parse[0] == 'D' &&
	6912	RExC_parse[1] == 'E' &&
	6913	RExC_parse[2] == 'F' &&
	6914	RExC_parse[3] == 'I' &&
	6915	RExC_parse[4] == 'N' &&
	6916	RExC_parse[5] == 'E')
	6917	{
	6918	ret = reganode(pRExC_state,DEFINEP,0);
	6919	RExC_parse +=6 ;
	6920	is_define = 1;
	6921	goto insert_if_check_paren;
	6922	}
	6923	else if (RExC_parse[0] == 'R') {
	6924	RExC_parse++;
	6925	parno = 0;
	6926	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6927	parno = atoi(RExC_parse++);
	6928	while (isDIGIT(*RExC_parse))
	6929	RExC_parse++;
	6930	} else if (RExC_parse[0] == '&') {
	6931	SV *sv_dat;
	6932	RExC_parse++;
	6933	sv_dat = reg_scan_name(pRExC_state,
	6934	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	6935	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	6936	}
	6937	ret = reganode(pRExC_state,INSUBP,parno);
	6938	goto insert_if_check_paren;
	6939	}
	6940	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	6941	/* (?(1)...) */
	6942	char c;
	6943	parno = atoi(RExC_parse++);
	6944
	6945	while (isDIGIT(*RExC_parse))
	6946	RExC_parse++;
	6947	ret = reganode(pRExC_state, GROUPP, parno);
	6948
	6949	insert_if_check_paren:
	6950	if ((c = *nextchar(pRExC_state)) != ')')
	6951	vFAIL("Switch condition not recognized");
	6952	insert_if:
	6953	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	6954	br = regbranch(pRExC_state, &flags, 1,depth+1);
	6955	if (br == NULL)
	6956	br = reganode(pRExC_state, LONGJMP, 0);
	6957	else
	6958	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	6959	c = *nextchar(pRExC_state);
	6960	if (flags&HASWIDTH)
	6961	*flagp \|= HASWIDTH;
	6962	if (c == '\|') {
	6963	if (is_define)
	6964	vFAIL("(?(DEFINE)....) does not allow branches");
	6965	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	6966	regbranch(pRExC_state, &flags, 1,depth+1);
	6967	REGTAIL(pRExC_state, ret, lastbr);
	6968	if (flags&HASWIDTH)
	6969	*flagp \|= HASWIDTH;
	6970	c = *nextchar(pRExC_state);
	6971	}
	6972	else
	6973	lastbr = NULL;
	6974	if (c != ')')
	6975	vFAIL("Switch (?(condition)... contains too many branches");
	6976	ender = reg_node(pRExC_state, TAIL);
	6977	REGTAIL(pRExC_state, br, ender);
	6978	if (lastbr) {
	6979	REGTAIL(pRExC_state, lastbr, ender);
	6980	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	6981	}
	6982	else
	6983	REGTAIL(pRExC_state, ret, ender);
	6984	RExC_size++; /* XXX WHY do we need this?!!
	6985	For large programs it seems to be required
	6986	but I can't figure out why. -- dmq*/
	6987	return ret;
	6988	}
	6989	else {
	6990	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	6991	}
	6992	}
	6993	case 0:
	6994	RExC_parse--; /* for vFAIL to print correctly */
	6995	vFAIL("Sequence (? incomplete");
	6996	break;
	6997	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	6998	that follow */
	6999	has_use_defaults = TRUE;
	7000	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	7001	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	7002	? REGEX_UNICODE_CHARSET
	7003	: REGEX_DEPENDS_CHARSET);
	7004	goto parse_flags;
	7005	default:
	7006	--RExC_parse;
	7007	parse_flags: /* (?i) */
	7008	{
	7009	U32 posflags = 0, negflags = 0;
	7010	U32 *flagsp = &posflags;
	7011	char has_charset_modifier = '\0';
	7012	regex_charset cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7013	? REGEX_UNICODE_CHARSET
	7014	: REGEX_DEPENDS_CHARSET;
	7015
	7016	while (*RExC_parse) {
	7017	/* && strchr("iogcmsx", RExC_parse) /
	7018	/* (?g), (?gc) and (?o) are useless here
	7019	and must be globally applied -- japhy */
	7020	switch (*RExC_parse) {
	7021	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	7022	case LOCALE_PAT_MOD:
	7023	if (has_charset_modifier) {
	7024	goto excess_modifier;
	7025	}
	7026	else if (flagsp == &negflags) {
	7027	goto neg_modifier;
	7028	}
	7029	cs = REGEX_LOCALE_CHARSET;
	7030	has_charset_modifier = LOCALE_PAT_MOD;
	7031	RExC_contains_locale = 1;
	7032	break;
	7033	case UNICODE_PAT_MOD:
	7034	if (has_charset_modifier) {
	7035	goto excess_modifier;
	7036	}
	7037	else if (flagsp == &negflags) {
	7038	goto neg_modifier;
	7039	}
	7040	cs = REGEX_UNICODE_CHARSET;
	7041	has_charset_modifier = UNICODE_PAT_MOD;
	7042	break;
	7043	case ASCII_RESTRICT_PAT_MOD:
	7044	if (flagsp == &negflags) {
	7045	goto neg_modifier;
	7046	}
	7047	if (has_charset_modifier) {
	7048	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	7049	goto excess_modifier;
	7050	}
	7051	/* Doubled modifier implies more restricted */
	7052	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	7053	}
	7054	else {
	7055	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	7056	}
	7057	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	7058	break;
	7059	case DEPENDS_PAT_MOD:
	7060	if (has_use_defaults) {
	7061	goto fail_modifiers;
	7062	}
	7063	else if (flagsp == &negflags) {
	7064	goto neg_modifier;
	7065	}
	7066	else if (has_charset_modifier) {
	7067	goto excess_modifier;
	7068	}
	7069
	7070	/* The dual charset means unicode semantics if the
	7071	* pattern (or target, not known until runtime) are
	7072	* utf8, or something in the pattern indicates unicode
	7073	* semantics */
	7074	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	7075	? REGEX_UNICODE_CHARSET
	7076	: REGEX_DEPENDS_CHARSET;
	7077	has_charset_modifier = DEPENDS_PAT_MOD;
	7078	break;
	7079	excess_modifier:
	7080	RExC_parse++;
	7081	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	7082	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	7083	}
	7084	else if (has_charset_modifier == *(RExC_parse - 1)) {
	7085	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	7086	}
	7087	else {
	7088	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	7089	}
	7090	/NOTREACHED/
	7091	neg_modifier:
	7092	RExC_parse++;
	7093	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	7094	/NOTREACHED/
	7095	case ONCE_PAT_MOD: /* 'o' */
	7096	case GLOBAL_PAT_MOD: /* 'g' */
	7097	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7098	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	7099	if (! (wastedflags & wflagbit) ) {
	7100	wastedflags \|= wflagbit;
	7101	vWARN5(
	7102	RExC_parse + 1,
	7103	"Useless (%s%c) - %suse /%c modifier",
	7104	flagsp == &negflags ? "?-" : "?",
	7105	*RExC_parse,
	7106	flagsp == &negflags ? "don't " : "",
	7107	*RExC_parse
	7108	);
	7109	}
	7110	}
	7111	break;
	7112
	7113	case CONTINUE_PAT_MOD: /* 'c' */
	7114	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	7115	if (! (wastedflags & WASTED_C) ) {
	7116	wastedflags \|= WASTED_GC;
	7117	vWARN3(
	7118	RExC_parse + 1,
	7119	"Useless (%sc) - %suse /gc modifier",
	7120	flagsp == &negflags ? "?-" : "?",
	7121	flagsp == &negflags ? "don't " : ""
	7122	);
	7123	}
	7124	}
	7125	break;
	7126	case KEEPCOPY_PAT_MOD: /* 'p' */
	7127	if (flagsp == &negflags) {
	7128	if (SIZE_ONLY)
	7129	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	7130	} else {
	7131	*flagsp \|= RXf_PMf_KEEPCOPY;
	7132	}
	7133	break;
	7134	case '-':
	7135	/* A flag is a default iff it is following a minus, so
	7136	* if there is a minus, it means will be trying to
	7137	* re-specify a default which is an error */
	7138	if (has_use_defaults \|\| flagsp == &negflags) {
	7139	fail_modifiers:
	7140	RExC_parse++;
	7141	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7142	/NOTREACHED/
	7143	}
	7144	flagsp = &negflags;
	7145	wastedflags = 0; /* reset so (?g-c) warns twice */
	7146	break;
	7147	case ':':
	7148	paren = ':';
	7149	/FALLTHROUGH/
	7150	case ')':
	7151	RExC_flags \|= posflags;
	7152	RExC_flags &= ~negflags;
	7153	set_regex_charset(&RExC_flags, cs);
	7154	if (paren != ':') {
	7155	oregflags \|= posflags;
	7156	oregflags &= ~negflags;
	7157	set_regex_charset(&oregflags, cs);
	7158	}
	7159	nextchar(pRExC_state);
	7160	if (paren != ':') {
	7161	*flagp = TRYAGAIN;
	7162	return NULL;
	7163	} else {
	7164	ret = NULL;
	7165	goto parse_rest;
	7166	}
	7167	/NOTREACHED/
	7168	default:
	7169	RExC_parse++;
	7170	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	7171	/NOTREACHED/
	7172	}
	7173	++RExC_parse;
	7174	}
	7175	}} /* one for the default block, one for the switch */
	7176	}
	7177	else { /* (...) */
	7178	capturing_parens:
	7179	parno = RExC_npar;
	7180	RExC_npar++;
	7181
	7182	ret = reganode(pRExC_state, OPEN, parno);
	7183	if (!SIZE_ONLY ){
	7184	if (!RExC_nestroot)
	7185	RExC_nestroot = parno;
	7186	if (RExC_seen & REG_SEEN_RECURSE
	7187	&& !RExC_open_parens[parno-1])
	7188	{
	7189	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7190	"Setting open paren #%"IVdf" to %d\n",
	7191	(IV)parno, REG_NODE_NUM(ret)));
	7192	RExC_open_parens[parno-1]= ret;
	7193	}
	7194	}
	7195	Set_Node_Length(ret, 1); /* MJD */
	7196	Set_Node_Offset(ret, RExC_parse); /* MJD */
	7197	is_open = 1;
	7198	}
	7199	}
	7200	else /* ! paren */
	7201	ret = NULL;
	7202
	7203	parse_rest:
	7204	/* Pick up the branches, linking them together. */
	7205	parse_start = RExC_parse; /* MJD */
	7206	br = regbranch(pRExC_state, &flags, 1,depth+1);
	7207
	7208	/* branch_len = (paren != 0); */
	7209
	7210	if (br == NULL)
	7211	return(NULL);
	7212	if (*RExC_parse == '\|') {
	7213	if (!SIZE_ONLY && RExC_extralen) {
	7214	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	7215	}
	7216	else { /* MJD */
	7217	reginsert(pRExC_state, BRANCH, br, depth+1);
	7218	Set_Node_Length(br, paren != 0);
	7219	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	7220	}
	7221	have_branch = 1;
	7222	if (SIZE_ONLY)
	7223	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	7224	}
	7225	else if (paren == ':') {
	7226	*flagp \|= flags&SIMPLE;
	7227	}
	7228	if (is_open) { /* Starts with OPEN. */
	7229	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	7230	}
	7231	else if (paren != '?') /* Not Conditional */
	7232	ret = br;
	7233	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7234	lastbr = br;
	7235	while (*RExC_parse == '\|') {
	7236	if (!SIZE_ONLY && RExC_extralen) {
	7237	ender = reganode(pRExC_state, LONGJMP,0);
	7238	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	7239	}
	7240	if (SIZE_ONLY)
	7241	RExC_extralen += 2; /* Account for LONGJMP. */
	7242	nextchar(pRExC_state);
	7243	if (freeze_paren) {
	7244	if (RExC_npar > after_freeze)
	7245	after_freeze = RExC_npar;
	7246	RExC_npar = freeze_paren;
	7247	}
	7248	br = regbranch(pRExC_state, &flags, 0, depth+1);
	7249
	7250	if (br == NULL)
	7251	return(NULL);
	7252	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	7253	lastbr = br;
	7254	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	7255	}
	7256
	7257	if (have_branch \|\| paren != ':') {
	7258	/* Make a closing node, and hook it on the end. */
	7259	switch (paren) {
	7260	case ':':
	7261	ender = reg_node(pRExC_state, TAIL);
	7262	break;
	7263	case 1:
	7264	ender = reganode(pRExC_state, CLOSE, parno);
	7265	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	7266	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	7267	"Setting close paren #%"IVdf" to %d\n",
	7268	(IV)parno, REG_NODE_NUM(ender)));
	7269	RExC_close_parens[parno-1]= ender;
	7270	if (RExC_nestroot == parno)
	7271	RExC_nestroot = 0;
	7272	}
	7273	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	7274	Set_Node_Length(ender,1); /* MJD */
	7275	break;
	7276	case '<':
	7277	case ',':
	7278	case '=':
	7279	case '!':
	7280	*flagp &= ~HASWIDTH;
	7281	/* FALL THROUGH */
	7282	case '>':
	7283	ender = reg_node(pRExC_state, SUCCEED);
	7284	break;
	7285	case 0:
	7286	ender = reg_node(pRExC_state, END);
	7287	if (!SIZE_ONLY) {
	7288	assert(!RExC_opend); /* there can only be one! */
	7289	RExC_opend = ender;
	7290	}
	7291	break;
	7292	}
	7293	REGTAIL(pRExC_state, lastbr, ender);
	7294
	7295	if (have_branch && !SIZE_ONLY) {
	7296	if (depth==1)
	7297	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	7298
	7299	/* Hook the tails of the branches to the closing node. */
	7300	for (br = ret; br; br = regnext(br)) {
	7301	const U8 op = PL_regkind[OP(br)];
	7302	if (op == BRANCH) {
	7303	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	7304	}
	7305	else if (op == BRANCHJ) {
	7306	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	7307	}
	7308	}
	7309	}
	7310	}
	7311
	7312	{
	7313	const char *p;
	7314	static const char parens[] = "=!<,>";
	7315
	7316	if (paren && (p = strchr(parens, paren))) {
	7317	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	7318	int flag = (p - parens) > 1;
	7319
	7320	if (paren == '>')
	7321	node = SUSPEND, flag = 0;
	7322	reginsert(pRExC_state, node,ret, depth+1);
	7323	Set_Node_Cur_Length(ret);
	7324	Set_Node_Offset(ret, parse_start + 1);
	7325	ret->flags = flag;
	7326	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	7327	}
	7328	}
	7329
	7330	/* Check for proper termination. */
	7331	if (paren) {
	7332	RExC_flags = oregflags;
	7333	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	7334	RExC_parse = oregcomp_parse;
	7335	vFAIL("Unmatched (");
	7336	}
	7337	}
	7338	else if (!paren && RExC_parse < RExC_end) {
	7339	if (*RExC_parse == ')') {
	7340	RExC_parse++;
	7341	vFAIL("Unmatched )");
	7342	}
	7343	else
	7344	FAIL("Junk on end of regexp"); /* "Can't happen". */
	7345	/* NOTREACHED */
	7346	}
	7347
	7348	if (RExC_in_lookbehind) {
	7349	RExC_in_lookbehind--;
	7350	}
	7351	if (after_freeze > RExC_npar)
	7352	RExC_npar = after_freeze;
	7353	return(ret);
	7354	}
	7355
	7356	/*
	7357	- regbranch - one alternative of an \| operator
	7358	*
	7359	* Implements the concatenation operator.
	7360	*/
	7361	STATIC regnode *
	7362	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	7363	{
	7364	dVAR;
	7365	register regnode *ret;
	7366	register regnode *chain = NULL;
	7367	register regnode *latest;
	7368	I32 flags = 0, c = 0;
	7369	GET_RE_DEBUG_FLAGS_DECL;
	7370
	7371	PERL_ARGS_ASSERT_REGBRANCH;
	7372
	7373	DEBUG_PARSE("brnc");
	7374
	7375	if (first)
	7376	ret = NULL;
	7377	else {
	7378	if (!SIZE_ONLY && RExC_extralen)
	7379	ret = reganode(pRExC_state, BRANCHJ,0);
	7380	else {
	7381	ret = reg_node(pRExC_state, BRANCH);
	7382	Set_Node_Length(ret, 1);
	7383	}
	7384	}
	7385
	7386	if (!first && SIZE_ONLY)
	7387	RExC_extralen += 1; /* BRANCHJ */
	7388
	7389	flagp = WORST; / Tentatively. */
	7390
	7391	RExC_parse--;
	7392	nextchar(pRExC_state);
	7393	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	7394	flags &= ~TRYAGAIN;
	7395	latest = regpiece(pRExC_state, &flags,depth+1);
	7396	if (latest == NULL) {
	7397	if (flags & TRYAGAIN)
	7398	continue;
	7399	return(NULL);
	7400	}
	7401	else if (ret == NULL)
	7402	ret = latest;
	7403	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	7404	if (chain == NULL) /* First piece. */
	7405	*flagp \|= flags&SPSTART;
	7406	else {
	7407	RExC_naughty++;
	7408	REGTAIL(pRExC_state, chain, latest);
	7409	}
	7410	chain = latest;
	7411	c++;
	7412	}
	7413	if (chain == NULL) { /* Loop ran zero times. */
	7414	chain = reg_node(pRExC_state, NOTHING);
	7415	if (ret == NULL)
	7416	ret = chain;
	7417	}
	7418	if (c == 1) {
	7419	*flagp \|= flags&SIMPLE;
	7420	}
	7421
	7422	return ret;
	7423	}
	7424
	7425	/*
	7426	- regpiece - something followed by possible [*+?]
	7427	*
	7428	* Note that the branching code sequences used for ? and the general cases
	7429	* of * and + are somewhat optimized: they use the same NOTHING node as
	7430	* both the endmarker for their branch list and the body of the last branch.
	7431	* It might seem that this node could be dispensed with entirely, but the
	7432	* endmarker role is not redundant.
	7433	*/
	7434	STATIC regnode *
	7435	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	7436	{
	7437	dVAR;
	7438	register regnode *ret;
	7439	register char op;
	7440	register char *next;
	7441	I32 flags;
	7442	const char * const origparse = RExC_parse;
	7443	I32 min;
	7444	I32 max = REG_INFTY;
	7445	#ifdef RE_TRACK_PATTERN_OFFSETS
	7446	char *parse_start;
	7447	#endif
	7448	const char *maxpos = NULL;
	7449	GET_RE_DEBUG_FLAGS_DECL;
	7450
	7451	PERL_ARGS_ASSERT_REGPIECE;
	7452
	7453	DEBUG_PARSE("piec");
	7454
	7455	ret = regatom(pRExC_state, &flags,depth+1);
	7456	if (ret == NULL) {
	7457	if (flags & TRYAGAIN)
	7458	*flagp \|= TRYAGAIN;
	7459	return(NULL);
	7460	}
	7461
	7462	op = *RExC_parse;
	7463
	7464	if (op == '{' && regcurly(RExC_parse)) {
	7465	maxpos = NULL;
	7466	#ifdef RE_TRACK_PATTERN_OFFSETS
	7467	parse_start = RExC_parse; /* MJD */
	7468	#endif
	7469	next = RExC_parse + 1;
	7470	while (isDIGIT(next) \|\| next == ',') {
	7471	if (*next == ',') {
	7472	if (maxpos)
	7473	break;
	7474	else
	7475	maxpos = next;
	7476	}
	7477	next++;
	7478	}
	7479	if (next == '}') { / got one */
	7480	if (!maxpos)
	7481	maxpos = next;
	7482	RExC_parse++;
	7483	min = atoi(RExC_parse);
	7484	if (*maxpos == ',')
	7485	maxpos++;
	7486	else
	7487	maxpos = RExC_parse;
	7488	max = atoi(maxpos);
	7489	if (!max && *maxpos != '0')
	7490	max = REG_INFTY; /* meaning "infinity" */
	7491	else if (max >= REG_INFTY)
	7492	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	7493	RExC_parse = next;
	7494	nextchar(pRExC_state);
	7495
	7496	do_curly:
	7497	if ((flags&SIMPLE)) {
	7498	RExC_naughty += 2 + RExC_naughty / 2;
	7499	reginsert(pRExC_state, CURLY, ret, depth+1);
	7500	Set_Node_Offset(ret, parse_start+1); /* MJD */
	7501	Set_Node_Cur_Length(ret);
	7502	}
	7503	else {
	7504	regnode * const w = reg_node(pRExC_state, WHILEM);
	7505
	7506	w->flags = 0;
	7507	REGTAIL(pRExC_state, ret, w);
	7508	if (!SIZE_ONLY && RExC_extralen) {
	7509	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	7510	reginsert(pRExC_state, NOTHING,ret, depth+1);
	7511	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	7512	}
	7513	reginsert(pRExC_state, CURLYX,ret, depth+1);
	7514	/* MJD hk */
	7515	Set_Node_Offset(ret, parse_start+1);
	7516	Set_Node_Length(ret,
	7517	op == '{' ? (RExC_parse - parse_start) : 1);
	7518
	7519	if (!SIZE_ONLY && RExC_extralen)
	7520	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	7521	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	7522	if (SIZE_ONLY)
	7523	RExC_whilem_seen++, RExC_extralen += 3;
	7524	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	7525	}
	7526	ret->flags = 0;
	7527
	7528	if (min > 0)
	7529	*flagp = WORST;
	7530	if (max > 0)
	7531	*flagp \|= HASWIDTH;
	7532	if (max < min)
	7533	vFAIL("Can't do {n,m} with n > m");
	7534	if (!SIZE_ONLY) {
	7535	ARG1_SET(ret, (U16)min);
	7536	ARG2_SET(ret, (U16)max);
	7537	}
	7538
	7539	goto nest_check;
	7540	}
	7541	}
	7542
	7543	if (!ISMULT1(op)) {
	7544	*flagp = flags;
	7545	return(ret);
	7546	}
	7547
	7548	#if 0 /* Now runtime fix should be reliable. */
	7549
	7550	/* if this is reinstated, don't forget to put this back into perldiag:
	7551
	7552	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	7553
	7554	(F) The part of the regexp subject to either the * or + quantifier
	7555	could match an empty string. The {#} shows in the regular
	7556	expression about where the problem was discovered.
	7557
	7558	*/
	7559
	7560	if (!(flags&HASWIDTH) && op != '?')
	7561	vFAIL("Regexp *+ operand could be empty");
	7562	#endif
	7563
	7564	#ifdef RE_TRACK_PATTERN_OFFSETS
	7565	parse_start = RExC_parse;
	7566	#endif
	7567	nextchar(pRExC_state);
	7568
	7569	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	7570
	7571	if (op == '*' && (flags&SIMPLE)) {
	7572	reginsert(pRExC_state, STAR, ret, depth+1);
	7573	ret->flags = 0;
	7574	RExC_naughty += 4;
	7575	}
	7576	else if (op == '*') {
	7577	min = 0;
	7578	goto do_curly;
	7579	}
	7580	else if (op == '+' && (flags&SIMPLE)) {
	7581	reginsert(pRExC_state, PLUS, ret, depth+1);
	7582	ret->flags = 0;
	7583	RExC_naughty += 3;
	7584	}
	7585	else if (op == '+') {
	7586	min = 1;
	7587	goto do_curly;
	7588	}
	7589	else if (op == '?') {
	7590	min = 0; max = 1;
	7591	goto do_curly;
	7592	}
	7593	nest_check:
	7594	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	7595	ckWARN3reg(RExC_parse,
	7596	"%.*s matches null string many times",
	7597	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	7598	origparse);
	7599	}
	7600
	7601	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	7602	nextchar(pRExC_state);
	7603	reginsert(pRExC_state, MINMOD, ret, depth+1);
	7604	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	7605	}
	7606	#ifndef REG_ALLOW_MINMOD_SUSPEND
	7607	else
	7608	#endif
	7609	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	7610	regnode *ender;
	7611	nextchar(pRExC_state);
	7612	ender = reg_node(pRExC_state, SUCCEED);
	7613	REGTAIL(pRExC_state, ret, ender);
	7614	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	7615	ret->flags = 0;
	7616	ender = reg_node(pRExC_state, TAIL);
	7617	REGTAIL(pRExC_state, ret, ender);
	7618	/ret= ender;/
	7619	}
	7620
	7621	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	7622	RExC_parse++;
	7623	vFAIL("Nested quantifiers");
	7624	}
	7625
	7626	return(ret);
	7627	}
	7628
	7629
	7630	/* reg_namedseq(pRExC_state,UVp, UV depth)
	7631
	7632	This is expected to be called by a parser routine that has
	7633	recognized '\N' and needs to handle the rest. RExC_parse is
	7634	expected to point at the first char following the N at the time
	7635	of the call.
	7636
	7637	The \N may be inside (indicated by valuep not being NULL) or outside a
	7638	character class.
	7639
	7640	\N may begin either a named sequence, or if outside a character class, mean
	7641	to match a non-newline. For non single-quoted regexes, the tokenizer has
	7642	attempted to decide which, and in the case of a named sequence converted it
	7643	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	7644	where c1... are the characters in the sequence. For single-quoted regexes,
	7645	the tokenizer passes the \N sequence through unchanged; this code will not
	7646	attempt to determine this nor expand those. The net effect is that if the
	7647	beginning of the passed-in pattern isn't '{U+' or there is no '}', it
	7648	signals that this \N occurrence means to match a non-newline.
	7649
	7650	Only the \N{U+...} form should occur in a character class, for the same
	7651	reason that '.' inside a character class means to just match a period: it
	7652	just doesn't make sense.
	7653
	7654	If valuep is non-null then it is assumed that we are parsing inside
	7655	of a charclass definition and the first codepoint in the resolved
	7656	string is returned via *valuep and the routine will return NULL.
	7657	In this mode if a multichar string is returned from the charnames
	7658	handler, a warning will be issued, and only the first char in the
	7659	sequence will be examined. If the string returned is zero length
	7660	then the value of *valuep is undefined and NON-NULL will
	7661	be returned to indicate failure. (This will NOT be a valid pointer
	7662	to a regnode.)
	7663
	7664	If valuep is null then it is assumed that we are parsing normal text and a
	7665	new EXACT node is inserted into the program containing the resolved string,
	7666	and a pointer to the new node is returned. But if the string is zero length
	7667	a NOTHING node is emitted instead.
	7668
	7669	On success RExC_parse is set to the char following the endbrace.
	7670	Parsing failures will generate a fatal error via vFAIL(...)
	7671	*/
	7672	STATIC regnode *
	7673	S_reg_namedseq(pTHX_ RExC_state_t pRExC_state, UV valuep, I32 *flagp, U32 depth)
	7674	{
	7675	char * endbrace; /* '}' following the name */
	7676	regnode *ret = NULL;
	7677	char* p;
	7678
	7679	GET_RE_DEBUG_FLAGS_DECL;
	7680
	7681	PERL_ARGS_ASSERT_REG_NAMEDSEQ;
	7682
	7683	GET_RE_DEBUG_FLAGS;
	7684
	7685	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	7686	* modifier. The other meaning does not */
	7687	p = (RExC_flags & RXf_PMf_EXTENDED)
	7688	? regwhite( pRExC_state, RExC_parse )
	7689	: RExC_parse;
	7690
	7691	/* Disambiguate between \N meaning a named character versus \N meaning
	7692	* [^\n]. The former is assumed when it can't be the latter. */
	7693	if (*p != '{' \|\| regcurly(p)) {
	7694	RExC_parse = p;
	7695	if (valuep) {
	7696	/* no bare \N in a charclass */
	7697	vFAIL("\\N in a character class must be a named character: \\N{...}");
	7698	}
	7699	nextchar(pRExC_state);
	7700	ret = reg_node(pRExC_state, REG_ANY);
	7701	*flagp \|= HASWIDTH\|SIMPLE;
	7702	RExC_naughty++;
	7703	RExC_parse--;
	7704	Set_Node_Length(ret, 1); /* MJD */
	7705	return ret;
	7706	}
	7707
	7708	/* Here, we have decided it should be a named sequence */
	7709
	7710	/* The test above made sure that the next real character is a '{', but
	7711	* under the /x modifier, it could be separated by space (or a comment and
	7712	* \n) and this is not allowed (for consistency with \x{...} and the
	7713	* tokenizer handling of \N{NAME}). */
	7714	if (*RExC_parse != '{') {
	7715	vFAIL("Missing braces on \\N{}");
	7716	}
	7717
	7718	RExC_parse++; /* Skip past the '{' */
	7719
	7720	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	7721	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	7722	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	7723	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	7724	{
	7725	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	7726	vFAIL("\\N{NAME} must be resolved by the lexer");
	7727	}
	7728
	7729	if (endbrace == RExC_parse) { /* empty: \N{} */
	7730	if (! valuep) {
	7731	RExC_parse = endbrace + 1;
	7732	return reg_node(pRExC_state,NOTHING);
	7733	}
	7734
	7735	if (SIZE_ONLY) {
	7736	ckWARNreg(RExC_parse,
	7737	"Ignoring zero length \\N{} in character class"
	7738	);
	7739	RExC_parse = endbrace + 1;
	7740	}
	7741	*valuep = 0;
	7742	return (regnode ) &RExC_parse; / Invalid regnode pointer */
	7743	}
	7744
	7745	REQUIRE_UTF8; /* named sequences imply Unicode semantics */
	7746	RExC_parse += 2; /* Skip past the 'U+' */
	7747
	7748	if (valuep) { /* In a bracketed char class */
	7749	/* We only pay attention to the first char of
	7750	multichar strings being returned. I kinda wonder
	7751	if this makes sense as it does change the behaviour
	7752	from earlier versions, OTOH that behaviour was broken
	7753	as well. XXX Solution is to recharacterize as
	7754	[rest-of-class]\|multi1\|multi2... */
	7755
	7756	STRLEN length_of_hex;
	7757	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	7758	\| PERL_SCAN_DISALLOW_PREFIX
	7759	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	7760
	7761	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
	7762	if (endchar < endbrace) {
	7763	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	7764	}
	7765
	7766	length_of_hex = (STRLEN)(endchar - RExC_parse);
	7767	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
	7768
	7769	/* The tokenizer should have guaranteed validity, but it's possible to
	7770	* bypass it by using single quoting, so check */
	7771	if (length_of_hex == 0
	7772	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	7773	{
	7774	RExC_parse += length_of_hex; /* Includes all the valid */
	7775	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	7776	? UTF8SKIP(RExC_parse)
	7777	: 1;
	7778	/* Guard against malformed utf8 */
	7779	if (RExC_parse >= endchar) RExC_parse = endchar;
	7780	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7781	}
	7782
	7783	RExC_parse = endbrace + 1;
	7784	if (endchar == endbrace) return NULL;
	7785
	7786	ret = (regnode ) &RExC_parse; / Invalid regnode pointer */
	7787	}
	7788	else { /* Not a char class */
	7789
	7790	/* What is done here is to convert this to a sub-pattern of the form
	7791	* (?:\x{char1}\x{char2}...)
	7792	* and then call reg recursively. That way, it retains its atomicness,
	7793	* while not having to worry about special handling that some code
	7794	* points may have. toke.c has converted the original Unicode values
	7795	* to native, so that we can just pass on the hex values unchanged. We
	7796	* do have to set a flag to keep recoding from happening in the
	7797	* recursion */
	7798
	7799	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	7800	STRLEN len;
	7801	char endchar; / Points to '.' or '}' ending cur char in the input
	7802	stream */
	7803	char *orig_end = RExC_end;
	7804
	7805	while (RExC_parse < endbrace) {
	7806
	7807	/* Code points are separated by dots. If none, there is only one
	7808	* code point, and is terminated by the brace */
	7809	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	7810
	7811	/* Convert to notation the rest of the code understands */
	7812	sv_catpv(substitute_parse, "\\x{");
	7813	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	7814	sv_catpv(substitute_parse, "}");
	7815
	7816	/* Point to the beginning of the next character in the sequence. */
	7817	RExC_parse = endchar + 1;
	7818	}
	7819	sv_catpv(substitute_parse, ")");
	7820
	7821	RExC_parse = SvPV(substitute_parse, len);
	7822
	7823	/* Don't allow empty number */
	7824	if (len < 8) {
	7825	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	7826	}
	7827	RExC_end = RExC_parse + len;
	7828
	7829	/* The values are Unicode, and therefore not subject to recoding */
	7830	RExC_override_recoding = 1;
	7831
	7832	ret = reg(pRExC_state, 1, flagp, depth+1);
	7833
	7834	RExC_parse = endbrace;
	7835	RExC_end = orig_end;
	7836	RExC_override_recoding = 0;
	7837
	7838	nextchar(pRExC_state);
	7839	}
	7840
	7841	return ret;
	7842	}
	7843
	7844
	7845	/*
	7846	* reg_recode
	7847	*
	7848	* It returns the code point in utf8 for the value in *encp.
	7849	* value: a code value in the source encoding
	7850	* encp: a pointer to an Encode object
	7851	*
	7852	* If the result from Encode is not a single character,
	7853	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	7854	*/
	7855	STATIC UV
	7856	S_reg_recode(pTHX_ const char value, SV **encp)
	7857	{
	7858	STRLEN numlen = 1;
	7859	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	7860	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	7861	const STRLEN newlen = SvCUR(sv);
	7862	UV uv = UNICODE_REPLACEMENT;
	7863
	7864	PERL_ARGS_ASSERT_REG_RECODE;
	7865
	7866	if (newlen)
	7867	uv = SvUTF8(sv)
	7868	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	7869	: (U8)s;
	7870
	7871	if (!newlen \|\| numlen != newlen) {
	7872	uv = UNICODE_REPLACEMENT;
	7873	*encp = NULL;
	7874	}
	7875	return uv;
	7876	}
	7877
	7878
	7879	/*
	7880	- regatom - the lowest level
	7881
	7882	Try to identify anything special at the start of the pattern. If there
	7883	is, then handle it as required. This may involve generating a single regop,
	7884	such as for an assertion; or it may involve recursing, such as to
	7885	handle a () structure.
	7886
	7887	If the string doesn't start with something special then we gobble up
	7888	as much literal text as we can.
	7889
	7890	Once we have been able to handle whatever type of thing started the
	7891	sequence, we return.
	7892
	7893	Note: we have to be careful with escapes, as they can be both literal
	7894	and in the case of \10 and friends can be either, depending
	7895	on context. Specifically there are two separate switches for handling
	7896	escape sequences, with the one for handling literal escapes requiring
	7897	a dummy entry for all of the special escapes that are actually handled
	7898	by the other.
	7899	*/
	7900
	7901	STATIC regnode *
	7902	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	7903	{
	7904	dVAR;
	7905	register regnode *ret = NULL;
	7906	I32 flags;
	7907	char *parse_start = RExC_parse;
	7908	U8 op;
	7909	GET_RE_DEBUG_FLAGS_DECL;
	7910	DEBUG_PARSE("atom");
	7911	flagp = WORST; / Tentatively. */
	7912
	7913	PERL_ARGS_ASSERT_REGATOM;
	7914
	7915	tryagain:
	7916	switch ((U8)*RExC_parse) {
	7917	case '^':
	7918	RExC_seen_zerolen++;
	7919	nextchar(pRExC_state);
	7920	if (RExC_flags & RXf_PMf_MULTILINE)
	7921	ret = reg_node(pRExC_state, MBOL);
	7922	else if (RExC_flags & RXf_PMf_SINGLELINE)
	7923	ret = reg_node(pRExC_state, SBOL);
	7924	else
	7925	ret = reg_node(pRExC_state, BOL);
	7926	Set_Node_Length(ret, 1); /* MJD */
	7927	break;
	7928	case '$':
	7929	nextchar(pRExC_state);
	7930	if (*RExC_parse)
	7931	RExC_seen_zerolen++;
	7932	if (RExC_flags & RXf_PMf_MULTILINE)
	7933	ret = reg_node(pRExC_state, MEOL);
	7934	else if (RExC_flags & RXf_PMf_SINGLELINE)
	7935	ret = reg_node(pRExC_state, SEOL);
	7936	else
	7937	ret = reg_node(pRExC_state, EOL);
	7938	Set_Node_Length(ret, 1); /* MJD */
	7939	break;
	7940	case '.':
	7941	nextchar(pRExC_state);
	7942	if (RExC_flags & RXf_PMf_SINGLELINE)
	7943	ret = reg_node(pRExC_state, SANY);
	7944	else
	7945	ret = reg_node(pRExC_state, REG_ANY);
	7946	*flagp \|= HASWIDTH\|SIMPLE;
	7947	RExC_naughty++;
	7948	Set_Node_Length(ret, 1); /* MJD */
	7949	break;
	7950	case '[':
	7951	{
	7952	char * const oregcomp_parse = ++RExC_parse;
	7953	ret = regclass(pRExC_state,depth+1);
	7954	if (*RExC_parse != ']') {
	7955	RExC_parse = oregcomp_parse;
	7956	vFAIL("Unmatched [");
	7957	}
	7958	nextchar(pRExC_state);
	7959	*flagp \|= HASWIDTH\|SIMPLE;
	7960	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	7961	break;
	7962	}
	7963	case '(':
	7964	nextchar(pRExC_state);
	7965	ret = reg(pRExC_state, 1, &flags,depth+1);
	7966	if (ret == NULL) {
	7967	if (flags & TRYAGAIN) {
	7968	if (RExC_parse == RExC_end) {
	7969	/* Make parent create an empty node if needed. */
	7970	*flagp \|= TRYAGAIN;
	7971	return(NULL);
	7972	}
	7973	goto tryagain;
	7974	}
	7975	return(NULL);
	7976	}
	7977	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	7978	break;
	7979	case '\|':
	7980	case ')':
	7981	if (flags & TRYAGAIN) {
	7982	*flagp \|= TRYAGAIN;
	7983	return NULL;
	7984	}
	7985	vFAIL("Internal urp");
	7986	/* Supposed to be caught earlier. */
	7987	break;
	7988	case '{':
	7989	if (!regcurly(RExC_parse)) {
	7990	RExC_parse++;
	7991	goto defchar;
	7992	}
	7993	/* FALL THROUGH */
	7994	case '?':
	7995	case '+':
	7996	case '*':
	7997	RExC_parse++;
	7998	vFAIL("Quantifier follows nothing");
	7999	break;
	8000	case '\\':
	8001	/* Special Escapes
	8002
	8003	This switch handles escape sequences that resolve to some kind
	8004	of special regop and not to literal text. Escape sequnces that
	8005	resolve to literal text are handled below in the switch marked
	8006	"Literal Escapes".
	8007
	8008	Every entry in this switch must have a corresponding entry
	8009	in the literal escape switch. However, the opposite is not
	8010	required, as the default for this switch is to jump to the
	8011	literal text handling code.
	8012	*/
	8013	switch ((U8)*++RExC_parse) {
	8014	/* Special Escapes */
	8015	case 'A':
	8016	RExC_seen_zerolen++;
	8017	ret = reg_node(pRExC_state, SBOL);
	8018	*flagp \|= SIMPLE;
	8019	goto finish_meta_pat;
	8020	case 'G':
	8021	ret = reg_node(pRExC_state, GPOS);
	8022	RExC_seen \|= REG_SEEN_GPOS;
	8023	*flagp \|= SIMPLE;
	8024	goto finish_meta_pat;
	8025	case 'K':
	8026	RExC_seen_zerolen++;
	8027	ret = reg_node(pRExC_state, KEEPS);
	8028	*flagp \|= SIMPLE;
	8029	/* XXX:dmq : disabling in-place substitution seems to
	8030	* be necessary here to avoid cases of memory corruption, as
	8031	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	8032	*/
	8033	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8034	goto finish_meta_pat;
	8035	case 'Z':
	8036	ret = reg_node(pRExC_state, SEOL);
	8037	*flagp \|= SIMPLE;
	8038	RExC_seen_zerolen++; /* Do not optimize RE away */
	8039	goto finish_meta_pat;
	8040	case 'z':
	8041	ret = reg_node(pRExC_state, EOS);
	8042	*flagp \|= SIMPLE;
	8043	RExC_seen_zerolen++; /* Do not optimize RE away */
	8044	goto finish_meta_pat;
	8045	case 'C':
	8046	ret = reg_node(pRExC_state, CANY);
	8047	RExC_seen \|= REG_SEEN_CANY;
	8048	*flagp \|= HASWIDTH\|SIMPLE;
	8049	goto finish_meta_pat;
	8050	case 'X':
	8051	ret = reg_node(pRExC_state, CLUMP);
	8052	*flagp \|= HASWIDTH;
	8053	goto finish_meta_pat;
	8054	case 'w':
	8055	switch (get_regex_charset(RExC_flags)) {
	8056	case REGEX_LOCALE_CHARSET:
	8057	op = ALNUML;
	8058	break;
	8059	case REGEX_UNICODE_CHARSET:
	8060	op = ALNUMU;
	8061	break;
	8062	case REGEX_ASCII_RESTRICTED_CHARSET:
	8063	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8064	op = ALNUMA;
	8065	break;
	8066	case REGEX_DEPENDS_CHARSET:
	8067	op = ALNUM;
	8068	break;
	8069	default:
	8070	goto bad_charset;
	8071	}
	8072	ret = reg_node(pRExC_state, op);
	8073	*flagp \|= HASWIDTH\|SIMPLE;
	8074	goto finish_meta_pat;
	8075	case 'W':
	8076	switch (get_regex_charset(RExC_flags)) {
	8077	case REGEX_LOCALE_CHARSET:
	8078	op = NALNUML;
	8079	break;
	8080	case REGEX_UNICODE_CHARSET:
	8081	op = NALNUMU;
	8082	break;
	8083	case REGEX_ASCII_RESTRICTED_CHARSET:
	8084	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8085	op = NALNUMA;
	8086	break;
	8087	case REGEX_DEPENDS_CHARSET:
	8088	op = NALNUM;
	8089	break;
	8090	default:
	8091	goto bad_charset;
	8092	}
	8093	ret = reg_node(pRExC_state, op);
	8094	*flagp \|= HASWIDTH\|SIMPLE;
	8095	goto finish_meta_pat;
	8096	case 'b':
	8097	RExC_seen_zerolen++;
	8098	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8099	switch (get_regex_charset(RExC_flags)) {
	8100	case REGEX_LOCALE_CHARSET:
	8101	op = BOUNDL;
	8102	break;
	8103	case REGEX_UNICODE_CHARSET:
	8104	op = BOUNDU;
	8105	break;
	8106	case REGEX_ASCII_RESTRICTED_CHARSET:
	8107	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8108	op = BOUNDA;
	8109	break;
	8110	case REGEX_DEPENDS_CHARSET:
	8111	op = BOUND;
	8112	break;
	8113	default:
	8114	goto bad_charset;
	8115	}
	8116	ret = reg_node(pRExC_state, op);
	8117	FLAGS(ret) = get_regex_charset(RExC_flags);
	8118	*flagp \|= SIMPLE;
	8119	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8120	ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
	8121	}
	8122	goto finish_meta_pat;
	8123	case 'B':
	8124	RExC_seen_zerolen++;
	8125	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8126	switch (get_regex_charset(RExC_flags)) {
	8127	case REGEX_LOCALE_CHARSET:
	8128	op = NBOUNDL;
	8129	break;
	8130	case REGEX_UNICODE_CHARSET:
	8131	op = NBOUNDU;
	8132	break;
	8133	case REGEX_ASCII_RESTRICTED_CHARSET:
	8134	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8135	op = NBOUNDA;
	8136	break;
	8137	case REGEX_DEPENDS_CHARSET:
	8138	op = NBOUND;
	8139	break;
	8140	default:
	8141	goto bad_charset;
	8142	}
	8143	ret = reg_node(pRExC_state, op);
	8144	FLAGS(ret) = get_regex_charset(RExC_flags);
	8145	*flagp \|= SIMPLE;
	8146	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	8147	ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
	8148	}
	8149	goto finish_meta_pat;
	8150	case 's':
	8151	switch (get_regex_charset(RExC_flags)) {
	8152	case REGEX_LOCALE_CHARSET:
	8153	op = SPACEL;
	8154	break;
	8155	case REGEX_UNICODE_CHARSET:
	8156	op = SPACEU;
	8157	break;
	8158	case REGEX_ASCII_RESTRICTED_CHARSET:
	8159	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8160	op = SPACEA;
	8161	break;
	8162	case REGEX_DEPENDS_CHARSET:
	8163	op = SPACE;
	8164	break;
	8165	default:
	8166	goto bad_charset;
	8167	}
	8168	ret = reg_node(pRExC_state, op);
	8169	*flagp \|= HASWIDTH\|SIMPLE;
	8170	goto finish_meta_pat;
	8171	case 'S':
	8172	switch (get_regex_charset(RExC_flags)) {
	8173	case REGEX_LOCALE_CHARSET:
	8174	op = NSPACEL;
	8175	break;
	8176	case REGEX_UNICODE_CHARSET:
	8177	op = NSPACEU;
	8178	break;
	8179	case REGEX_ASCII_RESTRICTED_CHARSET:
	8180	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8181	op = NSPACEA;
	8182	break;
	8183	case REGEX_DEPENDS_CHARSET:
	8184	op = NSPACE;
	8185	break;
	8186	default:
	8187	goto bad_charset;
	8188	}
	8189	ret = reg_node(pRExC_state, op);
	8190	*flagp \|= HASWIDTH\|SIMPLE;
	8191	goto finish_meta_pat;
	8192	case 'd':
	8193	switch (get_regex_charset(RExC_flags)) {
	8194	case REGEX_LOCALE_CHARSET:
	8195	op = DIGITL;
	8196	break;
	8197	case REGEX_ASCII_RESTRICTED_CHARSET:
	8198	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8199	op = DIGITA;
	8200	break;
	8201	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8202	case REGEX_UNICODE_CHARSET:
	8203	op = DIGIT;
	8204	break;
	8205	default:
	8206	goto bad_charset;
	8207	}
	8208	ret = reg_node(pRExC_state, op);
	8209	*flagp \|= HASWIDTH\|SIMPLE;
	8210	goto finish_meta_pat;
	8211	case 'D':
	8212	switch (get_regex_charset(RExC_flags)) {
	8213	case REGEX_LOCALE_CHARSET:
	8214	op = NDIGITL;
	8215	break;
	8216	case REGEX_ASCII_RESTRICTED_CHARSET:
	8217	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	8218	op = NDIGITA;
	8219	break;
	8220	case REGEX_DEPENDS_CHARSET: /* No difference between these */
	8221	case REGEX_UNICODE_CHARSET:
	8222	op = NDIGIT;
	8223	break;
	8224	default:
	8225	goto bad_charset;
	8226	}
	8227	ret = reg_node(pRExC_state, op);
	8228	*flagp \|= HASWIDTH\|SIMPLE;
	8229	goto finish_meta_pat;
	8230	case 'R':
	8231	ret = reg_node(pRExC_state, LNBREAK);
	8232	*flagp \|= HASWIDTH\|SIMPLE;
	8233	goto finish_meta_pat;
	8234	case 'h':
	8235	ret = reg_node(pRExC_state, HORIZWS);
	8236	*flagp \|= HASWIDTH\|SIMPLE;
	8237	goto finish_meta_pat;
	8238	case 'H':
	8239	ret = reg_node(pRExC_state, NHORIZWS);
	8240	*flagp \|= HASWIDTH\|SIMPLE;
	8241	goto finish_meta_pat;
	8242	case 'v':
	8243	ret = reg_node(pRExC_state, VERTWS);
	8244	*flagp \|= HASWIDTH\|SIMPLE;
	8245	goto finish_meta_pat;
	8246	case 'V':
	8247	ret = reg_node(pRExC_state, NVERTWS);
	8248	*flagp \|= HASWIDTH\|SIMPLE;
	8249	finish_meta_pat:
	8250	nextchar(pRExC_state);
	8251	Set_Node_Length(ret, 2); /* MJD */
	8252	break;
	8253	case 'p':
	8254	case 'P':
	8255	{
	8256	char* const oldregxend = RExC_end;
	8257	#ifdef DEBUGGING
	8258	char* parse_start = RExC_parse - 2;
	8259	#endif
	8260
	8261	if (RExC_parse[1] == '{') {
	8262	/* a lovely hack--pretend we saw [\pX] instead */
	8263	RExC_end = strchr(RExC_parse, '}');
	8264	if (!RExC_end) {
	8265	const U8 c = (U8)*RExC_parse;
	8266	RExC_parse += 2;
	8267	RExC_end = oldregxend;
	8268	vFAIL2("Missing right brace on \\%c{}", c);
	8269	}
	8270	RExC_end++;
	8271	}
	8272	else {
	8273	RExC_end = RExC_parse + 2;
	8274	if (RExC_end > oldregxend)
	8275	RExC_end = oldregxend;
	8276	}
	8277	RExC_parse--;
	8278
	8279	ret = regclass(pRExC_state,depth+1);
	8280
	8281	RExC_end = oldregxend;
	8282	RExC_parse--;
	8283
	8284	Set_Node_Offset(ret, parse_start + 2);
	8285	Set_Node_Cur_Length(ret);
	8286	nextchar(pRExC_state);
	8287	*flagp \|= HASWIDTH\|SIMPLE;
	8288	}
	8289	break;
	8290	case 'N':
	8291	/* Handle \N and \N{NAME} here and not below because it can be
	8292	multicharacter. join_exact() will join them up later on.
	8293	Also this makes sure that things like /\N{BLAH}+/ and
	8294	\N{BLAH} being multi char Just Happen. dmq*/
	8295	++RExC_parse;
	8296	ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
	8297	break;
	8298	case 'k': /* Handle \k<NAME> and \k'NAME' */
	8299	parse_named_seq:
	8300	{
	8301	char ch= RExC_parse[1];
	8302	if (ch != '<' && ch != '\'' && ch != '{') {
	8303	RExC_parse++;
	8304	vFAIL2("Sequence %.2s... not terminated",parse_start);
	8305	} else {
	8306	/* this pretty much dupes the code for (?P=...) in reg(), if
	8307	you change this make sure you change that */
	8308	char* name_start = (RExC_parse += 2);
	8309	U32 num = 0;
	8310	SV *sv_dat = reg_scan_name(pRExC_state,
	8311	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8312	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	8313	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	8314	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8315
	8316	if (!SIZE_ONLY) {
	8317	num = add_data( pRExC_state, 1, "S" );
	8318	RExC_rxi->data->data[num]=(void*)sv_dat;
	8319	SvREFCNT_inc_simple_void(sv_dat);
	8320	}
	8321
	8322	RExC_sawback = 1;
	8323	ret = reganode(pRExC_state,
	8324	((! FOLD)
	8325	? NREF
	8326	: (MORE_ASCII_RESTRICTED)
	8327	? NREFFA
	8328	: (AT_LEAST_UNI_SEMANTICS)
	8329	? NREFFU
	8330	: (LOC)
	8331	? NREFFL
	8332	: NREFF),
	8333	num);
	8334	*flagp \|= HASWIDTH;
	8335
	8336	/* override incorrect value set in reganode MJD */
	8337	Set_Node_Offset(ret, parse_start+1);
	8338	Set_Node_Cur_Length(ret); /* MJD */
	8339	nextchar(pRExC_state);
	8340
	8341	}
	8342	break;
	8343	}
	8344	case 'g':
	8345	case '1': case '2': case '3': case '4':
	8346	case '5': case '6': case '7': case '8': case '9':
	8347	{
	8348	I32 num;
	8349	bool isg = *RExC_parse == 'g';
	8350	bool isrel = 0;
	8351	bool hasbrace = 0;
	8352	if (isg) {
	8353	RExC_parse++;
	8354	if (*RExC_parse == '{') {
	8355	RExC_parse++;
	8356	hasbrace = 1;
	8357	}
	8358	if (*RExC_parse == '-') {
	8359	RExC_parse++;
	8360	isrel = 1;
	8361	}
	8362	if (hasbrace && !isDIGIT(*RExC_parse)) {
	8363	if (isrel) RExC_parse--;
	8364	RExC_parse -= 2;
	8365	goto parse_named_seq;
	8366	} }
	8367	num = atoi(RExC_parse);
	8368	if (isg && num == 0)
	8369	vFAIL("Reference to invalid group 0");
	8370	if (isrel) {
	8371	num = RExC_npar - num;
	8372	if (num < 1)
	8373	vFAIL("Reference to nonexistent or unclosed group");
	8374	}
	8375	if (!isg && num > 9 && num >= RExC_npar)
	8376	goto defchar;
	8377	else {
	8378	char * const parse_start = RExC_parse - 1; /* MJD */
	8379	while (isDIGIT(*RExC_parse))
	8380	RExC_parse++;
	8381	if (parse_start == RExC_parse - 1)
	8382	vFAIL("Unterminated \\g... pattern");
	8383	if (hasbrace) {
	8384	if (*RExC_parse != '}')
	8385	vFAIL("Unterminated \\g{...} pattern");
	8386	RExC_parse++;
	8387	}
	8388	if (!SIZE_ONLY) {
	8389	if (num > (I32)RExC_rx->nparens)
	8390	vFAIL("Reference to nonexistent group");
	8391	}
	8392	RExC_sawback = 1;
	8393	ret = reganode(pRExC_state,
	8394	((! FOLD)
	8395	? REF
	8396	: (MORE_ASCII_RESTRICTED)
	8397	? REFFA
	8398	: (AT_LEAST_UNI_SEMANTICS)
	8399	? REFFU
	8400	: (LOC)
	8401	? REFFL
	8402	: REFF),
	8403	num);
	8404	*flagp \|= HASWIDTH;
	8405
	8406	/* override incorrect value set in reganode MJD */
	8407	Set_Node_Offset(ret, parse_start+1);
	8408	Set_Node_Cur_Length(ret); /* MJD */
	8409	RExC_parse--;
	8410	nextchar(pRExC_state);
	8411	}
	8412	}
	8413	break;
	8414	case '\0':
	8415	if (RExC_parse >= RExC_end)
	8416	FAIL("Trailing \\");
	8417	/* FALL THROUGH */
	8418	default:
	8419	/* Do not generate "unrecognized" warnings here, we fall
	8420	back into the quick-grab loop below */
	8421	parse_start--;
	8422	goto defchar;
	8423	}
	8424	break;
	8425
	8426	case '#':
	8427	if (RExC_flags & RXf_PMf_EXTENDED) {
	8428	if ( reg_skipcomment( pRExC_state ) )
	8429	goto tryagain;
	8430	}
	8431	/* FALL THROUGH */
	8432
	8433	default:
	8434
	8435	parse_start = RExC_parse - 1;
	8436
	8437	RExC_parse++;
	8438
	8439	defchar: {
	8440	typedef enum {
	8441	generic_char = 0,
	8442	char_s,
	8443	upsilon_1,
	8444	upsilon_2,
	8445	iota_1,
	8446	iota_2,
	8447	} char_state;
	8448	char_state latest_char_state = generic_char;
	8449	register STRLEN len;
	8450	register UV ender;
	8451	register char *p;
	8452	char *s;
	8453	STRLEN foldlen;
	8454	U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
	8455	regnode * orig_emit;
	8456
	8457	ender = 0;
	8458	orig_emit = RExC_emit; /* Save the original output node position in
	8459	case we need to output a different node
	8460	type */
	8461	ret = reg_node(pRExC_state,
	8462	(U8) ((! FOLD) ? EXACT
	8463	: (LOC)
	8464	? EXACTFL
	8465	: (MORE_ASCII_RESTRICTED)
	8466	? EXACTFA
	8467	: (AT_LEAST_UNI_SEMANTICS)
	8468	? EXACTFU
	8469	: EXACTF)
	8470	);
	8471	s = STRING(ret);
	8472	for (len = 0, p = RExC_parse - 1;
	8473	len < 127 && p < RExC_end;
	8474	len++)
	8475	{
	8476	char * const oldp = p;
	8477
	8478	if (RExC_flags & RXf_PMf_EXTENDED)
	8479	p = regwhite( pRExC_state, p );
	8480	switch ((U8)*p) {
	8481	case '^':
	8482	case '$':
	8483	case '.':
	8484	case '[':
	8485	case '(':
	8486	case ')':
	8487	case '\|':
	8488	goto loopdone;
	8489	case '\\':
	8490	/* Literal Escapes Switch
	8491
	8492	This switch is meant to handle escape sequences that
	8493	resolve to a literal character.
	8494
	8495	Every escape sequence that represents something
	8496	else, like an assertion or a char class, is handled
	8497	in the switch marked 'Special Escapes' above in this
	8498	routine, but also has an entry here as anything that
	8499	isn't explicitly mentioned here will be treated as
	8500	an unescaped equivalent literal.
	8501	*/
	8502
	8503	switch ((U8)*++p) {
	8504	/* These are all the special escapes. */
	8505	case 'A': /* Start assertion */
	8506	case 'b': case 'B': /* Word-boundary assertion*/
	8507	case 'C': /* Single char !DANGEROUS! */
	8508	case 'd': case 'D': /* digit class */
	8509	case 'g': case 'G': /* generic-backref, pos assertion */
	8510	case 'h': case 'H': /* HORIZWS */
	8511	case 'k': case 'K': /* named backref, keep marker */
	8512	case 'N': /* named char sequence */
	8513	case 'p': case 'P': /* Unicode property */
	8514	case 'R': /* LNBREAK */
	8515	case 's': case 'S': /* space class */
	8516	case 'v': case 'V': /* VERTWS */
	8517	case 'w': case 'W': /* word class */
	8518	case 'X': /* eXtended Unicode "combining character sequence" */
	8519	case 'z': case 'Z': /* End of line/string assertion */
	8520	--p;
	8521	goto loopdone;
	8522
	8523	/* Anything after here is an escape that resolves to a
	8524	literal. (Except digits, which may or may not)
	8525	*/
	8526	case 'n':
	8527	ender = '\n';
	8528	p++;
	8529	break;
	8530	case 'r':
	8531	ender = '\r';
	8532	p++;
	8533	break;
	8534	case 't':
	8535	ender = '\t';
	8536	p++;
	8537	break;
	8538	case 'f':
	8539	ender = '\f';
	8540	p++;
	8541	break;
	8542	case 'e':
	8543	ender = ASCII_TO_NATIVE('\033');
	8544	p++;
	8545	break;
	8546	case 'a':
	8547	ender = ASCII_TO_NATIVE('\007');
	8548	p++;
	8549	break;
	8550	case 'o':
	8551	{
	8552	STRLEN brace_len = len;
	8553	UV result;
	8554	const char* error_msg;
	8555
	8556	bool valid = grok_bslash_o(p,
	8557	&result,
	8558	&brace_len,
	8559	&error_msg,
	8560	1);
	8561	p += brace_len;
	8562	if (! valid) {
	8563	RExC_parse = p; /* going to die anyway; point
	8564	to exact spot of failure */
	8565	vFAIL(error_msg);
	8566	}
	8567	else
	8568	{
	8569	ender = result;
	8570	}
	8571	if (PL_encoding && ender < 0x100) {
	8572	goto recode_encoding;
	8573	}
	8574	if (ender > 0xff) {
	8575	REQUIRE_UTF8;
	8576	}
	8577	break;
	8578	}
	8579	case 'x':
	8580	if (*++p == '{') {
	8581	char* const e = strchr(p, '}');
	8582
	8583	if (!e) {
	8584	RExC_parse = p + 1;
	8585	vFAIL("Missing right brace on \\x{}");
	8586	}
	8587	else {
	8588	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	8589	\| PERL_SCAN_DISALLOW_PREFIX;
	8590	STRLEN numlen = e - p - 1;
	8591	ender = grok_hex(p + 1, &numlen, &flags, NULL);
	8592	if (ender > 0xff)
	8593	REQUIRE_UTF8;
	8594	p = e + 1;
	8595	}
	8596	}
	8597	else {
	8598	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	8599	STRLEN numlen = 2;
	8600	ender = grok_hex(p, &numlen, &flags, NULL);
	8601	p += numlen;
	8602	}
	8603	if (PL_encoding && ender < 0x100)
	8604	goto recode_encoding;
	8605	break;
	8606	case 'c':
	8607	p++;
	8608	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	8609	break;
	8610	case '0': case '1': case '2': case '3':case '4':
	8611	case '5': case '6': case '7': case '8':case '9':
	8612	if (*p == '0' \|\|
	8613	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	8614	{
	8615	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	8616	STRLEN numlen = 3;
	8617	ender = grok_oct(p, &numlen, &flags, NULL);
	8618	if (ender > 0xff) {
	8619	REQUIRE_UTF8;
	8620	}
	8621	p += numlen;
	8622	}
	8623	else {
	8624	--p;
	8625	goto loopdone;
	8626	}
	8627	if (PL_encoding && ender < 0x100)
	8628	goto recode_encoding;
	8629	break;
	8630	recode_encoding:
	8631	if (! RExC_override_recoding) {
	8632	SV* enc = PL_encoding;
	8633	ender = reg_recode((const char)(U8)ender, &enc);
	8634	if (!enc && SIZE_ONLY)
	8635	ckWARNreg(p, "Invalid escape in the specified encoding");
	8636	REQUIRE_UTF8;
	8637	}
	8638	break;
	8639	case '\0':
	8640	if (p >= RExC_end)
	8641	FAIL("Trailing \\");
	8642	/* FALL THROUGH */
	8643	default:
	8644	if (!SIZE_ONLY&& isALPHA(*p)) {
	8645	/* Include any { following the alpha to emphasize
	8646	* that it could be part of an escape at some point
	8647	* in the future */
	8648	int len = (*(p + 1) == '{') ? 2 : 1;
	8649	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	8650	}
	8651	goto normal_default;
	8652	}
	8653	break;
	8654	default:
	8655	normal_default:
	8656	if (UTF8_IS_START(*p) && UTF) {
	8657	STRLEN numlen;
	8658	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	8659	&numlen, UTF8_ALLOW_DEFAULT);
	8660	p += numlen;
	8661	}
	8662	else
	8663	ender = (U8) *p++;
	8664	break;
	8665	} /* End of switch on the literal */
	8666
	8667	/* Certain characters are problematic because their folded
	8668	* length is so different from their original length that it
	8669	* isn't handleable by the optimizer. They are therefore not
	8670	* placed in an EXACTish node; and are here handled specially.
	8671	* (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
	8672	* putting it in a special node keeps regexec from having to
	8673	* deal with a non-utf8 multi-char fold */
	8674	if (FOLD
	8675	&& (ender > 255 \|\| (! MORE_ASCII_RESTRICTED && ! LOC)))
	8676	{
	8677	/* We look for either side of the fold. For example \xDF
	8678	* folds to 'ss'. We look for both the single character
	8679	* \xDF and the sequence 'ss'. When we find something that
	8680	* could be one of those, we stop and flush whatever we
	8681	* have output so far into the EXACTish node that was being
	8682	* built. Then restore the input pointer to what it was.
	8683	* regatom will return that EXACT node, and will be called
	8684	* again, positioned so the first character is the one in
	8685	* question, which we return in a different node type.
	8686	* The multi-char folds are a sequence, so the occurrence
	8687	* of the first character in that sequence doesn't
	8688	* necessarily mean that what follows is the rest of the
	8689	* sequence. We keep track of that with a state machine,
	8690	* with the state being set to the latest character
	8691	* processed before the current one. Most characters will
	8692	* set the state to 0, but if one occurs that is part of a
	8693	* potential tricky fold sequence, the state is set to that
	8694	* character, and the next loop iteration sees if the state
	8695	* should progress towards the final folded-from character,
	8696	* or if it was a false alarm. If it turns out to be a
	8697	* false alarm, the character(s) will be output in a new
	8698	* EXACTish node, and join_exact() will later combine them.
	8699	* In the case of the 'ss' sequence, which is more common
	8700	* and more easily checked, some look-ahead is done to
	8701	* save time by ruling-out some false alarms */
	8702	switch (ender) {
	8703	default:
	8704	latest_char_state = generic_char;
	8705	break;
	8706	case 's':
	8707	case 'S':
	8708	case 0x17F: /* LATIN SMALL LETTER LONG S */
	8709	if (AT_LEAST_UNI_SEMANTICS) {
	8710	if (latest_char_state == char_s) { /* 'ss' */
	8711	ender = LATIN_SMALL_LETTER_SHARP_S;
	8712	goto do_tricky;
	8713	}
	8714	else if (p < RExC_end) {
	8715
	8716	/* Look-ahead at the next character. If it
	8717	* is also an s, we handle as a sharp s
	8718	* tricky regnode. */
	8719	if (p == 's' \|\| p == 'S') {
	8720
	8721	/* But first flush anything in the
	8722	* EXACTish buffer */
	8723	if (len != 0) {
	8724	p = oldp;
	8725	goto loopdone;
	8726	}
	8727	p++; /* Account for swallowing this
	8728	's' up */
	8729	ender = LATIN_SMALL_LETTER_SHARP_S;
	8730	goto do_tricky;
	8731	}
	8732	/* Here, the next character is not a
	8733	* literal 's', but still could
	8734	* evaluate to one if part of a \o{},
	8735	* \x or \OCTAL-DIGIT. The minimum
	8736	* length required for that is 4, eg
	8737	* \x53 or \123 */
	8738	else if (*p == '\\'
	8739	&& p < RExC_end - 4
	8740	&& (isDIGIT(*(p + 1))
	8741	\|\| *(p + 1) == 'x'
	8742	\|\| *(p + 1) == 'o' ))
	8743	{
	8744
	8745	/* Here, it could be an 's', too much
	8746	* bother to figure it out here. Flush
	8747	* the buffer if any; when come back
	8748	* here, set the state so know that the
	8749	* previous char was an 's' */
	8750	if (len != 0) {
	8751	latest_char_state = generic_char;
	8752	p = oldp;
	8753	goto loopdone;
	8754	}
	8755	latest_char_state = char_s;
	8756	break;
	8757	}
	8758	}
	8759	}
	8760
	8761	/* Here, can't be an 'ss' sequence, or at least not
	8762	* one that could fold to/from the sharp ss */
	8763	latest_char_state = generic_char;
	8764	break;
	8765	case 0x03C5: /* First char in upsilon series */
	8766	case 0x03A5: /* Also capital UPSILON, which folds to
	8767	03C5, and hence exhibits the same
	8768	problem */
	8769	if (p < RExC_end - 4) { /* Need >= 4 bytes left */
	8770	latest_char_state = upsilon_1;
	8771	if (len != 0) {
	8772	p = oldp;
	8773	goto loopdone;
	8774	}
	8775	}
	8776	else {
	8777	latest_char_state = generic_char;
	8778	}
	8779	break;
	8780	case 0x03B9: /* First char in iota series */
	8781	case 0x0399: /* Also capital IOTA */
	8782	case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
	8783	case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
	8784	to 3B9 */
	8785	if (p < RExC_end - 4) {
	8786	latest_char_state = iota_1;
	8787	if (len != 0) {
	8788	p = oldp;
	8789	goto loopdone;
	8790	}
	8791	}
	8792	else {
	8793	latest_char_state = generic_char;
	8794	}
	8795	break;
	8796	case 0x0308:
	8797	if (latest_char_state == upsilon_1) {
	8798	latest_char_state = upsilon_2;
	8799	}
	8800	else if (latest_char_state == iota_1) {
	8801	latest_char_state = iota_2;
	8802	}
	8803	else {
	8804	latest_char_state = generic_char;
	8805	}
	8806	break;
	8807	case 0x301:
	8808	if (latest_char_state == upsilon_2) {
	8809	ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
	8810	goto do_tricky;
	8811	}
	8812	else if (latest_char_state == iota_2) {
	8813	ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
	8814	goto do_tricky;
	8815	}
	8816	latest_char_state = generic_char;
	8817	break;
	8818
	8819	/* These are the tricky fold characters. Flush any
	8820	* buffer first. (When adding to this list, also should
	8821	* add them to fold_grind.t to make sure get tested) */
	8822	case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
	8823	case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
	8824	case LATIN_SMALL_LETTER_SHARP_S:
	8825	case LATIN_CAPITAL_LETTER_SHARP_S:
	8826	case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
	8827	case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
	8828	if (len != 0) {
	8829	p = oldp;
	8830	goto loopdone;
	8831	}
	8832	/* FALL THROUGH */
	8833	do_tricky: {
	8834	char* const oldregxend = RExC_end;
	8835	U8 tmpbuf[UTF8_MAXBYTES+1];
	8836
	8837	/* Here, we know we need to generate a special
	8838	* regnode, and 'ender' contains the tricky
	8839	* character. What's done is to pretend it's in a
	8840	* [bracketed] class, and let the code that deals
	8841	* with those handle it, as that code has all the
	8842	* intelligence necessary. First save the current
	8843	* parse state, get rid of the already allocated
	8844	* but empty EXACT node that the ANYOFV node will
	8845	* replace, and point the parse to a buffer which
	8846	* we fill with the character we want the regclass
	8847	* code to think is being parsed */
	8848	RExC_emit = orig_emit;
	8849	RExC_parse = (char *) tmpbuf;
	8850	if (UTF) {
	8851	U8 *d = uvchr_to_utf8(tmpbuf, ender);
	8852	*d = '\0';
	8853	RExC_end = (char *) d;
	8854	}
	8855	else { /* ender above 255 already excluded */
	8856	tmpbuf[0] = (U8) ender;
	8857	tmpbuf[1] = '\0';
	8858	RExC_end = RExC_parse + 1;
	8859	}
	8860
	8861	ret = regclass(pRExC_state,depth+1);
	8862
	8863	/* Here, have parsed the buffer. Reset the parse to
	8864	* the actual input, and return */
	8865	RExC_end = oldregxend;
	8866	RExC_parse = p - 1;
	8867
	8868	Set_Node_Offset(ret, RExC_parse);
	8869	Set_Node_Cur_Length(ret);
	8870	nextchar(pRExC_state);
	8871	*flagp \|= HASWIDTH\|SIMPLE;
	8872	return ret;
	8873	}
	8874	}
	8875	}
	8876
	8877	if ( RExC_flags & RXf_PMf_EXTENDED)
	8878	p = regwhite( pRExC_state, p );
	8879	if (UTF && FOLD) {
	8880	/* Prime the casefolded buffer. Locale rules, which apply
	8881	* only to code points < 256, aren't known until execution,
	8882	* so for them, just output the original character using
	8883	* utf8 */
	8884	if (LOC && ender < 256) {
	8885	if (UNI_IS_INVARIANT(ender)) {
	8886	*tmpbuf = (U8) ender;
	8887	foldlen = 1;
	8888	} else {
	8889	*tmpbuf = UTF8_TWO_BYTE_HI(ender);
	8890	*(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
	8891	foldlen = 2;
	8892	}
	8893	}
	8894	else if (isASCII(ender)) { /* Note: Here can't also be LOC
	8895	*/
	8896	ender = toLOWER(ender);
	8897	*tmpbuf = (U8) ender;
	8898	foldlen = 1;
	8899	}
	8900	else if (! MORE_ASCII_RESTRICTED && ! LOC) {
	8901
	8902	/* Locale and /aa require more selectivity about the
	8903	* fold, so are handled below. Otherwise, here, just
	8904	* use the fold */
	8905	ender = toFOLD_uni(ender, tmpbuf, &foldlen);
	8906	}
	8907	else {
	8908	/* Under locale rules or /aa we are not to mix,
	8909	* respectively, ords < 256 or ASCII with non-. So
	8910	* reject folds that mix them, using only the
	8911	* non-folded code point. So do the fold to a
	8912	* temporary, and inspect each character in it. */
	8913	U8 trialbuf[UTF8_MAXBYTES_CASE+1];
	8914	U8* s = trialbuf;
	8915	UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
	8916	U8* e = s + foldlen;
	8917	bool fold_ok = TRUE;
	8918
	8919	while (s < e) {
	8920	if (isASCII(*s)
	8921	\|\| (LOC && (UTF8_IS_INVARIANT(*s)
	8922	\|\| UTF8_IS_DOWNGRADEABLE_START(*s))))
	8923	{
	8924	fold_ok = FALSE;
	8925	break;
	8926	}
	8927	s += UTF8SKIP(s);
	8928	}
	8929	if (fold_ok) {
	8930	Copy(trialbuf, tmpbuf, foldlen, U8);
	8931	ender = tmpender;
	8932	}
	8933	else {
	8934	uvuni_to_utf8(tmpbuf, ender);
	8935	foldlen = UNISKIP(ender);
	8936	}
	8937	}
	8938	}
	8939	if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+. /
	8940	if (len)
	8941	p = oldp;
	8942	else if (UTF) {
	8943	if (FOLD) {
	8944	/* Emit all the Unicode characters. */
	8945	STRLEN numlen;
	8946	for (foldbuf = tmpbuf;
	8947	foldlen;
	8948	foldlen -= numlen) {
	8949	ender = utf8_to_uvchr(foldbuf, &numlen);
	8950	if (numlen > 0) {
	8951	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8952	s += unilen;
	8953	len += unilen;
	8954	/* In EBCDIC the numlen
	8955	* and unilen can differ. */
	8956	foldbuf += numlen;
	8957	if (numlen >= foldlen)
	8958	break;
	8959	}
	8960	else
	8961	break; /* "Can't happen." */
	8962	}
	8963	}
	8964	else {
	8965	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8966	if (unilen > 0) {
	8967	s += unilen;
	8968	len += unilen;
	8969	}
	8970	}
	8971	}
	8972	else {
	8973	len++;
	8974	REGC((char)ender, s++);
	8975	}
	8976	break;
	8977	}
	8978	if (UTF) {
	8979	if (FOLD) {
	8980	/* Emit all the Unicode characters. */
	8981	STRLEN numlen;
	8982	for (foldbuf = tmpbuf;
	8983	foldlen;
	8984	foldlen -= numlen) {
	8985	ender = utf8_to_uvchr(foldbuf, &numlen);
	8986	if (numlen > 0) {
	8987	const STRLEN unilen = reguni(pRExC_state, ender, s);
	8988	len += unilen;
	8989	s += unilen;
	8990	/* In EBCDIC the numlen
	8991	* and unilen can differ. */
	8992	foldbuf += numlen;
	8993	if (numlen >= foldlen)
	8994	break;
	8995	}
	8996	else
	8997	break;
	8998	}
	8999	}
	9000	else {
	9001	const STRLEN unilen = reguni(pRExC_state, ender, s);
	9002	if (unilen > 0) {
	9003	s += unilen;
	9004	len += unilen;
	9005	}
	9006	}
	9007	len--;
	9008	}
	9009	else {
	9010	REGC((char)ender, s++);
	9011	}
	9012	}
	9013	loopdone: /* Jumped to when encounters something that shouldn't be in
	9014	the node */
	9015	RExC_parse = p - 1;
	9016	Set_Node_Cur_Length(ret); /* MJD */
	9017	nextchar(pRExC_state);
	9018	{
	9019	/* len is STRLEN which is unsigned, need to copy to signed */
	9020	IV iv = len;
	9021	if (iv < 0)
	9022	vFAIL("Internal disaster");
	9023	}
	9024	if (len > 0)
	9025	*flagp \|= HASWIDTH;
	9026	if (len == 1 && UNI_IS_INVARIANT(ender))
	9027	*flagp \|= SIMPLE;
	9028
	9029	if (SIZE_ONLY)
	9030	RExC_size += STR_SZ(len);
	9031	else {
	9032	STR_LEN(ret) = len;
	9033	RExC_emit += STR_SZ(len);
	9034	}
	9035	}
	9036	break;
	9037	}
	9038
	9039	return(ret);
	9040
	9041	/* Jumped to when an unrecognized character set is encountered */
	9042	bad_charset:
	9043	Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
	9044	return(NULL);
	9045	}
	9046
	9047	STATIC char *
	9048	S_regwhite( RExC_state_t *pRExC_state, char *p )
	9049	{
	9050	    const char *e = RExC_end;
	9051	    /* Advance 'p' past any run of whitespace and '#' comments (a comment extends through the next newline) and return the first position that is neither, or RExC_end if the input runs out first. */
	9052	    PERL_ARGS_ASSERT_REGWHITE;
	9053
	9054	    while (p < e) {
	9055	        if (isSPACE(*p))
	9056	            ++p;
	9057	        else if (*p == '#') {
	9058	            bool ended = 0;
	9059	            do { /* consume the comment, including its terminating newline */
	9060	                if (*p++ == '\n') {
	9061	                    ended = 1;
	9062	                    break;
	9063	                }
	9064	            } while (p < e);
	9065	            if (!ended) /* hit the end of input while still inside the comment */
	9066	                RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
	9067	        }
	9068	        else
	9069	            break;
	9070	    }
	9071	    return p;
	9072	}
	9073
	9074	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	9075	Character classes ([:foo:]) can also be negated ([:^foo:]).
	9076	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	9077	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	9078	but trigger failures because they are currently unimplemented. */
	9079
	9080	#define POSIXCC_DONE(c) ((c) == ':') /* delimiter of the implemented [: :] form */
	9081	#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') /* delimiters of the unimplemented [= =] and [. .] forms */
	9082	#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
	9083
	9084	STATIC I32
	9085	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	9086	{
	9087	    dVAR;
	9088	    I32 namedclass = OOB_NAMEDCLASS;
	9089	    /* 'value' is the character just consumed; RExC_parse points at the one after it.  For "[:name:]" this returns the matching ANYOF_* id (the N-prefixed one for "[:^name:]") and leaves RExC_parse just past the closing ']'; an unknown name fails.  "[=...=]" and "[.....]" fail as reserved unless SIZE_ONLY.  If there is no closing delimiter followed by ']', RExC_parse is put back and OOB_NAMEDCLASS is returned. */
	9090	    PERL_ARGS_ASSERT_REGPPOSIXCC;
	9091
	9092	    if (value == '[' && RExC_parse + 1 < RExC_end &&
	9093	        /* I smell either [: or [= or [. -- POSIX has been here, right? */
	9094	        POSIXCC(UCHARAT(RExC_parse))) {
	9095	        const char c = UCHARAT(RExC_parse);
	9096	        char* const s = RExC_parse++;
	9097
	9098	        while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	9099	            RExC_parse++;
	9100	        if (RExC_parse == RExC_end)
	9101	            /* Grandfather lone [:, [=, [. */
	9102	            RExC_parse = s;
	9103	        else {
	9104	            const char* const t = RExC_parse++; /* skip over the c */
	9105	            assert(*t == c);
	9106
	9107	            if (UCHARAT(RExC_parse) == ']') {
	9108	                const char *posixcc = s + 1;
	9109	                RExC_parse++; /* skip over the ending ] */
	9110
	9111	                if (*s == ':') {
	9112	                    const I32 complement = *posixcc == '^' ? *posixcc++ : 0; /* non-zero for [:^name:]; also steps over the '^' */
	9113	                    const I32 skip = t - posixcc;
	9114
	9115	                    /* Initially switch on the length of the name. */
	9116	                    switch (skip) {
	9117	                    case 4:
	9118	                        if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	9119	                            namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
	9120	                        break;
	9121	                    case 5:
	9122	                        /* Names all of length 5. */
	9123	                        /* alnum alpha ascii blank cntrl digit graph lower
	9124	                           print punct space upper */
	9125	                        /* Offset 4 gives the best switch position. */
	9126	                        switch (posixcc[4]) {
	9127	                        case 'a':
	9128	                            if (memEQ(posixcc, "alph", 4)) /* alpha */
	9129	                                namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
	9130	                            break;
	9131	                        case 'e':
	9132	                            if (memEQ(posixcc, "spac", 4)) /* space */
	9133	                                namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
	9134	                            break;
	9135	                        case 'h':
	9136	                            if (memEQ(posixcc, "grap", 4)) /* graph */
	9137	                                namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
	9138	                            break;
	9139	                        case 'i':
	9140	                            if (memEQ(posixcc, "asci", 4)) /* ascii */
	9141	                                namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
	9142	                            break;
	9143	                        case 'k':
	9144	                            if (memEQ(posixcc, "blan", 4)) /* blank */
	9145	                                namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
	9146	                            break;
	9147	                        case 'l':
	9148	                            if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	9149	                                namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
	9150	                            break;
	9151	                        case 'm':
	9152	                            if (memEQ(posixcc, "alnu", 4)) /* alnum */
	9153	                                namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
	9154	                            break;
	9155	                        case 'r':
	9156	                            if (memEQ(posixcc, "lowe", 4)) /* lower */
	9157	                                namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
	9158	                            else if (memEQ(posixcc, "uppe", 4)) /* upper */
	9159	                                namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
	9160	                            break;
	9161	                        case 't':
	9162	                            if (memEQ(posixcc, "digi", 4)) /* digit */
	9163	                                namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
	9164	                            else if (memEQ(posixcc, "prin", 4)) /* print */
	9165	                                namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
	9166	                            else if (memEQ(posixcc, "punc", 4)) /* punct */
	9167	                                namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
	9168	                            break;
	9169	                        }
	9170	                        break;
	9171	                    case 6:
	9172	                        if (memEQ(posixcc, "xdigit", 6))
	9173	                            namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
	9174	                        break;
	9175	                    }
	9176
	9177	                    if (namedclass == OOB_NAMEDCLASS)
	9178	                        Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	9179	                                      (int)(t - s - 1), s + 1); /* the precision for %.*s is read as an int */
	9180	                    assert (posixcc[skip] == ':');
	9181	                    assert (posixcc[skip+1] == ']');
	9182	                } else if (!SIZE_ONLY) {
	9183	                    /* [[=foo=]] and [[.foo.]] are still future. */
	9184
	9185	                    /* adjust RExC_parse so the warning shows after
	9186	                       the class closes */
	9187	                    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	9188	                        RExC_parse++;
	9189	                    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9190	                }
	9191	            } else {
	9192	                /* Maternal grandfather:
	9193	                 * "[:" ending in ":" but not in ":]" */
	9194	                RExC_parse = s;
	9195	            }
	9196	        }
	9197	    }
	9198
	9199	    return namedclass;
	9200	}
	9201
	9202	STATIC void
	9203	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	9204	{
	9205	    dVAR;
	9206	    /* Warn when the text at RExC_parse has the shape of a POSIX class: ':', '=' or '.', then a run of word characters, then the same delimiter and ']'.  The '=' and '.' forms additionally fail as reserved syntax. */
	9207	    PERL_ARGS_ASSERT_CHECKPOSIXCC;
	9208
	9209	    if (POSIXCC(UCHARAT(RExC_parse))) {
	9210	        const char *s = RExC_parse;
	9211	        const char c = *s++;
	9212
	9213	        while (isALNUM(*s))
	9214	            s++;
	9215	        if (*s && c == *s && s[1] == ']') { /* same delimiter again, directly followed by ']' */
	9216	            ckWARN3reg(s+2,
	9217	                       "POSIX syntax [%c %c] belongs inside character classes",
	9218	                       c, c);
	9219
	9220	            /* [[=foo=]] and [[.foo.]] are still future. */
	9221	            if (POSIXCC_NOTYET(c)) {
	9222	                /* adjust RExC_parse so the error shows after
	9223	                   the class closes */
	9224	                while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	9225	                    NOOP;
	9226	                Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	9227	            }
	9228	        }
	9229	    }
	9230	}
	9231
	9232	/* No locale test, and always Unicode semantics.  Expands to the tails of two switch arms (the use site supplies the leading 'case' and the final ';'): ANYOF_<NAME> sets the bitmap bit of every value in 0..255 for which TEST holds, ANYOF_N<NAME> of every value for which it does not.  Relies on 'value', 'stored', 'ret', 'yesno', 'what', 'l1_fold_invlist' and 'unicode_alternate' being in scope where it is expanded.  TEST is parenthesized before being negated so that a compound expression is negated as a whole. */
	9233	#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
	9234	ANYOF_##NAME: \
	9235	    for (value = 0; value < 256; value++) \
	9236	        if (TEST) \
	9237	            stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9238	    yesno = '+'; \
	9239	    what = WORD; \
	9240	    break; \
	9241	case ANYOF_N##NAME: \
	9242	    for (value = 0; value < 256; value++) \
	9243	        if (!(TEST)) \
	9244	            stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9245	    yesno = '!'; \
	9246	    what = WORD; \
	9247	    break
	9248
	9249	/* Like _C_C_T_NOLOC_, but the outcome depends on the semantics in effect, so two tests are passed in: when LOC is true only ANYOF_CLASS_SET is done on the node;
	9250	 * under UNI_SEMANTICS TEST_8 is applied to 0..255; otherwise TEST_7 is applied to 0..127 (and the complemented arm then also deals with 128..255, see below).
	9251	 * There aren't any cases where the label is different from the name, so no need for that
	9252	 * parameter */
	9253	#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
	9254	ANYOF_##NAME: \
	9255	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
	9256	    else if (UNI_SEMANTICS) { \
	9257	        for (value = 0; value < 256; value++) { \
	9258	            if (TEST_8(value)) stored += \
	9259	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9260	        } \
	9261	    } \
	9262	    else { \
	9263	        for (value = 0; value < 128; value++) { \
	9264	            if (TEST_7(UNI_TO_NATIVE(value))) stored += \
	9265	                set_regclass_bit(pRExC_state, ret, \
	9266	                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9267	        } \
	9268	    } \
	9269	    yesno = '+'; \
	9270	    what = WORD; \
	9271	    break; \
	9272	case ANYOF_N##NAME: \
	9273	    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
	9274	    else if (UNI_SEMANTICS) { \
	9275	        for (value = 0; value < 256; value++) { \
	9276	            if (! TEST_8(value)) stored += \
	9277	                set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
	9278	        } \
	9279	    } \
	9280	    else { \
	9281	        for (value = 0; value < 128; value++) { \
	9282	            if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
	9283	                pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9284	        } \
	9285	        if (AT_LEAST_ASCII_RESTRICTED) { \
	9286	            for (value = 128; value < 256; value++) { \
	9287	                stored += set_regclass_bit( \
	9288	                    pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
	9289	            } \
	9290	            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
	9291	        } \
	9292	        else { \
	9293	            /* For a non-utf8 target string with DEPENDS semantics, all above \
	9294	             * ASCII Latin1 code points match the complement of any of the \
	9295	             * classes. But in utf8, they have their Unicode semantics, so \
	9296	             * can't just set them in the bitmap, or else regexec.c will think \
	9297	             * they matched when they shouldn't. */ \
	9298	            ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
	9299	        } \
	9300	    } \
	9301	    yesno = '!'; \
	9302	    what = WORD; \
	9303	    break
	9304
	9305	STATIC U8
	9306	S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode *node, const U8 value, HV **invlist_ptr, AV **alternate_ptr)
	9307	{
	9308
	9309	    /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
	9310	     * Locale folding is done at run-time, so this function should not be
	9311	     * called for nodes that are for locales.
	9312	     *
	9313	     * This function sets the bit corresponding to the fold of the input
	9314	     * 'value', if not already set. The fold of 'f' is 'F', and the fold of
	9315	     * 'F' is 'f'.
	9316	     *
	9317	     * It also knows about the characters that are in the bitmap that have
	9318	     * folds that are matchable only outside it, and sets the appropriate lists
	9319	     * and flags.
	9320	     *
	9321	     * It returns the number of bits that actually changed from 0 to 1 */
	9322	    /* 'node' is the ANYOF node whose bitmap and flags are updated.  '*invlist_ptr' is replaced by the list returned from add_cp_to_invlist() each time a code point is added, so the caller sees the result; 'alternate_ptr' is handed on to add_alternate() for the multi-char fold. */
	9323	    U8 stored = 0;
	9324	    U8 fold;
	9325
	9326	    PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
	9327
	9328	    fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
	9329	                                    : PL_fold[value];
	9330
	9331	    /* It assumes the bit for 'value' has already been set */
	9332	    if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
	9333	        ANYOF_BITMAP_SET(node, fold);
	9334	        stored++;
	9335	    }
	9336	    if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
	9337	        /* Certain Latin1 characters have matches outside the bitmap. To get
	9338	         * here, 'value' is one of those characters. None of these matches is
	9339	         * valid for ASCII characters under /aa, which have been excluded by
	9340	         * the 'if' above. The matches fall into three categories:
	9341	         * 1) They are singly folded-to or -from an above 255 character, as
	9342	         * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
	9343	         * WITH DIAERESIS;
	9344	         * 2) They are part of a multi-char fold with another character in the
	9345	         * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
	9346	         * 3) They are part of a multi-char fold with a character not in the
	9347	         * bitmap, such as various ligatures.
	9348	         * We aren't dealing fully with multi-char folds, except we do deal
	9349	         * with the pattern containing a character that has a multi-char fold
	9350	         * (not so much the inverse).
	9351	         * For types 1) and 3), the matches only happen when the target string
	9352	         * is utf8; that's not true for 2), and we set a flag for it.
	9353	         *
	9354	         * The code below adds to the passed in inversion list the single fold
	9355	         * closures for 'value'. The values are hard-coded here so that an
	9356	         * innocent-looking character class, like /[ks]/i won't have to go out
	9357	         * to disk to find the possible matches. XXX It would be better to
	9358	         * generate these via regen, in case a new version of the Unicode
	9359	         * standard adds new mappings, though that is not really likely. */
	9360	        switch (value) {
	9361	            case 'k':
	9362	            case 'K':
	9363	                /* KELVIN SIGN */
	9364	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
	9365	                break;
	9366	            case 's':
	9367	            case 'S':
	9368	                /* LATIN SMALL LETTER LONG S */
	9369	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
	9370	                break;
	9371	            case MICRO_SIGN:
	9372	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
	9373	                                                 GREEK_SMALL_LETTER_MU);
	9374	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
	9375	                                                 GREEK_CAPITAL_LETTER_MU);
	9376	                break;
	9377	            case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	9378	            case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	9379	                /* ANGSTROM SIGN */
	9380	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
	9381	                if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
	9382	                    *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
	9383	                                                     PL_fold_latin1[value]);
	9384	                }
	9385	                break;
	9386	            case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	9387	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
	9388	                                                 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	9389	                break;
	9390	            case LATIN_SMALL_LETTER_SHARP_S:
	9391	                *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
	9392	                                                 LATIN_CAPITAL_LETTER_SHARP_S);
	9393
	9394	                /* Under /a, /d, and /u, this can match the two chars "ss" */
	9395	                if (! MORE_ASCII_RESTRICTED) {
	9396	                    add_alternate(alternate_ptr, (U8 *) "ss", 2);
	9397
	9398	                    /* And under /u or /a, it can match even if the target is
	9399	                     * not utf8 */
	9400	                    if (AT_LEAST_UNI_SEMANTICS) {
	9401	                        ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
	9402	                    }
	9403	                }
	9404	                break;
	9405	            case 'F': case 'f':
	9406	            case 'I': case 'i':
	9407	            case 'L': case 'l':
	9408	            case 'T': case 't':
	9409	            case 'A': case 'a':
	9410	            case 'H': case 'h':
	9411	            case 'J': case 'j':
	9412	            case 'N': case 'n':
	9413	            case 'W': case 'w':
	9414	            case 'Y': case 'y':
	9415	                /* These all are targets of multi-character folds from code
	9416	                 * points that require UTF8 to express, so they can't match
	9417	                 * unless the target string is in UTF-8, so no action here is
	9418	                 * necessary, as regexec.c properly handles the general case
	9419	                 * for UTF-8 matching */
	9420	                break;
	9421	            default:
	9422	                /* Use deprecated warning to increase the chances of this
	9423	                 * being output */
	9424	                ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
	9425	                break;
	9426	        }
	9427	    }
	9428	    else if (DEPENDS_SEMANTICS
	9429	             && ! isASCII(value)
	9430	             && PL_fold_latin1[value] != value)
	9431	    {
	9432	        /* Under DEPENDS rules, non-ASCII Latin1 characters match their
	9433	         * folds only when the target string is in UTF-8. We add the fold
	9434	         * here to the list of things to match outside the bitmap, which
	9435	         * won't be looked at unless it is UTF8 (or else if something else
	9436	         * says to look even if not utf8, but those things better not happen
	9437	         * under DEPENDS semantics. */
	9438	        *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
	9439	    }
	9440
	9441	    return stored;
	9442	}
	9443
	9444
	9445	PERL_STATIC_INLINE U8
	9446	S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode *node, const U8 value, HV **invlist_ptr, AV **alternate_ptr)
	9447	{
	9448	    /* This inline function sets a bit in the bitmap if not already set, and if
	9449	     * appropriate, its fold, returning the number of bits that actually
	9450	     * changed from 0 to 1 */
	9451	    /* 'invlist_ptr' and 'alternate_ptr' are not used here; they are only forwarded to set_regclass_bit_fold(). */
	9452	    U8 stored;
	9453
	9454	    PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
	9455
	9456	    if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
	9457	        return 0;
	9458	    }
	9459
	9460	    ANYOF_BITMAP_SET(node, value);
	9461	    stored = 1;
	9462
	9463	    if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
	9464	        stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
	9465	    }
	9466
	9467	    return stored;
	9468	}
	9469
	9470	STATIC void
	9471	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	9472	{
	9473	    /* Append the 'len' bytes at 'string' as a new element of the array
	9474	     * '*alternate_ptr', the ANYOF node's list of multi-character folds.
	9475	     * The array is created here the first time something is added. */
	9476	    AV *folds;
	9477	    SV *fold_sv;
	9478
	9479	    PERL_ARGS_ASSERT_ADD_ALTERNATE;
	9480
	9481	    folds = *alternate_ptr;
	9482	    if (folds == NULL)
	9483	        folds = *alternate_ptr = newAV();
	9484	    fold_sv = newSVpvn_utf8((char*)string, len, TRUE);
	9485	    av_push(folds, fold_sv);
	9486	}
	9487
	9488	/*
	9489	parse a class specification and produce either an ANYOF node that
	9490	matches the pattern or perhaps will be optimized into an EXACTish node
	9491	instead. The node contains a bit map for the first 256 characters, with the
	9492	corresponding bit set if that character is in the list. For characters
	9493	above 255, a range list is used */
	9494
	9495	STATIC regnode *
	9496	S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
	9497	{
	9498	dVAR;
	9499	register UV nextvalue;
	9500	register IV prevvalue = OOB_UNICODE;
	9501	register IV range = 0;
	9502	UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
	9503	register regnode *ret;
	9504	STRLEN numlen;
	9505	IV namedclass;
	9506	char *rangebegin = NULL;
	9507	bool need_class = 0;
	9508	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	9509	SV *listsv = NULL;
	9510	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	9511	than just initialized. */
	9512	UV n;
	9513
	9514	/* code points this node matches that can't be stored in the bitmap */
	9515	HV* nonbitmap = NULL;
	9516
	9517	/* The items that are to match that aren't stored in the bitmap, but are a
	9518	* result of things that are stored there. This is the fold closure of
	9519	* such a character, either because it has DEPENDS semantics and shouldn't
	9520	* be matched unless the target string is utf8, or is a code point that is
	9521	* too large for the bit map, as for example, the fold of the MICRO SIGN is
	9522	* above 255. This all is solely for performance reasons. By having this
	9523	* code know the outside-the-bitmap folds that the bitmapped characters are
	9524	* involved with, we don't have to go out to disk to find the list of
	9525	* matches, unless the character class includes code points that aren't
	9526	* storable in the bit map. That means that a character class with an 's'
	9527	* in it, for example, doesn't need to go out to disk to find everything
	9528	* that matches. A 2nd list is used so that the 'nonbitmap' list is kept
	9529	* empty unless there is something whose fold we don't know about, and will
	9530	* have to go out to the disk to find. */
	9531	HV* l1_fold_invlist = NULL;
	9532
	9533	/* List of multi-character folds that are matched by this node */
	9534	AV* unicode_alternate = NULL;
	9535	#ifdef EBCDIC
	9536	UV literal_endpoint = 0;
	9537	#endif
	9538	UV stored = 0; /* how many chars stored in the bitmap */
	9539
	9540	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	9541	case we need to change the emitted regop to an EXACT. */
	9542	const char * orig_parse = RExC_parse;
	9543	GET_RE_DEBUG_FLAGS_DECL;
	9544
	9545	PERL_ARGS_ASSERT_REGCLASS;
	9546	#ifndef DEBUGGING
	9547	PERL_UNUSED_ARG(depth);
	9548	#endif
	9549
	9550	DEBUG_PARSE("clas");
	9551
	9552	/* Assume we are going to generate an ANYOF node. */
	9553	ret = reganode(pRExC_state, ANYOF, 0);
	9554
	9555
	9556	if (!SIZE_ONLY) {
	9557	ANYOF_FLAGS(ret) = 0;
	9558	}
	9559
	9560	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	9561	RExC_naughty++;
	9562	RExC_parse++;
	9563	if (!SIZE_ONLY)
	9564	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	9565
	9566	/* We have decided to not allow multi-char folds in inverted character
	9567	* classes, due to the confusion that can happen, especially with
	9568	* classes that are designed for a non-Unicode world: You have the
	9569	* peculiar case that:
	9570	"s s" =~ /^[^\xDF]+$/i => Y
	9571	"ss" =~ /^[^\xDF]+$/i => N
	9572	*
	9573	* See [perl #89750] */
	9574	allow_full_fold = FALSE;
	9575	}
	9576
	9577	if (SIZE_ONLY) {
	9578	RExC_size += ANYOF_SKIP;
	9579	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	9580	}
	9581	else {
	9582	RExC_emit += ANYOF_SKIP;
	9583	if (LOC) {
	9584	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	9585	}
	9586	ANYOF_BITMAP_ZERO(ret);
	9587	listsv = newSVpvs("# comment\n");
	9588	initial_listsv_len = SvCUR(listsv);
	9589	}
	9590
	9591	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	9592
	9593	if (!SIZE_ONLY && POSIXCC(nextvalue))
	9594	checkposixcc(pRExC_state);
	9595
	9596	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	9597	if (UCHARAT(RExC_parse) == ']')
	9598	goto charclassloop;
	9599
	9600	parseit:
	9601	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	9602
	9603	charclassloop:
	9604
	9605	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	9606
	9607	if (!range)
	9608	rangebegin = RExC_parse;
	9609	if (UTF) {
	9610	value = utf8n_to_uvchr((U8*)RExC_parse,
	9611	RExC_end - RExC_parse,
	9612	&numlen, UTF8_ALLOW_DEFAULT);
	9613	RExC_parse += numlen;
	9614	}
	9615	else
	9616	value = UCHARAT(RExC_parse++);
	9617
	9618	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	9619	if (value == '[' && POSIXCC(nextvalue))
	9620	namedclass = regpposixcc(pRExC_state, value);
	9621	else if (value == '\\') {
	9622	if (UTF) {
	9623	value = utf8n_to_uvchr((U8*)RExC_parse,
	9624	RExC_end - RExC_parse,
	9625	&numlen, UTF8_ALLOW_DEFAULT);
	9626	RExC_parse += numlen;
	9627	}
	9628	else
	9629	value = UCHARAT(RExC_parse++);
	9630	/* Some compilers cannot handle switching on 64-bit integer
	9631	* values, therefore value cannot be an UV. Yes, this will
	9632	* be a problem later if we want switch on Unicode.
	9633	* A similar issue a little bit later when switching on
	9634	* namedclass. --jhi */
	9635	switch ((I32)value) {
	9636	case 'w': namedclass = ANYOF_ALNUM; break;
	9637	case 'W': namedclass = ANYOF_NALNUM; break;
	9638	case 's': namedclass = ANYOF_SPACE; break;
	9639	case 'S': namedclass = ANYOF_NSPACE; break;
	9640	case 'd': namedclass = ANYOF_DIGIT; break;
	9641	case 'D': namedclass = ANYOF_NDIGIT; break;
	9642	case 'v': namedclass = ANYOF_VERTWS; break;
	9643	case 'V': namedclass = ANYOF_NVERTWS; break;
	9644	case 'h': namedclass = ANYOF_HORIZWS; break;
	9645	case 'H': namedclass = ANYOF_NHORIZWS; break;
	9646	case 'N': /* Handle \N{NAME} in class */
	9647	{
	9648	/* We only pay attention to the first char of
	9649	multichar strings being returned. I kinda wonder
	9650	if this makes sense as it does change the behaviour
	9651	from earlier versions, OTOH that behaviour was broken
	9652	as well. */
	9653	UV v; /* value is register so we cant & it /grrr */
	9654	if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
	9655	goto parseit;
	9656	}
	9657	value= v;
	9658	}
	9659	break;
	9660	case 'p':
	9661	case 'P':
	9662	{
	9663	char *e;
	9664	if (RExC_parse >= RExC_end)
	9665	vFAIL2("Empty \\%c{}", (U8)value);
	9666	if (*RExC_parse == '{') {
	9667	const U8 c = (U8)value;
	9668	e = strchr(RExC_parse++, '}');
	9669	if (!e)
	9670	vFAIL2("Missing right brace on \\%c{}", c);
	9671	while (isSPACE(UCHARAT(RExC_parse)))
	9672	RExC_parse++;
	9673	if (e == RExC_parse)
	9674	vFAIL2("Empty \\%c{}", c);
	9675	n = e - RExC_parse;
	9676	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	9677	n--;
	9678	}
	9679	else {
	9680	e = RExC_parse;
	9681	n = 1;
	9682	}
	9683	if (!SIZE_ONLY) {
	9684	if (UCHARAT(RExC_parse) == '^') {
	9685	RExC_parse++;
	9686	n--;
	9687	value = value == 'p' ? 'P' : 'p'; /* toggle */
	9688	while (isSPACE(UCHARAT(RExC_parse))) {
	9689	RExC_parse++;
	9690	n--;
	9691	}
	9692	}
	9693
	9694	/* Add the property name to the list. If /i matching, give
	9695	* a different name which consists of the normal name
	9696	* sandwiched between two underscores and '_i'. The design
	9697	* is discussed in the commit message for this. */
	9698	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
	9699	(value=='p' ? '+' : '!'),
	9700	(FOLD) ? "__" : "",
	9701	(int)n,
	9702	RExC_parse,
	9703	(FOLD) ? "_i" : ""
	9704	);
	9705	}
	9706	RExC_parse = e + 1;
	9707
	9708	/* The \p could match something in the Latin1 range, hence
	9709	* something that isn't utf8 */
	9710	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	9711	namedclass = ANYOF_MAX; /* no official name, but it's named */
	9712
	9713	/* \p means they want Unicode semantics */
	9714	RExC_uni_semantics = 1;
	9715	}
	9716	break;
	9717	case 'n': value = '\n'; break;
	9718	case 'r': value = '\r'; break;
	9719	case 't': value = '\t'; break;
	9720	case 'f': value = '\f'; break;
	9721	case 'b': value = '\b'; break;
	9722	case 'e': value = ASCII_TO_NATIVE('\033');break;
	9723	case 'a': value = ASCII_TO_NATIVE('\007');break;
	9724	case 'o':
	9725	RExC_parse--; /* function expects to be pointed at the 'o' */
	9726	{
	9727	const char* error_msg;
	9728	bool valid = grok_bslash_o(RExC_parse,
	9729	&value,
	9730	&numlen,
	9731	&error_msg,
	9732	SIZE_ONLY);
	9733	RExC_parse += numlen;
	9734	if (! valid) {
	9735	vFAIL(error_msg);
	9736	}
	9737	}
	9738	if (PL_encoding && value < 0x100) {
	9739	goto recode_encoding;
	9740	}
	9741	break;
	9742	case 'x':
	9743	if (*RExC_parse == '{') {
	9744	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
	9745	\| PERL_SCAN_DISALLOW_PREFIX;
	9746	char * const e = strchr(RExC_parse++, '}');
	9747	if (!e)
	9748	vFAIL("Missing right brace on \\x{}");
	9749
	9750	numlen = e - RExC_parse;
	9751	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	9752	RExC_parse = e + 1;
	9753	}
	9754	else {
	9755	I32 flags = PERL_SCAN_DISALLOW_PREFIX;
	9756	numlen = 2;
	9757	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	9758	RExC_parse += numlen;
	9759	}
	9760	if (PL_encoding && value < 0x100)
	9761	goto recode_encoding;
	9762	break;
	9763	case 'c':
	9764	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	9765	break;
	9766	case '0': case '1': case '2': case '3': case '4':
	9767	case '5': case '6': case '7':
	9768	{
	9769	/* Take 1-3 octal digits */
	9770	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	9771	numlen = 3;
	9772	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	9773	RExC_parse += numlen;
	9774	if (PL_encoding && value < 0x100)
	9775	goto recode_encoding;
	9776	break;
	9777	}
	9778	recode_encoding:
	9779	if (! RExC_override_recoding) {
	9780	SV* enc = PL_encoding;
	9781	value = reg_recode((const char)(U8)value, &enc);
	9782	if (!enc && SIZE_ONLY)
	9783	ckWARNreg(RExC_parse,
	9784	"Invalid escape in the specified encoding");
	9785	break;
	9786	}
	9787	default:
	9788	/* Allow \_ to not give an error */
	9789	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	9790	ckWARN2reg(RExC_parse,
	9791	"Unrecognized escape \\%c in character class passed through",
	9792	(int)value);
	9793	}
	9794	break;
	9795	}
	9796	} /* end of \blah */
	9797	#ifdef EBCDIC
	9798	else
	9799	literal_endpoint++;
	9800	#endif
	9801
	9802	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	9803
	9804	/* What matches in a locale is not known until runtime, so need to
	9805	* (one time per class) allocate extra space to pass to regexec.
	9806	* The space will contain a bit for each named class that is to be
	9807	* matched against. This isn't needed for \p{} and pseudo-classes,
	9808	* as they are not affected by locale, and hence are dealt with
	9809	* separately */
	9810	if (LOC && namedclass < ANYOF_MAX && ! need_class) {
	9811	need_class = 1;
	9812	if (SIZE_ONLY) {
	9813	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	9814	}
	9815	else {
	9816	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	9817	ANYOF_CLASS_ZERO(ret);
	9818	}
	9819	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	9820	}
	9821
	9822	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	9823	* literal, as is the character that began the false range, i.e.
	9824	* the 'a' in the examples */
	9825	if (range) {
	9826	if (!SIZE_ONLY) {
	9827	const int w =
	9828	RExC_parse >= rangebegin ?
	9829	RExC_parse - rangebegin : 0;
	9830	ckWARN4reg(RExC_parse,
	9831	"False [] range \"%.s\"",
	9832	w, w, rangebegin);
	9833
	9834	stored +=
	9835	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	9836	if (prevvalue < 256) {
	9837	stored +=
	9838	set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
	9839	}
	9840	else {
	9841	nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
	9842	}
	9843	}
	9844
	9845	range = 0; /* this was not a true range */
	9846	}
	9847
	9848
	9849
	9850	if (!SIZE_ONLY) {
	9851	const char *what = NULL;
	9852	char yesno = 0;
	9853
	9854	/* Possible truncation here but in some 64-bit environments
	9855	* the compiler gets heartburn about switch on 64-bit values.
	9856	* A similar issue a little earlier when switching on value.
	9857	* --jhi */
	9858	switch ((I32)namedclass) {
	9859
	9860	case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
	9861	case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
	9862	case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
	9863	case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
	9864	case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
	9865	case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
	9866	case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
	9867	case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
	9868	case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
	9869	case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
	9870	/* \s, \w match all unicode if utf8. */
	9871	case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
	9872	case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
	9873	case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
	9874	case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
	9875	case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
	9876	case ANYOF_ASCII:
	9877	if (LOC)
	9878	ANYOF_CLASS_SET(ret, ANYOF_ASCII);
	9879	else {
	9880	for (value = 0; value < 128; value++)
	9881	stored +=
	9882	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	9883	}
	9884	yesno = '+';
	9885	what = NULL; /* Doesn't match outside ascii, so
	9886	don't want to add +utf8:: */
	9887	break;
	9888	case ANYOF_NASCII:
	9889	if (LOC)
	9890	ANYOF_CLASS_SET(ret, ANYOF_NASCII);
	9891	else {
	9892	for (value = 128; value < 256; value++)
	9893	stored +=
	9894	set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
	9895	}
	9896	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	9897	yesno = '!';
	9898	what = "ASCII";
	9899	break;
	9900	case ANYOF_DIGIT:
	9901	if (LOC)
	9902	ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
	9903	else {
	9904	/* consecutive digits assumed */
	9905	for (value = '0'; value <= '9'; value++)
	9906	stored +=
	9907	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9908	}
	9909	yesno = '+';
	9910	what = "Digit";
	9911	break;
	9912	case ANYOF_NDIGIT:
	9913	if (LOC)
	9914	ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
	9915	else {
	9916	/* consecutive digits assumed */
	9917	for (value = 0; value < '0'; value++)
	9918	stored +=
	9919	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9920	for (value = '9' + 1; value < 256; value++)
	9921	stored +=
	9922	set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
	9923	}
	9924	yesno = '!';
	9925	what = "Digit";
	9926	if (AT_LEAST_ASCII_RESTRICTED ) {
	9927	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	9928	}
	9929	break;
	9930	case ANYOF_MAX:
	9931	/* this is to handle \p and \P */
	9932	break;
	9933	default:
	9934	vFAIL("Invalid [::] class");
	9935	break;
	9936	}
	9937	if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
	9938	/* Strings such as "+utf8::isWord\n" */
	9939	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
	9940	}
	9941
	9942	continue;
	9943	}
	9944	} /* end of namedclass \blah */
	9945
	9946	if (range) {
	9947	if (prevvalue > (IV)value) /* b-a */ {
	9948	const int w = RExC_parse - rangebegin;
	9949	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	9950	range = 0; /* not a valid range */
	9951	}
	9952	}
	9953	else {
	9954	prevvalue = value; /* save the beginning of the range */
	9955	if (RExC_parse+1 < RExC_end
	9956	&& *RExC_parse == '-'
	9957	&& RExC_parse[1] != ']')
	9958	{
	9959	RExC_parse++;
	9960
	9961	/* a bad range like \w-, [:word:]- ? */
	9962	if (namedclass > OOB_NAMEDCLASS) {
	9963	if (ckWARN(WARN_REGEXP)) {
	9964	const int w =
	9965	RExC_parse >= rangebegin ?
	9966	RExC_parse - rangebegin : 0;
	9967	vWARN4(RExC_parse,
	9968	"False [] range \"%.s\"",
	9969	w, w, rangebegin);
	9970	}
	9971	if (!SIZE_ONLY)
	9972	stored +=
	9973	set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
	9974	} else
	9975	range = 1; /* yeah, it's a range! */
	9976	continue; /* but do it the next time */
	9977	}
	9978	}
	9979
	9980	/* non-Latin1 code point implies unicode semantics. Must be set in
	9981	* pass1 so is there for the whole of pass 2 */
	9982	if (value > 255) {
	9983	RExC_uni_semantics = 1;
	9984	}
	9985
	9986	/* now is the next time */
	9987	if (!SIZE_ONLY) {
	9988	if (prevvalue < 256) {
	9989	const IV ceilvalue = value < 256 ? value : 255;
	9990	IV i;
	9991	#ifdef EBCDIC
	9992	/* In EBCDIC [\x89-\x91] should include
	9993	* the \x8e but [i-j] should not. */
	9994	if (literal_endpoint == 2 &&
	9995	((isLOWER(prevvalue) && isLOWER(ceilvalue)) \|\|
	9996	(isUPPER(prevvalue) && isUPPER(ceilvalue))))
	9997	{
	9998	if (isLOWER(prevvalue)) {
	9999	for (i = prevvalue; i <= ceilvalue; i++)
	10000	if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10001	stored +=
	10002	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10003	}
	10004	} else {
	10005	for (i = prevvalue; i <= ceilvalue; i++)
	10006	if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
	10007	stored +=
	10008	set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10009	}
	10010	}
	10011	}
	10012	else
	10013	#endif
	10014	for (i = prevvalue; i <= ceilvalue; i++) {
	10015	stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
	10016	}
	10017	}
	10018	if (value > 255) {
	10019	const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
	10020	const UV natvalue = NATIVE_TO_UNI(value);
	10021	nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
	10022	}
	10023	#ifdef EBCDIC
	10024	literal_endpoint = 0;
	10025	#endif
	10026	}
	10027
	10028	range = 0; /* this range (if it was one) is done now */
	10029	}
	10030
	10031
	10032
	10033	if (SIZE_ONLY)
	10034	return ret;
	10035	/**** !SIZE_ONLY AFTER HERE *******/
	10036
	10037	/* If folding and there are code points above 255, we calculate all
	10038	* characters that could fold to or from the ones already on the list */
	10039	if (FOLD && nonbitmap) {
	10040	UV i;
	10041
	10042	HV* fold_intersection;
	10043	UV* fold_list;
	10044
	10045	/* This is a list of all the characters that participate in folds
	10046	* (except marks, etc in multi-char folds */
	10047	if (! PL_utf8_foldable) {
	10048	SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
	10049	PL_utf8_foldable = _swash_to_invlist(swash);
	10050	}
	10051
	10052	/* This is a hash that for a particular fold gives all characters
	10053	* that are involved in it */
	10054	if (! PL_utf8_foldclosures) {
	10055
	10056	/* If we were unable to find any folds, then we likely won't be
	10057	* able to find the closures. So just create an empty list.
	10058	* Folding will effectively be restricted to the non-Unicode rules
	10059	* hard-coded into Perl. (This case happens legitimately during
	10060	* compilation of Perl itself before the Unicode tables are
	10061	* generated) */
	10062	if (invlist_len(PL_utf8_foldable) == 0) {
	10063	PL_utf8_foldclosures = _new_invlist(0);
	10064	} else {
	10065	/* If the folds haven't been read in, call a fold function
	10066	* to force that */
	10067	if (! PL_utf8_tofold) {
	10068	U8 dummy[UTF8_MAXBYTES+1];
	10069	STRLEN dummy_len;
	10070	to_utf8_fold((U8*) "A", dummy, &dummy_len);
	10071	}
	10072	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	10073	}
	10074	}
	10075
	10076	/* Only the characters in this class that participate in folds need
	10077	* be checked. Get the intersection of this class and all the
	10078	* possible characters that are foldable. This can quickly narrow
	10079	* down a large class */
	10080	fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
	10081
	10082	/* Now look at the foldable characters in this class individually */
	10083	fold_list = invlist_array(fold_intersection);
	10084	for (i = 0; i < invlist_len(fold_intersection); i++) {
	10085	UV j;
	10086
	10087	/* The next entry is the beginning of the range that is in the
	10088	* class */
	10089	UV start = fold_list[i++];
	10090
	10091
	10092	/* The next entry is the beginning of the next range, which
	10093	* isn't in the class, so the end of the current range is one
	10094	* less than that */
	10095	UV end = fold_list[i] - 1;
	10096
	10097	/* Look at every character in the range */
	10098	for (j = start; j <= end; j++) {
	10099
	10100	/* Get its fold */
	10101	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	10102	STRLEN foldlen;
	10103	const UV f =
	10104	_to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
	10105
	10106	if (foldlen > (STRLEN)UNISKIP(f)) {
	10107
	10108	/* Any multicharacter foldings (disallowed in
	10109	* lookbehind patterns) require the following
	10110	* transform: [ABCDEF] -> (?:[ABCabcDEFd]\|pq\|rst) where
	10111	* E folds into "pq" and F folds into "rst", all other
	10112	* characters fold to single characters. We save away
	10113	* these multicharacter foldings, to be later saved as
	10114	* part of the additional "s" data. */
	10115	if (! RExC_in_lookbehind) {
	10116	U8* loc = foldbuf;
	10117	U8* e = foldbuf + foldlen;
	10118
	10119	/* If any of the folded characters of this are in
	10120	* the Latin1 range, tell the regex engine that
	10121	* this can match a non-utf8 target string. The
	10122	* only multi-byte fold whose source is in the
	10123	* Latin1 range (U+00DF) applies only when the
	10124	* target string is utf8, or under unicode rules */
	10125	if (j > 255 \|\| AT_LEAST_UNI_SEMANTICS) {
	10126	while (loc < e) {
	10127
	10128	/* Can't mix ascii with non- under /aa */
	10129	if (MORE_ASCII_RESTRICTED
	10130	&& (isASCII(*loc) != isASCII(j)))
	10131	{
	10132	goto end_multi_fold;
	10133	}
	10134	if (UTF8_IS_INVARIANT(*loc)
	10135	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	10136	{
	10137	/* Can't mix above and below 256 under
	10138	* LOC */
	10139	if (LOC) {
	10140	goto end_multi_fold;
	10141	}
	10142	ANYOF_FLAGS(ret)
	10143	\|= ANYOF_NONBITMAP_NON_UTF8;
	10144	break;
	10145	}
	10146	loc += UTF8SKIP(loc);
	10147	}
	10148	}
	10149
	10150	add_alternate(&unicode_alternate, foldbuf, foldlen);
	10151	end_multi_fold: ;
	10152	}
	10153
	10154	/* This is special-cased, as it is the only letter which
	10155	* has both a multi-fold and single-fold in Latin1. All
	10156	* the other chars that have single and multi-folds are
	10157	* always in utf8, and the utf8 folding algorithm catches
	10158	* them */
	10159	if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
	10160	stored += set_regclass_bit(pRExC_state,
	10161	ret,
	10162	LATIN_SMALL_LETTER_SHARP_S,
	10163	&l1_fold_invlist, &unicode_alternate);
	10164	}
	10165	}
	10166	else {
	10167	/* Single character fold. Add everything in its fold
	10168	* closure to the list that this node should match */
	10169	SV** listp;
	10170
	10171	/* The fold closures data structure is a hash with the
	10172	* keys being every character that is folded to, like
	10173	* 'k', and the values each an array of everything that
	10174	* folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	10175	if ((listp = hv_fetch(PL_utf8_foldclosures,
	10176	(char *) foldbuf, foldlen, FALSE)))
	10177	{
	10178	AV* list = (AV) listp;
	10179	IV k;
	10180	for (k = 0; k <= av_len(list); k++) {
	10181	SV** c_p = av_fetch(list, k, FALSE);
	10182	UV c;
	10183	if (c_p == NULL) {
	10184	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	10185	}
	10186	c = SvUV(*c_p);
	10187
	10188	/* /aa doesn't allow folds between ASCII and
	10189	* non-; /l doesn't allow them between above
	10190	* and below 256 */
	10191	if ((MORE_ASCII_RESTRICTED
	10192	&& (isASCII(c) != isASCII(j)))
	10193	\|\| (LOC && ((c < 256) != (j < 256))))
	10194	{
	10195	continue;
	10196	}
	10197
	10198	if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
	10199	stored += set_regclass_bit(pRExC_state,
	10200	ret,
	10201	(U8) c,
	10202	&l1_fold_invlist, &unicode_alternate);
	10203	}
	10204	/* It may be that the code point is already
	10205	* in this range or already in the bitmap,
	10206	* in which case we need do nothing */
	10207	else if ((c < start \|\| c > end)
	10208	&& (c > 255
	10209	\|\| ! ANYOF_BITMAP_TEST(ret, c)))
	10210	{
	10211	nonbitmap = add_cp_to_invlist(nonbitmap, c);
	10212	}
	10213	}
	10214	}
	10215	}
	10216	}
	10217	}
	10218	invlist_destroy(fold_intersection);
	10219	}
	10220
	10221	/* Combine the two lists into one. */
	10222	if (l1_fold_invlist) {
	10223	if (nonbitmap) {
	10224	HV* temp = invlist_union(nonbitmap, l1_fold_invlist);
	10225	invlist_destroy(nonbitmap);
	10226	nonbitmap = temp;
	10227	invlist_destroy(l1_fold_invlist);
	10228	}
	10229	else {
	10230	nonbitmap = l1_fold_invlist;
	10231	}
	10232	}
	10233
	10234	/* Here, we have calculated what code points should be in the character
	10235	* class. Now we can see about various optimizations. Fold calculation
	10236	* needs to take place before inversion. Otherwise /[^k]/i would invert to
	10237	* include K, which under /i would match k. */
	10238
	10239	/* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
	10240	* set the FOLD flag yet, so this this does optimize those. It doesn't
	10241	* optimize locale. Doing so perhaps could be done as long as there is
	10242	* nothing like \w in it; some thought also would have to be given to the
	10243	* interaction with above 0x100 chars */
	10244	if (! LOC
	10245	&& (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
	10246	&& ! unicode_alternate
	10247	&& ! nonbitmap
	10248	&& SvCUR(listsv) == initial_listsv_len)
	10249	{
	10250	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
	10251	ANYOF_BITMAP(ret)[value] ^= 0xFF;
	10252	stored = 256 - stored;
	10253
	10254	/* The inversion means that everything above 255 is matched; and at the
	10255	* same time we clear the invert flag */
	10256	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
	10257	}
	10258
	10259	/* Folding in the bitmap is taken care of above, but not for locale (for
	10260	* which we have to wait to see what folding is in effect at runtime), and
	10261	* for things not in the bitmap. Set run-time fold flag for these */
	10262	if (FOLD && (LOC \|\| nonbitmap \|\| unicode_alternate)) {
	10263	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	10264	}
	10265
	10266	/* A single character class can be "optimized" into an EXACTish node.
	10267	* Note that since we don't currently count how many characters there are
	10268	* outside the bitmap, we are XXX missing optimization possibilities for
	10269	* them. This optimization can't happen unless this is a truly single
	10270	* character class, which means that it can't be an inversion into a
	10271	* many-character class, and there must be no possibility of there being
	10272	* things outside the bitmap. 'stored' (only) for locales doesn't include
	10273	* \w, etc, so have to make a special test that they aren't present
	10274	*
	10275	* Similarly A 2-character class of the very special form like [bB] can be
	10276	* optimized into an EXACTFish node, but only for non-locales, and for
	10277	* characters which only have the two folds; so things like 'fF' and 'Ii'
	10278	* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
	10279	* FI'. */
	10280	if (! nonbitmap
	10281	&& ! unicode_alternate
	10282	&& SvCUR(listsv) == initial_listsv_len
	10283	&& ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT\|ANYOF_UNICODE_ALL))
	10284	&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10285	\|\| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
	10286	\|\| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
	10287	&& (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
	10288	/* If the latest code point has a fold whose
	10289	* bit is set, it must be the only other one */
	10290	&& ((prevvalue = PL_fold_latin1[value]) != (IV)value)
	10291	&& ANYOF_BITMAP_TEST(ret, prevvalue)))))
	10292	{
	10293	/* Note that the information needed to decide to do this optimization
	10294	* is not currently available until the 2nd pass, and that the actually
	10295	* used EXACTish node takes less space than the calculated ANYOF node,
	10296	* and hence the amount of space calculated in the first pass is larger
	10297	* than actually used, so this optimization doesn't gain us any space.
	10298	* But an EXACT node is faster than an ANYOF node, and can be combined
	10299	* with any adjacent EXACT nodes later by the optimizer for further
	10300	* gains. The speed of executing an EXACTF is similar to an ANYOF
	10301	* node, so the optimization advantage comes from the ability to join
	10302	* it to adjacent EXACT nodes */
	10303
	10304	const char * cur_parse= RExC_parse;
	10305	U8 op;
	10306	RExC_emit = (regnode *)orig_emit;
	10307	RExC_parse = (char *)orig_parse;
	10308
	10309	if (stored == 1) {
	10310
	10311	/* A locale node with one point can be folded; all the other cases
	10312	* with folding will have two points, since we calculate them above
	10313	*/
	10314	if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
	10315	op = EXACTFL;
	10316	}
	10317	else {
	10318	op = EXACT;
	10319	}
	10320	} /* else 2 chars in the bit map: the folds of each other */
	10321	else if (AT_LEAST_UNI_SEMANTICS \|\| !isASCII(value)) {
	10322
	10323	/* To join adjacent nodes, they must be the exact EXACTish type.
	10324	* Try to use the most likely type, by using EXACTFU if the regex
	10325	* calls for them, or is required because the character is
	10326	* non-ASCII */
	10327	op = EXACTFU;
	10328	}
	10329	else { /* Otherwise, more likely to be EXACTF type */
	10330	op = EXACTF;
	10331	}
	10332
	10333	ret = reg_node(pRExC_state, op);
	10334	RExC_parse = (char *)cur_parse;
	10335	if (UTF && ! NATIVE_IS_INVARIANT(value)) {
	10336	*STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
	10337	*(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
	10338	STR_LEN(ret)= 2;
	10339	RExC_emit += STR_SZ(2);
	10340	}
	10341	else {
	10342	*STRING(ret)= (char)value;
	10343	STR_LEN(ret)= 1;
	10344	RExC_emit += STR_SZ(1);
	10345	}
	10346	SvREFCNT_dec(listsv);
	10347	return ret;
	10348	}
	10349
	10350	if (nonbitmap) {
	10351	UV* nonbitmap_array = invlist_array(nonbitmap);
	10352	UV nonbitmap_len = invlist_len(nonbitmap);
	10353	UV i;
	10354
	10355	/* Here have the full list of items to match that aren't in the
	10356	* bitmap. Convert to the structure that the rest of the code is
	10357	* expecting. XXX That rest of the code should convert to this
	10358	* structure */
	10359	for (i = 0; i < nonbitmap_len; i++) {
	10360
	10361	/* The next entry is the beginning of the range that is in the
	10362	* class */
	10363	UV start = nonbitmap_array[i++];
	10364	UV end;
	10365
	10366	/* The next entry is the beginning of the next range, which isn't
	10367	* in the class, so the end of the current range is one less than
	10368	* that. But if there is no next range, it means that the range
	10369	* begun by 'start' extends to infinity, which for this platform
	10370	* ends at UV_MAX */
	10371	if (i == nonbitmap_len) {
	10372	end = UV_MAX;
	10373	}
	10374	else {
	10375	end = nonbitmap_array[i] - 1;
	10376	}
	10377
	10378	if (start == end) {
	10379	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
	10380	}
	10381	else {
	10382	/* The \t sets the whole range */
	10383	Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
	10384	/* XXX EBCDIC */
	10385	start, end);
	10386	}
	10387	}
	10388	invlist_destroy(nonbitmap);
	10389	}
	10390
	10391	if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
	10392	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	10393	SvREFCNT_dec(listsv);
	10394	SvREFCNT_dec(unicode_alternate);
	10395	}
	10396	else {
	10397
	10398	AV * const av = newAV();
	10399	SV *rv;
	10400	/* The 0th element stores the character class description
	10401	* in its textual form: used later (regexec.c:Perl_regclass_swash())
	10402	* to initialize the appropriate swash (which gets stored in
	10403	* the 1st element), and also useful for dumping the regnode.
	10404	* The 2nd element stores the multicharacter foldings,
	10405	* used later (regexec.c:S_reginclass()). */
	10406	av_store(av, 0, listsv);
	10407	av_store(av, 1, NULL);
	10408
	10409	/* Store any computed multi-char folds only if we are allowing
	10410	* them */
	10411	if (allow_full_fold) {
	10412	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	10413	if (unicode_alternate) { /* This node is variable length */
	10414	OP(ret) = ANYOFV;
	10415	}
	10416	}
	10417	else {
	10418	av_store(av, 2, NULL);
	10419	}
	10420	rv = newRV_noinc(MUTABLE_SV(av));
	10421	n = add_data(pRExC_state, 1, "s");
	10422	RExC_rxi->data->data[n] = (void*)rv;
	10423	ARG_SET(ret, n);
	10424	}
	10425	return ret;
	10426	}
	10427	#undef _C_C_T_
	10428
	10429
	10430	/* reg_skipcomment()
	10431
	10432	Absorbs an /x style # comments from the input stream.
	10433	Returns true if there is more text remaining in the stream.
	10434	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	10435	terminates the pattern without including a newline.
	10436
	10437	Note its the callers responsibility to ensure that we are
	10438	actually in /x mode
	10439
	10440	*/
	10441
	10442	STATIC bool
	10443	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	10444	{
	10445	bool ended = 0;
	10446
	10447	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	10448
	10449	while (RExC_parse < RExC_end)
	10450	if (*RExC_parse++ == '\n') {
	10451	ended = 1;
	10452	break;
	10453	}
	10454	if (!ended) {
	10455	/* we ran off the end of the pattern without ending
	10456	the comment, so we have to add an \n when wrapping */
	10457	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	10458	return 0;
	10459	} else
	10460	return 1;
	10461	}
	10462
	10463	/* nextchar()
	10464
	10465	Advances the parse position, and optionally absorbs
	10466	"whitespace" from the inputstream.
	10467
	10468	Without /x "whitespace" means (?#...) style comments only,
	10469	with /x this means (?#...) and # comments and whitespace proper.
	10470
	10471	Returns the RExC_parse point from BEFORE the scan occurs.
	10472
	10473	This is the /x friendly way of saying RExC_parse++.
	10474	*/
	10475
	10476	STATIC char*
	10477	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	10478	{
	10479	char* const retval = RExC_parse++;
	10480
	10481	PERL_ARGS_ASSERT_NEXTCHAR;
	10482
	10483	for (;;) {
	10484	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
	10485	RExC_parse[2] == '#') {
	10486	while (*RExC_parse != ')') {
	10487	if (RExC_parse == RExC_end)
	10488	FAIL("Sequence (?#... not terminated");
	10489	RExC_parse++;
	10490	}
	10491	RExC_parse++;
	10492	continue;
	10493	}
	10494	if (RExC_flags & RXf_PMf_EXTENDED) {
	10495	if (isSPACE(*RExC_parse)) {
	10496	RExC_parse++;
	10497	continue;
	10498	}
	10499	else if (*RExC_parse == '#') {
	10500	if ( reg_skipcomment( pRExC_state ) )
	10501	continue;
	10502	}
	10503	}
	10504	return retval;
	10505	}
	10506	}
	10507
	10508	/*
	10509	- reg_node - emit a node
	10510	*/
	10511	STATIC regnode * /* Location. */
	10512	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	10513	{
	10514	dVAR;
	10515	register regnode *ptr;
	10516	regnode * const ret = RExC_emit;
	10517	GET_RE_DEBUG_FLAGS_DECL;
	10518
	10519	PERL_ARGS_ASSERT_REG_NODE;
	10520
	10521	if (SIZE_ONLY) {
	10522	SIZE_ALIGN(RExC_size);
	10523	RExC_size += 1;
	10524	return(ret);
	10525	}
	10526	if (RExC_emit >= RExC_emit_bound)
	10527	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10528
	10529	NODE_ALIGN_FILL(ret);
	10530	ptr = ret;
	10531	FILL_ADVANCE_NODE(ptr, op);
	10532	#ifdef RE_TRACK_PATTERN_OFFSETS
	10533	if (RExC_offsets) { /* MJD */
	10534	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	10535	"reg_node", __LINE__,
	10536	PL_reg_name[op],
	10537	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	10538	? "Overwriting end of array!\n" : "OK",
	10539	(UV)(RExC_emit - RExC_emit_start),
	10540	(UV)(RExC_parse - RExC_start),
	10541	(UV)RExC_offsets[0]));
	10542	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	10543	}
	10544	#endif
	10545	RExC_emit = ptr;
	10546	return(ret);
	10547	}
	10548
	10549	/*
	10550	- reganode - emit a node with an argument
	10551	*/
	10552	STATIC regnode * /* Location. */
	10553	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	10554	{
	10555	dVAR;
	10556	register regnode *ptr;
	10557	regnode * const ret = RExC_emit;
	10558	GET_RE_DEBUG_FLAGS_DECL;
	10559
	10560	PERL_ARGS_ASSERT_REGANODE;
	10561
	10562	if (SIZE_ONLY) {
	10563	SIZE_ALIGN(RExC_size);
	10564	RExC_size += 2;
	10565	/*
	10566	We can't do this:
	10567
	10568	assert(2==regarglen[op]+1);
	10569
	10570	Anything larger than this has to allocate the extra amount.
	10571	If we changed this to be:
	10572
	10573	RExC_size += (1 + regarglen[op]);
	10574
	10575	then it wouldn't matter. Its not clear what side effect
	10576	might come from that so its not done so far.
	10577	-- dmq
	10578	*/
	10579	return(ret);
	10580	}
	10581	if (RExC_emit >= RExC_emit_bound)
	10582	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
	10583
	10584	NODE_ALIGN_FILL(ret);
	10585	ptr = ret;
	10586	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	10587	#ifdef RE_TRACK_PATTERN_OFFSETS
	10588	if (RExC_offsets) { /* MJD */
	10589	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	10590	"reganode",
	10591	__LINE__,
	10592	PL_reg_name[op],
	10593	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	10594	"Overwriting end of array!\n" : "OK",
	10595	(UV)(RExC_emit - RExC_emit_start),
	10596	(UV)(RExC_parse - RExC_start),
	10597	(UV)RExC_offsets[0]));
	10598	Set_Cur_Node_Offset;
	10599	}
	10600	#endif
	10601	RExC_emit = ptr;
	10602	return(ret);
	10603	}
	10604
	10605	/*
	10606	- reguni - emit (if appropriate) a Unicode character
	10607	*/
	10608	STATIC STRLEN
	10609	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	10610	{
	10611	dVAR;
	10612
	10613	PERL_ARGS_ASSERT_REGUNI;
	10614
	10615	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	10616	}
	10617
	10618	/*
	10619	- reginsert - insert an operator in front of already-emitted operand
	10620	*
	10621	* Means relocating the operand.
	10622	*/
	10623	STATIC void
	10624	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	10625	{
	10626	dVAR;
	10627	register regnode *src;
	10628	register regnode *dst;
	10629	register regnode *place;
	10630	const int offset = regarglen[(U8)op];
	10631	const int size = NODE_STEP_REGNODE + offset;
	10632	GET_RE_DEBUG_FLAGS_DECL;
	10633
	10634	PERL_ARGS_ASSERT_REGINSERT;
	10635	PERL_UNUSED_ARG(depth);
	10636	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	10637	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	10638	if (SIZE_ONLY) {
	10639	RExC_size += size;
	10640	return;
	10641	}
	10642
	10643	src = RExC_emit;
	10644	RExC_emit += size;
	10645	dst = RExC_emit;
	10646	if (RExC_open_parens) {
	10647	int paren;
	10648	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	10649	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	10650	if ( RExC_open_parens[paren] >= opnd ) {
	10651	/DEBUG_PARSE_FMT("open"," - %d",size);/
	10652	RExC_open_parens[paren] += size;
	10653	} else {
	10654	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	10655	}
	10656	if ( RExC_close_parens[paren] >= opnd ) {
	10657	/DEBUG_PARSE_FMT("close"," - %d",size);/
	10658	RExC_close_parens[paren] += size;
	10659	} else {
	10660	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	10661	}
	10662	}
	10663	}
	10664
	10665	while (src > opnd) {
	10666	StructCopy(--src, --dst, regnode);
	10667	#ifdef RE_TRACK_PATTERN_OFFSETS
	10668	if (RExC_offsets) { /* MJD 20010112 */
	10669	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	10670	"reg_insert",
	10671	__LINE__,
	10672	PL_reg_name[op],
	10673	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	10674	? "Overwriting end of array!\n" : "OK",
	10675	(UV)(src - RExC_emit_start),
	10676	(UV)(dst - RExC_emit_start),
	10677	(UV)RExC_offsets[0]));
	10678	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	10679	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	10680	}
	10681	#endif
	10682	}
	10683
	10684
	10685	place = opnd; /* Op node, where operand used to be. */
	10686	#ifdef RE_TRACK_PATTERN_OFFSETS
	10687	if (RExC_offsets) { /* MJD */
	10688	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	10689	"reginsert",
	10690	__LINE__,
	10691	PL_reg_name[op],
	10692	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	10693	? "Overwriting end of array!\n" : "OK",
	10694	(UV)(place - RExC_emit_start),
	10695	(UV)(RExC_parse - RExC_start),
	10696	(UV)RExC_offsets[0]));
	10697	Set_Node_Offset(place, RExC_parse);
	10698	Set_Node_Length(place, 1);
	10699	}
	10700	#endif
	10701	src = NEXTOPER(place);
	10702	FILL_ADVANCE_NODE(place, op);
	10703	Zero(src, offset, regnode);
	10704	}
	10705
	10706	/*
	10707	- regtail - set the next-pointer at the end of a node chain of p to val.
	10708	- SEE ALSO: regtail_study
	10709	*/
	10710	/* TODO: All three parms should be const */
	10711	STATIC void
	10712	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	10713	{
	10714	dVAR;
	10715	register regnode *scan;
	10716	GET_RE_DEBUG_FLAGS_DECL;
	10717
	10718	PERL_ARGS_ASSERT_REGTAIL;
	10719	#ifndef DEBUGGING
	10720	PERL_UNUSED_ARG(depth);
	10721	#endif
	10722
	10723	if (SIZE_ONLY)
	10724	return;
	10725
	10726	/* Find last node. */
	10727	scan = p;
	10728	for (;;) {
	10729	regnode * const temp = regnext(scan);
	10730	DEBUG_PARSE_r({
	10731	SV * const mysv=sv_newmortal();
	10732	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	10733	regprop(RExC_rx, mysv, scan);
	10734	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	10735	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	10736	(temp == NULL ? "->" : ""),
	10737	(temp == NULL ? PL_reg_name[OP(val)] : "")
	10738	);
	10739	});
	10740	if (temp == NULL)
	10741	break;
	10742	scan = temp;
	10743	}
	10744
	10745	if (reg_off_by_arg[OP(scan)]) {
	10746	ARG_SET(scan, val - scan);
	10747	}
	10748	else {
	10749	NEXT_OFF(scan) = val - scan;
	10750	}
	10751	}
	10752
	#ifdef DEBUGGING
	/*
	 - regtail_study - set the next-pointer at the end of a node chain of p to val.
	 - Look for optimizable sequences at the same time.
	 - currently only looks for EXACT chains.

	   This is experimental code. The idea is to use this routine to perform
	   in place optimizations on branches and groups as they are constructed,
	   with the long term intention of removing optimization from study_chunk so
	   that it is purely analytical.

	   Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	   to control which is which.

	   Return value: PSEUDO when SIZE_ONLY is true or when no EXACT-type node
	   was seen on the chain; the single opcode (EXACT, EXACTF, EXACTFA,
	   EXACTFU or EXACTFL) if every such node on the chain has that same
	   opcode and all other nodes are NOTHING; 0 otherwise.  Under
	   EXPERIMENTAL_INPLACESCAN it returns EXACT as soon as join_exact()
	   succeeds on a node.
	*/
	/* TODO: All four parms should be const */

	STATIC U8
	S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
	{
	    dVAR;
	    register regnode *scan;
	    U8 exact = PSEUDO;
	#ifdef EXPERIMENTAL_INPLACESCAN
	    I32 min = 0;
	#endif
	    GET_RE_DEBUG_FLAGS_DECL;

	    PERL_ARGS_ASSERT_REGTAIL_STUDY;


	    if (SIZE_ONLY)
	        return exact;

	    /* Find last node. */

	    scan = p;
	    for (;;) {
	        regnode * const temp = regnext(scan);
	#ifdef EXPERIMENTAL_INPLACESCAN
	        if (PL_regkind[OP(scan)] == EXACT)
	            if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
	                return EXACT;
	#endif
	        if ( exact ) {
	            switch (OP(scan)) {
	                case EXACT:
	                case EXACTF:
	                case EXACTFA:
	                case EXACTFU:
	                case EXACTFL:
	                        if( exact == PSEUDO )
	                            exact= OP(scan);
	                        else if ( exact != OP(scan) )
	                            exact= 0;
	                        /* FALLTHROUGH */
	                case NOTHING:
	                    break;
	                default:
	                    exact= 0;
	            }
	        }
	        DEBUG_PARSE_r({
	            SV * const mysv=sv_newmortal();
	            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	            regprop(RExC_rx, mysv, scan);
	            PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	                SvPV_nolen_const(mysv),
	                REG_NODE_NUM(scan),
	                PL_reg_name[exact]);
	        });
	        if (temp == NULL)
	            break;
	        scan = temp;
	    }
	    DEBUG_PARSE_r({
	        SV * const mysv_val=sv_newmortal();
	        DEBUG_PARSE_MSG("");
	        regprop(RExC_rx, mysv_val, val);
	        PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	                      SvPV_nolen_const(mysv_val),
	                      (IV)REG_NODE_NUM(val),
	                      (IV)(val - scan)
	        );
	    });
	    /* Same linking step as regtail(). */
	    if (reg_off_by_arg[OP(scan)]) {
	        ARG_SET(scan, val - scan);
	    }
	    else {
	        NEXT_OFF(scan) = val - scan;
	    }

	    return exact;
	}
	#endif
	10847
	10848	/*
	10849	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	10850	*/
	#ifdef DEBUGGING
	/*
	 * S_regdump_extflags - print the names of the bits set in `flags` to
	 * Perl_debug_log.
	 *
	 * Each set bit that is not part of RXf_PMf_CHARSET is printed using
	 * PL_reg_extflags_name[bit]; the character set (when it is not
	 * REGEX_DEPENDS_CHARSET) is printed afterwards as a single word.
	 * If `lead` is non-NULL it is printed once before the first item, and the
	 * line is finished with "\n" (or "<lead>[none-set]\n" when nothing was
	 * printed).
	 */
	static void
	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	{
	    int bit;
	    int set=0;
	    regex_charset cs;

	    for (bit=0; bit<32; bit++) {
	        /* Build the mask as unsigned: shifting a signed 1 left by 31 is
	         * undefined behaviour. */
	        const U32 mask = (U32)1 << bit;

	        if (flags & mask) {
	            if (mask & RXf_PMf_CHARSET) {	/* Output separately, below */
	                continue;
	            }
	            if (!set++ && lead)
	                PerlIO_printf(Perl_debug_log, "%s",lead);
	            PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	        }
	    }
	    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	        if (!set++ && lead) {
	            PerlIO_printf(Perl_debug_log, "%s",lead);
	        }
	        switch (cs) {
	            case REGEX_UNICODE_CHARSET:
	                PerlIO_printf(Perl_debug_log, "UNICODE");
	                break;
	            case REGEX_LOCALE_CHARSET:
	                PerlIO_printf(Perl_debug_log, "LOCALE");
	                break;
	            case REGEX_ASCII_RESTRICTED_CHARSET:
	                PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	                break;
	            case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	                PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	                break;
	            default:
	                PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	                break;
	        }
	    }
	    if (lead)  {
	        if (set)
	            PerlIO_printf(Perl_debug_log, "\n");
	        else
	            PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	    }
	}
	#endif
	10899
	10900	void
	10901	Perl_regdump(pTHX_ const regexp *r)
	10902	{
	10903	#ifdef DEBUGGING
	10904	dVAR;
	10905	SV * const sv = sv_newmortal();
	10906	SV *dsv= sv_newmortal();
	10907	RXi_GET_DECL(r,ri);
	10908	GET_RE_DEBUG_FLAGS_DECL;
	10909
	10910	PERL_ARGS_ASSERT_REGDUMP;
	10911
	10912	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	10913
	10914	/* Header fields of interest. */
	10915	if (r->anchored_substr) {
	10916	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	10917	RE_SV_DUMPLEN(r->anchored_substr), 30);
	10918	PerlIO_printf(Perl_debug_log,
	10919	"anchored %s%s at %"IVdf" ",
	10920	s, RE_SV_TAIL(r->anchored_substr),
	10921	(IV)r->anchored_offset);
	10922	} else if (r->anchored_utf8) {
	10923	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	10924	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	10925	PerlIO_printf(Perl_debug_log,
	10926	"anchored utf8 %s%s at %"IVdf" ",
	10927	s, RE_SV_TAIL(r->anchored_utf8),
	10928	(IV)r->anchored_offset);
	10929	}
	10930	if (r->float_substr) {
	10931	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	10932	RE_SV_DUMPLEN(r->float_substr), 30);
	10933	PerlIO_printf(Perl_debug_log,
	10934	"floating %s%s at %"IVdf"..%"UVuf" ",
	10935	s, RE_SV_TAIL(r->float_substr),
	10936	(IV)r->float_min_offset, (UV)r->float_max_offset);
	10937	} else if (r->float_utf8) {
	10938	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	10939	RE_SV_DUMPLEN(r->float_utf8), 30);
	10940	PerlIO_printf(Perl_debug_log,
	10941	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	10942	s, RE_SV_TAIL(r->float_utf8),
	10943	(IV)r->float_min_offset, (UV)r->float_max_offset);
	10944	}
	10945	if (r->check_substr \|\| r->check_utf8)
	10946	PerlIO_printf(Perl_debug_log,
	10947	(const char *)
	10948	(r->check_substr == r->float_substr
	10949	&& r->check_utf8 == r->float_utf8
	10950	? "(checking floating" : "(checking anchored"));
	10951	if (r->extflags & RXf_NOSCAN)
	10952	PerlIO_printf(Perl_debug_log, " noscan");
	10953	if (r->extflags & RXf_CHECK_ALL)
	10954	PerlIO_printf(Perl_debug_log, " isall");
	10955	if (r->check_substr \|\| r->check_utf8)
	10956	PerlIO_printf(Perl_debug_log, ") ");
	10957
	10958	if (ri->regstclass) {
	10959	regprop(r, sv, ri->regstclass);
	10960	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	10961	}
	10962	if (r->extflags & RXf_ANCH) {
	10963	PerlIO_printf(Perl_debug_log, "anchored");
	10964	if (r->extflags & RXf_ANCH_BOL)
	10965	PerlIO_printf(Perl_debug_log, "(BOL)");
	10966	if (r->extflags & RXf_ANCH_MBOL)
	10967	PerlIO_printf(Perl_debug_log, "(MBOL)");
	10968	if (r->extflags & RXf_ANCH_SBOL)
	10969	PerlIO_printf(Perl_debug_log, "(SBOL)");
	10970	if (r->extflags & RXf_ANCH_GPOS)
	10971	PerlIO_printf(Perl_debug_log, "(GPOS)");
	10972	PerlIO_putc(Perl_debug_log, ' ');
	10973	}
	10974	if (r->extflags & RXf_GPOS_SEEN)
	10975	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	10976	if (r->intflags & PREGf_SKIP)
	10977	PerlIO_printf(Perl_debug_log, "plus ");
	10978	if (r->intflags & PREGf_IMPLICIT)
	10979	PerlIO_printf(Perl_debug_log, "implicit ");
	10980	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	10981	if (r->extflags & RXf_EVAL_SEEN)
	10982	PerlIO_printf(Perl_debug_log, "with eval ");
	10983	PerlIO_printf(Perl_debug_log, "\n");
	10984	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	10985	#else
	10986	PERL_ARGS_ASSERT_REGDUMP;
	10987	PERL_UNUSED_CONTEXT;
	10988	PERL_UNUSED_ARG(r);
	10989	#endif /* DEBUGGING */
	10990	}
	10991
	10992	/*
	10993	- regprop - printable representation of opcode
	10994	*/
	/* If do_sep is true, append "<PL_colors[1]>][<PL_colors[0]>" to sv
	 * (followed by "^" when flags has ANYOF_INVERT set) and reset do_sep
	 * to 0.  do_sep must be a modifiable lvalue. */
	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	STMT_START { \
	    if (do_sep) {                           \
	        Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	        if ((flags) & ANYOF_INVERT)         \
	            /*make sure the invert info is in each */ \
	            sv_catpvs(sv, "^");             \
	        do_sep = 0;                         \
	    }                                       \
	} STMT_END
	11005
	11006	void
	11007	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	11008	{
	11009	#ifdef DEBUGGING
	11010	dVAR;
	11011	register int k;
	11012	RXi_GET_DECL(prog,progi);
	11013	GET_RE_DEBUG_FLAGS_DECL;
	11014
	11015	PERL_ARGS_ASSERT_REGPROP;
	11016
	11017	sv_setpvs(sv, "");
	11018
	11019	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	11020	/* It would be nice to FAIL() here, but this may be called from
	11021	regexec.c, and it would be hard to supply pRExC_state. */
	11022	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	11023	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	11024
	11025	k = PL_regkind[OP(o)];
	11026
	11027	if (k == EXACT) {
	11028	sv_catpvs(sv, " ");
	11029	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	11030	* is a crude hack but it may be the best for now since
	11031	* we have no flag "this EXACTish node was UTF-8"
	11032	* --jhi */
	11033	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	11034	PERL_PV_ESCAPE_UNI_DETECT \|
	11035	PERL_PV_ESCAPE_NONASCII \|
	11036	PERL_PV_PRETTY_ELLIPSES \|
	11037	PERL_PV_PRETTY_LTGT \|
	11038	PERL_PV_PRETTY_NOCLEAR
	11039	);
	11040	} else if (k == TRIE) {
	11041	/* print the details of the trie in dumpuntil instead, as
	11042	* progi->data isn't available here */
	11043	const char op = OP(o);
	11044	const U32 n = ARG(o);
	11045	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	11046	(reg_ac_data *)progi->data->data[n] :
	11047	NULL;
	11048	const reg_trie_data * const trie
	11049	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	11050
	11051	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	11052	DEBUG_TRIE_COMPILE_r(
	11053	Perl_sv_catpvf(aTHX_ sv,
	11054	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	11055	(UV)trie->startstate,
	11056	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	11057	(UV)trie->wordcount,
	11058	(UV)trie->minlen,
	11059	(UV)trie->maxlen,
	11060	(UV)TRIE_CHARCOUNT(trie),
	11061	(UV)trie->uniquecharcount
	11062	)
	11063	);
	11064	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	11065	int i;
	11066	int rangestart = -1;
	11067	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	11068	sv_catpvs(sv, "[");
	11069	for (i = 0; i <= 256; i++) {
	11070	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	11071	if (rangestart == -1)
	11072	rangestart = i;
	11073	} else if (rangestart != -1) {
	11074	if (i <= rangestart + 3)
	11075	for (; rangestart < i; rangestart++)
	11076	put_byte(sv, rangestart);
	11077	else {
	11078	put_byte(sv, rangestart);
	11079	sv_catpvs(sv, "-");
	11080	put_byte(sv, i - 1);
	11081	}
	11082	rangestart = -1;
	11083	}
	11084	}
	11085	sv_catpvs(sv, "]");
	11086	}
	11087
	11088	} else if (k == CURLY) {
	11089	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	11090	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	11091	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	11092	}
	11093	else if (k == WHILEM && o->flags) /* Ordinal/of */
	11094	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	11095	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	11096	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	11097	if ( RXp_PAREN_NAMES(prog) ) {
	11098	if ( k != REF \|\| (OP(o) < NREF)) {
	11099	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	11100	SV **name= av_fetch(list, ARG(o), 0 );
	11101	if (name)
	11102	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11103	}
	11104	else {
	11105	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	11106	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	11107	I32 nums=(I32)SvPVX(sv_dat);
	11108	SV **name= av_fetch(list, nums[0], 0 );
	11109	I32 n;
	11110	if (name) {
	11111	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	11112	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	11113	(n ? "," : ""), (IV)nums[n]);
	11114	}
	11115	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	11116	}
	11117	}
	11118	}
	11119	} else if (k == GOSUB)
	11120	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	11121	else if (k == VERB) {
	11122	if (!o->flags)
	11123	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	11124	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	11125	} else if (k == LOGICAL)
	11126	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	11127	else if (k == FOLDCHAR)
	11128	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
	11129	else if (k == ANYOF) {
	11130	int i, rangestart = -1;
	11131	const U8 flags = ANYOF_FLAGS(o);
	11132	int do_sep = 0;
	11133
	11134	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	11135	static const char * const anyofs[] = {
	11136	"\\w",
	11137	"\\W",
	11138	"\\s",
	11139	"\\S",
	11140	"\\d",
	11141	"\\D",
	11142	"[:alnum:]",
	11143	"[:^alnum:]",
	11144	"[:alpha:]",
	11145	"[:^alpha:]",
	11146	"[:ascii:]",
	11147	"[:^ascii:]",
	11148	"[:cntrl:]",
	11149	"[:^cntrl:]",
	11150	"[:graph:]",
	11151	"[:^graph:]",
	11152	"[:lower:]",
	11153	"[:^lower:]",
	11154	"[:print:]",
	11155	"[:^print:]",
	11156	"[:punct:]",
	11157	"[:^punct:]",
	11158	"[:upper:]",
	11159	"[:^upper:]",
	11160	"[:xdigit:]",
	11161	"[:^xdigit:]",
	11162	"[:space:]",
	11163	"[:^space:]",
	11164	"[:blank:]",
	11165	"[:^blank:]"
	11166	};
	11167
	11168	if (flags & ANYOF_LOCALE)
	11169	sv_catpvs(sv, "{loc}");
	11170	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	11171	sv_catpvs(sv, "{i}");
	11172	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	11173	if (flags & ANYOF_INVERT)
	11174	sv_catpvs(sv, "^");
	11175
	11176	/* output what the standard cp 0-255 bitmap matches */
	11177	for (i = 0; i <= 256; i++) {
	11178	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	11179	if (rangestart == -1)
	11180	rangestart = i;
	11181	} else if (rangestart != -1) {
	11182	if (i <= rangestart + 3)
	11183	for (; rangestart < i; rangestart++)
	11184	put_byte(sv, rangestart);
	11185	else {
	11186	put_byte(sv, rangestart);
	11187	sv_catpvs(sv, "-");
	11188	put_byte(sv, i - 1);
	11189	}
	11190	do_sep = 1;
	11191	rangestart = -1;
	11192	}
	11193	}
	11194
	11195	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11196	/* output any special charclass tests (used entirely under use locale) */
	11197	if (ANYOF_CLASS_TEST_ANY_SET(o))
	11198	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	11199	if (ANYOF_CLASS_TEST(o,i)) {
	11200	sv_catpv(sv, anyofs[i]);
	11201	do_sep = 1;
	11202	}
	11203
	11204	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	11205
	11206	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	11207	sv_catpvs(sv, "{non-utf8-latin1-all}");
	11208	}
	11209
	11210	/* output information about the unicode matching */
	11211	if (flags & ANYOF_UNICODE_ALL)
	11212	sv_catpvs(sv, "{unicode_all}");
	11213	else if (ANYOF_NONBITMAP(o))
	11214	sv_catpvs(sv, "{unicode}");
	11215	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	11216	sv_catpvs(sv, "{outside bitmap}");
	11217
	11218	if (ANYOF_NONBITMAP(o)) {
	11219	SV *lv;
	11220	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	11221
	11222	if (lv) {
	11223	if (sw) {
	11224	U8 s[UTF8_MAXBYTES_CASE+1];
	11225
	11226	for (i = 0; i <= 256; i++) { /* just the first 256 */
	11227	uvchr_to_utf8(s, i);
	11228
	11229	if (i < 256 && swash_fetch(sw, s, TRUE)) {
	11230	if (rangestart == -1)
	11231	rangestart = i;
	11232	} else if (rangestart != -1) {
	11233	if (i <= rangestart + 3)
	11234	for (; rangestart < i; rangestart++) {
	11235	const U8 * const e = uvchr_to_utf8(s,rangestart);
	11236	U8 *p;
	11237	for(p = s; p < e; p++)
	11238	put_byte(sv, *p);
	11239	}
	11240	else {
	11241	const U8 *e = uvchr_to_utf8(s,rangestart);
	11242	U8 *p;
	11243	for (p = s; p < e; p++)
	11244	put_byte(sv, *p);
	11245	sv_catpvs(sv, "-");
	11246	e = uvchr_to_utf8(s, i-1);
	11247	for (p = s; p < e; p++)
	11248	put_byte(sv, *p);
	11249	}
	11250	rangestart = -1;
	11251	}
	11252	}
	11253
	11254	sv_catpvs(sv, "..."); /* et cetera */
	11255	}
	11256
	11257	{
	11258	char *s = savesvpv(lv);
	11259	char * const origs = s;
	11260
	11261	while (s && s != '\n')
	11262	s++;
	11263
	11264	if (*s == '\n') {
	11265	const char * const t = ++s;
	11266
	11267	while (*s) {
	11268	if (*s == '\n')
	11269	*s = ' ';
	11270	s++;
	11271	}
	11272	if (s[-1] == ' ')
	11273	s[-1] = 0;
	11274
	11275	sv_catpv(sv, t);
	11276	}
	11277
	11278	Safefree(origs);
	11279	}
	11280	}
	11281	}
	11282
	11283	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	11284	}
	11285	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	11286	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	11287	#else
	11288	PERL_UNUSED_CONTEXT;
	11289	PERL_UNUSED_ARG(sv);
	11290	PERL_UNUSED_ARG(o);
	11291	PERL_UNUSED_ARG(prog);
	11292	#endif /* DEBUGGING */
	11293	}
	11294
	11295	SV *
	11296	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	11297	{ /* Assume that RE_INTUIT is set */
	11298	dVAR;
	11299	struct regexp const prog = (struct regexp )SvANY(r);
	11300	GET_RE_DEBUG_FLAGS_DECL;
	11301
	11302	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	11303	PERL_UNUSED_CONTEXT;
	11304
	11305	DEBUG_COMPILE_r(
	11306	{
	11307	const char * const s = SvPV_nolen_const(prog->check_substr
	11308	? prog->check_substr : prog->check_utf8);
	11309
	11310	if (!PL_colorset) reginitcolors();
	11311	PerlIO_printf(Perl_debug_log,
	11312	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	11313	PL_colors[4],
	11314	prog->check_substr ? "" : "utf8 ",
	11315	PL_colors[5],PL_colors[0],
	11316	s,
	11317	PL_colors[1],
	11318	(strlen(s) > 60 ? "..." : ""));
	11319	} );
	11320
	11321	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	11322	}
	11323
	11324	/*
	11325	pregfree()
	11326
	11327	handles refcounting and freeing the perl core regexp structure. When
	11328	it is necessary to actually free the structure the first thing it
	11329	does is call the 'free' method of the regexp_engine associated to
	11330	the regexp, allowing the handling of the void *pprivate; member
	11331	first. (This routine is not overridable by extensions, which is why
	11332	the extensions free is called first.)
	11333
	11334	See regdupe and regdupe_internal if you change anything here.
	11335	*/
	11336	#ifndef PERL_IN_XSUB_RE
	11337	void
	11338	Perl_pregfree(pTHX_ REGEXP *r)
	11339	{
	11340	SvREFCNT_dec(r);
	11341	}
	11342
	11343	void
	11344	Perl_pregfree2(pTHX_ REGEXP *rx)
	11345	{
	11346	dVAR;
	11347	struct regexp const r = (struct regexp )SvANY(rx);
	11348	GET_RE_DEBUG_FLAGS_DECL;
	11349
	11350	PERL_ARGS_ASSERT_PREGFREE2;
	11351
	11352	if (r->mother_re) {
	11353	ReREFCNT_dec(r->mother_re);
	11354	} else {
	11355	CALLREGFREE_PVT(rx); /* free the private data */
	11356	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	11357	}
	11358	if (r->substrs) {
	11359	SvREFCNT_dec(r->anchored_substr);
	11360	SvREFCNT_dec(r->anchored_utf8);
	11361	SvREFCNT_dec(r->float_substr);
	11362	SvREFCNT_dec(r->float_utf8);
	11363	Safefree(r->substrs);
	11364	}
	11365	RX_MATCH_COPY_FREE(rx);
	11366	#ifdef PERL_OLD_COPY_ON_WRITE
	11367	SvREFCNT_dec(r->saved_copy);
	11368	#endif
	11369	Safefree(r->offs);
	11370	}
	11371
	11372	/* reg_temp_copy()
	11373
	11374	This is a hacky workaround to the structural issue of match results
	11375	being stored in the regexp structure which is in turn stored in
	11376	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	11377	could be PL_curpm in multiple contexts, and could require multiple
	11378	result sets being associated with the pattern simultaneously, such
	11379	as when doing a recursive match with (??{$qr})
	11380
	11381	The solution is to make a lightweight copy of the regexp structure
	11382	when a qr// is returned from the code executed by (??{$qr}) this
	11383	lightweight copy doesn't actually own any of its data except for
	11384	the starp/end and the actual regexp structure itself.
	11385
	11386	*/
	11387
	11388
	11389	REGEXP *
	11390	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	11391	{
	11392	struct regexp *ret;
	11393	struct regexp const r = (struct regexp )SvANY(rx);
	11394	register const I32 npar = r->nparens+1;
	11395
	11396	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	11397
	11398	if (!ret_x)
	11399	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	11400	ret = (struct regexp *)SvANY(ret_x);
	11401
	11402	(void)ReREFCNT_inc(rx);
	11403	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	11404	by pointing directly at the buffer, but flagging that the allocated
	11405	space in the copy is zero. As we've just done a struct copy, it's now
	11406	a case of zero-ing that, rather than copying the current length. */
	11407	SvPV_set(ret_x, RX_WRAPPED(rx));
	11408	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	11409	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	11410	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	11411	SvLEN_set(ret_x, 0);
	11412	SvSTASH_set(ret_x, NULL);
	11413	SvMAGIC_set(ret_x, NULL);
	11414	Newx(ret->offs, npar, regexp_paren_pair);
	11415	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11416	if (r->substrs) {
	11417	Newx(ret->substrs, 1, struct reg_substr_data);
	11418	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	11419
	11420	SvREFCNT_inc_void(ret->anchored_substr);
	11421	SvREFCNT_inc_void(ret->anchored_utf8);
	11422	SvREFCNT_inc_void(ret->float_substr);
	11423	SvREFCNT_inc_void(ret->float_utf8);
	11424
	11425	/* check_substr and check_utf8, if non-NULL, point to either their
	11426	anchored or float namesakes, and don't hold a second reference. */
	11427	}
	11428	RX_MATCH_COPIED_off(ret_x);
	11429	#ifdef PERL_OLD_COPY_ON_WRITE
	11430	ret->saved_copy = NULL;
	11431	#endif
	11432	ret->mother_re = rx;
	11433
	11434	return ret_x;
	11435	}
	11436	#endif
	11437
	11438	/* regfree_internal()
	11439
	11440	Free the private data in a regexp. This is overloadable by
	11441	extensions. Perl takes care of the regexp structure in pregfree(),
	11442	this covers the *pprivate pointer which technically perl doesn't
	11443	know about, however of course we have to handle the
	11444	regexp_internal structure when no extension is in use.
	11445
	11446	Note this is called before freeing anything in the regexp
	11447	structure.
	11448	*/
	11449
	11450	void
	11451	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	11452	{
	11453	dVAR;
	11454	struct regexp const r = (struct regexp )SvANY(rx);
	11455	RXi_GET_DECL(r,ri);
	11456	GET_RE_DEBUG_FLAGS_DECL;
	11457
	11458	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	11459
	11460	DEBUG_COMPILE_r({
	11461	if (!PL_colorset)
	11462	reginitcolors();
	11463	{
	11464	SV *dsv= sv_newmortal();
	11465	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	11466	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	11467	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	11468	PL_colors[4],PL_colors[5],s);
	11469	}
	11470	});
	11471	#ifdef RE_TRACK_PATTERN_OFFSETS
	11472	if (ri->u.offsets)
	11473	Safefree(ri->u.offsets); /* 20010421 MJD */
	11474	#endif
	11475	if (ri->data) {
	11476	int n = ri->data->count;
	11477	PAD* new_comppad = NULL;
	11478	PAD* old_comppad;
	11479	PADOFFSET refcnt;
	11480
	11481	while (--n >= 0) {
	11482	/* If you add a ->what type here, update the comment in regcomp.h */
	11483	switch (ri->data->what[n]) {
	11484	case 'a':
	11485	case 's':
	11486	case 'S':
	11487	case 'u':
	11488	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	11489	break;
	11490	case 'f':
	11491	Safefree(ri->data->data[n]);
	11492	break;
	11493	case 'p':
	11494	new_comppad = MUTABLE_AV(ri->data->data[n]);
	11495	break;
	11496	case 'o':
	11497	if (new_comppad == NULL)
	11498	Perl_croak(aTHX_ "panic: pregfree comppad");
	11499	PAD_SAVE_LOCAL(old_comppad,
	11500	/* Watch out for global destruction's random ordering. */
	11501	(SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
	11502	);
	11503	OP_REFCNT_LOCK;
	11504	refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
	11505	OP_REFCNT_UNLOCK;
	11506	if (!refcnt)
	11507	op_free((OP_4tree*)ri->data->data[n]);
	11508
	11509	PAD_RESTORE_LOCAL(old_comppad);
	11510	SvREFCNT_dec(MUTABLE_SV(new_comppad));
	11511	new_comppad = NULL;
	11512	break;
	11513	case 'n':
	11514	break;
	11515	case 'T':
	11516	{ /* Aho Corasick add-on structure for a trie node.
	11517	Used in stclass optimization only */
	11518	U32 refcount;
	11519	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	11520	OP_REFCNT_LOCK;
	11521	refcount = --aho->refcount;
	11522	OP_REFCNT_UNLOCK;
	11523	if ( !refcount ) {
	11524	PerlMemShared_free(aho->states);
	11525	PerlMemShared_free(aho->fail);
	11526	/* do this last!!!! */
	11527	PerlMemShared_free(ri->data->data[n]);
	11528	PerlMemShared_free(ri->regstclass);
	11529	}
	11530	}
	11531	break;
	11532	case 't':
	11533	{
	11534	/* trie structure. */
	11535	U32 refcount;
	11536	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	11537	OP_REFCNT_LOCK;
	11538	refcount = --trie->refcount;
	11539	OP_REFCNT_UNLOCK;
	11540	if ( !refcount ) {
	11541	PerlMemShared_free(trie->charmap);
	11542	PerlMemShared_free(trie->states);
	11543	PerlMemShared_free(trie->trans);
	11544	if (trie->bitmap)
	11545	PerlMemShared_free(trie->bitmap);
	11546	if (trie->jump)
	11547	PerlMemShared_free(trie->jump);
	11548	PerlMemShared_free(trie->wordinfo);
	11549	/* do this last!!!! */
	11550	PerlMemShared_free(ri->data->data[n]);
	11551	}
	11552	}
	11553	break;
	11554	default:
	11555	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	11556	}
	11557	}
	11558	Safefree(ri->data->what);
	11559	Safefree(ri->data);
	11560	}
	11561
	11562	Safefree(ri);
	11563	}
	11564
	/* Duplicate an AV / HV through sv_dup_inc(), converting the result back to
	 * the container type with MUTABLE_AV / MUTABLE_HV. */
	#define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
	#define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
	/* savepvn() that yields NULL for a NULL pointer.  Note: evaluates p twice. */
	#define SAVEPVN(p,n)	((p) ? savepvn((p),(n)) : NULL)
	11568
	11569	/*
	11570	re_dup - duplicate a regexp.
	11571
	11572	This routine is expected to clone a given regexp structure. It is only
	11573	compiled under USE_ITHREADS.
	11574
	11575	After all of the core data stored in struct regexp is duplicated
	11576	the regexp_engine.dupe method is used to copy any private data
	11577	stored in the *pprivate pointer. This allows extensions to handle
	11578	any duplication it needs to do.
	11579
	11580	See pregfree() and regfree_internal() if you change anything here.
	11581	*/
	11582	#if defined(USE_ITHREADS)
	11583	#ifndef PERL_IN_XSUB_RE
	11584	void
	11585	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	11586	{
	11587	dVAR;
	11588	I32 npar;
	11589	const struct regexp r = (const struct regexp )SvANY(sstr);
	11590	struct regexp ret = (struct regexp )SvANY(dstr);
	11591
	11592	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	11593
	11594	npar = r->nparens+1;
	11595	Newx(ret->offs, npar, regexp_paren_pair);
	11596	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	11597	if(ret->swap) {
	11598	/* no need to copy these */
	11599	Newx(ret->swap, npar, regexp_paren_pair);
	11600	}
	11601
	11602	if (ret->substrs) {
	11603	/* Do it this way to avoid reading from *r after the StructCopy().
	11604	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	11605	cache, it doesn't matter. */
	11606	const bool anchored = r->check_substr
	11607	? r->check_substr == r->anchored_substr
	11608	: r->check_utf8 == r->anchored_utf8;
	11609	Newx(ret->substrs, 1, struct reg_substr_data);
	11610	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	11611
	11612	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	11613	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	11614	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	11615	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	11616
	11617	/* check_substr and check_utf8, if non-NULL, point to either their
	11618	anchored or float namesakes, and don't hold a second reference. */
	11619
	11620	if (ret->check_substr) {
	11621	if (anchored) {
	11622	assert(r->check_utf8 == r->anchored_utf8);
	11623	ret->check_substr = ret->anchored_substr;
	11624	ret->check_utf8 = ret->anchored_utf8;
	11625	} else {
	11626	assert(r->check_substr == r->float_substr);
	11627	assert(r->check_utf8 == r->float_utf8);
	11628	ret->check_substr = ret->float_substr;
	11629	ret->check_utf8 = ret->float_utf8;
	11630	}
	11631	} else if (ret->check_utf8) {
	11632	if (anchored) {
	11633	ret->check_utf8 = ret->anchored_utf8;
	11634	} else {
	11635	ret->check_utf8 = ret->float_utf8;
	11636	}
	11637	}
	11638	}
	11639
	11640	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	11641
	11642	if (ret->pprivate)
	11643	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	11644
	11645	if (RX_MATCH_COPIED(dstr))
	11646	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	11647	else
	11648	ret->subbeg = NULL;
	11649	#ifdef PERL_OLD_COPY_ON_WRITE
	11650	ret->saved_copy = NULL;
	11651	#endif
	11652
	11653	if (ret->mother_re) {
	11654	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	11655	/* Our storage points directly to our mother regexp, but that's
	11656	1: a buffer in a different thread
	11657	2: something we no longer hold a reference on
	11658	so we need to copy it locally. */
	11659	/* Note we need to sue SvCUR() on our mother_re, because it, in
	11660	turn, may well be pointing to its own mother_re. */
	11661	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	11662	SvCUR(ret->mother_re)+1));
	11663	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	11664	}
	11665	ret->mother_re = NULL;
	11666	}
	11667	ret->gofs = 0;
	11668	}
	11669	#endif /* PERL_IN_XSUB_RE */
	11670
	11671	/*
	11672	regdupe_internal()
	11673
	11674	This is the internal complement to regdupe() which is used to copy
	11675	the structure pointed to by the *pprivate pointer in the regexp.
	11676	This is the core version of the extension overridable cloning hook.
	11677	The regexp structure being duplicated will be copied by perl prior
	11678	to this and will be provided as the regexp *r argument, however
	11679	with the /old/ structure's pprivate pointer value. Thus this routine
	11680	may override any copying normally done by perl.
	11681
	11682	It returns a pointer to the new regexp_internal structure.
	11683	*/
	11684
	11685	void *
	11686	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	11687	{
	11688	dVAR;
	11689	struct regexp const r = (struct regexp )SvANY(rx);
	11690	regexp_internal *reti;
	11691	int len;
	11692	RXi_GET_DECL(r,ri);
	11693
	11694	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	11695
	11696	len = ProgLen(ri);
	11697
	11698	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	11699	Copy(ri->program, reti->program, len+1, regnode);
	11700
	11701
	11702	reti->regstclass = NULL;
	11703
	11704	if (ri->data) {
	11705	struct reg_data *d;
	11706	const int count = ri->data->count;
	11707	int i;
	11708
	11709	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	11710	char, struct reg_data);
	11711	Newx(d->what, count, U8);
	11712
	11713	d->count = count;
	11714	for (i = 0; i < count; i++) {
	11715	d->what[i] = ri->data->what[i];
	11716	switch (d->what[i]) {
	11717	/* legal options are one of: sSfpontTua
	11718	see also regcomp.h and pregfree() */
	11719	case 'a': /* actually an AV, but the dup function is identical. */
	11720	case 's':
	11721	case 'S':
	11722	case 'p': /* actually an AV, but the dup function is identical. */
	11723	case 'u': /* actually an HV, but the dup function is identical. */
	11724	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	11725	break;
	11726	case 'f':
	11727	/* This is cheating. */
	11728	Newx(d->data[i], 1, struct regnode_charclass_class);
	11729	StructCopy(ri->data->data[i], d->data[i],
	11730	struct regnode_charclass_class);
	11731	reti->regstclass = (regnode*)d->data[i];
	11732	break;
	11733	case 'o':
	11734	/* Compiled op trees are readonly and in shared memory,
	11735	and can thus be shared without duplication. */
	11736	OP_REFCNT_LOCK;
	11737	d->data[i] = (void)OpREFCNT_inc((OP)ri->data->data[i]);
	11738	OP_REFCNT_UNLOCK;
	11739	break;
	11740	case 'T':
	11741	/* Trie stclasses are readonly and can thus be shared
	11742	* without duplication. We free the stclass in pregfree
	11743	* when the corresponding reg_ac_data struct is freed.
	11744	*/
	11745	reti->regstclass= ri->regstclass;
	11746	/* Fall through */
	11747	case 't':
	11748	OP_REFCNT_LOCK;
	11749	((reg_trie_data*)ri->data->data[i])->refcount++;
	11750	OP_REFCNT_UNLOCK;
	11751	/* Fall through */
	11752	case 'n':
	11753	d->data[i] = ri->data->data[i];
	11754	break;
	11755	default:
	11756	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	11757	}
	11758	}
	11759
	11760	reti->data = d;
	11761	}
	11762	else
	11763	reti->data = NULL;
	11764
	11765	reti->name_list_idx = ri->name_list_idx;
	11766
	11767	#ifdef RE_TRACK_PATTERN_OFFSETS
	11768	if (ri->u.offsets) {
	11769	Newx(reti->u.offsets, 2*len+1, U32);
	11770	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	11771	}
	11772	#else
	11773	SetProgLen(reti,len);
	11774	#endif
	11775
	11776	return (void*)reti;
	11777	}
	11778
	11779	#endif /* USE_ITHREADS */
	11780
	11781	#ifndef PERL_IN_XSUB_RE
	11782
	11783	/*
	11784	- regnext - dig the "next" pointer out of a node
	11785	*/
	11786	regnode *
	11787	Perl_regnext(pTHX_ register regnode *p)
	11788	{
	11789	dVAR;
	11790	register I32 offset;
	11791
	11792	if (!p)
	11793	return(NULL);
	11794
	11795	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	11796	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	11797	}
	11798
	11799	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	11800	if (offset == 0)
	11801	return(NULL);
	11802
	11803	return(p+offset);
	11804	}
	11805	#endif
	11806
	11807	STATIC void
	11808	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	11809	{
	11810	va_list args;
	11811	STRLEN l1 = strlen(pat1);
	11812	STRLEN l2 = strlen(pat2);
	11813	char buf[512];
	11814	SV *msv;
	11815	const char *message;
	11816
	11817	PERL_ARGS_ASSERT_RE_CROAK2;
	11818
	11819	if (l1 > 510)
	11820	l1 = 510;
	11821	if (l1 + l2 > 510)
	11822	l2 = 510 - l1;
	11823	Copy(pat1, buf, l1 , char);
	11824	Copy(pat2, buf + l1, l2 , char);
	11825	buf[l1 + l2] = '\n';
	11826	buf[l1 + l2 + 1] = '\0';
	11827	#ifdef I_STDARG
	11828	/* ANSI variant takes additional second argument */
	11829	va_start(args, pat2);
	11830	#else
	11831	va_start(args);
	11832	#endif
	11833	msv = vmess(buf, &args);
	11834	va_end(args);
	11835	message = SvPV_const(msv,l1);
	11836	if (l1 > 512)
	11837	l1 = 512;
	11838	Copy(message, buf, l1 , char);
	11839	buf[l1-1] = '\0'; /* Overwrite \n */
	11840	Perl_croak(aTHX_ "%s", buf);
	11841	}
	11842
	11843	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	11844
	11845	#ifndef PERL_IN_XSUB_RE
	11846	void
	11847	Perl_save_re_context(pTHX)
	11848	{
	11849	dVAR;
	11850
	11851	struct re_save_state *state;
	11852
	11853	SAVEVPTR(PL_curcop);
	11854	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	11855
	11856	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	11857	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	11858	SSPUSHUV(SAVEt_RE_STATE);
	11859
	11860	Copy(&PL_reg_state, state, 1, struct re_save_state);
	11861
	11862	PL_reg_start_tmp = 0;
	11863	PL_reg_start_tmpl = 0;
	11864	PL_reg_oldsaved = NULL;
	11865	PL_reg_oldsavedlen = 0;
	11866	PL_reg_maxiter = 0;
	11867	PL_reg_leftiter = 0;
	11868	PL_reg_poscache = NULL;
	11869	PL_reg_poscache_size = 0;
	11870	#ifdef PERL_OLD_COPY_ON_WRITE
	11871	PL_nrs = NULL;
	11872	#endif
	11873
	11874	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	11875	if (PL_curpm) {
	11876	const REGEXP * const rx = PM_GETRE(PL_curpm);
	11877	if (rx) {
	11878	U32 i;
	11879	for (i = 1; i <= RX_NPARENS(rx); i++) {
	11880	char digits[TYPE_CHARS(long)];
	11881	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	11882	GV const const gvp
	11883	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	11884
	11885	if (gvp) {
	11886	GV * const gv = *gvp;
	11887	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	11888	save_scalar(gv);
	11889	}
	11890	}
	11891	}
	11892	}
	11893	}
	11894	#endif
	11895
	11896	static void
	11897	clear_re(pTHX_ void *r)
	11898	{
	11899	dVAR;
	11900	ReREFCNT_dec((REGEXP *)r);
	11901	}
	11902
	11903	#ifdef DEBUGGING
	11904
	11905	STATIC void
	11906	S_put_byte(pTHX_ SV *sv, int c)
	11907	{
	11908	PERL_ARGS_ASSERT_PUT_BYTE;
	11909
	11910	/* Our definition of isPRINT() ignores locales, so only bytes that are
	11911	not part of UTF-8 are considered printable. I assume that the same
	11912	holds for UTF-EBCDIC.
	11913	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	11914	which Wikipedia says:
	11915
	11916	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	11917	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	11918	identical, to the ASCII delete (DEL) or rubout control character.
	11919	) So the old condition can be simplified to !isPRINT(c) */
	11920	if (!isPRINT(c)) {
	11921	if (c < 256) {
	11922	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	11923	}
	11924	else {
	11925	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	11926	}
	11927	}
	11928	else {
	11929	const char string = c;
	11930	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	11931	sv_catpvs(sv, "\\");
	11932	sv_catpvn(sv, &string, 1);
	11933	}
	11934	}
	11935
	11936
	/* Helpers for S_dumpuntil(); they rely on its locals (optstart, node, r,
	 * start, last, sv, indent, depth).  Both expand to exactly one statement,
	 * so they are safe as the body of an unbraced if/else. */

	/* If a run of optimized-away nodes is open, report its length and close it. */
	#define CLEAR_OPTSTART \
	    STMT_START { \
	        if (optstart) { \
	            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	            optstart=NULL; \
	        } \
	    } STMT_END

	/* Recursively dump the nodes in [b, e) one level deeper and continue from
	 * wherever that dump stopped. */
	#define DUMPUNTIL(b,e) \
	    STMT_START { \
	        CLEAR_OPTSTART; \
	        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
	    } STMT_END
	11944
	11945	STATIC const regnode *
	11946	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	11947	const regnode last, const regnode plast,
	11948	SV* sv, I32 indent, U32 depth)
	11949	{
	11950	dVAR;
	11951	register U8 op = PSEUDO; /* Arbitrary non-END op. */
	11952	register const regnode *next;
	11953	const regnode *optstart= NULL;
	11954
	11955	RXi_GET_DECL(r,ri);
	11956	GET_RE_DEBUG_FLAGS_DECL;
	11957
	11958	PERL_ARGS_ASSERT_DUMPUNTIL;
	11959
	11960	#ifdef DEBUG_DUMPUNTIL
	11961	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	11962	last ? last-start : 0,plast ? plast-start : 0);
	11963	#endif
	11964
	11965	if (plast && plast < last)
	11966	last= plast;
	11967
	11968	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	11969	/* While that wasn't END last time... */
	11970	NODE_ALIGN(node);
	11971	op = OP(node);
	11972	if (op == CLOSE \|\| op == WHILEM)
	11973	indent--;
	11974	next = regnext((regnode *)node);
	11975
	11976	/* Where, what. */
	11977	if (OP(node) == OPTIMIZED) {
	11978	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	11979	optstart = node;
	11980	else
	11981	goto after_print;
	11982	} else
	11983	CLEAR_OPTSTART;
	11984
	11985	regprop(r, sv, node);
	11986	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	11987	(int)(2*indent + 1), "", SvPVX_const(sv));
	11988
	11989	if (OP(node) != OPTIMIZED) {
	11990	if (next == NULL) /* Next ptr. */
	11991	PerlIO_printf(Perl_debug_log, " (0)");
	11992	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	11993	PerlIO_printf(Perl_debug_log, " (FAIL)");
	11994	else
	11995	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	11996	(void)PerlIO_putc(Perl_debug_log, '\n');
	11997	}
	11998
	11999	after_print:
	12000	if (PL_regkind[(U8)op] == BRANCHJ) {
	12001	assert(next);
	12002	{
	12003	register const regnode *nnode = (OP(next) == LONGJMP
	12004	? regnext((regnode *)next)
	12005	: next);
	12006	if (last && nnode > last)
	12007	nnode = last;
	12008	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	12009	}
	12010	}
	12011	else if (PL_regkind[(U8)op] == BRANCH) {
	12012	assert(next);
	12013	DUMPUNTIL(NEXTOPER(node), next);
	12014	}
	12015	else if ( PL_regkind[(U8)op] == TRIE ) {
	12016	const regnode *this_trie = node;
	12017	const char op = OP(node);
	12018	const U32 n = ARG(node);
	12019	const reg_ac_data * const ac = op>=AHOCORASICK ?
	12020	(reg_ac_data *)ri->data->data[n] :
	12021	NULL;
	12022	const reg_trie_data * const trie =
	12023	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	12024	#ifdef DEBUGGING
	12025	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	12026	#endif
	12027	const regnode *nextbranch= NULL;
	12028	I32 word_idx;
	12029	sv_setpvs(sv, "");
	12030	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	12031	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	12032
	12033	PerlIO_printf(Perl_debug_log, "%*s%s ",
	12034	(int)(2*(indent+3)), "",
	12035	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	12036	PL_colors[0], PL_colors[1],
	12037	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	12038	PERL_PV_PRETTY_ELLIPSES \|
	12039	PERL_PV_PRETTY_LTGT
	12040	)
	12041	: "???"
	12042	);
	12043	if (trie->jump) {
	12044	U16 dist= trie->jump[word_idx+1];
	12045	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	12046	(UV)((dist ? this_trie + dist : next) - start));
	12047	if (dist) {
	12048	if (!nextbranch)
	12049	nextbranch= this_trie + trie->jump[0];
	12050	DUMPUNTIL(this_trie + dist, nextbranch);
	12051	}
	12052	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	12053	nextbranch= regnext((regnode *)nextbranch);
	12054	} else {
	12055	PerlIO_printf(Perl_debug_log, "\n");
	12056	}
	12057	}
	12058	if (last && next > last)
	12059	node= last;
	12060	else
	12061	node= next;
	12062	}
	12063	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	12064	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	12065	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	12066	}
	12067	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	12068	assert(next);
	12069	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	12070	}
	12071	else if ( op == PLUS \|\| op == STAR) {
	12072	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	12073	}
	12074	else if (PL_regkind[(U8)op] == ANYOF) {
	12075	/* arglen 1 + class block */
	12076	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	12077	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	12078	node = NEXTOPER(node);
	12079	}
	12080	else if (PL_regkind[(U8)op] == EXACT) {
	12081	/* Literal string, where present. */
	12082	node += NODE_SZ_STR(node) - 1;
	12083	node = NEXTOPER(node);
	12084	}
	12085	else {
	12086	node = NEXTOPER(node);
	12087	node += regarglen[(U8)op];
	12088	}
	12089	if (op == CURLYX \|\| op == OPEN)
	12090	indent++;
	12091	}
	12092	CLEAR_OPTSTART;
	12093	#ifdef DEBUG_DUMPUNTIL
	12094	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	12095	#endif
	12096	return node;
	12097	}
	12098
	12099	#endif /* DEBUGGING */
	12100
	12101	/*
	12102	* Local variables:
	12103	* c-indentation-style: bsd
	12104	* c-basic-offset: 4
	12105	* indent-tabs-mode: t
	12106	* End:
	12107	*
	12108	* ex: set ts=8 sts=4 sw=4 noet:
	12109	*/