perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	extern const struct regexp_engine my_reg_engine;
	85	#else
	86	# include "regcomp.h"
	87	#endif
	88
	89	#include "dquote_static.c"
	90	#include "charclass_invlists.h"
	91	#include "inline_invlist.c"
	92	#include "unicode_constants.h"
	93
	94	#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
	95	#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
	96	#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
	97
	98	#ifdef op
	99	#undef op
	100	#endif /* op */
	101
	102	#ifdef MSDOS
	103	# if defined(BUGGY_MSC6)
	104	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	105	# pragma optimize("a",off)
	106	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	107	# pragma optimize("w",on )
	108	# endif /* BUGGY_MSC6 */
	109	#endif /* MSDOS */
	110
	111	#ifndef STATIC
	112	#define STATIC static
	113	#endif
	114
	115
	116	typedef struct RExC_state_t {
	117	U32 flags; /* RXf_* are we folding, multilining? */
	118	U32 pm_flags; /* PMf_* stuff from the calling PMOP */
	119	char precomp; / uncompiled string. */
	120	REGEXP rx_sv; / The SV that is the regexp. */
	121	regexp rx; / perl core regexp structure */
	122	regexp_internal rxi; / internal data for regexp object pprivate field */
	123	char start; / Start of input for compile */
	124	char end; / End of input for compile */
	125	char parse; / Input-scan pointer. */
	126	SSize_t whilem_seen; /* number of WHILEM in this expr */
	127	regnode emit_start; / Start of emitted-code area */
	128	regnode emit_bound; / First regnode outside of the allocated space */
	129	regnode emit; / Code-emit pointer; if = &emit_dummy,
	130	implies compiling, so don't emit */
	131	regnode emit_dummy; /* placeholder for emit to point to */
	132	I32 naughty; /* How bad is this pattern? */
	133	I32 sawback; /* Did we see \1, ...? */
	134	U32 seen;
	135	SSize_t size; /* Code size. */
	136	I32 npar; /* Capture buffer count, (OPEN). */
	137	I32 cpar; /* Capture buffer count, (CLOSE). */
	138	I32 nestroot; /* root parens we are in - used by accept */
	139	I32 extralen;
	140	I32 seen_zerolen;
	141	regnode *open_parens; / pointers to open parens */
	142	regnode *close_parens; / pointers to close parens */
	143	regnode opend; / END node in program */
	144	I32 utf8; /* whether the pattern is utf8 or not */
	145	I32 orig_utf8; /* whether the pattern was originally in utf8 */
	146	/* XXX use this for future optimisation of case
	147	* where pattern must be upgraded to utf8. */
	148	I32 uni_semantics; /* If a d charset modifier should use unicode
	149	rules, even if the pattern is not in
	150	utf8 */
	151	HV paren_names; / Paren names */
	152
	153	regnode *recurse; / Recurse regops */
	154	I32 recurse_count; /* Number of recurse regops */
	155	I32 in_lookbehind;
	156	I32 contains_locale;
	157	I32 override_recoding;
	158	I32 in_multi_char_class;
	159	struct reg_code_block code_blocks; / positions of literal (?{})
	160	within pattern */
	161	int num_code_blocks; /* size of code_blocks[] */
	162	int code_index; /* next code_blocks[] slot */
	163	#if ADD_TO_REGEXEC
	164	char starttry; / -Dr: where regtry was called. */
	165	#define RExC_starttry (pRExC_state->starttry)
	166	#endif
	167	SV runtime_code_qr; / qr with the runtime code blocks */
	168	#ifdef DEBUGGING
	169	const char *lastparse;
	170	I32 lastnum;
	171	AV paren_name_list; / idx -> name */
	172	#define RExC_lastparse (pRExC_state->lastparse)
	173	#define RExC_lastnum (pRExC_state->lastnum)
	174	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	175	#endif
	176	} RExC_state_t;
	177
	178	#define RExC_flags (pRExC_state->flags)
	179	#define RExC_pm_flags (pRExC_state->pm_flags)
	180	#define RExC_precomp (pRExC_state->precomp)
	181	#define RExC_rx_sv (pRExC_state->rx_sv)
	182	#define RExC_rx (pRExC_state->rx)
	183	#define RExC_rxi (pRExC_state->rxi)
	184	#define RExC_start (pRExC_state->start)
	185	#define RExC_end (pRExC_state->end)
	186	#define RExC_parse (pRExC_state->parse)
	187	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	188	#ifdef RE_TRACK_PATTERN_OFFSETS
	189	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	190	#endif
	191	#define RExC_emit (pRExC_state->emit)
	192	#define RExC_emit_dummy (pRExC_state->emit_dummy)
	193	#define RExC_emit_start (pRExC_state->emit_start)
	194	#define RExC_emit_bound (pRExC_state->emit_bound)
	195	#define RExC_naughty (pRExC_state->naughty)
	196	#define RExC_sawback (pRExC_state->sawback)
	197	#define RExC_seen (pRExC_state->seen)
	198	#define RExC_size (pRExC_state->size)
	199	#define RExC_npar (pRExC_state->npar)
	200	#define RExC_nestroot (pRExC_state->nestroot)
	201	#define RExC_extralen (pRExC_state->extralen)
	202	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	203	#define RExC_utf8 (pRExC_state->utf8)
	204	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	205	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	206	#define RExC_open_parens (pRExC_state->open_parens)
	207	#define RExC_close_parens (pRExC_state->close_parens)
	208	#define RExC_opend (pRExC_state->opend)
	209	#define RExC_paren_names (pRExC_state->paren_names)
	210	#define RExC_recurse (pRExC_state->recurse)
	211	#define RExC_recurse_count (pRExC_state->recurse_count)
	212	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	213	#define RExC_contains_locale (pRExC_state->contains_locale)
	214	#define RExC_override_recoding (pRExC_state->override_recoding)
	215	#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
	216
	217
	218	#define ISMULT1(c) ((c) == '*' \|\| (c) == '+' \|\| (c) == '?')
	219	#define ISMULT2(s) ((s) == '' \|\| (s) == '+' \|\| (s) == '?' \|\| \
	220	((*s) == '{' && regcurly(s, FALSE)))
	221
	222	#ifdef SPSTART
	223	#undef SPSTART /* dratted cpp namespace... */
	224	#endif
	225	/*
	226	* Flags to be passed up and down.
	227	*/
	228	#define WORST 0 /* Worst case. */
	229	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	230
	231	/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
	232	* character. (There needs to be a case: in the switch statement in regexec.c
	233	* for any node marked SIMPLE.) Note that this is not the same thing as
	234	* REGNODE_SIMPLE */
	235	#define SIMPLE 0x02
	236	#define SPSTART 0x04 /* Starts with * or + */
	237	#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
	238	#define TRYAGAIN 0x10 /* Weeded out a declaration. */
	239	#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
	240
	241	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
	242
	243	/* whether trie related optimizations are enabled */
	244	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	245	#define TRIE_STUDY_OPT
	246	#define FULL_TRIE_STUDY
	247	#define TRIE_STCLASS
	248	#endif
	249
	250
	251
	252	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
	253	#define PBITVAL(paren) (1 << ((paren) & 7))
	254	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
	255	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) \|= PBITVAL(paren)
	256	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
	257
	258	#define REQUIRE_UTF8 STMT_START { \
	259	if (!UTF) { \
	260	*flagp = RESTART_UTF8; \
	261	return NULL; \
	262	} \
	263	} STMT_END
	264
	265	/* This converts the named class defined in regcomp.h to its equivalent class
	266	* number defined in handy.h. */
	267	#define namedclass_to_classnum(class) ((int) ((class) / 2))
	268	#define classnum_to_namedclass(classnum) ((classnum) * 2)
	269
	270	/* About scan_data_t.
	271
	272	During optimisation we recurse through the regexp program performing
	273	various inplace (keyhole style) optimisations. In addition study_chunk
	274	and scan_commit populate this data structure with information about
	275	what strings MUST appear in the pattern. We look for the longest
	276	string that must appear at a fixed location, and we look for the
	277	longest string that may appear at a floating location. So for instance
	278	in the pattern:
	279
	280	/FOO[xX]A.*B[xX]BAR/
	281
	282	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	283	strings (because they follow a .* construct). study_chunk will identify
	284	both FOO and BAR as being the longest fixed and floating strings respectively.
	285
	286	The strings can be composites, for instance
	287
	288	/(f)(o)(o)/
	289
	290	will result in a composite fixed substring 'foo'.
	291
	292	For each string some basic information is maintained:
	293
	294	- offset or min_offset
	295	This is the position the string must appear at, or not before.
	296	It also implicitly (when combined with minlenp) tells us how many
	297	characters must match before the string we are searching for.
	298	Likewise when combined with minlenp and the length of the string it
	299	tells us how many characters must appear after the string we have
	300	found.
	301
	302	- max_offset
	303	Only used for floating strings. This is the rightmost point that
	304	the string can appear at. If set to SSize_t_MAX it indicates that the
	305	string can occur infinitely far to the right.
	306
	307	- minlenp
	308	A pointer to the minimum number of characters of the pattern that the
	309	string was found inside. This is important as in the case of positive
	310	lookahead or positive lookbehind we can have multiple patterns
	311	involved. Consider
	312
	313	/(?=FOO).*F/
	314
	315	The minimum length of the pattern overall is 3, the minimum length
	316	of the lookahead part is 3, but the minimum length of the part that
	317	will actually match is 1. So 'FOO's minimum length is 3, but the
	318	minimum length for the F is 1. This is important as the minimum length
	319	is used to determine offsets in front of and behind the string being
	320	looked for. Since strings can be composites this is the length of the
	321	pattern at the time it was committed with a scan_commit. Note that
	322	the length is calculated by study_chunk, so that the minimum lengths
	323	are not known until the full pattern has been compiled, thus the
	324	pointer to the value.
	325
	326	- lookbehind
	327
	328	In the case of lookbehind the string being searched for can be
	329	offset past the start point of the final matching string.
	330	If this value was just blithely removed from the min_offset it would
	331	invalidate some of the calculations for how many chars must match
	332	before or after (as they are derived from min_offset and minlen and
	333	the length of the string being searched for).
	334	When the final pattern is compiled and the data is moved from the
	335	scan_data_t structure into the regexp structure the information
	336	about lookbehind is factored in, with the information that would
	337	have been lost precalculated in the end_shift field for the
	338	associated string.
	339
	340	The fields pos_min and pos_delta are used to store the minimum offset
	341	and the delta to the maximum offset at the current point in the pattern.
	342
	343	*/
	344
	345	typedef struct scan_data_t {
	346	/I32 len_min; unused /
	347	/I32 len_delta; unused /
	348	SSize_t pos_min;
	349	SSize_t pos_delta;
	350	SV *last_found;
	351	SSize_t last_end; /* min value, <0 unless valid. */
	352	SSize_t last_start_min;
	353	SSize_t last_start_max;
	354	SV *longest; / Either &l_fixed, or &l_float. */
	355	SV longest_fixed; / longest fixed string found in pattern */
	356	SSize_t offset_fixed; /* offset where it starts */
	357	SSize_t minlen_fixed; / pointer to the minlen relevant to the string */
	358	I32 lookbehind_fixed; /* is the position of the string modfied by LB */
	359	SV longest_float; / longest floating string found in pattern */
	360	SSize_t offset_float_min; /* earliest point in string it can appear */
	361	SSize_t offset_float_max; /* latest point in string it can appear */
	362	SSize_t minlen_float; / pointer to the minlen relevant to the string */
	363	SSize_t lookbehind_float; /* is the pos of the string modified by LB */
	364	I32 flags;
	365	I32 whilem_c;
	366	SSize_t *last_closep;
	367	struct regnode_charclass_class *start_class;
	368	} scan_data_t;
	369
	370	/* The below is perhaps overboard, but this allows us to save a test at the
	371	* expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
	372	* and 'a' differ by a single bit; the same with the upper and lower case of
	373	* all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
	374	* on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
	375	* then inverts it to form a mask, with just a single 0, in the bit position
	376	* where the upper- and lowercase differ. XXX There are about 40 other
	377	* instances in the Perl core where this micro-optimization could be used.
	378	* Should decide if maintenance cost is worse, before changing those
	379	*
	380	* Returns a boolean as to whether or not 'v' is either a lowercase or
	381	* uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
	382	* compile-time constant, the generated code is better than some optimizing
	383	* compilers figure out, amounting to a mask and test. The results are
	384	* meaningless if 'c' is not one of [A-Za-z] */
	385	#define isARG2_lower_or_UPPER_ARG1(c, v) \
	386	(((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
	387
	388	/*
	389	* Forward declarations for pregcomp()'s friends.
	390	*/
	391
	392	static const scan_data_t zero_scan_data =
	393	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	394
	395	#define SF_BEFORE_EOL (SF_BEFORE_SEOL\|SF_BEFORE_MEOL)
	396	#define SF_BEFORE_SEOL 0x0001
	397	#define SF_BEFORE_MEOL 0x0002
	398	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL\|SF_FIX_BEFORE_MEOL)
	399	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL\|SF_FL_BEFORE_MEOL)
	400
	401	#ifdef NO_UNARY_PLUS
	402	# define SF_FIX_SHIFT_EOL (0+2)
	403	# define SF_FL_SHIFT_EOL (0+4)
	404	#else
	405	# define SF_FIX_SHIFT_EOL (+2)
	406	# define SF_FL_SHIFT_EOL (+4)
	407	#endif
	408
	409	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
	410	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
	411
	412	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
	413	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	414	#define SF_IS_INF 0x0040
	415	#define SF_HAS_PAR 0x0080
	416	#define SF_IN_PAR 0x0100
	417	#define SF_HAS_EVAL 0x0200
	418	#define SCF_DO_SUBSTR 0x0400
	419	#define SCF_DO_STCLASS_AND 0x0800
	420	#define SCF_DO_STCLASS_OR 0x1000
	421	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND\|SCF_DO_STCLASS_OR)
	422	#define SCF_WHILEM_VISITED_POS 0x2000
	423
	424	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	425	#define SCF_SEEN_ACCEPT 0x8000
	426	#define SCF_TRIE_DOING_RESTUDY 0x10000
	427
	428	#define UTF cBOOL(RExC_utf8)
	429
	430	/* The enums for all these are ordered so things work out correctly */
	431	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	432	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	433	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	434	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	435	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	436	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	437	#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	438
	439	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	440
	441	#define OOB_NAMEDCLASS -1
	442
	443	/* There is no code point that is out-of-bounds, so this is problematic. But
	444	* its only current use is to initialize a variable that is always set before
	445	* looked at. */
	446	#define OOB_UNICODE 0xDEADBEEF
	447
	448	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
	449	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
	450
	451
	452	/* length of regex to show in messages that don't mark a position within */
	453	#define RegexLengthToShowInErrorMessages 127
	454
	455	/*
	456	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	457	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	458	* op/pragma/warn/regcomp.
	459	*/
	460	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	461	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	462
	463	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	464
	465	/*
	466	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	467	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	468	* "...".
	469	*/
	470	#define _FAIL(code) STMT_START { \
	471	const char *ellipses = ""; \
	472	IV len = RExC_end - RExC_precomp; \
	473	\
	474	if (!SIZE_ONLY) \
	475	SAVEFREESV(RExC_rx_sv); \
	476	if (len > RegexLengthToShowInErrorMessages) { \
	477	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	478	len = RegexLengthToShowInErrorMessages - 10; \
	479	ellipses = "..."; \
	480	} \
	481	code; \
	482	} STMT_END
	483
	484	#define FAIL(msg) _FAIL( \
	485	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	486	msg, (int)len, RExC_precomp, ellipses))
	487
	488	#define FAIL2(msg,arg) _FAIL( \
	489	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	490	arg, (int)len, RExC_precomp, ellipses))
	491
	492	/*
	493	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	494	*/
	495	#define Simple_vFAIL(m) STMT_START { \
	496	const IV offset = RExC_parse - RExC_precomp; \
	497	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	498	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	499	} STMT_END
	500
	501	/*
	502	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	503	*/
	504	#define vFAIL(m) STMT_START { \
	505	if (!SIZE_ONLY) \
	506	SAVEFREESV(RExC_rx_sv); \
	507	Simple_vFAIL(m); \
	508	} STMT_END
	509
	510	/*
	511	* Like Simple_vFAIL(), but accepts two arguments.
	512	*/
	513	#define Simple_vFAIL2(m,a1) STMT_START { \
	514	const IV offset = RExC_parse - RExC_precomp; \
	515	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	516	(int)offset, RExC_precomp, RExC_precomp + offset); \
	517	} STMT_END
	518
	519	/*
	520	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	521	*/
	522	#define vFAIL2(m,a1) STMT_START { \
	523	if (!SIZE_ONLY) \
	524	SAVEFREESV(RExC_rx_sv); \
	525	Simple_vFAIL2(m, a1); \
	526	} STMT_END
	527
	528
	529	/*
	530	* Like Simple_vFAIL(), but accepts three arguments.
	531	*/
	532	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	533	const IV offset = RExC_parse - RExC_precomp; \
	534	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	535	(int)offset, RExC_precomp, RExC_precomp + offset); \
	536	} STMT_END
	537
	538	/*
	539	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	540	*/
	541	#define vFAIL3(m,a1,a2) STMT_START { \
	542	if (!SIZE_ONLY) \
	543	SAVEFREESV(RExC_rx_sv); \
	544	Simple_vFAIL3(m, a1, a2); \
	545	} STMT_END
	546
	547	/*
	548	* Like Simple_vFAIL(), but accepts four arguments.
	549	*/
	550	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	551	const IV offset = RExC_parse - RExC_precomp; \
	552	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	553	(int)offset, RExC_precomp, RExC_precomp + offset); \
	554	} STMT_END
	555
	556	#define vFAIL4(m,a1,a2,a3) STMT_START { \
	557	if (!SIZE_ONLY) \
	558	SAVEFREESV(RExC_rx_sv); \
	559	Simple_vFAIL4(m, a1, a2, a3); \
	560	} STMT_END
	561
	562	/* m is not necessarily a "literal string", in this macro */
	563	#define reg_warn_non_literal_string(loc, m) STMT_START { \
	564	const IV offset = loc - RExC_precomp; \
	565	Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
	566	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	567	} STMT_END
	568
	569	#define ckWARNreg(loc,m) STMT_START { \
	570	const IV offset = loc - RExC_precomp; \
	571	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	572	(int)offset, RExC_precomp, RExC_precomp + offset); \
	573	} STMT_END
	574
	575	#define vWARN_dep(loc, m) STMT_START { \
	576	const IV offset = loc - RExC_precomp; \
	577	Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
	578	(int)offset, RExC_precomp, RExC_precomp + offset); \
	579	} STMT_END
	580
	581	#define ckWARNdep(loc,m) STMT_START { \
	582	const IV offset = loc - RExC_precomp; \
	583	Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
	584	m REPORT_LOCATION, \
	585	(int)offset, RExC_precomp, RExC_precomp + offset); \
	586	} STMT_END
	587
	588	#define ckWARNregdep(loc,m) STMT_START { \
	589	const IV offset = loc - RExC_precomp; \
	590	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	591	m REPORT_LOCATION, \
	592	(int)offset, RExC_precomp, RExC_precomp + offset); \
	593	} STMT_END
	594
	595	#define ckWARN2reg_d(loc,m, a1) STMT_START { \
	596	const IV offset = loc - RExC_precomp; \
	597	Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
	598	m REPORT_LOCATION, \
	599	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	600	} STMT_END
	601
	602	#define ckWARN2reg(loc, m, a1) STMT_START { \
	603	const IV offset = loc - RExC_precomp; \
	604	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	605	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	606	} STMT_END
	607
	608	#define vWARN3(loc, m, a1, a2) STMT_START { \
	609	const IV offset = loc - RExC_precomp; \
	610	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	611	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	612	} STMT_END
	613
	614	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	615	const IV offset = loc - RExC_precomp; \
	616	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	617	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	618	} STMT_END
	619
	620	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	621	const IV offset = loc - RExC_precomp; \
	622	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	623	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	624	} STMT_END
	625
	626	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	627	const IV offset = loc - RExC_precomp; \
	628	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	629	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	630	} STMT_END
	631
	632	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	633	const IV offset = loc - RExC_precomp; \
	634	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	635	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	636	} STMT_END
	637
	638
	639	/* Allow for side effects in s */
	640	#define REGC(c,s) STMT_START { \
	641	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	642	} STMT_END
	643
	644	/* Macros for recording node offsets. 20001227 mjd@plover.com
	645	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	646	* element 2*n-1 of the array. Element #2n holds the byte length node #n.
	647	* Element 0 holds the number n.
	648	* Position is 1 indexed.
	649	*/
	650	#ifndef RE_TRACK_PATTERN_OFFSETS
	651	#define Set_Node_Offset_To_R(node,byte)
	652	#define Set_Node_Offset(node,byte)
	653	#define Set_Cur_Node_Offset
	654	#define Set_Node_Length_To_R(node,len)
	655	#define Set_Node_Length(node,len)
	656	#define Set_Node_Cur_Length(node,start)
	657	#define Node_Offset(n)
	658	#define Node_Length(n)
	659	#define Set_Node_Offset_Length(node,offset,len)
	660	#define ProgLen(ri) ri->u.proglen
	661	#define SetProgLen(ri,x) ri->u.proglen = x
	662	#else
	663	#define ProgLen(ri) ri->u.offsets[0]
	664	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	665	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	666	if (! SIZE_ONLY) { \
	667	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	668	__LINE__, (int)(node), (int)(byte))); \
	669	if((node) < 0) { \
	670	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	671	} else { \
	672	RExC_offsets[2*(node)-1] = (byte); \
	673	} \
	674	} \
	675	} STMT_END
	676
	677	#define Set_Node_Offset(node,byte) \
	678	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	679	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	680
	681	#define Set_Node_Length_To_R(node,len) STMT_START { \
	682	if (! SIZE_ONLY) { \
	683	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	684	__LINE__, (int)(node), (int)(len))); \
	685	if((node) < 0) { \
	686	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	687	} else { \
	688	RExC_offsets[2*(node)] = (len); \
	689	} \
	690	} \
	691	} STMT_END
	692
	693	#define Set_Node_Length(node,len) \
	694	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	695	#define Set_Node_Cur_Length(node, start) \
	696	Set_Node_Length(node, RExC_parse - start)
	697
	698	/* Get offsets and lengths */
	699	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	700	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	701
	702	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	703	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	704	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	705	} STMT_END
	706	#endif
	707
	708	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	709	#define EXPERIMENTAL_INPLACESCAN
	710	#endif /PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS/
	711
	712	#define DEBUG_STUDYDATA(str,data,depth) \
	713	DEBUG_OPTIMISE_MORE_r(if(data){ \
	714	PerlIO_printf(Perl_debug_log, \
	715	"%*s" str "Pos:%"IVdf"/%"IVdf \
	716	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	717	(int)(depth)*2, "", \
	718	(IV)((data)->pos_min), \
	719	(IV)((data)->pos_delta), \
	720	(UV)((data)->flags), \
	721	(IV)((data)->whilem_c), \
	722	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	723	is_inf ? "INF " : "" \
	724	); \
	725	if ((data)->last_found) \
	726	PerlIO_printf(Perl_debug_log, \
	727	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	728	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	729	SvPVX_const((data)->last_found), \
	730	(IV)((data)->last_end), \
	731	(IV)((data)->last_start_min), \
	732	(IV)((data)->last_start_max), \
	733	((data)->longest && \
	734	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	735	SvPVX_const((data)->longest_fixed), \
	736	(IV)((data)->offset_fixed), \
	737	((data)->longest && \
	738	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	739	SvPVX_const((data)->longest_float), \
	740	(IV)((data)->offset_float_min), \
	741	(IV)((data)->offset_float_max) \
	742	); \
	743	PerlIO_printf(Perl_debug_log,"\n"); \
	744	});
	745
	746	/* Mark that we cannot extend a found fixed substring at this point.
	747	Update the longest found anchored substring and the longest found
	748	floating substrings if needed. */
	749
	750	STATIC void
	751	S_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data,
	752	SSize_t *minlenp, int is_inf)
	753	{
	754	const STRLEN l = CHR_SVLEN(data->last_found);
	755	const STRLEN old_l = CHR_SVLEN(*data->longest);
	756	GET_RE_DEBUG_FLAGS_DECL;
	757
	758	PERL_ARGS_ASSERT_SCAN_COMMIT;
	759
	760	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	761	SvSetMagicSV(*data->longest, data->last_found);
	762	if (*data->longest == data->longest_fixed) {
	763	data->offset_fixed = l ? data->last_start_min : data->pos_min;
	764	if (data->flags & SF_BEFORE_EOL)
	765	data->flags
	766	\|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	767	else
	768	data->flags &= ~SF_FIX_BEFORE_EOL;
	769	data->minlen_fixed=minlenp;
	770	data->lookbehind_fixed=0;
	771	}
	772	else { /* data->longest == data->longest_float /
	773	data->offset_float_min = l ? data->last_start_min : data->pos_min;
	774	data->offset_float_max = (l
	775	? data->last_start_max
	776	: (data->pos_delta == SSize_t_MAX
	777	? SSize_t_MAX
	778	: data->pos_min + data->pos_delta));
	779	if (is_inf
	780	\|\| (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
	781	data->offset_float_max = SSize_t_MAX;
	782	if (data->flags & SF_BEFORE_EOL)
	783	data->flags
	784	\|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	785	else
	786	data->flags &= ~SF_FL_BEFORE_EOL;
	787	data->minlen_float=minlenp;
	788	data->lookbehind_float=0;
	789	}
	790	}
	791	SvCUR_set(data->last_found, 0);
	792	{
	793	SV * const sv = data->last_found;
	794	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	795	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	796	if (mg)
	797	mg->mg_len = 0;
	798	}
	799	}
	800	data->last_end = -1;
	801	data->flags &= ~SF_BEFORE_EOL;
	802	DEBUG_STUDYDATA("commit: ",data,0);
	803	}
	804
	805	/* These macros set, clear and test whether the synthetic start class ('ssc',
	806	* given by the parameter) matches an empty string (EOS). This uses the
	807	* 'next_off' field in the node, to save a bit in the flags field. The ssc
	808	* stands alone, so there is never a next_off, so this field is otherwise
	809	* unused. The EOS information is used only for compilation, but theoretically
	810	* it could be passed on to the execution code. This could be used to store
	811	* more than one bit of information, but only this one is currently used. */
	812	#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
	813	#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
	814	#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
	815
	816	/* Can match anything (initialization) */
	817	STATIC void
	818	S_cl_anything(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	819	{
	820	PERL_ARGS_ASSERT_CL_ANYTHING;
	821
	822	ANYOF_BITMAP_SETALL(cl);
	823	cl->flags = ANYOF_UNICODE_ALL;
	824	SET_SSC_EOS(cl);
	825
	826	/* If any portion of the regex is to operate under locale rules,
	827	* initialization includes it. The reason this isn't done for all regexes
	828	* is that the optimizer was written under the assumption that locale was
	829	* all-or-nothing. Given the complexity and lack of documentation in the
	830	* optimizer, and that there are inadequate test cases for locale, so many
	831	* parts of it may not work properly, it is safest to avoid locale unless
	832	* necessary. */
	833	if (RExC_contains_locale) {
	834	ANYOF_CLASS_SETALL(cl); /* /l uses class */
	835	cl->flags \|= ANYOF_LOCALE\|ANYOF_CLASS\|ANYOF_LOC_FOLD;
	836	}
	837	else {
	838	ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	839	}
	840	}
	841
	842	/* Can match anything (initialization) */
	843	STATIC int
	844	S_cl_is_anything(const struct regnode_charclass_class *cl)
	845	{
	846	int value;
	847
	848	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	849
	850	for (value = 0; value < ANYOF_MAX; value += 2)
	851	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	852	return 1;
	853	if (!(cl->flags & ANYOF_UNICODE_ALL))
	854	return 0;
	855	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	856	return 0;
	857	return 1;
	858	}
	859
	860	/* Can match anything (initialization) */
	861	STATIC void
	862	S_cl_init(const RExC_state_t pRExC_state, struct regnode_charclass_class cl)
	863	{
	864	PERL_ARGS_ASSERT_CL_INIT;
	865
	866	Zero(cl, 1, struct regnode_charclass_class);
	867	cl->type = ANYOF;
	868	cl_anything(pRExC_state, cl);
	869	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	870	}
	871
	872	/* These two functions currently do the exact same thing */
	873	#define cl_init_zero cl_init
	874
	875	/* 'AND' a given class with another one. Can create false positives. 'cl'
	876	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	877	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	878	STATIC void
	879	S_cl_and(struct regnode_charclass_class *cl,
	880	const struct regnode_charclass_class *and_with)
	881	{
	882	PERL_ARGS_ASSERT_CL_AND;
	883
	884	assert(PL_regkind[and_with->type] == ANYOF);
	885
	886	/* I (khw) am not sure all these restrictions are necessary XXX */
	887	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	888	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	889	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	890	&& !(and_with->flags & ANYOF_LOC_FOLD)
	891	&& !(cl->flags & ANYOF_LOC_FOLD)) {
	892	int i;
	893
	894	if (and_with->flags & ANYOF_INVERT)
	895	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	896	cl->bitmap[i] &= ~and_with->bitmap[i];
	897	else
	898	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	899	cl->bitmap[i] &= and_with->bitmap[i];
	900	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	901
	902	if (and_with->flags & ANYOF_INVERT) {
	903
	904	/* Here, the and'ed node is inverted. Get the AND of the flags that
	905	* aren't affected by the inversion. Those that are affected are
	906	* handled individually below */
	907	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	908	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	909	cl->flags \|= affected_flags;
	910
	911	/* We currently don't know how to deal with things that aren't in the
	912	* bitmap, but we know that the intersection is no greater than what
	913	* is already in cl, so let there be false positives that get sorted
	914	* out after the synthetic start class succeeds, and the node is
	915	* matched for real. */
	916
	917	/* The inversion of these two flags indicate that the resulting
	918	* intersection doesn't have them */
	919	if (and_with->flags & ANYOF_UNICODE_ALL) {
	920	cl->flags &= ~ANYOF_UNICODE_ALL;
	921	}
	922	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	923	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	924	}
	925	}
	926	else { /* and'd node is not inverted */
	927	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	928
	929	if (! ANYOF_NONBITMAP(and_with)) {
	930
	931	/* Here 'and_with' doesn't match anything outside the bitmap
	932	* (except possibly ANYOF_UNICODE_ALL), which means the
	933	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	934	* which case we don't know what the intersection is, but it's no
	935	* greater than what cl already has, so can just leave it alone,
	936	* with possible false positives */
	937	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	938	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	939	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	940	}
	941	}
	942	else if (! ANYOF_NONBITMAP(cl)) {
	943
	944	/* Here, 'and_with' does match something outside the bitmap, and cl
	945	* doesn't have a list of things to match outside the bitmap. If
	946	* cl can match all code points above 255, the intersection will
	947	* be those above-255 code points that 'and_with' matches. If cl
	948	* can't match all Unicode code points, it means that it can't
	949	* match anything outside the bitmap (since the 'if' that got us
	950	* into this block tested for that), so we leave the bitmap empty.
	951	*/
	952	if (cl->flags & ANYOF_UNICODE_ALL) {
	953	ARG_SET(cl, ARG(and_with));
	954
	955	/* and_with's ARG may match things that don't require UTF8.
	956	* And now cl's will too, in spite of this being an 'and'. See
	957	* the comments below about the kludge */
	958	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	959	}
	960	}
	961	else {
	962	/* Here, both 'and_with' and cl match something outside the
	963	* bitmap. Currently we do not do the intersection, so just match
	964	* whatever cl had at the beginning. */
	965	}
	966
	967
	968	/* Take the intersection of the two sets of flags. However, the
	969	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	970	* kludge around the fact that this flag is not treated like the others
	971	* which are initialized in cl_anything(). The way the optimizer works
	972	* is that the synthetic start class (SSC) is initialized to match
	973	* anything, and then the first time a real node is encountered, its
	974	* values are AND'd with the SSC's with the result being the values of
	975	* the real node. However, there are paths through the optimizer where
	976	* the AND never gets called, so those initialized bits are set
	977	* inappropriately, which is not usually a big deal, as they just cause
	978	* false positives in the SSC, which will just mean a probably
	979	* imperceptible slow down in execution. However this bit has a
	980	* higher false positive consequence in that it can cause utf8.pm,
	981	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	982	* bigger slowdown and also causes significant extra memory to be used.
	983	* In order to prevent this, the code now takes a different tack. The
	984	* bit isn't set unless some part of the regular expression needs it,
	985	* but once set it won't get cleared. This means that these extra
	986	* modules won't get loaded unless there was some path through the
	987	* pattern that would have required them anyway, and so any false
	988	* positives that occur by not ANDing them out when they could be
	989	* aren't as severe as they would be if we treated this bit like all
	990	* the others */
	991	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	992	& ANYOF_NONBITMAP_NON_UTF8;
	993	cl->flags &= and_with->flags;
	994	cl->flags \|= outside_bitmap_but_not_utf8;
	995	}
	996	}
	997
	998	/* 'OR' a given class with another one. Can create false positives. 'cl'
	999	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	1000	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	1001	STATIC void
	1002	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	1003	{
	1004	PERL_ARGS_ASSERT_CL_OR;
	1005
	1006	if (or_with->flags & ANYOF_INVERT) {
	1007
	1008	/* Here, the or'd node is to be inverted. This means we take the
	1009	* complement of everything not in the bitmap, but currently we don't
	1010	* know what that is, so give up and match anything */
	1011	if (ANYOF_NONBITMAP(or_with)) {
	1012	cl_anything(pRExC_state, cl);
	1013	}
	1014	/* We do not use
	1015	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	1016	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	1017	* which is wasteful if CL2 is small, but we ignore CL2:
	1018	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	1019	* XXXX Can we handle case-fold? Unclear:
	1020	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	1021	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	1022	*/
	1023	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	1024	&& !(or_with->flags & ANYOF_LOC_FOLD)
	1025	&& !(cl->flags & ANYOF_LOC_FOLD) ) {
	1026	int i;
	1027
	1028	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	1029	cl->bitmap[i] \|= ~or_with->bitmap[i];
	1030	} /* XXXX: logic is complicated otherwise */
	1031	else {
	1032	cl_anything(pRExC_state, cl);
	1033	}
	1034
	1035	/* And, we can just take the union of the flags that aren't affected
	1036	* by the inversion */
	1037	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	1038
	1039	/* For the remaining flags:
	1040	ANYOF_UNICODE_ALL and inverted means to not match anything above
	1041	255, which means that the union with cl should just be
	1042	what cl has in it, so can ignore this flag
	1043	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	1044	is (ASCII) 127-255 to match them, but then invert that, so
	1045	the union with cl should just be what cl has in it, so can
	1046	ignore this flag
	1047	*/
	1048	} else { /* 'or_with' is not inverted */
	1049	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	1050	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	1051	&& (!(or_with->flags & ANYOF_LOC_FOLD)
	1052	\|\| (cl->flags & ANYOF_LOC_FOLD)) ) {
	1053	int i;
	1054
	1055	/* OR char bitmap and class bitmap separately */
	1056	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	1057	cl->bitmap[i] \|= or_with->bitmap[i];
	1058	if (or_with->flags & ANYOF_CLASS) {
	1059	ANYOF_CLASS_OR(or_with, cl);
	1060	}
	1061	}
	1062	else { /* XXXX: logic is complicated, leave it along for a moment. */
	1063	cl_anything(pRExC_state, cl);
	1064	}
	1065
	1066	if (ANYOF_NONBITMAP(or_with)) {
	1067
	1068	/* Use the added node's outside-the-bit-map match if there isn't a
	1069	* conflict. If there is a conflict (both nodes match something
	1070	* outside the bitmap, but what they match outside is not the same
	1071	* pointer, and hence not easily compared until XXX we extend
	1072	* inversion lists this far), give up and allow the start class to
	1073	* match everything outside the bitmap. If that stuff is all above
	1074	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	1075	if (! ANYOF_NONBITMAP(cl)) {
	1076	ARG_SET(cl, ARG(or_with));
	1077	}
	1078	else if (ARG(cl) != ARG(or_with)) {
	1079
	1080	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	1081	cl_anything(pRExC_state, cl);
	1082	}
	1083	else {
	1084	cl->flags \|= ANYOF_UNICODE_ALL;
	1085	}
	1086	}
	1087	}
	1088
	1089	/* Take the union */
	1090	cl->flags \|= or_with->flags;
	1091	}
	1092	}
	1093
	1094	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	1095	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	1096	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	1097	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
	1098
	1099
	1100	#ifdef DEBUGGING
	1101	/*
	1102	dump_trie(trie,widecharmap,revcharmap)
	1103	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1104	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1105
	1106	These routines dump out a trie in a somewhat readable format.
	1107	The _interim_ variants are used for debugging the interim
	1108	tables that are used to generate the final compressed
	1109	representation which is what dump_trie expects.
	1110
	1111	Part of the reason for their existence is to provide a form
	1112	of documentation as to how the different representations function.
	1113
	1114	*/
	1115
	1116	/*
	1117	Dumps the final compressed table form of the trie to Perl_debug_log.
	1118	Used for debugging make_trie().
	1119	*/
	1120
	1121	STATIC void
	1122	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1123	AV *revcharmap, U32 depth)
	1124	{
	1125	U32 state;
	1126	SV *sv=sv_newmortal();
	1127	int colwidth= widecharmap ? 6 : 4;
	1128	U16 word;
	1129	GET_RE_DEBUG_FLAGS_DECL;
	1130
	1131	PERL_ARGS_ASSERT_DUMP_TRIE;
	1132
	1133	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1134	(int)depth * 2 + 2,"",
	1135	"Match","Base","Ofs" );
	1136
	1137	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1138	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1139	if ( tmp ) {
	1140	PerlIO_printf( Perl_debug_log, "%*s",
	1141	colwidth,
	1142	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1143	PL_colors[0], PL_colors[1],
	1144	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1145	PERL_PV_ESCAPE_FIRSTCHAR
	1146	)
	1147	);
	1148	}
	1149	}
	1150	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1151	(int)depth * 2 + 2,"");
	1152
	1153	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1154	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1155	PerlIO_printf( Perl_debug_log, "\n");
	1156
	1157	for( state = 1 ; state < trie->statecount ; state++ ) {
	1158	const U32 base = trie->states[ state ].trans.base;
	1159
	1160	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1161
	1162	if ( trie->states[ state ].wordnum ) {
	1163	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1164	} else {
	1165	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1166	}
	1167
	1168	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1169
	1170	if ( base ) {
	1171	U32 ofs = 0;
	1172
	1173	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1174	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1175	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1176	ofs++;
	1177
	1178	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1179
	1180	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1181	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1182	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1183	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1184	{
	1185	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1186	colwidth,
	1187	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1188	} else {
	1189	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1190	}
	1191	}
	1192
	1193	PerlIO_printf( Perl_debug_log, "]");
	1194
	1195	}
	1196	PerlIO_printf( Perl_debug_log, "\n" );
	1197	}
	1198	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1199	for (word=1; word <= trie->wordcount; word++) {
	1200	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1201	(int)word, (int)(trie->wordinfo[word].prev),
	1202	(int)(trie->wordinfo[word].len));
	1203	}
	1204	PerlIO_printf(Perl_debug_log, "\n" );
	1205	}
	1206	/*
	1207	Dumps a fully constructed but uncompressed trie in list form.
	1208	List tries normally only are used for construction when the number of
	1209	possible chars (trie->uniquecharcount) is very high.
	1210	Used for debugging make_trie().
	1211	*/
	1212	STATIC void
	1213	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1214	HV widecharmap, AV revcharmap, U32 next_alloc,
	1215	U32 depth)
	1216	{
	1217	U32 state;
	1218	SV *sv=sv_newmortal();
	1219	int colwidth= widecharmap ? 6 : 4;
	1220	GET_RE_DEBUG_FLAGS_DECL;
	1221
	1222	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1223
	1224	/* print out the table precompression. */
	1225	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1226	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1227	"------:-----+-----------------\n" );
	1228
	1229	for( state=1 ; state < next_alloc ; state ++ ) {
	1230	U16 charid;
	1231
	1232	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1233	(int)depth * 2 + 2,"", (UV)state );
	1234	if ( ! trie->states[ state ].wordnum ) {
	1235	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1236	} else {
	1237	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1238	trie->states[ state ].wordnum
	1239	);
	1240	}
	1241	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1242	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1243	if ( tmp ) {
	1244	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1245	colwidth,
	1246	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1247	PL_colors[0], PL_colors[1],
	1248	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1249	PERL_PV_ESCAPE_FIRSTCHAR
	1250	) ,
	1251	TRIE_LIST_ITEM(state,charid).forid,
	1252	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1253	);
	1254	if (!(charid % 10))
	1255	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1256	(int)((depth * 2) + 14), "");
	1257	}
	1258	}
	1259	PerlIO_printf( Perl_debug_log, "\n");
	1260	}
	1261	}
	1262
	1263	/*
	1264	Dumps a fully constructed but uncompressed trie in table form.
	1265	This is the normal DFA style state transition table, with a few
	1266	twists to facilitate compression later.
	1267	Used for debugging make_trie().
	1268	*/
	1269	STATIC void
	1270	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1271	HV widecharmap, AV revcharmap, U32 next_alloc,
	1272	U32 depth)
	1273	{
	1274	U32 state;
	1275	U16 charid;
	1276	SV *sv=sv_newmortal();
	1277	int colwidth= widecharmap ? 6 : 4;
	1278	GET_RE_DEBUG_FLAGS_DECL;
	1279
	1280	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1281
	1282	/*
	1283	print out the table precompression so that we can do a visual check
	1284	that they are identical.
	1285	*/
	1286
	1287	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1288
	1289	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1290	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1291	if ( tmp ) {
	1292	PerlIO_printf( Perl_debug_log, "%*s",
	1293	colwidth,
	1294	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1295	PL_colors[0], PL_colors[1],
	1296	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1297	PERL_PV_ESCAPE_FIRSTCHAR
	1298	)
	1299	);
	1300	}
	1301	}
	1302
	1303	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1304
	1305	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1306	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1307	}
	1308
	1309	PerlIO_printf( Perl_debug_log, "\n" );
	1310
	1311	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1312
	1313	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1314	(int)depth * 2 + 2,"",
	1315	(UV)TRIE_NODENUM( state ) );
	1316
	1317	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1318	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1319	if (v)
	1320	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1321	else
	1322	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1323	}
	1324	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1325	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1326	} else {
	1327	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1328	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1329	}
	1330	}
	1331	}
	1332
	1333	#endif
	1334
	1335
	1336	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1337	startbranch: the first branch in the whole branch sequence
	1338	first : start branch of sequence of branch-exact nodes.
	1339	May be the same as startbranch
	1340	last : Thing following the last branch.
	1341	May be the same as tail.
	1342	tail : item following the branch sequence
	1343	count : words in the sequence
	1344	flags : currently the OP() type we will be building one of /EXACT(\|F\|Fl)/
	1345	depth : indent depth
	1346
	1347	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1348
	1349	A trie is an N'ary tree where the branches are determined by digital
	1350	decomposition of the key. IE, at the root node you look up the 1st character and
	1351	follow that branch repeat until you find the end of the branches. Nodes can be
	1352	marked as "accepting" meaning they represent a complete word. Eg:
	1353
	1354	/he\|she\|his\|hers/
	1355
	1356	would convert into the following structure. Numbers represent states, letters
	1357	following numbers represent valid transitions on the letter from that state, if
	1358	the number is in square brackets it represents an accepting state, otherwise it
	1359	will be in parenthesis.
	1360
	1361	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	1362	\| \|
	1363	\| (2)
	1364	\| \|
	1365	(1) +-i->(6)-+-s->[7]
	1366	\|
	1367	+-s->(3)-+-h->(4)-+-e->[5]
	1368
	1369	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1370
	1371	This shows that when matching against the string 'hers' we will begin at state 1
	1372	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1373	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1374	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1375	single traverse. We store a mapping from accepting to state to which word was
	1376	matched, and then when we have multiple possibilities we try to complete the
	1377	rest of the regex in the order in which they occured in the alternation.
	1378
	1379	The only prior NFA like behaviour that would be changed by the TRIE support is
	1380	the silent ignoring of duplicate alternations which are of the form:
	1381
	1382	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	1383
	1384	Thus EVAL blocks following a trie may be called a different number of times with
	1385	and without the optimisation. With the optimisations dupes will be silently
	1386	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1387	the following demonstrates:
	1388
	1389	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	1390
	1391	which prints out 'word' three times, but
	1392
	1393	'words'=~/(word\|word\|word)(?{ print $1 })S/
	1394
	1395	which doesnt print it out at all. This is due to other optimisations kicking in.
	1396
	1397	Example of what happens on a structural level:
	1398
	1399	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	1400
	1401	1: CURLYM[1] {1,32767}(18)
	1402	5: BRANCH(8)
	1403	6: EXACT <ac>(16)
	1404	8: BRANCH(11)
	1405	9: EXACT <ad>(16)
	1406	11: BRANCH(14)
	1407	12: EXACT <ab>(16)
	1408	16: SUCCEED(0)
	1409	17: NOTHING(18)
	1410	18: END(0)
	1411
	1412	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1413	and should turn into:
	1414
	1415	1: CURLYM[1] {1,32767}(18)
	1416	5: TRIE(16)
	1417	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1418	<ac>
	1419	<ad>
	1420	<ab>
	1421	16: SUCCEED(0)
	1422	17: NOTHING(18)
	1423	18: END(0)
	1424
	1425	Cases where tail != last would be like /(?foo\|bar)baz/:
	1426
	1427	1: BRANCH(4)
	1428	2: EXACT <foo>(8)
	1429	4: BRANCH(7)
	1430	5: EXACT <bar>(8)
	1431	7: TAIL(8)
	1432	8: EXACT <baz>(10)
	1433	10: END(0)
	1434
	1435	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1436	and would end up looking like:
	1437
	1438	1: TRIE(8)
	1439	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1440	<foo>
	1441	<bar>
	1442	7: TAIL(8)
	1443	8: EXACT <baz>(10)
	1444	10: END(0)
	1445
	1446	d = uvchr_to_utf8_flags(d, uv, 0);
	1447
	1448	is the recommended Unicode-aware way of saying
	1449
	1450	*(d++) = uv;
	1451	*/
	1452
	1453	#define TRIE_STORE_REVCHAR(val) \
	1454	STMT_START { \
	1455	if (UTF) { \
	1456	SV zlopp = newSV(7); / XXX: optimize me */ \
	1457	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	1458	unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
	1459	SvCUR_set(zlopp, kapow - flrbbbbb); \
	1460	SvPOK_on(zlopp); \
	1461	SvUTF8_on(zlopp); \
	1462	av_push(revcharmap, zlopp); \
	1463	} else { \
	1464	char ooooff = (char)val; \
	1465	av_push(revcharmap, newSVpvn(&ooooff, 1)); \
	1466	} \
	1467	} STMT_END
	1468
	1469	/* This gets the next character from the input, folding it if not already
	1470	* folded. */
	1471	#define TRIE_READ_CHAR STMT_START { \
	1472	wordlen++; \
	1473	if ( UTF ) { \
	1474	/* if it is UTF then it is either already folded, or does not need \
	1475	* folding */ \
	1476	uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
	1477	} \
	1478	else if (folder == PL_fold_latin1) { \
	1479	/* This folder implies Unicode rules, which in the range expressible \
	1480	* by not UTF is the lower case, with the two exceptions, one of \
	1481	* which should have been taken care of before calling this */ \
	1482	assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
	1483	uvc = toLOWER_L1(*uc); \
	1484	if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
	1485	len = 1; \
	1486	} else { \
	1487	/* raw data, will be folded later if needed */ \
	1488	uvc = (U32)*uc; \
	1489	len = 1; \
	1490	} \
	1491	} STMT_END
	1492
	1493
	1494
	1495	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	1496	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	1497	U32 ging = TRIE_LIST_LEN( state ) *= 2; \
	1498	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	1499	} \
	1500	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	1501	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	1502	TRIE_LIST_CUR( state )++; \
	1503	} STMT_END
	1504
	1505	#define TRIE_LIST_NEW(state) STMT_START { \
	1506	Newxz( trie->states[ state ].trans.list, \
	1507	4, reg_trie_trans_le ); \
	1508	TRIE_LIST_CUR( state ) = 1; \
	1509	TRIE_LIST_LEN( state ) = 4; \
	1510	} STMT_END
	1511
	1512	#define TRIE_HANDLE_WORD(state) STMT_START { \
	1513	U16 dupe= trie->states[ state ].wordnum; \
	1514	regnode * const noper_next = regnext( noper ); \
	1515	\
	1516	DEBUG_r({ \
	1517	/* store the word for dumping */ \
	1518	SV* tmp; \
	1519	if (OP(noper) != NOTHING) \
	1520	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	1521	else \
	1522	tmp = newSVpvn_utf8( "", 0, UTF ); \
	1523	av_push( trie_words, tmp ); \
	1524	}); \
	1525	\
	1526	curword++; \
	1527	trie->wordinfo[curword].prev = 0; \
	1528	trie->wordinfo[curword].len = wordlen; \
	1529	trie->wordinfo[curword].accept = state; \
	1530	\
	1531	if ( noper_next < tail ) { \
	1532	if (!trie->jump) \
	1533	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	1534	trie->jump[curword] = (U16)(noper_next - convert); \
	1535	if (!jumper) \
	1536	jumper = noper_next; \
	1537	if (!nextbranch) \
	1538	nextbranch= regnext(cur); \
	1539	} \
	1540	\
	1541	if ( dupe ) { \
	1542	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	1543	/* chain, so that when the bits of chain are later */\
	1544	/* linked together, the dups appear in the chain */\
	1545	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	1546	trie->wordinfo[dupe].prev = curword; \
	1547	} else { \
	1548	/* we haven't inserted this word yet. */ \
	1549	trie->states[ state ].wordnum = curword; \
	1550	} \
	1551	} STMT_END
	1552
	1553
	1554	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	1555	( ( base + charid >= ucharcount \
	1556	&& base + charid < ubound \
	1557	&& state == trie->trans[ base - ucharcount + charid ].check \
	1558	&& trie->trans[ base - ucharcount + charid ].next ) \
	1559	? trie->trans[ base - ucharcount + charid ].next \
	1560	: ( state==1 ? special : 0 ) \
	1561	)
	1562
	1563	#define MADE_TRIE 1
	1564	#define MADE_JUMP_TRIE 2
	1565	#define MADE_EXACT_TRIE 4
	1566
	1567	STATIC I32
	1568	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1569	{
	1570	dVAR;
	1571	/* first pass, loop through and scan words */
	1572	reg_trie_data *trie;
	1573	HV *widecharmap = NULL;
	1574	AV *revcharmap = newAV();
	1575	regnode *cur;
	1576	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1577	STRLEN len = 0;
	1578	UV uvc = 0;
	1579	U16 curword = 0;
	1580	U32 next_alloc = 0;
	1581	regnode *jumper = NULL;
	1582	regnode *nextbranch = NULL;
	1583	regnode *convert = NULL;
	1584	U32 prev_states; / temp array mapping each state to previous one */
	1585	/* we just use folder as a flag in utf8 */
	1586	const U8 * folder = NULL;
	1587
	1588	#ifdef DEBUGGING
	1589	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1590	AV *trie_words = NULL;
	1591	/* along with revcharmap, this only used during construction but both are
	1592	* useful during debugging so we store them in the struct when debugging.
	1593	*/
	1594	#else
	1595	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1596	STRLEN trie_charcount=0;
	1597	#endif
	1598	SV *re_trie_maxbuff;
	1599	GET_RE_DEBUG_FLAGS_DECL;
	1600
	1601	PERL_ARGS_ASSERT_MAKE_TRIE;
	1602	#ifndef DEBUGGING
	1603	PERL_UNUSED_ARG(depth);
	1604	#endif
	1605
	1606	switch (flags) {
	1607	case EXACT: break;
	1608	case EXACTFA:
	1609	case EXACTFU_SS:
	1610	case EXACTFU: folder = PL_fold_latin1; break;
	1611	case EXACTF: folder = PL_fold; break;
	1612	case EXACTFL: folder = PL_fold_locale; break;
	1613	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
	1614	}
	1615
	1616	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1617	trie->refcount = 1;
	1618	trie->startstate = 1;
	1619	trie->wordcount = word_count;
	1620	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1621	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1622	if (flags == EXACT)
	1623	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1624	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1625	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1626
	1627	DEBUG_r({
	1628	trie_words = newAV();
	1629	});
	1630
	1631	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1632	if (!SvIOK(re_trie_maxbuff)) {
	1633	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1634	}
	1635	DEBUG_TRIE_COMPILE_r({
	1636	PerlIO_printf( Perl_debug_log,
	1637	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1638	(int)depth * 2 + 2, "",
	1639	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1640	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1641	(int)depth);
	1642	});
	1643
	1644	/* Find the node we are going to overwrite */
	1645	if ( first == startbranch && OP( last ) != BRANCH ) {
	1646	/* whole branch chain */
	1647	convert = first;
	1648	} else {
	1649	/* branch sub-chain */
	1650	convert = NEXTOPER( first );
	1651	}
	1652
	1653	/* -- First loop and Setup --
	1654
	1655	We first traverse the branches and scan each word to determine if it
	1656	contains widechars, and how many unique chars there are, this is
	1657	important as we have to build a table with at least as many columns as we
	1658	have unique chars.
	1659
	1660	We use an array of integers to represent the character codes 0..255
	1661	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1662	native representation of the character value as the key and IV's for the
	1663	coded index.
	1664
	1665	TODO If we keep track of how many times each character is used we can
	1666	remap the columns so that the table compression later on is more
	1667	efficient in terms of memory by ensuring the most common value is in the
	1668	middle and the least common are on the outside. IMO this would be better
	1669	than a most to least common mapping as theres a decent chance the most
	1670	common letter will share a node with the least common, meaning the node
	1671	will not be compressible. With a middle is most common approach the worst
	1672	case is when we have the least common nodes twice.
	1673
	1674	*/
	1675
	1676	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1677	regnode *noper = NEXTOPER( cur );
	1678	const U8 uc = (U8)STRING( noper );
	1679	const U8 *e = uc + STR_LEN( noper );
	1680	STRLEN foldlen = 0;
	1681	U32 wordlen = 0; /* required init */
	1682	STRLEN minbytes = 0;
	1683	STRLEN maxbytes = 0;
	1684	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1685
	1686	if (OP(noper) == NOTHING) {
	1687	regnode *noper_next= regnext(noper);
	1688	if (noper_next != tail && OP(noper_next) == flags) {
	1689	noper = noper_next;
	1690	uc= (U8*)STRING(noper);
	1691	e= uc + STR_LEN(noper);
	1692	trie->minlen= STR_LEN(noper);
	1693	} else {
	1694	trie->minlen= 0;
	1695	continue;
	1696	}
	1697	}
	1698
	1699	if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
	1700	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1701	regardless of encoding */
	1702	if (OP( noper ) == EXACTFU_SS) {
	1703	/* false positives are ok, so just set this */
	1704	TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
	1705	}
	1706	}
	1707	for ( ; uc < e ; uc += len ) {
	1708	TRIE_CHARCOUNT(trie)++;
	1709	TRIE_READ_CHAR;
	1710
	1711	/* Acummulate to the current values, the range in the number of
	1712	* bytes that this character could match. The max is presumed to
	1713	* be the same as the folded input (which TRIE_READ_CHAR returns),
	1714	* except that when this is not in UTF-8, it could be matched
	1715	* against a string which is UTF-8, and the variant characters
	1716	* could be 2 bytes instead of the 1 here. Likewise, for the
	1717	* minimum number of bytes when not folded. When folding, the min
	1718	* is assumed to be 1 byte could fold to match the single character
	1719	* here, or in the case of a multi-char fold, 1 byte can fold to
	1720	* the whole sequence. 'foldlen' is used to denote whether we are
	1721	* in such a sequence, skipping the min setting if so. XXX TODO
	1722	* Use the exact list of what folds to each character, from
	1723	* PL_utf8_foldclosures */
	1724	if (UTF) {
	1725	maxbytes += UTF8SKIP(uc);
	1726	if (! folder) {
	1727	/* A non-UTF-8 string could be 1 byte to match our 2 */
	1728	minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
	1729	? 1
	1730	: UTF8SKIP(uc);
	1731	}
	1732	else {
	1733	if (foldlen) {
	1734	foldlen -= UTF8SKIP(uc);
	1735	}
	1736	else {
	1737	foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
	1738	minbytes++;
	1739	}
	1740	}
	1741	}
	1742	else {
	1743	maxbytes += (UNI_IS_INVARIANT(*uc))
	1744	? 1
	1745	: 2;
	1746	if (! folder) {
	1747	minbytes++;
	1748	}
	1749	else {
	1750	if (foldlen) {
	1751	foldlen--;
	1752	}
	1753	else {
	1754	foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
	1755	minbytes++;
	1756	}
	1757	}
	1758	}
	1759	if ( uvc < 256 ) {
	1760	if ( folder ) {
	1761	U8 folded= folder[ (U8) uvc ];
	1762	if ( !trie->charmap[ folded ] ) {
	1763	trie->charmap[ folded ]=( ++trie->uniquecharcount );
	1764	TRIE_STORE_REVCHAR( folded );
	1765	}
	1766	}
	1767	if ( !trie->charmap[ uvc ] ) {
	1768	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1769	TRIE_STORE_REVCHAR( uvc );
	1770	}
	1771	if ( set_bit ) {
	1772	/* store the codepoint in the bitmap, and its folded
	1773	* equivalent. */
	1774	TRIE_BITMAP_SET(trie, uvc);
	1775
	1776	/* store the folded codepoint */
	1777	if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
	1778
	1779	if ( !UTF ) {
	1780	/* store first byte of utf8 representation of
	1781	variant codepoints */
	1782	if (! NATIVE_IS_INVARIANT(uvc)) {
	1783	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1784	}
	1785	}
	1786	set_bit = 0; /* We've done our bit :-) */
	1787	}
	1788	} else {
	1789	SV** svpp;
	1790	if ( !widecharmap )
	1791	widecharmap = newHV();
	1792
	1793	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1794
	1795	if ( !svpp )
	1796	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1797
	1798	if ( !SvTRUE( *svpp ) ) {
	1799	sv_setiv( *svpp, ++trie->uniquecharcount );
	1800	TRIE_STORE_REVCHAR(uvc);
	1801	}
	1802	}
	1803	}
	1804	if( cur == first ) {
	1805	trie->minlen = minbytes;
	1806	trie->maxlen = maxbytes;
	1807	} else if (minbytes < trie->minlen) {
	1808	trie->minlen = minbytes;
	1809	} else if (maxbytes > trie->maxlen) {
	1810	trie->maxlen = maxbytes;
	1811	}
	1812	} /* end first pass */
	1813	DEBUG_TRIE_COMPILE_r(
	1814	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1815	(int)depth * 2 + 2,"",
	1816	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1817	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1818	(int)trie->minlen, (int)trie->maxlen )
	1819	);
	1820
	1821	/*
	1822	We now know what we are dealing with in terms of unique chars and
	1823	string sizes so we can calculate how much memory a naive
	1824	representation using a flat table will take. If it's over a reasonable
	1825	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1826	conservative but potentially much slower representation using an array
	1827	of lists.
	1828
	1829	At the end we convert both representations into the same compressed
	1830	form that will be used in regexec.c for matching with. The latter
	1831	is a form that cannot be used to construct with but has memory
	1832	properties similar to the list form and access properties similar
	1833	to the table form making it both suitable for fast searches and
	1834	small enough that its feasable to store for the duration of a program.
	1835
	1836	See the comment in the code where the compressed table is produced
	1837	inplace from the flat tabe representation for an explanation of how
	1838	the compression works.
	1839
	1840	*/
	1841
	1842
	1843	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1844	prev_states[1] = 0;
	1845
	1846	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1847	/*
	1848	Second Pass -- Array Of Lists Representation
	1849
	1850	Each state will be represented by a list of charid:state records
	1851	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1852	points of the allocated array. (See defines above).
	1853
	1854	We build the initial structure using the lists, and then convert
	1855	it into the compressed table form which allows faster lookups
	1856	(but cant be modified once converted).
	1857	*/
	1858
	1859	STRLEN transcount = 1;
	1860
	1861	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1862	"%*sCompiling trie using list compiler\n",
	1863	(int)depth * 2 + 2, ""));
	1864
	1865	trie->states = (reg_trie_state *)
	1866	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1867	sizeof(reg_trie_state) );
	1868	TRIE_LIST_NEW(1);
	1869	next_alloc = 2;
	1870
	1871	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1872
	1873	regnode *noper = NEXTOPER( cur );
	1874	U8 uc = (U8)STRING( noper );
	1875	const U8 *e = uc + STR_LEN( noper );
	1876	U32 state = 1; /* required init */
	1877	U16 charid = 0; /* sanity init */
	1878	U32 wordlen = 0; /* required init */
	1879
	1880	if (OP(noper) == NOTHING) {
	1881	regnode *noper_next= regnext(noper);
	1882	if (noper_next != tail && OP(noper_next) == flags) {
	1883	noper = noper_next;
	1884	uc= (U8*)STRING(noper);
	1885	e= uc + STR_LEN(noper);
	1886	}
	1887	}
	1888
	1889	if (OP(noper) != NOTHING) {
	1890	for ( ; uc < e ; uc += len ) {
	1891
	1892	TRIE_READ_CHAR;
	1893
	1894	if ( uvc < 256 ) {
	1895	charid = trie->charmap[ uvc ];
	1896	} else {
	1897	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1898	if ( !svpp ) {
	1899	charid = 0;
	1900	} else {
	1901	charid=(U16)SvIV( *svpp );
	1902	}
	1903	}
	1904	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1905	if ( charid ) {
	1906
	1907	U16 check;
	1908	U32 newstate = 0;
	1909
	1910	charid--;
	1911	if ( !trie->states[ state ].trans.list ) {
	1912	TRIE_LIST_NEW( state );
	1913	}
	1914	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1915	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1916	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1917	break;
	1918	}
	1919	}
	1920	if ( ! newstate ) {
	1921	newstate = next_alloc++;
	1922	prev_states[newstate] = state;
	1923	TRIE_LIST_PUSH( state, charid, newstate );
	1924	transcount++;
	1925	}
	1926	state = newstate;
	1927	} else {
	1928	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1929	}
	1930	}
	1931	}
	1932	TRIE_HANDLE_WORD(state);
	1933
	1934	} /* end second pass */
	1935
	1936	/* next alloc is the NEXT state to be allocated */
	1937	trie->statecount = next_alloc;
	1938	trie->states = (reg_trie_state *)
	1939	PerlMemShared_realloc( trie->states,
	1940	next_alloc
	1941	* sizeof(reg_trie_state) );
	1942
	1943	/* and now dump it out before we compress it */
	1944	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1945	revcharmap, next_alloc,
	1946	depth+1)
	1947	);
	1948
	1949	trie->trans = (reg_trie_trans *)
	1950	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1951	{
	1952	U32 state;
	1953	U32 tp = 0;
	1954	U32 zp = 0;
	1955
	1956
	1957	for( state=1 ; state < next_alloc ; state ++ ) {
	1958	U32 base=0;
	1959
	1960	/*
	1961	DEBUG_TRIE_COMPILE_MORE_r(
	1962	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1963	);
	1964	*/
	1965
	1966	if (trie->states[state].trans.list) {
	1967	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1968	U16 maxid=minid;
	1969	U16 idx;
	1970
	1971	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1972	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1973	if ( forid < minid ) {
	1974	minid=forid;
	1975	} else if ( forid > maxid ) {
	1976	maxid=forid;
	1977	}
	1978	}
	1979	if ( transcount < tp + maxid - minid + 1) {
	1980	transcount *= 2;
	1981	trie->trans = (reg_trie_trans *)
	1982	PerlMemShared_realloc( trie->trans,
	1983	transcount
	1984	* sizeof(reg_trie_trans) );
	1985	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1986	}
	1987	base = trie->uniquecharcount + tp - minid;
	1988	if ( maxid == minid ) {
	1989	U32 set = 0;
	1990	for ( ; zp < tp ; zp++ ) {
	1991	if ( ! trie->trans[ zp ].next ) {
	1992	base = trie->uniquecharcount + zp - minid;
	1993	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1994	trie->trans[ zp ].check = state;
	1995	set = 1;
	1996	break;
	1997	}
	1998	}
	1999	if ( !set ) {
	2000	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	2001	trie->trans[ tp ].check = state;
	2002	tp++;
	2003	zp = tp;
	2004	}
	2005	} else {
	2006	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	2007	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	2008	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	2009	trie->trans[ tid ].check = state;
	2010	}
	2011	tp += ( maxid - minid + 1 );
	2012	}
	2013	Safefree(trie->states[ state ].trans.list);
	2014	}
	2015	/*
	2016	DEBUG_TRIE_COMPILE_MORE_r(
	2017	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	2018	);
	2019	*/
	2020	trie->states[ state ].trans.base=base;
	2021	}
	2022	trie->lasttrans = tp + 1;
	2023	}
	2024	} else {
	2025	/*
	2026	Second Pass -- Flat Table Representation.
	2027
	2028	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	2029	We know that we will need Charcount+1 trans at most to store the data
	2030	(one row per char at worst case) So we preallocate both structures
	2031	assuming worst case.
	2032
	2033	We then construct the trie using only the .next slots of the entry
	2034	structs.
	2035
	2036	We use the .check field of the first entry of the node temporarily to
	2037	make compression both faster and easier by keeping track of how many non
	2038	zero fields are in the node.
	2039
	2040	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	2041	transition.
	2042
	2043	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	2044	number representing the first entry of the node, and state as a
	2045	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	2046	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	2047	are 2 entrys per node. eg:
	2048
	2049	A B A B
	2050	1. 2 4 1. 3 7
	2051	2. 0 3 3. 0 5
	2052	3. 0 0 5. 0 0
	2053	4. 0 0 7. 0 0
	2054
	2055	The table is internally in the right hand, idx form. However as we also
	2056	have to deal with the states array which is indexed by nodenum we have to
	2057	use TRIE_NODENUM() to convert.
	2058
	2059	*/
	2060	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	2061	"%*sCompiling trie using table compiler\n",
	2062	(int)depth * 2 + 2, ""));
	2063
	2064	trie->trans = (reg_trie_trans *)
	2065	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	2066	* trie->uniquecharcount + 1,
	2067	sizeof(reg_trie_trans) );
	2068	trie->states = (reg_trie_state *)
	2069	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	2070	sizeof(reg_trie_state) );
	2071	next_alloc = trie->uniquecharcount + 1;
	2072
	2073
	2074	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	2075
	2076	regnode *noper = NEXTOPER( cur );
	2077	const U8 uc = (U8)STRING( noper );
	2078	const U8 *e = uc + STR_LEN( noper );
	2079
	2080	U32 state = 1; /* required init */
	2081
	2082	U16 charid = 0; /* sanity init */
	2083	U32 accept_state = 0; /* sanity init */
	2084
	2085	U32 wordlen = 0; /* required init */
	2086
	2087	if (OP(noper) == NOTHING) {
	2088	regnode *noper_next= regnext(noper);
	2089	if (noper_next != tail && OP(noper_next) == flags) {
	2090	noper = noper_next;
	2091	uc= (U8*)STRING(noper);
	2092	e= uc + STR_LEN(noper);
	2093	}
	2094	}
	2095
	2096	if ( OP(noper) != NOTHING ) {
	2097	for ( ; uc < e ; uc += len ) {
	2098
	2099	TRIE_READ_CHAR;
	2100
	2101	if ( uvc < 256 ) {
	2102	charid = trie->charmap[ uvc ];
	2103	} else {
	2104	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	2105	charid = svpp ? (U16)SvIV(*svpp) : 0;
	2106	}
	2107	if ( charid ) {
	2108	charid--;
	2109	if ( !trie->trans[ state + charid ].next ) {
	2110	trie->trans[ state + charid ].next = next_alloc;
	2111	trie->trans[ state ].check++;
	2112	prev_states[TRIE_NODENUM(next_alloc)]
	2113	= TRIE_NODENUM(state);
	2114	next_alloc += trie->uniquecharcount;
	2115	}
	2116	state = trie->trans[ state + charid ].next;
	2117	} else {
	2118	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	2119	}
	2120	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	2121	}
	2122	}
	2123	accept_state = TRIE_NODENUM( state );
	2124	TRIE_HANDLE_WORD(accept_state);
	2125
	2126	} /* end second pass */
	2127
	2128	/* and now dump it out before we compress it */
	2129	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	2130	revcharmap,
	2131	next_alloc, depth+1));
	2132
	2133	{
	2134	/*
	2135	* Inplace compress the table.*
	2136
	2137	For sparse data sets the table constructed by the trie algorithm will
	2138	be mostly 0/FAIL transitions or to put it another way mostly empty.
	2139	(Note that leaf nodes will not contain any transitions.)
	2140
	2141	This algorithm compresses the tables by eliminating most such
	2142	transitions, at the cost of a modest bit of extra work during lookup:
	2143
	2144	- Each states[] entry contains a .base field which indicates the
	2145	index in the state[] array wheres its transition data is stored.
	2146
	2147	- If .base is 0 there are no valid transitions from that node.
	2148
	2149	- If .base is nonzero then charid is added to it to find an entry in
	2150	the trans array.
	2151
	2152	-If trans[states[state].base+charid].check!=state then the
	2153	transition is taken to be a 0/Fail transition. Thus if there are fail
	2154	transitions at the front of the node then the .base offset will point
	2155	somewhere inside the previous nodes data (or maybe even into a node
	2156	even earlier), but the .check field determines if the transition is
	2157	valid.
	2158
	2159	XXX - wrong maybe?
	2160	The following process inplace converts the table to the compressed
	2161	table: We first do not compress the root node 1,and mark all its
	2162	.check pointers as 1 and set its .base pointer as 1 as well. This
	2163	allows us to do a DFA construction from the compressed table later,
	2164	and ensures that any .base pointers we calculate later are greater
	2165	than 0.
	2166
	2167	- We set 'pos' to indicate the first entry of the second node.
	2168
	2169	- We then iterate over the columns of the node, finding the first and
	2170	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2171	and set the .check pointers accordingly, and advance pos
	2172	appropriately and repreat for the next node. Note that when we copy
	2173	the next pointers we have to convert them from the original
	2174	NODEIDX form to NODENUM form as the former is not valid post
	2175	compression.
	2176
	2177	- If a node has no transitions used we mark its base as 0 and do not
	2178	advance the pos pointer.
	2179
	2180	- If a node only has one transition we use a second pointer into the
	2181	structure to fill in allocated fail transitions from other states.
	2182	This pointer is independent of the main pointer and scans forward
	2183	looking for null transitions that are allocated to a state. When it
	2184	finds one it writes the single transition into the "hole". If the
	2185	pointer doesnt find one the single transition is appended as normal.
	2186
	2187	- Once compressed we can Renew/realloc the structures to release the
	2188	excess space.
	2189
	2190	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2191	specifically Fig 3.47 and the associated pseudocode.
	2192
	2193	demq
	2194	*/
	2195	const U32 laststate = TRIE_NODENUM( next_alloc );
	2196	U32 state, charid;
	2197	U32 pos = 0, zp=0;
	2198	trie->statecount = laststate;
	2199
	2200	for ( state = 1 ; state < laststate ; state++ ) {
	2201	U8 flag = 0;
	2202	const U32 stateidx = TRIE_NODEIDX( state );
	2203	const U32 o_used = trie->trans[ stateidx ].check;
	2204	U32 used = trie->trans[ stateidx ].check;
	2205	trie->trans[ stateidx ].check = 0;
	2206
	2207	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2208	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2209	if ( trie->trans[ stateidx + charid ].next ) {
	2210	if (o_used == 1) {
	2211	for ( ; zp < pos ; zp++ ) {
	2212	if ( ! trie->trans[ zp ].next ) {
	2213	break;
	2214	}
	2215	}
	2216	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2217	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2218	trie->trans[ zp ].check = state;
	2219	if ( ++zp > pos ) pos = zp;
	2220	break;
	2221	}
	2222	used--;
	2223	}
	2224	if ( !flag ) {
	2225	flag = 1;
	2226	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2227	}
	2228	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2229	trie->trans[ pos ].check = state;
	2230	pos++;
	2231	}
	2232	}
	2233	}
	2234	trie->lasttrans = pos + 1;
	2235	trie->states = (reg_trie_state *)
	2236	PerlMemShared_realloc( trie->states, laststate
	2237	* sizeof(reg_trie_state) );
	2238	DEBUG_TRIE_COMPILE_MORE_r(
	2239	PerlIO_printf( Perl_debug_log,
	2240	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2241	(int)depth * 2 + 2,"",
	2242	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2243	(IV)next_alloc,
	2244	(IV)pos,
	2245	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2246	);
	2247
	2248	} /* end table compress */
	2249	}
	2250	DEBUG_TRIE_COMPILE_MORE_r(
	2251	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2252	(int)depth * 2 + 2, "",
	2253	(UV)trie->statecount,
	2254	(UV)trie->lasttrans)
	2255	);
	2256	/* resize the trans array to remove unused space */
	2257	trie->trans = (reg_trie_trans *)
	2258	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2259	* sizeof(reg_trie_trans) );
	2260
	2261	{ /* Modify the program and insert the new TRIE node */
	2262	U8 nodetype =(U8)(flags & 0xFF);
	2263	char *str=NULL;
	2264
	2265	#ifdef DEBUGGING
	2266	regnode *optimize = NULL;
	2267	#ifdef RE_TRACK_PATTERN_OFFSETS
	2268
	2269	U32 mjd_offset = 0;
	2270	U32 mjd_nodelen = 0;
	2271	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2272	#endif /* DEBUGGING */
	2273	/*
	2274	This means we convert either the first branch or the first Exact,
	2275	depending on whether the thing following (in 'last') is a branch
	2276	or not and whther first is the startbranch (ie is it a sub part of
	2277	the alternation or is it the whole thing.)
	2278	Assuming its a sub part we convert the EXACT otherwise we convert
	2279	the whole branch sequence, including the first.
	2280	*/
	2281	/* Find the node we are going to overwrite */
	2282	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2283	/* branch sub-chain */
	2284	NEXT_OFF( first ) = (U16)(last - first);
	2285	#ifdef RE_TRACK_PATTERN_OFFSETS
	2286	DEBUG_r({
	2287	mjd_offset= Node_Offset((convert));
	2288	mjd_nodelen= Node_Length((convert));
	2289	});
	2290	#endif
	2291	/* whole branch chain */
	2292	}
	2293	#ifdef RE_TRACK_PATTERN_OFFSETS
	2294	else {
	2295	DEBUG_r({
	2296	const regnode *nop = NEXTOPER( convert );
	2297	mjd_offset= Node_Offset((nop));
	2298	mjd_nodelen= Node_Length((nop));
	2299	});
	2300	}
	2301	DEBUG_OPTIMISE_r(
	2302	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2303	(int)depth * 2 + 2, "",
	2304	(UV)mjd_offset, (UV)mjd_nodelen)
	2305	);
	2306	#endif
	2307	/* But first we check to see if there is a common prefix we can
	2308	split out as an EXACT and put in front of the TRIE node. */
	2309	trie->startstate= 1;
	2310	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2311	U32 state;
	2312	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2313	U32 ofs = 0;
	2314	I32 idx = -1;
	2315	U32 count = 0;
	2316	const U32 base = trie->states[ state ].trans.base;
	2317
	2318	if ( trie->states[state].wordnum )
	2319	count = 1;
	2320
	2321	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2322	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2323	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2324	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2325	{
	2326	if ( ++count > 1 ) {
	2327	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2328	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2329	if ( state == 1 ) break;
	2330	if ( count == 2 ) {
	2331	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2332	DEBUG_OPTIMISE_r(
	2333	PerlIO_printf(Perl_debug_log,
	2334	"%*sNew Start State=%"UVuf" Class: [",
	2335	(int)depth * 2 + 2, "",
	2336	(UV)state));
	2337	if (idx >= 0) {
	2338	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2339	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2340
	2341	TRIE_BITMAP_SET(trie,*ch);
	2342	if ( folder )
	2343	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2344	DEBUG_OPTIMISE_r(
	2345	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2346	);
	2347	}
	2348	}
	2349	TRIE_BITMAP_SET(trie,*ch);
	2350	if ( folder )
	2351	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2352	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2353	}
	2354	idx = ofs;
	2355	}
	2356	}
	2357	if ( count == 1 ) {
	2358	SV **tmp = av_fetch( revcharmap, idx, 0);
	2359	STRLEN len;
	2360	char ch = SvPV( tmp, len );
	2361	DEBUG_OPTIMISE_r({
	2362	SV *sv=sv_newmortal();
	2363	PerlIO_printf( Perl_debug_log,
	2364	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2365	(int)depth * 2 + 2, "",
	2366	(UV)state, (UV)idx,
	2367	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2368	PL_colors[0], PL_colors[1],
	2369	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2370	PERL_PV_ESCAPE_FIRSTCHAR
	2371	)
	2372	);
	2373	});
	2374	if ( state==1 ) {
	2375	OP( convert ) = nodetype;
	2376	str=STRING(convert);
	2377	STR_LEN(convert)=0;
	2378	}
	2379	STR_LEN(convert) += len;
	2380	while (len--)
	2381	str++ = ch++;
	2382	} else {
	2383	#ifdef DEBUGGING
	2384	if (state>1)
	2385	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2386	#endif
	2387	break;
	2388	}
	2389	}
	2390	trie->prefixlen = (state-1);
	2391	if (str) {
	2392	regnode *n = convert+NODE_SZ_STR(convert);
	2393	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2394	trie->startstate = state;
	2395	trie->minlen -= (state - 1);
	2396	trie->maxlen -= (state - 1);
	2397	#ifdef DEBUGGING
	2398	/* At least the UNICOS C compiler choked on this
	2399	* being argument to DEBUG_r(), so let's just have
	2400	* it right here. */
	2401	if (
	2402	#ifdef PERL_EXT_RE_BUILD
	2403	1
	2404	#else
	2405	DEBUG_r_TEST
	2406	#endif
	2407	) {
	2408	regnode *fix = convert;
	2409	U32 word = trie->wordcount;
	2410	mjd_nodelen++;
	2411	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2412	while( ++fix < n ) {
	2413	Set_Node_Offset_Length(fix, 0, 0);
	2414	}
	2415	while (word--) {
	2416	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2417	if (tmp) {
	2418	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2419	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2420	else
	2421	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2422	}
	2423	}
	2424	}
	2425	#endif
	2426	if (trie->maxlen) {
	2427	convert = n;
	2428	} else {
	2429	NEXT_OFF(convert) = (U16)(tail - convert);
	2430	DEBUG_r(optimize= n);
	2431	}
	2432	}
	2433	}
	2434	if (!jumper)
	2435	jumper = last;
	2436	if ( trie->maxlen ) {
	2437	NEXT_OFF( convert ) = (U16)(tail - convert);
	2438	ARG_SET( convert, data_slot );
	2439	/* Store the offset to the first unabsorbed branch in
	2440	jump[0], which is otherwise unused by the jump logic.
	2441	We use this when dumping a trie and during optimisation. */
	2442	if (trie->jump)
	2443	trie->jump[0] = (U16)(nextbranch - convert);
	2444
	2445	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2446	* and there is a bitmap
	2447	* and the first "jump target" node we found leaves enough room
	2448	* then convert the TRIE node into a TRIEC node, with the bitmap
	2449	* embedded inline in the opcode - this is hypothetically faster.
	2450	*/
	2451	if ( !trie->states[trie->startstate].wordnum
	2452	&& trie->bitmap
	2453	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2454	{
	2455	OP( convert ) = TRIEC;
	2456	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2457	PerlMemShared_free(trie->bitmap);
	2458	trie->bitmap= NULL;
	2459	} else
	2460	OP( convert ) = TRIE;
	2461
	2462	/* store the type in the flags */
	2463	convert->flags = nodetype;
	2464	DEBUG_r({
	2465	optimize = convert
	2466	+ NODE_STEP_REGNODE
	2467	+ regarglen[ OP( convert ) ];
	2468	});
	2469	/* XXX We really should free up the resource in trie now,
	2470	as we won't use them - (which resources?) dmq */
	2471	}
	2472	/* needed for dumping*/
	2473	DEBUG_r(if (optimize) {
	2474	regnode *opt = convert;
	2475
	2476	while ( ++opt < optimize) {
	2477	Set_Node_Offset_Length(opt,0,0);
	2478	}
	2479	/*
	2480	Try to clean up some of the debris left after the
	2481	optimisation.
	2482	*/
	2483	while( optimize < jumper ) {
	2484	mjd_nodelen += Node_Length((optimize));
	2485	OP( optimize ) = OPTIMIZED;
	2486	Set_Node_Offset_Length(optimize,0,0);
	2487	optimize++;
	2488	}
	2489	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2490	});
	2491	} /* end node insert */
	2492
	2493	/* Finish populating the prev field of the wordinfo array. Walk back
	2494	* from each accept state until we find another accept state, and if
	2495	* so, point the first word's .prev field at the second word. If the
	2496	* second already has a .prev field set, stop now. This will be the
	2497	* case either if we've already processed that word's accept state,
	2498	* or that state had multiple words, and the overspill words were
	2499	* already linked up earlier.
	2500	*/
	2501	{
	2502	U16 word;
	2503	U32 state;
	2504	U16 prev;
	2505
	2506	for (word=1; word <= trie->wordcount; word++) {
	2507	prev = 0;
	2508	if (trie->wordinfo[word].prev)
	2509	continue;
	2510	state = trie->wordinfo[word].accept;
	2511	while (state) {
	2512	state = prev_states[state];
	2513	if (!state)
	2514	break;
	2515	prev = trie->states[state].wordnum;
	2516	if (prev)
	2517	break;
	2518	}
	2519	trie->wordinfo[word].prev = prev;
	2520	}
	2521	Safefree(prev_states);
	2522	}
	2523
	2524
	2525	/* and now dump out the compressed format */
	2526	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2527
	2528	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2529	#ifdef DEBUGGING
	2530	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2531	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2532	#else
	2533	SvREFCNT_dec_NN(revcharmap);
	2534	#endif
	2535	return trie->jump
	2536	? MADE_JUMP_TRIE
	2537	: trie->startstate>1
	2538	? MADE_EXACT_TRIE
	2539	: MADE_TRIE;
	2540	}
	2541
	2542	STATIC void
	2543	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2544	{
	2545	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2546
	2547	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2548	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2549	ISBN 0-201-10088-6
	2550
	2551	We find the fail state for each state in the trie, this state is the longest proper
	2552	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2553	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2554	the DFA not to have to restart after its tried and failed a word at a given point, it
	2555	simply continues as though it had been matching the other word in the first place.
	2556	Consider
	2557	'abcdgu'=~/abcdefg\|cdgu/
	2558	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2559	fail, which would bring us to the state representing 'd' in the second word where we would
	2560	try 'g' and succeed, proceeding to match 'cdgu'.
	2561	*/
	2562	/* add a fail transition */
	2563	const U32 trie_offset = ARG(source);
	2564	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2565	U32 *q;
	2566	const U32 ucharcount = trie->uniquecharcount;
	2567	const U32 numstates = trie->statecount;
	2568	const U32 ubound = trie->lasttrans + ucharcount;
	2569	U32 q_read = 0;
	2570	U32 q_write = 0;
	2571	U32 charid;
	2572	U32 base = trie->states[ 1 ].trans.base;
	2573	U32 *fail;
	2574	reg_ac_data *aho;
	2575	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2576	GET_RE_DEBUG_FLAGS_DECL;
	2577
	2578	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2579	#ifndef DEBUGGING
	2580	PERL_UNUSED_ARG(depth);
	2581	#endif
	2582
	2583
	2584	ARG_SET( stclass, data_slot );
	2585	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2586	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2587	aho->trie=trie_offset;
	2588	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2589	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2590	Newxz( q, numstates, U32);
	2591	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2592	aho->refcount = 1;
	2593	fail = aho->fail;
	2594	/* initialize fail[0..1] to be 1 so that we always have
	2595	a valid final fail state */
	2596	fail[ 0 ] = fail[ 1 ] = 1;
	2597
	2598	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2599	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2600	if ( newstate ) {
	2601	q[ q_write ] = newstate;
	2602	/* set to point at the root */
	2603	fail[ q[ q_write++ ] ]=1;
	2604	}
	2605	}
	2606	while ( q_read < q_write) {
	2607	const U32 cur = q[ q_read++ % numstates ];
	2608	base = trie->states[ cur ].trans.base;
	2609
	2610	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2611	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2612	if (ch_state) {
	2613	U32 fail_state = cur;
	2614	U32 fail_base;
	2615	do {
	2616	fail_state = fail[ fail_state ];
	2617	fail_base = aho->states[ fail_state ].trans.base;
	2618	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2619
	2620	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2621	fail[ ch_state ] = fail_state;
	2622	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2623	{
	2624	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2625	}
	2626	q[ q_write++ % numstates] = ch_state;
	2627	}
	2628	}
	2629	}
	2630	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2631	when we fail in state 1, this allows us to use the
	2632	charclass scan to find a valid start char. This is based on the principle
	2633	that theres a good chance the string being searched contains lots of stuff
	2634	that cant be a start char.
	2635	*/
	2636	fail[ 0 ] = fail[ 1 ] = 0;
	2637	DEBUG_TRIE_COMPILE_r({
	2638	PerlIO_printf(Perl_debug_log,
	2639	"%*sStclass Failtable (%"UVuf" states): 0",
	2640	(int)(depth * 2), "", (UV)numstates
	2641	);
	2642	for( q_read=1; q_read<numstates; q_read++ ) {
	2643	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2644	}
	2645	PerlIO_printf(Perl_debug_log, "\n");
	2646	});
	2647	Safefree(q);
	2648	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2649	}
	2650
	2651
	2652	/*
	2653	* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	2654	* These need to be revisited when a newer toolchain becomes available.
	2655	*/
	2656	#if defined(__sparc64__) && defined(__GNUC__)
	2657	# if __GNUC__ < 2 \|\| (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	2658	# undef SPARC64_GCC_WORKAROUND
	2659	# define SPARC64_GCC_WORKAROUND 1
	2660	# endif
	2661	#endif
	2662
	2663	#define DEBUG_PEEP(str,scan,depth) \
	2664	DEBUG_OPTIMISE_r({if (scan){ \
	2665	SV * const mysv=sv_newmortal(); \
	2666	regnode *Next = regnext(scan); \
	2667	regprop(RExC_rx, mysv, scan); \
	2668	PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	2669	(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	2670	Next ? (REG_NODE_NUM(Next)) : 0 ); \
	2671	}});
	2672
	2673
	2674	/* The below joins as many adjacent EXACTish nodes as possible into a single
	2675	* one. The regop may be changed if the node(s) contain certain sequences that
	2676	* require special handling. The joining is only done if:
	2677	* 1) there is room in the current conglomerated node to entirely contain the
	2678	* next one.
	2679	* 2) they are the exact same node type
	2680	*
	2681	* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
	2682	* these get optimized out
	2683	*
	2684	* If a node is to match under /i (folded), the number of characters it matches
	2685	* can be different than its character length if it contains a multi-character
	2686	* fold. *min_subtract is set to the total delta of the input nodes.
	2687	*
	2688	* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
	2689	* and contains LATIN SMALL LETTER SHARP S
	2690	*
	2691	* This is as good a place as any to discuss the design of handling these
	2692	* multi-character fold sequences. It's been wrong in Perl for a very long
	2693	* time. There are three code points in Unicode whose multi-character folds
	2694	* were long ago discovered to mess things up. The previous designs for
	2695	* dealing with these involved assigning a special node for them. This
	2696	* approach doesn't work, as evidenced by this example:
	2697	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	2698	* Both these fold to "sss", but if the pattern is parsed to create a node that
	2699	* would match just the \xDF, it won't be able to handle the case where a
	2700	* successful match would have to cross the node's boundary. The new approach
	2701	* that hopefully generally solves the problem generates an EXACTFU_SS node
	2702	* that is "sss".
	2703	*
	2704	* It turns out that there are problems with all multi-character folds, and not
	2705	* just these three. Now the code is general, for all such cases. The
	2706	* approach taken is:
	2707	* 1) This routine examines each EXACTFish node that could contain multi-
	2708	* character fold sequences. It returns in *min_subtract how much to
	2709	* subtract from the the actual length of the string to get a real minimum
	2710	* match length; it is 0 if there are no multi-char folds. This delta is
	2711	* used by the caller to adjust the min length of the match, and the delta
	2712	* between min and max, so that the optimizer doesn't reject these
	2713	* possibilities based on size constraints.
	2714	* 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
	2715	* is used for an EXACTFU node that contains at least one "ss" sequence in
	2716	* it. For non-UTF-8 patterns and strings, this is the only case where
	2717	* there is a possible fold length change. That means that a regular
	2718	* EXACTFU node without UTF-8 involvement doesn't have to concern itself
	2719	* with length changes, and so can be processed faster. regexec.c takes
	2720	* advantage of this. Generally, an EXACTFish node that is in UTF-8 is
	2721	* pre-folded by regcomp.c. This saves effort in regex matching.
	2722	* However, the pre-folding isn't done for non-UTF8 patterns because the
	2723	* fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
	2724	* down by forcing the pattern into UTF8 unless necessary. Also what
	2725	* EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
	2726	* possibilities for the non-UTF8 patterns are quite simple, except for
	2727	* the sharp s. All the ones that don't involve a UTF-8 target string are
	2728	* members of a fold-pair, and arrays are set up for all of them so that
	2729	* the other member of the pair can be found quickly. Code elsewhere in
	2730	* this file makes sure that in EXACTFU nodes, the sharp s gets folded to
	2731	* 'ss', even if the pattern isn't UTF-8. This avoids the issues
	2732	* described in the next item.
	2733	* 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
	2734	* pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
	2735	* UTF-8 pattern.) An assumption that the optimizer part of regexec.c
	2736	* (probably unwittingly, in Perl_regexec_flags()) makes is that a
	2737	* character in the pattern corresponds to at most a single character in
	2738	* the target string. (And I do mean character, and not byte here, unlike
	2739	* other parts of the documentation that have never been updated to
	2740	* account for multibyte Unicode.) sharp s in EXACTF nodes can match the
	2741	* two character string 'ss'; in EXACTFA nodes it can match
	2742	* "\x{17F}\x{17F}". These violate the assumption, and they are the only
	2743	* instances where it is violated. I'm reluctant to try to change the
	2744	* assumption, as the code involved is impenetrable to me (khw), so
	2745	* instead the code here punts. This routine examines (when the pattern
	2746	* isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
	2747	* boolean indicating whether or not the node contains a sharp s. When it
	2748	* is true, the caller sets a flag that later causes the optimizer in this
	2749	* file to not set values for the floating and fixed string lengths, and
	2750	* thus avoids the optimizer code in regexec.c that makes the invalid
	2751	* assumption. Thus, there is no optimization based on string lengths for
	2752	* non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
	2753	* (The reason the assumption is wrong only in these two cases is that all
	2754	* other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
	2755	* other folds to their expanded versions. We can't prefold sharp s to
	2756	* 'ss' in EXACTF nodes because we don't know at compile time if it
	2757	* actually matches 'ss' or not. It will match iff the target string is
	2758	* in UTF-8, unlike the EXACTFU nodes, where it always matches; and
	2759	* EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
	2760	* pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
	2761	* but in a non-UTF8 pattern, folding it to that above-Latin1 string would
	2762	* require the pattern to be forced into UTF-8, the overhead of which we
	2763	* want to avoid.)
	2764	*
	2765	* Similarly, the code that generates tries doesn't currently handle
	2766	* not-already-folded multi-char folds, and it looks like a pain to change
	2767	* that. Therefore, trie generation of EXACTFA nodes with the sharp s
	2768	* doesn't work. Instead, such an EXACTFA is turned into a new regnode,
	2769	* EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
	2770	* using /iaa matching will be doing so almost entirely with ASCII
	2771	* strings, so this should rarely be encountered in practice */
	2772
	2773	#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
	2774	if (PL_regkind[OP(scan)] == EXACT) \
	2775	join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
	2776
	2777	STATIC U32
	2778	S_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan, UV min_subtract, bool has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
	2779	/* Merge several consecutive EXACTish nodes into one. */
	2780	regnode *n = regnext(scan);
	2781	U32 stringok = 1;
	2782	regnode *next = scan + NODE_SZ_STR(scan);
	2783	U32 merged = 0;
	2784	U32 stopnow = 0;
	2785	#ifdef DEBUGGING
	2786	regnode *stop = scan;
	2787	GET_RE_DEBUG_FLAGS_DECL;
	2788	#else
	2789	PERL_UNUSED_ARG(depth);
	2790	#endif
	2791
	2792	PERL_ARGS_ASSERT_JOIN_EXACT;
	2793	#ifndef EXPERIMENTAL_INPLACESCAN
	2794	PERL_UNUSED_ARG(flags);
	2795	PERL_UNUSED_ARG(val);
	2796	#endif
	2797	DEBUG_PEEP("join",scan,depth);
	2798
	2799	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	2800	* EXACT ones that are mergeable to the current one. */
	2801	while (n
	2802	&& (PL_regkind[OP(n)] == NOTHING
	2803	\|\| (stringok && OP(n) == OP(scan)))
	2804	&& NEXT_OFF(n)
	2805	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	2806	{
	2807
	2808	if (OP(n) == TAIL \|\| n > next)
	2809	stringok = 0;
	2810	if (PL_regkind[OP(n)] == NOTHING) {
	2811	DEBUG_PEEP("skip:",n,depth);
	2812	NEXT_OFF(scan) += NEXT_OFF(n);
	2813	next = n + NODE_STEP_REGNODE;
	2814	#ifdef DEBUGGING
	2815	if (stringok)
	2816	stop = n;
	2817	#endif
	2818	n = regnext(n);
	2819	}
	2820	else if (stringok) {
	2821	const unsigned int oldl = STR_LEN(scan);
	2822	regnode * const nnext = regnext(n);
	2823
	2824	/* XXX I (khw) kind of doubt that this works on platforms where
	2825	* U8_MAX is above 255 because of lots of other assumptions */
	2826	/* Don't join if the sum can't fit into a single node */
	2827	if (oldl + STR_LEN(n) > U8_MAX)
	2828	break;
	2829
	2830	DEBUG_PEEP("merg",n,depth);
	2831	merged++;
	2832
	2833	NEXT_OFF(scan) += NEXT_OFF(n);
	2834	STR_LEN(scan) += STR_LEN(n);
	2835	next = n + NODE_SZ_STR(n);
	2836	/* Now we can overwrite n : /
	2837	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2838	#ifdef DEBUGGING
	2839	stop = next - 1;
	2840	#endif
	2841	n = nnext;
	2842	if (stopnow) break;
	2843	}
	2844
	2845	#ifdef EXPERIMENTAL_INPLACESCAN
	2846	if (flags && !NEXT_OFF(n)) {
	2847	DEBUG_PEEP("atch", val, depth);
	2848	if (reg_off_by_arg[OP(n)]) {
	2849	ARG_SET(n, val - n);
	2850	}
	2851	else {
	2852	NEXT_OFF(n) = val - n;
	2853	}
	2854	stopnow = 1;
	2855	}
	2856	#endif
	2857	}
	2858
	2859	*min_subtract = 0;
	2860	*has_exactf_sharp_s = FALSE;
	2861
	2862	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	2863	* can now analyze for sequences of problematic code points. (Prior to
	2864	* this final joining, sequences could have been split over boundaries, and
	2865	* hence missed). The sequences only happen in folding, hence for any
	2866	* non-EXACT EXACTish node */
	2867	if (OP(scan) != EXACT) {
	2868	const U8 * const s0 = (U8*) STRING(scan);
	2869	const U8 * s = s0;
	2870	const U8 * const s_end = s0 + STR_LEN(scan);
	2871
	2872	/* One pass is made over the node's string looking for all the
	2873	* possibilities. to avoid some tests in the loop, there are two main
	2874	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	2875	* non-UTF-8 */
	2876	if (UTF) {
	2877
	2878	/* Examine the string for a multi-character fold sequence. UTF-8
	2879	* patterns have all characters pre-folded by the time this code is
	2880	* executed */
	2881	while (s < s_end - 1) /* Can stop 1 before the end, as minimum
	2882	length sequence we are looking for is 2 */
	2883	{
	2884	int count = 0;
	2885	int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
	2886	if (! len) { /* Not a multi-char fold: get next char */
	2887	s += UTF8SKIP(s);
	2888	continue;
	2889	}
	2890
	2891	/* Nodes with 'ss' require special handling, except for EXACTFL
	2892	* and EXACTFA-ish for which there is no multi-char fold to
	2893	* this */
	2894	if (len == 2 && s == 's' && (s+1) == 's'
	2895	&& OP(scan) != EXACTFL
	2896	&& OP(scan) != EXACTFA
	2897	&& OP(scan) != EXACTFA_NO_TRIE)
	2898	{
	2899	count = 2;
	2900	OP(scan) = EXACTFU_SS;
	2901	s += 2;
	2902	}
	2903	else { /* Here is a generic multi-char fold. */
	2904	const U8* multi_end = s + len;
	2905
	2906	/* Count how many characters in it. In the case of /l and
	2907	* /aa, no folds which contain ASCII code points are
	2908	* allowed, so check for those, and skip if found. (In
	2909	* EXACTFL, no folds are allowed to any Latin1 code point,
	2910	* not just ASCII. But there aren't any of these
	2911	* currently, nor ever likely, so don't take the time to
	2912	* test for them. The code that generates the
	2913	* is_MULTI_foo() macros croaks should one actually get put
	2914	* into Unicode .) */
	2915	if (OP(scan) != EXACTFL
	2916	&& OP(scan) != EXACTFA
	2917	&& OP(scan) != EXACTFA_NO_TRIE)
	2918	{
	2919	count = utf8_length(s, multi_end);
	2920	s = multi_end;
	2921	}
	2922	else {
	2923	while (s < multi_end) {
	2924	if (isASCII(*s)) {
	2925	s++;
	2926	goto next_iteration;
	2927	}
	2928	else {
	2929	s += UTF8SKIP(s);
	2930	}
	2931	count++;
	2932	}
	2933	}
	2934	}
	2935
	2936	/* The delta is how long the sequence is minus 1 (1 is how long
	2937	* the character that folds to the sequence is) */
	2938	*min_subtract += count - 1;
	2939	next_iteration: ;
	2940	}
	2941	}
	2942	else if (OP(scan) == EXACTFA) {
	2943
	2944	/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
	2945	* fold to the ASCII range (and there are no existing ones in the
	2946	* upper latin1 range). But, as outlined in the comments preceding
	2947	* this function, we need to flag any occurrences of the sharp s.
	2948	* This character forbids trie formation (because of added
	2949	* complexity) */
	2950	while (s < s_end) {
	2951	if (*s == LATIN_SMALL_LETTER_SHARP_S) {
	2952	OP(scan) = EXACTFA_NO_TRIE;
	2953	*has_exactf_sharp_s = TRUE;
	2954	break;
	2955	}
	2956	s++;
	2957	continue;
	2958	}
	2959	}
	2960	else if (OP(scan) != EXACTFL) {
	2961
	2962	/* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
	2963	* multi-char folds that are all Latin1. (This code knows that
	2964	* there are no current multi-char folds possible with EXACTFL,
	2965	* relying on fold_grind.t to catch any errors if the very unlikely
	2966	* event happens that some get added in future Unicode versions.)
	2967	* As explained in the comments preceding this function, we look
	2968	* also for the sharp s in EXACTF nodes; it can be in the final
	2969	* position. Otherwise we can stop looking 1 byte earlier because
	2970	* have to find at least two characters for a multi-fold */
	2971	const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
	2972
	2973	while (s < upper) {
	2974	int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
	2975	if (! len) { /* Not a multi-char fold. */
	2976	if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
	2977	{
	2978	*has_exactf_sharp_s = TRUE;
	2979	}
	2980	s++;
	2981	continue;
	2982	}
	2983
	2984	if (len == 2
	2985	&& isARG2_lower_or_UPPER_ARG1('s', *s)
	2986	&& isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
	2987	{
	2988
	2989	/* EXACTF nodes need to know that the minimum length
	2990	* changed so that a sharp s in the string can match this
	2991	* ss in the pattern, but they remain EXACTF nodes, as they
	2992	* won't match this unless the target string is is UTF-8,
	2993	* which we don't know until runtime */
	2994	if (OP(scan) != EXACTF) {
	2995	OP(scan) = EXACTFU_SS;
	2996	}
	2997	}
	2998
	2999	*min_subtract += len - 1;
	3000	s += len;
	3001	}
	3002	}
	3003	}
	3004
	3005	#ifdef DEBUGGING
	3006	/* Allow dumping but overwriting the collection of skipped
	3007	* ops and/or strings with fake optimized ops */
	3008	n = scan + NODE_SZ_STR(scan);
	3009	while (n <= stop) {
	3010	OP(n) = OPTIMIZED;
	3011	FLAGS(n) = 0;
	3012	NEXT_OFF(n) = 0;
	3013	n++;
	3014	}
	3015	#endif
	3016	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	3017	return stopnow;
	3018	}
	3019
	3020	/* REx optimizer. Converts nodes into quicker variants "in place".
	3021	Finds fixed substrings. */
	3022
	3023	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	3024	to the position after last scanned or to NULL. */
	3025
	3026	#define INIT_AND_WITHP \
	3027	assert(!and_withp); \
	3028	Newx(and_withp,1,struct regnode_charclass_class); \
	3029	SAVEFREEPV(and_withp)
	3030
	3031	/* this is a chain of data about sub patterns we are processing that
	3032	need to be handled separately/specially in study_chunk. Its so
	3033	we can simulate recursion without losing state. */
	3034	struct scan_frame;
	3035	typedef struct scan_frame {
	3036	regnode last; / last node to process in this frame */
	3037	regnode next; / next node to process when last is reached */
	3038	struct scan_frame prev; /previous frame*/
	3039	I32 stop; /* what stopparen do we use */
	3040	} scan_frame;
	3041
	3042
	3043	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	3044
	3045	STATIC SSize_t
	3046	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	3047	SSize_t minlenp, SSize_t deltap,
	3048	regnode *last,
	3049	scan_data_t *data,
	3050	I32 stopparen,
	3051	U8* recursed,
	3052	struct regnode_charclass_class *and_withp,
	3053	U32 flags, U32 depth)
	3054	/* scanp: Start here (read-write). */
	3055	/* deltap: Write maxlen-minlen here. */
	3056	/* last: Stop before this one. */
	3057	/* data: string data about the pattern */
	3058	/* stopparen: treat close N as END */
	3059	/* recursed: which subroutines have we recursed into */
	3060	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	3061	{
	3062	dVAR;
	3063	/* There must be at least this number of characters to match */
	3064	SSize_t min = 0;
	3065	I32 pars = 0, code;
	3066	regnode scan = scanp, *next;
	3067	SSize_t delta = 0;
	3068	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	3069	int is_inf_internal = 0; /* The studied chunk is infinite */
	3070	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	3071	scan_data_t data_fake;
	3072	SV *re_trie_maxbuff = NULL;
	3073	regnode *first_non_open = scan;
	3074	SSize_t stopmin = SSize_t_MAX;
	3075	scan_frame *frame = NULL;
	3076	GET_RE_DEBUG_FLAGS_DECL;
	3077
	3078	PERL_ARGS_ASSERT_STUDY_CHUNK;
	3079
	3080	#ifdef DEBUGGING
	3081	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3082	#endif
	3083
	3084	if ( depth == 0 ) {
	3085	while (first_non_open && OP(first_non_open) == OPEN)
	3086	first_non_open=regnext(first_non_open);
	3087	}
	3088
	3089
	3090	fake_study_recurse:
	3091	while ( scan && OP(scan) != END && scan < last ){
	3092	UV min_subtract = 0; /* How mmany chars to subtract from the minimum
	3093	node length to get a real minimum (because
	3094	the folded version may be shorter) */
	3095	bool has_exactf_sharp_s = FALSE;
	3096	/* Peephole optimizer: */
	3097	DEBUG_STUDYDATA("Peep:", data,depth);
	3098	DEBUG_PEEP("Peep",scan,depth);
	3099
	3100	/* Its not clear to khw or hv why this is done here, and not in the
	3101	* clauses that deal with EXACT nodes. khw's guess is that it's
	3102	* because of a previous design */
	3103	JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
	3104
	3105	/* Follow the next-chain of the current node and optimize
	3106	away all the NOTHINGs from it. */
	3107	if (OP(scan) != CURLYX) {
	3108	const int max = (reg_off_by_arg[OP(scan)]
	3109	? I32_MAX
	3110	/* I32 may be smaller than U16 on CRAYs! */
	3111	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3112	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3113	int noff;
	3114	regnode *n = scan;
	3115
	3116	/* Skip NOTHING and LONGJMP. */
	3117	while ((n = regnext(n))
	3118	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3119	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3120	&& off + noff < max)
	3121	off += noff;
	3122	if (reg_off_by_arg[OP(scan)])
	3123	ARG(scan) = off;
	3124	else
	3125	NEXT_OFF(scan) = off;
	3126	}
	3127
	3128
	3129
	3130	/* The principal pseudo-switch. Cannot be a switch, since we
	3131	look into several different things. */
	3132	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3133	\|\| OP(scan) == IFTHEN) {
	3134	next = regnext(scan);
	3135	code = OP(scan);
	3136	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	3137
	3138	if (OP(next) == code \|\| code == IFTHEN) {
	3139	/* NOTE - There is similar code to this block below for handling
	3140	TRIE nodes on a re-study. If you change stuff here check there
	3141	too. */
	3142	SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
	3143	struct regnode_charclass_class accum;
	3144	regnode * const startbranch=scan;
	3145
	3146	if (flags & SCF_DO_SUBSTR)
	3147	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	3148	if (flags & SCF_DO_STCLASS)
	3149	cl_init_zero(pRExC_state, &accum);
	3150
	3151	while (OP(scan) == code) {
	3152	SSize_t deltanext, minnext, fake;
	3153	I32 f = 0;
	3154	struct regnode_charclass_class this_class;
	3155
	3156	num++;
	3157	data_fake.flags = 0;
	3158	if (data) {
	3159	data_fake.whilem_c = data->whilem_c;
	3160	data_fake.last_closep = data->last_closep;
	3161	}
	3162	else
	3163	data_fake.last_closep = &fake;
	3164
	3165	data_fake.pos_delta = delta;
	3166	next = regnext(scan);
	3167	scan = NEXTOPER(scan);
	3168	if (code != BRANCH)
	3169	scan = NEXTOPER(scan);
	3170	if (flags & SCF_DO_STCLASS) {
	3171	cl_init(pRExC_state, &this_class);
	3172	data_fake.start_class = &this_class;
	3173	f = SCF_DO_STCLASS_AND;
	3174	}
	3175	if (flags & SCF_WHILEM_VISITED_POS)
	3176	f \|= SCF_WHILEM_VISITED_POS;
	3177
	3178	/* we suppose the run is continuous, last=next...*/
	3179	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3180	next, &data_fake,
	3181	stopparen, recursed, NULL, f,depth+1);
	3182	if (min1 > minnext)
	3183	min1 = minnext;
	3184	if (deltanext == SSize_t_MAX) {
	3185	is_inf = is_inf_internal = 1;
	3186	max1 = SSize_t_MAX;
	3187	} else if (max1 < minnext + deltanext)
	3188	max1 = minnext + deltanext;
	3189	scan = next;
	3190	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3191	pars++;
	3192	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3193	if ( stopmin > minnext)
	3194	stopmin = min + min1;
	3195	flags &= ~SCF_DO_SUBSTR;
	3196	if (data)
	3197	data->flags \|= SCF_SEEN_ACCEPT;
	3198	}
	3199	if (data) {
	3200	if (data_fake.flags & SF_HAS_EVAL)
	3201	data->flags \|= SF_HAS_EVAL;
	3202	data->whilem_c = data_fake.whilem_c;
	3203	}
	3204	if (flags & SCF_DO_STCLASS)
	3205	cl_or(pRExC_state, &accum, &this_class);
	3206	}
	3207	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3208	min1 = 0;
	3209	if (flags & SCF_DO_SUBSTR) {
	3210	data->pos_min += min1;
	3211	if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
	3212	data->pos_delta = SSize_t_MAX;
	3213	else
	3214	data->pos_delta += max1 - min1;
	3215	if (max1 != min1 \|\| is_inf)
	3216	data->longest = &(data->longest_float);
	3217	}
	3218	min += min1;
	3219	if (delta == SSize_t_MAX
	3220	\|\| SSize_t_MAX - delta - (max1 - min1) < 0)
	3221	delta = SSize_t_MAX;
	3222	else
	3223	delta += max1 - min1;
	3224	if (flags & SCF_DO_STCLASS_OR) {
	3225	cl_or(pRExC_state, data->start_class, &accum);
	3226	if (min1) {
	3227	cl_and(data->start_class, and_withp);
	3228	flags &= ~SCF_DO_STCLASS;
	3229	}
	3230	}
	3231	else if (flags & SCF_DO_STCLASS_AND) {
	3232	if (min1) {
	3233	cl_and(data->start_class, &accum);
	3234	flags &= ~SCF_DO_STCLASS;
	3235	}
	3236	else {
	3237	/* Switch to OR mode: cache the old value of
	3238	* data->start_class */
	3239	INIT_AND_WITHP;
	3240	StructCopy(data->start_class, and_withp,
	3241	struct regnode_charclass_class);
	3242	flags &= ~SCF_DO_STCLASS_AND;
	3243	StructCopy(&accum, data->start_class,
	3244	struct regnode_charclass_class);
	3245	flags \|= SCF_DO_STCLASS_OR;
	3246	SET_SSC_EOS(data->start_class);
	3247	}
	3248	}
	3249
	3250	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	3251	/* demq.
	3252
	3253	Assuming this was/is a branch we are dealing with: 'scan' now
	3254	points at the item that follows the branch sequence, whatever
	3255	it is. We now start at the beginning of the sequence and look
	3256	for subsequences of
	3257
	3258	BRANCH->EXACT=>x1
	3259	BRANCH->EXACT=>x2
	3260	tail
	3261
	3262	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	3263
	3264	If we can find such a subsequence we need to turn the first
	3265	element into a trie and then add the subsequent branch exact
	3266	strings to the trie.
	3267
	3268	We have two cases
	3269
	3270	1. patterns where the whole set of branches can be converted.
	3271
	3272	2. patterns where only a subset can be converted.
	3273
	3274	In case 1 we can replace the whole set with a single regop
	3275	for the trie. In case 2 we need to keep the start and end
	3276	branches so
	3277
	3278	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3279	becomes BRANCH TRIE; BRANCH X;
	3280
	3281	There is an additional case, that being where there is a
	3282	common prefix, which gets split out into an EXACT like node
	3283	preceding the TRIE node.
	3284
	3285	If x(1..n)==tail then we can do a simple trie, if not we make
	3286	a "jump" trie, such that when we match the appropriate word
	3287	we "jump" to the appropriate tail node. Essentially we turn
	3288	a nested if into a case structure of sorts.
	3289
	3290	*/
	3291
	3292	int made=0;
	3293	if (!re_trie_maxbuff) {
	3294	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3295	if (!SvIOK(re_trie_maxbuff))
	3296	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3297	}
	3298	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3299	regnode *cur;
	3300	regnode first = (regnode )NULL;
	3301	regnode last = (regnode )NULL;
	3302	regnode *tail = scan;
	3303	U8 trietype = 0;
	3304	U32 count=0;
	3305
	3306	#ifdef DEBUGGING
	3307	SV * const mysv = sv_newmortal(); /* for dumping */
	3308	#endif
	3309	/* var tail is used because there may be a TAIL
	3310	regop in the way. Ie, the exacts will point to the
	3311	thing following the TAIL, but the last branch will
	3312	point at the TAIL. So we advance tail. If we
	3313	have nested (?:) we may have to move through several
	3314	tails.
	3315	*/
	3316
	3317	while ( OP( tail ) == TAIL ) {
	3318	/* this is the TAIL generated by (?:) */
	3319	tail = regnext( tail );
	3320	}
	3321
	3322
	3323	DEBUG_TRIE_COMPILE_r({
	3324	regprop(RExC_rx, mysv, tail );
	3325	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3326	(int)depth * 2 + 2, "",
	3327	"Looking for TRIE'able sequences. Tail node is: ",
	3328	SvPV_nolen_const( mysv )
	3329	);
	3330	});
	3331
	3332	/*
	3333
	3334	Step through the branches
	3335	cur represents each branch,
	3336	noper is the first thing to be matched as part of that branch
	3337	noper_next is the regnext() of that node.
	3338
	3339	We normally handle a case like this /FOO[xyz]\|BAR[pqr]/
	3340	via a "jump trie" but we also support building with NOJUMPTRIE,
	3341	which restricts the trie logic to structures like /FOO\|BAR/.
	3342
	3343	If noper is a trieable nodetype then the branch is a possible optimization
	3344	target. If we are building under NOJUMPTRIE then we require that noper_next
	3345	is the same as scan (our current position in the regex program).
	3346
	3347	Once we have two or more consecutive such branches we can create a
	3348	trie of the EXACT's contents and stitch it in place into the program.
	3349
	3350	If the sequence represents all of the branches in the alternation we
	3351	replace the entire thing with a single TRIE node.
	3352
	3353	Otherwise when it is a subsequence we need to stitch it in place and
	3354	replace only the relevant branches. This means the first branch has
	3355	to remain as it is used by the alternation logic, and its next pointer,
	3356	and needs to be repointed at the item on the branch chain following
	3357	the last branch we have optimized away.
	3358
	3359	This could be either a BRANCH, in which case the subsequence is internal,
	3360	or it could be the item following the branch sequence in which case the
	3361	subsequence is at the end (which does not necessarily mean the first node
	3362	is the start of the alternation).
	3363
	3364	TRIE_TYPE(X) is a define which maps the optype to a trietype.
	3365
	3366	optype \| trietype
	3367	----------------+-----------
	3368	NOTHING \| NOTHING
	3369	EXACT \| EXACT
	3370	EXACTFU \| EXACTFU
	3371	EXACTFU_SS \| EXACTFU
	3372	EXACTFA \| EXACTFA
	3373
	3374
	3375	*/
	3376	#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
	3377	( EXACT == (X) ) ? EXACT : \
	3378	( EXACTFU == (X) \|\| EXACTFU_SS == (X) ) ? EXACTFU : \
	3379	( EXACTFA == (X) ) ? EXACTFA : \
	3380	0 )
	3381
	3382	/* dont use tail as the end marker for this traverse */
	3383	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3384	regnode * const noper = NEXTOPER( cur );
	3385	U8 noper_type = OP( noper );
	3386	U8 noper_trietype = TRIE_TYPE( noper_type );
	3387	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3388	regnode * const noper_next = regnext( noper );
	3389	U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
	3390	U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
	3391	#endif
	3392
	3393	DEBUG_TRIE_COMPILE_r({
	3394	regprop(RExC_rx, mysv, cur);
	3395	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3396	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3397
	3398	regprop(RExC_rx, mysv, noper);
	3399	PerlIO_printf( Perl_debug_log, " -> %s",
	3400	SvPV_nolen_const(mysv));
	3401
	3402	if ( noper_next ) {
	3403	regprop(RExC_rx, mysv, noper_next );
	3404	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3405	SvPV_nolen_const(mysv));
	3406	}
	3407	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
	3408	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
	3409	PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
	3410	);
	3411	});
	3412
	3413	/* Is noper a trieable nodetype that can be merged with the
	3414	* current trie (if there is one)? */
	3415	if ( noper_trietype
	3416	&&
	3417	(
	3418	( noper_trietype == NOTHING)
	3419	\|\| ( trietype == NOTHING )
	3420	\|\| ( trietype == noper_trietype )
	3421	)
	3422	#ifdef NOJUMPTRIE
	3423	&& noper_next == tail
	3424	#endif
	3425	&& count < U16_MAX)
	3426	{
	3427	/* Handle mergable triable node
	3428	* Either we are the first node in a new trieable sequence,
	3429	* in which case we do some bookkeeping, otherwise we update
	3430	* the end pointer. */
	3431	if ( !first ) {
	3432	first = cur;
	3433	if ( noper_trietype == NOTHING ) {
	3434	#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
	3435	regnode * const noper_next = regnext( noper );
	3436	U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
	3437	U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
	3438	#endif
	3439
	3440	if ( noper_next_trietype ) {
	3441	trietype = noper_next_trietype;
	3442	} else if (noper_next_type) {
	3443	/* a NOTHING regop is 1 regop wide. We need at least two
	3444	* for a trie so we can't merge this in */
	3445	first = NULL;
	3446	}
	3447	} else {
	3448	trietype = noper_trietype;
	3449	}
	3450	} else {
	3451	if ( trietype == NOTHING )
	3452	trietype = noper_trietype;
	3453	last = cur;
	3454	}
	3455	if (first)
	3456	count++;
	3457	} /* end handle mergable triable node */
	3458	else {
	3459	/* handle unmergable node -
	3460	* noper may either be a triable node which can not be tried
	3461	* together with the current trie, or a non triable node */
	3462	if ( last ) {
	3463	/* If last is set and trietype is not NOTHING then we have found
	3464	* at least two triable branch sequences in a row of a similar
	3465	* trietype so we can turn them into a trie. If/when we
	3466	* allow NOTHING to start a trie sequence this condition will be
	3467	* required, and it isn't expensive so we leave it in for now. */
	3468	if ( trietype && trietype != NOTHING )
	3469	make_trie( pRExC_state,
	3470	startbranch, first, cur, tail, count,
	3471	trietype, depth+1 );
	3472	last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
	3473	}
	3474	if ( noper_trietype
	3475	#ifdef NOJUMPTRIE
	3476	&& noper_next == tail
	3477	#endif
	3478	){
	3479	/* noper is triable, so we can start a new trie sequence */
	3480	count = 1;
	3481	first = cur;
	3482	trietype = noper_trietype;
	3483	} else if (first) {
	3484	/* if we already saw a first but the current node is not triable then we have
	3485	* to reset the first information. */
	3486	count = 0;
	3487	first = NULL;
	3488	trietype = 0;
	3489	}
	3490	} /* end handle unmergable node */
	3491	} /* loop over branches */
	3492	DEBUG_TRIE_COMPILE_r({
	3493	regprop(RExC_rx, mysv, cur);
	3494	PerlIO_printf( Perl_debug_log,
	3495	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3496	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3497
	3498	});
	3499	if ( last && trietype ) {
	3500	if ( trietype != NOTHING ) {
	3501	/* the last branch of the sequence was part of a trie,
	3502	* so we have to construct it here outside of the loop
	3503	*/
	3504	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
	3505	#ifdef TRIE_STUDY_OPT
	3506	if ( ((made == MADE_EXACT_TRIE &&
	3507	startbranch == first)
	3508	\|\| ( first_non_open == first )) &&
	3509	depth==0 ) {
	3510	flags \|= SCF_TRIE_RESTUDY;
	3511	if ( startbranch == first
	3512	&& scan == tail )
	3513	{
	3514	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3515	}
	3516	}
	3517	#endif
	3518	} else {
	3519	/* at this point we know whatever we have is a NOTHING sequence/branch
	3520	* AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
	3521	*/
	3522	if ( startbranch == first ) {
	3523	regnode *opt;
	3524	/* the entire thing is a NOTHING sequence, something like this:
	3525	* (?:\|) So we can turn it into a plain NOTHING op. */
	3526	DEBUG_TRIE_COMPILE_r({
	3527	regprop(RExC_rx, mysv, cur);
	3528	PerlIO_printf( Perl_debug_log,
	3529	"%s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth 2 + 2,
	3530	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3531
	3532	});
	3533	OP(startbranch)= NOTHING;
	3534	NEXT_OFF(startbranch)= tail - startbranch;
	3535	for ( opt= startbranch + 1; opt < tail ; opt++ )
	3536	OP(opt)= OPTIMIZED;
	3537	}
	3538	}
	3539	} /* end if ( last) */
	3540	} /* TRIE_MAXBUF is non zero */
	3541
	3542	} /* do trie */
	3543
	3544	}
	3545	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3546	scan = NEXTOPER(NEXTOPER(scan));
	3547	} else /* single branch is optimized. */
	3548	scan = NEXTOPER(scan);
	3549	continue;
	3550	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3551	scan_frame *newframe = NULL;
	3552	I32 paren;
	3553	regnode *start;
	3554	regnode *end;
	3555
	3556	if (OP(scan) != SUSPEND) {
	3557	/* set the pointer */
	3558	if (OP(scan) == GOSUB) {
	3559	paren = ARG(scan);
	3560	RExC_recurse[ARG2L(scan)] = scan;
	3561	start = RExC_open_parens[paren-1];
	3562	end = RExC_close_parens[paren-1];
	3563	} else {
	3564	paren = 0;
	3565	start = RExC_rxi->program + 1;
	3566	end = RExC_opend;
	3567	}
	3568	if (!recursed) {
	3569	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3570	SAVEFREEPV(recursed);
	3571	}
	3572	if (!PAREN_TEST(recursed,paren+1)) {
	3573	PAREN_SET(recursed,paren+1);
	3574	Newx(newframe,1,scan_frame);
	3575	} else {
	3576	if (flags & SCF_DO_SUBSTR) {
	3577	SCAN_COMMIT(pRExC_state,data,minlenp);
	3578	data->longest = &(data->longest_float);
	3579	}
	3580	is_inf = is_inf_internal = 1;
	3581	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3582	cl_anything(pRExC_state, data->start_class);
	3583	flags &= ~SCF_DO_STCLASS;
	3584	}
	3585	} else {
	3586	Newx(newframe,1,scan_frame);
	3587	paren = stopparen;
	3588	start = scan+2;
	3589	end = regnext(scan);
	3590	}
	3591	if (newframe) {
	3592	assert(start);
	3593	assert(end);
	3594	SAVEFREEPV(newframe);
	3595	newframe->next = regnext(scan);
	3596	newframe->last = last;
	3597	newframe->stop = stopparen;
	3598	newframe->prev = frame;
	3599
	3600	frame = newframe;
	3601	scan = start;
	3602	stopparen = paren;
	3603	last = end;
	3604
	3605	continue;
	3606	}
	3607	}
	3608	else if (OP(scan) == EXACT) {
	3609	SSize_t l = STR_LEN(scan);
	3610	UV uc;
	3611	if (UTF) {
	3612	const U8 * const s = (U8*)STRING(scan);
	3613	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3614	l = utf8_length(s, s + l);
	3615	} else {
	3616	uc = ((U8)STRING(scan));
	3617	}
	3618	min += l;
	3619	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3620	/* The code below prefers earlier match for fixed
	3621	offset, later match for variable offset. */
	3622	if (data->last_end == -1) { /* Update the start info. */
	3623	data->last_start_min = data->pos_min;
	3624	data->last_start_max = is_inf
	3625	? SSize_t_MAX : data->pos_min + data->pos_delta;
	3626	}
	3627	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3628	if (UTF)
	3629	SvUTF8_on(data->last_found);
	3630	{
	3631	SV * const sv = data->last_found;
	3632	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3633	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3634	if (mg && mg->mg_len >= 0)
	3635	mg->mg_len += utf8_length((U8*)STRING(scan),
	3636	(U8*)STRING(scan)+STR_LEN(scan));
	3637	}
	3638	data->last_end = data->pos_min + l;
	3639	data->pos_min += l; /* As in the first entry. */
	3640	data->flags &= ~SF_BEFORE_EOL;
	3641	}
	3642	if (flags & SCF_DO_STCLASS_AND) {
	3643	/* Check whether it is compatible with what we know already! */
	3644	int compat = 1;
	3645
	3646
	3647	/* If compatible, we or it in below. It is compatible if is
	3648	* in the bitmp and either 1) its bit or its fold is set, or 2)
	3649	* it's for a locale. Even if there isn't unicode semantics
	3650	* here, at runtime there may be because of matching against a
	3651	* utf8 string, so accept a possible false positive for
	3652	* latin1-range folds */
	3653	if (uc >= 0x100 \|\|
	3654	(!(data->start_class->flags & ANYOF_LOCALE)
	3655	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3656	&& (!(data->start_class->flags & ANYOF_LOC_FOLD)
	3657	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3658	)
	3659	{
	3660	compat = 0;
	3661	}
	3662	ANYOF_CLASS_ZERO(data->start_class);
	3663	ANYOF_BITMAP_ZERO(data->start_class);
	3664	if (compat)
	3665	ANYOF_BITMAP_SET(data->start_class, uc);
	3666	else if (uc >= 0x100) {
	3667	int i;
	3668
	3669	/* Some Unicode code points fold to the Latin1 range; as
	3670	* XXX temporary code, instead of figuring out if this is
	3671	* one, just assume it is and set all the start class bits
	3672	* that could be some such above 255 code point's fold
	3673	* which will generate fals positives. As the code
	3674	* elsewhere that does compute the fold settles down, it
	3675	* can be extracted out and re-used here */
	3676	for (i = 0; i < 256; i++){
	3677	if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
	3678	ANYOF_BITMAP_SET(data->start_class, i);
	3679	}
	3680	}
	3681	}
	3682	CLEAR_SSC_EOS(data->start_class);
	3683	if (uc < 0x100)
	3684	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3685	}
	3686	else if (flags & SCF_DO_STCLASS_OR) {
	3687	/* false positive possible if the class is case-folded */
	3688	if (uc < 0x100)
	3689	ANYOF_BITMAP_SET(data->start_class, uc);
	3690	else
	3691	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3692	CLEAR_SSC_EOS(data->start_class);
	3693	cl_and(data->start_class, and_withp);
	3694	}
	3695	flags &= ~SCF_DO_STCLASS;
	3696	}
	3697	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3698	SSize_t l = STR_LEN(scan);
	3699	UV uc = ((U8)STRING(scan));
	3700
	3701	/* Search for fixed substrings supports EXACT only. */
	3702	if (flags & SCF_DO_SUBSTR) {
	3703	assert(data);
	3704	SCAN_COMMIT(pRExC_state, data, minlenp);
	3705	}
	3706	if (UTF) {
	3707	const U8 * const s = (U8 *)STRING(scan);
	3708	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3709	l = utf8_length(s, s + l);
	3710	}
	3711	if (has_exactf_sharp_s) {
	3712	RExC_seen \|= REG_SEEN_EXACTF_SHARP_S;
	3713	}
	3714	min += l - min_subtract;
	3715	assert (min >= 0);
	3716	delta += min_subtract;
	3717	if (flags & SCF_DO_SUBSTR) {
	3718	data->pos_min += l - min_subtract;
	3719	if (data->pos_min < 0) {
	3720	data->pos_min = 0;
	3721	}
	3722	data->pos_delta += min_subtract;
	3723	if (min_subtract) {
	3724	data->longest = &(data->longest_float);
	3725	}
	3726	}
	3727	if (flags & SCF_DO_STCLASS_AND) {
	3728	/* Check whether it is compatible with what we know already! */
	3729	int compat = 1;
	3730	if (uc >= 0x100 \|\|
	3731	(!(data->start_class->flags & ANYOF_LOCALE)
	3732	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3733	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3734	{
	3735	compat = 0;
	3736	}
	3737	ANYOF_CLASS_ZERO(data->start_class);
	3738	ANYOF_BITMAP_ZERO(data->start_class);
	3739	if (compat) {
	3740	ANYOF_BITMAP_SET(data->start_class, uc);
	3741	CLEAR_SSC_EOS(data->start_class);
	3742	if (OP(scan) == EXACTFL) {
	3743	/* XXX This set is probably no longer necessary, and
	3744	* probably wrong as LOCALE now is on in the initial
	3745	* state */
	3746	data->start_class->flags \|= ANYOF_LOCALE\|ANYOF_LOC_FOLD;
	3747	}
	3748	else {
	3749
	3750	/* Also set the other member of the fold pair. In case
	3751	* that unicode semantics is called for at runtime, use
	3752	* the full latin1 fold. (Can't do this for locale,
	3753	* because not known until runtime) */
	3754	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3755
	3756	/* All other (EXACTFL handled above) folds except under
	3757	* /iaa that include s, S, and sharp_s also may include
	3758	* the others */
	3759	if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
	3760	{
	3761	if (uc == 's' \|\| uc == 'S') {
	3762	ANYOF_BITMAP_SET(data->start_class,
	3763	LATIN_SMALL_LETTER_SHARP_S);
	3764	}
	3765	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3766	ANYOF_BITMAP_SET(data->start_class, 's');
	3767	ANYOF_BITMAP_SET(data->start_class, 'S');
	3768	}
	3769	}
	3770	}
	3771	}
	3772	else if (uc >= 0x100) {
	3773	int i;
	3774	for (i = 0; i < 256; i++){
	3775	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3776	ANYOF_BITMAP_SET(data->start_class, i);
	3777	}
	3778	}
	3779	}
	3780	}
	3781	else if (flags & SCF_DO_STCLASS_OR) {
	3782	if (data->start_class->flags & ANYOF_LOC_FOLD) {
	3783	/* false positive possible if the class is case-folded.
	3784	Assume that the locale settings are the same... */
	3785	if (uc < 0x100) {
	3786	ANYOF_BITMAP_SET(data->start_class, uc);
	3787	if (OP(scan) != EXACTFL) {
	3788
	3789	/* And set the other member of the fold pair, but
	3790	* can't do that in locale because not known until
	3791	* run-time */
	3792	ANYOF_BITMAP_SET(data->start_class,
	3793	PL_fold_latin1[uc]);
	3794
	3795	/* All folds except under /iaa that include s, S,
	3796	* and sharp_s also may include the others */
	3797	if (OP(scan) != EXACTFA
	3798	&& OP(scan) != EXACTFA_NO_TRIE)
	3799	{
	3800	if (uc == 's' \|\| uc == 'S') {
	3801	ANYOF_BITMAP_SET(data->start_class,
	3802	LATIN_SMALL_LETTER_SHARP_S);
	3803	}
	3804	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3805	ANYOF_BITMAP_SET(data->start_class, 's');
	3806	ANYOF_BITMAP_SET(data->start_class, 'S');
	3807	}
	3808	}
	3809	}
	3810	}
	3811	CLEAR_SSC_EOS(data->start_class);
	3812	}
	3813	cl_and(data->start_class, and_withp);
	3814	}
	3815	flags &= ~SCF_DO_STCLASS;
	3816	}
	3817	else if (REGNODE_VARIES(OP(scan))) {
	3818	SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
	3819	I32 fl = 0, f = flags;
	3820	regnode * const oscan = scan;
	3821	struct regnode_charclass_class this_class;
	3822	struct regnode_charclass_class *oclass = NULL;
	3823	I32 next_is_eval = 0;
	3824
	3825	switch (PL_regkind[OP(scan)]) {
	3826	case WHILEM: /* End of (?:...)* . */
	3827	scan = NEXTOPER(scan);
	3828	goto finish;
	3829	case PLUS:
	3830	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3831	next = NEXTOPER(scan);
	3832	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3833	mincount = 1;
	3834	maxcount = REG_INFTY;
	3835	next = regnext(scan);
	3836	scan = NEXTOPER(scan);
	3837	goto do_curly;
	3838	}
	3839	}
	3840	if (flags & SCF_DO_SUBSTR)
	3841	data->pos_min++;
	3842	min++;
	3843	/* Fall through. */
	3844	case STAR:
	3845	if (flags & SCF_DO_STCLASS) {
	3846	mincount = 0;
	3847	maxcount = REG_INFTY;
	3848	next = regnext(scan);
	3849	scan = NEXTOPER(scan);
	3850	goto do_curly;
	3851	}
	3852	is_inf = is_inf_internal = 1;
	3853	scan = regnext(scan);
	3854	if (flags & SCF_DO_SUBSTR) {
	3855	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3856	data->longest = &(data->longest_float);
	3857	}
	3858	goto optimize_curly_tail;
	3859	case CURLY:
	3860	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3861	&& (scan->flags == stopparen))
	3862	{
	3863	mincount = 1;
	3864	maxcount = 1;
	3865	} else {
	3866	mincount = ARG1(scan);
	3867	maxcount = ARG2(scan);
	3868	}
	3869	next = regnext(scan);
	3870	if (OP(scan) == CURLYX) {
	3871	I32 lp = (data ? *(data->last_closep) : 0);
	3872	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3873	}
	3874	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3875	next_is_eval = (OP(scan) == EVAL);
	3876	do_curly:
	3877	if (flags & SCF_DO_SUBSTR) {
	3878	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3879	pos_before = data->pos_min;
	3880	}
	3881	if (data) {
	3882	fl = data->flags;
	3883	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3884	if (is_inf)
	3885	data->flags \|= SF_IS_INF;
	3886	}
	3887	if (flags & SCF_DO_STCLASS) {
	3888	cl_init(pRExC_state, &this_class);
	3889	oclass = data->start_class;
	3890	data->start_class = &this_class;
	3891	f \|= SCF_DO_STCLASS_AND;
	3892	f &= ~SCF_DO_STCLASS_OR;
	3893	}
	3894	/* Exclude from super-linear cache processing any {n,m}
	3895	regops for which the combination of input pos and regex
	3896	pos is not enough information to determine if a match
	3897	will be possible.
	3898
	3899	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3900	regex pos at the \s*, the prospects for a match depend not
	3901	only on the input position but also on how many (bar\s*)
	3902	repeats into the {4,8} we are. */
	3903	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3904	f &= ~SCF_WHILEM_VISITED_POS;
	3905
	3906	/* This will finish on WHILEM, setting scan, or on NULL: */
	3907	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3908	last, data, stopparen, recursed, NULL,
	3909	(mincount == 0
	3910	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3911
	3912	if (flags & SCF_DO_STCLASS)
	3913	data->start_class = oclass;
	3914	if (mincount == 0 \|\| minnext == 0) {
	3915	if (flags & SCF_DO_STCLASS_OR) {
	3916	cl_or(pRExC_state, data->start_class, &this_class);
	3917	}
	3918	else if (flags & SCF_DO_STCLASS_AND) {
	3919	/* Switch to OR mode: cache the old value of
	3920	* data->start_class */
	3921	INIT_AND_WITHP;
	3922	StructCopy(data->start_class, and_withp,
	3923	struct regnode_charclass_class);
	3924	flags &= ~SCF_DO_STCLASS_AND;
	3925	StructCopy(&this_class, data->start_class,
	3926	struct regnode_charclass_class);
	3927	flags \|= SCF_DO_STCLASS_OR;
	3928	SET_SSC_EOS(data->start_class);
	3929	}
	3930	} else { /* Non-zero len */
	3931	if (flags & SCF_DO_STCLASS_OR) {
	3932	cl_or(pRExC_state, data->start_class, &this_class);
	3933	cl_and(data->start_class, and_withp);
	3934	}
	3935	else if (flags & SCF_DO_STCLASS_AND)
	3936	cl_and(data->start_class, &this_class);
	3937	flags &= ~SCF_DO_STCLASS;
	3938	}
	3939	if (!scan) /* It was not CURLYX, but CURLY. */
	3940	scan = next;
	3941	if (!(flags & SCF_TRIE_DOING_RESTUDY)
	3942	/* ? quantifier ok, except for (?{ ... }) */
	3943	&& (next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3944	&& (minnext == 0) && (deltanext == 0)
	3945	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3946	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3947	{
	3948	/* Fatal warnings may leak the regexp without this: */
	3949	SAVEFREESV(RExC_rx_sv);
	3950	ckWARNreg(RExC_parse,
	3951	"Quantifier unexpected on zero-length expression");
	3952	(void)ReREFCNT_inc(RExC_rx_sv);
	3953	}
	3954
	3955	min += minnext * mincount;
	3956	is_inf_internal \|= deltanext == SSize_t_MAX
	3957	\|\| (maxcount == REG_INFTY && minnext + deltanext > 0);
	3958	is_inf \|= is_inf_internal;
	3959	if (is_inf)
	3960	delta = SSize_t_MAX;
	3961	else
	3962	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3963
	3964	/* Try powerful optimization CURLYX => CURLYN. */
	3965	if ( OP(oscan) == CURLYX && data
	3966	&& data->flags & SF_IN_PAR
	3967	&& !(data->flags & SF_HAS_EVAL)
	3968	&& !deltanext && minnext == 1 ) {
	3969	/* Try to optimize to CURLYN. */
	3970	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3971	regnode * const nxt1 = nxt;
	3972	#ifdef DEBUGGING
	3973	regnode *nxt2;
	3974	#endif
	3975
	3976	/* Skip open. */
	3977	nxt = regnext(nxt);
	3978	if (!REGNODE_SIMPLE(OP(nxt))
	3979	&& !(PL_regkind[OP(nxt)] == EXACT
	3980	&& STR_LEN(nxt) == 1))
	3981	goto nogo;
	3982	#ifdef DEBUGGING
	3983	nxt2 = nxt;
	3984	#endif
	3985	nxt = regnext(nxt);
	3986	if (OP(nxt) != CLOSE)
	3987	goto nogo;
	3988	if (RExC_open_parens) {
	3989	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3990	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3991	}
	3992	/* Now we know that nxt2 is the only contents: */
	3993	oscan->flags = (U8)ARG(nxt);
	3994	OP(oscan) = CURLYN;
	3995	OP(nxt1) = NOTHING; /* was OPEN. */
	3996
	3997	#ifdef DEBUGGING
	3998	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3999	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	4000	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	4001	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	4002	OP(nxt + 1) = OPTIMIZED; /* was count. */
	4003	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	4004	#endif
	4005	}
	4006	nogo:
	4007
	4008	/* Try optimization CURLYX => CURLYM. */
	4009	if ( OP(oscan) == CURLYX && data
	4010	&& !(data->flags & SF_HAS_PAR)
	4011	&& !(data->flags & SF_HAS_EVAL)
	4012	&& !deltanext /* atom is fixed width */
	4013	&& minnext != 0 /* CURLYM can't handle zero width */
	4014	&& ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
	4015	) {
	4016	/* XXXX How to optimize if data == 0? */
	4017	/* Optimize to a simpler form. */
	4018	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	4019	regnode *nxt2;
	4020
	4021	OP(oscan) = CURLYM;
	4022	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	4023	&& (OP(nxt2) != WHILEM))
	4024	nxt = nxt2;
	4025	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	4026	/* Need to optimize away parenths. */
	4027	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	4028	/* Set the parenth number. */
	4029	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	4030
	4031	oscan->flags = (U8)ARG(nxt);
	4032	if (RExC_open_parens) {
	4033	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	4034	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	4035	}
	4036	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	4037	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	4038
	4039	#ifdef DEBUGGING
	4040	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	4041	OP(nxt + 1) = OPTIMIZED; /* was count. */
	4042	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	4043	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	4044	#endif
	4045	#if 0
	4046	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	4047	regnode *nnxt = regnext(nxt1);
	4048	if (nnxt == nxt) {
	4049	if (reg_off_by_arg[OP(nxt1)])
	4050	ARG_SET(nxt1, nxt2 - nxt1);
	4051	else if (nxt2 - nxt1 < U16_MAX)
	4052	NEXT_OFF(nxt1) = nxt2 - nxt1;
	4053	else
	4054	OP(nxt) = NOTHING; /* Cannot beautify */
	4055	}
	4056	nxt1 = nnxt;
	4057	}
	4058	#endif
	4059	/* Optimize again: */
	4060	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	4061	NULL, stopparen, recursed, NULL, 0,depth+1);
	4062	}
	4063	else
	4064	oscan->flags = 0;
	4065	}
	4066	else if ((OP(oscan) == CURLYX)
	4067	&& (flags & SCF_WHILEM_VISITED_POS)
	4068	/* See the comment on a similar expression above.
	4069	However, this time it's not a subexpression
	4070	we care about, but the expression itself. */
	4071	&& (maxcount == REG_INFTY)
	4072	&& data && ++data->whilem_c < 16) {
	4073	/* This stays as CURLYX, we can put the count/of pair. */
	4074	/* Find WHILEM (as in regexec.c) */
	4075	regnode *nxt = oscan + NEXT_OFF(oscan);
	4076
	4077	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	4078	nxt += ARG(nxt);
	4079	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	4080	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	4081	}
	4082	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	4083	pars++;
	4084	if (flags & SCF_DO_SUBSTR) {
	4085	SV *last_str = NULL;
	4086	int counted = mincount != 0;
	4087
	4088	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	4089	#if defined(SPARC64_GCC_WORKAROUND)
	4090	SSize_t b = 0;
	4091	STRLEN l = 0;
	4092	const char *s = NULL;
	4093	SSize_t old = 0;
	4094
	4095	if (pos_before >= data->last_start_min)
	4096	b = pos_before;
	4097	else
	4098	b = data->last_start_min;
	4099
	4100	l = 0;
	4101	s = SvPV_const(data->last_found, l);
	4102	old = b - data->last_start_min;
	4103
	4104	#else
	4105	SSize_t b = pos_before >= data->last_start_min
	4106	? pos_before : data->last_start_min;
	4107	STRLEN l;
	4108	const char * const s = SvPV_const(data->last_found, l);
	4109	SSize_t old = b - data->last_start_min;
	4110	#endif
	4111
	4112	if (UTF)
	4113	old = utf8_hop((U8)s, old) - (U8)s;
	4114	l -= old;
	4115	/* Get the added string: */
	4116	last_str = newSVpvn_utf8(s + old, l, UTF);
	4117	if (deltanext == 0 && pos_before == b) {
	4118	/* What was added is a constant string */
	4119	if (mincount > 1) {
	4120	SvGROW(last_str, (mincount * l) + 1);
	4121	repeatcpy(SvPVX(last_str) + l,
	4122	SvPVX_const(last_str), l, mincount - 1);
	4123	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	4124	/* Add additional parts. */
	4125	SvCUR_set(data->last_found,
	4126	SvCUR(data->last_found) - l);
	4127	sv_catsv(data->last_found, last_str);
	4128	{
	4129	SV * sv = data->last_found;
	4130	MAGIC *mg =
	4131	SvUTF8(sv) && SvMAGICAL(sv) ?
	4132	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4133	if (mg && mg->mg_len >= 0)
	4134	mg->mg_len += CHR_SVLEN(last_str) - l;
	4135	}
	4136	data->last_end += l * (mincount - 1);
	4137	}
	4138	} else {
	4139	/* start offset must point into the last copy */
	4140	data->last_start_min += minnext * (mincount - 1);
	4141	data->last_start_max += is_inf ? SSize_t_MAX
	4142	: (maxcount - 1) * (minnext + data->pos_delta);
	4143	}
	4144	}
	4145	/* It is counted once already... */
	4146	data->pos_min += minnext * (mincount - counted);
	4147	#if 0
	4148	PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
	4149	" SSize_t_MAX=%"UVdf" minnext=%"UVdf
	4150	" maxcount=%"UVdf" mincount=%"UVdf"\n",
	4151	(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
	4152	(UV)mincount);
	4153	if (deltanext != SSize_t_MAX)
	4154	PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
	4155	(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
	4156	- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
	4157	#endif
	4158	if (deltanext == SSize_t_MAX \|\|
	4159	-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
	4160	data->pos_delta = SSize_t_MAX;
	4161	else
	4162	data->pos_delta += - counted * deltanext +
	4163	(minnext + deltanext) * maxcount - minnext * mincount;
	4164	if (mincount != maxcount) {
	4165	/* Cannot extend fixed substrings found inside
	4166	the group. */
	4167	SCAN_COMMIT(pRExC_state,data,minlenp);
	4168	if (mincount && last_str) {
	4169	SV * const sv = data->last_found;
	4170	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	4171	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4172
	4173	if (mg)
	4174	mg->mg_len = -1;
	4175	sv_setsv(sv, last_str);
	4176	data->last_end = data->pos_min;
	4177	data->last_start_min =
	4178	data->pos_min - CHR_SVLEN(last_str);
	4179	data->last_start_max = is_inf
	4180	? SSize_t_MAX
	4181	: data->pos_min + data->pos_delta
	4182	- CHR_SVLEN(last_str);
	4183	}
	4184	data->longest = &(data->longest_float);
	4185	}
	4186	SvREFCNT_dec(last_str);
	4187	}
	4188	if (data && (fl & SF_HAS_EVAL))
	4189	data->flags \|= SF_HAS_EVAL;
	4190	optimize_curly_tail:
	4191	if (OP(oscan) != CURLYX) {
	4192	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	4193	&& NEXT_OFF(next))
	4194	NEXT_OFF(oscan) += NEXT_OFF(next);
	4195	}
	4196	continue;
	4197	default: /* REF, and CLUMP only? */
	4198	if (flags & SCF_DO_SUBSTR) {
	4199	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4200	data->longest = &(data->longest_float);
	4201	}
	4202	is_inf = is_inf_internal = 1;
	4203	if (flags & SCF_DO_STCLASS_OR)
	4204	cl_anything(pRExC_state, data->start_class);
	4205	flags &= ~SCF_DO_STCLASS;
	4206	break;
	4207	}
	4208	}
	4209	else if (OP(scan) == LNBREAK) {
	4210	if (flags & SCF_DO_STCLASS) {
	4211	int value = 0;
	4212	CLEAR_SSC_EOS(data->start_class); /* No match on empty */
	4213	if (flags & SCF_DO_STCLASS_AND) {
	4214	for (value = 0; value < 256; value++)
	4215	if (!is_VERTWS_cp(value))
	4216	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4217	}
	4218	else {
	4219	for (value = 0; value < 256; value++)
	4220	if (is_VERTWS_cp(value))
	4221	ANYOF_BITMAP_SET(data->start_class, value);
	4222	}
	4223	if (flags & SCF_DO_STCLASS_OR)
	4224	cl_and(data->start_class, and_withp);
	4225	flags &= ~SCF_DO_STCLASS;
	4226	}
	4227	min++;
	4228	delta++; /* Because of the 2 char string cr-lf */
	4229	if (flags & SCF_DO_SUBSTR) {
	4230	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4231	data->pos_min += 1;
	4232	data->pos_delta += 1;
	4233	data->longest = &(data->longest_float);
	4234	}
	4235	}
	4236	else if (REGNODE_SIMPLE(OP(scan))) {
	4237	int value = 0;
	4238
	4239	if (flags & SCF_DO_SUBSTR) {
	4240	SCAN_COMMIT(pRExC_state,data,minlenp);
	4241	data->pos_min++;
	4242	}
	4243	min++;
	4244	if (flags & SCF_DO_STCLASS) {
	4245	int loop_max = 256;
	4246	CLEAR_SSC_EOS(data->start_class); /* No match on empty */
	4247
	4248	/* Some of the logic below assumes that switching
	4249	locale on will only add false positives. */
	4250	switch (PL_regkind[OP(scan)]) {
	4251	U8 classnum;
	4252
	4253	case SANY:
	4254	default:
	4255	#ifdef DEBUGGING
	4256	Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
	4257	#endif
	4258	do_default:
	4259	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4260	cl_anything(pRExC_state, data->start_class);
	4261	break;
	4262	case REG_ANY:
	4263	if (OP(scan) == SANY)
	4264	goto do_default;
	4265	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	4266	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	4267	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	4268	cl_anything(pRExC_state, data->start_class);
	4269	}
	4270	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	4271	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	4272	break;
	4273	case ANYOF:
	4274	if (flags & SCF_DO_STCLASS_AND)
	4275	cl_and(data->start_class,
	4276	(struct regnode_charclass_class*)scan);
	4277	else
	4278	cl_or(pRExC_state, data->start_class,
	4279	(struct regnode_charclass_class*)scan);
	4280	break;
	4281	case POSIXA:
	4282	loop_max = 128;
	4283	/* FALL THROUGH */
	4284	case POSIXL:
	4285	case POSIXD:
	4286	case POSIXU:
	4287	classnum = FLAGS(scan);
	4288	if (flags & SCF_DO_STCLASS_AND) {
	4289	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4290	ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
	4291	for (value = 0; value < loop_max; value++) {
	4292	if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
	4293	ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
	4294	}
	4295	}
	4296	}
	4297	}
	4298	else {
	4299	if (data->start_class->flags & ANYOF_LOCALE) {
	4300	ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
	4301	}
	4302	else {
	4303
	4304	/* Even if under locale, set the bits for non-locale
	4305	* in case it isn't a true locale-node. This will
	4306	* create false positives if it truly is locale */
	4307	for (value = 0; value < loop_max; value++) {
	4308	if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
	4309	ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
	4310	}
	4311	}
	4312	}
	4313	}
	4314	break;
	4315	case NPOSIXA:
	4316	loop_max = 128;
	4317	/* FALL THROUGH */
	4318	case NPOSIXL:
	4319	case NPOSIXU:
	4320	case NPOSIXD:
	4321	classnum = FLAGS(scan);
	4322	if (flags & SCF_DO_STCLASS_AND) {
	4323	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4324	ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
	4325	for (value = 0; value < loop_max; value++) {
	4326	if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
	4327	ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
	4328	}
	4329	}
	4330	}
	4331	}
	4332	else {
	4333	if (data->start_class->flags & ANYOF_LOCALE) {
	4334	ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
	4335	}
	4336	else {
	4337
	4338	/* Even if under locale, set the bits for non-locale in
	4339	* case it isn't a true locale-node. This will create
	4340	* false positives if it truly is locale */
	4341	for (value = 0; value < loop_max; value++) {
	4342	if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
	4343	ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
	4344	}
	4345	}
	4346	if (PL_regkind[OP(scan)] == NPOSIXD) {
	4347	data->start_class->flags \|= ANYOF_NON_UTF8_LATIN1_ALL;
	4348	}
	4349	}
	4350	}
	4351	break;
	4352	}
	4353	if (flags & SCF_DO_STCLASS_OR)
	4354	cl_and(data->start_class, and_withp);
	4355	flags &= ~SCF_DO_STCLASS;
	4356	}
	4357	}
	4358	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	4359	data->flags \|= (OP(scan) == MEOL
	4360	? SF_BEFORE_MEOL
	4361	: SF_BEFORE_SEOL);
	4362	SCAN_COMMIT(pRExC_state, data, minlenp);
	4363
	4364	}
	4365	else if ( PL_regkind[OP(scan)] == BRANCHJ
	4366	/* Lookbehind, or need to calculate parens/evals/stclass: */
	4367	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4368	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4369	if ( OP(scan) == UNLESSM &&
	4370	scan->flags == 0 &&
	4371	OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
	4372	OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
	4373	) {
	4374	regnode *opt;
	4375	regnode *upto= regnext(scan);
	4376	DEBUG_PARSE_r({
	4377	SV * const mysv_val=sv_newmortal();
	4378	DEBUG_STUDYDATA("OPFAIL",data,depth);
	4379
	4380	/DEBUG_PARSE_MSG("opfail");/
	4381	regprop(RExC_rx, mysv_val, upto);
	4382	PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
	4383	SvPV_nolen_const(mysv_val),
	4384	(IV)REG_NODE_NUM(upto),
	4385	(IV)(upto - scan)
	4386	);
	4387	});
	4388	OP(scan) = OPFAIL;
	4389	NEXT_OFF(scan) = upto - scan;
	4390	for (opt= scan + 1; opt < upto ; opt++)
	4391	OP(opt) = OPTIMIZED;
	4392	scan= upto;
	4393	continue;
	4394	}
	4395	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4396	\|\| OP(scan) == UNLESSM )
	4397	{
	4398	/* Negative Lookahead/lookbehind
	4399	In this case we can't do fixed string optimisation.
	4400	*/
	4401
	4402	SSize_t deltanext, minnext, fake = 0;
	4403	regnode *nscan;
	4404	struct regnode_charclass_class intrnl;
	4405	int f = 0;
	4406
	4407	data_fake.flags = 0;
	4408	if (data) {
	4409	data_fake.whilem_c = data->whilem_c;
	4410	data_fake.last_closep = data->last_closep;
	4411	}
	4412	else
	4413	data_fake.last_closep = &fake;
	4414	data_fake.pos_delta = delta;
	4415	if ( flags & SCF_DO_STCLASS && !scan->flags
	4416	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4417	cl_init(pRExC_state, &intrnl);
	4418	data_fake.start_class = &intrnl;
	4419	f \|= SCF_DO_STCLASS_AND;
	4420	}
	4421	if (flags & SCF_WHILEM_VISITED_POS)
	4422	f \|= SCF_WHILEM_VISITED_POS;
	4423	next = regnext(scan);
	4424	nscan = NEXTOPER(NEXTOPER(scan));
	4425	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4426	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4427	if (scan->flags) {
	4428	if (deltanext) {
	4429	FAIL("Variable length lookbehind not implemented");
	4430	}
	4431	else if (minnext > (I32)U8_MAX) {
	4432	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4433	}
	4434	scan->flags = (U8)minnext;
	4435	}
	4436	if (data) {
	4437	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4438	pars++;
	4439	if (data_fake.flags & SF_HAS_EVAL)
	4440	data->flags \|= SF_HAS_EVAL;
	4441	data->whilem_c = data_fake.whilem_c;
	4442	}
	4443	if (f & SCF_DO_STCLASS_AND) {
	4444	if (flags & SCF_DO_STCLASS_OR) {
	4445	/* OR before, AND after: ideally we would recurse with
	4446	* data_fake to get the AND applied by study of the
	4447	* remainder of the pattern, and then derecurse;
	4448	* * HACK * for now just treat as "no information".
	4449	* See [perl #56690].
	4450	*/
	4451	cl_init(pRExC_state, data->start_class);
	4452	} else {
	4453	/* AND before and after: combine and continue */
	4454	const int was = TEST_SSC_EOS(data->start_class);
	4455
	4456	cl_and(data->start_class, &intrnl);
	4457	if (was)
	4458	SET_SSC_EOS(data->start_class);
	4459	}
	4460	}
	4461	}
	4462	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4463	else {
	4464	/* Positive Lookahead/lookbehind
	4465	In this case we can do fixed string optimisation,
	4466	but we must be careful about it. Note in the case of
	4467	lookbehind the positions will be offset by the minimum
	4468	length of the pattern, something we won't know about
	4469	until after the recurse.
	4470	*/
	4471	SSize_t deltanext;
	4472	I32 fake = 0;
	4473	regnode *nscan;
	4474	struct regnode_charclass_class intrnl;
	4475	int f = 0;
	4476	/* We use SAVEFREEPV so that when the full compile
	4477	is finished perl will clean up the allocated
	4478	minlens when it's all done. This way we don't
	4479	have to worry about freeing them when we know
	4480	they wont be used, which would be a pain.
	4481	*/
	4482	SSize_t *minnextp;
	4483	Newx( minnextp, 1, SSize_t );
	4484	SAVEFREEPV(minnextp);
	4485
	4486	if (data) {
	4487	StructCopy(data, &data_fake, scan_data_t);
	4488	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4489	f \|= SCF_DO_SUBSTR;
	4490	if (scan->flags)
	4491	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4492	data_fake.last_found=newSVsv(data->last_found);
	4493	}
	4494	}
	4495	else
	4496	data_fake.last_closep = &fake;
	4497	data_fake.flags = 0;
	4498	data_fake.pos_delta = delta;
	4499	if (is_inf)
	4500	data_fake.flags \|= SF_IS_INF;
	4501	if ( flags & SCF_DO_STCLASS && !scan->flags
	4502	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4503	cl_init(pRExC_state, &intrnl);
	4504	data_fake.start_class = &intrnl;
	4505	f \|= SCF_DO_STCLASS_AND;
	4506	}
	4507	if (flags & SCF_WHILEM_VISITED_POS)
	4508	f \|= SCF_WHILEM_VISITED_POS;
	4509	next = regnext(scan);
	4510	nscan = NEXTOPER(NEXTOPER(scan));
	4511
	4512	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4513	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4514	if (scan->flags) {
	4515	if (deltanext) {
	4516	FAIL("Variable length lookbehind not implemented");
	4517	}
	4518	else if (*minnextp > (I32)U8_MAX) {
	4519	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4520	}
	4521	scan->flags = (U8)*minnextp;
	4522	}
	4523
	4524	*minnextp += min;
	4525
	4526	if (f & SCF_DO_STCLASS_AND) {
	4527	const int was = TEST_SSC_EOS(data.start_class);
	4528
	4529	cl_and(data->start_class, &intrnl);
	4530	if (was)
	4531	SET_SSC_EOS(data->start_class);
	4532	}
	4533	if (data) {
	4534	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4535	pars++;
	4536	if (data_fake.flags & SF_HAS_EVAL)
	4537	data->flags \|= SF_HAS_EVAL;
	4538	data->whilem_c = data_fake.whilem_c;
	4539	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4540	if (RExC_rx->minlen<*minnextp)
	4541	RExC_rx->minlen=*minnextp;
	4542	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4543	SvREFCNT_dec_NN(data_fake.last_found);
	4544
	4545	if ( data_fake.minlen_fixed != minlenp )
	4546	{
	4547	data->offset_fixed= data_fake.offset_fixed;
	4548	data->minlen_fixed= data_fake.minlen_fixed;
	4549	data->lookbehind_fixed+= scan->flags;
	4550	}
	4551	if ( data_fake.minlen_float != minlenp )
	4552	{
	4553	data->minlen_float= data_fake.minlen_float;
	4554	data->offset_float_min=data_fake.offset_float_min;
	4555	data->offset_float_max=data_fake.offset_float_max;
	4556	data->lookbehind_float+= scan->flags;
	4557	}
	4558	}
	4559	}
	4560	}
	4561	#endif
	4562	}
	4563	else if (OP(scan) == OPEN) {
	4564	if (stopparen != (I32)ARG(scan))
	4565	pars++;
	4566	}
	4567	else if (OP(scan) == CLOSE) {
	4568	if (stopparen == (I32)ARG(scan)) {
	4569	break;
	4570	}
	4571	if ((I32)ARG(scan) == is_par) {
	4572	next = regnext(scan);
	4573
	4574	if ( next && (OP(next) != WHILEM) && next < last)
	4575	is_par = 0; /* Disable optimization */
	4576	}
	4577	if (data)
	4578	*(data->last_closep) = ARG(scan);
	4579	}
	4580	else if (OP(scan) == EVAL) {
	4581	if (data)
	4582	data->flags \|= SF_HAS_EVAL;
	4583	}
	4584	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4585	if (flags & SCF_DO_SUBSTR) {
	4586	SCAN_COMMIT(pRExC_state,data,minlenp);
	4587	flags &= ~SCF_DO_SUBSTR;
	4588	}
	4589	if (data && OP(scan)==ACCEPT) {
	4590	data->flags \|= SCF_SEEN_ACCEPT;
	4591	if (stopmin > min)
	4592	stopmin = min;
	4593	}
	4594	}
	4595	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4596	{
	4597	if (flags & SCF_DO_SUBSTR) {
	4598	SCAN_COMMIT(pRExC_state,data,minlenp);
	4599	data->longest = &(data->longest_float);
	4600	}
	4601	is_inf = is_inf_internal = 1;
	4602	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4603	cl_anything(pRExC_state, data->start_class);
	4604	flags &= ~SCF_DO_STCLASS;
	4605	}
	4606	else if (OP(scan) == GPOS) {
	4607	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4608	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4609	{
	4610	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4611	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4612	if (RExC_rx->gofs < (STRLEN)min)
	4613	RExC_rx->gofs = min;
	4614	} else {
	4615	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4616	RExC_rx->gofs = 0;
	4617	}
	4618	}
	4619	#ifdef TRIE_STUDY_OPT
	4620	#ifdef FULL_TRIE_STUDY
	4621	else if (PL_regkind[OP(scan)] == TRIE) {
	4622	/* NOTE - There is similar code to this block above for handling
	4623	BRANCH nodes on the initial study. If you change stuff here
	4624	check there too. */
	4625	regnode *trie_node= scan;
	4626	regnode *tail= regnext(scan);
	4627	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4628	SSize_t max1 = 0, min1 = SSize_t_MAX;
	4629	struct regnode_charclass_class accum;
	4630
	4631	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4632	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4633	if (flags & SCF_DO_STCLASS)
	4634	cl_init_zero(pRExC_state, &accum);
	4635
	4636	if (!trie->jump) {
	4637	min1= trie->minlen;
	4638	max1= trie->maxlen;
	4639	} else {
	4640	const regnode *nextbranch= NULL;
	4641	U32 word;
	4642
	4643	for ( word=1 ; word <= trie->wordcount ; word++)
	4644	{
	4645	SSize_t deltanext=0, minnext=0, f = 0, fake;
	4646	struct regnode_charclass_class this_class;
	4647
	4648	data_fake.flags = 0;
	4649	if (data) {
	4650	data_fake.whilem_c = data->whilem_c;
	4651	data_fake.last_closep = data->last_closep;
	4652	}
	4653	else
	4654	data_fake.last_closep = &fake;
	4655	data_fake.pos_delta = delta;
	4656	if (flags & SCF_DO_STCLASS) {
	4657	cl_init(pRExC_state, &this_class);
	4658	data_fake.start_class = &this_class;
	4659	f = SCF_DO_STCLASS_AND;
	4660	}
	4661	if (flags & SCF_WHILEM_VISITED_POS)
	4662	f \|= SCF_WHILEM_VISITED_POS;
	4663
	4664	if (trie->jump[word]) {
	4665	if (!nextbranch)
	4666	nextbranch = trie_node + trie->jump[0];
	4667	scan= trie_node + trie->jump[word];
	4668	/* We go from the jump point to the branch that follows
	4669	it. Note this means we need the vestigal unused branches
	4670	even though they arent otherwise used.
	4671	*/
	4672	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4673	&deltanext, (regnode *)nextbranch, &data_fake,
	4674	stopparen, recursed, NULL, f,depth+1);
	4675	}
	4676	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4677	nextbranch= regnext((regnode*)nextbranch);
	4678
	4679	if (min1 > (SSize_t)(minnext + trie->minlen))
	4680	min1 = minnext + trie->minlen;
	4681	if (deltanext == SSize_t_MAX) {
	4682	is_inf = is_inf_internal = 1;
	4683	max1 = SSize_t_MAX;
	4684	} else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
	4685	max1 = minnext + deltanext + trie->maxlen;
	4686
	4687	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4688	pars++;
	4689	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4690	if ( stopmin > min + min1)
	4691	stopmin = min + min1;
	4692	flags &= ~SCF_DO_SUBSTR;
	4693	if (data)
	4694	data->flags \|= SCF_SEEN_ACCEPT;
	4695	}
	4696	if (data) {
	4697	if (data_fake.flags & SF_HAS_EVAL)
	4698	data->flags \|= SF_HAS_EVAL;
	4699	data->whilem_c = data_fake.whilem_c;
	4700	}
	4701	if (flags & SCF_DO_STCLASS)
	4702	cl_or(pRExC_state, &accum, &this_class);
	4703	}
	4704	}
	4705	if (flags & SCF_DO_SUBSTR) {
	4706	data->pos_min += min1;
	4707	data->pos_delta += max1 - min1;
	4708	if (max1 != min1 \|\| is_inf)
	4709	data->longest = &(data->longest_float);
	4710	}
	4711	min += min1;
	4712	delta += max1 - min1;
	4713	if (flags & SCF_DO_STCLASS_OR) {
	4714	cl_or(pRExC_state, data->start_class, &accum);
	4715	if (min1) {
	4716	cl_and(data->start_class, and_withp);
	4717	flags &= ~SCF_DO_STCLASS;
	4718	}
	4719	}
	4720	else if (flags & SCF_DO_STCLASS_AND) {
	4721	if (min1) {
	4722	cl_and(data->start_class, &accum);
	4723	flags &= ~SCF_DO_STCLASS;
	4724	}
	4725	else {
	4726	/* Switch to OR mode: cache the old value of
	4727	* data->start_class */
	4728	INIT_AND_WITHP;
	4729	StructCopy(data->start_class, and_withp,
	4730	struct regnode_charclass_class);
	4731	flags &= ~SCF_DO_STCLASS_AND;
	4732	StructCopy(&accum, data->start_class,
	4733	struct regnode_charclass_class);
	4734	flags \|= SCF_DO_STCLASS_OR;
	4735	SET_SSC_EOS(data->start_class);
	4736	}
	4737	}
	4738	scan= tail;
	4739	continue;
	4740	}
	4741	#else
	4742	else if (PL_regkind[OP(scan)] == TRIE) {
	4743	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4744	U8*bang=NULL;
	4745
	4746	min += trie->minlen;
	4747	delta += (trie->maxlen - trie->minlen);
	4748	flags &= ~SCF_DO_STCLASS; /* xxx */
	4749	if (flags & SCF_DO_SUBSTR) {
	4750	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4751	data->pos_min += trie->minlen;
	4752	data->pos_delta += (trie->maxlen - trie->minlen);
	4753	if (trie->maxlen != trie->minlen)
	4754	data->longest = &(data->longest_float);
	4755	}
	4756	if (trie->jump) /* no more substrings -- for now /grr*/
	4757	flags &= ~SCF_DO_SUBSTR;
	4758	}
	4759	#endif /* old or new */
	4760	#endif /* TRIE_STUDY_OPT */
	4761
	4762	/* Else: zero-length, ignore. */
	4763	scan = regnext(scan);
	4764	}
	4765	if (frame) {
	4766	last = frame->last;
	4767	scan = frame->next;
	4768	stopparen = frame->stop;
	4769	frame = frame->prev;
	4770	goto fake_study_recurse;
	4771	}
	4772
	4773	finish:
	4774	assert(!frame);
	4775	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4776
	4777	*scanp = scan;
	4778	*deltap = is_inf_internal ? SSize_t_MAX : delta;
	4779	if (flags & SCF_DO_SUBSTR && is_inf)
	4780	data->pos_delta = SSize_t_MAX - data->pos_min;
	4781	if (is_par > (I32)U8_MAX)
	4782	is_par = 0;
	4783	if (is_par && pars==1 && data) {
	4784	data->flags \|= SF_IN_PAR;
	4785	data->flags &= ~SF_HAS_PAR;
	4786	}
	4787	else if (pars && data) {
	4788	data->flags \|= SF_HAS_PAR;
	4789	data->flags &= ~SF_IN_PAR;
	4790	}
	4791	if (flags & SCF_DO_STCLASS_OR)
	4792	cl_and(data->start_class, and_withp);
	4793	if (flags & SCF_TRIE_RESTUDY)
	4794	data->flags \|= SCF_TRIE_RESTUDY;
	4795
	4796	DEBUG_STUDYDATA("post-fin:",data,depth);
	4797
	4798	return min < stopmin ? min : stopmin;
	4799	}
	4800
	4801	STATIC U32
	4802	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4803	{
	4804	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4805
	4806	PERL_ARGS_ASSERT_ADD_DATA;
	4807
	4808	Renewc(RExC_rxi->data,
	4809	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4810	char, struct reg_data);
	4811	if(count)
	4812	Renew(RExC_rxi->data->what, count + n, U8);
	4813	else
	4814	Newx(RExC_rxi->data->what, n, U8);
	4815	RExC_rxi->data->count = count + n;
	4816	Copy(s, RExC_rxi->data->what + count, n, U8);
	4817	return count;
	4818	}
	4819
	4820	/XXX: todo make this not included in a non debugging perl /
	4821	#ifndef PERL_IN_XSUB_RE
	4822	void
	4823	Perl_reginitcolors(pTHX)
	4824	{
	4825	dVAR;
	4826	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4827	if (s) {
	4828	char *t = savepv(s);
	4829	int i = 0;
	4830	PL_colors[0] = t;
	4831	while (++i < 6) {
	4832	t = strchr(t, '\t');
	4833	if (t) {
	4834	*t = '\0';
	4835	PL_colors[i] = ++t;
	4836	}
	4837	else
	4838	PL_colors[i] = t = (char *)"";
	4839	}
	4840	} else {
	4841	int i = 0;
	4842	while (i < 6)
	4843	PL_colors[i++] = (char *)"";
	4844	}
	4845	PL_colorset = 1;
	4846	}
	4847	#endif
	4848
	4849
	4850	#ifdef TRIE_STUDY_OPT
	4851	#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
	4852	STMT_START { \
	4853	if ( \
	4854	(data.flags & SCF_TRIE_RESTUDY) \
	4855	&& ! restudied++ \
	4856	) { \
	4857	dOsomething; \
	4858	goto reStudy; \
	4859	} \
	4860	} STMT_END
	4861	#else
	4862	#define CHECK_RESTUDY_GOTO_butfirst
	4863	#endif
	4864
	4865	/*
	4866	* pregcomp - compile a regular expression into internal code
	4867	*
	4868	* Decides which engine's compiler to call based on the hint currently in
	4869	* scope
	4870	*/
	4871
	4872	#ifndef PERL_IN_XSUB_RE
	4873
	4874	/* return the currently in-scope regex engine (or the default if none) */
	4875
	4876	regexp_engine const *
	4877	Perl_current_re_engine(pTHX)
	4878	{
	4879	dVAR;
	4880
	4881	if (IN_PERL_COMPILETIME) {
	4882	HV * const table = GvHV(PL_hintgv);
	4883	SV **ptr;
	4884
	4885	if (!table)
	4886	return &PL_core_reg_engine;
	4887	ptr = hv_fetchs(table, "regcomp", FALSE);
	4888	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4889	return &PL_core_reg_engine;
	4890	return INT2PTR(regexp_engine,SvIV(ptr));
	4891	}
	4892	else {
	4893	SV *ptr;
	4894	if (!PL_curcop->cop_hints_hash)
	4895	return &PL_core_reg_engine;
	4896	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
	4897	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4898	return &PL_core_reg_engine;
	4899	return INT2PTR(regexp_engine*,SvIV(ptr));
	4900	}
	4901	}
	4902
	4903
	4904	REGEXP *
	4905	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4906	{
	4907	dVAR;
	4908	regexp_engine const *eng = current_re_engine();
	4909	GET_RE_DEBUG_FLAGS_DECL;
	4910
	4911	PERL_ARGS_ASSERT_PREGCOMP;
	4912
	4913	/* Dispatch a request to compile a regexp to correct regexp engine. */
	4914	DEBUG_COMPILE_r({
	4915	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4916	PTR2UV(eng));
	4917	});
	4918	return CALLREGCOMP_ENG(eng, pattern, flags);
	4919	}
	4920	#endif
	4921
	4922	/* public(ish) entry point for the perl core's own regex compiling code.
	4923	* It's actually a wrapper for Perl_re_op_compile that only takes an SV
	4924	* pattern rather than a list of OPs, and uses the internal engine rather
	4925	* than the current one */
	4926
	4927	REGEXP *
	4928	Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
	4929	{
	4930	SV pat = pattern; / defeat constness! */
	4931	PERL_ARGS_ASSERT_RE_COMPILE;
	4932	return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
	4933	#ifdef PERL_IN_XSUB_RE
	4934	&my_reg_engine,
	4935	#else
	4936	&PL_core_reg_engine,
	4937	#endif
	4938	NULL, NULL, rx_flags, 0);
	4939	}
	4940
	4941
	4942	/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
	4943	* blocks, recalculate the indices. Update pat_p and plen_p in-place to
	4944	* point to the realloced string and length.
	4945	*
	4946	* This is essentially a copy of Perl_bytes_to_utf8() with the code index
	4947	* stuff added */
	4948
	4949	static void
	4950	S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
	4951	char *pat_p, STRLEN plen_p, int num_code_blocks)
	4952	{
	4953	U8 const src = (U8)*pat_p;
	4954	U8 *dst;
	4955	int n=0;
	4956	STRLEN s = 0, d = 0;
	4957	bool do_end = 0;
	4958	GET_RE_DEBUG_FLAGS_DECL;
	4959
	4960	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	4961	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	4962
	4963	Newx(dst, plen_p 2 + 1, U8);
	4964
	4965	while (s < *plen_p) {
	4966	if (NATIVE_IS_INVARIANT(src[s]))
	4967	dst[d] = src[s];
	4968	else {
	4969	dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
	4970	dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
	4971	}
	4972	if (n < num_code_blocks) {
	4973	if (!do_end && pRExC_state->code_blocks[n].start == s) {
	4974	pRExC_state->code_blocks[n].start = d;
	4975	assert(dst[d] == '(');
	4976	do_end = 1;
	4977	}
	4978	else if (do_end && pRExC_state->code_blocks[n].end == s) {
	4979	pRExC_state->code_blocks[n].end = d;
	4980	assert(dst[d] == ')');
	4981	do_end = 0;
	4982	n++;
	4983	}
	4984	}
	4985	s++;
	4986	d++;
	4987	}
	4988	dst[d] = '\0';
	4989	*plen_p = d;
	4990	pat_p = (char) dst;
	4991	SAVEFREEPV(*pat_p);
	4992	RExC_orig_utf8 = RExC_utf8 = 1;
	4993	}
	4994
	4995
	4996
	4997	/* S_concat_pat(): concatenate a list of args to the pattern string pat,
	4998	* while recording any code block indices, and handling overloading,
	4999	* nested qr// objects etc. If pat is null, it will allocate a new
	5000	* string, or just return the first arg, if there's only one.
	5001	*
	5002	* Returns the malloced/updated pat.
	5003	* patternp and pat_count is the array of SVs to be concatted;
	5004	* oplist is the optional list of ops that generated the SVs;
	5005	* recompile_p is a pointer to a boolean that will be set if
	5006	* the regex will need to be recompiled.
	5007	* delim, if non-null is an SV that will be inserted between each element
	5008	*/
	5009
	5010	static SV*
	5011	S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
	5012	SV pat, SV * const patternp, int pat_count,
	5013	OP oplist, bool recompile_p, SV *delim)
	5014	{
	5015	SV **svp;
	5016	int n = 0;
	5017	bool use_delim = FALSE;
	5018	bool alloced = FALSE;
	5019
	5020	/* if we know we have at least two args, create an empty string,
	5021	* then concatenate args to that. For no args, return an empty string */
	5022	if (!pat && pat_count != 1) {
	5023	pat = newSVpvn("", 0);
	5024	SAVEFREESV(pat);
	5025	alloced = TRUE;
	5026	}
	5027
	5028	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5029	SV *sv;
	5030	SV *rx = NULL;
	5031	STRLEN orig_patlen = 0;
	5032	bool code = 0;
	5033	SV msv = use_delim ? delim : svp;
	5034
	5035	/* if we've got a delimiter, we go round the loop twice for each
	5036	* svp slot (except the last), using the delimiter the second
	5037	* time round */
	5038	if (use_delim) {
	5039	svp--;
	5040	use_delim = FALSE;
	5041	}
	5042	else if (delim)
	5043	use_delim = TRUE;
	5044
	5045	if (SvTYPE(msv) == SVt_PVAV) {
	5046	/* we've encountered an interpolated array within
	5047	* the pattern, e.g. /...@a..../. Expand the list of elements,
	5048	* then recursively append elements.
	5049	* The code in this block is based on S_pushav() */
	5050
	5051	AV const av = (AV)msv;
	5052	const I32 maxarg = AvFILL(av) + 1;
	5053	SV **array;
	5054
	5055	if (oplist) {
	5056	assert(oplist->op_type == OP_PADAV
	5057	\|\| oplist->op_type == OP_RV2AV);
	5058	oplist = oplist->op_sibling;;
	5059	}
	5060
	5061	if (SvRMAGICAL(av)) {
	5062	U32 i;
	5063
	5064	Newx(array, maxarg, SV*);
	5065	SAVEFREEPV(array);
	5066	for (i=0; i < (U32)maxarg; i++) {
	5067	SV ** const svp = av_fetch(av, i, FALSE);
	5068	array[i] = svp ? *svp : &PL_sv_undef;
	5069	}
	5070	}
	5071	else
	5072	array = AvARRAY(av);
	5073
	5074	pat = S_concat_pat(aTHX_ pRExC_state, pat,
	5075	array, maxarg, NULL, recompile_p,
	5076	/* $" */
	5077	GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
	5078
	5079	continue;
	5080	}
	5081
	5082
	5083	/* we make the assumption here that each op in the list of
	5084	* op_siblings maps to one SV pushed onto the stack,
	5085	* except for code blocks, with have both an OP_NULL and
	5086	* and OP_CONST.
	5087	* This allows us to match up the list of SVs against the
	5088	* list of OPs to find the next code block.
	5089	*
	5090	* Note that PUSHMARK PADSV PADSV ..
	5091	* is optimised to
	5092	* PADRANGE PADSV PADSV ..
	5093	* so the alignment still works. */
	5094
	5095	if (oplist) {
	5096	if (oplist->op_type == OP_NULL
	5097	&& (oplist->op_flags & OPf_SPECIAL))
	5098	{
	5099	assert(n < pRExC_state->num_code_blocks);
	5100	pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
	5101	pRExC_state->code_blocks[n].block = oplist;
	5102	pRExC_state->code_blocks[n].src_regex = NULL;
	5103	n++;
	5104	code = 1;
	5105	oplist = oplist->op_sibling; /* skip CONST */
	5106	assert(oplist);
	5107	}
	5108	oplist = oplist->op_sibling;;
	5109	}
	5110
	5111	/* apply magic and QR overloading to arg */
	5112
	5113	SvGETMAGIC(msv);
	5114	if (SvROK(msv) && SvAMAGIC(msv)) {
	5115	SV *sv = AMG_CALLunary(msv, regexp_amg);
	5116	if (sv) {
	5117	if (SvROK(sv))
	5118	sv = SvRV(sv);
	5119	if (SvTYPE(sv) != SVt_REGEXP)
	5120	Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
	5121	msv = sv;
	5122	}
	5123	}
	5124
	5125	/* try concatenation overload ... */
	5126	if (pat && (SvAMAGIC(pat) \|\| SvAMAGIC(msv)) &&
	5127	(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
	5128	{
	5129	sv_setsv(pat, sv);
	5130	/* overloading involved: all bets are off over literal
	5131	* code. Pretend we haven't seen it */
	5132	pRExC_state->num_code_blocks -= n;
	5133	n = 0;
	5134	}
	5135	else {
	5136	/* ... or failing that, try "" overload */
	5137	while (SvAMAGIC(msv)
	5138	&& (sv = AMG_CALLunary(msv, string_amg))
	5139	&& sv != msv
	5140	&& !( SvROK(msv)
	5141	&& SvROK(sv)
	5142	&& SvRV(msv) == SvRV(sv))
	5143	) {
	5144	msv = sv;
	5145	SvGETMAGIC(msv);
	5146	}
	5147	if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
	5148	msv = SvRV(msv);
	5149
	5150	if (pat) {
	5151	/* this is a partially unrolled
	5152	* sv_catsv_nomg(pat, msv);
	5153	* that allows us to adjust code block indices if
	5154	* needed */
	5155	STRLEN dlen;
	5156	char *dst = SvPV_force_nomg(pat, dlen);
	5157	orig_patlen = dlen;
	5158	if (SvUTF8(msv) && !SvUTF8(pat)) {
	5159	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
	5160	sv_setpvn(pat, dst, dlen);
	5161	SvUTF8_on(pat);
	5162	}
	5163	sv_catsv_nomg(pat, msv);
	5164	rx = msv;
	5165	}
	5166	else
	5167	pat = msv;
	5168
	5169	if (code)
	5170	pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
	5171	}
	5172
	5173	/* extract any code blocks within any embedded qr//'s */
	5174	if (rx && SvTYPE(rx) == SVt_REGEXP
	5175	&& RX_ENGINE((REGEXP*)rx)->op_comp)
	5176	{
	5177
	5178	RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
	5179	if (ri->num_code_blocks) {
	5180	int i;
	5181	/* the presence of an embedded qr// with code means
	5182	* we should always recompile: the text of the
	5183	* qr// may not have changed, but it may be a
	5184	* different closure than last time */
	5185	*recompile_p = 1;
	5186	Renew(pRExC_state->code_blocks,
	5187	pRExC_state->num_code_blocks + ri->num_code_blocks,
	5188	struct reg_code_block);
	5189	pRExC_state->num_code_blocks += ri->num_code_blocks;
	5190
	5191	for (i=0; i < ri->num_code_blocks; i++) {
	5192	struct reg_code_block src, dst;
	5193	STRLEN offset = orig_patlen
	5194	+ ReANY((REGEXP *)rx)->pre_prefix;
	5195	assert(n < pRExC_state->num_code_blocks);
	5196	src = &ri->code_blocks[i];
	5197	dst = &pRExC_state->code_blocks[n];
	5198	dst->start = src->start + offset;
	5199	dst->end = src->end + offset;
	5200	dst->block = src->block;
	5201	dst->src_regex = (REGEXP) SvREFCNT_inc( (SV)
	5202	src->src_regex
	5203	? src->src_regex
	5204	: (REGEXP*)rx);
	5205	n++;
	5206	}
	5207	}
	5208	}
	5209	}
	5210	/* avoid calling magic multiple times on a single element e.g. =~ $qr */
	5211	if (alloced)
	5212	SvSETMAGIC(pat);
	5213
	5214	return pat;
	5215	}
	5216
	5217
	5218
	5219	/* see if there are any run-time code blocks in the pattern.
	5220	* False positives are allowed */
	5221
	5222	static bool
	5223	S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5224	char *pat, STRLEN plen)
	5225	{
	5226	int n = 0;
	5227	STRLEN s;
	5228
	5229	for (s = 0; s < plen; s++) {
	5230	if (n < pRExC_state->num_code_blocks
	5231	&& s == pRExC_state->code_blocks[n].start)
	5232	{
	5233	s = pRExC_state->code_blocks[n].end;
	5234	n++;
	5235	continue;
	5236	}
	5237	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
	5238	* positives here */
	5239	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
	5240	(pat[s+2] == '{'
	5241	\|\| (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
	5242	)
	5243	return 1;
	5244	}
	5245	return 0;
	5246	}
	5247
	5248	/* Handle run-time code blocks. We will already have compiled any direct
	5249	* or indirect literal code blocks. Now, take the pattern 'pat' and make a
	5250	* copy of it, but with any literal code blocks blanked out and
	5251	* appropriate chars escaped; then feed it into
	5252	*
	5253	* eval "qr'modified_pattern'"
	5254	*
	5255	* For example,
	5256	*
	5257	* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
	5258	*
	5259	* becomes
	5260	*
	5261	* qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
	5262	*
	5263	* After eval_sv()-ing that, grab any new code blocks from the returned qr
	5264	* and merge them with any code blocks of the original regexp.
	5265	*
	5266	* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
	5267	* instead, just save the qr and return FALSE; this tells our caller that
	5268	* the original pattern needs upgrading to utf8.
	5269	*/
	5270
	5271	static bool
	5272	S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5273	char *pat, STRLEN plen)
	5274	{
	5275	SV *qr;
	5276
	5277	GET_RE_DEBUG_FLAGS_DECL;
	5278
	5279	if (pRExC_state->runtime_code_qr) {
	5280	/* this is the second time we've been called; this should
	5281	* only happen if the main pattern got upgraded to utf8
	5282	* during compilation; re-use the qr we compiled first time
	5283	* round (which should be utf8 too)
	5284	*/
	5285	qr = pRExC_state->runtime_code_qr;
	5286	pRExC_state->runtime_code_qr = NULL;
	5287	assert(RExC_utf8 && SvUTF8(qr));
	5288	}
	5289	else {
	5290	int n = 0;
	5291	STRLEN s;
	5292	char p, newpat;
	5293	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
	5294	SV sv, qr_ref;
	5295	dSP;
	5296
	5297	/* determine how many extra chars we need for ' and \ escaping */
	5298	for (s = 0; s < plen; s++) {
	5299	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5300	newlen++;
	5301	}
	5302
	5303	Newx(newpat, newlen, char);
	5304	p = newpat;
	5305	p++ = 'q'; p++ = 'r'; *p++ = '\'';
	5306
	5307	for (s = 0; s < plen; s++) {
	5308	if (n < pRExC_state->num_code_blocks
	5309	&& s == pRExC_state->code_blocks[n].start)
	5310	{
	5311	/* blank out literal code block */
	5312	assert(pat[s] == '(');
	5313	while (s <= pRExC_state->code_blocks[n].end) {
	5314	*p++ = '_';
	5315	s++;
	5316	}
	5317	s--;
	5318	n++;
	5319	continue;
	5320	}
	5321	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5322	*p++ = '\\';
	5323	*p++ = pat[s];
	5324	}
	5325	*p++ = '\'';
	5326	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
	5327	*p++ = 'x';
	5328	*p++ = '\0';
	5329	DEBUG_COMPILE_r({
	5330	PerlIO_printf(Perl_debug_log,
	5331	"%sre-parsing pattern for runtime code:%s %s\n",
	5332	PL_colors[4],PL_colors[5],newpat);
	5333	});
	5334
	5335	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
	5336	Safefree(newpat);
	5337
	5338	ENTER;
	5339	SAVETMPS;
	5340	save_re_context();
	5341	PUSHSTACKi(PERLSI_REQUIRE);
	5342	/* G_RE_REPARSING causes the toker to collapse \\ into \ when
	5343	* parsing qr''; normally only q'' does this. It also alters
	5344	* hints handling */
	5345	eval_sv(sv, G_SCALAR\|G_RE_REPARSING);
	5346	SvREFCNT_dec_NN(sv);
	5347	SPAGAIN;
	5348	qr_ref = POPs;
	5349	PUTBACK;
	5350	{
	5351	SV * const errsv = ERRSV;
	5352	if (SvTRUE_NN(errsv))
	5353	{
	5354	Safefree(pRExC_state->code_blocks);
	5355	/* use croak_sv ? */
	5356	Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
	5357	}
	5358	}
	5359	assert(SvROK(qr_ref));
	5360	qr = SvRV(qr_ref);
	5361	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
	5362	/* the leaving below frees the tmp qr_ref.
	5363	* Give qr a life of its own */
	5364	SvREFCNT_inc(qr);
	5365	POPSTACK;
	5366	FREETMPS;
	5367	LEAVE;
	5368
	5369	}
	5370
	5371	if (!RExC_utf8 && SvUTF8(qr)) {
	5372	/* first time through; the pattern got upgraded; save the
	5373	* qr for the next time through */
	5374	assert(!pRExC_state->runtime_code_qr);
	5375	pRExC_state->runtime_code_qr = qr;
	5376	return 0;
	5377	}
	5378
	5379
	5380	/* extract any code blocks within the returned qr// */
	5381
	5382
	5383	/* merge the main (r1) and run-time (r2) code blocks into one */
	5384	{
	5385	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
	5386	struct reg_code_block new_block, dst;
	5387	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
	5388	int i1 = 0, i2 = 0;
	5389
	5390	if (!r2->num_code_blocks) /* we guessed wrong */
	5391	{
	5392	SvREFCNT_dec_NN(qr);
	5393	return 1;
	5394	}
	5395
	5396	Newx(new_block,
	5397	r1->num_code_blocks + r2->num_code_blocks,
	5398	struct reg_code_block);
	5399	dst = new_block;
	5400
	5401	while ( i1 < r1->num_code_blocks
	5402	\|\| i2 < r2->num_code_blocks)
	5403	{
	5404	struct reg_code_block *src;
	5405	bool is_qr = 0;
	5406
	5407	if (i1 == r1->num_code_blocks) {
	5408	src = &r2->code_blocks[i2++];
	5409	is_qr = 1;
	5410	}
	5411	else if (i2 == r2->num_code_blocks)
	5412	src = &r1->code_blocks[i1++];
	5413	else if ( r1->code_blocks[i1].start
	5414	< r2->code_blocks[i2].start)
	5415	{
	5416	src = &r1->code_blocks[i1++];
	5417	assert(src->end < r2->code_blocks[i2].start);
	5418	}
	5419	else {
	5420	assert( r1->code_blocks[i1].start
	5421	> r2->code_blocks[i2].start);
	5422	src = &r2->code_blocks[i2++];
	5423	is_qr = 1;
	5424	assert(src->end < r1->code_blocks[i1].start);
	5425	}
	5426
	5427	assert(pat[src->start] == '(');
	5428	assert(pat[src->end] == ')');
	5429	dst->start = src->start;
	5430	dst->end = src->end;
	5431	dst->block = src->block;
	5432	dst->src_regex = is_qr ? (REGEXP) SvREFCNT_inc( (SV) qr)
	5433	: src->src_regex;
	5434	dst++;
	5435	}
	5436	r1->num_code_blocks += r2->num_code_blocks;
	5437	Safefree(r1->code_blocks);
	5438	r1->code_blocks = new_block;
	5439	}
	5440
	5441	SvREFCNT_dec_NN(qr);
	5442	return 1;
	5443	}
	5444
	5445
	5446	STATIC bool
	5447	S_setup_longest(pTHX_ RExC_state_t pRExC_state, SV sv_longest, SV rx_utf8, SV rx_substr, SSize_t* rx_end_shift,
	5448	SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
	5449	{
	5450	/* This is the common code for setting up the floating and fixed length
	5451	* string data extracted from Perl_re_op_compile() below. Returns a boolean
	5452	* as to whether succeeded or not */
	5453
	5454	I32 t;
	5455	SSize_t ml;
	5456
	5457	if (! (longest_length
	5458	\|\| (eol /* Can't have SEOL and MULTI */
	5459	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)))
	5460	)
	5461	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5462	\|\| (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
	5463	{
	5464	return FALSE;
	5465	}
	5466
	5467	/* copy the information about the longest from the reg_scan_data
	5468	over to the program. */
	5469	if (SvUTF8(sv_longest)) {
	5470	*rx_utf8 = sv_longest;
	5471	*rx_substr = NULL;
	5472	} else {
	5473	*rx_substr = sv_longest;
	5474	*rx_utf8 = NULL;
	5475	}
	5476	/* end_shift is how many chars that must be matched that
	5477	follow this item. We calculate it ahead of time as once the
	5478	lookbehind offset is added in we lose the ability to correctly
	5479	calculate it.*/
	5480	ml = minlen ? *(minlen) : (SSize_t)longest_length;
	5481	*rx_end_shift = ml - offset
	5482	- longest_length + (SvTAIL(sv_longest) != 0)
	5483	+ lookbehind;
	5484
	5485	t = (eol/* Can't have SEOL and MULTI */
	5486	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5487	fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
	5488
	5489	return TRUE;
	5490	}
	5491
	5492	/*
	5493	* Perl_re_op_compile - the perl internal RE engine's function to compile a
	5494	* regular expression into internal code.
	5495	* The pattern may be passed either as:
	5496	* a list of SVs (patternp plus pat_count)
	5497	* a list of OPs (expr)
	5498	* If both are passed, the SV list is used, but the OP list indicates
	5499	* which SVs are actually pre-compiled code blocks
	5500	*
	5501	* The SVs in the list have magic and qr overloading applied to them (and
	5502	* the list may be modified in-place with replacement SVs in the latter
	5503	* case).
	5504	*
	5505	* If the pattern hasn't changed from old_re, then old_re will be
	5506	* returned.
	5507	*
	5508	* eng is the current engine. If that engine has an op_comp method, then
	5509	* handle directly (i.e. we assume that op_comp was us); otherwise, just
	5510	* do the initial concatenation of arguments and pass on to the external
	5511	* engine.
	5512	*
	5513	* If is_bare_re is not null, set it to a boolean indicating whether the
	5514	* arg list reduced (after overloading) to a single bare regex which has
	5515	* been returned (i.e. /$qr/).
	5516	*
	5517	* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
	5518	*
	5519	* pm_flags contains the PMf_* flags, typically based on those from the
	5520	* pm_flags field of the related PMOP. Currently we're only interested in
	5521	* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
	5522	*
	5523	* We can't allocate space until we know how big the compiled form will be,
	5524	* but we can't compile it (and thus know how big it is) until we've got a
	5525	* place to put the code. So we cheat: we compile it twice, once with code
	5526	* generation turned off and size counting turned on, and once "for real".
	5527	* This also means that we don't allocate space until we are sure that the
	5528	* thing really will compile successfully, and we never have to move the
	5529	* code and thus invalidate pointers into it. (Note that it has to be in
	5530	* one piece because free() must be able to free it all.) [NB: not true in perl]
	5531	*
	5532	* Beware that the optimization-preparation code in here knows about some
	5533	* of the structure of the compiled regexp. [I'll say.]
	5534	*/
	5535
	5536	REGEXP *
	5537	Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
	5538	OP expr, const regexp_engine eng, REGEXP *old_re,
	5539	bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
	5540	{
	5541	dVAR;
	5542	REGEXP *rx;
	5543	struct regexp *r;
	5544	regexp_internal *ri;
	5545	STRLEN plen;
	5546	char *exp;
	5547	regnode *scan;
	5548	I32 flags;
	5549	SSize_t minlen = 0;
	5550	U32 rx_flags;
	5551	SV *pat;
	5552	SV *code_blocksv = NULL;
	5553	SV** new_patternp = patternp;
	5554
	5555	/* these are all flags - maybe they should be turned
	5556	* into a single int with different bit masks */
	5557	I32 sawlookahead = 0;
	5558	I32 sawplus = 0;
	5559	I32 sawopen = 0;
	5560	I32 sawminmod = 0;
	5561
	5562	regex_charset initial_charset = get_regex_charset(orig_rx_flags);
	5563	bool recompile = 0;
	5564	bool runtime_code = 0;
	5565	scan_data_t data;
	5566	RExC_state_t RExC_state;
	5567	RExC_state_t * const pRExC_state = &RExC_state;
	5568	#ifdef TRIE_STUDY_OPT
	5569	int restudied = 0;
	5570	RExC_state_t copyRExC_state;
	5571	#endif
	5572	GET_RE_DEBUG_FLAGS_DECL;
	5573
	5574	PERL_ARGS_ASSERT_RE_OP_COMPILE;
	5575
	5576	DEBUG_r(if (!PL_colorset) reginitcolors());
	5577
	5578	#ifndef PERL_IN_XSUB_RE
	5579	/* Initialize these here instead of as-needed, as is quick and avoids
	5580	* having to test them each time otherwise */
	5581	if (! PL_AboveLatin1) {
	5582	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	5583	PL_ASCII = _new_invlist_C_array(ASCII_invlist);
	5584	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	5585
	5586	PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
	5587	= _new_invlist_C_array(L1PosixAlnum_invlist);
	5588	PL_Posix_ptrs[_CC_ALPHANUMERIC]
	5589	= _new_invlist_C_array(PosixAlnum_invlist);
	5590
	5591	PL_L1Posix_ptrs[_CC_ALPHA]
	5592	= _new_invlist_C_array(L1PosixAlpha_invlist);
	5593	PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
	5594
	5595	PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
	5596	PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
	5597
	5598	/* Cased is the same as Alpha in the ASCII range */
	5599	PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
	5600	PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
	5601
	5602	PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
	5603	PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
	5604
	5605	PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
	5606	PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
	5607
	5608	PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
	5609	PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
	5610
	5611	PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
	5612	PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
	5613
	5614	PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
	5615	PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
	5616
	5617	PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
	5618	PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
	5619
	5620	PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
	5621	PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
	5622	PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
	5623	PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
	5624
	5625	PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
	5626	PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
	5627
	5628	PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
	5629
	5630	PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
	5631	PL_L1Posix_ptrs[_CC_WORDCHAR]
	5632	= _new_invlist_C_array(L1PosixWord_invlist);
	5633
	5634	PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
	5635	PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
	5636
	5637	PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
	5638	}
	5639	#endif
	5640
	5641	pRExC_state->code_blocks = NULL;
	5642	pRExC_state->num_code_blocks = 0;
	5643
	5644	if (is_bare_re)
	5645	*is_bare_re = FALSE;
	5646
	5647	if (expr && (expr->op_type == OP_LIST \|\|
	5648	(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
	5649	/* allocate code_blocks if needed */
	5650	OP *o;
	5651	int ncode = 0;
	5652
	5653	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
	5654	if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
	5655	ncode++; /* count of DO blocks */
	5656	if (ncode) {
	5657	pRExC_state->num_code_blocks = ncode;
	5658	Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
	5659	}
	5660	}
	5661
	5662	if (!pat_count) {
	5663	/* compile-time pattern with just OP_CONSTs and DO blocks */
	5664
	5665	int n;
	5666	OP *o;
	5667
	5668	/* find how many CONSTs there are */
	5669	assert(expr);
	5670	n = 0;
	5671	if (expr->op_type == OP_CONST)
	5672	n = 1;
	5673	else
	5674	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5675	if (o->op_type == OP_CONST)
	5676	n++;
	5677	}
	5678
	5679	/* fake up an SV array */
	5680
	5681	assert(!new_patternp);
	5682	Newx(new_patternp, n, SV*);
	5683	SAVEFREEPV(new_patternp);
	5684	pat_count = n;
	5685
	5686	n = 0;
	5687	if (expr->op_type == OP_CONST)
	5688	new_patternp[n] = cSVOPx_sv(expr);
	5689	else
	5690	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5691	if (o->op_type == OP_CONST)
	5692	new_patternp[n++] = cSVOPo_sv;
	5693	}
	5694
	5695	}
	5696
	5697	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	5698	"Assembling pattern from %d elements%s\n", pat_count,
	5699	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	5700
	5701	/* set expr to the first arg op */
	5702
	5703	if (pRExC_state->num_code_blocks
	5704	&& expr->op_type != OP_CONST)
	5705	{
	5706	expr = cLISTOPx(expr)->op_first;
	5707	assert( expr->op_type == OP_PUSHMARK
	5708	\|\| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
	5709	\|\| expr->op_type == OP_PADRANGE);
	5710	expr = expr->op_sibling;
	5711	}
	5712
	5713	pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
	5714	expr, &recompile, NULL);
	5715
	5716	/* handle bare (possibly after overloading) regex: foo =~ $re */
	5717	{
	5718	SV *re = pat;
	5719	if (SvROK(re))
	5720	re = SvRV(re);
	5721	if (SvTYPE(re) == SVt_REGEXP) {
	5722	if (is_bare_re)
	5723	*is_bare_re = TRUE;
	5724	SvREFCNT_inc(re);
	5725	Safefree(pRExC_state->code_blocks);
	5726	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	5727	"Precompiled pattern%s\n",
	5728	orig_rx_flags & RXf_SPLIT ? " for split" : ""));
	5729
	5730	return (REGEXP*)re;
	5731	}
	5732	}
	5733
	5734	exp = SvPV_nomg(pat, plen);
	5735
	5736	if (!eng->op_comp) {
	5737	if ((SvUTF8(pat) && IN_BYTES)
	5738	\|\| SvGMAGICAL(pat) \|\| SvAMAGIC(pat))
	5739	{
	5740	/* make a temporary copy; either to convert to bytes,
	5741	* or to avoid repeating get-magic / overloaded stringify */
	5742	pat = newSVpvn_flags(exp, plen, SVs_TEMP \|
	5743	(IN_BYTES ? 0 : SvUTF8(pat)));
	5744	}
	5745	Safefree(pRExC_state->code_blocks);
	5746	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
	5747	}
	5748
	5749	/* ignore the utf8ness if the pattern is 0 length */
	5750	RExC_utf8 = RExC_orig_utf8 = (plen == 0 \|\| IN_BYTES) ? 0 : SvUTF8(pat);
	5751	RExC_uni_semantics = 0;
	5752	RExC_contains_locale = 0;
	5753	pRExC_state->runtime_code_qr = NULL;
	5754
	5755	DEBUG_COMPILE_r({
	5756	SV *dsv= sv_newmortal();
	5757	RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
	5758	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	5759	PL_colors[4],PL_colors[5],s);
	5760	});
	5761
	5762	redo_first_pass:
	5763	/* we jump here if we upgrade the pattern to utf8 and have to
	5764	* recompile */
	5765
	5766	if ((pm_flags & PMf_USE_RE_EVAL)
	5767	/* this second condition covers the non-regex literal case,
	5768	* i.e. $foo =~ '(?{})'. */
	5769	\|\| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
	5770	)
	5771	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
	5772
	5773	/* return old regex if pattern hasn't changed */
	5774	/* XXX: note in the below we have to check the flags as well as the pattern.
	5775	*
	5776	* Things get a touch tricky as we have to compare the utf8 flag independently
	5777	* from the compile flags.
	5778	*/
	5779
	5780	if ( old_re
	5781	&& !recompile
	5782	&& !!RX_UTF8(old_re) == !!RExC_utf8
	5783	&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
	5784	&& RX_PRECOMP(old_re)
	5785	&& RX_PRELEN(old_re) == plen
	5786	&& memEQ(RX_PRECOMP(old_re), exp, plen)
	5787	&& !runtime_code /* with runtime code, always recompile */ )
	5788	{
	5789	Safefree(pRExC_state->code_blocks);
	5790	return old_re;
	5791	}
	5792
	5793	rx_flags = orig_rx_flags;
	5794
	5795	if (initial_charset == REGEX_LOCALE_CHARSET) {
	5796	RExC_contains_locale = 1;
	5797	}
	5798	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	5799
	5800	/* Set to use unicode semantics if the pattern is in utf8 and has the
	5801	* 'depends' charset specified, as it means unicode when utf8 */
	5802	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5803	}
	5804
	5805	RExC_precomp = exp;
	5806	RExC_flags = rx_flags;
	5807	RExC_pm_flags = pm_flags;
	5808
	5809	if (runtime_code) {
	5810	if (TAINTING_get && TAINT_get)
	5811	Perl_croak(aTHX_ "Eval-group in insecure regular expression");
	5812
	5813	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
	5814	/* whoops, we have a non-utf8 pattern, whilst run-time code
	5815	* got compiled as utf8. Try again with a utf8 pattern */
	5816	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	5817	pRExC_state->num_code_blocks);
	5818	goto redo_first_pass;
	5819	}
	5820	}
	5821	assert(!pRExC_state->runtime_code_qr);
	5822
	5823	RExC_sawback = 0;
	5824
	5825	RExC_seen = 0;
	5826	RExC_in_lookbehind = 0;
	5827	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	5828	RExC_extralen = 0;
	5829	RExC_override_recoding = 0;
	5830	RExC_in_multi_char_class = 0;
	5831
	5832	/* First pass: determine size, legality. */
	5833	RExC_parse = exp;
	5834	RExC_start = exp;
	5835	RExC_end = exp + plen;
	5836	RExC_naughty = 0;
	5837	RExC_npar = 1;
	5838	RExC_nestroot = 0;
	5839	RExC_size = 0L;
	5840	RExC_emit = &RExC_emit_dummy;
	5841	RExC_whilem_seen = 0;
	5842	RExC_open_parens = NULL;
	5843	RExC_close_parens = NULL;
	5844	RExC_opend = NULL;
	5845	RExC_paren_names = NULL;
	5846	#ifdef DEBUGGING
	5847	RExC_paren_name_list = NULL;
	5848	#endif
	5849	RExC_recurse = NULL;
	5850	RExC_recurse_count = 0;
	5851	pRExC_state->code_index = 0;
	5852
	5853	#if 0 /* REGC() is (currently) a NOP at the first pass.
	5854	* Clever compilers notice this and complain. --jhi */
	5855	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	5856	#endif
	5857	DEBUG_PARSE_r(
	5858	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	5859	RExC_lastnum=0;
	5860	RExC_lastparse=NULL;
	5861	);
	5862	/* reg may croak on us, not giving us a chance to free
	5863	pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
	5864	need it to survive as long as the regexp (qr/(?{})/).
	5865	We must check that code_blocksv is not already set, because we may
	5866	have jumped back to restart the sizing pass. */
	5867	if (pRExC_state->code_blocks && !code_blocksv) {
	5868	code_blocksv = newSV_type(SVt_PV);
	5869	SAVEFREESV(code_blocksv);
	5870	SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
	5871	SvLEN_set(code_blocksv, 1); /sufficient to make sv_clear free it/
	5872	}
	5873	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5874	/* It's possible to write a regexp in ascii that represents Unicode
	5875	codepoints outside of the byte range, such as via \x{100}. If we
	5876	detect such a sequence we have to convert the entire pattern to utf8
	5877	and then recompile, as our sizing calculation will have been based
	5878	on 1 byte == 1 character, but we will need to use utf8 to encode
	5879	at least some part of the pattern, and therefore must convert the whole
	5880	thing.
	5881	-- dmq */
	5882	if (flags & RESTART_UTF8) {
	5883	S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
	5884	pRExC_state->num_code_blocks);
	5885	goto redo_first_pass;
	5886	}
	5887	Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
	5888	}
	5889	if (code_blocksv)
	5890	SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
	5891
	5892	DEBUG_PARSE_r({
	5893	PerlIO_printf(Perl_debug_log,
	5894	"Required size %"IVdf" nodes\n"
	5895	"Starting second pass (creation)\n",
	5896	(IV)RExC_size);
	5897	RExC_lastnum=0;
	5898	RExC_lastparse=NULL;
	5899	});
	5900
	5901	/* The first pass could have found things that force Unicode semantics */
	5902	if ((RExC_utf8 \|\| RExC_uni_semantics)
	5903	&& get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
	5904	{
	5905	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5906	}
	5907
	5908	/* Small enough for pointer-storage convention?
	5909	If extralen==0, this means that we will not need long jumps. */
	5910	if (RExC_size >= 0x10000L && RExC_extralen)
	5911	RExC_size += RExC_extralen;
	5912	else
	5913	RExC_extralen = 0;
	5914	if (RExC_whilem_seen > 15)
	5915	RExC_whilem_seen = 15;
	5916
	5917	/* Allocate space and zero-initialize. Note, the two step process
	5918	of zeroing when in debug mode, thus anything assigned has to
	5919	happen after that */
	5920	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	5921	r = ReANY(rx);
	5922	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	5923	char, regexp_internal);
	5924	if ( r == NULL \|\| ri == NULL )
	5925	FAIL("Regexp out of space");
	5926	#ifdef DEBUGGING
	5927	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	5928	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	5929	#else
	5930	/* bulk initialize base fields with 0. */
	5931	Zero(ri, sizeof(regexp_internal), char);
	5932	#endif
	5933
	5934	/* non-zero initialization begins here */
	5935	RXi_SET( r, ri );
	5936	r->engine= eng;
	5937	r->extflags = rx_flags;
	5938	RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
	5939
	5940	if (pm_flags & PMf_IS_QR) {
	5941	ri->code_blocks = pRExC_state->code_blocks;
	5942	ri->num_code_blocks = pRExC_state->num_code_blocks;
	5943	}
	5944	else
	5945	{
	5946	int n;
	5947	for (n = 0; n < pRExC_state->num_code_blocks; n++)
	5948	if (pRExC_state->code_blocks[n].src_regex)
	5949	SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
	5950	SAVEFREEPV(pRExC_state->code_blocks);
	5951	}
	5952
	5953	{
	5954	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	5955	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	5956
	5957	/* The caret is output if there are any defaults: if not all the STD
	5958	* flags are set, or if no character set specifier is needed */
	5959	bool has_default =
	5960	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	5961	\|\| ! has_charset);
	5962	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	5963	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	5964	>> RXf_PMf_STD_PMMOD_SHIFT);
	5965	const char fptr = STD_PAT_MODS; /"msix"*/
	5966	char *p;
	5967	/* Allocate for the worst case, which is all the std flags are turned
	5968	* on. If more precision is desired, we could do a population count of
	5969	* the flags set. This could be done with a small lookup table, or by
	5970	* shifting, masking and adding, or even, when available, assembly
	5971	* language for a machine-language population count.
	5972	* We never output a minus, as all those are defaults, so are
	5973	* covered by the caret */
	5974	const STRLEN wraplen = plen + has_p + has_runon
	5975	+ has_default /* If needs a caret */
	5976
	5977	/* If needs a character set specifier */
	5978	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	5979	+ (sizeof(STD_PAT_MODS) - 1)
	5980	+ (sizeof("(?:)") - 1);
	5981
	5982	Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
	5983	r->xpv_len_u.xpvlenu_pv = p;
	5984	if (RExC_utf8)
	5985	SvFLAGS(rx) \|= SVf_UTF8;
	5986	p++='('; p++='?';
	5987
	5988	/* If a default, cover it using the caret */
	5989	if (has_default) {
	5990	*p++= DEFAULT_PAT_MOD;
	5991	}
	5992	if (has_charset) {
	5993	STRLEN len;
	5994	const char* const name = get_regex_charset_name(r->extflags, &len);
	5995	Copy(name, p, len, char);
	5996	p += len;
	5997	}
	5998	if (has_p)
	5999	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	6000	{
	6001	char ch;
	6002	while((ch = *fptr++)) {
	6003	if(reganch & 1)
	6004	*p++ = ch;
	6005	reganch >>= 1;
	6006	}
	6007	}
	6008
	6009	*p++ = ':';
	6010	Copy(RExC_precomp, p, plen, char);
	6011	assert ((RX_WRAPPED(rx) - p) < 16);
	6012	r->pre_prefix = p - RX_WRAPPED(rx);
	6013	p += plen;
	6014	if (has_runon)
	6015	*p++ = '\n';
	6016	*p++ = ')';
	6017	*p = 0;
	6018	SvCUR_set(rx, p - RX_WRAPPED(rx));
	6019	}
	6020
	6021	r->intflags = 0;
	6022	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	6023
	6024	if (RExC_seen & REG_SEEN_RECURSE) {
	6025	Newxz(RExC_open_parens, RExC_npar,regnode *);
	6026	SAVEFREEPV(RExC_open_parens);
	6027	Newxz(RExC_close_parens,RExC_npar,regnode *);
	6028	SAVEFREEPV(RExC_close_parens);
	6029	}
	6030
	6031	/* Useful during FAIL. */
	6032	#ifdef RE_TRACK_PATTERN_OFFSETS
	6033	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	6034	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	6035	"%s %"UVuf" bytes for offset annotations.\n",
	6036	ri->u.offsets ? "Got" : "Couldn't get",
	6037	(UV)((2RExC_size+1) sizeof(U32))));
	6038	#endif
	6039	SetProgLen(ri,RExC_size);
	6040	RExC_rx_sv = rx;
	6041	RExC_rx = r;
	6042	RExC_rxi = ri;
	6043
	6044	/* Second pass: emit code. */
	6045	RExC_flags = rx_flags; /* don't let top level (?i) bleed */
	6046	RExC_pm_flags = pm_flags;
	6047	RExC_parse = exp;
	6048	RExC_end = exp + plen;
	6049	RExC_naughty = 0;
	6050	RExC_npar = 1;
	6051	RExC_emit_start = ri->program;
	6052	RExC_emit = ri->program;
	6053	RExC_emit_bound = ri->program + RExC_size + 1;
	6054	pRExC_state->code_index = 0;
	6055
	6056	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	6057	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	6058	ReREFCNT_dec(rx);
	6059	Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
	6060	}
	6061	/* XXXX To minimize changes to RE engine we always allocate
	6062	3-units-long substrs field. */
	6063	Newx(r->substrs, 1, struct reg_substr_data);
	6064	if (RExC_recurse_count) {
	6065	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	6066	SAVEFREEPV(RExC_recurse);
	6067	}
	6068
	6069	reStudy:
	6070	r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
	6071	Zero(r->substrs, 1, struct reg_substr_data);
	6072
	6073	#ifdef TRIE_STUDY_OPT
	6074	if (!restudied) {
	6075	StructCopy(&zero_scan_data, &data, scan_data_t);
	6076	copyRExC_state = RExC_state;
	6077	} else {
	6078	U32 seen=RExC_seen;
	6079	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	6080
	6081	RExC_state = copyRExC_state;
	6082	if (seen & REG_TOP_LEVEL_BRANCHES)
	6083	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	6084	else
	6085	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	6086	StructCopy(&zero_scan_data, &data, scan_data_t);
	6087	}
	6088	#else
	6089	StructCopy(&zero_scan_data, &data, scan_data_t);
	6090	#endif
	6091
	6092	/* Dig out information for optimizations. */
	6093	r->extflags = RExC_flags; /* was pm_op */
	6094	/dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; /
	6095
	6096	if (UTF)
	6097	SvUTF8_on(rx); /* Unicode in it? */
	6098	ri->regstclass = NULL;
	6099	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	6100	r->intflags \|= PREGf_NAUGHTY;
	6101	scan = ri->program + 1; /* First BRANCH. */
	6102
	6103	/* testing for BRANCH here tells us whether there is "must appear"
	6104	data in the pattern. If there is then we can use it for optimisations */
	6105	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	6106	SSize_t fake;
	6107	STRLEN longest_float_length, longest_fixed_length;
	6108	struct regnode_charclass_class ch_class; /* pointed to by data */
	6109	int stclass_flag;
	6110	SSize_t last_close = 0; /* pointed to by data */
	6111	regnode *first= scan;
	6112	regnode *first_next= regnext(first);
	6113	/*
	6114	* Skip introductions and multiplicators >= 1
	6115	* so that we can extract the 'meat' of the pattern that must
	6116	* match in the large if() sequence following.
	6117	* NOTE that EXACT is NOT covered here, as it is normally
	6118	* picked up by the optimiser separately.
	6119	*
	6120	* This is unfortunate as the optimiser isnt handling lookahead
	6121	* properly currently.
	6122	*
	6123	*/
	6124	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	6125	/* An OR of one alternative - should not happen now. */
	6126	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	6127	/* for now we can't handle lookbehind IFMATCH*/
	6128	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	6129	(OP(first) == PLUS) \|\|
	6130	(OP(first) == MINMOD) \|\|
	6131	/* An {n,m} with n>0 */
	6132	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	6133	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	6134	{
	6135	/*
	6136	* the only op that could be a regnode is PLUS, all the rest
	6137	* will be regnode_1 or regnode_2.
	6138	*
	6139	* (yves doesn't think this is true)
	6140	*/
	6141	if (OP(first) == PLUS)
	6142	sawplus = 1;
	6143	else {
	6144	if (OP(first) == MINMOD)
	6145	sawminmod = 1;
	6146	first += regarglen[OP(first)];
	6147	}
	6148	first = NEXTOPER(first);
	6149	first_next= regnext(first);
	6150	}
	6151
	6152	/* Starting-point info. */
	6153	again:
	6154	DEBUG_PEEP("first:",first,0);
	6155	/* Ignore EXACT as we deal with it later. */
	6156	if (PL_regkind[OP(first)] == EXACT) {
	6157	if (OP(first) == EXACT)
	6158	NOOP; /* Empty, get anchored substr later. */
	6159	else
	6160	ri->regstclass = first;
	6161	}
	6162	#ifdef TRIE_STCLASS
	6163	else if (PL_regkind[OP(first)] == TRIE &&
	6164	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	6165	{
	6166	regnode *trie_op;
	6167	/* this can happen only on restudy */
	6168	if ( OP(first) == TRIE ) {
	6169	struct regnode_1 trieop = (struct regnode_1 )
	6170	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	6171	StructCopy(first,trieop,struct regnode_1);
	6172	trie_op=(regnode *)trieop;
	6173	} else {
	6174	struct regnode_charclass trieop = (struct regnode_charclass )
	6175	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	6176	StructCopy(first,trieop,struct regnode_charclass);
	6177	trie_op=(regnode *)trieop;
	6178	}
	6179	OP(trie_op)+=2;
	6180	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	6181	ri->regstclass = trie_op;
	6182	}
	6183	#endif
	6184	else if (REGNODE_SIMPLE(OP(first)))
	6185	ri->regstclass = first;
	6186	else if (PL_regkind[OP(first)] == BOUND \|\|
	6187	PL_regkind[OP(first)] == NBOUND)
	6188	ri->regstclass = first;
	6189	else if (PL_regkind[OP(first)] == BOL) {
	6190	r->extflags \|= (OP(first) == MBOL
	6191	? RXf_ANCH_MBOL
	6192	: (OP(first) == SBOL
	6193	? RXf_ANCH_SBOL
	6194	: RXf_ANCH_BOL));
	6195	first = NEXTOPER(first);
	6196	goto again;
	6197	}
	6198	else if (OP(first) == GPOS) {
	6199	r->extflags \|= RXf_ANCH_GPOS;
	6200	first = NEXTOPER(first);
	6201	goto again;
	6202	}
	6203	else if ((!sawopen \|\| !RExC_sawback) &&
	6204	(OP(first) == STAR &&
	6205	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	6206	!(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
	6207	{
	6208	/* turn .* into ^.* with an implied $=1 /
	6209	const int type =
	6210	(OP(NEXTOPER(first)) == REG_ANY)
	6211	? RXf_ANCH_MBOL
	6212	: RXf_ANCH_SBOL;
	6213	r->extflags \|= type;
	6214	r->intflags \|= PREGf_IMPLICIT;
	6215	first = NEXTOPER(first);
	6216	goto again;
	6217	}
	6218	if (sawplus && !sawminmod && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	6219	&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
	6220	/* x+ must match at the 1st pos of run of x's */
	6221	r->intflags \|= PREGf_SKIP;
	6222
	6223	/* Scan is after the zeroth branch, first is atomic matcher. */
	6224	#ifdef TRIE_STUDY_OPT
	6225	DEBUG_PARSE_r(
	6226	if (!restudied)
	6227	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6228	(IV)(first - scan + 1))
	6229	);
	6230	#else
	6231	DEBUG_PARSE_r(
	6232	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6233	(IV)(first - scan + 1))
	6234	);
	6235	#endif
	6236
	6237
	6238	/*
	6239	* If there's something expensive in the r.e., find the
	6240	* longest literal string that must appear and make it the
	6241	* regmust. Resolve ties in favor of later strings, since
	6242	* the regstart check works with the beginning of the r.e.
	6243	* and avoiding duplication strengthens checking. Not a
	6244	* strong reason, but sufficient in the absence of others.
	6245	* [Now we resolve ties in favor of the earlier string if
	6246	* it happens that c_offset_min has been invalidated, since the
	6247	* earlier string may buy us something the later one won't.]
	6248	*/
	6249
	6250	data.longest_fixed = newSVpvs("");
	6251	data.longest_float = newSVpvs("");
	6252	data.last_found = newSVpvs("");
	6253	data.longest = &(data.longest_fixed);
	6254	ENTER_with_name("study_chunk");
	6255	SAVEFREESV(data.longest_fixed);
	6256	SAVEFREESV(data.longest_float);
	6257	SAVEFREESV(data.last_found);
	6258	first = scan;
	6259	if (!ri->regstclass) {
	6260	cl_init(pRExC_state, &ch_class);
	6261	data.start_class = &ch_class;
	6262	stclass_flag = SCF_DO_STCLASS_AND;
	6263	} else /* XXXX Check for BOUND? */
	6264	stclass_flag = 0;
	6265	data.last_closep = &last_close;
	6266
	6267	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	6268	&data, -1, NULL, NULL,
	6269	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag
	6270	\| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
	6271	0);
	6272
	6273
	6274	CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
	6275
	6276
	6277	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	6278	&& data.last_start_min == 0 && data.last_end > 0
	6279	&& !RExC_seen_zerolen
	6280	&& !(RExC_seen & REG_SEEN_VERBARG)
	6281	&& !((RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	6282	r->extflags \|= RXf_CHECK_ALL;
	6283	scan_commit(pRExC_state, &data,&minlen,0);
	6284
	6285	longest_float_length = CHR_SVLEN(data.longest_float);
	6286
	6287	if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	6288	&& data.offset_fixed == data.offset_float_min
	6289	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	6290	&& S_setup_longest (aTHX_ pRExC_state,
	6291	data.longest_float,
	6292	&(r->float_utf8),
	6293	&(r->float_substr),
	6294	&(r->float_end_shift),
	6295	data.lookbehind_float,
	6296	data.offset_float_min,
	6297	data.minlen_float,
	6298	longest_float_length,
	6299	cBOOL(data.flags & SF_FL_BEFORE_EOL),
	6300	cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
	6301	{
	6302	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	6303	r->float_max_offset = data.offset_float_max;
	6304	if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
	6305	r->float_max_offset -= data.lookbehind_float;
	6306	SvREFCNT_inc_simple_void_NN(data.longest_float);
	6307	}
	6308	else {
	6309	r->float_substr = r->float_utf8 = NULL;
	6310	longest_float_length = 0;
	6311	}
	6312
	6313	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	6314
	6315	if (S_setup_longest (aTHX_ pRExC_state,
	6316	data.longest_fixed,
	6317	&(r->anchored_utf8),
	6318	&(r->anchored_substr),
	6319	&(r->anchored_end_shift),
	6320	data.lookbehind_fixed,
	6321	data.offset_fixed,
	6322	data.minlen_fixed,
	6323	longest_fixed_length,
	6324	cBOOL(data.flags & SF_FIX_BEFORE_EOL),
	6325	cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
	6326	{
	6327	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	6328	SvREFCNT_inc_simple_void_NN(data.longest_fixed);
	6329	}
	6330	else {
	6331	r->anchored_substr = r->anchored_utf8 = NULL;
	6332	longest_fixed_length = 0;
	6333	}
	6334	LEAVE_with_name("study_chunk");
	6335
	6336	if (ri->regstclass
	6337	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	6338	ri->regstclass = NULL;
	6339
	6340	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	6341	&& stclass_flag
	6342	&& ! TEST_SSC_EOS(data.start_class)
	6343	&& !cl_is_anything(data.start_class))
	6344	{
	6345	const U32 n = add_data(pRExC_state, 1, "f");
	6346	OP(data.start_class) = ANYOF_SYNTHETIC;
	6347
	6348	Newx(RExC_rxi->data->data[n], 1,
	6349	struct regnode_charclass_class);
	6350	StructCopy(data.start_class,
	6351	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6352	struct regnode_charclass_class);
	6353	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6354	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6355	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	6356	regprop(r, sv, (regnode*)data.start_class);
	6357	PerlIO_printf(Perl_debug_log,
	6358	"synthetic stclass \"%s\".\n",
	6359	SvPVX_const(sv));});
	6360	}
	6361
	6362	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	6363	if (longest_fixed_length > longest_float_length) {
	6364	r->check_end_shift = r->anchored_end_shift;
	6365	r->check_substr = r->anchored_substr;
	6366	r->check_utf8 = r->anchored_utf8;
	6367	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	6368	if (r->extflags & RXf_ANCH_SINGLE)
	6369	r->extflags \|= RXf_NOSCAN;
	6370	}
	6371	else {
	6372	r->check_end_shift = r->float_end_shift;
	6373	r->check_substr = r->float_substr;
	6374	r->check_utf8 = r->float_utf8;
	6375	r->check_offset_min = r->float_min_offset;
	6376	r->check_offset_max = r->float_max_offset;
	6377	}
	6378	if ((r->check_substr \|\| r->check_utf8) ) {
	6379	r->extflags \|= RXf_USE_INTUIT;
	6380	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	6381	r->extflags \|= RXf_INTUIT_TAIL;
	6382	}
	6383	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	6384	if ( (STRLEN)minlen < longest_float_length )
	6385	minlen= longest_float_length;
	6386	if ( (STRLEN)minlen < longest_fixed_length )
	6387	minlen= longest_fixed_length;
	6388	*/
	6389	}
	6390	else {
	6391	/* Several toplevels. Best we can is to set minlen. */
	6392	SSize_t fake;
	6393	struct regnode_charclass_class ch_class;
	6394	SSize_t last_close = 0;
	6395
	6396	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	6397
	6398	scan = ri->program + 1;
	6399	cl_init(pRExC_state, &ch_class);
	6400	data.start_class = &ch_class;
	6401	data.last_closep = &last_close;
	6402
	6403
	6404	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	6405	&data, -1, NULL, NULL,
	6406	SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS
	6407	\|(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
	6408	0);
	6409
	6410	CHECK_RESTUDY_GOTO_butfirst(NOOP);
	6411
	6412	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	6413	= r->float_substr = r->float_utf8 = NULL;
	6414
	6415	if (! TEST_SSC_EOS(data.start_class)
	6416	&& !cl_is_anything(data.start_class))
	6417	{
	6418	const U32 n = add_data(pRExC_state, 1, "f");
	6419	OP(data.start_class) = ANYOF_SYNTHETIC;
	6420
	6421	Newx(RExC_rxi->data->data[n], 1,
	6422	struct regnode_charclass_class);
	6423	StructCopy(data.start_class,
	6424	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6425	struct regnode_charclass_class);
	6426	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6427	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6428	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	6429	regprop(r, sv, (regnode*)data.start_class);
	6430	PerlIO_printf(Perl_debug_log,
	6431	"synthetic stclass \"%s\".\n",
	6432	SvPVX_const(sv));});
	6433	}
	6434	}
	6435
	6436	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	6437	the "real" pattern. */
	6438	DEBUG_OPTIMISE_r({
	6439	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	6440	(IV)minlen, (IV)r->minlen);
	6441	});
	6442	r->minlenret = minlen;
	6443	if (r->minlen < minlen)
	6444	r->minlen = minlen;
	6445
	6446	if (RExC_seen & REG_SEEN_GPOS)
	6447	r->extflags \|= RXf_GPOS_SEEN;
	6448	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	6449	r->extflags \|= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
	6450	if (pRExC_state->num_code_blocks)
	6451	r->extflags \|= RXf_EVAL_SEEN;
	6452	if (RExC_seen & REG_SEEN_CANY)
	6453	r->extflags \|= RXf_CANY_SEEN;
	6454	if (RExC_seen & REG_SEEN_VERBARG)
	6455	{
	6456	r->intflags \|= PREGf_VERBARG_SEEN;
	6457	r->extflags \|= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
	6458	}
	6459	if (RExC_seen & REG_SEEN_CUTGROUP)
	6460	r->intflags \|= PREGf_CUTGROUP_SEEN;
	6461	if (pm_flags & PMf_USE_RE_EVAL)
	6462	r->intflags \|= PREGf_USE_RE_EVAL;
	6463	if (RExC_paren_names)
	6464	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	6465	else
	6466	RXp_PAREN_NAMES(r) = NULL;
	6467
	6468	{
	6469	regnode *first = ri->program + 1;
	6470	U8 fop = OP(first);
	6471	regnode *next = NEXTOPER(first);
	6472	U8 nop = OP(next);
	6473
	6474	if (PL_regkind[fop] == NOTHING && nop == END)
	6475	r->extflags \|= RXf_NULL;
	6476	else if (PL_regkind[fop] == BOL && nop == END)
	6477	r->extflags \|= RXf_START_ONLY;
	6478	else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
	6479	r->extflags \|= RXf_WHITE;
	6480	else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
	6481	r->extflags \|= (RXf_SKIPWHITE\|RXf_WHITE);
	6482
	6483	}
	6484	#ifdef DEBUGGING
	6485	if (RExC_paren_names) {
	6486	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	6487	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	6488	} else
	6489	#endif
	6490	ri->name_list_idx = 0;
	6491
	6492	if (RExC_recurse_count) {
	6493	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	6494	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	6495	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	6496	}
	6497	}
	6498	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	6499	/* assume we don't need to swap parens around before we match */
	6500
	6501	DEBUG_DUMP_r({
	6502	PerlIO_printf(Perl_debug_log,"Final program:\n");
	6503	regdump(r);
	6504	});
	6505	#ifdef RE_TRACK_PATTERN_OFFSETS
	6506	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	6507	const STRLEN len = ri->u.offsets[0];
	6508	STRLEN i;
	6509	GET_RE_DEBUG_FLAGS_DECL;
	6510	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	6511	for (i = 1; i <= len; i++) {
	6512	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	6513	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	6514	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	6515	}
	6516	PerlIO_printf(Perl_debug_log, "\n");
	6517	});
	6518	#endif
	6519
	6520	#ifdef USE_ITHREADS
	6521	/* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
	6522	* by setting the regexp SV to readonly-only instead. If the
	6523	* pattern's been recompiled, the USEDness should remain. */
	6524	if (old_re && SvREADONLY(old_re))
	6525	SvREADONLY_on(rx);
	6526	#endif
	6527	return rx;
	6528	}
	6529
	6530
	6531	SV*
	6532	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	6533	const U32 flags)
	6534	{
	6535	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	6536
	6537	PERL_UNUSED_ARG(value);
	6538
	6539	if (flags & RXapif_FETCH) {
	6540	return reg_named_buff_fetch(rx, key, flags);
	6541	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	6542	Perl_croak_no_modify();
	6543	return NULL;
	6544	} else if (flags & RXapif_EXISTS) {
	6545	return reg_named_buff_exists(rx, key, flags)
	6546	? &PL_sv_yes
	6547	: &PL_sv_no;
	6548	} else if (flags & RXapif_REGNAMES) {
	6549	return reg_named_buff_all(rx, flags);
	6550	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	6551	return reg_named_buff_scalar(rx, flags);
	6552	} else {
	6553	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	6554	return NULL;
	6555	}
	6556	}
	6557
	6558	SV*
	6559	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	6560	const U32 flags)
	6561	{
	6562	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	6563	PERL_UNUSED_ARG(lastkey);
	6564
	6565	if (flags & RXapif_FIRSTKEY)
	6566	return reg_named_buff_firstkey(rx, flags);
	6567	else if (flags & RXapif_NEXTKEY)
	6568	return reg_named_buff_nextkey(rx, flags);
	6569	else {
	6570	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	6571	return NULL;
	6572	}
	6573	}
	6574
	6575	SV*
	6576	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	6577	const U32 flags)
	6578	{
	6579	AV *retarray = NULL;
	6580	SV *ret;
	6581	struct regexp *const rx = ReANY(r);
	6582
	6583	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	6584
	6585	if (flags & RXapif_ALL)
	6586	retarray=newAV();
	6587
	6588	if (rx && RXp_PAREN_NAMES(rx)) {
	6589	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	6590	if (he_str) {
	6591	IV i;
	6592	SV* sv_dat=HeVAL(he_str);
	6593	I32 nums=(I32)SvPVX(sv_dat);
	6594	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	6595	if ((I32)(rx->nparens) >= nums[i]
	6596	&& rx->offs[nums[i]].start != -1
	6597	&& rx->offs[nums[i]].end != -1)
	6598	{
	6599	ret = newSVpvs("");
	6600	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	6601	if (!retarray)
	6602	return ret;
	6603	} else {
	6604	if (retarray)
	6605	ret = newSVsv(&PL_sv_undef);
	6606	}
	6607	if (retarray)
	6608	av_push(retarray, ret);
	6609	}
	6610	if (retarray)
	6611	return newRV_noinc(MUTABLE_SV(retarray));
	6612	}
	6613	}
	6614	return NULL;
	6615	}
	6616
	6617	bool
	6618	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	6619	const U32 flags)
	6620	{
	6621	struct regexp *const rx = ReANY(r);
	6622
	6623	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	6624
	6625	if (rx && RXp_PAREN_NAMES(rx)) {
	6626	if (flags & RXapif_ALL) {
	6627	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	6628	} else {
	6629	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	6630	if (sv) {
	6631	SvREFCNT_dec_NN(sv);
	6632	return TRUE;
	6633	} else {
	6634	return FALSE;
	6635	}
	6636	}
	6637	} else {
	6638	return FALSE;
	6639	}
	6640	}
	6641
	6642	SV*
	6643	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	6644	{
	6645	struct regexp *const rx = ReANY(r);
	6646
	6647	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	6648
	6649	if ( rx && RXp_PAREN_NAMES(rx) ) {
	6650	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	6651
	6652	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	6653	} else {
	6654	return FALSE;
	6655	}
	6656	}
	6657
	6658	SV*
	6659	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	6660	{
	6661	struct regexp *const rx = ReANY(r);
	6662	GET_RE_DEBUG_FLAGS_DECL;
	6663
	6664	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	6665
	6666	if (rx && RXp_PAREN_NAMES(rx)) {
	6667	HV *hv = RXp_PAREN_NAMES(rx);
	6668	HE *temphe;
	6669	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6670	IV i;
	6671	IV parno = 0;
	6672	SV* sv_dat = HeVAL(temphe);
	6673	I32 nums = (I32)SvPVX(sv_dat);
	6674	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6675	if ((I32)(rx->lastparen) >= nums[i] &&
	6676	rx->offs[nums[i]].start != -1 &&
	6677	rx->offs[nums[i]].end != -1)
	6678	{
	6679	parno = nums[i];
	6680	break;
	6681	}
	6682	}
	6683	if (parno \|\| flags & RXapif_ALL) {
	6684	return newSVhek(HeKEY_hek(temphe));
	6685	}
	6686	}
	6687	}
	6688	return NULL;
	6689	}
	6690
	6691	SV*
	6692	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	6693	{
	6694	SV *ret;
	6695	AV *av;
	6696	SSize_t length;
	6697	struct regexp *const rx = ReANY(r);
	6698
	6699	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	6700
	6701	if (rx && RXp_PAREN_NAMES(rx)) {
	6702	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	6703	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	6704	} else if (flags & RXapif_ONE) {
	6705	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	6706	av = MUTABLE_AV(SvRV(ret));
	6707	length = av_len(av);
	6708	SvREFCNT_dec_NN(ret);
	6709	return newSViv(length + 1);
	6710	} else {
	6711	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	6712	return NULL;
	6713	}
	6714	}
	6715	return &PL_sv_undef;
	6716	}
	6717
	6718	SV*
	6719	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	6720	{
	6721	struct regexp *const rx = ReANY(r);
	6722	AV *av = newAV();
	6723
	6724	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	6725
	6726	if (rx && RXp_PAREN_NAMES(rx)) {
	6727	HV *hv= RXp_PAREN_NAMES(rx);
	6728	HE *temphe;
	6729	(void)hv_iterinit(hv);
	6730	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6731	IV i;
	6732	IV parno = 0;
	6733	SV* sv_dat = HeVAL(temphe);
	6734	I32 nums = (I32)SvPVX(sv_dat);
	6735	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6736	if ((I32)(rx->lastparen) >= nums[i] &&
	6737	rx->offs[nums[i]].start != -1 &&
	6738	rx->offs[nums[i]].end != -1)
	6739	{
	6740	parno = nums[i];
	6741	break;
	6742	}
	6743	}
	6744	if (parno \|\| flags & RXapif_ALL) {
	6745	av_push(av, newSVhek(HeKEY_hek(temphe)));
	6746	}
	6747	}
	6748	}
	6749
	6750	return newRV_noinc(MUTABLE_SV(av));
	6751	}
	6752
	6753	void
	6754	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	6755	SV * const sv)
	6756	{
	6757	struct regexp *const rx = ReANY(r);
	6758	char *s = NULL;
	6759	SSize_t i = 0;
	6760	SSize_t s1, t1;
	6761	I32 n = paren;
	6762
	6763	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	6764
	6765	if ( n == RX_BUFF_IDX_CARET_PREMATCH
	6766	\|\| n == RX_BUFF_IDX_CARET_FULLMATCH
	6767	\|\| n == RX_BUFF_IDX_CARET_POSTMATCH
	6768	)
	6769	{
	6770	bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
	6771	if (!keepcopy) {
	6772	/* on something like
	6773	* $r = qr/.../;
	6774	* /$qr/p;
	6775	* the KEEPCOPY is set on the PMOP rather than the regex */
	6776	if (PL_curpm && r == PM_GETRE(PL_curpm))
	6777	keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
	6778	}
	6779	if (!keepcopy)
	6780	goto ret_undef;
	6781	}
	6782
	6783	if (!rx->subbeg)
	6784	goto ret_undef;
	6785
	6786	if (n == RX_BUFF_IDX_CARET_FULLMATCH)
	6787	/* no need to distinguish between them any more */
	6788	n = RX_BUFF_IDX_FULLMATCH;
	6789
	6790	if ((n == RX_BUFF_IDX_PREMATCH \|\| n == RX_BUFF_IDX_CARET_PREMATCH)
	6791	&& rx->offs[0].start != -1)
	6792	{
	6793	/* $`, ${^PREMATCH} */
	6794	i = rx->offs[0].start;
	6795	s = rx->subbeg;
	6796	}
	6797	else
	6798	if ((n == RX_BUFF_IDX_POSTMATCH \|\| n == RX_BUFF_IDX_CARET_POSTMATCH)
	6799	&& rx->offs[0].end != -1)
	6800	{
	6801	/* $', ${^POSTMATCH} */
	6802	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
	6803	i = rx->sublen + rx->suboffset - rx->offs[0].end;
	6804	}
	6805	else
	6806	if ( 0 <= n && n <= (I32)rx->nparens &&
	6807	(s1 = rx->offs[n].start) != -1 &&
	6808	(t1 = rx->offs[n].end) != -1)
	6809	{
	6810	/* $&, ${^MATCH}, $1 ... */
	6811	i = t1 - s1;
	6812	s = rx->subbeg + s1 - rx->suboffset;
	6813	} else {
	6814	goto ret_undef;
	6815	}
	6816
	6817	assert(s >= rx->subbeg);
	6818	assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
	6819	if (i >= 0) {
	6820	#if NO_TAINT_SUPPORT
	6821	sv_setpvn(sv, s, i);
	6822	#else
	6823	const int oldtainted = TAINT_get;
	6824	TAINT_NOT;
	6825	sv_setpvn(sv, s, i);
	6826	TAINT_set(oldtainted);
	6827	#endif
	6828	if ( (rx->extflags & RXf_CANY_SEEN)
	6829	? (RXp_MATCH_UTF8(rx)
	6830	&& (!i \|\| is_utf8_string((U8*)s, i)))
	6831	: (RXp_MATCH_UTF8(rx)) )
	6832	{
	6833	SvUTF8_on(sv);
	6834	}
	6835	else
	6836	SvUTF8_off(sv);
	6837	if (TAINTING_get) {
	6838	if (RXp_MATCH_TAINTED(rx)) {
	6839	if (SvTYPE(sv) >= SVt_PVMG) {
	6840	MAGIC* const mg = SvMAGIC(sv);
	6841	MAGIC* mgt;
	6842	TAINT;
	6843	SvMAGIC_set(sv, mg->mg_moremagic);
	6844	SvTAINT(sv);
	6845	if ((mgt = SvMAGIC(sv))) {
	6846	mg->mg_moremagic = mgt;
	6847	SvMAGIC_set(sv, mg);
	6848	}
	6849	} else {
	6850	TAINT;
	6851	SvTAINT(sv);
	6852	}
	6853	} else
	6854	SvTAINTED_off(sv);
	6855	}
	6856	} else {
	6857	ret_undef:
	6858	sv_setsv(sv,&PL_sv_undef);
	6859	return;
	6860	}
	6861	}
	6862
	6863	void
	6864	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	6865	SV const * const value)
	6866	{
	6867	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	6868
	6869	PERL_UNUSED_ARG(rx);
	6870	PERL_UNUSED_ARG(paren);
	6871	PERL_UNUSED_ARG(value);
	6872
	6873	if (!PL_localizing)
	6874	Perl_croak_no_modify();
	6875	}
	6876
	6877	I32
	6878	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	6879	const I32 paren)
	6880	{
	6881	struct regexp *const rx = ReANY(r);
	6882	I32 i;
	6883	I32 s1, t1;
	6884
	6885	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	6886
	6887	if ( paren == RX_BUFF_IDX_CARET_PREMATCH
	6888	\|\| paren == RX_BUFF_IDX_CARET_FULLMATCH
	6889	\|\| paren == RX_BUFF_IDX_CARET_POSTMATCH
	6890	)
	6891	{
	6892	bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
	6893	if (!keepcopy) {
	6894	/* on something like
	6895	* $r = qr/.../;
	6896	* /$qr/p;
	6897	* the KEEPCOPY is set on the PMOP rather than the regex */
	6898	if (PL_curpm && r == PM_GETRE(PL_curpm))
	6899	keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
	6900	}
	6901	if (!keepcopy)
	6902	goto warn_undef;
	6903	}
	6904
	6905	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	6906	switch (paren) {
	6907	case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
	6908	case RX_BUFF_IDX_PREMATCH: /* $` */
	6909	if (rx->offs[0].start != -1) {
	6910	i = rx->offs[0].start;
	6911	if (i > 0) {
	6912	s1 = 0;
	6913	t1 = i;
	6914	goto getlen;
	6915	}
	6916	}
	6917	return 0;
	6918
	6919	case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
	6920	case RX_BUFF_IDX_POSTMATCH: /* $' */
	6921	if (rx->offs[0].end != -1) {
	6922	i = rx->sublen - rx->offs[0].end;
	6923	if (i > 0) {
	6924	s1 = rx->offs[0].end;
	6925	t1 = rx->sublen;
	6926	goto getlen;
	6927	}
	6928	}
	6929	return 0;
	6930
	6931	default: /* $& / ${^MATCH}, $1, $2, ... */
	6932	if (paren <= (I32)rx->nparens &&
	6933	(s1 = rx->offs[paren].start) != -1 &&
	6934	(t1 = rx->offs[paren].end) != -1)
	6935	{
	6936	i = t1 - s1;
	6937	goto getlen;
	6938	} else {
	6939	warn_undef:
	6940	if (ckWARN(WARN_UNINITIALIZED))
	6941	report_uninit((const SV *)sv);
	6942	return 0;
	6943	}
	6944	}
	6945	getlen:
	6946	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	6947	const char * const s = rx->subbeg - rx->suboffset + s1;
	6948	const U8 *ep;
	6949	STRLEN el;
	6950
	6951	i = t1 - s1;
	6952	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	6953	i = el;
	6954	}
	6955	return i;
	6956	}
	6957
	6958	SV*
	6959	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	6960	{
	6961	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	6962	PERL_UNUSED_ARG(rx);
	6963	if (0)
	6964	return NULL;
	6965	else
	6966	return newSVpvs("Regexp");
	6967	}
	6968
	6969	/* Scans the name of a named buffer from the pattern.
	6970	* If flags is REG_RSN_RETURN_NULL returns null.
	6971	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	6972	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	6973	* to the parsed name as looked up in the RExC_paren_names hash.
	6974	* If there is an error throws a vFAIL().. type exception.
	6975	*/
	6976
	6977	#define REG_RSN_RETURN_NULL 0
	6978	#define REG_RSN_RETURN_NAME 1
	6979	#define REG_RSN_RETURN_DATA 2
	6980
	6981	STATIC SV*
	6982	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	6983	{
	6984	char *name_start = RExC_parse;
	6985
	6986	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	6987
	6988	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	6989	/* skip IDFIRST by using do...while */
	6990	if (UTF)
	6991	do {
	6992	RExC_parse += UTF8SKIP(RExC_parse);
	6993	} while (isWORDCHAR_utf8((U8*)RExC_parse));
	6994	else
	6995	do {
	6996	RExC_parse++;
	6997	} while (isWORDCHAR(*RExC_parse));
	6998	} else {
	6999	RExC_parse++; /* so the <- from the vFAIL is after the offending character */
	7000	vFAIL("Group name must start with a non-digit word character");
	7001	}
	7002	if ( flags ) {
	7003	SV* sv_name
	7004	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	7005	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	7006	if ( flags == REG_RSN_RETURN_NAME)
	7007	return sv_name;
	7008	else if (flags==REG_RSN_RETURN_DATA) {
	7009	HE *he_str = NULL;
	7010	SV *sv_dat = NULL;
	7011	if ( ! sv_name ) /* should not happen*/
	7012	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	7013	if (RExC_paren_names)
	7014	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	7015	if ( he_str )
	7016	sv_dat = HeVAL(he_str);
	7017	if ( ! sv_dat )
	7018	vFAIL("Reference to nonexistent named group");
	7019	return sv_dat;
	7020	}
	7021	else {
	7022	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	7023	(unsigned long) flags);
	7024	}
	7025	assert(0); /* NOT REACHED */
	7026	}
	7027	return NULL;
	7028	}
	7029
	7030	#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
	7031	int rem=(int)(RExC_end - RExC_parse); \
	7032	int cut; \
	7033	int num; \
	7034	int iscut=0; \
	7035	if (rem>10) { \
	7036	rem=10; \
	7037	iscut=1; \
	7038	} \
	7039	cut=10-rem; \
	7040	if (RExC_lastparse!=RExC_parse) \
	7041	PerlIO_printf(Perl_debug_log," >%.s%-s", \
	7042	rem, RExC_parse, \
	7043	cut + 4, \
	7044	iscut ? "..." : "<" \
	7045	); \
	7046	else \
	7047	PerlIO_printf(Perl_debug_log,"%16s",""); \
	7048	\
	7049	if (SIZE_ONLY) \
	7050	num = RExC_size + 1; \
	7051	else \
	7052	num=REG_NODE_NUM(RExC_emit); \
	7053	if (RExC_lastnum!=num) \
	7054	PerlIO_printf(Perl_debug_log,"\|%4d",num); \
	7055	else \
	7056	PerlIO_printf(Perl_debug_log,"\|%4s",""); \
	7057	PerlIO_printf(Perl_debug_log,"\|%*s%-4s", \
	7058	(int)((depth*2)), "", \
	7059	(funcname) \
	7060	); \
	7061	RExC_lastnum=num; \
	7062	RExC_lastparse=RExC_parse; \
	7063	})
	7064
	7065
	7066
	7067	#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
	7068	DEBUG_PARSE_MSG((funcname)); \
	7069	PerlIO_printf(Perl_debug_log,"%4s","\n"); \
	7070	})
	7071	#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
	7072	DEBUG_PARSE_MSG((funcname)); \
	7073	PerlIO_printf(Perl_debug_log,fmt "\n",args); \
	7074	})
	7075
	7076	/* This section of code defines the inversion list object and its methods. The
	7077	* interfaces are highly subject to change, so as much as possible is static to
	7078	* this file. An inversion list is here implemented as a malloc'd C UV array
	7079	* as an SVt_INVLIST scalar.
	7080	*
	7081	* An inversion list for Unicode is an array of code points, sorted by ordinal
	7082	* number. The zeroth element is the first code point in the list. The 1th
	7083	* element is the first element beyond that not in the list. In other words,
	7084	* the first range is
	7085	* invlist[0]..(invlist[1]-1)
	7086	* The other ranges follow. Thus every element whose index is divisible by two
	7087	* marks the beginning of a range that is in the list, and every element not
	7088	* divisible by two marks the beginning of a range not in the list. A single
	7089	* element inversion list that contains the single code point N generally
	7090	* consists of two elements
	7091	* invlist[0] == N
	7092	* invlist[1] == N+1
	7093	* (The exception is when N is the highest representable value on the
	7094	* machine, in which case the list containing just it would be a single
	7095	* element, itself. By extension, if the last range in the list extends to
	7096	* infinity, then the first element of that range will be in the inversion list
	7097	* at a position that is divisible by two, and is the final element in the
	7098	* list.)
	7099	* Taking the complement (inverting) an inversion list is quite simple, if the
	7100	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	7101	* This implementation reserves an element at the beginning of each inversion
	7102	* list to always contain 0; there is an additional flag in the header which
	7103	* indicates if the list begins at the 0, or is offset to begin at the next
	7104	* element.
	7105	*
	7106	* More about inversion lists can be found in "Unicode Demystified"
	7107	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	7108	* More will be coming when functionality is added later.
	7109	*
	7110	* The inversion list data structure is currently implemented as an SV pointing
	7111	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	7112	* array of UV whose memory management is automatically handled by the existing
	7113	* facilities for SV's.
	7114	*
	7115	* Some of the methods should always be private to the implementation, and some
	7116	* should eventually be made public */
	7117
	7118	/* The header definitions are in F<inline_invlist.c> */
	7119
	7120	PERL_STATIC_INLINE UV*
	7121	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	7122	{
	7123	/* Returns a pointer to the first element in the inversion list's array.
	7124	* This is called upon initialization of an inversion list. Where the
	7125	* array begins depends on whether the list has the code point U+0000 in it
	7126	* or not. The other parameter tells it whether the code that follows this
	7127	* call is about to put a 0 in the inversion list or not. The first
	7128	* element is either the element reserved for 0, if TRUE, or the element
	7129	* after it, if FALSE */
	7130
	7131	bool* offset = get_invlist_offset_addr(invlist);
	7132	UV* zero_addr = (UV *) SvPVX(invlist);
	7133
	7134	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	7135
	7136	/* Must be empty */
	7137	assert(! _invlist_len(invlist));
	7138
	7139	*zero_addr = 0;
	7140
	7141	/* 1^1 = 0; 1^0 = 1 */
	7142	*offset = 1 ^ will_have_0;
	7143	return zero_addr + *offset;
	7144	}
	7145
	7146	PERL_STATIC_INLINE UV*
	7147	S_invlist_array(pTHX_ SV* const invlist)
	7148	{
	7149	/* Returns the pointer to the inversion list's array. Every time the
	7150	* length changes, this needs to be called in case malloc or realloc moved
	7151	* it */
	7152
	7153	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	7154
	7155	/* Must not be empty. If these fail, you probably didn't check for <len>
	7156	* being non-zero before trying to get the array */
	7157	assert(_invlist_len(invlist));
	7158
	7159	/* The very first element always contains zero, The array begins either
	7160	* there, or if the inversion list is offset, at the element after it.
	7161	* The offset header field determines which; it contains 0 or 1 to indicate
	7162	* how much additionally to add */
	7163	assert(0 == *(SvPVX(invlist)));
	7164	return ((UV ) SvPVX(invlist) + get_invlist_offset_addr(invlist));
	7165	}
	7166
	7167	PERL_STATIC_INLINE void
	7168	S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
	7169	{
	7170	/* Sets the current number of elements stored in the inversion list.
	7171	* Updates SvCUR correspondingly */
	7172
	7173	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	7174
	7175	assert(SvTYPE(invlist) == SVt_INVLIST);
	7176
	7177	SvCUR_set(invlist,
	7178	(len == 0)
	7179	? 0
	7180	: TO_INTERNAL_SIZE(len + offset));
	7181	assert(SvLEN(invlist) == 0 \|\| SvCUR(invlist) <= SvLEN(invlist));
	7182	}
	7183
	7184	PERL_STATIC_INLINE IV*
	7185	S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
	7186	{
	7187	/* Return the address of the IV that is reserved to hold the cached index
	7188	* */
	7189
	7190	PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
	7191
	7192	assert(SvTYPE(invlist) == SVt_INVLIST);
	7193
	7194	return &(((XINVLIST*) SvANY(invlist))->prev_index);
	7195	}
	7196
	7197	PERL_STATIC_INLINE IV
	7198	S_invlist_previous_index(pTHX_ SV* const invlist)
	7199	{
	7200	/* Returns cached index of previous search */
	7201
	7202	PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
	7203
	7204	return *get_invlist_previous_index_addr(invlist);
	7205	}
	7206
	7207	PERL_STATIC_INLINE void
	7208	S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
	7209	{
	7210	/* Caches <index> for later retrieval */
	7211
	7212	PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
	7213
	7214	assert(index == 0 \|\| index < (int) _invlist_len(invlist));
	7215
	7216	*get_invlist_previous_index_addr(invlist) = index;
	7217	}
	7218
	7219	PERL_STATIC_INLINE UV
	7220	S_invlist_max(pTHX_ SV* const invlist)
	7221	{
	7222	/* Returns the maximum number of elements storable in the inversion list's
	7223	* array, without having to realloc() */
	7224
	7225	PERL_ARGS_ASSERT_INVLIST_MAX;
	7226
	7227	assert(SvTYPE(invlist) == SVt_INVLIST);
	7228
	7229	/* Assumes worst case, in which the 0 element is not counted in the
	7230	* inversion list, so subtracts 1 for that */
	7231	return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
	7232	? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
	7233	: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
	7234	}
	7235
	7236	#ifndef PERL_IN_XSUB_RE
	7237	SV*
	7238	Perl__new_invlist(pTHX_ IV initial_size)
	7239	{
	7240
	7241	/* Return a pointer to a newly constructed inversion list, with enough
	7242	* space to store 'initial_size' elements. If that number is negative, a
	7243	* system default is used instead */
	7244
	7245	SV* new_list;
	7246
	7247	if (initial_size < 0) {
	7248	initial_size = 10;
	7249	}
	7250
	7251	/* Allocate the initial space */
	7252	new_list = newSV_type(SVt_INVLIST);
	7253
	7254	/* First 1 is in case the zero element isn't in the list; second 1 is for
	7255	* trailing NUL */
	7256	SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
	7257	invlist_set_len(new_list, 0, 0);
	7258
	7259	/* Force iterinit() to be used to get iteration to work */
	7260	*get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
	7261
	7262	*get_invlist_previous_index_addr(new_list) = 0;
	7263
	7264	return new_list;
	7265	}
	7266	#endif
	7267
	7268	STATIC SV*
	7269	S__new_invlist_C_array(pTHX_ const UV* const list)
	7270	{
	7271	/* Return a pointer to a newly constructed inversion list, initialized to
	7272	* point to <list>, which has to be in the exact correct inversion list
	7273	* form, including internal fields. Thus this is a dangerous routine that
	7274	* should not be used in the wrong hands. The passed in 'list' contains
	7275	* several header fields at the beginning that are not part of the
	7276	* inversion list body proper */
	7277
	7278	const STRLEN length = (STRLEN) list[0];
	7279	const UV version_id = list[1];
	7280	const bool offset = cBOOL(list[2]);
	7281	#define HEADER_LENGTH 3
	7282	/* If any of the above changes in any way, you must change HEADER_LENGTH
	7283	* (if appropriate) and regenerate INVLIST_VERSION_ID by running
	7284	* perl -E 'say int(rand 2**31-1)'
	7285	*/
	7286	#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
	7287	data structure type, so that one being
	7288	passed in can be validated to be an
	7289	inversion list of the correct vintage.
	7290	*/
	7291
	7292	SV* invlist = newSV_type(SVt_INVLIST);
	7293
	7294	PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
	7295
	7296	if (version_id != INVLIST_VERSION_ID) {
	7297	Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
	7298	}
	7299
	7300	/* The generated array passed in includes header elements that aren't part
	7301	* of the list proper, so start it just after them */
	7302	SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
	7303
	7304	SvLEN_set(invlist, 0); /* Means we own the contents, and the system
	7305	shouldn't touch it */
	7306
	7307	*(get_invlist_offset_addr(invlist)) = offset;
	7308
	7309	/* The 'length' passed to us is the physical number of elements in the
	7310	* inversion list. But if there is an offset the logical number is one
	7311	* less than that */
	7312	invlist_set_len(invlist, length - offset, offset);
	7313
	7314	invlist_set_previous_index(invlist, 0);
	7315
	7316	/* Initialize the iteration pointer. */
	7317	invlist_iterfinish(invlist);
	7318
	7319	return invlist;
	7320	}
	7321
	7322	STATIC void
	7323	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	7324	{
	7325	/* Grow the maximum size of an inversion list */
	7326
	7327	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	7328
	7329	assert(SvTYPE(invlist) == SVt_INVLIST);
	7330
	7331	/* Add one to account for the zero element at the beginning which may not
	7332	* be counted by the calling parameters */
	7333	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
	7334	}
	7335
	7336	PERL_STATIC_INLINE void
	7337	S_invlist_trim(pTHX_ SV* const invlist)
	7338	{
	7339	PERL_ARGS_ASSERT_INVLIST_TRIM;
	7340
	7341	assert(SvTYPE(invlist) == SVt_INVLIST);
	7342
	7343	/* Change the length of the inversion list to how many entries it currently
	7344	* has */
	7345	SvPV_shrink_to_cur((SV *) invlist);
	7346	}
	7347
	7348	#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
	7349
	7350	STATIC void
	7351	S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	7352	{
	7353	/* Subject to change or removal. Append the range from 'start' to 'end' at
	7354	* the end of the inversion list. The range must be above any existing
	7355	* ones. */
	7356
	7357	UV* array;
	7358	UV max = invlist_max(invlist);
	7359	UV len = _invlist_len(invlist);
	7360	bool offset;
	7361
	7362	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	7363
	7364	if (len == 0) { /* Empty lists must be initialized */
	7365	offset = start != 0;
	7366	array = _invlist_array_init(invlist, ! offset);
	7367	}
	7368	else {
	7369	/* Here, the existing list is non-empty. The current max entry in the
	7370	* list is generally the first value not in the set, except when the
	7371	* set extends to the end of permissible values, in which case it is
	7372	* the first entry in that final set, and so this call is an attempt to
	7373	* append out-of-order */
	7374
	7375	UV final_element = len - 1;
	7376	array = invlist_array(invlist);
	7377	if (array[final_element] > start
	7378	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	7379	{
	7380	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	7381	array[final_element], start,
	7382	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	7383	}
	7384
	7385	/* Here, it is a legal append. If the new range begins with the first
	7386	* value not in the set, it is extending the set, so the new first
	7387	* value not in the set is one greater than the newly extended range.
	7388	* */
	7389	offset = *get_invlist_offset_addr(invlist);
	7390	if (array[final_element] == start) {
	7391	if (end != UV_MAX) {
	7392	array[final_element] = end + 1;
	7393	}
	7394	else {
	7395	/* But if the end is the maximum representable on the machine,
	7396	* just let the range that this would extend to have no end */
	7397	invlist_set_len(invlist, len - 1, offset);
	7398	}
	7399	return;
	7400	}
	7401	}
	7402
	7403	/* Here the new range doesn't extend any existing set. Add it */
	7404
	7405	len += 2; /* Includes an element each for the start and end of range */
	7406
	7407	/* If wll overflow the existing space, extend, which may cause the array to
	7408	* be moved */
	7409	if (max < len) {
	7410	invlist_extend(invlist, len);
	7411
	7412	/* Have to set len here to avoid assert failure in invlist_array() */
	7413	invlist_set_len(invlist, len, offset);
	7414
	7415	array = invlist_array(invlist);
	7416	}
	7417	else {
	7418	invlist_set_len(invlist, len, offset);
	7419	}
	7420
	7421	/* The next item on the list starts the range, the one after that is
	7422	* one past the new range. */
	7423	array[len - 2] = start;
	7424	if (end != UV_MAX) {
	7425	array[len - 1] = end + 1;
	7426	}
	7427	else {
	7428	/* But if the end is the maximum representable on the machine, just let
	7429	* the range have no end */
	7430	invlist_set_len(invlist, len - 1, offset);
	7431	}
	7432	}
	7433
	7434	#ifndef PERL_IN_XSUB_RE
	7435
	7436	IV
	7437	Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
	7438	{
	7439	/* Searches the inversion list for the entry that contains the input code
	7440	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	7441	* return value is the index into the list's array of the range that
	7442	* contains <cp> */
	7443
	7444	IV low = 0;
	7445	IV mid;
	7446	IV high = _invlist_len(invlist);
	7447	const IV highest_element = high - 1;
	7448	const UV* array;
	7449
	7450	PERL_ARGS_ASSERT__INVLIST_SEARCH;
	7451
	7452	/* If list is empty, return failure. */
	7453	if (high == 0) {
	7454	return -1;
	7455	}
	7456
	7457	/* (We can't get the array unless we know the list is non-empty) */
	7458	array = invlist_array(invlist);
	7459
	7460	mid = invlist_previous_index(invlist);
	7461	assert(mid >=0 && mid <= highest_element);
	7462
	7463	/* <mid> contains the cache of the result of the previous call to this
	7464	* function (0 the first time). See if this call is for the same result,
	7465	* or if it is for mid-1. This is under the theory that calls to this
	7466	* function will often be for related code points that are near each other.
	7467	* And benchmarks show that caching gives better results. We also test
	7468	* here if the code point is within the bounds of the list. These tests
	7469	* replace others that would have had to be made anyway to make sure that
	7470	* the array bounds were not exceeded, and these give us extra information
	7471	* at the same time */
	7472	if (cp >= array[mid]) {
	7473	if (cp >= array[highest_element]) {
	7474	return highest_element;
	7475	}
	7476
	7477	/* Here, array[mid] <= cp < array[highest_element]. This means that
	7478	* the final element is not the answer, so can exclude it; it also
	7479	* means that <mid> is not the final element, so can refer to 'mid + 1'
	7480	* safely */
	7481	if (cp < array[mid + 1]) {
	7482	return mid;
	7483	}
	7484	high--;
	7485	low = mid + 1;
	7486	}
	7487	else { /* cp < aray[mid] */
	7488	if (cp < array[0]) { /* Fail if outside the array */
	7489	return -1;
	7490	}
	7491	high = mid;
	7492	if (cp >= array[mid - 1]) {
	7493	goto found_entry;
	7494	}
	7495	}
	7496
	7497	/* Binary search. What we are looking for is <i> such that
	7498	* array[i] <= cp < array[i+1]
	7499	* The loop below converges on the i+1. Note that there may not be an
	7500	* (i+1)th element in the array, and things work nonetheless */
	7501	while (low < high) {
	7502	mid = (low + high) / 2;
	7503	assert(mid <= highest_element);
	7504	if (array[mid] <= cp) { /* cp >= array[mid] */
	7505	low = mid + 1;
	7506
	7507	/* We could do this extra test to exit the loop early.
	7508	if (cp < array[low]) {
	7509	return mid;
	7510	}
	7511	*/
	7512	}
	7513	else { /* cp < array[mid] */
	7514	high = mid;
	7515	}
	7516	}
	7517
	7518	found_entry:
	7519	high--;
	7520	invlist_set_previous_index(invlist, high);
	7521	return high;
	7522	}
	7523
	7524	void
	7525	Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
	7526	{
	7527	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	7528	* but is used when the swash has an inversion list. This makes this much
	7529	* faster, as it uses a binary search instead of a linear one. This is
	7530	* intimately tied to that function, and perhaps should be in utf8.c,
	7531	* except it is intimately tied to inversion lists as well. It assumes
	7532	* that <swatch> is all 0's on input */
	7533
	7534	UV current = start;
	7535	const IV len = _invlist_len(invlist);
	7536	IV i;
	7537	const UV * array;
	7538
	7539	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	7540
	7541	if (len == 0) { /* Empty inversion list */
	7542	return;
	7543	}
	7544
	7545	array = invlist_array(invlist);
	7546
	7547	/* Find which element it is */
	7548	i = _invlist_search(invlist, start);
	7549
	7550	/* We populate from <start> to <end> */
	7551	while (current < end) {
	7552	UV upper;
	7553
	7554	/* The inversion list gives the results for every possible code point
	7555	* after the first one in the list. Only those ranges whose index is
	7556	* even are ones that the inversion list matches. For the odd ones,
	7557	* and if the initial code point is not in the list, we have to skip
	7558	* forward to the next element */
	7559	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	7560	i++;
	7561	if (i >= len) { /* Finished if beyond the end of the array */
	7562	return;
	7563	}
	7564	current = array[i];
	7565	if (current >= end) { /* Finished if beyond the end of what we
	7566	are populating */
	7567	if (LIKELY(end < UV_MAX)) {
	7568	return;
	7569	}
	7570
	7571	/* We get here when the upper bound is the maximum
	7572	* representable on the machine, and we are looking for just
	7573	* that code point. Have to special case it */
	7574	i = len;
	7575	goto join_end_of_list;
	7576	}
	7577	}
	7578	assert(current >= start);
	7579
	7580	/* The current range ends one below the next one, except don't go past
	7581	* <end> */
	7582	i++;
	7583	upper = (i < len && array[i] < end) ? array[i] : end;
	7584
	7585	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	7586	* for each code point in it */
	7587	for (; current < upper; current++) {
	7588	const STRLEN offset = (STRLEN)(current - start);
	7589	swatch[offset >> 3] \|= 1 << (offset & 7);
	7590	}
	7591
	7592	join_end_of_list:
	7593
	7594	/* Quit if at the end of the list */
	7595	if (i >= len) {
	7596
	7597	/* But first, have to deal with the highest possible code point on
	7598	* the platform. The previous code assumes that <end> is one
	7599	* beyond where we want to populate, but that is impossible at the
	7600	* platform's infinity, so have to handle it specially */
	7601	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	7602	{
	7603	const STRLEN offset = (STRLEN)(end - start);
	7604	swatch[offset >> 3] \|= 1 << (offset & 7);
	7605	}
	7606	return;
	7607	}
	7608
	7609	/* Advance to the next range, which will be for code points not in the
	7610	* inversion list */
	7611	current = array[i];
	7612	}
	7613
	7614	return;
	7615	}
	7616
	7617	void
	7618	Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
	7619	{
	7620	/* Take the union of two inversion lists and point <output> to it. *output
	7621	* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
	7622	* the reference count to that list will be decremented. The first list,
	7623	* <a>, may be NULL, in which case a copy of the second list is returned.
	7624	* If <complement_b> is TRUE, the union is taken of the complement
	7625	* (inversion) of <b> instead of b itself.
	7626	*
	7627	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7628	* Richard Gillam, published by Addison-Wesley, and explained at some
	7629	* length there. The preface says to incorporate its examples into your
	7630	* code at your own risk.
	7631	*
	7632	* The algorithm is like a merge sort.
	7633	*
	7634	* XXX A potential performance improvement is to keep track as we go along
	7635	* if only one of the inputs contributes to the result, meaning the other
	7636	* is a subset of that one. In that case, we can skip the final copy and
	7637	* return the larger of the input lists, but then outside code might need
	7638	* to keep track of whether to free the input list or not */
	7639
	7640	const UV* array_a; /* a's array */
	7641	const UV* array_b;
	7642	UV len_a; /* length of a's array */
	7643	UV len_b;
	7644
	7645	SV* u; /* the resulting union */
	7646	UV* array_u;
	7647	UV len_u;
	7648
	7649	UV i_a = 0; /* current index into a's array */
	7650	UV i_b = 0;
	7651	UV i_u = 0;
	7652
	7653	/* running count, as explained in the algorithm source book; items are
	7654	* stopped accumulating and are output when the count changes to/from 0.
	7655	* The count is incremented when we start a range that's in the set, and
	7656	* decremented when we start a range that's not in the set. So its range
	7657	* is 0 to 2. Only when the count is zero is something not in the set.
	7658	*/
	7659	UV count = 0;
	7660
	7661	PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
	7662	assert(a != b);
	7663
	7664	/* If either one is empty, the union is the other one */
	7665	if (a == NULL \|\| ((len_a = _invlist_len(a)) == 0)) {
	7666	if (*output == a) {
	7667	if (a != NULL) {
	7668	SvREFCNT_dec_NN(a);
	7669	}
	7670	}
	7671	if (*output != b) {
	7672	*output = invlist_clone(b);
	7673	if (complement_b) {
	7674	_invlist_invert(*output);
	7675	}
	7676	} /* else output already = b; /
	7677	return;
	7678	}
	7679	else if ((len_b = _invlist_len(b)) == 0) {
	7680	if (*output == b) {
	7681	SvREFCNT_dec_NN(b);
	7682	}
	7683
	7684	/* The complement of an empty list is a list that has everything in it,
	7685	* so the union with <a> includes everything too */
	7686	if (complement_b) {
	7687	if (a == *output) {
	7688	SvREFCNT_dec_NN(a);
	7689	}
	7690	*output = _new_invlist(1);
	7691	_append_range_to_invlist(*output, 0, UV_MAX);
	7692	}
	7693	else if (*output != a) {
	7694	*output = invlist_clone(a);
	7695	}
	7696	/* else output already = a; /
	7697	return;
	7698	}
	7699
	7700	/* Here both lists exist and are non-empty */
	7701	array_a = invlist_array(a);
	7702	array_b = invlist_array(b);
	7703
	7704	/* If are to take the union of 'a' with the complement of b, set it
	7705	* up so are looking at b's complement. */
	7706	if (complement_b) {
	7707
	7708	/* To complement, we invert: if the first element is 0, remove it. To
	7709	* do this, we just pretend the array starts one later */
	7710	if (array_b[0] == 0) {
	7711	array_b++;
	7712	len_b--;
	7713	}
	7714	else {
	7715
	7716	/* But if the first element is not zero, we pretend the list starts
	7717	* at the 0 that is always stored immediately before the array. */
	7718	array_b--;
	7719	len_b++;
	7720	}
	7721	}
	7722
	7723	/* Size the union for the worst case: that the sets are completely
	7724	* disjoint */
	7725	u = _new_invlist(len_a + len_b);
	7726
	7727	/* Will contain U+0000 if either component does */
	7728	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	7729	\|\| (len_b > 0 && array_b[0] == 0));
	7730
	7731	/* Go through each list item by item, stopping when exhausted one of
	7732	* them */
	7733	while (i_a < len_a && i_b < len_b) {
	7734	UV cp; /* The element to potentially add to the union's array */
	7735	bool cp_in_set; /* is it in the the input list's set or not */
	7736
	7737	/* We need to take one or the other of the two inputs for the union.
	7738	* Since we are merging two sorted lists, we take the smaller of the
	7739	* next items. In case of a tie, we take the one that is in its set
	7740	* first. If we took one not in the set first, it would decrement the
	7741	* count, possibly to 0 which would cause it to be output as ending the
	7742	* range, and the next time through we would take the same number, and
	7743	* output it again as beginning the next range. By doing it the
	7744	* opposite way, there is no possibility that the count will be
	7745	* momentarily decremented to 0, and thus the two adjoining ranges will
	7746	* be seamlessly merged. (In a tie and both are in the set or both not
	7747	* in the set, it doesn't matter which we take first.) */
	7748	if (array_a[i_a] < array_b[i_b]
	7749	\|\| (array_a[i_a] == array_b[i_b]
	7750	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7751	{
	7752	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7753	cp= array_a[i_a++];
	7754	}
	7755	else {
	7756	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7757	cp = array_b[i_b++];
	7758	}
	7759
	7760	/* Here, have chosen which of the two inputs to look at. Only output
	7761	* if the running count changes to/from 0, which marks the
	7762	* beginning/end of a range in that's in the set */
	7763	if (cp_in_set) {
	7764	if (count == 0) {
	7765	array_u[i_u++] = cp;
	7766	}
	7767	count++;
	7768	}
	7769	else {
	7770	count--;
	7771	if (count == 0) {
	7772	array_u[i_u++] = cp;
	7773	}
	7774	}
	7775	}
	7776
	7777	/* Here, we are finished going through at least one of the lists, which
	7778	* means there is something remaining in at most one. We check if the list
	7779	* that hasn't been exhausted is positioned such that we are in the middle
	7780	* of a range in its set or not. (i_a and i_b point to the element beyond
	7781	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	7782	* is potentially more to output.
	7783	* There are four cases:
	7784	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	7785	* in the union is entirely from the non-exhausted set.
	7786	* 2) Both were in their sets, count is 2. Nothing further should
	7787	* be output, as everything that remains will be in the exhausted
	7788	* list's set, hence in the union; decrementing to 1 but not 0 insures
	7789	* that
	7790	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	7791	* Nothing further should be output because the union includes
	7792	* everything from the exhausted set. Not decrementing ensures that.
	7793	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	7794	* decrementing to 0 insures that we look at the remainder of the
	7795	* non-exhausted set */
	7796	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7797	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7798	{
	7799	count--;
	7800	}
	7801
	7802	/* The final length is what we've output so far, plus what else is about to
	7803	* be output. (If 'count' is non-zero, then the input list we exhausted
	7804	* has everything remaining up to the machine's limit in its set, and hence
	7805	* in the union, so there will be no further output. */
	7806	len_u = i_u;
	7807	if (count == 0) {
	7808	/* At most one of the subexpressions will be non-zero */
	7809	len_u += (len_a - i_a) + (len_b - i_b);
	7810	}
	7811
	7812	/* Set result to final length, which can change the pointer to array_u, so
	7813	* re-find it */
	7814	if (len_u != _invlist_len(u)) {
	7815	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
	7816	invlist_trim(u);
	7817	array_u = invlist_array(u);
	7818	}
	7819
	7820	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	7821	* the other) ended with everything above it not in its set. That means
	7822	* that the remaining part of the union is precisely the same as the
	7823	* non-exhausted list, so can just copy it unchanged. (If both list were
	7824	* exhausted at the same time, then the operations below will be both 0.)
	7825	*/
	7826	if (count == 0) {
	7827	IV copy_count; /* At most one will have a non-zero copy count */
	7828	if ((copy_count = len_a - i_a) > 0) {
	7829	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	7830	}
	7831	else if ((copy_count = len_b - i_b) > 0) {
	7832	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	7833	}
	7834	}
	7835
	7836	/* We may be removing a reference to one of the inputs */
	7837	if (a == output \|\| b == output) {
	7838	assert(! invlist_is_iterating(*output));
	7839	SvREFCNT_dec_NN(*output);
	7840	}
	7841
	7842	*output = u;
	7843	return;
	7844	}
	7845
	7846	void
	7847	Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
	7848	{
	7849	/* Take the intersection of two inversion lists and point <i> to it. *i
	7850	* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
	7851	* the reference count to that list will be decremented.
	7852	* If <complement_b> is TRUE, the result will be the intersection of <a>
	7853	* and the complement (or inversion) of <b> instead of <b> directly.
	7854	*
	7855	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7856	* Richard Gillam, published by Addison-Wesley, and explained at some
	7857	* length there. The preface says to incorporate its examples into your
	7858	* code at your own risk. In fact, it had bugs
	7859	*
	7860	* The algorithm is like a merge sort, and is essentially the same as the
	7861	* union above
	7862	*/
	7863
	7864	const UV* array_a; /* a's array */
	7865	const UV* array_b;
	7866	UV len_a; /* length of a's array */
	7867	UV len_b;
	7868
	7869	SV* r; /* the resulting intersection */
	7870	UV* array_r;
	7871	UV len_r;
	7872
	7873	UV i_a = 0; /* current index into a's array */
	7874	UV i_b = 0;
	7875	UV i_r = 0;
	7876
	7877	/* running count, as explained in the algorithm source book; items are
	7878	* stopped accumulating and are output when the count changes to/from 2.
	7879	* The count is incremented when we start a range that's in the set, and
	7880	* decremented when we start a range that's not in the set. So its range
	7881	* is 0 to 2. Only when the count is 2 is something in the intersection.
	7882	*/
	7883	UV count = 0;
	7884
	7885	PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
	7886	assert(a != b);
	7887
	7888	/* Special case if either one is empty */
	7889	len_a = (a == NULL) ? 0 : _invlist_len(a);
	7890	if ((len_a == 0) \|\| ((len_b = _invlist_len(b)) == 0)) {
	7891
	7892	if (len_a != 0 && complement_b) {
	7893
	7894	/* Here, 'a' is not empty, therefore from the above 'if', 'b' must
	7895	* be empty. Here, also we are using 'b's complement, which hence
	7896	* must be every possible code point. Thus the intersection is
	7897	* simply 'a'. */
	7898	if (*i != a) {
	7899	if (*i == b) {
	7900	SvREFCNT_dec_NN(b);
	7901	}
	7902
	7903	*i = invlist_clone(a);
	7904	}
	7905	/* else i is already 'a' /
	7906	return;
	7907	}
	7908
	7909	/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
	7910	* intersection must be empty */
	7911	if (*i == a) {
	7912	SvREFCNT_dec_NN(a);
	7913	}
	7914	else if (*i == b) {
	7915	SvREFCNT_dec_NN(b);
	7916	}
	7917	*i = _new_invlist(0);
	7918	return;
	7919	}
	7920
	7921	/* Here both lists exist and are non-empty */
	7922	array_a = invlist_array(a);
	7923	array_b = invlist_array(b);
	7924
	7925	/* If are to take the intersection of 'a' with the complement of b, set it
	7926	* up so are looking at b's complement. */
	7927	if (complement_b) {
	7928
	7929	/* To complement, we invert: if the first element is 0, remove it. To
	7930	* do this, we just pretend the array starts one later */
	7931	if (array_b[0] == 0) {
	7932	array_b++;
	7933	len_b--;
	7934	}
	7935	else {
	7936
	7937	/* But if the first element is not zero, we pretend the list starts
	7938	* at the 0 that is always stored immediately before the array. */
	7939	array_b--;
	7940	len_b++;
	7941	}
	7942	}
	7943
	7944	/* Size the intersection for the worst case: that the intersection ends up
	7945	* fragmenting everything to be completely disjoint */
	7946	r= _new_invlist(len_a + len_b);
	7947
	7948	/* Will contain U+0000 iff both components do */
	7949	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	7950	&& len_b > 0 && array_b[0] == 0);
	7951
	7952	/* Go through each list item by item, stopping when exhausted one of
	7953	* them */
	7954	while (i_a < len_a && i_b < len_b) {
	7955	UV cp; /* The element to potentially add to the intersection's
	7956	array */
	7957	bool cp_in_set; /* Is it in the input list's set or not */
	7958
	7959	/* We need to take one or the other of the two inputs for the
	7960	* intersection. Since we are merging two sorted lists, we take the
	7961	* smaller of the next items. In case of a tie, we take the one that
	7962	* is not in its set first (a difference from the union algorithm). If
	7963	* we took one in the set first, it would increment the count, possibly
	7964	* to 2 which would cause it to be output as starting a range in the
	7965	* intersection, and the next time through we would take that same
	7966	* number, and output it again as ending the set. By doing it the
	7967	* opposite of this, there is no possibility that the count will be
	7968	* momentarily incremented to 2. (In a tie and both are in the set or
	7969	* both not in the set, it doesn't matter which we take first.) */
	7970	if (array_a[i_a] < array_b[i_b]
	7971	\|\| (array_a[i_a] == array_b[i_b]
	7972	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7973	{
	7974	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7975	cp= array_a[i_a++];
	7976	}
	7977	else {
	7978	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7979	cp= array_b[i_b++];
	7980	}
	7981
	7982	/* Here, have chosen which of the two inputs to look at. Only output
	7983	* if the running count changes to/from 2, which marks the
	7984	* beginning/end of a range that's in the intersection */
	7985	if (cp_in_set) {
	7986	count++;
	7987	if (count == 2) {
	7988	array_r[i_r++] = cp;
	7989	}
	7990	}
	7991	else {
	7992	if (count == 2) {
	7993	array_r[i_r++] = cp;
	7994	}
	7995	count--;
	7996	}
	7997	}
	7998
	7999	/* Here, we are finished going through at least one of the lists, which
	8000	* means there is something remaining in at most one. We check if the list
	8001	* that has been exhausted is positioned such that we are in the middle
	8002	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	8003	* the ones we care about.) There are four cases:
	8004	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	8005	* nothing left in the intersection.
	8006	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	8007	* above 2. What should be output is exactly that which is in the
	8008	* non-exhausted set, as everything it has is also in the intersection
	8009	* set, and everything it doesn't have can't be in the intersection
	8010	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	8011	* gets incremented to 2. Like the previous case, the intersection is
	8012	* everything that remains in the non-exhausted set.
	8013	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	8014	* remains 1. And the intersection has nothing more. */
	8015	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	8016	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	8017	{
	8018	count++;
	8019	}
	8020
	8021	/* The final length is what we've output so far plus what else is in the
	8022	* intersection. At most one of the subexpressions below will be non-zero */
	8023	len_r = i_r;
	8024	if (count >= 2) {
	8025	len_r += (len_a - i_a) + (len_b - i_b);
	8026	}
	8027
	8028	/* Set result to final length, which can change the pointer to array_r, so
	8029	* re-find it */
	8030	if (len_r != _invlist_len(r)) {
	8031	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
	8032	invlist_trim(r);
	8033	array_r = invlist_array(r);
	8034	}
	8035
	8036	/* Finish outputting any remaining */
	8037	if (count >= 2) { /* At most one will have a non-zero copy count */
	8038	IV copy_count;
	8039	if ((copy_count = len_a - i_a) > 0) {
	8040	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	8041	}
	8042	else if ((copy_count = len_b - i_b) > 0) {
	8043	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	8044	}
	8045	}
	8046
	8047	/* We may be removing a reference to one of the inputs */
	8048	if (a == i \|\| b == i) {
	8049	assert(! invlist_is_iterating(*i));
	8050	SvREFCNT_dec_NN(*i);
	8051	}
	8052
	8053	*i = r;
	8054	return;
	8055	}
	8056
	8057	SV*
	8058	Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	8059	{
	8060	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	8061	* set. A pointer to the inversion list is returned. This may actually be
	8062	* a new list, in which case the passed in one has been destroyed. The
	8063	* passed in inversion list can be NULL, in which case a new one is created
	8064	* with just the one range in it */
	8065
	8066	SV* range_invlist;
	8067	UV len;
	8068
	8069	if (invlist == NULL) {
	8070	invlist = _new_invlist(2);
	8071	len = 0;
	8072	}
	8073	else {
	8074	len = _invlist_len(invlist);
	8075	}
	8076
	8077	/* If comes after the final entry actually in the list, can just append it
	8078	* to the end, */
	8079	if (len == 0
	8080	\|\| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
	8081	&& start >= invlist_array(invlist)[len - 1]))
	8082	{
	8083	_append_range_to_invlist(invlist, start, end);
	8084	return invlist;
	8085	}
	8086
	8087	/* Here, can't just append things, create and return a new inversion list
	8088	* which is the union of this range and the existing inversion list */
	8089	range_invlist = _new_invlist(2);
	8090	_append_range_to_invlist(range_invlist, start, end);
	8091
	8092	_invlist_union(invlist, range_invlist, &invlist);
	8093
	8094	/* The temporary can be freed */
	8095	SvREFCNT_dec_NN(range_invlist);
	8096
	8097	return invlist;
	8098	}
	8099
	8100	#endif
	8101
	8102	PERL_STATIC_INLINE SV*
	8103	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	8104	return _add_range_to_invlist(invlist, cp, cp);
	8105	}
	8106
	8107	#ifndef PERL_IN_XSUB_RE
	8108	void
	8109	Perl__invlist_invert(pTHX_ SV* const invlist)
	8110	{
	8111	/* Complement the input inversion list. This adds a 0 if the list didn't
	8112	* have a zero; removes it otherwise. As described above, the data
	8113	* structure is set up so that this is very efficient */
	8114
	8115	PERL_ARGS_ASSERT__INVLIST_INVERT;
	8116
	8117	assert(! invlist_is_iterating(invlist));
	8118
	8119	/* The inverse of matching nothing is matching everything */
	8120	if (_invlist_len(invlist) == 0) {
	8121	_append_range_to_invlist(invlist, 0, UV_MAX);
	8122	return;
	8123	}
	8124
	8125	get_invlist_offset_addr(invlist) = ! get_invlist_offset_addr(invlist);
	8126	}
	8127
	8128	void
	8129	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	8130	{
	8131	/* Complement the input inversion list (which must be a Unicode property,
	8132	* all of which don't match above the Unicode maximum code point.) And
	8133	* Perl has chosen to not have the inversion match above that either. This
	8134	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	8135	*/
	8136
	8137	UV len;
	8138	UV* array;
	8139
	8140	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	8141
	8142	_invlist_invert(invlist);
	8143
	8144	len = _invlist_len(invlist);
	8145
	8146	if (len != 0) { /* If empty do nothing */
	8147	array = invlist_array(invlist);
	8148	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	8149	/* Add 0x110000. First, grow if necessary */
	8150	len++;
	8151	if (invlist_max(invlist) < len) {
	8152	invlist_extend(invlist, len);
	8153	array = invlist_array(invlist);
	8154	}
	8155	invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
	8156	array[len - 1] = PERL_UNICODE_MAX + 1;
	8157	}
	8158	else { /* Remove the 0x110000 */
	8159	invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
	8160	}
	8161	}
	8162
	8163	return;
	8164	}
	8165	#endif
	8166
	8167	PERL_STATIC_INLINE SV*
	8168	S_invlist_clone(pTHX_ SV* const invlist)
	8169	{
	8170
	8171	/* Return a new inversion list that is a copy of the input one, which is
	8172	* unchanged */
	8173
	8174	/* Need to allocate extra space to accommodate Perl's addition of a
	8175	* trailing NUL to SvPV's, since it thinks they are always strings */
	8176	SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
	8177	STRLEN physical_length = SvCUR(invlist);
	8178	bool offset = *(get_invlist_offset_addr(invlist));
	8179
	8180	PERL_ARGS_ASSERT_INVLIST_CLONE;
	8181
	8182	*(get_invlist_offset_addr(new_invlist)) = offset;
	8183	invlist_set_len(new_invlist, _invlist_len(invlist), offset);
	8184	Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
	8185
	8186	return new_invlist;
	8187	}
	8188
	8189	PERL_STATIC_INLINE STRLEN*
	8190	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	8191	{
	8192	/* Return the address of the UV that contains the current iteration
	8193	* position */
	8194
	8195	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	8196
	8197	assert(SvTYPE(invlist) == SVt_INVLIST);
	8198
	8199	return &(((XINVLIST*) SvANY(invlist))->iterator);
	8200	}
	8201
	8202	PERL_STATIC_INLINE void
	8203	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	8204	{
	8205	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	8206
	8207	*get_invlist_iter_addr(invlist) = 0;
	8208	}
	8209
	8210	PERL_STATIC_INLINE void
	8211	S_invlist_iterfinish(pTHX_ SV* invlist)
	8212	{
	8213	/* Terminate iterator for invlist. This is to catch development errors.
	8214	* Any iteration that is interrupted before completed should call this
	8215	* function. Functions that add code points anywhere else but to the end
	8216	* of an inversion list assert that they are not in the middle of an
	8217	* iteration. If they were, the addition would make the iteration
	8218	* problematical: if the iteration hadn't reached the place where things
	8219	* were being added, it would be ok */
	8220
	8221	PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
	8222
	8223	*get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
	8224	}
	8225
	8226	STATIC bool
	8227	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	8228	{
	8229	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	8230	* This call sets in <start> and <end>, the next range in <invlist>.
	8231	* Returns <TRUE> if successful and the next call will return the next
	8232	* range; <FALSE> if was already at the end of the list. If the latter,
	8233	* <start> and <end> are unchanged, and the next call to this function
	8234	* will start over at the beginning of the list */
	8235
	8236	STRLEN* pos = get_invlist_iter_addr(invlist);
	8237	UV len = _invlist_len(invlist);
	8238	UV *array;
	8239
	8240	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	8241
	8242	if (*pos >= len) {
	8243	pos = (STRLEN) UV_MAX; / Force iterinit() to be required next time */
	8244	return FALSE;
	8245	}
	8246
	8247	array = invlist_array(invlist);
	8248
	8249	start = array[(pos)++];
	8250
	8251	if (*pos >= len) {
	8252	*end = UV_MAX;
	8253	}
	8254	else {
	8255	end = array[(pos)++] - 1;
	8256	}
	8257
	8258	return TRUE;
	8259	}
	8260
	8261	PERL_STATIC_INLINE bool
	8262	S_invlist_is_iterating(pTHX_ SV* const invlist)
	8263	{
	8264	PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
	8265
	8266	return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
	8267	}
	8268
	8269	PERL_STATIC_INLINE UV
	8270	S_invlist_highest(pTHX_ SV* const invlist)
	8271	{
	8272	/* Returns the highest code point that matches an inversion list. This API
	8273	* has an ambiguity, as it returns 0 under either the highest is actually
	8274	* 0, or if the list is empty. If this distinction matters to you, check
	8275	* for emptiness before calling this function */
	8276
	8277	UV len = _invlist_len(invlist);
	8278	UV *array;
	8279
	8280	PERL_ARGS_ASSERT_INVLIST_HIGHEST;
	8281
	8282	if (len == 0) {
	8283	return 0;
	8284	}
	8285
	8286	array = invlist_array(invlist);
	8287
	8288	/* The last element in the array in the inversion list always starts a
	8289	* range that goes to infinity. That range may be for code points that are
	8290	* matched in the inversion list, or it may be for ones that aren't
	8291	* matched. In the latter case, the highest code point in the set is one
	8292	* less than the beginning of this range; otherwise it is the final element
	8293	* of this range: infinity */
	8294	return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
	8295	? UV_MAX
	8296	: array[len - 1] - 1;
	8297	}
	8298
	8299	#ifndef PERL_IN_XSUB_RE
	8300	SV *
	8301	Perl__invlist_contents(pTHX_ SV* const invlist)
	8302	{
	8303	/* Get the contents of an inversion list into a string SV so that they can
	8304	* be printed out. It uses the format traditionally done for debug tracing
	8305	*/
	8306
	8307	UV start, end;
	8308	SV* output = newSVpvs("\n");
	8309
	8310	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	8311
	8312	assert(! invlist_is_iterating(invlist));
	8313
	8314	invlist_iterinit(invlist);
	8315	while (invlist_iternext(invlist, &start, &end)) {
	8316	if (end == UV_MAX) {
	8317	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	8318	}
	8319	else if (end != start) {
	8320	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	8321	start, end);
	8322	}
	8323	else {
	8324	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	8325	}
	8326	}
	8327
	8328	return output;
	8329	}
	8330	#endif
	8331
	8332	#ifndef PERL_IN_XSUB_RE
	8333	void
	8334	Perl__invlist_dump(pTHX_ PerlIO file, I32 level, const char const indent, SV* const invlist)
	8335	{
	8336	/* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
	8337	* inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
	8338	* the string 'indent'. The output looks like this:
	8339	[0] 0x000A .. 0x000D
	8340	[2] 0x0085
	8341	[4] 0x2028 .. 0x2029
	8342	[6] 0x3104 .. INFINITY
	8343	* This means that the first range of code points matched by the list are
	8344	* 0xA through 0xD; the second range contains only the single code point
	8345	* 0x85, etc. An inversion list is an array of UVs. Two array elements
	8346	* are used to define each range (except if the final range extends to
	8347	* infinity, only a single element is needed). The array index of the
	8348	* first element for the corresponding range is given in brackets. */
	8349
	8350	UV start, end;
	8351	STRLEN count = 0;
	8352
	8353	PERL_ARGS_ASSERT__INVLIST_DUMP;
	8354
	8355	if (invlist_is_iterating(invlist)) {
	8356	Perl_dump_indent(aTHX_ level, file,
	8357	"%sCan't dump inversion list because is in middle of iterating\n",
	8358	indent);
	8359	return;
	8360	}
	8361
	8362	invlist_iterinit(invlist);
	8363	while (invlist_iternext(invlist, &start, &end)) {
	8364	if (end == UV_MAX) {
	8365	Perl_dump_indent(aTHX_ level, file,
	8366	"%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
	8367	indent, (UV)count, start);
	8368	}
	8369	else if (end != start) {
	8370	Perl_dump_indent(aTHX_ level, file,
	8371	"%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
	8372	indent, (UV)count, start, end);
	8373	}
	8374	else {
	8375	Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
	8376	indent, (UV)count, start);
	8377	}
	8378	count += 2;
	8379	}
	8380	}
	8381	#endif
	8382
	8383	#ifdef PERL_ARGS_ASSERT__INVLISTEQ
	8384	bool
	8385	S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
	8386	{
	8387	/* Return a boolean as to if the two passed in inversion lists are
	8388	* identical. The final argument, if TRUE, says to take the complement of
	8389	* the second inversion list before doing the comparison */
	8390
	8391	const UV* array_a = invlist_array(a);
	8392	const UV* array_b = invlist_array(b);
	8393	UV len_a = _invlist_len(a);
	8394	UV len_b = _invlist_len(b);
	8395
	8396	UV i = 0; /* current index into the arrays */
	8397	bool retval = TRUE; /* Assume are identical until proven otherwise */
	8398
	8399	PERL_ARGS_ASSERT__INVLISTEQ;
	8400
	8401	/* If are to compare 'a' with the complement of b, set it
	8402	* up so are looking at b's complement. */
	8403	if (complement_b) {
	8404
	8405	/* The complement of nothing is everything, so <a> would have to have
	8406	* just one element, starting at zero (ending at infinity) */
	8407	if (len_b == 0) {
	8408	return (len_a == 1 && array_a[0] == 0);
	8409	}
	8410	else if (array_b[0] == 0) {
	8411
	8412	/* Otherwise, to complement, we invert. Here, the first element is
	8413	* 0, just remove it. To do this, we just pretend the array starts
	8414	* one later */
	8415
	8416	array_b++;
	8417	len_b--;
	8418	}
	8419	else {
	8420
	8421	/* But if the first element is not zero, we pretend the list starts
	8422	* at the 0 that is always stored immediately before the array. */
	8423	array_b--;
	8424	len_b++;
	8425	}
	8426	}
	8427
	8428	/* Make sure that the lengths are the same, as well as the final element
	8429	* before looping through the remainder. (Thus we test the length, final,
	8430	* and first elements right off the bat) */
	8431	if (len_a != len_b \|\| array_a[len_a-1] != array_b[len_a-1]) {
	8432	retval = FALSE;
	8433	}
	8434	else for (i = 0; i < len_a - 1; i++) {
	8435	if (array_a[i] != array_b[i]) {
	8436	retval = FALSE;
	8437	break;
	8438	}
	8439	}
	8440
	8441	return retval;
	8442	}
	8443	#endif
	8444
	8445	#undef HEADER_LENGTH
	8446	#undef TO_INTERNAL_SIZE
	8447	#undef FROM_INTERNAL_SIZE
	8448	#undef INVLIST_VERSION_ID
	8449
	8450	/* End of inversion list object */
	8451
	8452	STATIC void
	8453	S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
	8454	{
	8455	/* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
	8456	* constructs, and updates RExC_flags with them. On input, RExC_parse
	8457	* should point to the first flag; it is updated on output to point to the
	8458	* final ')' or ':'. There needs to be at least one flag, or this will
	8459	* abort */
	8460
	8461	/* for (?g), (?gc), and (?o) warnings; warning
	8462	about (?c) will warn about (?g) -- japhy */
	8463
	8464	#define WASTED_O 0x01
	8465	#define WASTED_G 0x02
	8466	#define WASTED_C 0x04
	8467	#define WASTED_GC (WASTED_G\|WASTED_C)
	8468	I32 wastedflags = 0x00;
	8469	U32 posflags = 0, negflags = 0;
	8470	U32 *flagsp = &posflags;
	8471	char has_charset_modifier = '\0';
	8472	regex_charset cs;
	8473	bool has_use_defaults = FALSE;
	8474	const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
	8475
	8476	PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
	8477
	8478	/* '^' as an initial flag sets certain defaults */
	8479	if (UCHARAT(RExC_parse) == '^') {
	8480	RExC_parse++;
	8481	has_use_defaults = TRUE;
	8482	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	8483	set_regex_charset(&RExC_flags, (RExC_utf8 \|\| RExC_uni_semantics)
	8484	? REGEX_UNICODE_CHARSET
	8485	: REGEX_DEPENDS_CHARSET);
	8486	}
	8487
	8488	cs = get_regex_charset(RExC_flags);
	8489	if (cs == REGEX_DEPENDS_CHARSET
	8490	&& (RExC_utf8 \|\| RExC_uni_semantics))
	8491	{
	8492	cs = REGEX_UNICODE_CHARSET;
	8493	}
	8494
	8495	while (*RExC_parse) {
	8496	/* && strchr("iogcmsx", RExC_parse) /
	8497	/* (?g), (?gc) and (?o) are useless here
	8498	and must be globally applied -- japhy */
	8499	switch (*RExC_parse) {
	8500
	8501	/* Code for the imsx flags */
	8502	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	8503
	8504	case LOCALE_PAT_MOD:
	8505	if (has_charset_modifier) {
	8506	goto excess_modifier;
	8507	}
	8508	else if (flagsp == &negflags) {
	8509	goto neg_modifier;
	8510	}
	8511	cs = REGEX_LOCALE_CHARSET;
	8512	has_charset_modifier = LOCALE_PAT_MOD;
	8513	RExC_contains_locale = 1;
	8514	break;
	8515	case UNICODE_PAT_MOD:
	8516	if (has_charset_modifier) {
	8517	goto excess_modifier;
	8518	}
	8519	else if (flagsp == &negflags) {
	8520	goto neg_modifier;
	8521	}
	8522	cs = REGEX_UNICODE_CHARSET;
	8523	has_charset_modifier = UNICODE_PAT_MOD;
	8524	break;
	8525	case ASCII_RESTRICT_PAT_MOD:
	8526	if (flagsp == &negflags) {
	8527	goto neg_modifier;
	8528	}
	8529	if (has_charset_modifier) {
	8530	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	8531	goto excess_modifier;
	8532	}
	8533	/* Doubled modifier implies more restricted */
	8534	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	8535	}
	8536	else {
	8537	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	8538	}
	8539	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	8540	break;
	8541	case DEPENDS_PAT_MOD:
	8542	if (has_use_defaults) {
	8543	goto fail_modifiers;
	8544	}
	8545	else if (flagsp == &negflags) {
	8546	goto neg_modifier;
	8547	}
	8548	else if (has_charset_modifier) {
	8549	goto excess_modifier;
	8550	}
	8551
	8552	/* The dual charset means unicode semantics if the
	8553	* pattern (or target, not known until runtime) are
	8554	* utf8, or something in the pattern indicates unicode
	8555	* semantics */
	8556	cs = (RExC_utf8 \|\| RExC_uni_semantics)
	8557	? REGEX_UNICODE_CHARSET
	8558	: REGEX_DEPENDS_CHARSET;
	8559	has_charset_modifier = DEPENDS_PAT_MOD;
	8560	break;
	8561	excess_modifier:
	8562	RExC_parse++;
	8563	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	8564	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	8565	}
	8566	else if (has_charset_modifier == *(RExC_parse - 1)) {
	8567	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	8568	}
	8569	else {
	8570	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	8571	}
	8572	/NOTREACHED/
	8573	neg_modifier:
	8574	RExC_parse++;
	8575	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	8576	/NOTREACHED/
	8577	case ONCE_PAT_MOD: /* 'o' */
	8578	case GLOBAL_PAT_MOD: /* 'g' */
	8579	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8580	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	8581	if (! (wastedflags & wflagbit) ) {
	8582	wastedflags \|= wflagbit;
	8583	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	8584	vWARN5(
	8585	RExC_parse + 1,
	8586	"Useless (%s%c) - %suse /%c modifier",
	8587	flagsp == &negflags ? "?-" : "?",
	8588	*RExC_parse,
	8589	flagsp == &negflags ? "don't " : "",
	8590	*RExC_parse
	8591	);
	8592	}
	8593	}
	8594	break;
	8595
	8596	case CONTINUE_PAT_MOD: /* 'c' */
	8597	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	8598	if (! (wastedflags & WASTED_C) ) {
	8599	wastedflags \|= WASTED_GC;
	8600	/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
	8601	vWARN3(
	8602	RExC_parse + 1,
	8603	"Useless (%sc) - %suse /gc modifier",
	8604	flagsp == &negflags ? "?-" : "?",
	8605	flagsp == &negflags ? "don't " : ""
	8606	);
	8607	}
	8608	}
	8609	break;
	8610	case KEEPCOPY_PAT_MOD: /* 'p' */
	8611	if (flagsp == &negflags) {
	8612	if (SIZE_ONLY)
	8613	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	8614	} else {
	8615	*flagsp \|= RXf_PMf_KEEPCOPY;
	8616	}
	8617	break;
	8618	case '-':
	8619	/* A flag is a default iff it is following a minus, so
	8620	* if there is a minus, it means will be trying to
	8621	* re-specify a default which is an error */
	8622	if (has_use_defaults \|\| flagsp == &negflags) {
	8623	goto fail_modifiers;
	8624	}
	8625	flagsp = &negflags;
	8626	wastedflags = 0; /* reset so (?g-c) warns twice */
	8627	break;
	8628	case ':':
	8629	case ')':
	8630	RExC_flags \|= posflags;
	8631	RExC_flags &= ~negflags;
	8632	set_regex_charset(&RExC_flags, cs);
	8633	return;
	8634	/NOTREACHED/
	8635	default:
	8636	fail_modifiers:
	8637	RExC_parse++;
	8638	vFAIL3("Sequence (%.*s...) not recognized",
	8639	RExC_parse-seqstart, seqstart);
	8640	/NOTREACHED/
	8641	}
	8642
	8643	++RExC_parse;
	8644	}
	8645	}
	8646
	8647	/*
	8648	- reg - regular expression, i.e. main body or parenthesized thing
	8649	*
	8650	* Caller must absorb opening parenthesis.
	8651	*
	8652	* Combining parenthesis handling with the base level of regular expression
	8653	* is a trifle forced, but the need to tie the tails of the branches to what
	8654	* follows makes it hard to avoid.
	8655	*/
	8656	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	8657	#ifdef DEBUGGING
	8658	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	8659	#else
	8660	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	8661	#endif
	8662
	8663	/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
	8664	flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
	8665	needs to be restarted.
	8666	Otherwise would only return NULL if regbranch() returns NULL, which
	8667	cannot happen. */
	8668	STATIC regnode *
	8669	S_reg(pTHX_ RExC_state_t pRExC_state, I32 paren, I32 flagp,U32 depth)
	8670	/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
	8671	* 2 is like 1, but indicates that nextchar() has been called to advance
	8672	* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
	8673	* this flag alerts us to the need to check for that */
	8674	{
	8675	dVAR;
	8676	regnode ret; / Will be the head of the group. */
	8677	regnode *br;
	8678	regnode *lastbr;
	8679	regnode *ender = NULL;
	8680	I32 parno = 0;
	8681	I32 flags;
	8682	U32 oregflags = RExC_flags;
	8683	bool have_branch = 0;
	8684	bool is_open = 0;
	8685	I32 freeze_paren = 0;
	8686	I32 after_freeze = 0;
	8687
	8688	char * parse_start = RExC_parse; /* MJD */
	8689	char * const oregcomp_parse = RExC_parse;
	8690
	8691	GET_RE_DEBUG_FLAGS_DECL;
	8692
	8693	PERL_ARGS_ASSERT_REG;
	8694	DEBUG_PARSE("reg ");
	8695
	8696	flagp = 0; / Tentatively. */
	8697
	8698
	8699	/* Make an OPEN node, if parenthesized. */
	8700	if (paren) {
	8701
	8702	/* Under /x, space and comments can be gobbled up between the '(' and
	8703	* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
	8704	* intervening space, as the sequence is a token, and a token should be
	8705	* indivisible */
	8706	bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
	8707
	8708	if ( RExC_parse == '') { /* (VERB:ARG) /
	8709	char *start_verb = RExC_parse;
	8710	STRLEN verb_len = 0;
	8711	char *start_arg = NULL;
	8712	unsigned char op = 0;
	8713	int argok = 1;
	8714	int internal_argval = 0; /* internal_argval is only useful if !argok */
	8715
	8716	if (has_intervening_patws && SIZE_ONLY) {
	8717	ckWARNregdep(RExC_parse + 1, "In '(VERB...)', splitting the initial '(' is deprecated");
	8718	}
	8719	while ( RExC_parse && RExC_parse != ')' ) {
	8720	if ( *RExC_parse == ':' ) {
	8721	start_arg = RExC_parse + 1;
	8722	break;
	8723	}
	8724	RExC_parse++;
	8725	}
	8726	++start_verb;
	8727	verb_len = RExC_parse - start_verb;
	8728	if ( start_arg ) {
	8729	RExC_parse++;
	8730	while ( RExC_parse && RExC_parse != ')' )
	8731	RExC_parse++;
	8732	if ( *RExC_parse != ')' )
	8733	vFAIL("Unterminated verb pattern argument");
	8734	if ( RExC_parse == start_arg )
	8735	start_arg = NULL;
	8736	} else {
	8737	if ( *RExC_parse != ')' )
	8738	vFAIL("Unterminated verb pattern");
	8739	}
	8740
	8741	switch ( *start_verb ) {
	8742	case 'A': /* (ACCEPT) /
	8743	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	8744	op = ACCEPT;
	8745	internal_argval = RExC_nestroot;
	8746	}
	8747	break;
	8748	case 'C': /* (COMMIT) /
	8749	if ( memEQs(start_verb,verb_len,"COMMIT") )
	8750	op = COMMIT;
	8751	break;
	8752	case 'F': /* (FAIL) /
	8753	if ( verb_len==1 \|\| memEQs(start_verb,verb_len,"FAIL") ) {
	8754	op = OPFAIL;
	8755	argok = 0;
	8756	}
	8757	break;
	8758	case ':': /* (:NAME) /
	8759	case 'M': /* (MARK:NAME) /
	8760	if ( verb_len==0 \|\| memEQs(start_verb,verb_len,"MARK") ) {
	8761	op = MARKPOINT;
	8762	argok = -1;
	8763	}
	8764	break;
	8765	case 'P': /* (PRUNE) /
	8766	if ( memEQs(start_verb,verb_len,"PRUNE") )
	8767	op = PRUNE;
	8768	break;
	8769	case 'S': /* (SKIP) /
	8770	if ( memEQs(start_verb,verb_len,"SKIP") )
	8771	op = SKIP;
	8772	break;
	8773	case 'T': /* (THEN) /
	8774	/* [19:06] <TimToady> :: is then */
	8775	if ( memEQs(start_verb,verb_len,"THEN") ) {
	8776	op = CUTGROUP;
	8777	RExC_seen \|= REG_SEEN_CUTGROUP;
	8778	}
	8779	break;
	8780	}
	8781	if ( ! op ) {
	8782	RExC_parse++;
	8783	vFAIL3("Unknown verb pattern '%.*s'",
	8784	verb_len, start_verb);
	8785	}
	8786	if ( argok ) {
	8787	if ( start_arg && internal_argval ) {
	8788	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8789	verb_len, start_verb);
	8790	} else if ( argok < 0 && !start_arg ) {
	8791	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	8792	verb_len, start_verb);
	8793	} else {
	8794	ret = reganode(pRExC_state, op, internal_argval);
	8795	if ( ! internal_argval && ! SIZE_ONLY ) {
	8796	if (start_arg) {
	8797	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	8798	ARG(ret) = add_data( pRExC_state, 1, "S" );
	8799	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	8800	ret->flags = 0;
	8801	} else {
	8802	ret->flags = 1;
	8803	}
	8804	}
	8805	}
	8806	if (!internal_argval)
	8807	RExC_seen \|= REG_SEEN_VERBARG;
	8808	} else if ( start_arg ) {
	8809	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8810	verb_len, start_verb);
	8811	} else {
	8812	ret = reg_node(pRExC_state, op);
	8813	}
	8814	nextchar(pRExC_state);
	8815	return ret;
	8816	}
	8817	else if (RExC_parse == '?') { / (?...) */
	8818	bool is_logical = 0;
	8819	const char * const seqstart = RExC_parse;
	8820	if (has_intervening_patws && SIZE_ONLY) {
	8821	ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
	8822	}
	8823
	8824	RExC_parse++;
	8825	paren = *RExC_parse++;
	8826	ret = NULL; /* For look-ahead/behind. */
	8827	switch (paren) {
	8828
	8829	case 'P': /* (?P...) variants for those used to PCRE/Python */
	8830	paren = *RExC_parse++;
	8831	if ( paren == '<') /* (?P<...>) named capture */
	8832	goto named_capture;
	8833	else if (paren == '>') { /* (?P>name) named recursion */
	8834	goto named_recursion;
	8835	}
	8836	else if (paren == '=') { /* (?P=...) named backref */
	8837	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	8838	you change this make sure you change that */
	8839	char* name_start = RExC_parse;
	8840	U32 num = 0;
	8841	SV *sv_dat = reg_scan_name(pRExC_state,
	8842	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8843	if (RExC_parse == name_start \|\| *RExC_parse != ')')
	8844	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8845
	8846	if (!SIZE_ONLY) {
	8847	num = add_data( pRExC_state, 1, "S" );
	8848	RExC_rxi->data->data[num]=(void*)sv_dat;
	8849	SvREFCNT_inc_simple_void(sv_dat);
	8850	}
	8851	RExC_sawback = 1;
	8852	ret = reganode(pRExC_state,
	8853	((! FOLD)
	8854	? NREF
	8855	: (ASCII_FOLD_RESTRICTED)
	8856	? NREFFA
	8857	: (AT_LEAST_UNI_SEMANTICS)
	8858	? NREFFU
	8859	: (LOC)
	8860	? NREFFL
	8861	: NREFF),
	8862	num);
	8863	*flagp \|= HASWIDTH;
	8864
	8865	Set_Node_Offset(ret, parse_start+1);
	8866	Set_Node_Cur_Length(ret, parse_start);
	8867
	8868	nextchar(pRExC_state);
	8869	return ret;
	8870	}
	8871	RExC_parse++;
	8872	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8873	/NOTREACHED/
	8874	case '<': /* (?<...) */
	8875	if (*RExC_parse == '!')
	8876	paren = ',';
	8877	else if (*RExC_parse != '=')
	8878	named_capture:
	8879	{ /* (?<...>) */
	8880	char *name_start;
	8881	SV *svname;
	8882	paren= '>';
	8883	case '\'': /* (?'...') */
	8884	name_start= RExC_parse;
	8885	svname = reg_scan_name(pRExC_state,
	8886	SIZE_ONLY ? /* reverse test from the others */
	8887	REG_RSN_RETURN_NAME :
	8888	REG_RSN_RETURN_NULL);
	8889	if (RExC_parse == name_start) {
	8890	RExC_parse++;
	8891	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8892	/NOTREACHED/
	8893	}
	8894	if (*RExC_parse != paren)
	8895	vFAIL2("Sequence (?%c... not terminated",
	8896	paren=='>' ? '<' : paren);
	8897	if (SIZE_ONLY) {
	8898	HE *he_str;
	8899	SV *sv_dat = NULL;
	8900	if (!svname) /* shouldn't happen */
	8901	Perl_croak(aTHX_
	8902	"panic: reg_scan_name returned NULL");
	8903	if (!RExC_paren_names) {
	8904	RExC_paren_names= newHV();
	8905	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	8906	#ifdef DEBUGGING
	8907	RExC_paren_name_list= newAV();
	8908	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	8909	#endif
	8910	}
	8911	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	8912	if ( he_str )
	8913	sv_dat = HeVAL(he_str);
	8914	if ( ! sv_dat ) {
	8915	/* croak baby croak */
	8916	Perl_croak(aTHX_
	8917	"panic: paren_name hash element allocation failed");
	8918	} else if ( SvPOK(sv_dat) ) {
	8919	/* (?\|...) can mean we have dupes so scan to check
	8920	its already been stored. Maybe a flag indicating
	8921	we are inside such a construct would be useful,
	8922	but the arrays are likely to be quite small, so
	8923	for now we punt -- dmq */
	8924	IV count = SvIV(sv_dat);
	8925	I32 pv = (I32)SvPVX(sv_dat);
	8926	IV i;
	8927	for ( i = 0 ; i < count ; i++ ) {
	8928	if ( pv[i] == RExC_npar ) {
	8929	count = 0;
	8930	break;
	8931	}
	8932	}
	8933	if ( count ) {
	8934	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	8935	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	8936	pv[count] = RExC_npar;
	8937	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	8938	}
	8939	} else {
	8940	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	8941	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	8942	SvIOK_on(sv_dat);
	8943	SvIV_set(sv_dat, 1);
	8944	}
	8945	#ifdef DEBUGGING
	8946	/* Yes this does cause a memory leak in debugging Perls */
	8947	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	8948	SvREFCNT_dec_NN(svname);
	8949	#endif
	8950
	8951	/sv_dump(sv_dat);/
	8952	}
	8953	nextchar(pRExC_state);
	8954	paren = 1;
	8955	goto capturing_parens;
	8956	}
	8957	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	8958	RExC_in_lookbehind++;
	8959	RExC_parse++;
	8960	case '=': /* (?=...) */
	8961	RExC_seen_zerolen++;
	8962	break;
	8963	case '!': /* (?!...) */
	8964	RExC_seen_zerolen++;
	8965	if (*RExC_parse == ')') {
	8966	ret=reg_node(pRExC_state, OPFAIL);
	8967	nextchar(pRExC_state);
	8968	return ret;
	8969	}
	8970	break;
	8971	case '\|': /* (?\|...) */
	8972	/* branch reset, behave like a (?:...) except that
	8973	buffers in alternations share the same numbers */
	8974	paren = ':';
	8975	after_freeze = freeze_paren = RExC_npar;
	8976	break;
	8977	case ':': /* (?:...) */
	8978	case '>': /* (?>...) */
	8979	break;
	8980	case '$': /* (?$...) */
	8981	case '@': /* (?@...) */
	8982	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	8983	break;
	8984	case '#': /* (?#...) */
	8985	/* XXX As soon as we disallow separating the '?' and '*' (by
	8986	* spaces or (?#...) comment), it is believed that this case
	8987	* will be unreachable and can be removed. See
	8988	* [perl #117327] */
	8989	while (RExC_parse && RExC_parse != ')')
	8990	RExC_parse++;
	8991	if (*RExC_parse != ')')
	8992	FAIL("Sequence (?#... not terminated");
	8993	nextchar(pRExC_state);
	8994	*flagp = TRYAGAIN;
	8995	return NULL;
	8996	case '0' : /* (?0) */
	8997	case 'R' : /* (?R) */
	8998	if (*RExC_parse != ')')
	8999	FAIL("Sequence (?R) not terminated");
	9000	ret = reg_node(pRExC_state, GOSTART);
	9001	*flagp \|= POSTPONED;
	9002	nextchar(pRExC_state);
	9003	return ret;
	9004	/notreached/
	9005	{ /* named and numeric backreferences */
	9006	I32 num;
	9007	case '&': /* (?&NAME) */
	9008	parse_start = RExC_parse - 1;
	9009	named_recursion:
	9010	{
	9011	SV *sv_dat = reg_scan_name(pRExC_state,
	9012	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9013	num = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	9014	}
	9015	goto gen_recurse_regop;
	9016	assert(0); /* NOT REACHED */
	9017	case '+':
	9018	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	9019	RExC_parse++;
	9020	vFAIL("Illegal pattern");
	9021	}
	9022	goto parse_recursion;
	9023	/* NOT REACHED*/
	9024	case '-': /* (?-1) */
	9025	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	9026	RExC_parse--; /* rewind to let it be handled later */
	9027	goto parse_flags;
	9028	}
	9029	/FALLTHROUGH /
	9030	case '1': case '2': case '3': case '4': /* (?1) */
	9031	case '5': case '6': case '7': case '8': case '9':
	9032	RExC_parse--;
	9033	parse_recursion:
	9034	num = atoi(RExC_parse);
	9035	parse_start = RExC_parse - 1; /* MJD */
	9036	if (*RExC_parse == '-')
	9037	RExC_parse++;
	9038	while (isDIGIT(*RExC_parse))
	9039	RExC_parse++;
	9040	if (*RExC_parse!=')')
	9041	vFAIL("Expecting close bracket");
	9042
	9043	gen_recurse_regop:
	9044	if ( paren == '-' ) {
	9045	/*
	9046	Diagram of capture buffer numbering.
	9047	Top line is the normal capture buffer numbers
	9048	Bottom line is the negative indexing as from
	9049	the X (the (?-2))
	9050
	9051	+ 1 2 3 4 5 X 6 7
	9052	/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	9053	- 5 4 3 2 1 X x x
	9054
	9055	*/
	9056	num = RExC_npar + num;
	9057	if (num < 1) {
	9058	RExC_parse++;
	9059	vFAIL("Reference to nonexistent group");
	9060	}
	9061	} else if ( paren == '+' ) {
	9062	num = RExC_npar + num - 1;
	9063	}
	9064
	9065	ret = reganode(pRExC_state, GOSUB, num);
	9066	if (!SIZE_ONLY) {
	9067	if (num > (I32)RExC_rx->nparens) {
	9068	RExC_parse++;
	9069	vFAIL("Reference to nonexistent group");
	9070	}
	9071	ARG2L_SET( ret, RExC_recurse_count++);
	9072	RExC_emit++;
	9073	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9074	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	9075	} else {
	9076	RExC_size++;
	9077	}
	9078	RExC_seen \|= REG_SEEN_RECURSE;
	9079	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	9080	Set_Node_Offset(ret, parse_start); /* MJD */
	9081
	9082	*flagp \|= POSTPONED;
	9083	nextchar(pRExC_state);
	9084	return ret;
	9085	} /* named and numeric backreferences */
	9086	assert(0); /* NOT REACHED */
	9087
	9088	case '?': /* (??...) */
	9089	is_logical = 1;
	9090	if (*RExC_parse != '{') {
	9091	RExC_parse++;
	9092	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	9093	/NOTREACHED/
	9094	}
	9095	*flagp \|= POSTPONED;
	9096	paren = *RExC_parse++;
	9097	/* FALL THROUGH */
	9098	case '{': /* (?{...}) */
	9099	{
	9100	U32 n = 0;
	9101	struct reg_code_block *cb;
	9102
	9103	RExC_seen_zerolen++;
	9104
	9105	if ( !pRExC_state->num_code_blocks
	9106	\|\| pRExC_state->code_index >= pRExC_state->num_code_blocks
	9107	\|\| pRExC_state->code_blocks[pRExC_state->code_index].start
	9108	!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
	9109	- RExC_start)
	9110	) {
	9111	if (RExC_pm_flags & PMf_USE_RE_EVAL)
	9112	FAIL("panic: Sequence (?{...}): no code block found\n");
	9113	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	9114	}
	9115	/* this is a pre-compiled code block (?{...}) */
	9116	cb = &pRExC_state->code_blocks[pRExC_state->code_index];
	9117	RExC_parse = RExC_start + cb->end;
	9118	if (!SIZE_ONLY) {
	9119	OP *o = cb->block;
	9120	if (cb->src_regex) {
	9121	n = add_data(pRExC_state, 2, "rl");
	9122	RExC_rxi->data->data[n] =
	9123	(void)SvREFCNT_inc((SV)cb->src_regex);
	9124	RExC_rxi->data->data[n+1] = (void*)o;
	9125	}
	9126	else {
	9127	n = add_data(pRExC_state, 1,
	9128	(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
	9129	RExC_rxi->data->data[n] = (void*)o;
	9130	}
	9131	}
	9132	pRExC_state->code_index++;
	9133	nextchar(pRExC_state);
	9134
	9135	if (is_logical) {
	9136	regnode *eval;
	9137	ret = reg_node(pRExC_state, LOGICAL);
	9138	eval = reganode(pRExC_state, EVAL, n);
	9139	if (!SIZE_ONLY) {
	9140	ret->flags = 2;
	9141	/* for later propagation into (??{}) return value */
	9142	eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
	9143	}
	9144	REGTAIL(pRExC_state, ret, eval);
	9145	/* deal with the length of this later - MJD */
	9146	return ret;
	9147	}
	9148	ret = reganode(pRExC_state, EVAL, n);
	9149	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	9150	Set_Node_Offset(ret, parse_start);
	9151	return ret;
	9152	}
	9153	case '(': /* (?(?{...})...) and (?(?=...)...) */
	9154	{
	9155	int is_define= 0;
	9156	if (RExC_parse[0] == '?') { /* (?(?...)) */
	9157	if (RExC_parse[1] == '=' \|\| RExC_parse[1] == '!'
	9158	\|\| RExC_parse[1] == '<'
	9159	\|\| RExC_parse[1] == '{') { /* Lookahead or eval. */
	9160	I32 flag;
	9161	regnode *tail;
	9162
	9163	ret = reg_node(pRExC_state, LOGICAL);
	9164	if (!SIZE_ONLY)
	9165	ret->flags = 1;
	9166
	9167	tail = reg(pRExC_state, 1, &flag, depth+1);
	9168	if (flag & RESTART_UTF8) {
	9169	*flagp = RESTART_UTF8;
	9170	return NULL;
	9171	}
	9172	REGTAIL(pRExC_state, ret, tail);
	9173	goto insert_if;
	9174	}
	9175	}
	9176	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	9177	\|\| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	9178	{
	9179	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	9180	char *name_start= RExC_parse++;
	9181	U32 num = 0;
	9182	SV *sv_dat=reg_scan_name(pRExC_state,
	9183	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9184	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	9185	vFAIL2("Sequence (?(%c... not terminated",
	9186	(ch == '>' ? '<' : ch));
	9187	RExC_parse++;
	9188	if (!SIZE_ONLY) {
	9189	num = add_data( pRExC_state, 1, "S" );
	9190	RExC_rxi->data->data[num]=(void*)sv_dat;
	9191	SvREFCNT_inc_simple_void(sv_dat);
	9192	}
	9193	ret = reganode(pRExC_state,NGROUPP,num);
	9194	goto insert_if_check_paren;
	9195	}
	9196	else if (RExC_parse[0] == 'D' &&
	9197	RExC_parse[1] == 'E' &&
	9198	RExC_parse[2] == 'F' &&
	9199	RExC_parse[3] == 'I' &&
	9200	RExC_parse[4] == 'N' &&
	9201	RExC_parse[5] == 'E')
	9202	{
	9203	ret = reganode(pRExC_state,DEFINEP,0);
	9204	RExC_parse +=6 ;
	9205	is_define = 1;
	9206	goto insert_if_check_paren;
	9207	}
	9208	else if (RExC_parse[0] == 'R') {
	9209	RExC_parse++;
	9210	parno = 0;
	9211	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	9212	parno = atoi(RExC_parse++);
	9213	while (isDIGIT(*RExC_parse))
	9214	RExC_parse++;
	9215	} else if (RExC_parse[0] == '&') {
	9216	SV *sv_dat;
	9217	RExC_parse++;
	9218	sv_dat = reg_scan_name(pRExC_state,
	9219	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	9220	parno = sv_dat ? ((I32 )SvPVX(sv_dat)) : 0;
	9221	}
	9222	ret = reganode(pRExC_state,INSUBP,parno);
	9223	goto insert_if_check_paren;
	9224	}
	9225	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	9226	/* (?(1)...) */
	9227	char c;
	9228	parno = atoi(RExC_parse++);
	9229
	9230	while (isDIGIT(*RExC_parse))
	9231	RExC_parse++;
	9232	ret = reganode(pRExC_state, GROUPP, parno);
	9233
	9234	insert_if_check_paren:
	9235	if ((c = *nextchar(pRExC_state)) != ')')
	9236	vFAIL("Switch condition not recognized");
	9237	insert_if:
	9238	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	9239	br = regbranch(pRExC_state, &flags, 1,depth+1);
	9240	if (br == NULL) {
	9241	if (flags & RESTART_UTF8) {
	9242	*flagp = RESTART_UTF8;
	9243	return NULL;
	9244	}
	9245	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
	9246	(UV) flags);
	9247	} else
	9248	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	9249	c = *nextchar(pRExC_state);
	9250	if (flags&HASWIDTH)
	9251	*flagp \|= HASWIDTH;
	9252	if (c == '\|') {
	9253	if (is_define)
	9254	vFAIL("(?(DEFINE)....) does not allow branches");
	9255	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	9256	if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
	9257	if (flags & RESTART_UTF8) {
	9258	*flagp = RESTART_UTF8;
	9259	return NULL;
	9260	}
	9261	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
	9262	(UV) flags);
	9263	}
	9264	REGTAIL(pRExC_state, ret, lastbr);
	9265	if (flags&HASWIDTH)
	9266	*flagp \|= HASWIDTH;
	9267	c = *nextchar(pRExC_state);
	9268	}
	9269	else
	9270	lastbr = NULL;
	9271	if (c != ')')
	9272	vFAIL("Switch (?(condition)... contains too many branches");
	9273	ender = reg_node(pRExC_state, TAIL);
	9274	REGTAIL(pRExC_state, br, ender);
	9275	if (lastbr) {
	9276	REGTAIL(pRExC_state, lastbr, ender);
	9277	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	9278	}
	9279	else
	9280	REGTAIL(pRExC_state, ret, ender);
	9281	RExC_size++; /* XXX WHY do we need this?!!
	9282	For large programs it seems to be required
	9283	but I can't figure out why. -- dmq*/
	9284	return ret;
	9285	}
	9286	else {
	9287	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	9288	}
	9289	}
	9290	case '[': /* (?[ ... ]) */
	9291	return handle_regex_sets(pRExC_state, NULL, flagp, depth,
	9292	oregcomp_parse);
	9293	case 0:
	9294	RExC_parse--; /* for vFAIL to print correctly */
	9295	vFAIL("Sequence (? incomplete");
	9296	break;
	9297	default: /* e.g., (?i) */
	9298	--RExC_parse;
	9299	parse_flags:
	9300	parse_lparen_question_flags(pRExC_state);
	9301	if (UCHARAT(RExC_parse) != ':') {
	9302	nextchar(pRExC_state);
	9303	*flagp = TRYAGAIN;
	9304	return NULL;
	9305	}
	9306	paren = ':';
	9307	nextchar(pRExC_state);
	9308	ret = NULL;
	9309	goto parse_rest;
	9310	} /* end switch */
	9311	}
	9312	else { /* (...) */
	9313	capturing_parens:
	9314	parno = RExC_npar;
	9315	RExC_npar++;
	9316
	9317	ret = reganode(pRExC_state, OPEN, parno);
	9318	if (!SIZE_ONLY ){
	9319	if (!RExC_nestroot)
	9320	RExC_nestroot = parno;
	9321	if (RExC_seen & REG_SEEN_RECURSE
	9322	&& !RExC_open_parens[parno-1])
	9323	{
	9324	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9325	"Setting open paren #%"IVdf" to %d\n",
	9326	(IV)parno, REG_NODE_NUM(ret)));
	9327	RExC_open_parens[parno-1]= ret;
	9328	}
	9329	}
	9330	Set_Node_Length(ret, 1); /* MJD */
	9331	Set_Node_Offset(ret, RExC_parse); /* MJD */
	9332	is_open = 1;
	9333	}
	9334	}
	9335	else /* ! paren */
	9336	ret = NULL;
	9337
	9338	parse_rest:
	9339	/* Pick up the branches, linking them together. */
	9340	parse_start = RExC_parse; /* MJD */
	9341	br = regbranch(pRExC_state, &flags, 1,depth+1);
	9342
	9343	/* branch_len = (paren != 0); */
	9344
	9345	if (br == NULL) {
	9346	if (flags & RESTART_UTF8) {
	9347	*flagp = RESTART_UTF8;
	9348	return NULL;
	9349	}
	9350	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
	9351	}
	9352	if (*RExC_parse == '\|') {
	9353	if (!SIZE_ONLY && RExC_extralen) {
	9354	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	9355	}
	9356	else { /* MJD */
	9357	reginsert(pRExC_state, BRANCH, br, depth+1);
	9358	Set_Node_Length(br, paren != 0);
	9359	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	9360	}
	9361	have_branch = 1;
	9362	if (SIZE_ONLY)
	9363	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	9364	}
	9365	else if (paren == ':') {
	9366	*flagp \|= flags&SIMPLE;
	9367	}
	9368	if (is_open) { /* Starts with OPEN. */
	9369	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	9370	}
	9371	else if (paren != '?') /* Not Conditional */
	9372	ret = br;
	9373	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	9374	lastbr = br;
	9375	while (*RExC_parse == '\|') {
	9376	if (!SIZE_ONLY && RExC_extralen) {
	9377	ender = reganode(pRExC_state, LONGJMP,0);
	9378	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	9379	}
	9380	if (SIZE_ONLY)
	9381	RExC_extralen += 2; /* Account for LONGJMP. */
	9382	nextchar(pRExC_state);
	9383	if (freeze_paren) {
	9384	if (RExC_npar > after_freeze)
	9385	after_freeze = RExC_npar;
	9386	RExC_npar = freeze_paren;
	9387	}
	9388	br = regbranch(pRExC_state, &flags, 0, depth+1);
	9389
	9390	if (br == NULL) {
	9391	if (flags & RESTART_UTF8) {
	9392	*flagp = RESTART_UTF8;
	9393	return NULL;
	9394	}
	9395	FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
	9396	}
	9397	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	9398	lastbr = br;
	9399	*flagp \|= flags & (SPSTART \| HASWIDTH \| POSTPONED);
	9400	}
	9401
	9402	if (have_branch \|\| paren != ':') {
	9403	/* Make a closing node, and hook it on the end. */
	9404	switch (paren) {
	9405	case ':':
	9406	ender = reg_node(pRExC_state, TAIL);
	9407	break;
	9408	case 1: case 2:
	9409	ender = reganode(pRExC_state, CLOSE, parno);
	9410	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	9411	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9412	"Setting close paren #%"IVdf" to %d\n",
	9413	(IV)parno, REG_NODE_NUM(ender)));
	9414	RExC_close_parens[parno-1]= ender;
	9415	if (RExC_nestroot == parno)
	9416	RExC_nestroot = 0;
	9417	}
	9418	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	9419	Set_Node_Length(ender,1); /* MJD */
	9420	break;
	9421	case '<':
	9422	case ',':
	9423	case '=':
	9424	case '!':
	9425	*flagp &= ~HASWIDTH;
	9426	/* FALL THROUGH */
	9427	case '>':
	9428	ender = reg_node(pRExC_state, SUCCEED);
	9429	break;
	9430	case 0:
	9431	ender = reg_node(pRExC_state, END);
	9432	if (!SIZE_ONLY) {
	9433	assert(!RExC_opend); /* there can only be one! */
	9434	RExC_opend = ender;
	9435	}
	9436	break;
	9437	}
	9438	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9439	SV * const mysv_val1=sv_newmortal();
	9440	SV * const mysv_val2=sv_newmortal();
	9441	DEBUG_PARSE_MSG("lsbr");
	9442	regprop(RExC_rx, mysv_val1, lastbr);
	9443	regprop(RExC_rx, mysv_val2, ender);
	9444	PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9445	SvPV_nolen_const(mysv_val1),
	9446	(IV)REG_NODE_NUM(lastbr),
	9447	SvPV_nolen_const(mysv_val2),
	9448	(IV)REG_NODE_NUM(ender),
	9449	(IV)(ender - lastbr)
	9450	);
	9451	});
	9452	REGTAIL(pRExC_state, lastbr, ender);
	9453
	9454	if (have_branch && !SIZE_ONLY) {
	9455	char is_nothing= 1;
	9456	if (depth==1)
	9457	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	9458
	9459	/* Hook the tails of the branches to the closing node. */
	9460	for (br = ret; br; br = regnext(br)) {
	9461	const U8 op = PL_regkind[OP(br)];
	9462	if (op == BRANCH) {
	9463	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	9464	if (OP(NEXTOPER(br)) != NOTHING \|\| regnext(NEXTOPER(br)) != ender)
	9465	is_nothing= 0;
	9466	}
	9467	else if (op == BRANCHJ) {
	9468	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	9469	/* for now we always disable this optimisation * /
	9470	if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING \|\| regnext(NEXTOPER(NEXTOPER(br))) != ender)
	9471	*/
	9472	is_nothing= 0;
	9473	}
	9474	}
	9475	if (is_nothing) {
	9476	br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
	9477	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9478	SV * const mysv_val1=sv_newmortal();
	9479	SV * const mysv_val2=sv_newmortal();
	9480	DEBUG_PARSE_MSG("NADA");
	9481	regprop(RExC_rx, mysv_val1, ret);
	9482	regprop(RExC_rx, mysv_val2, ender);
	9483	PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9484	SvPV_nolen_const(mysv_val1),
	9485	(IV)REG_NODE_NUM(ret),
	9486	SvPV_nolen_const(mysv_val2),
	9487	(IV)REG_NODE_NUM(ender),
	9488	(IV)(ender - ret)
	9489	);
	9490	});
	9491	OP(br)= NOTHING;
	9492	if (OP(ender) == TAIL) {
	9493	NEXT_OFF(br)= 0;
	9494	RExC_emit= br + 1;
	9495	} else {
	9496	regnode *opt;
	9497	for ( opt= br + 1; opt < ender ; opt++ )
	9498	OP(opt)= OPTIMIZED;
	9499	NEXT_OFF(br)= ender - br;
	9500	}
	9501	}
	9502	}
	9503	}
	9504
	9505	{
	9506	const char *p;
	9507	static const char parens[] = "=!<,>";
	9508
	9509	if (paren && (p = strchr(parens, paren))) {
	9510	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	9511	int flag = (p - parens) > 1;
	9512
	9513	if (paren == '>')
	9514	node = SUSPEND, flag = 0;
	9515	reginsert(pRExC_state, node,ret, depth+1);
	9516	Set_Node_Cur_Length(ret, parse_start);
	9517	Set_Node_Offset(ret, parse_start + 1);
	9518	ret->flags = flag;
	9519	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	9520	}
	9521	}
	9522
	9523	/* Check for proper termination. */
	9524	if (paren) {
	9525	/* restore original flags, but keep (?p) */
	9526	RExC_flags = oregflags \| (RExC_flags & RXf_PMf_KEEPCOPY);
	9527	if (RExC_parse >= RExC_end \|\| *nextchar(pRExC_state) != ')') {
	9528	RExC_parse = oregcomp_parse;
	9529	vFAIL("Unmatched (");
	9530	}
	9531	}
	9532	else if (!paren && RExC_parse < RExC_end) {
	9533	if (*RExC_parse == ')') {
	9534	RExC_parse++;
	9535	vFAIL("Unmatched )");
	9536	}
	9537	else
	9538	FAIL("Junk on end of regexp"); /* "Can't happen". */
	9539	assert(0); /* NOTREACHED */
	9540	}
	9541
	9542	if (RExC_in_lookbehind) {
	9543	RExC_in_lookbehind--;
	9544	}
	9545	if (after_freeze > RExC_npar)
	9546	RExC_npar = after_freeze;
	9547	return(ret);
	9548	}
	9549
	9550	/*
	9551	- regbranch - one alternative of an \| operator
	9552	*
	9553	* Implements the concatenation operator.
	9554	*
	9555	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	9556	* restarted.
	9557	*/
	9558	STATIC regnode *
	9559	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	9560	{
	9561	dVAR;
	9562	regnode *ret;
	9563	regnode *chain = NULL;
	9564	regnode *latest;
	9565	I32 flags = 0, c = 0;
	9566	GET_RE_DEBUG_FLAGS_DECL;
	9567
	9568	PERL_ARGS_ASSERT_REGBRANCH;
	9569
	9570	DEBUG_PARSE("brnc");
	9571
	9572	if (first)
	9573	ret = NULL;
	9574	else {
	9575	if (!SIZE_ONLY && RExC_extralen)
	9576	ret = reganode(pRExC_state, BRANCHJ,0);
	9577	else {
	9578	ret = reg_node(pRExC_state, BRANCH);
	9579	Set_Node_Length(ret, 1);
	9580	}
	9581	}
	9582
	9583	if (!first && SIZE_ONLY)
	9584	RExC_extralen += 1; /* BRANCHJ */
	9585
	9586	flagp = WORST; / Tentatively. */
	9587
	9588	RExC_parse--;
	9589	nextchar(pRExC_state);
	9590	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	9591	flags &= ~TRYAGAIN;
	9592	latest = regpiece(pRExC_state, &flags,depth+1);
	9593	if (latest == NULL) {
	9594	if (flags & TRYAGAIN)
	9595	continue;
	9596	if (flags & RESTART_UTF8) {
	9597	*flagp = RESTART_UTF8;
	9598	return NULL;
	9599	}
	9600	FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
	9601	}
	9602	else if (ret == NULL)
	9603	ret = latest;
	9604	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	9605	if (chain == NULL) /* First piece. */
	9606	*flagp \|= flags&SPSTART;
	9607	else {
	9608	RExC_naughty++;
	9609	REGTAIL(pRExC_state, chain, latest);
	9610	}
	9611	chain = latest;
	9612	c++;
	9613	}
	9614	if (chain == NULL) { /* Loop ran zero times. */
	9615	chain = reg_node(pRExC_state, NOTHING);
	9616	if (ret == NULL)
	9617	ret = chain;
	9618	}
	9619	if (c == 1) {
	9620	*flagp \|= flags&SIMPLE;
	9621	}
	9622
	9623	return ret;
	9624	}
	9625
	9626	/*
	9627	- regpiece - something followed by possible [*+?]
	9628	*
	9629	* Note that the branching code sequences used for ? and the general cases
	9630	* of * and + are somewhat optimized: they use the same NOTHING node as
	9631	* both the endmarker for their branch list and the body of the last branch.
	9632	* It might seem that this node could be dispensed with entirely, but the
	9633	* endmarker role is not redundant.
	9634	*
	9635	* Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
	9636	* TRYAGAIN.
	9637	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	9638	* restarted.
	9639	*/
	9640	STATIC regnode *
	9641	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	9642	{
	9643	dVAR;
	9644	regnode *ret;
	9645	char op;
	9646	char *next;
	9647	I32 flags;
	9648	const char * const origparse = RExC_parse;
	9649	I32 min;
	9650	I32 max = REG_INFTY;
	9651	#ifdef RE_TRACK_PATTERN_OFFSETS
	9652	char *parse_start;
	9653	#endif
	9654	const char *maxpos = NULL;
	9655
	9656	/* Save the original in case we change the emitted regop to a FAIL. */
	9657	regnode * const orig_emit = RExC_emit;
	9658
	9659	GET_RE_DEBUG_FLAGS_DECL;
	9660
	9661	PERL_ARGS_ASSERT_REGPIECE;
	9662
	9663	DEBUG_PARSE("piec");
	9664
	9665	ret = regatom(pRExC_state, &flags,depth+1);
	9666	if (ret == NULL) {
	9667	if (flags & (TRYAGAIN\|RESTART_UTF8))
	9668	*flagp \|= flags & (TRYAGAIN\|RESTART_UTF8);
	9669	else
	9670	FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
	9671	return(NULL);
	9672	}
	9673
	9674	op = *RExC_parse;
	9675
	9676	if (op == '{' && regcurly(RExC_parse, FALSE)) {
	9677	maxpos = NULL;
	9678	#ifdef RE_TRACK_PATTERN_OFFSETS
	9679	parse_start = RExC_parse; /* MJD */
	9680	#endif
	9681	next = RExC_parse + 1;
	9682	while (isDIGIT(next) \|\| next == ',') {
	9683	if (*next == ',') {
	9684	if (maxpos)
	9685	break;
	9686	else
	9687	maxpos = next;
	9688	}
	9689	next++;
	9690	}
	9691	if (next == '}') { / got one */
	9692	if (!maxpos)
	9693	maxpos = next;
	9694	RExC_parse++;
	9695	min = atoi(RExC_parse);
	9696	if (*maxpos == ',')
	9697	maxpos++;
	9698	else
	9699	maxpos = RExC_parse;
	9700	max = atoi(maxpos);
	9701	if (!max && *maxpos != '0')
	9702	max = REG_INFTY; /* meaning "infinity" */
	9703	else if (max >= REG_INFTY)
	9704	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	9705	RExC_parse = next;
	9706	nextchar(pRExC_state);
	9707	if (max < min) { /* If can't match, warn and optimize to fail
	9708	unconditionally */
	9709	if (SIZE_ONLY) {
	9710	ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
	9711
	9712	/* We can't back off the size because we have to reserve
	9713	* enough space for all the things we are about to throw
	9714	* away, but we can shrink it by the ammount we are about
	9715	* to re-use here */
	9716	RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
	9717	}
	9718	else {
	9719	RExC_emit = orig_emit;
	9720	}
	9721	ret = reg_node(pRExC_state, OPFAIL);
	9722	return ret;
	9723	}
	9724
	9725	do_curly:
	9726	if ((flags&SIMPLE)) {
	9727	RExC_naughty += 2 + RExC_naughty / 2;
	9728	reginsert(pRExC_state, CURLY, ret, depth+1);
	9729	Set_Node_Offset(ret, parse_start+1); /* MJD */
	9730	Set_Node_Cur_Length(ret, parse_start);
	9731	}
	9732	else {
	9733	regnode * const w = reg_node(pRExC_state, WHILEM);
	9734
	9735	w->flags = 0;
	9736	REGTAIL(pRExC_state, ret, w);
	9737	if (!SIZE_ONLY && RExC_extralen) {
	9738	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	9739	reginsert(pRExC_state, NOTHING,ret, depth+1);
	9740	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	9741	}
	9742	reginsert(pRExC_state, CURLYX,ret, depth+1);
	9743	/* MJD hk */
	9744	Set_Node_Offset(ret, parse_start+1);
	9745	Set_Node_Length(ret,
	9746	op == '{' ? (RExC_parse - parse_start) : 1);
	9747
	9748	if (!SIZE_ONLY && RExC_extralen)
	9749	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	9750	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	9751	if (SIZE_ONLY)
	9752	RExC_whilem_seen++, RExC_extralen += 3;
	9753	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	9754	}
	9755	ret->flags = 0;
	9756
	9757	if (min > 0)
	9758	*flagp = WORST;
	9759	if (max > 0)
	9760	*flagp \|= HASWIDTH;
	9761	if (!SIZE_ONLY) {
	9762	ARG1_SET(ret, (U16)min);
	9763	ARG2_SET(ret, (U16)max);
	9764	}
	9765
	9766	goto nest_check;
	9767	}
	9768	}
	9769
	9770	if (!ISMULT1(op)) {
	9771	*flagp = flags;
	9772	return(ret);
	9773	}
	9774
	9775	#if 0 /* Now runtime fix should be reliable. */
	9776
	9777	/* if this is reinstated, don't forget to put this back into perldiag:
	9778
	9779	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	9780
	9781	(F) The part of the regexp subject to either the * or + quantifier
	9782	could match an empty string. The {#} shows in the regular
	9783	expression about where the problem was discovered.
	9784
	9785	*/
	9786
	9787	if (!(flags&HASWIDTH) && op != '?')
	9788	vFAIL("Regexp *+ operand could be empty");
	9789	#endif
	9790
	9791	#ifdef RE_TRACK_PATTERN_OFFSETS
	9792	parse_start = RExC_parse;
	9793	#endif
	9794	nextchar(pRExC_state);
	9795
	9796	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	9797
	9798	if (op == '*' && (flags&SIMPLE)) {
	9799	reginsert(pRExC_state, STAR, ret, depth+1);
	9800	ret->flags = 0;
	9801	RExC_naughty += 4;
	9802	}
	9803	else if (op == '*') {
	9804	min = 0;
	9805	goto do_curly;
	9806	}
	9807	else if (op == '+' && (flags&SIMPLE)) {
	9808	reginsert(pRExC_state, PLUS, ret, depth+1);
	9809	ret->flags = 0;
	9810	RExC_naughty += 3;
	9811	}
	9812	else if (op == '+') {
	9813	min = 1;
	9814	goto do_curly;
	9815	}
	9816	else if (op == '?') {
	9817	min = 0; max = 1;
	9818	goto do_curly;
	9819	}
	9820	nest_check:
	9821	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	9822	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
	9823	ckWARN3reg(RExC_parse,
	9824	"%.*s matches null string many times",
	9825	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	9826	origparse);
	9827	(void)ReREFCNT_inc(RExC_rx_sv);
	9828	}
	9829
	9830	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	9831	nextchar(pRExC_state);
	9832	reginsert(pRExC_state, MINMOD, ret, depth+1);
	9833	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	9834	}
	9835	else
	9836	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	9837	regnode *ender;
	9838	nextchar(pRExC_state);
	9839	ender = reg_node(pRExC_state, SUCCEED);
	9840	REGTAIL(pRExC_state, ret, ender);
	9841	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	9842	ret->flags = 0;
	9843	ender = reg_node(pRExC_state, TAIL);
	9844	REGTAIL(pRExC_state, ret, ender);
	9845	}
	9846
	9847	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	9848	RExC_parse++;
	9849	vFAIL("Nested quantifiers");
	9850	}
	9851
	9852	return(ret);
	9853	}
	9854
	9855	STATIC bool
	9856	S_grok_bslash_N(pTHX_ RExC_state_t pRExC_state, regnode* node_p, UV valuep, I32 flagp, U32 depth, bool in_char_class,
	9857	const bool strict /* Apply stricter parsing rules? */
	9858	)
	9859	{
	9860
	9861	/* This is expected to be called by a parser routine that has recognized '\N'
	9862	and needs to handle the rest. RExC_parse is expected to point at the first
	9863	char following the N at the time of the call. On successful return,
	9864	RExC_parse has been updated to point to just after the sequence identified
	9865	by this routine, and <*flagp> has been updated.
	9866
	9867	The \N may be inside (indicated by the boolean <in_char_class>) or outside a
	9868	character class.
	9869
	9870	\N may begin either a named sequence, or if outside a character class, mean
	9871	to match a non-newline. For non single-quoted regexes, the tokenizer has
	9872	attempted to decide which, and in the case of a named sequence, converted it
	9873	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	9874	where c1... are the characters in the sequence. For single-quoted regexes,
	9875	the tokenizer passes the \N sequence through unchanged; this code will not
	9876	attempt to determine this nor expand those, instead raising a syntax error.
	9877	The net effect is that if the beginning of the passed-in pattern isn't '{U+'
	9878	or there is no '}', it signals that this \N occurrence means to match a
	9879	non-newline.
	9880
	9881	Only the \N{U+...} form should occur in a character class, for the same
	9882	reason that '.' inside a character class means to just match a period: it
	9883	just doesn't make sense.
	9884
	9885	The function raises an error (via vFAIL), and doesn't return for various
	9886	syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
	9887	success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
	9888	RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
	9889	only possible if node_p is non-NULL.
	9890
	9891
	9892	If <valuep> is non-null, it means the caller can accept an input sequence
	9893	consisting of a just a single code point; <*valuep> is set to that value
	9894	if the input is such.
	9895
	9896	If <node_p> is non-null it signifies that the caller can accept any other
	9897	legal sequence (i.e., one that isn't just a single code point). <*node_p>
	9898	is set as follows:
	9899	1) \N means not-a-NL: points to a newly created REG_ANY node;
	9900	2) \N{}: points to a new NOTHING node;
	9901	3) otherwise: points to a new EXACT node containing the resolved
	9902	string.
	9903	Note that FALSE is returned for single code point sequences if <valuep> is
	9904	null.
	9905	*/
	9906
	9907	char * endbrace; /* '}' following the name */
	9908	char* p;
	9909	char endchar; / Points to '.' or '}' ending cur char in the input
	9910	stream */
	9911	bool has_multiple_chars; /* true if the input stream contains a sequence of
	9912	more than one character */
	9913
	9914	GET_RE_DEBUG_FLAGS_DECL;
	9915
	9916	PERL_ARGS_ASSERT_GROK_BSLASH_N;
	9917
	9918	GET_RE_DEBUG_FLAGS;
	9919
	9920	assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
	9921
	9922	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	9923	* modifier. The other meaning does not */
	9924	p = (RExC_flags & RXf_PMf_EXTENDED)
	9925	? regwhite( pRExC_state, RExC_parse )
	9926	: RExC_parse;
	9927
	9928	/* Disambiguate between \N meaning a named character versus \N meaning
	9929	* [^\n]. The former is assumed when it can't be the latter. */
	9930	if (*p != '{' \|\| regcurly(p, FALSE)) {
	9931	RExC_parse = p;
	9932	if (! node_p) {
	9933	/* no bare \N in a charclass */
	9934	if (in_char_class) {
	9935	vFAIL("\\N in a character class must be a named character: \\N{...}");
	9936	}
	9937	return FALSE;
	9938	}
	9939	nextchar(pRExC_state);
	9940	*node_p = reg_node(pRExC_state, REG_ANY);
	9941	*flagp \|= HASWIDTH\|SIMPLE;
	9942	RExC_naughty++;
	9943	RExC_parse--;
	9944	Set_Node_Length(node_p, 1); / MJD */
	9945	return TRUE;
	9946	}
	9947
	9948	/* Here, we have decided it should be a named character or sequence */
	9949
	9950	/* The test above made sure that the next real character is a '{', but
	9951	* under the /x modifier, it could be separated by space (or a comment and
	9952	* \n) and this is not allowed (for consistency with \x{...} and the
	9953	* tokenizer handling of \N{NAME}). */
	9954	if (*RExC_parse != '{') {
	9955	vFAIL("Missing braces on \\N{}");
	9956	}
	9957
	9958	RExC_parse++; /* Skip past the '{' */
	9959
	9960	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	9961	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	9962	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	9963	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	9964	{
	9965	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	9966	vFAIL("\\N{NAME} must be resolved by the lexer");
	9967	}
	9968
	9969	if (endbrace == RExC_parse) { /* empty: \N{} */
	9970	bool ret = TRUE;
	9971	if (node_p) {
	9972	*node_p = reg_node(pRExC_state,NOTHING);
	9973	}
	9974	else if (in_char_class) {
	9975	if (SIZE_ONLY && in_char_class) {
	9976	if (strict) {
	9977	RExC_parse++; /* Position after the "}" */
	9978	vFAIL("Zero length \\N{}");
	9979	}
	9980	else {
	9981	ckWARNreg(RExC_parse,
	9982	"Ignoring zero length \\N{} in character class");
	9983	}
	9984	}
	9985	ret = FALSE;
	9986	}
	9987	else {
	9988	return FALSE;
	9989	}
	9990	nextchar(pRExC_state);
	9991	return ret;
	9992	}
	9993
	9994	RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
	9995	RExC_parse += 2; /* Skip past the 'U+' */
	9996
	9997	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	9998
	9999	/* Code points are separated by dots. If none, there is only one code
	10000	* point, and is terminated by the brace */
	10001	has_multiple_chars = (endchar < endbrace);
	10002
	10003	if (valuep && (! has_multiple_chars \|\| in_char_class)) {
	10004	/* We only pay attention to the first char of
	10005	multichar strings being returned in char classes. I kinda wonder
	10006	if this makes sense as it does change the behaviour
	10007	from earlier versions, OTOH that behaviour was broken
	10008	as well. XXX Solution is to recharacterize as
	10009	[rest-of-class]\|multi1\|multi2... */
	10010
	10011	STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
	10012	I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
	10013	\| PERL_SCAN_DISALLOW_PREFIX
	10014	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	10015
	10016	*valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
	10017
	10018	/* The tokenizer should have guaranteed validity, but it's possible to
	10019	* bypass it by using single quoting, so check */
	10020	if (length_of_hex == 0
	10021	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	10022	{
	10023	RExC_parse += length_of_hex; /* Includes all the valid */
	10024	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	10025	? UTF8SKIP(RExC_parse)
	10026	: 1;
	10027	/* Guard against malformed utf8 */
	10028	if (RExC_parse >= endchar) {
	10029	RExC_parse = endchar;
	10030	}
	10031	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	10032	}
	10033
	10034	if (in_char_class && has_multiple_chars) {
	10035	if (strict) {
	10036	RExC_parse = endbrace;
	10037	vFAIL("\\N{} in character class restricted to one character");
	10038	}
	10039	else {
	10040	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	10041	}
	10042	}
	10043
	10044	RExC_parse = endbrace + 1;
	10045	}
	10046	else if (! node_p \|\| ! has_multiple_chars) {
	10047
	10048	/* Here, the input is legal, but not according to the caller's
	10049	* options. We fail without advancing the parse, so that the
	10050	* caller can try again */
	10051	RExC_parse = p;
	10052	return FALSE;
	10053	}
	10054	else {
	10055
	10056	/* What is done here is to convert this to a sub-pattern of the form
	10057	* (?:\x{char1}\x{char2}...)
	10058	* and then call reg recursively. That way, it retains its atomicness,
	10059	* while not having to worry about special handling that some code
	10060	* points may have. toke.c has converted the original Unicode values
	10061	* to native, so that we can just pass on the hex values unchanged. We
	10062	* do have to set a flag to keep recoding from happening in the
	10063	* recursion */
	10064
	10065	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	10066	STRLEN len;
	10067	char *orig_end = RExC_end;
	10068	I32 flags;
	10069
	10070	while (RExC_parse < endbrace) {
	10071
	10072	/* Convert to notation the rest of the code understands */
	10073	sv_catpv(substitute_parse, "\\x{");
	10074	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	10075	sv_catpv(substitute_parse, "}");
	10076
	10077	/* Point to the beginning of the next character in the sequence. */
	10078	RExC_parse = endchar + 1;
	10079	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	10080	}
	10081	sv_catpv(substitute_parse, ")");
	10082
	10083	RExC_parse = SvPV(substitute_parse, len);
	10084
	10085	/* Don't allow empty number */
	10086	if (len < 8) {
	10087	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	10088	}
	10089	RExC_end = RExC_parse + len;
	10090
	10091	/* The values are Unicode, and therefore not subject to recoding */
	10092	RExC_override_recoding = 1;
	10093
	10094	if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
	10095	if (flags & RESTART_UTF8) {
	10096	*flagp = RESTART_UTF8;
	10097	return FALSE;
	10098	}
	10099	FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
	10100	(UV) flags);
	10101	}
	10102	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	10103
	10104	RExC_parse = endbrace;
	10105	RExC_end = orig_end;
	10106	RExC_override_recoding = 0;
	10107
	10108	nextchar(pRExC_state);
	10109	}
	10110
	10111	return TRUE;
	10112	}
	10113
	10114
	10115	/*
	10116	* reg_recode
	10117	*
	10118	* It returns the code point in utf8 for the value in *encp.
	10119	* value: a code value in the source encoding
	10120	* encp: a pointer to an Encode object
	10121	*
	10122	* If the result from Encode is not a single character,
	10123	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	10124	*/
	10125	STATIC UV
	10126	S_reg_recode(pTHX_ const char value, SV **encp)
	10127	{
	10128	STRLEN numlen = 1;
	10129	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	10130	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	10131	const STRLEN newlen = SvCUR(sv);
	10132	UV uv = UNICODE_REPLACEMENT;
	10133
	10134	PERL_ARGS_ASSERT_REG_RECODE;
	10135
	10136	if (newlen)
	10137	uv = SvUTF8(sv)
	10138	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	10139	: (U8)s;
	10140
	10141	if (!newlen \|\| numlen != newlen) {
	10142	uv = UNICODE_REPLACEMENT;
	10143	*encp = NULL;
	10144	}
	10145	return uv;
	10146	}
	10147
	10148	PERL_STATIC_INLINE U8
	10149	S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
	10150	{
	10151	U8 op;
	10152
	10153	PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
	10154
	10155	if (! FOLD) {
	10156	return EXACT;
	10157	}
	10158
	10159	op = get_regex_charset(RExC_flags);
	10160	if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
	10161	op--; /* /a is same as /u, and map /aa's offset to what /a's would have
	10162	been, so there is no hole */
	10163	}
	10164
	10165	return op + EXACTF;
	10166	}
	10167
	10168	PERL_STATIC_INLINE void
	10169	S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t pRExC_state, regnode node, I32* flagp, STRLEN len, UV code_point)
	10170	{
	10171	/* This knows the details about sizing an EXACTish node, setting flags for
	10172	* it (by setting <*flagp>, and potentially populating it with a single
	10173	* character.
	10174	*
	10175	* If <len> (the length in bytes) is non-zero, this function assumes that
	10176	* the node has already been populated, and just does the sizing. In this
	10177	* case <code_point> should be the final code point that has already been
	10178	* placed into the node. This value will be ignored except that under some
	10179	* circumstances <*flagp> is set based on it.
	10180	*
	10181	* If <len> is zero, the function assumes that the node is to contain only
	10182	* the single character given by <code_point> and calculates what <len>
	10183	* should be. In pass 1, it sizes the node appropriately. In pass 2, it
	10184	* additionally will populate the node's STRING with <code_point>, if <len>
	10185	* is 0. In both cases <*flagp> is appropriately set
	10186	*
	10187	* It knows that under FOLD, the Latin Sharp S and UTF characters above
	10188	* 255, must be folded (the former only when the rules indicate it can
	10189	* match 'ss') */
	10190
	10191	bool len_passed_in = cBOOL(len != 0);
	10192	U8 character[UTF8_MAXBYTES_CASE+1];
	10193
	10194	PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
	10195
	10196	if (! len_passed_in) {
	10197	if (UTF) {
	10198	if (FOLD && (! LOC \|\| code_point > 255)) {
	10199	_to_uni_fold_flags(code_point,
	10200	character,
	10201	&len,
	10202	FOLD_FLAGS_FULL \| ((LOC)
	10203	? FOLD_FLAGS_LOCALE
	10204	: (ASCII_FOLD_RESTRICTED)
	10205	? FOLD_FLAGS_NOMIX_ASCII
	10206	: 0));
	10207	}
	10208	else {
	10209	uvchr_to_utf8( character, code_point);
	10210	len = UTF8SKIP(character);
	10211	}
	10212	}
	10213	else if (! FOLD
	10214	\|\| code_point != LATIN_SMALL_LETTER_SHARP_S
	10215	\|\| ASCII_FOLD_RESTRICTED
	10216	\|\| ! AT_LEAST_UNI_SEMANTICS)
	10217	{
	10218	*character = (U8) code_point;
	10219	len = 1;
	10220	}
	10221	else {
	10222	*character = 's';
	10223	*(character + 1) = 's';
	10224	len = 2;
	10225	}
	10226	}
	10227
	10228	if (SIZE_ONLY) {
	10229	RExC_size += STR_SZ(len);
	10230	}
	10231	else {
	10232	RExC_emit += STR_SZ(len);
	10233	STR_LEN(node) = len;
	10234	if (! len_passed_in) {
	10235	Copy((char *) character, STRING(node), len, char);
	10236	}
	10237	}
	10238
	10239	*flagp \|= HASWIDTH;
	10240
	10241	/* A single character node is SIMPLE, except for the special-cased SHARP S
	10242	* under /di. */
	10243	if ((len == 1 \|\| (UTF && len == UNISKIP(code_point)))
	10244	&& (code_point != LATIN_SMALL_LETTER_SHARP_S
	10245	\|\| ! FOLD \|\| ! DEPENDS_SEMANTICS))
	10246	{
	10247	*flagp \|= SIMPLE;
	10248	}
	10249	}
	10250
	10251	/*
	10252	- regatom - the lowest level
	10253
	10254	Try to identify anything special at the start of the pattern. If there
	10255	is, then handle it as required. This may involve generating a single regop,
	10256	such as for an assertion; or it may involve recursing, such as to
	10257	handle a () structure.
	10258
	10259	If the string doesn't start with something special then we gobble up
	10260	as much literal text as we can.
	10261
	10262	Once we have been able to handle whatever type of thing started the
	10263	sequence, we return.
	10264
	10265	Note: we have to be careful with escapes, as they can be both literal
	10266	and special, and in the case of \10 and friends, context determines which.
	10267
	10268	A summary of the code structure is:
	10269
	10270	switch (first_byte) {
	10271	cases for each special:
	10272	handle this special;
	10273	break;
	10274	case '\\':
	10275	switch (2nd byte) {
	10276	cases for each unambiguous special:
	10277	handle this special;
	10278	break;
	10279	cases for each ambigous special/literal:
	10280	disambiguate;
	10281	if (special) handle here
	10282	else goto defchar;
	10283	default: // unambiguously literal:
	10284	goto defchar;
	10285	}
	10286	default: // is a literal char
	10287	// FALL THROUGH
	10288	defchar:
	10289	create EXACTish node for literal;
	10290	while (more input and node isn't full) {
	10291	switch (input_byte) {
	10292	cases for each special;
	10293	make sure parse pointer is set so that the next call to
	10294	regatom will see this special first
	10295	goto loopdone; // EXACTish node terminated by prev. char
	10296	default:
	10297	append char to EXACTISH node;
	10298	}
	10299	get next input byte;
	10300	}
	10301	loopdone:
	10302	}
	10303	return the generated node;
	10304
	10305	Specifically there are two separate switches for handling
	10306	escape sequences, with the one for handling literal escapes requiring
	10307	a dummy entry for all of the special escapes that are actually handled
	10308	by the other.
	10309
	10310	Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
	10311	TRYAGAIN.
	10312	Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
	10313	restarted.
	10314	Otherwise does not return NULL.
	10315	*/
	10316
	10317	STATIC regnode *
	10318	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	10319	{
	10320	dVAR;
	10321	regnode *ret = NULL;
	10322	I32 flags = 0;
	10323	char *parse_start = RExC_parse;
	10324	U8 op;
	10325	int invert = 0;
	10326
	10327	GET_RE_DEBUG_FLAGS_DECL;
	10328
	10329	flagp = WORST; / Tentatively. */
	10330
	10331	DEBUG_PARSE("atom");
	10332
	10333	PERL_ARGS_ASSERT_REGATOM;
	10334
	10335	tryagain:
	10336	switch ((U8)*RExC_parse) {
	10337	case '^':
	10338	RExC_seen_zerolen++;
	10339	nextchar(pRExC_state);
	10340	if (RExC_flags & RXf_PMf_MULTILINE)
	10341	ret = reg_node(pRExC_state, MBOL);
	10342	else if (RExC_flags & RXf_PMf_SINGLELINE)
	10343	ret = reg_node(pRExC_state, SBOL);
	10344	else
	10345	ret = reg_node(pRExC_state, BOL);
	10346	Set_Node_Length(ret, 1); /* MJD */
	10347	break;
	10348	case '$':
	10349	nextchar(pRExC_state);
	10350	if (*RExC_parse)
	10351	RExC_seen_zerolen++;
	10352	if (RExC_flags & RXf_PMf_MULTILINE)
	10353	ret = reg_node(pRExC_state, MEOL);
	10354	else if (RExC_flags & RXf_PMf_SINGLELINE)
	10355	ret = reg_node(pRExC_state, SEOL);
	10356	else
	10357	ret = reg_node(pRExC_state, EOL);
	10358	Set_Node_Length(ret, 1); /* MJD */
	10359	break;
	10360	case '.':
	10361	nextchar(pRExC_state);
	10362	if (RExC_flags & RXf_PMf_SINGLELINE)
	10363	ret = reg_node(pRExC_state, SANY);
	10364	else
	10365	ret = reg_node(pRExC_state, REG_ANY);
	10366	*flagp \|= HASWIDTH\|SIMPLE;
	10367	RExC_naughty++;
	10368	Set_Node_Length(ret, 1); /* MJD */
	10369	break;
	10370	case '[':
	10371	{
	10372	char * const oregcomp_parse = ++RExC_parse;
	10373	ret = regclass(pRExC_state, flagp,depth+1,
	10374	FALSE, /* means parse the whole char class */
	10375	TRUE, /* allow multi-char folds */
	10376	FALSE, /* don't silence non-portable warnings. */
	10377	NULL);
	10378	if (*RExC_parse != ']') {
	10379	RExC_parse = oregcomp_parse;
	10380	vFAIL("Unmatched [");
	10381	}
	10382	if (ret == NULL) {
	10383	if (*flagp & RESTART_UTF8)
	10384	return NULL;
	10385	FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
	10386	(UV) *flagp);
	10387	}
	10388	nextchar(pRExC_state);
	10389	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	10390	break;
	10391	}
	10392	case '(':
	10393	nextchar(pRExC_state);
	10394	ret = reg(pRExC_state, 2, &flags,depth+1);
	10395	if (ret == NULL) {
	10396	if (flags & TRYAGAIN) {
	10397	if (RExC_parse == RExC_end) {
	10398	/* Make parent create an empty node if needed. */
	10399	*flagp \|= TRYAGAIN;
	10400	return(NULL);
	10401	}
	10402	goto tryagain;
	10403	}
	10404	if (flags & RESTART_UTF8) {
	10405	*flagp = RESTART_UTF8;
	10406	return NULL;
	10407	}
	10408	FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
	10409	}
	10410	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	10411	break;
	10412	case '\|':
	10413	case ')':
	10414	if (flags & TRYAGAIN) {
	10415	*flagp \|= TRYAGAIN;
	10416	return NULL;
	10417	}
	10418	vFAIL("Internal urp");
	10419	/* Supposed to be caught earlier. */
	10420	break;
	10421	case '{':
	10422	if (!regcurly(RExC_parse, FALSE)) {
	10423	RExC_parse++;
	10424	goto defchar;
	10425	}
	10426	/* FALL THROUGH */
	10427	case '?':
	10428	case '+':
	10429	case '*':
	10430	RExC_parse++;
	10431	vFAIL("Quantifier follows nothing");
	10432	break;
	10433	case '\\':
	10434	/* Special Escapes
	10435
	10436	This switch handles escape sequences that resolve to some kind
	10437	of special regop and not to literal text. Escape sequnces that
	10438	resolve to literal text are handled below in the switch marked
	10439	"Literal Escapes".
	10440
	10441	Every entry in this switch must have a corresponding entry
	10442	in the literal escape switch. However, the opposite is not
	10443	required, as the default for this switch is to jump to the
	10444	literal text handling code.
	10445	*/
	10446	switch ((U8)*++RExC_parse) {
	10447	U8 arg;
	10448	/* Special Escapes */
	10449	case 'A':
	10450	RExC_seen_zerolen++;
	10451	ret = reg_node(pRExC_state, SBOL);
	10452	*flagp \|= SIMPLE;
	10453	goto finish_meta_pat;
	10454	case 'G':
	10455	ret = reg_node(pRExC_state, GPOS);
	10456	RExC_seen \|= REG_SEEN_GPOS;
	10457	*flagp \|= SIMPLE;
	10458	goto finish_meta_pat;
	10459	case 'K':
	10460	RExC_seen_zerolen++;
	10461	ret = reg_node(pRExC_state, KEEPS);
	10462	*flagp \|= SIMPLE;
	10463	/* XXX:dmq : disabling in-place substitution seems to
	10464	* be necessary here to avoid cases of memory corruption, as
	10465	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	10466	*/
	10467	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10468	goto finish_meta_pat;
	10469	case 'Z':
	10470	ret = reg_node(pRExC_state, SEOL);
	10471	*flagp \|= SIMPLE;
	10472	RExC_seen_zerolen++; /* Do not optimize RE away */
	10473	goto finish_meta_pat;
	10474	case 'z':
	10475	ret = reg_node(pRExC_state, EOS);
	10476	*flagp \|= SIMPLE;
	10477	RExC_seen_zerolen++; /* Do not optimize RE away */
	10478	goto finish_meta_pat;
	10479	case 'C':
	10480	ret = reg_node(pRExC_state, CANY);
	10481	RExC_seen \|= REG_SEEN_CANY;
	10482	*flagp \|= HASWIDTH\|SIMPLE;
	10483	goto finish_meta_pat;
	10484	case 'X':
	10485	ret = reg_node(pRExC_state, CLUMP);
	10486	*flagp \|= HASWIDTH;
	10487	goto finish_meta_pat;
	10488
	10489	case 'W':
	10490	invert = 1;
	10491	/* FALLTHROUGH */
	10492	case 'w':
	10493	arg = ANYOF_WORDCHAR;
	10494	goto join_posix;
	10495
	10496	case 'b':
	10497	RExC_seen_zerolen++;
	10498	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10499	op = BOUND + get_regex_charset(RExC_flags);
	10500	if (op > BOUNDA) { /* /aa is same as /a */
	10501	op = BOUNDA;
	10502	}
	10503	ret = reg_node(pRExC_state, op);
	10504	FLAGS(ret) = get_regex_charset(RExC_flags);
	10505	*flagp \|= SIMPLE;
	10506	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	10507	ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
	10508	}
	10509	goto finish_meta_pat;
	10510	case 'B':
	10511	RExC_seen_zerolen++;
	10512	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10513	op = NBOUND + get_regex_charset(RExC_flags);
	10514	if (op > NBOUNDA) { /* /aa is same as /a */
	10515	op = NBOUNDA;
	10516	}
	10517	ret = reg_node(pRExC_state, op);
	10518	FLAGS(ret) = get_regex_charset(RExC_flags);
	10519	*flagp \|= SIMPLE;
	10520	if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
	10521	ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
	10522	}
	10523	goto finish_meta_pat;
	10524
	10525	case 'D':
	10526	invert = 1;
	10527	/* FALLTHROUGH */
	10528	case 'd':
	10529	arg = ANYOF_DIGIT;
	10530	goto join_posix;
	10531
	10532	case 'R':
	10533	ret = reg_node(pRExC_state, LNBREAK);
	10534	*flagp \|= HASWIDTH\|SIMPLE;
	10535	goto finish_meta_pat;
	10536
	10537	case 'H':
	10538	invert = 1;
	10539	/* FALLTHROUGH */
	10540	case 'h':
	10541	arg = ANYOF_BLANK;
	10542	op = POSIXU;
	10543	goto join_posix_op_known;
	10544
	10545	case 'V':
	10546	invert = 1;
	10547	/* FALLTHROUGH */
	10548	case 'v':
	10549	arg = ANYOF_VERTWS;
	10550	op = POSIXU;
	10551	goto join_posix_op_known;
	10552
	10553	case 'S':
	10554	invert = 1;
	10555	/* FALLTHROUGH */
	10556	case 's':
	10557	arg = ANYOF_SPACE;
	10558
	10559	join_posix:
	10560
	10561	op = POSIXD + get_regex_charset(RExC_flags);
	10562	if (op > POSIXA) { /* /aa is same as /a */
	10563	op = POSIXA;
	10564	}
	10565
	10566	join_posix_op_known:
	10567
	10568	if (invert) {
	10569	op += NPOSIXD - POSIXD;
	10570	}
	10571
	10572	ret = reg_node(pRExC_state, op);
	10573	if (! SIZE_ONLY) {
	10574	FLAGS(ret) = namedclass_to_classnum(arg);
	10575	}
	10576
	10577	*flagp \|= HASWIDTH\|SIMPLE;
	10578	/* FALL THROUGH */
	10579
	10580	finish_meta_pat:
	10581	nextchar(pRExC_state);
	10582	Set_Node_Length(ret, 2); /* MJD */
	10583	break;
	10584	case 'p':
	10585	case 'P':
	10586	{
	10587	#ifdef DEBUGGING
	10588	char* parse_start = RExC_parse - 2;
	10589	#endif
	10590
	10591	RExC_parse--;
	10592
	10593	ret = regclass(pRExC_state, flagp,depth+1,
	10594	TRUE, /* means just parse this element */
	10595	FALSE, /* don't allow multi-char folds */
	10596	FALSE, /* don't silence non-portable warnings.
	10597	It would be a bug if these returned
	10598	non-portables */
	10599	NULL);
	10600	/* regclass() can only return RESTART_UTF8 if multi-char folds
	10601	are allowed. */
	10602	if (!ret)
	10603	FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
	10604	(UV) *flagp);
	10605
	10606	RExC_parse--;
	10607
	10608	Set_Node_Offset(ret, parse_start + 2);
	10609	Set_Node_Cur_Length(ret, parse_start);
	10610	nextchar(pRExC_state);
	10611	}
	10612	break;
	10613	case 'N':
	10614	/* Handle \N and \N{NAME} with multiple code points here and not
	10615	* below because it can be multicharacter. join_exact() will join
	10616	* them up later on. Also this makes sure that things like
	10617	* /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
	10618	* The options to the grok function call causes it to fail if the
	10619	* sequence is just a single code point. We then go treat it as
	10620	* just another character in the current EXACT node, and hence it
	10621	* gets uniform treatment with all the other characters. The
	10622	* special treatment for quantifiers is not needed for such single
	10623	* character sequences */
	10624	++RExC_parse;
	10625	if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
	10626	FALSE /* not strict */ )) {
	10627	if (*flagp & RESTART_UTF8)
	10628	return NULL;
	10629	RExC_parse--;
	10630	goto defchar;
	10631	}
	10632	break;
	10633	case 'k': /* Handle \k<NAME> and \k'NAME' */
	10634	parse_named_seq:
	10635	{
	10636	char ch= RExC_parse[1];
	10637	if (ch != '<' && ch != '\'' && ch != '{') {
	10638	RExC_parse++;
	10639	vFAIL2("Sequence %.2s... not terminated",parse_start);
	10640	} else {
	10641	/* this pretty much dupes the code for (?P=...) in reg(), if
	10642	you change this make sure you change that */
	10643	char* name_start = (RExC_parse += 2);
	10644	U32 num = 0;
	10645	SV *sv_dat = reg_scan_name(pRExC_state,
	10646	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	10647	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	10648	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	10649	vFAIL2("Sequence %.3s... not terminated",parse_start);
	10650
	10651	if (!SIZE_ONLY) {
	10652	num = add_data( pRExC_state, 1, "S" );
	10653	RExC_rxi->data->data[num]=(void*)sv_dat;
	10654	SvREFCNT_inc_simple_void(sv_dat);
	10655	}
	10656
	10657	RExC_sawback = 1;
	10658	ret = reganode(pRExC_state,
	10659	((! FOLD)
	10660	? NREF
	10661	: (ASCII_FOLD_RESTRICTED)
	10662	? NREFFA
	10663	: (AT_LEAST_UNI_SEMANTICS)
	10664	? NREFFU
	10665	: (LOC)
	10666	? NREFFL
	10667	: NREFF),
	10668	num);
	10669	*flagp \|= HASWIDTH;
	10670
	10671	/* override incorrect value set in reganode MJD */
	10672	Set_Node_Offset(ret, parse_start+1);
	10673	Set_Node_Cur_Length(ret, parse_start);
	10674	nextchar(pRExC_state);
	10675
	10676	}
	10677	break;
	10678	}
	10679	case 'g':
	10680	case '1': case '2': case '3': case '4':
	10681	case '5': case '6': case '7': case '8': case '9':
	10682	{
	10683	I32 num;
	10684	bool isg = *RExC_parse == 'g';
	10685	bool isrel = 0;
	10686	bool hasbrace = 0;
	10687	if (isg) {
	10688	RExC_parse++;
	10689	if (*RExC_parse == '{') {
	10690	RExC_parse++;
	10691	hasbrace = 1;
	10692	}
	10693	if (*RExC_parse == '-') {
	10694	RExC_parse++;
	10695	isrel = 1;
	10696	}
	10697	if (hasbrace && !isDIGIT(*RExC_parse)) {
	10698	if (isrel) RExC_parse--;
	10699	RExC_parse -= 2;
	10700	goto parse_named_seq;
	10701	} }
	10702	num = atoi(RExC_parse);
	10703	if (isg && num == 0) {
	10704	if (*RExC_parse == '0') {
	10705	vFAIL("Reference to invalid group 0");
	10706	}
	10707	else {
	10708	vFAIL("Unterminated \\g... pattern");
	10709	}
	10710	}
	10711	if (isrel) {
	10712	num = RExC_npar - num;
	10713	if (num < 1)
	10714	vFAIL("Reference to nonexistent or unclosed group");
	10715	}
	10716	if (!isg && num > 9 && num >= RExC_npar && RExC_parse != '8' && RExC_parse != '9')
	10717	/* Probably a character specified in octal, e.g. \35 */
	10718	goto defchar;
	10719	else {
	10720	#ifdef RE_TRACK_PATTERN_OFFSETS
	10721	char * const parse_start = RExC_parse - 1; /* MJD */
	10722	#endif
	10723	while (isDIGIT(*RExC_parse))
	10724	RExC_parse++;
	10725	if (hasbrace) {
	10726	if (*RExC_parse != '}')
	10727	vFAIL("Unterminated \\g{...} pattern");
	10728	RExC_parse++;
	10729	}
	10730	if (!SIZE_ONLY) {
	10731	if (num > (I32)RExC_rx->nparens)
	10732	vFAIL("Reference to nonexistent group");
	10733	}
	10734	RExC_sawback = 1;
	10735	ret = reganode(pRExC_state,
	10736	((! FOLD)
	10737	? REF
	10738	: (ASCII_FOLD_RESTRICTED)
	10739	? REFFA
	10740	: (AT_LEAST_UNI_SEMANTICS)
	10741	? REFFU
	10742	: (LOC)
	10743	? REFFL
	10744	: REFF),
	10745	num);
	10746	*flagp \|= HASWIDTH;
	10747
	10748	/* override incorrect value set in reganode MJD */
	10749	Set_Node_Offset(ret, parse_start+1);
	10750	Set_Node_Cur_Length(ret, parse_start);
	10751	RExC_parse--;
	10752	nextchar(pRExC_state);
	10753	}
	10754	}
	10755	break;
	10756	case '\0':
	10757	if (RExC_parse >= RExC_end)
	10758	FAIL("Trailing \\");
	10759	/* FALL THROUGH */
	10760	default:
	10761	/* Do not generate "unrecognized" warnings here, we fall
	10762	back into the quick-grab loop below */
	10763	parse_start--;
	10764	goto defchar;
	10765	}
	10766	break;
	10767
	10768	case '#':
	10769	if (RExC_flags & RXf_PMf_EXTENDED) {
	10770	if ( reg_skipcomment( pRExC_state ) )
	10771	goto tryagain;
	10772	}
	10773	/* FALL THROUGH */
	10774
	10775	default:
	10776
	10777	parse_start = RExC_parse - 1;
	10778
	10779	RExC_parse++;
	10780
	10781	defchar: {
	10782	STRLEN len = 0;
	10783	UV ender = 0;
	10784	char *p;
	10785	char *s;
	10786	#define MAX_NODE_STRING_SIZE 127
	10787	char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
	10788	char *s0;
	10789	U8 upper_parse = MAX_NODE_STRING_SIZE;
	10790	STRLEN foldlen;
	10791	U8 node_type = compute_EXACTish(pRExC_state);
	10792	bool next_is_quantifier;
	10793	char * oldp = NULL;
	10794
	10795	/* We can convert EXACTF nodes to EXACTFU if they contain only
	10796	* characters that match identically regardless of the target
	10797	* string's UTF8ness. The reason to do this is that EXACTF is not
	10798	* trie-able, EXACTFU is. (We don't need to figure this out until
	10799	* pass 2) */
	10800	bool maybe_exactfu = node_type == EXACTF && PASS2;
	10801
	10802	/* If a folding node contains only code points that don't
	10803	* participate in folds, it can be changed into an EXACT node,
	10804	* which allows the optimizer more things to look for */
	10805	bool maybe_exact;
	10806
	10807	ret = reg_node(pRExC_state, node_type);
	10808
	10809	/* In pass1, folded, we use a temporary buffer instead of the
	10810	* actual node, as the node doesn't exist yet */
	10811	s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
	10812
	10813	s0 = s;
	10814
	10815	reparse:
	10816
	10817	/* We do the EXACTFish to EXACT node only if folding, and not if in
	10818	* locale, as whether a character folds or not isn't known until
	10819	* runtime. (And we don't need to figure this out until pass 2) */
	10820	maybe_exact = FOLD && ! LOC && PASS2;
	10821
	10822	/* XXX The node can hold up to 255 bytes, yet this only goes to
	10823	* 127. I (khw) do not know why. Keeping it somewhat less than
	10824	* 255 allows us to not have to worry about overflow due to
	10825	* converting to utf8 and fold expansion, but that value is
	10826	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	10827	* split up by this limit into a single one using the real max of
	10828	* 255. Even at 127, this breaks under rare circumstances. If
	10829	* folding, we do not want to split a node at a character that is a
	10830	* non-final in a multi-char fold, as an input string could just
	10831	* happen to want to match across the node boundary. The join
	10832	* would solve that problem if the join actually happens. But a
	10833	* series of more than two nodes in a row each of 127 would cause
	10834	* the first join to succeed to get to 254, but then there wouldn't
	10835	* be room for the next one, which could at be one of those split
	10836	* multi-char folds. I don't know of any fool-proof solution. One
	10837	* could back off to end with only a code point that isn't such a
	10838	* non-final, but it is possible for there not to be any in the
	10839	* entire node. */
	10840	for (p = RExC_parse - 1;
	10841	len < upper_parse && p < RExC_end;
	10842	len++)
	10843	{
	10844	oldp = p;
	10845
	10846	if (RExC_flags & RXf_PMf_EXTENDED)
	10847	p = regwhite( pRExC_state, p );
	10848	switch ((U8)*p) {
	10849	case '^':
	10850	case '$':
	10851	case '.':
	10852	case '[':
	10853	case '(':
	10854	case ')':
	10855	case '\|':
	10856	goto loopdone;
	10857	case '\\':
	10858	/* Literal Escapes Switch
	10859
	10860	This switch is meant to handle escape sequences that
	10861	resolve to a literal character.
	10862
	10863	Every escape sequence that represents something
	10864	else, like an assertion or a char class, is handled
	10865	in the switch marked 'Special Escapes' above in this
	10866	routine, but also has an entry here as anything that
	10867	isn't explicitly mentioned here will be treated as
	10868	an unescaped equivalent literal.
	10869	*/
	10870
	10871	switch ((U8)*++p) {
	10872	/* These are all the special escapes. */
	10873	case 'A': /* Start assertion */
	10874	case 'b': case 'B': /* Word-boundary assertion*/
	10875	case 'C': /* Single char !DANGEROUS! */
	10876	case 'd': case 'D': /* digit class */
	10877	case 'g': case 'G': /* generic-backref, pos assertion */
	10878	case 'h': case 'H': /* HORIZWS */
	10879	case 'k': case 'K': /* named backref, keep marker */
	10880	case 'p': case 'P': /* Unicode property */
	10881	case 'R': /* LNBREAK */
	10882	case 's': case 'S': /* space class */
	10883	case 'v': case 'V': /* VERTWS */
	10884	case 'w': case 'W': /* word class */
	10885	case 'X': /* eXtended Unicode "combining character sequence" */
	10886	case 'z': case 'Z': /* End of line/string assertion */
	10887	--p;
	10888	goto loopdone;
	10889
	10890	/* Anything after here is an escape that resolves to a
	10891	literal. (Except digits, which may or may not)
	10892	*/
	10893	case 'n':
	10894	ender = '\n';
	10895	p++;
	10896	break;
	10897	case 'N': /* Handle a single-code point named character. */
	10898	/* The options cause it to fail if a multiple code
	10899	* point sequence. Handle those in the switch() above
	10900	* */
	10901	RExC_parse = p + 1;
	10902	if (! grok_bslash_N(pRExC_state, NULL, &ender,
	10903	flagp, depth, FALSE,
	10904	FALSE /* not strict */ ))
	10905	{
	10906	if (*flagp & RESTART_UTF8)
	10907	FAIL("panic: grok_bslash_N set RESTART_UTF8");
	10908	RExC_parse = p = oldp;
	10909	goto loopdone;
	10910	}
	10911	p = RExC_parse;
	10912	if (ender > 0xff) {
	10913	REQUIRE_UTF8;
	10914	}
	10915	break;
	10916	case 'r':
	10917	ender = '\r';
	10918	p++;
	10919	break;
	10920	case 't':
	10921	ender = '\t';
	10922	p++;
	10923	break;
	10924	case 'f':
	10925	ender = '\f';
	10926	p++;
	10927	break;
	10928	case 'e':
	10929	ender = ASCII_TO_NATIVE('\033');
	10930	p++;
	10931	break;
	10932	case 'a':
	10933	ender = '\a';
	10934	p++;
	10935	break;
	10936	case 'o':
	10937	{
	10938	UV result;
	10939	const char* error_msg;
	10940
	10941	bool valid = grok_bslash_o(&p,
	10942	&result,
	10943	&error_msg,
	10944	TRUE, /* out warnings */
	10945	FALSE, /* not strict */
	10946	TRUE, /* Output warnings
	10947	for non-
	10948	portables */
	10949	UTF);
	10950	if (! valid) {
	10951	RExC_parse = p; /* going to die anyway; point
	10952	to exact spot of failure */
	10953	vFAIL(error_msg);
	10954	}
	10955	ender = result;
	10956	if (PL_encoding && ender < 0x100) {
	10957	goto recode_encoding;
	10958	}
	10959	if (ender > 0xff) {
	10960	REQUIRE_UTF8;
	10961	}
	10962	break;
	10963	}
	10964	case 'x':
	10965	{
	10966	UV result = UV_MAX; /* initialize to erroneous
	10967	value */
	10968	const char* error_msg;
	10969
	10970	bool valid = grok_bslash_x(&p,
	10971	&result,
	10972	&error_msg,
	10973	TRUE, /* out warnings */
	10974	FALSE, /* not strict */
	10975	TRUE, /* Output warnings
	10976	for non-
	10977	portables */
	10978	UTF);
	10979	if (! valid) {
	10980	RExC_parse = p; /* going to die anyway; point
	10981	to exact spot of failure */
	10982	vFAIL(error_msg);
	10983	}
	10984	ender = result;
	10985
	10986	if (PL_encoding && ender < 0x100) {
	10987	goto recode_encoding;
	10988	}
	10989	if (ender > 0xff) {
	10990	REQUIRE_UTF8;
	10991	}
	10992	break;
	10993	}
	10994	case 'c':
	10995	p++;
	10996	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	10997	break;
	10998	case '8': case '9': /* must be a backreference */
	10999	--p;
	11000	goto loopdone;
	11001	case '1': case '2': case '3':case '4':
	11002	case '5': case '6': case '7':
	11003	/* When we parse backslash escapes there is ambiguity
	11004	* between backreferences and octal escapes. Any escape
	11005	* from \1 - \9 is a backreference, any multi-digit
	11006	* escape which does not start with 0 and which when
	11007	* evaluated as decimal could refer to an already
	11008	* parsed capture buffer is a backslash. Anything else
	11009	* is octal.
	11010	*
	11011	* Note this implies that \118 could be interpreted as
	11012	* 118 OR as "\11" . "8" depending on whether there
	11013	* were 118 capture buffers defined already in the
	11014	* pattern. */
	11015	if ( !isDIGIT(p[1]) \|\| atoi(p) <= RExC_npar )
	11016	{ /* Not to be treated as an octal constant, go
	11017	find backref */
	11018	--p;
	11019	goto loopdone;
	11020	}
	11021	case '0':
	11022	{
	11023	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	11024	STRLEN numlen = 3;
	11025	ender = grok_oct(p, &numlen, &flags, NULL);
	11026	if (ender > 0xff) {
	11027	REQUIRE_UTF8;
	11028	}
	11029	p += numlen;
	11030	if (SIZE_ONLY /* like \08, \178 */
	11031	&& numlen < 3
	11032	&& p < RExC_end
	11033	&& isDIGIT(*p) && ckWARN(WARN_REGEXP))
	11034	{
	11035	reg_warn_non_literal_string(
	11036	p + 1,
	11037	form_short_octal_warning(p, numlen));
	11038	}
	11039	}
	11040	if (PL_encoding && ender < 0x100)
	11041	goto recode_encoding;
	11042	break;
	11043	recode_encoding:
	11044	if (! RExC_override_recoding) {
	11045	SV* enc = PL_encoding;
	11046	ender = reg_recode((const char)(U8)ender, &enc);
	11047	if (!enc && SIZE_ONLY)
	11048	ckWARNreg(p, "Invalid escape in the specified encoding");
	11049	REQUIRE_UTF8;
	11050	}
	11051	break;
	11052	case '\0':
	11053	if (p >= RExC_end)
	11054	FAIL("Trailing \\");
	11055	/* FALL THROUGH */
	11056	default:
	11057	if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
	11058	/* Include any { following the alpha to emphasize
	11059	* that it could be part of an escape at some point
	11060	* in the future */
	11061	int len = (isALPHA(p) && (p + 1) == '{') ? 2 : 1;
	11062	ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
	11063	}
	11064	goto normal_default;
	11065	} /* End of switch on '\' */
	11066	break;
	11067	default: /* A literal character */
	11068
	11069	if (! SIZE_ONLY
	11070	&& RExC_flags & RXf_PMf_EXTENDED
	11071	&& ckWARN_d(WARN_DEPRECATED)
	11072	&& is_PATWS_non_low(p, UTF))
	11073	{
	11074	vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
	11075	"Escape literal pattern white space under /x");
	11076	}
	11077
	11078	normal_default:
	11079	if (UTF8_IS_START(*p) && UTF) {
	11080	STRLEN numlen;
	11081	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	11082	&numlen, UTF8_ALLOW_DEFAULT);
	11083	p += numlen;
	11084	}
	11085	else
	11086	ender = (U8) *p++;
	11087	break;
	11088	} /* End of switch on the literal */
	11089
	11090	/* Here, have looked at the literal character and <ender>
	11091	* contains its ordinal, <p> points to the character after it
	11092	*/
	11093
	11094	if ( RExC_flags & RXf_PMf_EXTENDED)
	11095	p = regwhite( pRExC_state, p );
	11096
	11097	/* If the next thing is a quantifier, it applies to this
	11098	* character only, which means that this character has to be in
	11099	* its own node and can't just be appended to the string in an
	11100	* existing node, so if there are already other characters in
	11101	* the node, close the node with just them, and set up to do
	11102	* this character again next time through, when it will be the
	11103	* only thing in its new node */
	11104	if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
	11105	{
	11106	p = oldp;
	11107	goto loopdone;
	11108	}
	11109
	11110	if (! FOLD) {
	11111	if (UTF) {
	11112	const STRLEN unilen = reguni(pRExC_state, ender, s);
	11113	if (unilen > 0) {
	11114	s += unilen;
	11115	len += unilen;
	11116	}
	11117
	11118	/* The loop increments <len> each time, as all but this
	11119	* path (and one other) through it add a single byte to
	11120	* the EXACTish node. But this one has changed len to
	11121	* be the correct final value, so subtract one to
	11122	* cancel out the increment that follows */
	11123	len--;
	11124	}
	11125	else {
	11126	REGC((char)ender, s++);
	11127	}
	11128	}
	11129	else /* FOLD */ if (! ( UTF
	11130	/* See comments for join_exact() as to why we fold this
	11131	* non-UTF at compile time */
	11132	\|\| (node_type == EXACTFU
	11133	&& ender == LATIN_SMALL_LETTER_SHARP_S)))
	11134	{
	11135	if (IS_IN_SOME_FOLD_L1(ender)) {
	11136	maybe_exact = FALSE;
	11137
	11138	/* See if the character's fold differs between /d and
	11139	* /u. This includes the multi-char fold SHARP S to
	11140	* 'ss' */
	11141	if (maybe_exactfu
	11142	&& (PL_fold[ender] != PL_fold_latin1[ender]
	11143	\|\| ender == LATIN_SMALL_LETTER_SHARP_S
	11144	\|\| (len > 0
	11145	&& isARG2_lower_or_UPPER_ARG1('s', ender)
	11146	&& isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
	11147	{
	11148	maybe_exactfu = FALSE;
	11149	}
	11150	}
	11151	*(s++) = (char) ender;
	11152	}
	11153	else { /* UTF */
	11154
	11155	/* Prime the casefolded buffer. Locale rules, which apply
	11156	* only to code points < 256, aren't known until execution,
	11157	* so for them, just output the original character using
	11158	* utf8. If we start to fold non-UTF patterns, be sure to
	11159	* update join_exact() */
	11160	if (LOC && ender < 256) {
	11161	if (NATIVE_IS_INVARIANT(ender)) {
	11162	*s = (U8) ender;
	11163	foldlen = 1;
	11164	} else {
	11165	*s = UTF8_TWO_BYTE_HI(ender);
	11166	*(s + 1) = UTF8_TWO_BYTE_LO(ender);
	11167	foldlen = 2;
	11168	}
	11169	}
	11170	else {
	11171	UV folded = _to_uni_fold_flags(
	11172	ender,
	11173	(U8 *) s,
	11174	&foldlen,
	11175	FOLD_FLAGS_FULL
	11176	\| ((LOC) ? FOLD_FLAGS_LOCALE
	11177	: (ASCII_FOLD_RESTRICTED)
	11178	? FOLD_FLAGS_NOMIX_ASCII
	11179	: 0)
	11180	);
	11181
	11182	/* If this node only contains non-folding code points
	11183	* so far, see if this new one is also non-folding */
	11184	if (maybe_exact) {
	11185	if (folded != ender) {
	11186	maybe_exact = FALSE;
	11187	}
	11188	else {
	11189	/* Here the fold is the original; we have
	11190	* to check further to see if anything
	11191	* folds to it */
	11192	if (! PL_utf8_foldable) {
	11193	SV* swash = swash_init("utf8",
	11194	"_Perl_Any_Folds",
	11195	&PL_sv_undef, 1, 0);
	11196	PL_utf8_foldable =
	11197	_get_swash_invlist(swash);
	11198	SvREFCNT_dec_NN(swash);
	11199	}
	11200	if (_invlist_contains_cp(PL_utf8_foldable,
	11201	ender))
	11202	{
	11203	maybe_exact = FALSE;
	11204	}
	11205	}
	11206	}
	11207	ender = folded;
	11208	}
	11209	s += foldlen;
	11210
	11211	/* The loop increments <len> each time, as all but this
	11212	* path (and one other) through it add a single byte to the
	11213	* EXACTish node. But this one has changed len to be the
	11214	* correct final value, so subtract one to cancel out the
	11215	* increment that follows */
	11216	len += foldlen - 1;
	11217	}
	11218
	11219	if (next_is_quantifier) {
	11220
	11221	/* Here, the next input is a quantifier, and to get here,
	11222	* the current character is the only one in the node.
	11223	* Also, here <len> doesn't include the final byte for this
	11224	* character */
	11225	len++;
	11226	goto loopdone;
	11227	}
	11228
	11229	} /* End of loop through literal characters */
	11230
	11231	/* Here we have either exhausted the input or ran out of room in
	11232	* the node. (If we encountered a character that can't be in the
	11233	* node, transfer is made directly to <loopdone>, and so we
	11234	* wouldn't have fallen off the end of the loop.) In the latter
	11235	* case, we artificially have to split the node into two, because
	11236	* we just don't have enough space to hold everything. This
	11237	* creates a problem if the final character participates in a
	11238	* multi-character fold in the non-final position, as a match that
	11239	* should have occurred won't, due to the way nodes are matched,
	11240	* and our artificial boundary. So back off until we find a non-
	11241	* problematic character -- one that isn't at the beginning or
	11242	* middle of such a fold. (Either it doesn't participate in any
	11243	* folds, or appears only in the final position of all the folds it
	11244	* does participate in.) A better solution with far fewer false
	11245	* positives, and that would fill the nodes more completely, would
	11246	* be to actually have available all the multi-character folds to
	11247	* test against, and to back-off only far enough to be sure that
	11248	* this node isn't ending with a partial one. <upper_parse> is set
	11249	* further below (if we need to reparse the node) to include just
	11250	* up through that final non-problematic character that this code
	11251	* identifies, so when it is set to less than the full node, we can
	11252	* skip the rest of this */
	11253	if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
	11254
	11255	const STRLEN full_len = len;
	11256
	11257	assert(len >= MAX_NODE_STRING_SIZE);
	11258
	11259	/* Here, <s> points to the final byte of the final character.
	11260	* Look backwards through the string until find a non-
	11261	* problematic character */
	11262
	11263	if (! UTF) {
	11264
	11265	/* These two have no multi-char folds to non-UTF characters
	11266	*/
	11267	if (ASCII_FOLD_RESTRICTED \|\| LOC) {
	11268	goto loopdone;
	11269	}
	11270
	11271	while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
	11272	len = s - s0 + 1;
	11273	}
	11274	else {
	11275	if (! PL_NonL1NonFinalFold) {
	11276	PL_NonL1NonFinalFold = _new_invlist_C_array(
	11277	NonL1_Perl_Non_Final_Folds_invlist);
	11278	}
	11279
	11280	/* Point to the first byte of the final character */
	11281	s = (char ) utf8_hop((U8 ) s, -1);
	11282
	11283	while (s >= s0) { /* Search backwards until find
	11284	non-problematic char */
	11285	if (UTF8_IS_INVARIANT(*s)) {
	11286
	11287	/* There are no ascii characters that participate
	11288	* in multi-char folds under /aa. In EBCDIC, the
	11289	* non-ascii invariants are all control characters,
	11290	* so don't ever participate in any folds. */
	11291	if (ASCII_FOLD_RESTRICTED
	11292	\|\| ! IS_NON_FINAL_FOLD(*s))
	11293	{
	11294	break;
	11295	}
	11296	}
	11297	else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
	11298
	11299	/* No Latin1 characters participate in multi-char
	11300	* folds under /l */
	11301	if (LOC
	11302	\|\| ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
	11303	s, (s+1))))
	11304	{
	11305	break;
	11306	}
	11307	}
	11308	else if (! _invlist_contains_cp(
	11309	PL_NonL1NonFinalFold,
	11310	valid_utf8_to_uvchr((U8 *) s, NULL)))
	11311	{
	11312	break;
	11313	}
	11314
	11315	/* Here, the current character is problematic in that
	11316	* it does occur in the non-final position of some
	11317	* fold, so try the character before it, but have to
	11318	* special case the very first byte in the string, so
	11319	* we don't read outside the string */
	11320	s = (s == s0) ? s -1 : (char ) utf8_hop((U8 ) s, -1);
	11321	} /* End of loop backwards through the string */
	11322
	11323	/* If there were only problematic characters in the string,
	11324	* <s> will point to before s0, in which case the length
	11325	* should be 0, otherwise include the length of the
	11326	* non-problematic character just found */
	11327	len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
	11328	}
	11329
	11330	/* Here, have found the final character, if any, that is
	11331	* non-problematic as far as ending the node without splitting
	11332	* it across a potential multi-char fold. <len> contains the
	11333	* number of bytes in the node up-to and including that
	11334	* character, or is 0 if there is no such character, meaning
	11335	* the whole node contains only problematic characters. In
	11336	* this case, give up and just take the node as-is. We can't
	11337	* do any better */
	11338	if (len == 0) {
	11339	len = full_len;
	11340
	11341	/* If the node ends in an 's' we make sure it stays EXACTF,
	11342	* as if it turns into an EXACTFU, it could later get
	11343	* joined with another 's' that would then wrongly match
	11344	* the sharp s */
	11345	if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
	11346	{
	11347	maybe_exactfu = FALSE;
	11348	}
	11349	} else {
	11350
	11351	/* Here, the node does contain some characters that aren't
	11352	* problematic. If one such is the final character in the
	11353	* node, we are done */
	11354	if (len == full_len) {
	11355	goto loopdone;
	11356	}
	11357	else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
	11358
	11359	/* If the final character is problematic, but the
	11360	* penultimate is not, back-off that last character to
	11361	* later start a new node with it */
	11362	p = oldp;
	11363	goto loopdone;
	11364	}
	11365
	11366	/* Here, the final non-problematic character is earlier
	11367	* in the input than the penultimate character. What we do
	11368	* is reparse from the beginning, going up only as far as
	11369	* this final ok one, thus guaranteeing that the node ends
	11370	* in an acceptable character. The reason we reparse is
	11371	* that we know how far in the character is, but we don't
	11372	* know how to correlate its position with the input parse.
	11373	* An alternate implementation would be to build that
	11374	* correlation as we go along during the original parse,
	11375	* but that would entail extra work for every node, whereas
	11376	* this code gets executed only when the string is too
	11377	* large for the node, and the final two characters are
	11378	* problematic, an infrequent occurrence. Yet another
	11379	* possible strategy would be to save the tail of the
	11380	* string, and the next time regatom is called, initialize
	11381	* with that. The problem with this is that unless you
	11382	* back off one more character, you won't be guaranteed
	11383	* regatom will get called again, unless regbranch,
	11384	* regpiece ... are also changed. If you do back off that
	11385	* extra character, so that there is input guaranteed to
	11386	* force calling regatom, you can't handle the case where
	11387	* just the first character in the node is acceptable. I
	11388	* (khw) decided to try this method which doesn't have that
	11389	* pitfall; if performance issues are found, we can do a
	11390	* combination of the current approach plus that one */
	11391	upper_parse = len;
	11392	len = 0;
	11393	s = s0;
	11394	goto reparse;
	11395	}
	11396	} /* End of verifying node ends with an appropriate char */
	11397
	11398	loopdone: /* Jumped to when encounters something that shouldn't be in
	11399	the node */
	11400
	11401	/* I (khw) don't know if you can get here with zero length, but the
	11402	* old code handled this situation by creating a zero-length EXACT
	11403	* node. Might as well be NOTHING instead */
	11404	if (len == 0) {
	11405	OP(ret) = NOTHING;
	11406	}
	11407	else {
	11408	if (FOLD) {
	11409	/* If 'maybe_exact' is still set here, means there are no
	11410	* code points in the node that participate in folds;
	11411	* similarly for 'maybe_exactfu' and code points that match
	11412	* differently depending on UTF8ness of the target string
	11413	* */
	11414	if (maybe_exact) {
	11415	OP(ret) = EXACT;
	11416	}
	11417	else if (maybe_exactfu) {
	11418	OP(ret) = EXACTFU;
	11419	}
	11420	}
	11421	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
	11422	}
	11423
	11424	RExC_parse = p - 1;
	11425	Set_Node_Cur_Length(ret, parse_start);
	11426	nextchar(pRExC_state);
	11427	{
	11428	/* len is STRLEN which is unsigned, need to copy to signed */
	11429	IV iv = len;
	11430	if (iv < 0)
	11431	vFAIL("Internal disaster");
	11432	}
	11433
	11434	} /* End of label 'defchar:' */
	11435	break;
	11436	} /* End of giant switch on input character */
	11437
	11438	return(ret);
	11439	}
	11440
	11441	STATIC char *
	11442	S_regwhite( RExC_state_t pRExC_state, char p )
	11443	{
	11444	const char *e = RExC_end;
	11445
	11446	PERL_ARGS_ASSERT_REGWHITE;
	11447
	11448	while (p < e) {
	11449	if (isSPACE(*p))
	11450	++p;
	11451	else if (*p == '#') {
	11452	bool ended = 0;
	11453	do {
	11454	if (*p++ == '\n') {
	11455	ended = 1;
	11456	break;
	11457	}
	11458	} while (p < e);
	11459	if (!ended)
	11460	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	11461	}
	11462	else
	11463	break;
	11464	}
	11465	return p;
	11466	}
	11467
	11468	STATIC char *
	11469	S_regpatws( RExC_state_t pRExC_state, char p , const bool recognize_comment )
	11470	{
	11471	/* Returns the next non-pattern-white space, non-comment character (the
	11472	* latter only if 'recognize_comment is true) in the string p, which is
	11473	* ended by RExC_end. If there is no line break ending a comment,
	11474	* RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
	11475	const char *e = RExC_end;
	11476
	11477	PERL_ARGS_ASSERT_REGPATWS;
	11478
	11479	while (p < e) {
	11480	STRLEN len;
	11481	if ((len = is_PATWS_safe(p, e, UTF))) {
	11482	p += len;
	11483	}
	11484	else if (recognize_comment && *p == '#') {
	11485	bool ended = 0;
	11486	do {
	11487	p++;
	11488	if (is_LNBREAK_safe(p, e, UTF)) {
	11489	ended = 1;
	11490	break;
	11491	}
	11492	} while (p < e);
	11493	if (!ended)
	11494	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	11495	}
	11496	else
	11497	break;
	11498	}
	11499	return p;
	11500	}
	11501
	11502	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	11503	Character classes ([:foo:]) can also be negated ([:^foo:]).
	11504	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	11505	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	11506	but trigger failures because they are currently unimplemented. */
	11507
	11508	#define POSIXCC_DONE(c) ((c) == ':')
	11509	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	11510	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	11511
	11512	PERL_STATIC_INLINE I32
	11513	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
	11514	{
	11515	dVAR;
	11516	I32 namedclass = OOB_NAMEDCLASS;
	11517
	11518	PERL_ARGS_ASSERT_REGPPOSIXCC;
	11519
	11520	if (value == '[' && RExC_parse + 1 < RExC_end &&
	11521	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	11522	POSIXCC(UCHARAT(RExC_parse)))
	11523	{
	11524	const char c = UCHARAT(RExC_parse);
	11525	char* const s = RExC_parse++;
	11526
	11527	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	11528	RExC_parse++;
	11529	if (RExC_parse == RExC_end) {
	11530	if (strict) {
	11531
	11532	/* Try to give a better location for the error (than the end of
	11533	* the string) by looking for the matching ']' */
	11534	RExC_parse = s;
	11535	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	11536	RExC_parse++;
	11537	}
	11538	vFAIL2("Unmatched '%c' in POSIX class", c);
	11539	}
	11540	/* Grandfather lone [:, [=, [. */
	11541	RExC_parse = s;
	11542	}
	11543	else {
	11544	const char* const t = RExC_parse++; /* skip over the c */
	11545	assert(*t == c);
	11546
	11547	if (UCHARAT(RExC_parse) == ']') {
	11548	const char *posixcc = s + 1;
	11549	RExC_parse++; /* skip over the ending ] */
	11550
	11551	if (*s == ':') {
	11552	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	11553	const I32 skip = t - posixcc;
	11554
	11555	/* Initially switch on the length of the name. */
	11556	switch (skip) {
	11557	case 4:
	11558	if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
	11559	this is the Perl \w
	11560	*/
	11561	namedclass = ANYOF_WORDCHAR;
	11562	break;
	11563	case 5:
	11564	/* Names all of length 5. */
	11565	/* alnum alpha ascii blank cntrl digit graph lower
	11566	print punct space upper */
	11567	/* Offset 4 gives the best switch position. */
	11568	switch (posixcc[4]) {
	11569	case 'a':
	11570	if (memEQ(posixcc, "alph", 4)) /* alpha */
	11571	namedclass = ANYOF_ALPHA;
	11572	break;
	11573	case 'e':
	11574	if (memEQ(posixcc, "spac", 4)) /* space */
	11575	namedclass = ANYOF_PSXSPC;
	11576	break;
	11577	case 'h':
	11578	if (memEQ(posixcc, "grap", 4)) /* graph */
	11579	namedclass = ANYOF_GRAPH;
	11580	break;
	11581	case 'i':
	11582	if (memEQ(posixcc, "asci", 4)) /* ascii */
	11583	namedclass = ANYOF_ASCII;
	11584	break;
	11585	case 'k':
	11586	if (memEQ(posixcc, "blan", 4)) /* blank */
	11587	namedclass = ANYOF_BLANK;
	11588	break;
	11589	case 'l':
	11590	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	11591	namedclass = ANYOF_CNTRL;
	11592	break;
	11593	case 'm':
	11594	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	11595	namedclass = ANYOF_ALPHANUMERIC;
	11596	break;
	11597	case 'r':
	11598	if (memEQ(posixcc, "lowe", 4)) /* lower */
	11599	namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
	11600	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	11601	namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
	11602	break;
	11603	case 't':
	11604	if (memEQ(posixcc, "digi", 4)) /* digit */
	11605	namedclass = ANYOF_DIGIT;
	11606	else if (memEQ(posixcc, "prin", 4)) /* print */
	11607	namedclass = ANYOF_PRINT;
	11608	else if (memEQ(posixcc, "punc", 4)) /* punct */
	11609	namedclass = ANYOF_PUNCT;
	11610	break;
	11611	}
	11612	break;
	11613	case 6:
	11614	if (memEQ(posixcc, "xdigit", 6))
	11615	namedclass = ANYOF_XDIGIT;
	11616	break;
	11617	}
	11618
	11619	if (namedclass == OOB_NAMEDCLASS)
	11620	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	11621	t - s - 1, s + 1);
	11622
	11623	/* The #defines are structured so each complement is +1 to
	11624	* the normal one */
	11625	if (complement) {
	11626	namedclass++;
	11627	}
	11628	assert (posixcc[skip] == ':');
	11629	assert (posixcc[skip+1] == ']');
	11630	} else if (!SIZE_ONLY) {
	11631	/* [[=foo=]] and [[.foo.]] are still future. */
	11632
	11633	/* adjust RExC_parse so the warning shows after
	11634	the class closes */
	11635	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	11636	RExC_parse++;
	11637	vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	11638	}
	11639	} else {
	11640	/* Maternal grandfather:
	11641	* "[:" ending in ":" but not in ":]" */
	11642	if (strict) {
	11643	vFAIL("Unmatched '[' in POSIX class");
	11644	}
	11645
	11646	/* Grandfather lone [:, [=, [. */
	11647	RExC_parse = s;
	11648	}
	11649	}
	11650	}
	11651
	11652	return namedclass;
	11653	}
	11654
	11655	STATIC bool
	11656	S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
	11657	{
	11658	/* This applies some heuristics at the current parse position (which should
	11659	* be at a '[') to see if what follows might be intended to be a [:posix:]
	11660	* class. It returns true if it really is a posix class, of course, but it
	11661	* also can return true if it thinks that what was intended was a posix
	11662	* class that didn't quite make it.
	11663	*
	11664	* It will return true for
	11665	* [:alphanumerics:
	11666	* [:alphanumerics] (as long as the ] isn't followed immediately by a
	11667	* ')' indicating the end of the (?[
	11668	* [:any garbage including %^&$ punctuation:]
	11669	*
	11670	* This is designed to be called only from S_handle_regex_sets; it could be
	11671	* easily adapted to be called from the spot at the beginning of regclass()
	11672	* that checks to see in a normal bracketed class if the surrounding []
	11673	* have been omitted ([:word:] instead of [[:word:]]). But doing so would
	11674	* change long-standing behavior, so I (khw) didn't do that */
	11675	char* p = RExC_parse + 1;
	11676	char first_char = *p;
	11677
	11678	PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
	11679
	11680	assert(*(p - 1) == '[');
	11681
	11682	if (! POSIXCC(first_char)) {
	11683	return FALSE;
	11684	}
	11685
	11686	p++;
	11687	while (p < RExC_end && isWORDCHAR(*p)) p++;
	11688
	11689	if (p >= RExC_end) {
	11690	return FALSE;
	11691	}
	11692
	11693	if (p - RExC_parse > 2 /* Got at least 1 word character */
	11694	&& (*p == first_char
	11695	\|\| (p == ']' && p + 1 < RExC_end && (p + 1) != ')')))
	11696	{
	11697	return TRUE;
	11698	}
	11699
	11700	p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
	11701
	11702	return (p
	11703	&& p - RExC_parse > 2 /* [:] evaluates to colon;
	11704	[::] is a bad posix class. */
	11705	&& first_char == *(p - 1));
	11706	}
	11707
	11708	STATIC regnode *
	11709	S_handle_regex_sets(pTHX_ RExC_state_t pRExC_state, SV* return_invlist, I32 *flagp, U32 depth,
	11710	char * const oregcomp_parse)
	11711	{
	11712	/* Handle the (?[...]) construct to do set operations */
	11713
	11714	U8 curchar;
	11715	UV start, end; /* End points of code point ranges */
	11716	SV* result_string;
	11717	char save_end, save_parse;
	11718	SV* final;
	11719	STRLEN len;
	11720	regnode* node;
	11721	AV* stack;
	11722	const bool save_fold = FOLD;
	11723
	11724	GET_RE_DEBUG_FLAGS_DECL;
	11725
	11726	PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
	11727
	11728	if (LOC) {
	11729	vFAIL("(?[...]) not valid in locale");
	11730	}
	11731	RExC_uni_semantics = 1;
	11732
	11733	/* This will return only an ANYOF regnode, or (unlikely) something smaller
	11734	* (such as EXACT). Thus we can skip most everything if just sizing. We
	11735	* call regclass to handle '[]' so as to not have to reinvent its parsing
	11736	* rules here (throwing away the size it computes each time). And, we exit
	11737	* upon an unescaped ']' that isn't one ending a regclass. To do both
	11738	* these things, we need to realize that something preceded by a backslash
	11739	* is escaped, so we have to keep track of backslashes */
	11740	if (SIZE_ONLY) {
	11741	UV depth = 0; /* how many nested (?[...]) constructs */
	11742
	11743	Perl_ck_warner_d(aTHX_
	11744	packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
	11745	"The regex_sets feature is experimental" REPORT_LOCATION,
	11746	(int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
	11747
	11748	while (RExC_parse < RExC_end) {
	11749	SV* current = NULL;
	11750	RExC_parse = regpatws(pRExC_state, RExC_parse,
	11751	TRUE); /* means recognize comments */
	11752	switch (*RExC_parse) {
	11753	case '?':
	11754	if (RExC_parse[1] == '[') depth++, RExC_parse++;
	11755	/* FALL THROUGH */
	11756	default:
	11757	break;
	11758	case '\\':
	11759	/* Skip the next byte (which could cause us to end up in
	11760	* the middle of a UTF-8 character, but since none of those
	11761	* are confusable with anything we currently handle in this
	11762	* switch (invariants all), it's safe. We'll just hit the
	11763	* default: case next time and keep on incrementing until
	11764	* we find one of the invariants we do handle. */
	11765	RExC_parse++;
	11766	break;
	11767	case '[':
	11768	{
	11769	/* If this looks like it is a [:posix:] class, leave the
	11770	* parse pointer at the '[' to fool regclass() into
	11771	* thinking it is part of a '[[:posix:]]'. That function
	11772	* will use strict checking to force a syntax error if it
	11773	* doesn't work out to a legitimate class */
	11774	bool is_posix_class
	11775	= could_it_be_a_POSIX_class(pRExC_state);
	11776	if (! is_posix_class) {
	11777	RExC_parse++;
	11778	}
	11779
	11780	/* regclass() can only return RESTART_UTF8 if multi-char
	11781	folds are allowed. */
	11782	if (!regclass(pRExC_state, flagp,depth+1,
	11783	is_posix_class, /* parse the whole char
	11784	class only if not a
	11785	posix class */
	11786	FALSE, /* don't allow multi-char folds */
	11787	TRUE, /* silence non-portable warnings. */
	11788	&current))
	11789	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	11790	(UV) *flagp);
	11791
	11792	/* function call leaves parse pointing to the ']', except
	11793	* if we faked it */
	11794	if (is_posix_class) {
	11795	RExC_parse--;
	11796	}
	11797
	11798	SvREFCNT_dec(current); /* In case it returned something */
	11799	break;
	11800	}
	11801
	11802	case ']':
	11803	if (depth--) break;
	11804	RExC_parse++;
	11805	if (RExC_parse < RExC_end
	11806	&& *RExC_parse == ')')
	11807	{
	11808	node = reganode(pRExC_state, ANYOF, 0);
	11809	RExC_size += ANYOF_SKIP;
	11810	nextchar(pRExC_state);
	11811	Set_Node_Length(node,
	11812	RExC_parse - oregcomp_parse + 1); /* MJD */
	11813	return node;
	11814	}
	11815	goto no_close;
	11816	}
	11817	RExC_parse++;
	11818	}
	11819
	11820	no_close:
	11821	FAIL("Syntax error in (?[...])");
	11822	}
	11823
	11824	/* Pass 2 only after this. Everything in this construct is a
	11825	* metacharacter. Operands begin with either a '\' (for an escape
	11826	* sequence), or a '[' for a bracketed character class. Any other
	11827	* character should be an operator, or parenthesis for grouping. Both
	11828	* types of operands are handled by calling regclass() to parse them. It
	11829	* is called with a parameter to indicate to return the computed inversion
	11830	* list. The parsing here is implemented via a stack. Each entry on the
	11831	* stack is a single character representing one of the operators, or the
	11832	* '('; or else a pointer to an operand inversion list. */
	11833
	11834	#define IS_OPERAND(a) (! SvIOK(a))
	11835
	11836	/* The stack starts empty. It is a syntax error if the first thing parsed
	11837	* is a binary operator; everything else is pushed on the stack. When an
	11838	* operand is parsed, the top of the stack is examined. If it is a binary
	11839	* operator, the item before it should be an operand, and both are replaced
	11840	* by the result of doing that operation on the new operand and the one on
	11841	* the stack. Thus a sequence of binary operands is reduced to a single
	11842	* one before the next one is parsed.
	11843	*
	11844	* A unary operator may immediately follow a binary in the input, for
	11845	* example
	11846	* [a] + ! [b]
	11847	* When an operand is parsed and the top of the stack is a unary operator,
	11848	* the operation is performed, and then the stack is rechecked to see if
	11849	* this new operand is part of a binary operation; if so, it is handled as
	11850	* above.
	11851	*
	11852	* A '(' is simply pushed on the stack; it is valid only if the stack is
	11853	* empty, or the top element of the stack is an operator or another '('
	11854	* (for which the parenthesized expression will become an operand). By the
	11855	* time the corresponding ')' is parsed everything in between should have
	11856	* been parsed and evaluated to a single operand (or else is a syntax
	11857	* error), and is handled as a regular operand */
	11858
	11859	sv_2mortal((SV *)(stack = newAV()));
	11860
	11861	while (RExC_parse < RExC_end) {
	11862	I32 top_index = av_tindex(stack);
	11863	SV** top_ptr;
	11864	SV* current = NULL;
	11865
	11866	/* Skip white space */
	11867	RExC_parse = regpatws(pRExC_state, RExC_parse,
	11868	TRUE); /* means recognize comments */
	11869	if (RExC_parse >= RExC_end) {
	11870	Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
	11871	}
	11872	if ((curchar = UCHARAT(RExC_parse)) == ']') {
	11873	break;
	11874	}
	11875
	11876	switch (curchar) {
	11877
	11878	case '?':
	11879	if (av_tindex(stack) >= 0 /* This makes sure that we can
	11880	safely subtract 1 from
	11881	RExC_parse in the next clause.
	11882	If we have something on the
	11883	stack, we have parsed something
	11884	*/
	11885	&& UCHARAT(RExC_parse - 1) == '('
	11886	&& RExC_parse < RExC_end)
	11887	{
	11888	/* If is a '(?', could be an embedded '(?flags:(?[...])'.
	11889	* This happens when we have some thing like
	11890	*
	11891	* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
	11892	* ...
	11893	* qr/(?[ \p{Digit} & $thai_or_lao ])/;
	11894	*
	11895	* Here we would be handling the interpolated
	11896	* '$thai_or_lao'. We handle this by a recursive call to
	11897	* ourselves which returns the inversion list the
	11898	* interpolated expression evaluates to. We use the flags
	11899	* from the interpolated pattern. */
	11900	U32 save_flags = RExC_flags;
	11901	const char * const save_parse = ++RExC_parse;
	11902
	11903	parse_lparen_question_flags(pRExC_state);
	11904
	11905	if (RExC_parse == save_parse /* Makes sure there was at
	11906	least one flag (or this
	11907	embedding wasn't compiled)
	11908	*/
	11909	\|\| RExC_parse >= RExC_end - 4
	11910	\|\| UCHARAT(RExC_parse) != ':'
	11911	\|\| UCHARAT(++RExC_parse) != '('
	11912	\|\| UCHARAT(++RExC_parse) != '?'
	11913	\|\| UCHARAT(++RExC_parse) != '[')
	11914	{
	11915
	11916	/* In combination with the above, this moves the
	11917	* pointer to the point just after the first erroneous
	11918	* character (or if there are no flags, to where they
	11919	* should have been) */
	11920	if (RExC_parse >= RExC_end - 4) {
	11921	RExC_parse = RExC_end;
	11922	}
	11923	else if (RExC_parse != save_parse) {
	11924	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	11925	}
	11926	vFAIL("Expecting '(?flags:(?[...'");
	11927	}
	11928	RExC_parse++;
	11929	(void) handle_regex_sets(pRExC_state, &current, flagp,
	11930	depth+1, oregcomp_parse);
	11931
	11932	/* Here, 'current' contains the embedded expression's
	11933	* inversion list, and RExC_parse points to the trailing
	11934	* ']'; the next character should be the ')' which will be
	11935	* paired with the '(' that has been put on the stack, so
	11936	* the whole embedded expression reduces to '(operand)' */
	11937	RExC_parse++;
	11938
	11939	RExC_flags = save_flags;
	11940	goto handle_operand;
	11941	}
	11942	/* FALL THROUGH */
	11943
	11944	default:
	11945	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	11946	vFAIL("Unexpected character");
	11947
	11948	case '\\':
	11949	/* regclass() can only return RESTART_UTF8 if multi-char
	11950	folds are allowed. */
	11951	if (!regclass(pRExC_state, flagp,depth+1,
	11952	TRUE, /* means parse just the next thing */
	11953	FALSE, /* don't allow multi-char folds */
	11954	FALSE, /* don't silence non-portable warnings. */
	11955	&current))
	11956	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	11957	(UV) *flagp);
	11958	/* regclass() will return with parsing just the \ sequence,
	11959	* leaving the parse pointer at the next thing to parse */
	11960	RExC_parse--;
	11961	goto handle_operand;
	11962
	11963	case '[': /* Is a bracketed character class */
	11964	{
	11965	bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
	11966
	11967	if (! is_posix_class) {
	11968	RExC_parse++;
	11969	}
	11970
	11971	/* regclass() can only return RESTART_UTF8 if multi-char
	11972	folds are allowed. */
	11973	if(!regclass(pRExC_state, flagp,depth+1,
	11974	is_posix_class, /* parse the whole char class
	11975	only if not a posix class */
	11976	FALSE, /* don't allow multi-char folds */
	11977	FALSE, /* don't silence non-portable warnings. */
	11978	&current))
	11979	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
	11980	(UV) *flagp);
	11981	/* function call leaves parse pointing to the ']', except if we
	11982	* faked it */
	11983	if (is_posix_class) {
	11984	RExC_parse--;
	11985	}
	11986
	11987	goto handle_operand;
	11988	}
	11989
	11990	case '&':
	11991	case '\|':
	11992	case '+':
	11993	case '-':
	11994	case '^':
	11995	if (top_index < 0
	11996	\|\| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
	11997	\|\| ! IS_OPERAND(*top_ptr))
	11998	{
	11999	RExC_parse++;
	12000	vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
	12001	}
	12002	av_push(stack, newSVuv(curchar));
	12003	break;
	12004
	12005	case '!':
	12006	av_push(stack, newSVuv(curchar));
	12007	break;
	12008
	12009	case '(':
	12010	if (top_index >= 0) {
	12011	top_ptr = av_fetch(stack, top_index, FALSE);
	12012	assert(top_ptr);
	12013	if (IS_OPERAND(*top_ptr)) {
	12014	RExC_parse++;
	12015	vFAIL("Unexpected '(' with no preceding operator");
	12016	}
	12017	}
	12018	av_push(stack, newSVuv(curchar));
	12019	break;
	12020
	12021	case ')':
	12022	{
	12023	SV* lparen;
	12024	if (top_index < 1
	12025	\|\| ! (current = av_pop(stack))
	12026	\|\| ! IS_OPERAND(current)
	12027	\|\| ! (lparen = av_pop(stack))
	12028	\|\| IS_OPERAND(lparen)
	12029	\|\| SvUV(lparen) != '(')
	12030	{
	12031	SvREFCNT_dec(current);
	12032	RExC_parse++;
	12033	vFAIL("Unexpected ')'");
	12034	}
	12035	top_index -= 2;
	12036	SvREFCNT_dec_NN(lparen);
	12037
	12038	/* FALL THROUGH */
	12039	}
	12040
	12041	handle_operand:
	12042
	12043	/* Here, we have an operand to process, in 'current' */
	12044
	12045	if (top_index < 0) { /* Just push if stack is empty */
	12046	av_push(stack, current);
	12047	}
	12048	else {
	12049	SV* top = av_pop(stack);
	12050	SV *prev = NULL;
	12051	char current_operator;
	12052
	12053	if (IS_OPERAND(top)) {
	12054	SvREFCNT_dec_NN(top);
	12055	SvREFCNT_dec_NN(current);
	12056	vFAIL("Operand with no preceding operator");
	12057	}
	12058	current_operator = (char) SvUV(top);
	12059	switch (current_operator) {
	12060	case '(': /* Push the '(' back on followed by the new
	12061	operand */
	12062	av_push(stack, top);
	12063	av_push(stack, current);
	12064	SvREFCNT_inc(top); /* Counters the '_dec' done
	12065	just after the 'break', so
	12066	it doesn't get wrongly freed
	12067	*/
	12068	break;
	12069
	12070	case '!':
	12071	_invlist_invert(current);
	12072
	12073	/* Unlike binary operators, the top of the stack,
	12074	* now that this unary one has been popped off, may
	12075	* legally be an operator, and we now have operand
	12076	* for it. */
	12077	top_index--;
	12078	SvREFCNT_dec_NN(top);
	12079	goto handle_operand;
	12080
	12081	case '&':
	12082	prev = av_pop(stack);
	12083	_invlist_intersection(prev,
	12084	current,
	12085	&current);
	12086	av_push(stack, current);
	12087	break;
	12088
	12089	case '\|':
	12090	case '+':
	12091	prev = av_pop(stack);
	12092	_invlist_union(prev, current, &current);
	12093	av_push(stack, current);
	12094	break;
	12095
	12096	case '-':
	12097	prev = av_pop(stack);;
	12098	_invlist_subtract(prev, current, &current);
	12099	av_push(stack, current);
	12100	break;
	12101
	12102	case '^': /* The union minus the intersection */
	12103	{
	12104	SV* i = NULL;
	12105	SV* u = NULL;
	12106	SV* element;
	12107
	12108	prev = av_pop(stack);
	12109	_invlist_union(prev, current, &u);
	12110	_invlist_intersection(prev, current, &i);
	12111	/* _invlist_subtract will overwrite current
	12112	without freeing what it already contains */
	12113	element = current;
	12114	_invlist_subtract(u, i, &current);
	12115	av_push(stack, current);
	12116	SvREFCNT_dec_NN(i);
	12117	SvREFCNT_dec_NN(u);
	12118	SvREFCNT_dec_NN(element);
	12119	break;
	12120	}
	12121
	12122	default:
	12123	Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
	12124	}
	12125	SvREFCNT_dec_NN(top);
	12126	SvREFCNT_dec(prev);
	12127	}
	12128	}
	12129
	12130	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	12131	}
	12132
	12133	if (av_tindex(stack) < 0 /* Was empty */
	12134	\|\| ((final = av_pop(stack)) == NULL)
	12135	\|\| ! IS_OPERAND(final)
	12136	\|\| av_tindex(stack) >= 0) /* More left on stack */
	12137	{
	12138	vFAIL("Incomplete expression within '(?[ ])'");
	12139	}
	12140
	12141	/* Here, 'final' is the resultant inversion list from evaluating the
	12142	* expression. Return it if so requested */
	12143	if (return_invlist) {
	12144	*return_invlist = final;
	12145	return END;
	12146	}
	12147
	12148	/* Otherwise generate a resultant node, based on 'final'. regclass() is
	12149	* expecting a string of ranges and individual code points */
	12150	invlist_iterinit(final);
	12151	result_string = newSVpvs("");
	12152	while (invlist_iternext(final, &start, &end)) {
	12153	if (start == end) {
	12154	Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
	12155	}
	12156	else {
	12157	Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
	12158	start, end);
	12159	}
	12160	}
	12161
	12162	save_parse = RExC_parse;
	12163	RExC_parse = SvPV(result_string, len);
	12164	save_end = RExC_end;
	12165	RExC_end = RExC_parse + len;
	12166
	12167	/* We turn off folding around the call, as the class we have constructed
	12168	* already has all folding taken into consideration, and we don't want
	12169	* regclass() to add to that */
	12170	RExC_flags &= ~RXf_PMf_FOLD;
	12171	/* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
	12172	*/
	12173	node = regclass(pRExC_state, flagp,depth+1,
	12174	FALSE, /* means parse the whole char class */
	12175	FALSE, /* don't allow multi-char folds */
	12176	TRUE, /* silence non-portable warnings. The above may very
	12177	well have generated non-portable code points, but
	12178	they're valid on this machine */
	12179	NULL);
	12180	if (!node)
	12181	FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
	12182	PTR2UV(flagp));
	12183	if (save_fold) {
	12184	RExC_flags \|= RXf_PMf_FOLD;
	12185	}
	12186	RExC_parse = save_parse + 1;
	12187	RExC_end = save_end;
	12188	SvREFCNT_dec_NN(final);
	12189	SvREFCNT_dec_NN(result_string);
	12190
	12191	nextchar(pRExC_state);
	12192	Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
	12193	return node;
	12194	}
	12195	#undef IS_OPERAND
	12196
	12197	/* The names of properties whose definitions are not known at compile time are
	12198	* stored in this SV, after a constant heading. So if the length has been
	12199	* changed since initialization, then there is a run-time definition. */
	12200	#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
	12201
	12202	STATIC regnode *
	12203	S_regclass(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth,
	12204	const bool stop_at_1, /* Just parse the next thing, don't
	12205	look for a full character class */
	12206	bool allow_multi_folds,
	12207	const bool silence_non_portable, /* Don't output warnings
	12208	about too large
	12209	characters */
	12210	SV** ret_invlist) /* Return an inversion list, not a node */
	12211	{
	12212	/* parse a bracketed class specification. Most of these will produce an
	12213	* ANYOF node; but something like [a] will produce an EXACT node; [aA], an
	12214	* EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
	12215	* under /i with multi-character folds: it will be rewritten following the
	12216	* paradigm of this example, where the <multi-fold>s are characters which
	12217	* fold to multiple character sequences:
	12218	* /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
	12219	* gets effectively rewritten as:
	12220	* /(?:\x{multi-fold1}\|\x{multi-fold2}\|[abcdefghi]/i
	12221	* reg() gets called (recursively) on the rewritten version, and this
	12222	* function will return what it constructs. (Actually the <multi-fold>s
	12223	* aren't physically removed from the [abcdefghi], it's just that they are
	12224	* ignored in the recursion by means of a flag:
	12225	* <RExC_in_multi_char_class>.)
	12226	*
	12227	* ANYOF nodes contain a bit map for the first 256 characters, with the
	12228	* corresponding bit set if that character is in the list. For characters
	12229	* above 255, a range list or swash is used. There are extra bits for \w,
	12230	* etc. in locale ANYOFs, as what these match is not determinable at
	12231	* compile time
	12232	*
	12233	* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
	12234	* to be restarted. This can only happen if ret_invlist is non-NULL.
	12235	*/
	12236
	12237	dVAR;
	12238	UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
	12239	IV range = 0;
	12240	UV value = OOB_UNICODE, save_value = OOB_UNICODE;
	12241	regnode *ret;
	12242	STRLEN numlen;
	12243	IV namedclass = OOB_NAMEDCLASS;
	12244	char *rangebegin = NULL;
	12245	bool need_class = 0;
	12246	SV *listsv = NULL;
	12247	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	12248	than just initialized. */
	12249	SV* properties = NULL; /* Code points that match \p{} \P{} */
	12250	SV* posixes = NULL; /* Code points that match classes like, [:word:],
	12251	extended beyond the Latin1 range */
	12252	UV element_count = 0; /* Number of distinct elements in the class.
	12253	Optimizations may be possible if this is tiny */
	12254	AV * multi_char_matches = NULL; /* Code points that fold to more than one
	12255	character; used under /i */
	12256	UV n;
	12257	char * stop_ptr = RExC_end; /* where to stop parsing */
	12258	const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
	12259	space? */
	12260	const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
	12261
	12262	/* Unicode properties are stored in a swash; this holds the current one
	12263	* being parsed. If this swash is the only above-latin1 component of the
	12264	* character class, an optimization is to pass it directly on to the
	12265	* execution engine. Otherwise, it is set to NULL to indicate that there
	12266	* are other things in the class that have to be dealt with at execution
	12267	* time */
	12268	SV* swash = NULL; /* Code points that match \p{} \P{} */
	12269
	12270	/* Set if a component of this character class is user-defined; just passed
	12271	* on to the engine */
	12272	bool has_user_defined_property = FALSE;
	12273
	12274	/* inversion list of code points this node matches only when the target
	12275	* string is in UTF-8. (Because is under /d) */
	12276	SV* depends_list = NULL;
	12277
	12278	/* inversion list of code points this node matches. For much of the
	12279	* function, it includes only those that match regardless of the utf8ness
	12280	* of the target string */
	12281	SV* cp_list = NULL;
	12282
	12283	#ifdef EBCDIC
	12284	/* In a range, counts how many 0-2 of the ends of it came from literals,
	12285	* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
	12286	UV literal_endpoint = 0;
	12287	#endif
	12288	bool invert = FALSE; /* Is this class to be complemented */
	12289
	12290	/* Is there any thing like \W or [:^digit:] that matches above the legal
	12291	* Unicode range? */
	12292	bool runtime_posix_matches_above_Unicode = FALSE;
	12293
	12294	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	12295	case we need to change the emitted regop to an EXACT. */
	12296	const char * orig_parse = RExC_parse;
	12297	const SSize_t orig_size = RExC_size;
	12298	GET_RE_DEBUG_FLAGS_DECL;
	12299
	12300	PERL_ARGS_ASSERT_REGCLASS;
	12301	#ifndef DEBUGGING
	12302	PERL_UNUSED_ARG(depth);
	12303	#endif
	12304
	12305	DEBUG_PARSE("clas");
	12306
	12307	/* Assume we are going to generate an ANYOF node. */
	12308	ret = reganode(pRExC_state, ANYOF, 0);
	12309
	12310	if (SIZE_ONLY) {
	12311	RExC_size += ANYOF_SKIP;
	12312	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	12313	}
	12314	else {
	12315	ANYOF_FLAGS(ret) = 0;
	12316
	12317	RExC_emit += ANYOF_SKIP;
	12318	if (LOC) {
	12319	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	12320	}
	12321	listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
	12322	initial_listsv_len = SvCUR(listsv);
	12323	SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
	12324	}
	12325
	12326	if (skip_white) {
	12327	RExC_parse = regpatws(pRExC_state, RExC_parse,
	12328	FALSE /* means don't recognize comments */);
	12329	}
	12330
	12331	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	12332	RExC_parse++;
	12333	invert = TRUE;
	12334	allow_multi_folds = FALSE;
	12335	RExC_naughty++;
	12336	if (skip_white) {
	12337	RExC_parse = regpatws(pRExC_state, RExC_parse,
	12338	FALSE /* means don't recognize comments */);
	12339	}
	12340	}
	12341
	12342	/* Check that they didn't say [:posix:] instead of [[:posix:]] */
	12343	if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
	12344	const char *s = RExC_parse;
	12345	const char c = *s++;
	12346
	12347	while (isWORDCHAR(*s))
	12348	s++;
	12349	if (s && c == s && s[1] == ']') {
	12350	SAVEFREESV(RExC_rx_sv);
	12351	ckWARN3reg(s+2,
	12352	"POSIX syntax [%c %c] belongs inside character classes",
	12353	c, c);
	12354	(void)ReREFCNT_inc(RExC_rx_sv);
	12355	}
	12356	}
	12357
	12358	/* If the caller wants us to just parse a single element, accomplish this
	12359	* by faking the loop ending condition */
	12360	if (stop_at_1 && RExC_end > RExC_parse) {
	12361	stop_ptr = RExC_parse + 1;
	12362	}
	12363
	12364	/* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
	12365	if (UCHARAT(RExC_parse) == ']')
	12366	goto charclassloop;
	12367
	12368	parseit:
	12369	while (1) {
	12370	if (RExC_parse >= stop_ptr) {
	12371	break;
	12372	}
	12373
	12374	if (skip_white) {
	12375	RExC_parse = regpatws(pRExC_state, RExC_parse,
	12376	FALSE /* means don't recognize comments */);
	12377	}
	12378
	12379	if (UCHARAT(RExC_parse) == ']') {
	12380	break;
	12381	}
	12382
	12383	charclassloop:
	12384
	12385	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	12386	save_value = value;
	12387	save_prevvalue = prevvalue;
	12388
	12389	if (!range) {
	12390	rangebegin = RExC_parse;
	12391	element_count++;
	12392	}
	12393	if (UTF) {
	12394	value = utf8n_to_uvchr((U8*)RExC_parse,
	12395	RExC_end - RExC_parse,
	12396	&numlen, UTF8_ALLOW_DEFAULT);
	12397	RExC_parse += numlen;
	12398	}
	12399	else
	12400	value = UCHARAT(RExC_parse++);
	12401
	12402	if (value == '['
	12403	&& RExC_parse < RExC_end
	12404	&& POSIXCC(UCHARAT(RExC_parse)))
	12405	{
	12406	namedclass = regpposixcc(pRExC_state, value, strict);
	12407	}
	12408	else if (value == '\\') {
	12409	if (UTF) {
	12410	value = utf8n_to_uvchr((U8*)RExC_parse,
	12411	RExC_end - RExC_parse,
	12412	&numlen, UTF8_ALLOW_DEFAULT);
	12413	RExC_parse += numlen;
	12414	}
	12415	else
	12416	value = UCHARAT(RExC_parse++);
	12417
	12418	/* Some compilers cannot handle switching on 64-bit integer
	12419	* values, therefore value cannot be an UV. Yes, this will
	12420	* be a problem later if we want switch on Unicode.
	12421	* A similar issue a little bit later when switching on
	12422	* namedclass. --jhi */
	12423
	12424	/* If the \ is escaping white space when white space is being
	12425	* skipped, it means that that white space is wanted literally, and
	12426	* is already in 'value'. Otherwise, need to translate the escape
	12427	* into what it signifies. */
	12428	if (! skip_white \|\| ! is_PATWS_cp(value)) switch ((I32)value) {
	12429
	12430	case 'w': namedclass = ANYOF_WORDCHAR; break;
	12431	case 'W': namedclass = ANYOF_NWORDCHAR; break;
	12432	case 's': namedclass = ANYOF_SPACE; break;
	12433	case 'S': namedclass = ANYOF_NSPACE; break;
	12434	case 'd': namedclass = ANYOF_DIGIT; break;
	12435	case 'D': namedclass = ANYOF_NDIGIT; break;
	12436	case 'v': namedclass = ANYOF_VERTWS; break;
	12437	case 'V': namedclass = ANYOF_NVERTWS; break;
	12438	case 'h': namedclass = ANYOF_HORIZWS; break;
	12439	case 'H': namedclass = ANYOF_NHORIZWS; break;
	12440	case 'N': /* Handle \N{NAME} in class */
	12441	{
	12442	/* We only pay attention to the first char of
	12443	multichar strings being returned. I kinda wonder
	12444	if this makes sense as it does change the behaviour
	12445	from earlier versions, OTOH that behaviour was broken
	12446	as well. */
	12447	if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
	12448	TRUE, /* => charclass */
	12449	strict))
	12450	{
	12451	if (*flagp & RESTART_UTF8)
	12452	FAIL("panic: grok_bslash_N set RESTART_UTF8");
	12453	goto parseit;
	12454	}
	12455	}
	12456	break;
	12457	case 'p':
	12458	case 'P':
	12459	{
	12460	char *e;
	12461
	12462	/* We will handle any undefined properties ourselves */
	12463	U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
	12464
	12465	if (RExC_parse >= RExC_end)
	12466	vFAIL2("Empty \\%c{}", (U8)value);
	12467	if (*RExC_parse == '{') {
	12468	const U8 c = (U8)value;
	12469	e = strchr(RExC_parse++, '}');
	12470	if (!e)
	12471	vFAIL2("Missing right brace on \\%c{}", c);
	12472	while (isSPACE(UCHARAT(RExC_parse)))
	12473	RExC_parse++;
	12474	if (e == RExC_parse)
	12475	vFAIL2("Empty \\%c{}", c);
	12476	n = e - RExC_parse;
	12477	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	12478	n--;
	12479	}
	12480	else {
	12481	e = RExC_parse;
	12482	n = 1;
	12483	}
	12484	if (!SIZE_ONLY) {
	12485	SV* invlist;
	12486	char* name;
	12487
	12488	if (UCHARAT(RExC_parse) == '^') {
	12489	RExC_parse++;
	12490	n--;
	12491	/* toggle. (The rhs xor gets the single bit that
	12492	* differs between P and p; the other xor inverts just
	12493	* that bit) */
	12494	value ^= 'P' ^ 'p';
	12495
	12496	while (isSPACE(UCHARAT(RExC_parse))) {
	12497	RExC_parse++;
	12498	n--;
	12499	}
	12500	}
	12501	/* Try to get the definition of the property into
	12502	* <invlist>. If /i is in effect, the effective property
	12503	* will have its name be <__NAME_i>. The design is
	12504	* discussed in commit
	12505	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	12506	Newx(name, n + sizeof("_i__\n"), char);
	12507
	12508	sprintf(name, "%s%.*s%s\n",
	12509	(FOLD) ? "__" : "",
	12510	(int)n,
	12511	RExC_parse,
	12512	(FOLD) ? "_i" : ""
	12513	);
	12514
	12515	/* Look up the property name, and get its swash and
	12516	* inversion list, if the property is found */
	12517	if (swash) {
	12518	SvREFCNT_dec_NN(swash);
	12519	}
	12520	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	12521	1, /* binary */
	12522	0, /* not tr/// */
	12523	NULL, /* No inversion list */
	12524	&swash_init_flags
	12525	);
	12526	if (! swash \|\| ! (invlist = _get_swash_invlist(swash))) {
	12527	if (swash) {
	12528	SvREFCNT_dec_NN(swash);
	12529	swash = NULL;
	12530	}
	12531
	12532	/* Here didn't find it. It could be a user-defined
	12533	* property that will be available at run-time. If we
	12534	* accept only compile-time properties, is an error;
	12535	* otherwise add it to the list for run-time look up */
	12536	if (ret_invlist) {
	12537	RExC_parse = e + 1;
	12538	vFAIL3("Property '%.*s' is unknown", (int) n, name);
	12539	}
	12540	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
	12541	(value == 'p' ? '+' : '!'),
	12542	name);
	12543	has_user_defined_property = TRUE;
	12544
	12545	/* We don't know yet, so have to assume that the
	12546	* property could match something in the Latin1 range,
	12547	* hence something that isn't utf8. Note that this
	12548	* would cause things in <depends_list> to match
	12549	* inappropriately, except that any \p{}, including
	12550	* this one forces Unicode semantics, which means there
	12551	* is <no depends_list> */
	12552	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	12553	}
	12554	else {
	12555
	12556	/* Here, did get the swash and its inversion list. If
	12557	* the swash is from a user-defined property, then this
	12558	* whole character class should be regarded as such */
	12559	has_user_defined_property =
	12560	(swash_init_flags
	12561	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
	12562
	12563	/* Invert if asking for the complement */
	12564	if (value == 'P') {
	12565	_invlist_union_complement_2nd(properties,
	12566	invlist,
	12567	&properties);
	12568
	12569	/* The swash can't be used as-is, because we've
	12570	* inverted things; delay removing it to here after
	12571	* have copied its invlist above */
	12572	SvREFCNT_dec_NN(swash);
	12573	swash = NULL;
	12574	}
	12575	else {
	12576	_invlist_union(properties, invlist, &properties);
	12577	}
	12578	}
	12579	Safefree(name);
	12580	}
	12581	RExC_parse = e + 1;
	12582	namedclass = ANYOF_UNIPROP; /* no official name, but it's
	12583	named */
	12584
	12585	/* \p means they want Unicode semantics */
	12586	RExC_uni_semantics = 1;
	12587	}
	12588	break;
	12589	case 'n': value = '\n'; break;
	12590	case 'r': value = '\r'; break;
	12591	case 't': value = '\t'; break;
	12592	case 'f': value = '\f'; break;
	12593	case 'b': value = '\b'; break;
	12594	case 'e': value = ASCII_TO_NATIVE('\033');break;
	12595	case 'a': value = '\a'; break;
	12596	case 'o':
	12597	RExC_parse--; /* function expects to be pointed at the 'o' */
	12598	{
	12599	const char* error_msg;
	12600	bool valid = grok_bslash_o(&RExC_parse,
	12601	&value,
	12602	&error_msg,
	12603	SIZE_ONLY, /* warnings in pass
	12604	1 only */
	12605	strict,
	12606	silence_non_portable,
	12607	UTF);
	12608	if (! valid) {
	12609	vFAIL(error_msg);
	12610	}
	12611	}
	12612	if (PL_encoding && value < 0x100) {
	12613	goto recode_encoding;
	12614	}
	12615	break;
	12616	case 'x':
	12617	RExC_parse--; /* function expects to be pointed at the 'x' */
	12618	{
	12619	const char* error_msg;
	12620	bool valid = grok_bslash_x(&RExC_parse,
	12621	&value,
	12622	&error_msg,
	12623	TRUE, /* Output warnings */
	12624	strict,
	12625	silence_non_portable,
	12626	UTF);
	12627	if (! valid) {
	12628	vFAIL(error_msg);
	12629	}
	12630	}
	12631	if (PL_encoding && value < 0x100)
	12632	goto recode_encoding;
	12633	break;
	12634	case 'c':
	12635	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	12636	break;
	12637	case '0': case '1': case '2': case '3': case '4':
	12638	case '5': case '6': case '7':
	12639	{
	12640	/* Take 1-3 octal digits */
	12641	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	12642	numlen = (strict) ? 4 : 3;
	12643	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	12644	RExC_parse += numlen;
	12645	if (numlen != 3) {
	12646	if (strict) {
	12647	RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
	12648	vFAIL("Need exactly 3 octal digits");
	12649	}
	12650	else if (! SIZE_ONLY /* like \08, \178 */
	12651	&& numlen < 3
	12652	&& RExC_parse < RExC_end
	12653	&& isDIGIT(*RExC_parse)
	12654	&& ckWARN(WARN_REGEXP))
	12655	{
	12656	SAVEFREESV(RExC_rx_sv);
	12657	reg_warn_non_literal_string(
	12658	RExC_parse + 1,
	12659	form_short_octal_warning(RExC_parse, numlen));
	12660	(void)ReREFCNT_inc(RExC_rx_sv);
	12661	}
	12662	}
	12663	if (PL_encoding && value < 0x100)
	12664	goto recode_encoding;
	12665	break;
	12666	}
	12667	recode_encoding:
	12668	if (! RExC_override_recoding) {
	12669	SV* enc = PL_encoding;
	12670	value = reg_recode((const char)(U8)value, &enc);
	12671	if (!enc) {
	12672	if (strict) {
	12673	vFAIL("Invalid escape in the specified encoding");
	12674	}
	12675	else if (SIZE_ONLY) {
	12676	ckWARNreg(RExC_parse,
	12677	"Invalid escape in the specified encoding");
	12678	}
	12679	}
	12680	break;
	12681	}
	12682	default:
	12683	/* Allow \_ to not give an error */
	12684	if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
	12685	if (strict) {
	12686	vFAIL2("Unrecognized escape \\%c in character class",
	12687	(int)value);
	12688	}
	12689	else {
	12690	SAVEFREESV(RExC_rx_sv);
	12691	ckWARN2reg(RExC_parse,
	12692	"Unrecognized escape \\%c in character class passed through",
	12693	(int)value);
	12694	(void)ReREFCNT_inc(RExC_rx_sv);
	12695	}
	12696	}
	12697	break;
	12698	} /* End of switch on char following backslash */
	12699	} /* end of handling backslash escape sequences */
	12700	#ifdef EBCDIC
	12701	else
	12702	literal_endpoint++;
	12703	#endif
	12704
	12705	/* Here, we have the current token in 'value' */
	12706
	12707	/* What matches in a locale is not known until runtime. This includes
	12708	* what the Posix classes (like \w, [:space:]) match. Room must be
	12709	* reserved (one time per class) to store such classes, either if Perl
	12710	* is compiled so that locale nodes always should have this space, or
	12711	* if there is such class info to be stored. The space will contain a
	12712	* bit for each named class that is to be matched against. This isn't
	12713	* needed for \p{} and pseudo-classes, as they are not affected by
	12714	* locale, and hence are dealt with separately */
	12715	if (LOC
	12716	&& ! need_class
	12717	&& (ANYOF_LOCALE == ANYOF_CLASS
	12718	\|\| (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
	12719	{
	12720	need_class = 1;
	12721	if (SIZE_ONLY) {
	12722	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	12723	}
	12724	else {
	12725	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	12726	ANYOF_CLASS_ZERO(ret);
	12727	}
	12728	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	12729	}
	12730
	12731	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	12732
	12733	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	12734	* literal, as is the character that began the false range, i.e.
	12735	* the 'a' in the examples */
	12736	if (range) {
	12737	if (!SIZE_ONLY) {
	12738	const int w = (RExC_parse >= rangebegin)
	12739	? RExC_parse - rangebegin
	12740	: 0;
	12741	if (strict) {
	12742	vFAIL4("False [] range \"%.s\"", w, w, rangebegin);
	12743	}
	12744	else {
	12745	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
	12746	ckWARN4reg(RExC_parse,
	12747	"False [] range \"%.s\"",
	12748	w, w, rangebegin);
	12749	(void)ReREFCNT_inc(RExC_rx_sv);
	12750	cp_list = add_cp_to_invlist(cp_list, '-');
	12751	cp_list = add_cp_to_invlist(cp_list, prevvalue);
	12752	}
	12753	}
	12754
	12755	range = 0; /* this was not a true range */
	12756	element_count += 2; /* So counts for three values */
	12757	}
	12758
	12759	if (! SIZE_ONLY) {
	12760	U8 classnum = namedclass_to_classnum(namedclass);
	12761	if (namedclass >= ANYOF_MAX) { /* If a special class */
	12762	if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
	12763
	12764	/* Here, should be \h, \H, \v, or \V. Neither /d nor
	12765	* /l make a difference in what these match. There
	12766	* would be problems if these characters had folds
	12767	* other than themselves, as cp_list is subject to
	12768	* folding. */
	12769	if (classnum != _CC_VERTSPACE) {
	12770	assert( namedclass == ANYOF_HORIZWS
	12771	\|\| namedclass == ANYOF_NHORIZWS);
	12772
	12773	/* It turns out that \h is just a synonym for
	12774	* XPosixBlank */
	12775	classnum = _CC_BLANK;
	12776	}
	12777
	12778	_invlist_union_maybe_complement_2nd(
	12779	cp_list,
	12780	PL_XPosix_ptrs[classnum],
	12781	cBOOL(namedclass % 2), /* Complement if odd
	12782	(NHORIZWS, NVERTWS)
	12783	*/
	12784	&cp_list);
	12785	}
	12786	}
	12787	else if (classnum == _CC_ASCII) {
	12788	#ifdef HAS_ISASCII
	12789	if (LOC) {
	12790	ANYOF_CLASS_SET(ret, namedclass);
	12791	}
	12792	else
	12793	#endif /* Not isascii(); just use the hard-coded definition for it */
	12794	_invlist_union_maybe_complement_2nd(
	12795	posixes,
	12796	PL_ASCII,
	12797	cBOOL(namedclass % 2), /* Complement if odd
	12798	(NASCII) */
	12799	&posixes);
	12800	}
	12801	else { /* Garden variety class */
	12802
	12803	/* The ascii range inversion list */
	12804	SV* ascii_source = PL_Posix_ptrs[classnum];
	12805
	12806	/* The full Latin1 range inversion list */
	12807	SV* l1_source = PL_L1Posix_ptrs[classnum];
	12808
	12809	/* This code is structured into two major clauses. The
	12810	* first is for classes whose complete definitions may not
	12811	* already be known. It not, the Latin1 definition
	12812	* (guaranteed to already known) is used plus code is
	12813	* generated to load the rest at run-time (only if needed).
	12814	* If the complete definition is known, it drops down to
	12815	* the second clause, where the complete definition is
	12816	* known */
	12817
	12818	if (classnum < _FIRST_NON_SWASH_CC) {
	12819
	12820	/* Here, the class has a swash, which may or not
	12821	* already be loaded */
	12822
	12823	/* The name of the property to use to match the full
	12824	* eXtended Unicode range swash for this character
	12825	* class */
	12826	const char *Xname = swash_property_names[classnum];
	12827
	12828	/* If returning the inversion list, we can't defer
	12829	* getting this until runtime */
	12830	if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
	12831	PL_utf8_swash_ptrs[classnum] =
	12832	_core_swash_init("utf8", Xname, &PL_sv_undef,
	12833	1, /* binary */
	12834	0, /* not tr/// */
	12835	NULL, /* No inversion list */
	12836	NULL /* No flags */
	12837	);
	12838	assert(PL_utf8_swash_ptrs[classnum]);
	12839	}
	12840	if ( ! PL_utf8_swash_ptrs[classnum]) {
	12841	if (namedclass % 2 == 0) { /* A non-complemented
	12842	class */
	12843	/* If not /a matching, there are code points we
	12844	* don't know at compile time. Arrange for the
	12845	* unknown matches to be loaded at run-time, if
	12846	* needed */
	12847	if (! AT_LEAST_ASCII_RESTRICTED) {
	12848	Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
	12849	Xname);
	12850	}
	12851	if (LOC) { /* Under locale, set run-time
	12852	lookup */
	12853	ANYOF_CLASS_SET(ret, namedclass);
	12854	}
	12855	else {
	12856	/* Add the current class's code points to
	12857	* the running total */
	12858	_invlist_union(posixes,
	12859	(AT_LEAST_ASCII_RESTRICTED)
	12860	? ascii_source
	12861	: l1_source,
	12862	&posixes);
	12863	}
	12864	}
	12865	else { /* A complemented class */
	12866	if (AT_LEAST_ASCII_RESTRICTED) {
	12867	/* Under /a should match everything above
	12868	* ASCII, plus the complement of the set's
	12869	* ASCII matches */
	12870	_invlist_union_complement_2nd(posixes,
	12871	ascii_source,
	12872	&posixes);
	12873	}
	12874	else {
	12875	/* Arrange for the unknown matches to be
	12876	* loaded at run-time, if needed */
	12877	Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
	12878	Xname);
	12879	runtime_posix_matches_above_Unicode = TRUE;
	12880	if (LOC) {
	12881	ANYOF_CLASS_SET(ret, namedclass);
	12882	}
	12883	else {
	12884
	12885	/* We want to match everything in
	12886	* Latin1, except those things that
	12887	* l1_source matches */
	12888	SV* scratch_list = NULL;
	12889	_invlist_subtract(PL_Latin1, l1_source,
	12890	&scratch_list);
	12891
	12892	/* Add the list from this class to the
	12893	* running total */
	12894	if (! posixes) {
	12895	posixes = scratch_list;
	12896	}
	12897	else {
	12898	_invlist_union(posixes,
	12899	scratch_list,
	12900	&posixes);
	12901	SvREFCNT_dec_NN(scratch_list);
	12902	}
	12903	if (DEPENDS_SEMANTICS) {
	12904	ANYOF_FLAGS(ret)
	12905	\|= ANYOF_NON_UTF8_LATIN1_ALL;
	12906	}
	12907	}
	12908	}
	12909	}
	12910	goto namedclass_done;
	12911	}
	12912
	12913	/* Here, there is a swash loaded for the class. If no
	12914	* inversion list for it yet, get it */
	12915	if (! PL_XPosix_ptrs[classnum]) {
	12916	PL_XPosix_ptrs[classnum]
	12917	= _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
	12918	}
	12919	}
	12920
	12921	/* Here there is an inversion list already loaded for the
	12922	* entire class */
	12923
	12924	if (namedclass % 2 == 0) { /* A non-complemented class,
	12925	like ANYOF_PUNCT */
	12926	if (! LOC) {
	12927	/* For non-locale, just add it to any existing list
	12928	* */
	12929	_invlist_union(posixes,
	12930	(AT_LEAST_ASCII_RESTRICTED)
	12931	? ascii_source
	12932	: PL_XPosix_ptrs[classnum],
	12933	&posixes);
	12934	}
	12935	else { /* Locale */
	12936	SV* scratch_list = NULL;
	12937
	12938	/* For above Latin1 code points, we use the full
	12939	* Unicode range */
	12940	_invlist_intersection(PL_AboveLatin1,
	12941	PL_XPosix_ptrs[classnum],
	12942	&scratch_list);
	12943	/* And set the output to it, adding instead if
	12944	* there already is an output. Checking if
	12945	* 'posixes' is NULL first saves an extra clone.
	12946	* Its reference count will be decremented at the
	12947	* next union, etc, or if this is the only
	12948	* instance, at the end of the routine */
	12949	if (! posixes) {
	12950	posixes = scratch_list;
	12951	}
	12952	else {
	12953	_invlist_union(posixes, scratch_list, &posixes);
	12954	SvREFCNT_dec_NN(scratch_list);
	12955	}
	12956
	12957	#ifndef HAS_ISBLANK
	12958	if (namedclass != ANYOF_BLANK) {
	12959	#endif
	12960	/* Set this class in the node for runtime
	12961	* matching */
	12962	ANYOF_CLASS_SET(ret, namedclass);
	12963	#ifndef HAS_ISBLANK
	12964	}
	12965	else {
	12966	/* No isblank(), use the hard-coded ASCII-range
	12967	* blanks, adding them to the running total. */
	12968
	12969	_invlist_union(posixes, ascii_source, &posixes);
	12970	}
	12971	#endif
	12972	}
	12973	}
	12974	else { /* A complemented class, like ANYOF_NPUNCT */
	12975	if (! LOC) {
	12976	_invlist_union_complement_2nd(
	12977	posixes,
	12978	(AT_LEAST_ASCII_RESTRICTED)
	12979	? ascii_source
	12980	: PL_XPosix_ptrs[classnum],
	12981	&posixes);
	12982	/* Under /d, everything in the upper half of the
	12983	* Latin1 range matches this complement */
	12984	if (DEPENDS_SEMANTICS) {
	12985	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL;
	12986	}
	12987	}
	12988	else { /* Locale */
	12989	SV* scratch_list = NULL;
	12990	_invlist_subtract(PL_AboveLatin1,
	12991	PL_XPosix_ptrs[classnum],
	12992	&scratch_list);
	12993	if (! posixes) {
	12994	posixes = scratch_list;
	12995	}
	12996	else {
	12997	_invlist_union(posixes, scratch_list, &posixes);
	12998	SvREFCNT_dec_NN(scratch_list);
	12999	}
	13000	#ifndef HAS_ISBLANK
	13001	if (namedclass != ANYOF_NBLANK) {
	13002	#endif
	13003	ANYOF_CLASS_SET(ret, namedclass);
	13004	#ifndef HAS_ISBLANK
	13005	}
	13006	else {
	13007	/* Get the list of all code points in Latin1
	13008	* that are not ASCII blanks, and add them to
	13009	* the running total */
	13010	_invlist_subtract(PL_Latin1, ascii_source,
	13011	&scratch_list);
	13012	_invlist_union(posixes, scratch_list, &posixes);
	13013	SvREFCNT_dec_NN(scratch_list);
	13014	}
	13015	#endif
	13016	}
	13017	}
	13018	}
	13019	namedclass_done:
	13020	continue; /* Go get next character */
	13021	}
	13022	} /* end of namedclass \blah */
	13023
	13024	/* Here, we have a single value. If 'range' is set, it is the ending
	13025	* of a range--check its validity. Later, we will handle each
	13026	* individual code point in the range. If 'range' isn't set, this
	13027	* could be the beginning of a range, so check for that by looking
	13028	* ahead to see if the next real character to be processed is the range
	13029	* indicator--the minus sign */
	13030
	13031	if (skip_white) {
	13032	RExC_parse = regpatws(pRExC_state, RExC_parse,
	13033	FALSE /* means don't recognize comments */);
	13034	}
	13035
	13036	if (range) {
	13037	if (prevvalue > value) /* b-a */ {
	13038	const int w = RExC_parse - rangebegin;
	13039	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	13040	range = 0; /* not a valid range */
	13041	}
	13042	}
	13043	else {
	13044	prevvalue = value; /* save the beginning of the potential range */
	13045	if (! stop_at_1 /* Can't be a range if parsing just one thing */
	13046	&& *RExC_parse == '-')
	13047	{
	13048	char* next_char_ptr = RExC_parse + 1;
	13049	if (skip_white) { /* Get the next real char after the '-' */
	13050	next_char_ptr = regpatws(pRExC_state,
	13051	RExC_parse + 1,
	13052	FALSE); /* means don't recognize
	13053	comments */
	13054	}
	13055
	13056	/* If the '-' is at the end of the class (just before the ']',
	13057	* it is a literal minus; otherwise it is a range */
	13058	if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
	13059	RExC_parse = next_char_ptr;
	13060
	13061	/* a bad range like \w-, [:word:]- ? */
	13062	if (namedclass > OOB_NAMEDCLASS) {
	13063	if (strict \|\| ckWARN(WARN_REGEXP)) {
	13064	const int w =
	13065	RExC_parse >= rangebegin ?
	13066	RExC_parse - rangebegin : 0;
	13067	if (strict) {
	13068	vFAIL4("False [] range \"%.s\"",
	13069	w, w, rangebegin);
	13070	}
	13071	else {
	13072	vWARN4(RExC_parse,
	13073	"False [] range \"%.s\"",
	13074	w, w, rangebegin);
	13075	}
	13076	}
	13077	if (!SIZE_ONLY) {
	13078	cp_list = add_cp_to_invlist(cp_list, '-');
	13079	}
	13080	element_count++;
	13081	} else
	13082	range = 1; /* yeah, it's a range! */
	13083	continue; /* but do it the next time */
	13084	}
	13085	}
	13086	}
	13087
	13088	/* Here, <prevvalue> is the beginning of the range, if any; or <value>
	13089	* if not */
	13090
	13091	/* non-Latin1 code point implies unicode semantics. Must be set in
	13092	* pass1 so is there for the whole of pass 2 */
	13093	if (value > 255) {
	13094	RExC_uni_semantics = 1;
	13095	}
	13096
	13097	/* Ready to process either the single value, or the completed range.
	13098	* For single-valued non-inverted ranges, we consider the possibility
	13099	* of multi-char folds. (We made a conscious decision to not do this
	13100	* for the other cases because it can often lead to non-intuitive
	13101	* results. For example, you have the peculiar case that:
	13102	* "s s" =~ /^[^\xDF]+$/i => Y
	13103	* "ss" =~ /^[^\xDF]+$/i => N
	13104	*
	13105	* See [perl #89750] */
	13106	if (FOLD && allow_multi_folds && value == prevvalue) {
	13107	if (value == LATIN_SMALL_LETTER_SHARP_S
	13108	\|\| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
	13109	value)))
	13110	{
	13111	/* Here <value> is indeed a multi-char fold. Get what it is */
	13112
	13113	U8 foldbuf[UTF8_MAXBYTES_CASE];
	13114	STRLEN foldlen;
	13115
	13116	UV folded = _to_uni_fold_flags(
	13117	value,
	13118	foldbuf,
	13119	&foldlen,
	13120	FOLD_FLAGS_FULL
	13121	\| ((LOC) ? FOLD_FLAGS_LOCALE
	13122	: (ASCII_FOLD_RESTRICTED)
	13123	? FOLD_FLAGS_NOMIX_ASCII
	13124	: 0)
	13125	);
	13126
	13127	/* Here, <folded> should be the first character of the
	13128	* multi-char fold of <value>, with <foldbuf> containing the
	13129	* whole thing. But, if this fold is not allowed (because of
	13130	* the flags), <fold> will be the same as <value>, and should
	13131	* be processed like any other character, so skip the special
	13132	* handling */
	13133	if (folded != value) {
	13134
	13135	/* Skip if we are recursed, currently parsing the class
	13136	* again. Otherwise add this character to the list of
	13137	* multi-char folds. */
	13138	if (! RExC_in_multi_char_class) {
	13139	AV** this_array_ptr;
	13140	AV* this_array;
	13141	STRLEN cp_count = utf8_length(foldbuf,
	13142	foldbuf + foldlen);
	13143	SV* multi_fold = sv_2mortal(newSVpvn("", 0));
	13144
	13145	Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
	13146
	13147
	13148	if (! multi_char_matches) {
	13149	multi_char_matches = newAV();
	13150	}
	13151
	13152	/* <multi_char_matches> is actually an array of arrays.
	13153	* There will be one or two top-level elements: [2],
	13154	* and/or [3]. The [2] element is an array, each
	13155	* element thereof is a character which folds to TWO
	13156	* characters; [3] is for folds to THREE characters.
	13157	* (Unicode guarantees a maximum of 3 characters in any
	13158	* fold.) When we rewrite the character class below,
	13159	* we will do so such that the longest folds are
	13160	* written first, so that it prefers the longest
	13161	* matching strings first. This is done even if it
	13162	* turns out that any quantifier is non-greedy, out of
	13163	* programmer laziness. Tom Christiansen has agreed
	13164	* that this is ok. This makes the test for the
	13165	* ligature 'ffi' come before the test for 'ff' */
	13166	if (av_exists(multi_char_matches, cp_count)) {
	13167	this_array_ptr = (AV**) av_fetch(multi_char_matches,
	13168	cp_count, FALSE);
	13169	this_array = *this_array_ptr;
	13170	}
	13171	else {
	13172	this_array = newAV();
	13173	av_store(multi_char_matches, cp_count,
	13174	(SV*) this_array);
	13175	}
	13176	av_push(this_array, multi_fold);
	13177	}
	13178
	13179	/* This element should not be processed further in this
	13180	* class */
	13181	element_count--;
	13182	value = save_value;
	13183	prevvalue = save_prevvalue;
	13184	continue;
	13185	}
	13186	}
	13187	}
	13188
	13189	/* Deal with this element of the class */
	13190	if (! SIZE_ONLY) {
	13191	#ifndef EBCDIC
	13192	cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
	13193	#else
	13194	SV* this_range = _new_invlist(1);
	13195	_append_range_to_invlist(this_range, prevvalue, value);
	13196
	13197	/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
	13198	* If this range was specified using something like 'i-j', we want
	13199	* to include only the 'i' and the 'j', and not anything in
	13200	* between, so exclude non-ASCII, non-alphabetics from it.
	13201	* However, if the range was specified with something like
	13202	* [\x89-\x91] or [\x89-j], all code points within it should be
	13203	* included. literal_endpoint==2 means both ends of the range used
	13204	* a literal character, not \x{foo} */
	13205	if (literal_endpoint == 2
	13206	&& ((prevvalue >= 'a' && value <= 'z')
	13207	\|\| (prevvalue >= 'A' && value <= 'Z')))
	13208	{
	13209	_invlist_intersection(this_range, PL_ASCII,
	13210	&this_range);
	13211	_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
	13212	&this_range);
	13213	}
	13214	_invlist_union(cp_list, this_range, &cp_list);
	13215	literal_endpoint = 0;
	13216	#endif
	13217	}
	13218
	13219	range = 0; /* this range (if it was one) is done now */
	13220	} /* End of loop through all the text within the brackets */
	13221
	13222	/* If anything in the class expands to more than one character, we have to
	13223	* deal with them by building up a substitute parse string, and recursively
	13224	* calling reg() on it, instead of proceeding */
	13225	if (multi_char_matches) {
	13226	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
	13227	I32 cp_count;
	13228	STRLEN len;
	13229	char *save_end = RExC_end;
	13230	char *save_parse = RExC_parse;
	13231	bool first_time = TRUE; /* First multi-char occurrence doesn't get
	13232	a "\|" */
	13233	I32 reg_flags;
	13234
	13235	assert(! invert);
	13236	#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
	13237	because too confusing */
	13238	if (invert) {
	13239	sv_catpv(substitute_parse, "(?:");
	13240	}
	13241	#endif
	13242
	13243	/* Look at the longest folds first */
	13244	for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
	13245
	13246	if (av_exists(multi_char_matches, cp_count)) {
	13247	AV** this_array_ptr;
	13248	SV* this_sequence;
	13249
	13250	this_array_ptr = (AV**) av_fetch(multi_char_matches,
	13251	cp_count, FALSE);
	13252	while ((this_sequence = av_pop(*this_array_ptr)) !=
	13253	&PL_sv_undef)
	13254	{
	13255	if (! first_time) {
	13256	sv_catpv(substitute_parse, "\|");
	13257	}
	13258	first_time = FALSE;
	13259
	13260	sv_catpv(substitute_parse, SvPVX(this_sequence));
	13261	}
	13262	}
	13263	}
	13264
	13265	/* If the character class contains anything else besides these
	13266	* multi-character folds, have to include it in recursive parsing */
	13267	if (element_count) {
	13268	sv_catpv(substitute_parse, "\|[");
	13269	sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
	13270	sv_catpv(substitute_parse, "]");
	13271	}
	13272
	13273	sv_catpv(substitute_parse, ")");
	13274	#if 0
	13275	if (invert) {
	13276	/* This is a way to get the parse to skip forward a whole named
	13277	* sequence instead of matching the 2nd character when it fails the
	13278	* first */
	13279	sv_catpv(substitute_parse, "(THEN)(SKIP)(*FAIL)\|.)");
	13280	}
	13281	#endif
	13282
	13283	RExC_parse = SvPV(substitute_parse, len);
	13284	RExC_end = RExC_parse + len;
	13285	RExC_in_multi_char_class = 1;
	13286	RExC_emit = (regnode *)orig_emit;
	13287
	13288	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
	13289
	13290	*flagp \|= reg_flags&(HASWIDTH\|SIMPLE\|SPSTART\|POSTPONED\|RESTART_UTF8);
	13291
	13292	RExC_parse = save_parse;
	13293	RExC_end = save_end;
	13294	RExC_in_multi_char_class = 0;
	13295	SvREFCNT_dec_NN(multi_char_matches);
	13296	return ret;
	13297	}
	13298
	13299	/* If the character class contains only a single element, it may be
	13300	* optimizable into another node type which is smaller and runs faster.
	13301	* Check if this is the case for this class */
	13302	if (element_count == 1 && ! ret_invlist) {
	13303	U8 op = END;
	13304	U8 arg = 0;
	13305
	13306	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
	13307	[:digit:] or \p{foo} */
	13308
	13309	/* All named classes are mapped into POSIXish nodes, with its FLAG
	13310	* argument giving which class it is */
	13311	switch ((I32)namedclass) {
	13312	case ANYOF_UNIPROP:
	13313	break;
	13314
	13315	/* These don't depend on the charset modifiers. They always
	13316	* match under /u rules */
	13317	case ANYOF_NHORIZWS:
	13318	case ANYOF_HORIZWS:
	13319	namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
	13320	/* FALLTHROUGH */
	13321
	13322	case ANYOF_NVERTWS:
	13323	case ANYOF_VERTWS:
	13324	op = POSIXU;
	13325	goto join_posix;
	13326
	13327	/* The actual POSIXish node for all the rest depends on the
	13328	* charset modifier. The ones in the first set depend only on
	13329	* ASCII or, if available on this platform, locale */
	13330	case ANYOF_ASCII:
	13331	case ANYOF_NASCII:
	13332	#ifdef HAS_ISASCII
	13333	op = (LOC) ? POSIXL : POSIXA;
	13334	#else
	13335	op = POSIXA;
	13336	#endif
	13337	goto join_posix;
	13338
	13339	case ANYOF_NCASED:
	13340	case ANYOF_LOWER:
	13341	case ANYOF_NLOWER:
	13342	case ANYOF_UPPER:
	13343	case ANYOF_NUPPER:
	13344	/* under /a could be alpha */
	13345	if (FOLD) {
	13346	if (ASCII_RESTRICTED) {
	13347	namedclass = ANYOF_ALPHA + (namedclass % 2);
	13348	}
	13349	else if (! LOC) {
	13350	break;
	13351	}
	13352	}
	13353	/* FALLTHROUGH */
	13354
	13355	/* The rest have more possibilities depending on the charset.
	13356	* We take advantage of the enum ordering of the charset
	13357	* modifiers to get the exact node type, */
	13358	default:
	13359	op = POSIXD + get_regex_charset(RExC_flags);
	13360	if (op > POSIXA) { /* /aa is same as /a */
	13361	op = POSIXA;
	13362	}
	13363	#ifndef HAS_ISBLANK
	13364	if (op == POSIXL
	13365	&& (namedclass == ANYOF_BLANK
	13366	\|\| namedclass == ANYOF_NBLANK))
	13367	{
	13368	op = POSIXA;
	13369	}
	13370	#endif
	13371
	13372	join_posix:
	13373	/* The odd numbered ones are the complements of the
	13374	* next-lower even number one */
	13375	if (namedclass % 2 == 1) {
	13376	invert = ! invert;
	13377	namedclass--;
	13378	}
	13379	arg = namedclass_to_classnum(namedclass);
	13380	break;
	13381	}
	13382	}
	13383	else if (value == prevvalue) {
	13384
	13385	/* Here, the class consists of just a single code point */
	13386
	13387	if (invert) {
	13388	if (! LOC && value == '\n') {
	13389	op = REG_ANY; /* Optimize [^\n] */
	13390	*flagp \|= HASWIDTH\|SIMPLE;
	13391	RExC_naughty++;
	13392	}
	13393	}
	13394	else if (value < 256 \|\| UTF) {
	13395
	13396	/* Optimize a single value into an EXACTish node, but not if it
	13397	* would require converting the pattern to UTF-8. */
	13398	op = compute_EXACTish(pRExC_state);
	13399	}
	13400	} /* Otherwise is a range */
	13401	else if (! LOC) { /* locale could vary these */
	13402	if (prevvalue == '0') {
	13403	if (value == '9') {
	13404	arg = _CC_DIGIT;
	13405	op = POSIXA;
	13406	}
	13407	}
	13408	}
	13409
	13410	/* Here, we have changed <op> away from its initial value iff we found
	13411	* an optimization */
	13412	if (op != END) {
	13413
	13414	/* Throw away this ANYOF regnode, and emit the calculated one,
	13415	* which should correspond to the beginning, not current, state of
	13416	* the parse */
	13417	const char * cur_parse = RExC_parse;
	13418	RExC_parse = (char *)orig_parse;
	13419	if ( SIZE_ONLY) {
	13420	if (! LOC) {
	13421
	13422	/* To get locale nodes to not use the full ANYOF size would
	13423	* require moving the code above that writes the portions
	13424	* of it that aren't in other nodes to after this point.
	13425	* e.g. ANYOF_CLASS_SET */
	13426	RExC_size = orig_size;
	13427	}
	13428	}
	13429	else {
	13430	RExC_emit = (regnode *)orig_emit;
	13431	if (PL_regkind[op] == POSIXD) {
	13432	if (invert) {
	13433	op += NPOSIXD - POSIXD;
	13434	}
	13435	}
	13436	}
	13437
	13438	ret = reg_node(pRExC_state, op);
	13439
	13440	if (PL_regkind[op] == POSIXD \|\| PL_regkind[op] == NPOSIXD) {
	13441	if (! SIZE_ONLY) {
	13442	FLAGS(ret) = arg;
	13443	}
	13444	*flagp \|= HASWIDTH\|SIMPLE;
	13445	}
	13446	else if (PL_regkind[op] == EXACT) {
	13447	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
	13448	}
	13449
	13450	RExC_parse = (char *) cur_parse;
	13451
	13452	SvREFCNT_dec(posixes);
	13453	SvREFCNT_dec(cp_list);
	13454	return ret;
	13455	}
	13456	}
	13457
	13458	if (SIZE_ONLY)
	13459	return ret;
	13460	/**** !SIZE_ONLY (Pass 2) AFTER HERE *******/
	13461
	13462	/* If folding, we calculate all characters that could fold to or from the
	13463	* ones already on the list */
	13464	if (FOLD && cp_list) {
	13465	UV start, end; /* End points of code point ranges */
	13466
	13467	SV* fold_intersection = NULL;
	13468
	13469	/* If the highest code point is within Latin1, we can use the
	13470	* compiled-in Alphas list, and not have to go out to disk. This
	13471	* yields two false positives, the masculine and feminine ordinal
	13472	* indicators, which are weeded out below using the
	13473	* IS_IN_SOME_FOLD_L1() macro */
	13474	if (invlist_highest(cp_list) < 256) {
	13475	_invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
	13476	&fold_intersection);
	13477	}
	13478	else {
	13479
	13480	/* Here, there are non-Latin1 code points, so we will have to go
	13481	* fetch the list of all the characters that participate in folds
	13482	*/
	13483	if (! PL_utf8_foldable) {
	13484	SV* swash = swash_init("utf8", "_Perl_Any_Folds",
	13485	&PL_sv_undef, 1, 0);
	13486	PL_utf8_foldable = _get_swash_invlist(swash);
	13487	SvREFCNT_dec_NN(swash);
	13488	}
	13489
	13490	/* This is a hash that for a particular fold gives all characters
	13491	* that are involved in it */
	13492	if (! PL_utf8_foldclosures) {
	13493
	13494	/* If we were unable to find any folds, then we likely won't be
	13495	* able to find the closures. So just create an empty list.
	13496	* Folding will effectively be restricted to the non-Unicode
	13497	* rules hard-coded into Perl. (This case happens legitimately
	13498	* during compilation of Perl itself before the Unicode tables
	13499	* are generated) */
	13500	if (_invlist_len(PL_utf8_foldable) == 0) {
	13501	PL_utf8_foldclosures = newHV();
	13502	}
	13503	else {
	13504	/* If the folds haven't been read in, call a fold function
	13505	* to force that */
	13506	if (! PL_utf8_tofold) {
	13507	U8 dummy[UTF8_MAXBYTES_CASE+1];
	13508
	13509	/* This string is just a short named one above \xff */
	13510	to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
	13511	assert(PL_utf8_tofold); /* Verify that worked */
	13512	}
	13513	PL_utf8_foldclosures =
	13514	_swash_inversion_hash(PL_utf8_tofold);
	13515	}
	13516	}
	13517
	13518	/* Only the characters in this class that participate in folds need
	13519	* be checked. Get the intersection of this class and all the
	13520	* possible characters that are foldable. This can quickly narrow
	13521	* down a large class */
	13522	_invlist_intersection(PL_utf8_foldable, cp_list,
	13523	&fold_intersection);
	13524	}
	13525
	13526	/* Now look at the foldable characters in this class individually */
	13527	invlist_iterinit(fold_intersection);
	13528	while (invlist_iternext(fold_intersection, &start, &end)) {
	13529	UV j;
	13530
	13531	/* Locale folding for Latin1 characters is deferred until runtime */
	13532	if (LOC && start < 256) {
	13533	start = 256;
	13534	}
	13535
	13536	/* Look at every character in the range */
	13537	for (j = start; j <= end; j++) {
	13538
	13539	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	13540	STRLEN foldlen;
	13541	SV** listp;
	13542
	13543	if (j < 256) {
	13544
	13545	/* We have the latin1 folding rules hard-coded here so that
	13546	* an innocent-looking character class, like /[ks]/i won't
	13547	* have to go out to disk to find the possible matches.
	13548	* XXX It would be better to generate these via regen, in
	13549	* case a new version of the Unicode standard adds new
	13550	* mappings, though that is not really likely, and may be
	13551	* caught by the default: case of the switch below. */
	13552
	13553	if (IS_IN_SOME_FOLD_L1(j)) {
	13554
	13555	/* ASCII is always matched; non-ASCII is matched only
	13556	* under Unicode rules */
	13557	if (isASCII(j) \|\| AT_LEAST_UNI_SEMANTICS) {
	13558	cp_list =
	13559	add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
	13560	}
	13561	else {
	13562	depends_list =
	13563	add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
	13564	}
	13565	}
	13566
	13567	if (HAS_NONLATIN1_FOLD_CLOSURE(j)
	13568	&& (! isASCII(j) \|\| ! ASCII_FOLD_RESTRICTED))
	13569	{
	13570	/* Certain Latin1 characters have matches outside
	13571	* Latin1. To get here, <j> is one of those
	13572	* characters. None of these matches is valid for
	13573	* ASCII characters under /aa, which is why the 'if'
	13574	* just above excludes those. These matches only
	13575	* happen when the target string is utf8. The code
	13576	* below adds the single fold closures for <j> to the
	13577	* inversion list. */
	13578	switch (j) {
	13579	case 'k':
	13580	case 'K':
	13581	cp_list =
	13582	add_cp_to_invlist(cp_list, KELVIN_SIGN);
	13583	break;
	13584	case 's':
	13585	case 'S':
	13586	cp_list = add_cp_to_invlist(cp_list,
	13587	LATIN_SMALL_LETTER_LONG_S);
	13588	break;
	13589	case MICRO_SIGN:
	13590	cp_list = add_cp_to_invlist(cp_list,
	13591	GREEK_CAPITAL_LETTER_MU);
	13592	cp_list = add_cp_to_invlist(cp_list,
	13593	GREEK_SMALL_LETTER_MU);
	13594	break;
	13595	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	13596	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	13597	cp_list =
	13598	add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
	13599	break;
	13600	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	13601	cp_list = add_cp_to_invlist(cp_list,
	13602	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	13603	break;
	13604	case LATIN_SMALL_LETTER_SHARP_S:
	13605	cp_list = add_cp_to_invlist(cp_list,
	13606	LATIN_CAPITAL_LETTER_SHARP_S);
	13607	break;
	13608	case 'F': case 'f':
	13609	case 'I': case 'i':
	13610	case 'L': case 'l':
	13611	case 'T': case 't':
	13612	case 'A': case 'a':
	13613	case 'H': case 'h':
	13614	case 'J': case 'j':
	13615	case 'N': case 'n':
	13616	case 'W': case 'w':
	13617	case 'Y': case 'y':
	13618	/* These all are targets of multi-character
	13619	* folds from code points that require UTF8 to
	13620	* express, so they can't match unless the
	13621	* target string is in UTF-8, so no action here
	13622	* is necessary, as regexec.c properly handles
	13623	* the general case for UTF-8 matching and
	13624	* multi-char folds */
	13625	break;
	13626	default:
	13627	/* Use deprecated warning to increase the
	13628	* chances of this being output */
	13629	ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
	13630	break;
	13631	}
	13632	}
	13633	continue;
	13634	}
	13635
	13636	/* Here is an above Latin1 character. We don't have the rules
	13637	* hard-coded for it. First, get its fold. This is the simple
	13638	* fold, as the multi-character folds have been handled earlier
	13639	* and separated out */
	13640	_to_uni_fold_flags(j, foldbuf, &foldlen,
	13641	((LOC)
	13642	? FOLD_FLAGS_LOCALE
	13643	: (ASCII_FOLD_RESTRICTED)
	13644	? FOLD_FLAGS_NOMIX_ASCII
	13645	: 0));
	13646
	13647	/* Single character fold of above Latin1. Add everything in
	13648	* its fold closure to the list that this node should match.
	13649	* The fold closures data structure is a hash with the keys
	13650	* being the UTF-8 of every character that is folded to, like
	13651	* 'k', and the values each an array of all code points that
	13652	* fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
	13653	* Multi-character folds are not included */
	13654	if ((listp = hv_fetch(PL_utf8_foldclosures,
	13655	(char *) foldbuf, foldlen, FALSE)))
	13656	{
	13657	AV* list = (AV) listp;
	13658	IV k;
	13659	for (k = 0; k <= av_len(list); k++) {
	13660	SV** c_p = av_fetch(list, k, FALSE);
	13661	UV c;
	13662	if (c_p == NULL) {
	13663	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	13664	}
	13665	c = SvUV(*c_p);
	13666
	13667	/* /aa doesn't allow folds between ASCII and non-; /l
	13668	* doesn't allow them between above and below 256 */
	13669	if ((ASCII_FOLD_RESTRICTED
	13670	&& (isASCII(c) != isASCII(j)))
	13671	\|\| (LOC && c < 256)) {
	13672	continue;
	13673	}
	13674
	13675	/* Folds involving non-ascii Latin1 characters
	13676	* under /d are added to a separate list */
	13677	if (isASCII(c) \|\| c > 255 \|\| AT_LEAST_UNI_SEMANTICS)
	13678	{
	13679	cp_list = add_cp_to_invlist(cp_list, c);
	13680	}
	13681	else {
	13682	depends_list = add_cp_to_invlist(depends_list, c);
	13683	}
	13684	}
	13685	}
	13686	}
	13687	}
	13688	SvREFCNT_dec_NN(fold_intersection);
	13689	}
	13690
	13691	/* And combine the result (if any) with any inversion list from posix
	13692	* classes. The lists are kept separate up to now because we don't want to
	13693	* fold the classes (folding of those is automatically handled by the swash
	13694	* fetching code) */
	13695	if (posixes) {
	13696	if (! DEPENDS_SEMANTICS) {
	13697	if (cp_list) {
	13698	_invlist_union(cp_list, posixes, &cp_list);
	13699	SvREFCNT_dec_NN(posixes);
	13700	}
	13701	else {
	13702	cp_list = posixes;
	13703	}
	13704	}
	13705	else {
	13706	/* Under /d, we put into a separate list the Latin1 things that
	13707	* match only when the target string is utf8 */
	13708	SV* nonascii_but_latin1_properties = NULL;
	13709	_invlist_intersection(posixes, PL_Latin1,
	13710	&nonascii_but_latin1_properties);
	13711	_invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
	13712	&nonascii_but_latin1_properties);
	13713	_invlist_subtract(posixes, nonascii_but_latin1_properties,
	13714	&posixes);
	13715	if (cp_list) {
	13716	_invlist_union(cp_list, posixes, &cp_list);
	13717	SvREFCNT_dec_NN(posixes);
	13718	}
	13719	else {
	13720	cp_list = posixes;
	13721	}
	13722
	13723	if (depends_list) {
	13724	_invlist_union(depends_list, nonascii_but_latin1_properties,
	13725	&depends_list);
	13726	SvREFCNT_dec_NN(nonascii_but_latin1_properties);
	13727	}
	13728	else {
	13729	depends_list = nonascii_but_latin1_properties;
	13730	}
	13731	}
	13732	}
	13733
	13734	/* And combine the result (if any) with any inversion list from properties.
	13735	* The lists are kept separate up to now so that we can distinguish the two
	13736	* in regards to matching above-Unicode. A run-time warning is generated
	13737	* if a Unicode property is matched against a non-Unicode code point. But,
	13738	* we allow user-defined properties to match anything, without any warning,
	13739	* and we also suppress the warning if there is a portion of the character
	13740	* class that isn't a Unicode property, and which matches above Unicode, \W
	13741	* or [\x{110000}] for example.
	13742	* (Note that in this case, unlike the Posix one above, there is no
	13743	* <depends_list>, because having a Unicode property forces Unicode
	13744	* semantics */
	13745	if (properties) {
	13746	bool warn_super = ! has_user_defined_property;
	13747	if (cp_list) {
	13748
	13749	/* If it matters to the final outcome, see if a non-property
	13750	* component of the class matches above Unicode. If so, the
	13751	* warning gets suppressed. This is true even if just a single
	13752	* such code point is specified, as though not strictly correct if
	13753	* another such code point is matched against, the fact that they
	13754	* are using above-Unicode code points indicates they should know
	13755	* the issues involved */
	13756	if (warn_super) {
	13757	bool non_prop_matches_above_Unicode =
	13758	runtime_posix_matches_above_Unicode
	13759	\| (invlist_highest(cp_list) > PERL_UNICODE_MAX);
	13760	if (invert) {
	13761	non_prop_matches_above_Unicode =
	13762	! non_prop_matches_above_Unicode;
	13763	}
	13764	warn_super = ! non_prop_matches_above_Unicode;
	13765	}
	13766
	13767	_invlist_union(properties, cp_list, &cp_list);
	13768	SvREFCNT_dec_NN(properties);
	13769	}
	13770	else {
	13771	cp_list = properties;
	13772	}
	13773
	13774	if (warn_super) {
	13775	OP(ret) = ANYOF_WARN_SUPER;
	13776	}
	13777	}
	13778
	13779	/* Here, we have calculated what code points should be in the character
	13780	* class.
	13781	*
	13782	* Now we can see about various optimizations. Fold calculation (which we
	13783	* did above) needs to take place before inversion. Otherwise /[^k]/i
	13784	* would invert to include K, which under /i would match k, which it
	13785	* shouldn't. Therefore we can't invert folded locale now, as it won't be
	13786	* folded until runtime */
	13787
	13788	/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
	13789	* at compile time. Besides not inverting folded locale now, we can't
	13790	* invert if there are things such as \w, which aren't known until runtime
	13791	* */
	13792	if (invert
	13793	&& ! (LOC && (FOLD \|\| (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
	13794	&& ! depends_list
	13795	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	13796	{
	13797	_invlist_invert(cp_list);
	13798
	13799	/* Any swash can't be used as-is, because we've inverted things */
	13800	if (swash) {
	13801	SvREFCNT_dec_NN(swash);
	13802	swash = NULL;
	13803	}
	13804
	13805	/* Clear the invert flag since have just done it here */
	13806	invert = FALSE;
	13807	}
	13808
	13809	if (ret_invlist) {
	13810	*ret_invlist = cp_list;
	13811	SvREFCNT_dec(swash);
	13812
	13813	/* Discard the generated node */
	13814	if (SIZE_ONLY) {
	13815	RExC_size = orig_size;
	13816	}
	13817	else {
	13818	RExC_emit = orig_emit;
	13819	}
	13820	return orig_emit;
	13821	}
	13822
	13823	/* If we didn't do folding, it's because some information isn't available
	13824	* until runtime; set the run-time fold flag for these. (We don't have to
	13825	* worry about properties folding, as that is taken care of by the swash
	13826	* fetching) */
	13827	if (FOLD && LOC)
	13828	{
	13829	ANYOF_FLAGS(ret) \|= ANYOF_LOC_FOLD;
	13830	}
	13831
	13832	/* Some character classes are equivalent to other nodes. Such nodes take
	13833	* up less room and generally fewer operations to execute than ANYOF nodes.
	13834	* Above, we checked for and optimized into some such equivalents for
	13835	* certain common classes that are easy to test. Getting to this point in
	13836	* the code means that the class didn't get optimized there. Since this
	13837	* code is only executed in Pass 2, it is too late to save space--it has
	13838	* been allocated in Pass 1, and currently isn't given back. But turning
	13839	* things into an EXACTish node can allow the optimizer to join it to any
	13840	* adjacent such nodes. And if the class is equivalent to things like /./,
	13841	* expensive run-time swashes can be avoided. Now that we have more
	13842	* complete information, we can find things necessarily missed by the
	13843	* earlier code. I (khw) am not sure how much to look for here. It would
	13844	* be easy, but perhaps too slow, to check any candidates against all the
	13845	* node types they could possibly match using _invlistEQ(). */
	13846
	13847	if (cp_list
	13848	&& ! invert
	13849	&& ! depends_list
	13850	&& ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
	13851	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	13852	{
	13853	UV start, end;
	13854	U8 op = END; /* The optimzation node-type */
	13855	const char * cur_parse= RExC_parse;
	13856
	13857	invlist_iterinit(cp_list);
	13858	if (! invlist_iternext(cp_list, &start, &end)) {
	13859
	13860	/* Here, the list is empty. This happens, for example, when a
	13861	* Unicode property is the only thing in the character class, and
	13862	* it doesn't match anything. (perluniprops.pod notes such
	13863	* properties) */
	13864	op = OPFAIL;
	13865	*flagp \|= HASWIDTH\|SIMPLE;
	13866	}
	13867	else if (start == end) { /* The range is a single code point */
	13868	if (! invlist_iternext(cp_list, &start, &end)
	13869
	13870	/* Don't do this optimization if it would require changing
	13871	* the pattern to UTF-8 */
	13872	&& (start < 256 \|\| UTF))
	13873	{
	13874	/* Here, the list contains a single code point. Can optimize
	13875	* into an EXACT node */
	13876
	13877	value = start;
	13878
	13879	if (! FOLD) {
	13880	op = EXACT;
	13881	}
	13882	else if (LOC) {
	13883
	13884	/* A locale node under folding with one code point can be
	13885	* an EXACTFL, as its fold won't be calculated until
	13886	* runtime */
	13887	op = EXACTFL;
	13888	}
	13889	else {
	13890
	13891	/* Here, we are generally folding, but there is only one
	13892	* code point to match. If we have to, we use an EXACT
	13893	* node, but it would be better for joining with adjacent
	13894	* nodes in the optimization pass if we used the same
	13895	* EXACTFish node that any such are likely to be. We can
	13896	* do this iff the code point doesn't participate in any
	13897	* folds. For example, an EXACTF of a colon is the same as
	13898	* an EXACT one, since nothing folds to or from a colon. */
	13899	if (value < 256) {
	13900	if (IS_IN_SOME_FOLD_L1(value)) {
	13901	op = EXACT;
	13902	}
	13903	}
	13904	else {
	13905	if (! PL_utf8_foldable) {
	13906	SV* swash = swash_init("utf8", "_Perl_Any_Folds",
	13907	&PL_sv_undef, 1, 0);
	13908	PL_utf8_foldable = _get_swash_invlist(swash);
	13909	SvREFCNT_dec_NN(swash);
	13910	}
	13911	if (_invlist_contains_cp(PL_utf8_foldable, value)) {
	13912	op = EXACT;
	13913	}
	13914	}
	13915
	13916	/* If we haven't found the node type, above, it means we
	13917	* can use the prevailing one */
	13918	if (op == END) {
	13919	op = compute_EXACTish(pRExC_state);
	13920	}
	13921	}
	13922	}
	13923	}
	13924	else if (start == 0) {
	13925	if (end == UV_MAX) {
	13926	op = SANY;
	13927	*flagp \|= HASWIDTH\|SIMPLE;
	13928	RExC_naughty++;
	13929	}
	13930	else if (end == '\n' - 1
	13931	&& invlist_iternext(cp_list, &start, &end)
	13932	&& start == '\n' + 1 && end == UV_MAX)
	13933	{
	13934	op = REG_ANY;
	13935	*flagp \|= HASWIDTH\|SIMPLE;
	13936	RExC_naughty++;
	13937	}
	13938	}
	13939	invlist_iterfinish(cp_list);
	13940
	13941	if (op != END) {
	13942	RExC_parse = (char *)orig_parse;
	13943	RExC_emit = (regnode *)orig_emit;
	13944
	13945	ret = reg_node(pRExC_state, op);
	13946
	13947	RExC_parse = (char *)cur_parse;
	13948
	13949	if (PL_regkind[op] == EXACT) {
	13950	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
	13951	}
	13952
	13953	SvREFCNT_dec_NN(cp_list);
	13954	return ret;
	13955	}
	13956	}
	13957
	13958	/* Here, <cp_list> contains all the code points we can determine at
	13959	* compile time that match under all conditions. Go through it, and
	13960	* for things that belong in the bitmap, put them there, and delete from
	13961	* <cp_list>. While we are at it, see if everything above 255 is in the
	13962	* list, and if so, set a flag to speed up execution */
	13963	ANYOF_BITMAP_ZERO(ret);
	13964	if (cp_list) {
	13965
	13966	/* This gets set if we actually need to modify things */
	13967	bool change_invlist = FALSE;
	13968
	13969	UV start, end;
	13970
	13971	/* Start looking through <cp_list> */
	13972	invlist_iterinit(cp_list);
	13973	while (invlist_iternext(cp_list, &start, &end)) {
	13974	UV high;
	13975	int i;
	13976
	13977	if (end == UV_MAX && start <= 256) {
	13978	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	13979	}
	13980
	13981	/* Quit if are above what we should change */
	13982	if (start > 255) {
	13983	break;
	13984	}
	13985
	13986	change_invlist = TRUE;
	13987
	13988	/* Set all the bits in the range, up to the max that we are doing */
	13989	high = (end < 255) ? end : 255;
	13990	for (i = start; i <= (int) high; i++) {
	13991	if (! ANYOF_BITMAP_TEST(ret, i)) {
	13992	ANYOF_BITMAP_SET(ret, i);
	13993	}
	13994	}
	13995	}
	13996	invlist_iterfinish(cp_list);
	13997
	13998	/* Done with loop; remove any code points that are in the bitmap from
	13999	* <cp_list> */
	14000	if (change_invlist) {
	14001	_invlist_subtract(cp_list, PL_Latin1, &cp_list);
	14002	}
	14003
	14004	/* If have completely emptied it, remove it completely */
	14005	if (_invlist_len(cp_list) == 0) {
	14006	SvREFCNT_dec_NN(cp_list);
	14007	cp_list = NULL;
	14008	}
	14009	}
	14010
	14011	if (invert) {
	14012	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	14013	}
	14014
	14015	/* Here, the bitmap has been populated with all the Latin1 code points that
	14016	* always match. Can now add to the overall list those that match only
	14017	* when the target string is UTF-8 (<depends_list>). */
	14018	if (depends_list) {
	14019	if (cp_list) {
	14020	_invlist_union(cp_list, depends_list, &cp_list);
	14021	SvREFCNT_dec_NN(depends_list);
	14022	}
	14023	else {
	14024	cp_list = depends_list;
	14025	}
	14026	}
	14027
	14028	/* If there is a swash and more than one element, we can't use the swash in
	14029	* the optimization below. */
	14030	if (swash && element_count > 1) {
	14031	SvREFCNT_dec_NN(swash);
	14032	swash = NULL;
	14033	}
	14034
	14035	if (! cp_list
	14036	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	14037	{
	14038	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	14039	}
	14040	else {
	14041	/* av[0] stores the character class description in its textual form:
	14042	* used later (regexec.c:Perl_regclass_swash()) to initialize the
	14043	* appropriate swash, and is also useful for dumping the regnode.
	14044	* av[1] if NULL, is a placeholder to later contain the swash computed
	14045	* from av[0]. But if no further computation need be done, the
	14046	* swash is stored there now.
	14047	* av[2] stores the cp_list inversion list for use in addition or
	14048	* instead of av[0]; used only if av[1] is NULL
	14049	* av[3] is set if any component of the class is from a user-defined
	14050	* property; used only if av[1] is NULL */
	14051	AV * const av = newAV();
	14052	SV *rv;
	14053
	14054	av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	14055	? SvREFCNT_inc(listsv) : &PL_sv_undef);
	14056	if (swash) {
	14057	av_store(av, 1, swash);
	14058	SvREFCNT_dec_NN(cp_list);
	14059	}
	14060	else {
	14061	av_store(av, 1, NULL);
	14062	if (cp_list) {
	14063	av_store(av, 2, cp_list);
	14064	av_store(av, 3, newSVuv(has_user_defined_property));
	14065	}
	14066	}
	14067
	14068	rv = newRV_noinc(MUTABLE_SV(av));
	14069	n = add_data(pRExC_state, 1, "s");
	14070	RExC_rxi->data->data[n] = (void*)rv;
	14071	ARG_SET(ret, n);
	14072	}
	14073
	14074	*flagp \|= HASWIDTH\|SIMPLE;
	14075	return ret;
	14076	}
	14077	#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	14078
	14079
	14080	/* reg_skipcomment()
	14081
	14082	Absorbs an /x style # comments from the input stream.
	14083	Returns true if there is more text remaining in the stream.
	14084	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	14085	terminates the pattern without including a newline.
	14086
	14087	Note its the callers responsibility to ensure that we are
	14088	actually in /x mode
	14089
	14090	*/
	14091
	14092	STATIC bool
	14093	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	14094	{
	14095	bool ended = 0;
	14096
	14097	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	14098
	14099	while (RExC_parse < RExC_end)
	14100	if (*RExC_parse++ == '\n') {
	14101	ended = 1;
	14102	break;
	14103	}
	14104	if (!ended) {
	14105	/* we ran off the end of the pattern without ending
	14106	the comment, so we have to add an \n when wrapping */
	14107	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	14108	return 0;
	14109	} else
	14110	return 1;
	14111	}
	14112
	14113	/* nextchar()
	14114
	14115	Advances the parse position, and optionally absorbs
	14116	"whitespace" from the inputstream.
	14117
	14118	Without /x "whitespace" means (?#...) style comments only,
	14119	with /x this means (?#...) and # comments and whitespace proper.
	14120
	14121	Returns the RExC_parse point from BEFORE the scan occurs.
	14122
	14123	This is the /x friendly way of saying RExC_parse++.
	14124	*/
	14125
	14126	STATIC char*
	14127	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	14128	{
	14129	char* const retval = RExC_parse++;
	14130
	14131	PERL_ARGS_ASSERT_NEXTCHAR;
	14132
	14133	for (;;) {
	14134	if (RExC_end - RExC_parse >= 3
	14135	&& *RExC_parse == '('
	14136	&& RExC_parse[1] == '?'
	14137	&& RExC_parse[2] == '#')
	14138	{
	14139	while (*RExC_parse != ')') {
	14140	if (RExC_parse == RExC_end)
	14141	FAIL("Sequence (?#... not terminated");
	14142	RExC_parse++;
	14143	}
	14144	RExC_parse++;
	14145	continue;
	14146	}
	14147	if (RExC_flags & RXf_PMf_EXTENDED) {
	14148	if (isSPACE(*RExC_parse)) {
	14149	RExC_parse++;
	14150	continue;
	14151	}
	14152	else if (*RExC_parse == '#') {
	14153	if ( reg_skipcomment( pRExC_state ) )
	14154	continue;
	14155	}
	14156	}
	14157	return retval;
	14158	}
	14159	}
	14160
	14161	/*
	14162	- reg_node - emit a node
	14163	*/
	14164	STATIC regnode * /* Location. */
	14165	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	14166	{
	14167	dVAR;
	14168	regnode *ptr;
	14169	regnode * const ret = RExC_emit;
	14170	GET_RE_DEBUG_FLAGS_DECL;
	14171
	14172	PERL_ARGS_ASSERT_REG_NODE;
	14173
	14174	if (SIZE_ONLY) {
	14175	SIZE_ALIGN(RExC_size);
	14176	RExC_size += 1;
	14177	return(ret);
	14178	}
	14179	if (RExC_emit >= RExC_emit_bound)
	14180	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	14181	op, RExC_emit, RExC_emit_bound);
	14182
	14183	NODE_ALIGN_FILL(ret);
	14184	ptr = ret;
	14185	FILL_ADVANCE_NODE(ptr, op);
	14186	#ifdef RE_TRACK_PATTERN_OFFSETS
	14187	if (RExC_offsets) { /* MJD */
	14188	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	14189	"reg_node", __LINE__,
	14190	PL_reg_name[op],
	14191	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	14192	? "Overwriting end of array!\n" : "OK",
	14193	(UV)(RExC_emit - RExC_emit_start),
	14194	(UV)(RExC_parse - RExC_start),
	14195	(UV)RExC_offsets[0]));
	14196	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	14197	}
	14198	#endif
	14199	RExC_emit = ptr;
	14200	return(ret);
	14201	}
	14202
	14203	/*
	14204	- reganode - emit a node with an argument
	14205	*/
	14206	STATIC regnode * /* Location. */
	14207	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	14208	{
	14209	dVAR;
	14210	regnode *ptr;
	14211	regnode * const ret = RExC_emit;
	14212	GET_RE_DEBUG_FLAGS_DECL;
	14213
	14214	PERL_ARGS_ASSERT_REGANODE;
	14215
	14216	if (SIZE_ONLY) {
	14217	SIZE_ALIGN(RExC_size);
	14218	RExC_size += 2;
	14219	/*
	14220	We can't do this:
	14221
	14222	assert(2==regarglen[op]+1);
	14223
	14224	Anything larger than this has to allocate the extra amount.
	14225	If we changed this to be:
	14226
	14227	RExC_size += (1 + regarglen[op]);
	14228
	14229	then it wouldn't matter. Its not clear what side effect
	14230	might come from that so its not done so far.
	14231	-- dmq
	14232	*/
	14233	return(ret);
	14234	}
	14235	if (RExC_emit >= RExC_emit_bound)
	14236	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	14237	op, RExC_emit, RExC_emit_bound);
	14238
	14239	NODE_ALIGN_FILL(ret);
	14240	ptr = ret;
	14241	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	14242	#ifdef RE_TRACK_PATTERN_OFFSETS
	14243	if (RExC_offsets) { /* MJD */
	14244	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	14245	"reganode",
	14246	__LINE__,
	14247	PL_reg_name[op],
	14248	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	14249	"Overwriting end of array!\n" : "OK",
	14250	(UV)(RExC_emit - RExC_emit_start),
	14251	(UV)(RExC_parse - RExC_start),
	14252	(UV)RExC_offsets[0]));
	14253	Set_Cur_Node_Offset;
	14254	}
	14255	#endif
	14256	RExC_emit = ptr;
	14257	return(ret);
	14258	}
	14259
	14260	/*
	14261	- reguni - emit (if appropriate) a Unicode character
	14262	*/
	14263	STATIC STRLEN
	14264	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	14265	{
	14266	dVAR;
	14267
	14268	PERL_ARGS_ASSERT_REGUNI;
	14269
	14270	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	14271	}
	14272
	14273	/*
	14274	- reginsert - insert an operator in front of already-emitted operand
	14275	*
	14276	* Means relocating the operand.
	14277	*/
	14278	STATIC void
	14279	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	14280	{
	14281	dVAR;
	14282	regnode *src;
	14283	regnode *dst;
	14284	regnode *place;
	14285	const int offset = regarglen[(U8)op];
	14286	const int size = NODE_STEP_REGNODE + offset;
	14287	GET_RE_DEBUG_FLAGS_DECL;
	14288
	14289	PERL_ARGS_ASSERT_REGINSERT;
	14290	PERL_UNUSED_ARG(depth);
	14291	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	14292	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	14293	if (SIZE_ONLY) {
	14294	RExC_size += size;
	14295	return;
	14296	}
	14297
	14298	src = RExC_emit;
	14299	RExC_emit += size;
	14300	dst = RExC_emit;
	14301	if (RExC_open_parens) {
	14302	int paren;
	14303	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	14304	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	14305	if ( RExC_open_parens[paren] >= opnd ) {
	14306	/DEBUG_PARSE_FMT("open"," - %d",size);/
	14307	RExC_open_parens[paren] += size;
	14308	} else {
	14309	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	14310	}
	14311	if ( RExC_close_parens[paren] >= opnd ) {
	14312	/DEBUG_PARSE_FMT("close"," - %d",size);/
	14313	RExC_close_parens[paren] += size;
	14314	} else {
	14315	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	14316	}
	14317	}
	14318	}
	14319
	14320	while (src > opnd) {
	14321	StructCopy(--src, --dst, regnode);
	14322	#ifdef RE_TRACK_PATTERN_OFFSETS
	14323	if (RExC_offsets) { /* MJD 20010112 */
	14324	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	14325	"reg_insert",
	14326	__LINE__,
	14327	PL_reg_name[op],
	14328	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	14329	? "Overwriting end of array!\n" : "OK",
	14330	(UV)(src - RExC_emit_start),
	14331	(UV)(dst - RExC_emit_start),
	14332	(UV)RExC_offsets[0]));
	14333	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	14334	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	14335	}
	14336	#endif
	14337	}
	14338
	14339
	14340	place = opnd; /* Op node, where operand used to be. */
	14341	#ifdef RE_TRACK_PATTERN_OFFSETS
	14342	if (RExC_offsets) { /* MJD */
	14343	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	14344	"reginsert",
	14345	__LINE__,
	14346	PL_reg_name[op],
	14347	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	14348	? "Overwriting end of array!\n" : "OK",
	14349	(UV)(place - RExC_emit_start),
	14350	(UV)(RExC_parse - RExC_start),
	14351	(UV)RExC_offsets[0]));
	14352	Set_Node_Offset(place, RExC_parse);
	14353	Set_Node_Length(place, 1);
	14354	}
	14355	#endif
	14356	src = NEXTOPER(place);
	14357	FILL_ADVANCE_NODE(place, op);
	14358	Zero(src, offset, regnode);
	14359	}
	14360
	14361	/*
	14362	- regtail - set the next-pointer at the end of a node chain of p to val.
	14363	- SEE ALSO: regtail_study
	14364	*/
	14365	/* TODO: All three parms should be const */
	14366	STATIC void
	14367	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	14368	{
	14369	dVAR;
	14370	regnode *scan;
	14371	GET_RE_DEBUG_FLAGS_DECL;
	14372
	14373	PERL_ARGS_ASSERT_REGTAIL;
	14374	#ifndef DEBUGGING
	14375	PERL_UNUSED_ARG(depth);
	14376	#endif
	14377
	14378	if (SIZE_ONLY)
	14379	return;
	14380
	14381	/* Find last node. */
	14382	scan = p;
	14383	for (;;) {
	14384	regnode * const temp = regnext(scan);
	14385	DEBUG_PARSE_r({
	14386	SV * const mysv=sv_newmortal();
	14387	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	14388	regprop(RExC_rx, mysv, scan);
	14389	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	14390	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	14391	(temp == NULL ? "->" : ""),
	14392	(temp == NULL ? PL_reg_name[OP(val)] : "")
	14393	);
	14394	});
	14395	if (temp == NULL)
	14396	break;
	14397	scan = temp;
	14398	}
	14399
	14400	if (reg_off_by_arg[OP(scan)]) {
	14401	ARG_SET(scan, val - scan);
	14402	}
	14403	else {
	14404	NEXT_OFF(scan) = val - scan;
	14405	}
	14406	}
	14407
	14408	#ifdef DEBUGGING
	14409	/*
	14410	- regtail_study - set the next-pointer at the end of a node chain of p to val.
	14411	- Look for optimizable sequences at the same time.
	14412	- currently only looks for EXACT chains.
	14413
	14414	This is experimental code. The idea is to use this routine to perform
	14415	in place optimizations on branches and groups as they are constructed,
	14416	with the long term intention of removing optimization from study_chunk so
	14417	that it is purely analytical.
	14418
	14419	Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	14420	to control which is which.
	14421
	14422	*/
	14423	/* TODO: All four parms should be const */
	14424
	14425	STATIC U8
	14426	S_regtail_study(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	14427	{
	14428	dVAR;
	14429	regnode *scan;
	14430	U8 exact = PSEUDO;
	14431	#ifdef EXPERIMENTAL_INPLACESCAN
	14432	I32 min = 0;
	14433	#endif
	14434	GET_RE_DEBUG_FLAGS_DECL;
	14435
	14436	PERL_ARGS_ASSERT_REGTAIL_STUDY;
	14437
	14438
	14439	if (SIZE_ONLY)
	14440	return exact;
	14441
	14442	/* Find last node. */
	14443
	14444	scan = p;
	14445	for (;;) {
	14446	regnode * const temp = regnext(scan);
	14447	#ifdef EXPERIMENTAL_INPLACESCAN
	14448	if (PL_regkind[OP(scan)] == EXACT) {
	14449	bool has_exactf_sharp_s; /* Unexamined in this routine */
	14450	if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
	14451	return EXACT;
	14452	}
	14453	#endif
	14454	if ( exact ) {
	14455	switch (OP(scan)) {
	14456	case EXACT:
	14457	case EXACTF:
	14458	case EXACTFA_NO_TRIE:
	14459	case EXACTFA:
	14460	case EXACTFU:
	14461	case EXACTFU_SS:
	14462	case EXACTFL:
	14463	if( exact == PSEUDO )
	14464	exact= OP(scan);
	14465	else if ( exact != OP(scan) )
	14466	exact= 0;
	14467	case NOTHING:
	14468	break;
	14469	default:
	14470	exact= 0;
	14471	}
	14472	}
	14473	DEBUG_PARSE_r({
	14474	SV * const mysv=sv_newmortal();
	14475	DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	14476	regprop(RExC_rx, mysv, scan);
	14477	PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	14478	SvPV_nolen_const(mysv),
	14479	REG_NODE_NUM(scan),
	14480	PL_reg_name[exact]);
	14481	});
	14482	if (temp == NULL)
	14483	break;
	14484	scan = temp;
	14485	}
	14486	DEBUG_PARSE_r({
	14487	SV * const mysv_val=sv_newmortal();
	14488	DEBUG_PARSE_MSG("");
	14489	regprop(RExC_rx, mysv_val, val);
	14490	PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	14491	SvPV_nolen_const(mysv_val),
	14492	(IV)REG_NODE_NUM(val),
	14493	(IV)(val - scan)
	14494	);
	14495	});
	14496	if (reg_off_by_arg[OP(scan)]) {
	14497	ARG_SET(scan, val - scan);
	14498	}
	14499	else {
	14500	NEXT_OFF(scan) = val - scan;
	14501	}
	14502
	14503	return exact;
	14504	}
	14505	#endif
	14506
	14507	/*
	14508	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	14509	*/
	14510	#ifdef DEBUGGING
	14511
	14512	static void
	14513	S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
	14514	{
	14515	int bit;
	14516	int set=0;
	14517
	14518	for (bit=0; bit<32; bit++) {
	14519	if (flags & (1<<bit)) {
	14520	if (!set++ && lead)
	14521	PerlIO_printf(Perl_debug_log, "%s",lead);
	14522	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
	14523	}
	14524	}
	14525	if (lead) {
	14526	if (set)
	14527	PerlIO_printf(Perl_debug_log, "\n");
	14528	else
	14529	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	14530	}
	14531	}
	14532
	14533	static void
	14534	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	14535	{
	14536	int bit;
	14537	int set=0;
	14538	regex_charset cs;
	14539
	14540	for (bit=0; bit<32; bit++) {
	14541	if (flags & (1<<bit)) {
	14542	if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
	14543	continue;
	14544	}
	14545	if (!set++ && lead)
	14546	PerlIO_printf(Perl_debug_log, "%s",lead);
	14547	PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	14548	}
	14549	}
	14550	if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	14551	if (!set++ && lead) {
	14552	PerlIO_printf(Perl_debug_log, "%s",lead);
	14553	}
	14554	switch (cs) {
	14555	case REGEX_UNICODE_CHARSET:
	14556	PerlIO_printf(Perl_debug_log, "UNICODE");
	14557	break;
	14558	case REGEX_LOCALE_CHARSET:
	14559	PerlIO_printf(Perl_debug_log, "LOCALE");
	14560	break;
	14561	case REGEX_ASCII_RESTRICTED_CHARSET:
	14562	PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	14563	break;
	14564	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	14565	PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	14566	break;
	14567	default:
	14568	PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	14569	break;
	14570	}
	14571	}
	14572	if (lead) {
	14573	if (set)
	14574	PerlIO_printf(Perl_debug_log, "\n");
	14575	else
	14576	PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	14577	}
	14578	}
	14579	#endif
	14580
	14581	void
	14582	Perl_regdump(pTHX_ const regexp *r)
	14583	{
	14584	#ifdef DEBUGGING
	14585	dVAR;
	14586	SV * const sv = sv_newmortal();
	14587	SV *dsv= sv_newmortal();
	14588	RXi_GET_DECL(r,ri);
	14589	GET_RE_DEBUG_FLAGS_DECL;
	14590
	14591	PERL_ARGS_ASSERT_REGDUMP;
	14592
	14593	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	14594
	14595	/* Header fields of interest. */
	14596	if (r->anchored_substr) {
	14597	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	14598	RE_SV_DUMPLEN(r->anchored_substr), 30);
	14599	PerlIO_printf(Perl_debug_log,
	14600	"anchored %s%s at %"IVdf" ",
	14601	s, RE_SV_TAIL(r->anchored_substr),
	14602	(IV)r->anchored_offset);
	14603	} else if (r->anchored_utf8) {
	14604	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	14605	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	14606	PerlIO_printf(Perl_debug_log,
	14607	"anchored utf8 %s%s at %"IVdf" ",
	14608	s, RE_SV_TAIL(r->anchored_utf8),
	14609	(IV)r->anchored_offset);
	14610	}
	14611	if (r->float_substr) {
	14612	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	14613	RE_SV_DUMPLEN(r->float_substr), 30);
	14614	PerlIO_printf(Perl_debug_log,
	14615	"floating %s%s at %"IVdf"..%"UVuf" ",
	14616	s, RE_SV_TAIL(r->float_substr),
	14617	(IV)r->float_min_offset, (UV)r->float_max_offset);
	14618	} else if (r->float_utf8) {
	14619	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	14620	RE_SV_DUMPLEN(r->float_utf8), 30);
	14621	PerlIO_printf(Perl_debug_log,
	14622	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	14623	s, RE_SV_TAIL(r->float_utf8),
	14624	(IV)r->float_min_offset, (UV)r->float_max_offset);
	14625	}
	14626	if (r->check_substr \|\| r->check_utf8)
	14627	PerlIO_printf(Perl_debug_log,
	14628	(const char *)
	14629	(r->check_substr == r->float_substr
	14630	&& r->check_utf8 == r->float_utf8
	14631	? "(checking floating" : "(checking anchored"));
	14632	if (r->extflags & RXf_NOSCAN)
	14633	PerlIO_printf(Perl_debug_log, " noscan");
	14634	if (r->extflags & RXf_CHECK_ALL)
	14635	PerlIO_printf(Perl_debug_log, " isall");
	14636	if (r->check_substr \|\| r->check_utf8)
	14637	PerlIO_printf(Perl_debug_log, ") ");
	14638
	14639	if (ri->regstclass) {
	14640	regprop(r, sv, ri->regstclass);
	14641	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	14642	}
	14643	if (r->extflags & RXf_ANCH) {
	14644	PerlIO_printf(Perl_debug_log, "anchored");
	14645	if (r->extflags & RXf_ANCH_BOL)
	14646	PerlIO_printf(Perl_debug_log, "(BOL)");
	14647	if (r->extflags & RXf_ANCH_MBOL)
	14648	PerlIO_printf(Perl_debug_log, "(MBOL)");
	14649	if (r->extflags & RXf_ANCH_SBOL)
	14650	PerlIO_printf(Perl_debug_log, "(SBOL)");
	14651	if (r->extflags & RXf_ANCH_GPOS)
	14652	PerlIO_printf(Perl_debug_log, "(GPOS)");
	14653	PerlIO_putc(Perl_debug_log, ' ');
	14654	}
	14655	if (r->extflags & RXf_GPOS_SEEN)
	14656	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	14657	if (r->intflags & PREGf_SKIP)
	14658	PerlIO_printf(Perl_debug_log, "plus ");
	14659	if (r->intflags & PREGf_IMPLICIT)
	14660	PerlIO_printf(Perl_debug_log, "implicit ");
	14661	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	14662	if (r->extflags & RXf_EVAL_SEEN)
	14663	PerlIO_printf(Perl_debug_log, "with eval ");
	14664	PerlIO_printf(Perl_debug_log, "\n");
	14665	DEBUG_FLAGS_r({
	14666	regdump_extflags("r->extflags: ",r->extflags);
	14667	regdump_intflags("r->intflags: ",r->intflags);
	14668	});
	14669	#else
	14670	PERL_ARGS_ASSERT_REGDUMP;
	14671	PERL_UNUSED_CONTEXT;
	14672	PERL_UNUSED_ARG(r);
	14673	#endif /* DEBUGGING */
	14674	}
	14675
	14676	/*
	14677	- regprop - printable representation of opcode
	14678	*/
	14679	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	14680	STMT_START { \
	14681	if (do_sep) { \
	14682	Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	14683	if (flags & ANYOF_INVERT) \
	14684	/make sure the invert info is in each / \
	14685	sv_catpvs(sv, "^"); \
	14686	do_sep = 0; \
	14687	} \
	14688	} STMT_END
	14689
	14690	void
	14691	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	14692	{
	14693	#ifdef DEBUGGING
	14694	dVAR;
	14695	int k;
	14696
	14697	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	14698	static const char * const anyofs[] = {
	14699	#if _CC_WORDCHAR != 0 \|\| _CC_DIGIT != 1 \|\| _CC_ALPHA != 2 \|\| _CC_LOWER != 3 \
	14700	\|\| _CC_UPPER != 4 \|\| _CC_PUNCT != 5 \|\| _CC_PRINT != 6 \
	14701	\|\| _CC_ALPHANUMERIC != 7 \|\| _CC_GRAPH != 8 \|\| _CC_CASED != 9 \
	14702	\|\| _CC_SPACE != 10 \|\| _CC_BLANK != 11 \|\| _CC_XDIGIT != 12 \
	14703	\|\| _CC_PSXSPC != 13 \|\| _CC_CNTRL != 14 \|\| _CC_ASCII != 15 \
	14704	\|\| _CC_VERTSPACE != 16
	14705	#error Need to adjust order of anyofs[]
	14706	#endif
	14707	"[\\w]",
	14708	"[\\W]",
	14709	"[\\d]",
	14710	"[\\D]",
	14711	"[:alpha:]",
	14712	"[:^alpha:]",
	14713	"[:lower:]",
	14714	"[:^lower:]",
	14715	"[:upper:]",
	14716	"[:^upper:]",
	14717	"[:punct:]",
	14718	"[:^punct:]",
	14719	"[:print:]",
	14720	"[:^print:]",
	14721	"[:alnum:]",
	14722	"[:^alnum:]",
	14723	"[:graph:]",
	14724	"[:^graph:]",
	14725	"[:cased:]",
	14726	"[:^cased:]",
	14727	"[\\s]",
	14728	"[\\S]",
	14729	"[:blank:]",
	14730	"[:^blank:]",
	14731	"[:xdigit:]",
	14732	"[:^xdigit:]",
	14733	"[:space:]",
	14734	"[:^space:]",
	14735	"[:cntrl:]",
	14736	"[:^cntrl:]",
	14737	"[:ascii:]",
	14738	"[:^ascii:]",
	14739	"[\\v]",
	14740	"[\\V]"
	14741	};
	14742	RXi_GET_DECL(prog,progi);
	14743	GET_RE_DEBUG_FLAGS_DECL;
	14744
	14745	PERL_ARGS_ASSERT_REGPROP;
	14746
	14747	sv_setpvs(sv, "");
	14748
	14749	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	14750	/* It would be nice to FAIL() here, but this may be called from
	14751	regexec.c, and it would be hard to supply pRExC_state. */
	14752	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	14753	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	14754
	14755	k = PL_regkind[OP(o)];
	14756
	14757	if (k == EXACT) {
	14758	sv_catpvs(sv, " ");
	14759	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	14760	* is a crude hack but it may be the best for now since
	14761	* we have no flag "this EXACTish node was UTF-8"
	14762	* --jhi */
	14763	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	14764	PERL_PV_ESCAPE_UNI_DETECT \|
	14765	PERL_PV_ESCAPE_NONASCII \|
	14766	PERL_PV_PRETTY_ELLIPSES \|
	14767	PERL_PV_PRETTY_LTGT \|
	14768	PERL_PV_PRETTY_NOCLEAR
	14769	);
	14770	} else if (k == TRIE) {
	14771	/* print the details of the trie in dumpuntil instead, as
	14772	* progi->data isn't available here */
	14773	const char op = OP(o);
	14774	const U32 n = ARG(o);
	14775	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	14776	(reg_ac_data *)progi->data->data[n] :
	14777	NULL;
	14778	const reg_trie_data * const trie
	14779	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	14780
	14781	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	14782	DEBUG_TRIE_COMPILE_r(
	14783	Perl_sv_catpvf(aTHX_ sv,
	14784	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	14785	(UV)trie->startstate,
	14786	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	14787	(UV)trie->wordcount,
	14788	(UV)trie->minlen,
	14789	(UV)trie->maxlen,
	14790	(UV)TRIE_CHARCOUNT(trie),
	14791	(UV)trie->uniquecharcount
	14792	)
	14793	);
	14794	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	14795	sv_catpvs(sv, "[");
	14796	(void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
	14797	? ANYOF_BITMAP(o)
	14798	: TRIE_BITMAP(trie));
	14799	sv_catpvs(sv, "]");
	14800	}
	14801
	14802	} else if (k == CURLY) {
	14803	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	14804	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	14805	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	14806	}
	14807	else if (k == WHILEM && o->flags) /* Ordinal/of */
	14808	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	14809	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	14810	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	14811	if ( RXp_PAREN_NAMES(prog) ) {
	14812	if ( k != REF \|\| (OP(o) < NREF)) {
	14813	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	14814	SV **name= av_fetch(list, ARG(o), 0 );
	14815	if (name)
	14816	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	14817	}
	14818	else {
	14819	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	14820	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	14821	I32 nums=(I32)SvPVX(sv_dat);
	14822	SV **name= av_fetch(list, nums[0], 0 );
	14823	I32 n;
	14824	if (name) {
	14825	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	14826	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	14827	(n ? "," : ""), (IV)nums[n]);
	14828	}
	14829	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	14830	}
	14831	}
	14832	}
	14833	} else if (k == GOSUB)
	14834	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	14835	else if (k == VERB) {
	14836	if (!o->flags)
	14837	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	14838	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	14839	} else if (k == LOGICAL)
	14840	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	14841	else if (k == ANYOF) {
	14842	const U8 flags = ANYOF_FLAGS(o);
	14843	int do_sep = 0;
	14844
	14845
	14846	if (flags & ANYOF_LOCALE)
	14847	sv_catpvs(sv, "{loc}");
	14848	if (flags & ANYOF_LOC_FOLD)
	14849	sv_catpvs(sv, "{i}");
	14850	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	14851	if (flags & ANYOF_INVERT)
	14852	sv_catpvs(sv, "^");
	14853
	14854	/* output what the standard cp 0-255 bitmap matches */
	14855	do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
	14856
	14857	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	14858	/* output any special charclass tests (used entirely under use locale) */
	14859	if (ANYOF_CLASS_TEST_ANY_SET(o)) {
	14860	int i;
	14861	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
	14862	if (ANYOF_CLASS_TEST(o,i)) {
	14863	sv_catpv(sv, anyofs[i]);
	14864	do_sep = 1;
	14865	}
	14866	}
	14867	}
	14868
	14869	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	14870
	14871	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	14872	sv_catpvs(sv, "{non-utf8-latin1-all}");
	14873	}
	14874
	14875	/* output information about the unicode matching */
	14876	if (flags & ANYOF_UNICODE_ALL)
	14877	sv_catpvs(sv, "{unicode_all}");
	14878	else if (ANYOF_NONBITMAP(o)) {
	14879	SV lv; / Set if there is something outside the bit map. */
	14880	bool byte_output = FALSE; /* If something in the bitmap has been
	14881	output */
	14882
	14883	if (flags & ANYOF_NONBITMAP_NON_UTF8) {
	14884	sv_catpvs(sv, "{outside bitmap}");
	14885	}
	14886	else {
	14887	sv_catpvs(sv, "{utf8}");
	14888	}
	14889
	14890	/* Get the stuff that wasn't in the bitmap */
	14891	(void) regclass_swash(prog, o, FALSE, &lv, NULL);
	14892	if (lv && lv != &PL_sv_undef) {
	14893	char *s = savesvpv(lv);
	14894	char * const origs = s;
	14895
	14896	while (s && s != '\n')
	14897	s++;
	14898
	14899	if (*s == '\n') {
	14900	const char * const t = ++s;
	14901
	14902	if (byte_output) {
	14903	sv_catpvs(sv, " ");
	14904	}
	14905
	14906	while (*s) {
	14907	if (*s == '\n') {
	14908
	14909	/* Truncate very long output */
	14910	if (s - origs > 256) {
	14911	Perl_sv_catpvf(aTHX_ sv,
	14912	"%.*s...",
	14913	(int) (s - origs - 1),
	14914	t);
	14915	goto out_dump;
	14916	}
	14917	*s = ' ';
	14918	}
	14919	else if (*s == '\t') {
	14920	*s = '-';
	14921	}
	14922	s++;
	14923	}
	14924	if (s[-1] == ' ')
	14925	s[-1] = 0;
	14926
	14927	sv_catpv(sv, t);
	14928	}
	14929
	14930	out_dump:
	14931
	14932	Safefree(origs);
	14933	SvREFCNT_dec_NN(lv);
	14934	}
	14935	}
	14936
	14937	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	14938	}
	14939	else if (k == POSIXD \|\| k == NPOSIXD) {
	14940	U8 index = FLAGS(o) * 2;
	14941	if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
	14942	Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
	14943	}
	14944	else {
	14945	sv_catpv(sv, anyofs[index]);
	14946	}
	14947	}
	14948	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	14949	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	14950	#else
	14951	PERL_UNUSED_CONTEXT;
	14952	PERL_UNUSED_ARG(sv);
	14953	PERL_UNUSED_ARG(o);
	14954	PERL_UNUSED_ARG(prog);
	14955	#endif /* DEBUGGING */
	14956	}
	14957
	14958	SV *
	14959	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	14960	{ /* Assume that RE_INTUIT is set */
	14961	dVAR;
	14962	struct regexp *const prog = ReANY(r);
	14963	GET_RE_DEBUG_FLAGS_DECL;
	14964
	14965	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	14966	PERL_UNUSED_CONTEXT;
	14967
	14968	DEBUG_COMPILE_r(
	14969	{
	14970	const char * const s = SvPV_nolen_const(prog->check_substr
	14971	? prog->check_substr : prog->check_utf8);
	14972
	14973	if (!PL_colorset) reginitcolors();
	14974	PerlIO_printf(Perl_debug_log,
	14975	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	14976	PL_colors[4],
	14977	prog->check_substr ? "" : "utf8 ",
	14978	PL_colors[5],PL_colors[0],
	14979	s,
	14980	PL_colors[1],
	14981	(strlen(s) > 60 ? "..." : ""));
	14982	} );
	14983
	14984	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	14985	}
	14986
	14987	/*
	14988	pregfree()
	14989
	14990	handles refcounting and freeing the perl core regexp structure. When
	14991	it is necessary to actually free the structure the first thing it
	14992	does is call the 'free' method of the regexp_engine associated to
	14993	the regexp, allowing the handling of the void *pprivate; member
	14994	first. (This routine is not overridable by extensions, which is why
	14995	the extensions free is called first.)
	14996
	14997	See regdupe and regdupe_internal if you change anything here.
	14998	*/
	14999	#ifndef PERL_IN_XSUB_RE
	15000	void
	15001	Perl_pregfree(pTHX_ REGEXP *r)
	15002	{
	15003	SvREFCNT_dec(r);
	15004	}
	15005
	15006	void
	15007	Perl_pregfree2(pTHX_ REGEXP *rx)
	15008	{
	15009	dVAR;
	15010	struct regexp *const r = ReANY(rx);
	15011	GET_RE_DEBUG_FLAGS_DECL;
	15012
	15013	PERL_ARGS_ASSERT_PREGFREE2;
	15014
	15015	if (r->mother_re) {
	15016	ReREFCNT_dec(r->mother_re);
	15017	} else {
	15018	CALLREGFREE_PVT(rx); /* free the private data */
	15019	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	15020	Safefree(r->xpv_len_u.xpvlenu_pv);
	15021	}
	15022	if (r->substrs) {
	15023	SvREFCNT_dec(r->anchored_substr);
	15024	SvREFCNT_dec(r->anchored_utf8);
	15025	SvREFCNT_dec(r->float_substr);
	15026	SvREFCNT_dec(r->float_utf8);
	15027	Safefree(r->substrs);
	15028	}
	15029	RX_MATCH_COPY_FREE(rx);
	15030	#ifdef PERL_ANY_COW
	15031	SvREFCNT_dec(r->saved_copy);
	15032	#endif
	15033	Safefree(r->offs);
	15034	SvREFCNT_dec(r->qr_anoncv);
	15035	rx->sv_u.svu_rx = 0;
	15036	}
	15037
	15038	/* reg_temp_copy()
	15039
	15040	This is a hacky workaround to the structural issue of match results
	15041	being stored in the regexp structure which is in turn stored in
	15042	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	15043	could be PL_curpm in multiple contexts, and could require multiple
	15044	result sets being associated with the pattern simultaneously, such
	15045	as when doing a recursive match with (??{$qr})
	15046
	15047	The solution is to make a lightweight copy of the regexp structure
	15048	when a qr// is returned from the code executed by (??{$qr}) this
	15049	lightweight copy doesn't actually own any of its data except for
	15050	the starp/end and the actual regexp structure itself.
	15051
	15052	*/
	15053
	15054
	15055	REGEXP *
	15056	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	15057	{
	15058	struct regexp *ret;
	15059	struct regexp *const r = ReANY(rx);
	15060	const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
	15061
	15062	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	15063
	15064	if (!ret_x)
	15065	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	15066	else {
	15067	SvOK_off((SV *)ret_x);
	15068	if (islv) {
	15069	/* For PVLVs, SvANY points to the xpvlv body while sv_u points
	15070	to the regexp. (For SVt_REGEXPs, sv_upgrade has already
	15071	made both spots point to the same regexp body.) */
	15072	REGEXP temp = (REGEXP )newSV_type(SVt_REGEXP);
	15073	assert(!SvPVX(ret_x));
	15074	ret_x->sv_u.svu_rx = temp->sv_any;
	15075	temp->sv_any = NULL;
	15076	SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) \| SVt_NULL;
	15077	SvREFCNT_dec_NN(temp);
	15078	/* SvCUR still resides in the xpvlv struct, so the regexp copy-
	15079	ing below will not set it. */
	15080	SvCUR_set(ret_x, SvCUR(rx));
	15081	}
	15082	}
	15083	/* This ensures that SvTHINKFIRST(sv) is true, and hence that
	15084	sv_force_normal(sv) is called. */
	15085	SvFAKE_on(ret_x);
	15086	ret = ReANY(ret_x);
	15087
	15088	SvFLAGS(ret_x) \|= SvUTF8(rx);
	15089	/* We share the same string buffer as the original regexp, on which we
	15090	hold a reference count, incremented when mother_re is set below.
	15091	The string pointer is copied here, being part of the regexp struct.
	15092	*/
	15093	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	15094	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	15095	if (r->offs) {
	15096	const I32 npar = r->nparens+1;
	15097	Newx(ret->offs, npar, regexp_paren_pair);
	15098	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	15099	}
	15100	if (r->substrs) {
	15101	Newx(ret->substrs, 1, struct reg_substr_data);
	15102	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	15103
	15104	SvREFCNT_inc_void(ret->anchored_substr);
	15105	SvREFCNT_inc_void(ret->anchored_utf8);
	15106	SvREFCNT_inc_void(ret->float_substr);
	15107	SvREFCNT_inc_void(ret->float_utf8);
	15108
	15109	/* check_substr and check_utf8, if non-NULL, point to either their
	15110	anchored or float namesakes, and don't hold a second reference. */
	15111	}
	15112	RX_MATCH_COPIED_off(ret_x);
	15113	#ifdef PERL_ANY_COW
	15114	ret->saved_copy = NULL;
	15115	#endif
	15116	ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
	15117	SvREFCNT_inc_void(ret->qr_anoncv);
	15118
	15119	return ret_x;
	15120	}
	15121	#endif
	15122
	15123	/* regfree_internal()
	15124
	15125	Free the private data in a regexp. This is overloadable by
	15126	extensions. Perl takes care of the regexp structure in pregfree(),
	15127	this covers the *pprivate pointer which technically perl doesn't
	15128	know about, however of course we have to handle the
	15129	regexp_internal structure when no extension is in use.
	15130
	15131	Note this is called before freeing anything in the regexp
	15132	structure.
	15133	*/
	15134
	15135	void
	15136	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	15137	{
	15138	dVAR;
	15139	struct regexp *const r = ReANY(rx);
	15140	RXi_GET_DECL(r,ri);
	15141	GET_RE_DEBUG_FLAGS_DECL;
	15142
	15143	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	15144
	15145	DEBUG_COMPILE_r({
	15146	if (!PL_colorset)
	15147	reginitcolors();
	15148	{
	15149	SV *dsv= sv_newmortal();
	15150	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	15151	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	15152	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	15153	PL_colors[4],PL_colors[5],s);
	15154	}
	15155	});
	15156	#ifdef RE_TRACK_PATTERN_OFFSETS
	15157	if (ri->u.offsets)
	15158	Safefree(ri->u.offsets); /* 20010421 MJD */
	15159	#endif
	15160	if (ri->code_blocks) {
	15161	int n;
	15162	for (n = 0; n < ri->num_code_blocks; n++)
	15163	SvREFCNT_dec(ri->code_blocks[n].src_regex);
	15164	Safefree(ri->code_blocks);
	15165	}
	15166
	15167	if (ri->data) {
	15168	int n = ri->data->count;
	15169
	15170	while (--n >= 0) {
	15171	/* If you add a ->what type here, update the comment in regcomp.h */
	15172	switch (ri->data->what[n]) {
	15173	case 'a':
	15174	case 'r':
	15175	case 's':
	15176	case 'S':
	15177	case 'u':
	15178	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	15179	break;
	15180	case 'f':
	15181	Safefree(ri->data->data[n]);
	15182	break;
	15183	case 'l':
	15184	case 'L':
	15185	break;
	15186	case 'T':
	15187	{ /* Aho Corasick add-on structure for a trie node.
	15188	Used in stclass optimization only */
	15189	U32 refcount;
	15190	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	15191	OP_REFCNT_LOCK;
	15192	refcount = --aho->refcount;
	15193	OP_REFCNT_UNLOCK;
	15194	if ( !refcount ) {
	15195	PerlMemShared_free(aho->states);
	15196	PerlMemShared_free(aho->fail);
	15197	/* do this last!!!! */
	15198	PerlMemShared_free(ri->data->data[n]);
	15199	PerlMemShared_free(ri->regstclass);
	15200	}
	15201	}
	15202	break;
	15203	case 't':
	15204	{
	15205	/* trie structure. */
	15206	U32 refcount;
	15207	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	15208	OP_REFCNT_LOCK;
	15209	refcount = --trie->refcount;
	15210	OP_REFCNT_UNLOCK;
	15211	if ( !refcount ) {
	15212	PerlMemShared_free(trie->charmap);
	15213	PerlMemShared_free(trie->states);
	15214	PerlMemShared_free(trie->trans);
	15215	if (trie->bitmap)
	15216	PerlMemShared_free(trie->bitmap);
	15217	if (trie->jump)
	15218	PerlMemShared_free(trie->jump);
	15219	PerlMemShared_free(trie->wordinfo);
	15220	/* do this last!!!! */
	15221	PerlMemShared_free(ri->data->data[n]);
	15222	}
	15223	}
	15224	break;
	15225	default:
	15226	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	15227	}
	15228	}
	15229	Safefree(ri->data->what);
	15230	Safefree(ri->data);
	15231	}
	15232
	15233	Safefree(ri);
	15234	}
	15235
	15236	#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
	15237	#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
	15238	#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
	15239
	15240	/*
	15241	re_dup - duplicate a regexp.
	15242
	15243	This routine is expected to clone a given regexp structure. It is only
	15244	compiled under USE_ITHREADS.
	15245
	15246	After all of the core data stored in struct regexp is duplicated
	15247	the regexp_engine.dupe method is used to copy any private data
	15248	stored in the *pprivate pointer. This allows extensions to handle
	15249	any duplication it needs to do.
	15250
	15251	See pregfree() and regfree_internal() if you change anything here.
	15252	*/
	15253	#if defined(USE_ITHREADS)
	15254	#ifndef PERL_IN_XSUB_RE
	15255	void
	15256	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	15257	{
	15258	dVAR;
	15259	I32 npar;
	15260	const struct regexp *r = ReANY(sstr);
	15261	struct regexp *ret = ReANY(dstr);
	15262
	15263	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	15264
	15265	npar = r->nparens+1;
	15266	Newx(ret->offs, npar, regexp_paren_pair);
	15267	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	15268
	15269	if (ret->substrs) {
	15270	/* Do it this way to avoid reading from *r after the StructCopy().
	15271	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	15272	cache, it doesn't matter. */
	15273	const bool anchored = r->check_substr
	15274	? r->check_substr == r->anchored_substr
	15275	: r->check_utf8 == r->anchored_utf8;
	15276	Newx(ret->substrs, 1, struct reg_substr_data);
	15277	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	15278
	15279	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	15280	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	15281	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	15282	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	15283
	15284	/* check_substr and check_utf8, if non-NULL, point to either their
	15285	anchored or float namesakes, and don't hold a second reference. */
	15286
	15287	if (ret->check_substr) {
	15288	if (anchored) {
	15289	assert(r->check_utf8 == r->anchored_utf8);
	15290	ret->check_substr = ret->anchored_substr;
	15291	ret->check_utf8 = ret->anchored_utf8;
	15292	} else {
	15293	assert(r->check_substr == r->float_substr);
	15294	assert(r->check_utf8 == r->float_utf8);
	15295	ret->check_substr = ret->float_substr;
	15296	ret->check_utf8 = ret->float_utf8;
	15297	}
	15298	} else if (ret->check_utf8) {
	15299	if (anchored) {
	15300	ret->check_utf8 = ret->anchored_utf8;
	15301	} else {
	15302	ret->check_utf8 = ret->float_utf8;
	15303	}
	15304	}
	15305	}
	15306
	15307	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	15308	ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
	15309
	15310	if (ret->pprivate)
	15311	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	15312
	15313	if (RX_MATCH_COPIED(dstr))
	15314	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	15315	else
	15316	ret->subbeg = NULL;
	15317	#ifdef PERL_ANY_COW
	15318	ret->saved_copy = NULL;
	15319	#endif
	15320
	15321	/* Whether mother_re be set or no, we need to copy the string. We
	15322	cannot refrain from copying it when the storage points directly to
	15323	our mother regexp, because that's
	15324	1: a buffer in a different thread
	15325	2: something we no longer hold a reference on
	15326	so we need to copy it locally. */
	15327	RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
	15328	ret->mother_re = NULL;
	15329	}
	15330	#endif /* PERL_IN_XSUB_RE */
	15331
	15332	/*
	15333	regdupe_internal()
	15334
	15335	This is the internal complement to regdupe() which is used to copy
	15336	the structure pointed to by the *pprivate pointer in the regexp.
	15337	This is the core version of the extension overridable cloning hook.
	15338	The regexp structure being duplicated will be copied by perl prior
	15339	to this and will be provided as the regexp *r argument, however
	15340	with the /old/ structures pprivate pointer value. Thus this routine
	15341	may override any copying normally done by perl.
	15342
	15343	It returns a pointer to the new regexp_internal structure.
	15344	*/
	15345
	15346	void *
	15347	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	15348	{
	15349	dVAR;
	15350	struct regexp *const r = ReANY(rx);
	15351	regexp_internal *reti;
	15352	int len;
	15353	RXi_GET_DECL(r,ri);
	15354
	15355	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	15356
	15357	len = ProgLen(ri);
	15358
	15359	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	15360	Copy(ri->program, reti->program, len+1, regnode);
	15361
	15362	reti->num_code_blocks = ri->num_code_blocks;
	15363	if (ri->code_blocks) {
	15364	int n;
	15365	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
	15366	struct reg_code_block);
	15367	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
	15368	struct reg_code_block);
	15369	for (n = 0; n < ri->num_code_blocks; n++)
	15370	reti->code_blocks[n].src_regex = (REGEXP*)
	15371	sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
	15372	}
	15373	else
	15374	reti->code_blocks = NULL;
	15375
	15376	reti->regstclass = NULL;
	15377
	15378	if (ri->data) {
	15379	struct reg_data *d;
	15380	const int count = ri->data->count;
	15381	int i;
	15382
	15383	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	15384	char, struct reg_data);
	15385	Newx(d->what, count, U8);
	15386
	15387	d->count = count;
	15388	for (i = 0; i < count; i++) {
	15389	d->what[i] = ri->data->what[i];
	15390	switch (d->what[i]) {
	15391	/* see also regcomp.h and regfree_internal() */
	15392	case 'a': /* actually an AV, but the dup function is identical. */
	15393	case 'r':
	15394	case 's':
	15395	case 'S':
	15396	case 'u': /* actually an HV, but the dup function is identical. */
	15397	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	15398	break;
	15399	case 'f':
	15400	/* This is cheating. */
	15401	Newx(d->data[i], 1, struct regnode_charclass_class);
	15402	StructCopy(ri->data->data[i], d->data[i],
	15403	struct regnode_charclass_class);
	15404	reti->regstclass = (regnode*)d->data[i];
	15405	break;
	15406	case 'T':
	15407	/* Trie stclasses are readonly and can thus be shared
	15408	* without duplication. We free the stclass in pregfree
	15409	* when the corresponding reg_ac_data struct is freed.
	15410	*/
	15411	reti->regstclass= ri->regstclass;
	15412	/* Fall through */
	15413	case 't':
	15414	OP_REFCNT_LOCK;
	15415	((reg_trie_data*)ri->data->data[i])->refcount++;
	15416	OP_REFCNT_UNLOCK;
	15417	/* Fall through */
	15418	case 'l':
	15419	case 'L':
	15420	d->data[i] = ri->data->data[i];
	15421	break;
	15422	default:
	15423	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	15424	}
	15425	}
	15426
	15427	reti->data = d;
	15428	}
	15429	else
	15430	reti->data = NULL;
	15431
	15432	reti->name_list_idx = ri->name_list_idx;
	15433
	15434	#ifdef RE_TRACK_PATTERN_OFFSETS
	15435	if (ri->u.offsets) {
	15436	Newx(reti->u.offsets, 2*len+1, U32);
	15437	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	15438	}
	15439	#else
	15440	SetProgLen(reti,len);
	15441	#endif
	15442
	15443	return (void*)reti;
	15444	}
	15445
	15446	#endif /* USE_ITHREADS */
	15447
	15448	#ifndef PERL_IN_XSUB_RE
	15449
	15450	/*
	15451	- regnext - dig the "next" pointer out of a node
	15452	*/
	15453	regnode *
	15454	Perl_regnext(pTHX_ regnode *p)
	15455	{
	15456	dVAR;
	15457	I32 offset;
	15458
	15459	if (!p)
	15460	return(NULL);
	15461
	15462	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	15463	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	15464	}
	15465
	15466	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	15467	if (offset == 0)
	15468	return(NULL);
	15469
	15470	return(p+offset);
	15471	}
	15472	#endif
	15473
	15474	STATIC void
	15475	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	15476	{
	15477	va_list args;
	15478	STRLEN l1 = strlen(pat1);
	15479	STRLEN l2 = strlen(pat2);
	15480	char buf[512];
	15481	SV *msv;
	15482	const char *message;
	15483
	15484	PERL_ARGS_ASSERT_RE_CROAK2;
	15485
	15486	if (l1 > 510)
	15487	l1 = 510;
	15488	if (l1 + l2 > 510)
	15489	l2 = 510 - l1;
	15490	Copy(pat1, buf, l1 , char);
	15491	Copy(pat2, buf + l1, l2 , char);
	15492	buf[l1 + l2] = '\n';
	15493	buf[l1 + l2 + 1] = '\0';
	15494	#ifdef I_STDARG
	15495	/* ANSI variant takes additional second argument */
	15496	va_start(args, pat2);
	15497	#else
	15498	va_start(args);
	15499	#endif
	15500	msv = vmess(buf, &args);
	15501	va_end(args);
	15502	message = SvPV_const(msv,l1);
	15503	if (l1 > 512)
	15504	l1 = 512;
	15505	Copy(message, buf, l1 , char);
	15506	buf[l1-1] = '\0'; /* Overwrite \n */
	15507	Perl_croak(aTHX_ "%s", buf);
	15508	}
	15509
	15510	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	15511
	15512	#ifndef PERL_IN_XSUB_RE
	15513	void
	15514	Perl_save_re_context(pTHX)
	15515	{
	15516	dVAR;
	15517
	15518	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	15519	if (PL_curpm) {
	15520	const REGEXP * const rx = PM_GETRE(PL_curpm);
	15521	if (rx) {
	15522	U32 i;
	15523	for (i = 1; i <= RX_NPARENS(rx); i++) {
	15524	char digits[TYPE_CHARS(long)];
	15525	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	15526	GV const const gvp
	15527	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	15528
	15529	if (gvp) {
	15530	GV * const gv = *gvp;
	15531	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	15532	save_scalar(gv);
	15533	}
	15534	}
	15535	}
	15536	}
	15537	}
	15538	#endif
	15539
	15540	#ifdef DEBUGGING
	15541
	15542	STATIC void
	15543	S_put_byte(pTHX_ SV *sv, int c)
	15544	{
	15545	PERL_ARGS_ASSERT_PUT_BYTE;
	15546
	15547	/* Our definition of isPRINT() ignores locales, so only bytes that are
	15548	not part of UTF-8 are considered printable. I assume that the same
	15549	holds for UTF-EBCDIC.
	15550	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	15551	which Wikipedia says:
	15552
	15553	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	15554	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	15555	identical, to the ASCII delete (DEL) or rubout control character. ...
	15556	it is typically mapped to hexadecimal code 9F, in order to provide a
	15557	unique character mapping in both directions)
	15558
	15559	So the old condition can be simplified to !isPRINT(c) */
	15560	if (!isPRINT(c)) {
	15561	switch (c) {
	15562	case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
	15563	case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
	15564	case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
	15565	case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
	15566	case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
	15567
	15568	default:
	15569	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	15570	break;
	15571	}
	15572	}
	15573	else {
	15574	const char string = c;
	15575	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	15576	sv_catpvs(sv, "\\");
	15577	sv_catpvn(sv, &string, 1);
	15578	}
	15579	}
	15580
	15581	STATIC bool
	15582	S_put_latin1_charclass_innards(pTHX_ SV sv, char bitmap)
	15583	{
	15584	/* Appends to 'sv' a displayable version of the innards of the bracketed
	15585	* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
	15586	* output anything */
	15587
	15588	int i;
	15589	int rangestart = -1;
	15590	bool has_output_anything = FALSE;
	15591
	15592	PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
	15593
	15594	for (i = 0; i <= 256; i++) {
	15595	if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
	15596	if (rangestart == -1)
	15597	rangestart = i;
	15598	} else if (rangestart != -1) {
	15599	int j = i - 1;
	15600	if (i <= rangestart + 3) { /* Individual chars in short ranges */
	15601	for (; rangestart < i; rangestart++)
	15602	put_byte(sv, rangestart);
	15603	}
	15604	else if ( j > 255
	15605	\|\| ! isALPHANUMERIC(rangestart)
	15606	\|\| ! isALPHANUMERIC(j)
	15607	\|\| isDIGIT(rangestart) != isDIGIT(j)
	15608	\|\| isUPPER(rangestart) != isUPPER(j)
	15609	\|\| isLOWER(rangestart) != isLOWER(j)
	15610
	15611	/* This final test should get optimized out except
	15612	* on EBCDIC platforms, where it causes ranges that
	15613	* cross discontinuities like i/j to be shown as hex
	15614	* instead of the misleading, e.g. H-K (since that
	15615	* range includes more than H, I, J, K). */
	15616	\|\| (j - rangestart)
	15617	!= NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
	15618	{
	15619	Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
	15620	rangestart,
	15621	(j < 256) ? j : 255);
	15622	}
	15623	else { /* Here, the ends of the range are both digits, or both
	15624	uppercase, or both lowercase; and there's no
	15625	discontinuity in the range (which could happen on EBCDIC
	15626	platforms) */
	15627	put_byte(sv, rangestart);
	15628	sv_catpvs(sv, "-");
	15629	put_byte(sv, j);
	15630	}
	15631	rangestart = -1;
	15632	has_output_anything = TRUE;
	15633	}
	15634	}
	15635
	15636	return has_output_anything;
	15637	}
	15638
	15639	#define CLEAR_OPTSTART \
	15640	if (optstart) STMT_START { \
	15641	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	15642	optstart=NULL; \
	15643	} STMT_END
	15644
	15645	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	15646
	15647	STATIC const regnode *
	15648	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	15649	const regnode last, const regnode plast,
	15650	SV* sv, I32 indent, U32 depth)
	15651	{
	15652	dVAR;
	15653	U8 op = PSEUDO; /* Arbitrary non-END op. */
	15654	const regnode *next;
	15655	const regnode *optstart= NULL;
	15656
	15657	RXi_GET_DECL(r,ri);
	15658	GET_RE_DEBUG_FLAGS_DECL;
	15659
	15660	PERL_ARGS_ASSERT_DUMPUNTIL;
	15661
	15662	#ifdef DEBUG_DUMPUNTIL
	15663	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	15664	last ? last-start : 0,plast ? plast-start : 0);
	15665	#endif
	15666
	15667	if (plast && plast < last)
	15668	last= plast;
	15669
	15670	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	15671	/* While that wasn't END last time... */
	15672	NODE_ALIGN(node);
	15673	op = OP(node);
	15674	if (op == CLOSE \|\| op == WHILEM)
	15675	indent--;
	15676	next = regnext((regnode *)node);
	15677
	15678	/* Where, what. */
	15679	if (OP(node) == OPTIMIZED) {
	15680	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	15681	optstart = node;
	15682	else
	15683	goto after_print;
	15684	} else
	15685	CLEAR_OPTSTART;
	15686
	15687	regprop(r, sv, node);
	15688	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	15689	(int)(2*indent + 1), "", SvPVX_const(sv));
	15690
	15691	if (OP(node) != OPTIMIZED) {
	15692	if (next == NULL) /* Next ptr. */
	15693	PerlIO_printf(Perl_debug_log, " (0)");
	15694	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	15695	PerlIO_printf(Perl_debug_log, " (FAIL)");
	15696	else
	15697	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	15698	(void)PerlIO_putc(Perl_debug_log, '\n');
	15699	}
	15700
	15701	after_print:
	15702	if (PL_regkind[(U8)op] == BRANCHJ) {
	15703	assert(next);
	15704	{
	15705	const regnode *nnode = (OP(next) == LONGJMP
	15706	? regnext((regnode *)next)
	15707	: next);
	15708	if (last && nnode > last)
	15709	nnode = last;
	15710	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	15711	}
	15712	}
	15713	else if (PL_regkind[(U8)op] == BRANCH) {
	15714	assert(next);
	15715	DUMPUNTIL(NEXTOPER(node), next);
	15716	}
	15717	else if ( PL_regkind[(U8)op] == TRIE ) {
	15718	const regnode *this_trie = node;
	15719	const char op = OP(node);
	15720	const U32 n = ARG(node);
	15721	const reg_ac_data * const ac = op>=AHOCORASICK ?
	15722	(reg_ac_data *)ri->data->data[n] :
	15723	NULL;
	15724	const reg_trie_data * const trie =
	15725	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	15726	#ifdef DEBUGGING
	15727	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	15728	#endif
	15729	const regnode *nextbranch= NULL;
	15730	I32 word_idx;
	15731	sv_setpvs(sv, "");
	15732	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	15733	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	15734
	15735	PerlIO_printf(Perl_debug_log, "%*s%s ",
	15736	(int)(2*(indent+3)), "",
	15737	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	15738	PL_colors[0], PL_colors[1],
	15739	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	15740	PERL_PV_PRETTY_ELLIPSES \|
	15741	PERL_PV_PRETTY_LTGT
	15742	)
	15743	: "???"
	15744	);
	15745	if (trie->jump) {
	15746	U16 dist= trie->jump[word_idx+1];
	15747	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	15748	(UV)((dist ? this_trie + dist : next) - start));
	15749	if (dist) {
	15750	if (!nextbranch)
	15751	nextbranch= this_trie + trie->jump[0];
	15752	DUMPUNTIL(this_trie + dist, nextbranch);
	15753	}
	15754	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	15755	nextbranch= regnext((regnode *)nextbranch);
	15756	} else {
	15757	PerlIO_printf(Perl_debug_log, "\n");
	15758	}
	15759	}
	15760	if (last && next > last)
	15761	node= last;
	15762	else
	15763	node= next;
	15764	}
	15765	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	15766	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	15767	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	15768	}
	15769	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	15770	assert(next);
	15771	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	15772	}
	15773	else if ( op == PLUS \|\| op == STAR) {
	15774	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	15775	}
	15776	else if (PL_regkind[(U8)op] == ANYOF) {
	15777	/* arglen 1 + class block */
	15778	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	15779	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	15780	node = NEXTOPER(node);
	15781	}
	15782	else if (PL_regkind[(U8)op] == EXACT) {
	15783	/* Literal string, where present. */
	15784	node += NODE_SZ_STR(node) - 1;
	15785	node = NEXTOPER(node);
	15786	}
	15787	else {
	15788	node = NEXTOPER(node);
	15789	node += regarglen[(U8)op];
	15790	}
	15791	if (op == CURLYX \|\| op == OPEN)
	15792	indent++;
	15793	}
	15794	CLEAR_OPTSTART;
	15795	#ifdef DEBUG_DUMPUNTIL
	15796	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	15797	#endif
	15798	return node;
	15799	}
	15800
	15801	#endif /* DEBUGGING */
	15802
	15803	/*
	15804	* Local variables:
	15805	* c-indentation-style: bsd
	15806	* c-basic-offset: 4
	15807	* indent-tabs-mode: nil
	15808	* End:
	15809	*
	15810	* ex: set ts=8 sts=4 sw=4 et:
	15811	*/