perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regcomp.c
	2	*/
	3
	4	/*
	5	* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
	6	*
	7	* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
	8	*/
	9
	10	/* This file contains functions for compiling a regular expression. See
	11	* also regexec.c which funnily enough, contains functions for executing
	12	* a regular expression.
	13	*
	14	* This file is also copied at build time to ext/re/re_comp.c, where
	15	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	16	* This causes the main functions to be compiled under new names and with
	17	* debugging support added, which makes "use re 'debug'" work.
	18	*/
	19
	20	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	21	* be confused with the original package (see point 3 below). Thanks, Henry!
	22	*/
	23
	24	/* Additional note: this code is very heavily munged from Henry's version
	25	* in places. In some spots I've traded clarity for efficiency, so don't
	26	* blame Henry for some of the lack of readability.
	27	*/
	28
	29	/* The names of the functions have been changed from regcomp and
	30	* regexec to pregcomp and pregexec in order to avoid conflicts
	31	* with the POSIX routines of the same names.
	32	*/
	33
	34	#ifdef PERL_EXT_RE_BUILD
	35	#include "re_top.h"
	36	#endif
	37
	38	/*
	39	* pregcomp and pregexec -- regsub and regerror are not used in perl
	40	*
	41	* Copyright (c) 1986 by University of Toronto.
	42	* Written by Henry Spencer. Not derived from licensed software.
	43	*
	44	* Permission is granted to anyone to use this software for any
	45	* purpose on any computer system, and to redistribute it freely,
	46	* subject to the following restrictions:
	47	*
	48	* 1. The author is not responsible for the consequences of use of
	49	* this software, no matter how awful, even if they arise
	50	* from defects in it.
	51	*
	52	* 2. The origin of this software must not be misrepresented, either
	53	* by explicit claim or by omission.
	54	*
	55	* 3. Altered versions must be plainly marked as such, and must not
	56	* be misrepresented as being the original software.
	57	*
	58	*
	59	**** Alterations to Henry's code are...
	60	****
	61	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	62	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	63	**** by Larry Wall and others
	64	****
	65	**** You may distribute under the terms of either the GNU General Public
	66	**** License or the Artistic License, as specified in the README file.
	67
	68	*
	69	* Beware that some of this code is subtly aware of the way operator
	70	* precedence is structured in regular expressions. Serious changes in
	71	* regular-expression syntax might require a total rethink.
	72	*/
	73	#include "EXTERN.h"
	74	#define PERL_IN_REGCOMP_C
	75	#include "perl.h"
	76
	77	#ifndef PERL_IN_XSUB_RE
	78	# include "INTERN.h"
	79	#endif
	80
	81	#define REG_COMP_C
	82	#ifdef PERL_IN_XSUB_RE
	83	# include "re_comp.h"
	84	extern const struct regexp_engine my_reg_engine;
	85	#else
	86	# include "regcomp.h"
	87	#endif
	88
	89	#include "dquote_static.c"
	90	#include "charclass_invlists.h"
	91	#include "inline_invlist.c"
	92	#include "unicode_constants.h"
	93
	94	#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) /* short local alias for the long restricted-use macro name */
	95	#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) /* short local alias for the long restricted-use macro name */
	96
	97	#ifdef op
	98	#undef op
	99	#endif /* op */
	100
	101	#ifdef MSDOS
	102	# if defined(BUGGY_MSC6)
	103	/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
	104	# pragma optimize("a",off)
	105	/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
	106	# pragma optimize("w",on )
	107	# endif /* BUGGY_MSC6 */
	108	#endif /* MSDOS */
	109
	110	#ifndef STATIC
	111	#define STATIC static
	112	#endif
	113
	114	/* Per-compilation parser/emitter state; members are reached through the RExC_* macros, which expect a variable named pRExC_state. */
	115	typedef struct RExC_state_t {
	116	    U32 flags; /* RXf_* are we folding, multilining? */
	117	    U32 pm_flags; /* PMf_* stuff from the calling PMOP */
	118	    char *precomp; /* uncompiled string. */
	119	    REGEXP *rx_sv; /* The SV that is the regexp. */
	120	    regexp *rx; /* perl core regexp structure */
	121	    regexp_internal *rxi; /* internal data for regexp object pprivate field */
	122	    char *start; /* Start of input for compile */
	123	    char *end; /* End of input for compile */
	124	    char *parse; /* Input-scan pointer. */
	125	    I32 whilem_seen; /* number of WHILEM in this expr */
	126	    regnode *emit_start; /* Start of emitted-code area */
	127	    regnode *emit_bound; /* First regnode outside of the allocated space */
	128	    regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
	129	    I32 naughty; /* How bad is this pattern? */
	130	    I32 sawback; /* Did we see \1, ...? */
	131	    U32 seen;
	132	    I32 size; /* Code size. */
	133	    I32 npar; /* Capture buffer count, (OPEN). */
	134	    I32 cpar; /* Capture buffer count, (CLOSE). */
	135	    I32 nestroot; /* root parens we are in - used by accept */
	136	    I32 extralen;
	137	    I32 seen_zerolen;
	138	    regnode **open_parens; /* pointers to open parens */
	139	    regnode **close_parens; /* pointers to close parens */
	140	    regnode *opend; /* END node in program */
	141	    I32 utf8; /* whether the pattern is utf8 or not */
	142	    I32 orig_utf8; /* whether the pattern was originally in utf8 */
	143	    /* XXX use this for future optimisation of case
	144	    * where pattern must be upgraded to utf8. */
	145	    I32 uni_semantics; /* If a d charset modifier should use unicode
	146	    rules, even if the pattern is not in
	147	    utf8 */
	148	    HV *paren_names; /* Paren names */
	149
	150	    regnode **recurse; /* Recurse regops */
	151	    I32 recurse_count; /* Number of recurse regops */
	152	    I32 in_lookbehind;
	153	    I32 contains_locale;
	154	    I32 override_recoding;
	155	    struct reg_code_block *code_blocks; /* positions of literal (?{})
	156	    within pattern */
	157	    int num_code_blocks; /* size of code_blocks[] */
	158	    int code_index; /* next code_blocks[] slot */
	159	#if ADD_TO_REGEXEC
	160	    char *starttry; /* -Dr: where regtry was called. */
	161	#define RExC_starttry (pRExC_state->starttry)
	162	#endif
	163	    SV *runtime_code_qr; /* qr with the runtime code blocks */
	164	#ifdef DEBUGGING
	165	    const char *lastparse;
	166	    I32 lastnum;
	167	    AV *paren_name_list; /* idx -> name */
	168	#define RExC_lastparse (pRExC_state->lastparse)
	169	#define RExC_lastnum (pRExC_state->lastnum)
	170	#define RExC_paren_name_list (pRExC_state->paren_name_list)
	171	#endif
	172	} RExC_state_t;
	173	/* Shorthand accessors for RExC_state_t members; each expands to a member of whatever pRExC_state names at the point of use. */
	174	#define RExC_flags (pRExC_state->flags)
	175	#define RExC_pm_flags (pRExC_state->pm_flags)
	176	#define RExC_precomp (pRExC_state->precomp)
	177	#define RExC_rx_sv (pRExC_state->rx_sv)
	178	#define RExC_rx (pRExC_state->rx)
	179	#define RExC_rxi (pRExC_state->rxi)
	180	#define RExC_start (pRExC_state->start)
	181	#define RExC_end (pRExC_state->end)
	182	#define RExC_parse (pRExC_state->parse)
	183	#define RExC_whilem_seen (pRExC_state->whilem_seen)
	184	#ifdef RE_TRACK_PATTERN_OFFSETS
	185	#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
	186	#endif
	187	#define RExC_emit (pRExC_state->emit)
	188	#define RExC_emit_start (pRExC_state->emit_start)
	189	#define RExC_emit_bound (pRExC_state->emit_bound)
	190	#define RExC_naughty (pRExC_state->naughty)
	191	#define RExC_sawback (pRExC_state->sawback)
	192	#define RExC_seen (pRExC_state->seen)
	193	#define RExC_size (pRExC_state->size)
	194	#define RExC_npar (pRExC_state->npar)
	195	#define RExC_nestroot (pRExC_state->nestroot)
	196	#define RExC_extralen (pRExC_state->extralen)
	197	#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
	198	#define RExC_utf8 (pRExC_state->utf8)
	199	#define RExC_uni_semantics (pRExC_state->uni_semantics)
	200	#define RExC_orig_utf8 (pRExC_state->orig_utf8)
	201	#define RExC_open_parens (pRExC_state->open_parens)
	202	#define RExC_close_parens (pRExC_state->close_parens)
	203	#define RExC_opend (pRExC_state->opend)
	204	#define RExC_paren_names (pRExC_state->paren_names)
	205	#define RExC_recurse (pRExC_state->recurse)
	206	#define RExC_recurse_count (pRExC_state->recurse_count)
	207	#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
	208	#define RExC_contains_locale (pRExC_state->contains_locale)
	209	#define RExC_override_recoding (pRExC_state->override_recoding)
	210
	211
	212	#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') /* is the character c one of the quantifiers '*', '+', '?' */
	213	#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
	214	((*s) == '{' && regcurly(s))) /* does s point at '*', '+', '?', or at a '{' for which regcurly(s) is true */
	215
	216	#ifdef SPSTART
	217	#undef SPSTART /* dratted cpp namespace... */
	218	#endif
	219	/*
	220	* Flags to be passed up and down.
	221	*/
	222	#define WORST 0 /* Worst case. */
	223	#define HASWIDTH 0x01 /* Known to match non-null strings. */
	224
	225	/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
	226	* character, and if utf8, must be invariant. Note that this is not the same
	227	* thing as REGNODE_SIMPLE */
	228	#define SIMPLE 0x02
	229	#define SPSTART 0x04 /* Starts with * or +. */
	230	#define TRYAGAIN 0x08 /* Weeded out a declaration. */
	231	#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
	232
	233	#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) /* distance of node x from RExC_emit_start as an int; -1 when x is NULL */
	234
	235	/* whether trie related optimizations are enabled */
	236	#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
	237	#define TRIE_STUDY_OPT
	238	#define FULL_TRIE_STUDY
	239	#define TRIE_STCLASS
	240	#endif
	241
	242
	243
	244	#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] /* byte of the bit string u8str that holds paren's bit (8 bits per byte) */
	245	#define PBITVAL(paren) (1 << ((paren) & 7)) /* mask selecting paren's bit within that byte */
	246	#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) /* non-zero when paren's bit is set */
	247	#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) /* set paren's bit */
	248	#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) /* clear paren's bit */
	249
	250	/* If not already in utf8, do a longjmp back to the beginning */
	251	#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
	252	#define REQUIRE_UTF8 STMT_START { \
	253	if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
	254	} STMT_END
	255
	256	/* About scan_data_t.
	257
	258	During optimisation we recurse through the regexp program performing
	259	various inplace (keyhole style) optimisations. In addition study_chunk
	260	and scan_commit populate this data structure with information about
	261	what strings MUST appear in the pattern. We look for the longest
	262	string that must appear at a fixed location, and we look for the
	263	longest string that may appear at a floating location. So for instance
	264	in the pattern:
	265
	266	/FOO[xX]A.*B[xX]BAR/
	267
	268	Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
	269	strings (because they follow a .* construct). study_chunk will identify
	270	both FOO and BAR as being the longest fixed and floating strings respectively.
	271
	272	The strings can be composites, for instance
	273
	274	/(f)(o)(o)/
	275
	276	will result in a composite fixed substring 'foo'.
	277
	278	For each string some basic information is maintained:
	279
	280	- offset or min_offset
	281	This is the position the string must appear at, or not before.
	282	It also implicitly (when combined with minlenp) tells us how many
	283	characters must match before the string we are searching for.
	284	Likewise when combined with minlenp and the length of the string it
	285	tells us how many characters must appear after the string we have
	286	found.
	287
	288	- max_offset
	289	Only used for floating strings. This is the rightmost point that
	290	the string can appear at. If set to I32 max it indicates that the
	291	string can occur infinitely far to the right.
	292
	293	- minlenp
	294	A pointer to the minimum length of the pattern that the string
	295	was found inside. This is important as in the case of positive
	296	lookahead or positive lookbehind we can have multiple patterns
	297	involved. Consider
	298
	299	/(?=FOO).*F/
	300
	301	The minimum length of the pattern overall is 3, the minimum length
	302	of the lookahead part is 3, but the minimum length of the part that
	303	will actually match is 1. So 'FOO's minimum length is 3, but the
	304	minimum length for the F is 1. This is important as the minimum length
	305	is used to determine offsets in front of and behind the string being
	306	looked for. Since strings can be composites this is the length of the
	307	pattern at the time it was committed with a scan_commit. Note that
	308	the length is calculated by study_chunk, so that the minimum lengths
	309	are not known until the full pattern has been compiled, thus the
	310	pointer to the value.
	311
	312	- lookbehind
	313
	314	In the case of lookbehind the string being searched for can be
	315	offset past the start point of the final matching string.
	316	If this value was just blithely removed from the min_offset it would
	317	invalidate some of the calculations for how many chars must match
	318	before or after (as they are derived from min_offset and minlen and
	319	the length of the string being searched for).
	320	When the final pattern is compiled and the data is moved from the
	321	scan_data_t structure into the regexp structure the information
	322	about lookbehind is factored in, with the information that would
	323	have been lost precalculated in the end_shift field for the
	324	associated string.
	325
	326	The fields pos_min and pos_delta are used to store the minimum offset
	327	and the delta to the maximum offset at the current point in the pattern.
	328
	329	*/
	330
	331	typedef struct scan_data_t {
	332	    /*I32 len_min; unused */
	333	    /*I32 len_delta; unused */
	334	    I32 pos_min;
	335	    I32 pos_delta;
	336	    SV *last_found;
	337	    I32 last_end; /* min value, <0 unless valid. */
	338	    I32 last_start_min;
	339	    I32 last_start_max;
	340	    SV **longest; /* Either &l_fixed, or &l_float. */
	341	    SV *longest_fixed; /* longest fixed string found in pattern */
	342	    I32 offset_fixed; /* offset where it starts */
	343	    I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
	344	    I32 lookbehind_fixed; /* is the position of the string modified by LB */
	345	    SV *longest_float; /* longest floating string found in pattern */
	346	    I32 offset_float_min; /* earliest point in string it can appear */
	347	    I32 offset_float_max; /* latest point in string it can appear */
	348	    I32 *minlen_float; /* pointer to the minlen relevant to the string */
	349	    I32 lookbehind_float; /* is the position of the string modified by LB */
	350	    I32 flags;
	351	    I32 whilem_c;
	352	    I32 *last_closep;
	353	    struct regnode_charclass_class *start_class;
	354	} scan_data_t;
	355
	356	/*
	357	* Forward declarations for pregcomp()'s friends.
	358	*/
	359
	360	static const scan_data_t zero_scan_data =
	361	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
	362	/* SF_* and SCF_* bit flags. The SF_*BEFORE_* bits are tested and set on scan_data_t.flags by scan_commit(). */
	363	#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
	364	#define SF_BEFORE_SEOL 0x0001
	365	#define SF_BEFORE_MEOL 0x0002
	366	#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
	367	#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
	368
	369	#ifdef NO_UNARY_PLUS
	370	# define SF_FIX_SHIFT_EOL (0+2)
	371	# define SF_FL_SHIFT_EOL (0+4)
	372	#else
	373	# define SF_FIX_SHIFT_EOL (+2)
	374	# define SF_FL_SHIFT_EOL (+4)
	375	#endif
	376
	377	#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
	378	#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
	379
	380	#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
	381	#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
	382	#define SF_IS_INF 0x0040
	383	#define SF_HAS_PAR 0x0080
	384	#define SF_IN_PAR 0x0100
	385	#define SF_HAS_EVAL 0x0200
	386	#define SCF_DO_SUBSTR 0x0400
	387	#define SCF_DO_STCLASS_AND 0x0800
	388	#define SCF_DO_STCLASS_OR 0x1000
	389	#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
	390	#define SCF_WHILEM_VISITED_POS 0x2000
	391
	392	#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
	393	#define SCF_SEEN_ACCEPT 0x8000
	394
	395	#define UTF cBOOL(RExC_utf8)
	396
	397	/* The enums for all these are ordered so things work out correctly */
	398	#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
	399	#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
	400	#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
	401	#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
	402	#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
	403	#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
	404	#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	405
	406	#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
	407
	408	#define OOB_NAMEDCLASS -1
	409
	410	/* There is no code point that is out-of-bounds, so this is problematic. But
	411	* its only current use is to initialize a variable that is always set before
	412	* looked at. */
	413	#define OOB_UNICODE 0xDEADBEEF
	414
	415	#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) /* sv_len_utf8(sv) when UTF, else SvCUR(sv) */
	416	#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b)) /* utf8_distance(a,b) when UTF, else the plain difference; arguments parenthesized so expression arguments group correctly */
	417
	418
	419	/* length of regex to show in messages that don't mark a position within */
	420	#define RegexLengthToShowInErrorMessages 127
	421
	422	/*
	423	* If MARKER[12] are adjusted, be sure to adjust the constants at the top
	424	* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
	425	* op/pragma/warn/regcomp.
	426	*/
	427	#define MARKER1 "<-- HERE" /* marker as it appears in the description */
	428	#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
	429
	430	#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
	431
	432	/*
	433	* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
	434	* arg. Show regex, up to a maximum length. If it's too long, chop and add
	435	* "...". The 'code' argument is expanded after the locals 'len' and 'ellipses' are set up, so it may refer to them (as FAIL and FAIL2 do).
	436	*/
	437	#define _FAIL(code) STMT_START { \
	438	const char *ellipses = ""; \
	439	IV len = RExC_end - RExC_precomp; \
	440	\
	441	if (!SIZE_ONLY) \
	442	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	443	if (len > RegexLengthToShowInErrorMessages) { \
	444	/* chop 10 shorter than the max, to ensure meaning of "..." */ \
	445	len = RegexLengthToShowInErrorMessages - 10; \
	446	ellipses = "..."; \
	447	} \
	448	code; \
	449	} STMT_END
	450
	451	#define FAIL(msg) _FAIL( \
	452	Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
	453	msg, (int)len, RExC_precomp, ellipses))
	454
	455	#define FAIL2(msg,arg) _FAIL( \
	456	Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
	457	arg, (int)len, RExC_precomp, ellipses))
	458
	459	/*
	460	* Simple_vFAIL -- like FAIL, but marks the current location in the scan
	461	*/
	462	#define Simple_vFAIL(m) STMT_START { \
	463	const IV offset = RExC_parse - RExC_precomp; \
	464	Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
	465	m, (int)offset, RExC_precomp, RExC_precomp + offset); \
	466	} STMT_END
	467
	468	/*
	469	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
	470	*/
	471	#define vFAIL(m) STMT_START { \
	472	if (!SIZE_ONLY) \
	473	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	474	Simple_vFAIL(m); \
	475	} STMT_END
	476
	477	/*
	478	* Like Simple_vFAIL(), but accepts two arguments.
	479	*/
	480	#define Simple_vFAIL2(m,a1) STMT_START { \
	481	const IV offset = RExC_parse - RExC_precomp; \
	482	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
	483	(int)offset, RExC_precomp, RExC_precomp + offset); \
	484	} STMT_END
	485
	486	/*
	487	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
	488	*/
	489	#define vFAIL2(m,a1) STMT_START { \
	490	if (!SIZE_ONLY) \
	491	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	492	Simple_vFAIL2(m, a1); \
	493	} STMT_END
	494
	495
	496	/*
	497	* Like Simple_vFAIL(), but accepts three arguments.
	498	*/
	499	#define Simple_vFAIL3(m, a1, a2) STMT_START { \
	500	const IV offset = RExC_parse - RExC_precomp; \
	501	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
	502	(int)offset, RExC_precomp, RExC_precomp + offset); \
	503	} STMT_END
	504
	505	/*
	506	* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
	507	*/
	508	#define vFAIL3(m,a1,a2) STMT_START { \
	509	if (!SIZE_ONLY) \
	510	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
	511	Simple_vFAIL3(m, a1, a2); \
	512	} STMT_END
	513
	514	/*
	515	* Like Simple_vFAIL(), but accepts four arguments.
	516	*/
	517	#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
	518	const IV offset = RExC_parse - RExC_precomp; \
	519	S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
	520	(int)offset, RExC_precomp, RExC_precomp + offset); \
	521	} STMT_END
	522	/* Warning helpers: each formats message m (plus any extra arguments) followed by REPORT_LOCATION, marking position loc within the pattern. ck* variants call Perl_ck_warner (or Perl_ck_warner_d for the *dep forms); vWARN* variants call Perl_warner. */
	523	#define ckWARNreg(loc,m) STMT_START { \
	524	const IV offset = (loc) - RExC_precomp; \
	525	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	526	(int)offset, RExC_precomp, RExC_precomp + offset); \
	527	} STMT_END
	528
	529	#define ckWARNregdep(loc,m) STMT_START { \
	530	const IV offset = (loc) - RExC_precomp; \
	531	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	532	m REPORT_LOCATION, \
	533	(int)offset, RExC_precomp, RExC_precomp + offset); \
	534	} STMT_END
	535
	536	#define ckWARN2regdep(loc,m, a1) STMT_START { \
	537	const IV offset = (loc) - RExC_precomp; \
	538	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
	539	m REPORT_LOCATION, \
	540	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	541	} STMT_END
	542
	543	#define ckWARN2reg(loc, m, a1) STMT_START { \
	544	const IV offset = (loc) - RExC_precomp; \
	545	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	546	a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
	547	} STMT_END
	548
	549	#define vWARN3(loc, m, a1, a2) STMT_START { \
	550	const IV offset = (loc) - RExC_precomp; \
	551	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	552	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	553	} STMT_END
	554
	555	#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
	556	const IV offset = (loc) - RExC_precomp; \
	557	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	558	a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
	559	} STMT_END
	560
	561	#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
	562	const IV offset = (loc) - RExC_precomp; \
	563	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	564	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	565	} STMT_END
	566
	567	#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
	568	const IV offset = (loc) - RExC_precomp; \
	569	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	570	a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
	571	} STMT_END
	572
	573	#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
	574	const IV offset = (loc) - RExC_precomp; \
	575	Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
	576	a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
	577	} STMT_END
	578
	579
	580	/* Allow for side effects in s */
	581	#define REGC(c,s) STMT_START { \
	582	if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
	583	} STMT_END
	584
	585	/* Macros for recording node offsets. 20001227 mjd@plover.com
	586	* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
	587	* element 2*n-1 of the array. Element #2n holds the byte length of node #n.
	588	* Element 0 holds the number n.
	589	* Position is 1 indexed.
	590	*/
	591	#ifndef RE_TRACK_PATTERN_OFFSETS /* tracking off: the recording macros below expand to nothing */
	592	#define Set_Node_Offset_To_R(node,byte)
	593	#define Set_Node_Offset(node,byte)
	594	#define Set_Cur_Node_Offset
	595	#define Set_Node_Length_To_R(node,len)
	596	#define Set_Node_Length(node,len)
	597	#define Set_Node_Cur_Length(node)
	598	#define Node_Offset(n)
	599	#define Node_Length(n)
	600	#define Set_Node_Offset_Length(node,offset,len)
	601	#define ProgLen(ri) ri->u.proglen
	602	#define SetProgLen(ri,x) ri->u.proglen = x
	603	#else /* tracking on: values are stored in RExC_offsets; element 0 doubles as the program length */
	604	#define ProgLen(ri) ri->u.offsets[0]
	605	#define SetProgLen(ri,x) ri->u.offsets[0] = x
	606	#define Set_Node_Offset_To_R(node,byte) STMT_START { \
	607	if (! SIZE_ONLY) { \
	608	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
	609	__LINE__, (int)(node), (int)(byte))); \
	610	if((node) < 0) { \
	611	Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
	612	} else { \
	613	RExC_offsets[2*(node)-1] = (byte); \
	614	} \
	615	} \
	616	} STMT_END
	617
	618	#define Set_Node_Offset(node,byte) \
	619	Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
	620	#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
	621
	622	#define Set_Node_Length_To_R(node,len) STMT_START { \
	623	if (! SIZE_ONLY) { \
	624	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
	625	__LINE__, (int)(node), (int)(len))); \
	626	if((node) < 0) { \
	627	Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
	628	} else { \
	629	RExC_offsets[2*(node)] = (len); \
	630	} \
	631	} \
	632	} STMT_END
	633
	634	#define Set_Node_Length(node,len) \
	635	Set_Node_Length_To_R((node)-RExC_emit_start, len)
	636	#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) /* NOTE(review): no empty counterpart is defined in the #ifndef branch above */
	637	#define Set_Node_Cur_Length(node) \
	638	Set_Node_Length(node, RExC_parse - parse_start) /* relies on a variable named parse_start in the caller's scope */
	639
	640	/* Get offsets and lengths */
	641	#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
	642	#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
	643
	644	#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
	645	Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
	646	Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
	647	} STMT_END
	648	#endif
	649
	650	#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
	651	#define EXPERIMENTAL_INPLACESCAN
	652	#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
	653	/* Debug dump of a scan_data_t, wrapped in DEBUG_OPTIMISE_MORE_r; prints nothing when data is NULL. Reads a variable named is_inf from the enclosing scope. The expansion already ends in ';'. */
	654	#define DEBUG_STUDYDATA(str,data,depth) \
	655	DEBUG_OPTIMISE_MORE_r(if(data){ \
	656	PerlIO_printf(Perl_debug_log, \
	657	"%*s" str "Pos:%"IVdf"/%"IVdf \
	658	" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
	659	(int)(depth)*2, "", \
	660	(IV)((data)->pos_min), \
	661	(IV)((data)->pos_delta), \
	662	(UV)((data)->flags), \
	663	(IV)((data)->whilem_c), \
	664	(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
	665	is_inf ? "INF " : "" \
	666	); \
	667	if ((data)->last_found) \
	668	PerlIO_printf(Perl_debug_log, \
	669	"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
	670	" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
	671	SvPVX_const((data)->last_found), \
	672	(IV)((data)->last_end), \
	673	(IV)((data)->last_start_min), \
	674	(IV)((data)->last_start_max), \
	675	((data)->longest && \
	676	(data)->longest==&((data)->longest_fixed)) ? "*" : "", \
	677	SvPVX_const((data)->longest_fixed), \
	678	(IV)((data)->offset_fixed), \
	679	((data)->longest && \
	680	(data)->longest==&((data)->longest_float)) ? "*" : "", \
	681	SvPVX_const((data)->longest_float), \
	682	(IV)((data)->offset_float_min), \
	683	(IV)((data)->offset_float_max) \
	684	); \
	685	PerlIO_printf(Perl_debug_log,"\n"); \
	686	});
	687
	688	static void clear_re(pTHX_ void *r);
	689
	690	/* Mark that we cannot extend a found fixed substring at this point.
	691	Update the longest found anchored substring and the longest found floating substrings if needed: data->last_found replaces *data->longest when it is longer, or equally long while SF_BEFORE_EOL is set.
	692	'minlenp' is recorded as the minlen pointer of the replaced string; a true 'is_inf' forces offset_float_max to I32_MAX. On return last_found is emptied, last_end is -1 and SF_BEFORE_EOL is cleared. */
	693
	694	STATIC void
	695	S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
	696	{
	697	    const STRLEN l = CHR_SVLEN(data->last_found);
	698	    const STRLEN old_l = CHR_SVLEN(*data->longest);
	699	    GET_RE_DEBUG_FLAGS_DECL;
	700
	701	    PERL_ARGS_ASSERT_SCAN_COMMIT;
	702
	703	    if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
	704	        SvSetMagicSV(*data->longest, data->last_found);
	705	        if (*data->longest == data->longest_fixed) {
	706	            data->offset_fixed = l ? data->last_start_min : data->pos_min;
	707	            if (data->flags & SF_BEFORE_EOL)
	708	                data->flags
	709	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
	710	            else
	711	                data->flags &= ~SF_FIX_BEFORE_EOL;
	712	            data->minlen_fixed=minlenp;
	713	            data->lookbehind_fixed=0;
	714	        }
	715	        else { /* *data->longest == data->longest_float */
	716	            data->offset_float_min = l ? data->last_start_min : data->pos_min;
	717	            data->offset_float_max = (l
	718	                                      ? data->last_start_max
	719	                                      : data->pos_min + data->pos_delta);
	720	            if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) /* unbounded, or the value is negative when viewed as I32 */
	721	                data->offset_float_max = I32_MAX;
	722	            if (data->flags & SF_BEFORE_EOL)
	723	                data->flags
	724	                    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
	725	            else
	726	                data->flags &= ~SF_FL_BEFORE_EOL;
	727	            data->minlen_float=minlenp;
	728	            data->lookbehind_float=0;
	729	        }
	730	    }
	731	    SvCUR_set(data->last_found, 0);
	732	    { /* also zero the length kept in the PERL_MAGIC_utf8 magic, when present */
	733	        SV * const sv = data->last_found;
	734	        if (SvUTF8(sv) && SvMAGICAL(sv)) {
	735	            MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	736	            if (mg)
	737	                mg->mg_len = 0;
	738	        }
	739	    }
	740	    data->last_end = -1;
	741	    data->flags &= ~SF_BEFORE_EOL;
	742	    DEBUG_STUDYDATA("commit: ",data,0);
	743	}
	744
	745	/* Set 'cl' so that it can match anything (initialization): every bitmap bit is set and the flags below are assigned; the class bits are all set (with ANYOF_LOCALE) only when the regex contains locale, otherwise they are cleared. */
	746	STATIC void
	747	S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	748	{
	749	    PERL_ARGS_ASSERT_CL_ANYTHING;
	750
	751	    ANYOF_BITMAP_SETALL(cl);
	752	    cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
	753	                |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
	754
	755	    /* If any portion of the regex is to operate under locale rules,
	756	    * initialization includes it. The reason this isn't done for all regexes
	757	    * is that the optimizer was written under the assumption that locale was
	758	    * all-or-nothing. Given the complexity and lack of documentation in the
	759	    * optimizer, and that there are inadequate test cases for locale, so many
	760	    * parts of it may not work properly, it is safest to avoid locale unless
	761	    * necessary. */
	762	    if (RExC_contains_locale) {
	763	        ANYOF_CLASS_SETALL(cl); /* /l uses class */
	764	        cl->flags |= ANYOF_LOCALE;
	765	    }
	766	    else {
	767	        ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
	768	    }
	769	}
	770
	771	/* Returns 1 if 'cl' can match anything, else 0: true when some class bit and the class bit at the next index are both set, or when ANYOF_UNICODE_ALL is set and every bitmap bit is set. */
	772	STATIC int
	773	S_cl_is_anything(const struct regnode_charclass_class *cl)
	774	{
	775	int value;
	776
	777	PERL_ARGS_ASSERT_CL_IS_ANYTHING;
	778
	779	for (value = 0; value <= ANYOF_MAX; value += 2) /* classes are examined in (value, value + 1) pairs */
	780	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
	781	return 1;
	782	if (!(cl->flags & ANYOF_UNICODE_ALL))
	783	return 0;
	784	if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
	785	return 0;
	786	return 1;
	787	}
	788
	789	/* Initialize 'cl' as an ANYOF node that can match anything: zero the structure, set its type, apply cl_anything(), and set its argument to ANYOF_NONBITMAP_EMPTY. */
	790	STATIC void
	791	S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
	792	{
	793	    PERL_ARGS_ASSERT_CL_INIT;
	794
	795	    Zero(cl, 1, struct regnode_charclass_class);
	796	    cl->type = ANYOF;
	797	    cl_anything(pRExC_state, cl);
	798	    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	799	}
	800
	801	/* These two functions currently do the exact same thing */
	802	#define cl_init_zero S_cl_init
	803
	804	/* 'AND' a given class with another one. Can create false positives. 'cl'
	805	* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
	806	* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
	807	STATIC void
	808	S_cl_and(struct regnode_charclass_class *cl,
	809	const struct regnode_charclass_class *and_with)
	810	{
	811	PERL_ARGS_ASSERT_CL_AND;
	812
	813	assert(and_with->type == ANYOF);
	814
	815	/* I (khw) am not sure all these restrictions are necessary XXX */
	816	if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
	817	&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
	818	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	819	&& !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	820	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
	821	int i;
	822
	823	if (and_with->flags & ANYOF_INVERT)
	824	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	825	cl->bitmap[i] &= ~and_with->bitmap[i];
	826	else
	827	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	828	cl->bitmap[i] &= and_with->bitmap[i];
	829	} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
	830
	831	if (and_with->flags & ANYOF_INVERT) {
	832
	833	/* Here, the and'ed node is inverted. Get the AND of the flags that
	834	* aren't affected by the inversion. Those that are affected are
	835	* handled individually below */
	836	U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
	837	cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
	838	cl->flags \|= affected_flags;
	839
	840	/* We currently don't know how to deal with things that aren't in the
	841	* bitmap, but we know that the intersection is no greater than what
	842	* is already in cl, so let there be false positives that get sorted
	843	* out after the synthetic start class succeeds, and the node is
	844	* matched for real. */
	845
	846	/* The inversion of these two flags indicate that the resulting
	847	* intersection doesn't have them */
	848	if (and_with->flags & ANYOF_UNICODE_ALL) {
	849	cl->flags &= ~ANYOF_UNICODE_ALL;
	850	}
	851	if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	852	cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
	853	}
	854	}
	855	else { /* and'd node is not inverted */
	856	U8 outside_bitmap_but_not_utf8; /* Temp variable */
	857
	858	if (! ANYOF_NONBITMAP(and_with)) {
	859
	860	/* Here 'and_with' doesn't match anything outside the bitmap
	861	* (except possibly ANYOF_UNICODE_ALL), which means the
	862	* intersection can't either, except for ANYOF_UNICODE_ALL, in
	863	* which case we don't know what the intersection is, but it's no
	864	* greater than what cl already has, so can just leave it alone,
	865	* with possible false positives */
	866	if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
	867	ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
	868	cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
	869	}
	870	}
	871	else if (! ANYOF_NONBITMAP(cl)) {
	872
	873	/* Here, 'and_with' does match something outside the bitmap, and cl
	874	* doesn't have a list of things to match outside the bitmap. If
	875	* cl can match all code points above 255, the intersection will
	876	* be those above-255 code points that 'and_with' matches. If cl
	877	* can't match all Unicode code points, it means that it can't
	878	* match anything outside the bitmap (since the 'if' that got us
	879	* into this block tested for that), so we leave the bitmap empty.
	880	*/
	881	if (cl->flags & ANYOF_UNICODE_ALL) {
	882	ARG_SET(cl, ARG(and_with));
	883
	884	/* and_with's ARG may match things that don't require UTF8.
	885	* And now cl's will too, in spite of this being an 'and'. See
	886	* the comments below about the kludge */
	887	cl->flags \|= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
	888	}
	889	}
	890	else {
	891	/* Here, both 'and_with' and cl match something outside the
	892	* bitmap. Currently we do not do the intersection, so just match
	893	* whatever cl had at the beginning. */
	894	}
	895
	896
	897	/* Take the intersection of the two sets of flags. However, the
	898	* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
	899	* kludge around the fact that this flag is not treated like the others
	900	* which are initialized in cl_anything(). The way the optimizer works
	901	* is that the synthetic start class (SSC) is initialized to match
	902	* anything, and then the first time a real node is encountered, its
	903	* values are AND'd with the SSC's with the result being the values of
	904	* the real node. However, there are paths through the optimizer where
	905	* the AND never gets called, so those initialized bits are set
	906	* inappropriately, which is not usually a big deal, as they just cause
	907	* false positives in the SSC, which will just mean a probably
	908	* imperceptible slow down in execution. However this bit has a
	909	* higher false positive consequence in that it can cause utf8.pm,
	910	* utf8_heavy.pl ... to be loaded when not necessary, which is a much
	911	* bigger slowdown and also causes significant extra memory to be used.
	912	* In order to prevent this, the code now takes a different tack. The
	913	* bit isn't set unless some part of the regular expression needs it,
	914	* but once set it won't get cleared. This means that these extra
	915	* modules won't get loaded unless there was some path through the
	916	* pattern that would have required them anyway, and so any false
	917	* positives that occur by not ANDing them out when they could be
	918	* aren't as severe as they would be if we treated this bit like all
	919	* the others */
	920	outside_bitmap_but_not_utf8 = (cl->flags \| and_with->flags)
	921	& ANYOF_NONBITMAP_NON_UTF8;
	922	cl->flags &= and_with->flags;
	923	cl->flags \|= outside_bitmap_but_not_utf8;
	924	}
	925	}
	926
	927	/* 'OR' a given class with another one. Can create false positives. 'cl'
	928	* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
	929	* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
	930	STATIC void
	931	S_cl_or(const RExC_state_t pRExC_state, struct regnode_charclass_class cl, const struct regnode_charclass_class *or_with)
	932	{
	933	PERL_ARGS_ASSERT_CL_OR;
	934
	935	if (or_with->flags & ANYOF_INVERT) {
	936
	937	/* Here, the or'd node is to be inverted. This means we take the
	938	* complement of everything not in the bitmap, but currently we don't
	939	* know what that is, so give up and match anything */
	940	if (ANYOF_NONBITMAP(or_with)) {
	941	cl_anything(pRExC_state, cl);
	942	}
	943	/* We do not use
	944	* (B1 \| CL1) \| (!B2 & !CL2) = (B1 \| !B2 & !CL2) \| (CL1 \| (!B2 & !CL2))
	945	* <= (B1 \| !B2) \| (CL1 \| !CL2)
	946	* which is wasteful if CL2 is small, but we ignore CL2:
	947	* (B1 \| CL1) \| (!B2 & !CL2) <= (B1 \| CL1) \| !B2 = (B1 \| !B2) \| CL1
	948	* XXXX Can we handle case-fold? Unclear:
	949	* (OK1(i) \| OK1(i')) \| !(OK1(i) \| OK1(i')) =
	950	* (OK1(i) \| OK1(i')) \| (!OK1(i) & !OK1(i'))
	951	*/
	952	else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	953	&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	954	&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
	955	int i;
	956
	957	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	958	cl->bitmap[i] \|= ~or_with->bitmap[i];
	959	} /* XXXX: logic is complicated otherwise */
	960	else {
	961	cl_anything(pRExC_state, cl);
	962	}
	963
	964	/* And, we can just take the union of the flags that aren't affected
	965	* by the inversion */
	966	cl->flags \|= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
	967
	968	/* For the remaining flags:
	969	ANYOF_UNICODE_ALL and inverted means to not match anything above
	970	255, which means that the union with cl should just be
	971	what cl has in it, so can ignore this flag
	972	ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
	973	is 127-255 to match them, but then invert that, so the
	974	union with cl should just be what cl has in it, so can
	975	ignore this flag
	976	*/
	977	} else { /* 'or_with' is not inverted */
	978	/* (B1 \| CL1) \| (B2 \| CL2) = (B1 \| B2) \| (CL1 \| CL2)) */
	979	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
	980	&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
	981	\|\| (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
	982	int i;
	983
	984	/* OR char bitmap and class bitmap separately */
	985	for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
	986	cl->bitmap[i] \|= or_with->bitmap[i];
	987	if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
	988	for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
	989	cl->classflags[i] \|= or_with->classflags[i];
	990	cl->flags \|= ANYOF_CLASS;
	991	}
	992	}
	993	else { /* XXXX: logic is complicated, leave it along for a moment. */
	994	cl_anything(pRExC_state, cl);
	995	}
	996
	997	if (ANYOF_NONBITMAP(or_with)) {
	998
	999	/* Use the added node's outside-the-bit-map match if there isn't a
	1000	* conflict. If there is a conflict (both nodes match something
	1001	* outside the bitmap, but what they match outside is not the same
	1002	* pointer, and hence not easily compared until XXX we extend
	1003	* inversion lists this far), give up and allow the start class to
	1004	* match everything outside the bitmap. If that stuff is all above
	1005	* 255, can just set UNICODE_ALL, otherwise caould be anything. */
	1006	if (! ANYOF_NONBITMAP(cl)) {
	1007	ARG_SET(cl, ARG(or_with));
	1008	}
	1009	else if (ARG(cl) != ARG(or_with)) {
	1010
	1011	if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
	1012	cl_anything(pRExC_state, cl);
	1013	}
	1014	else {
	1015	cl->flags \|= ANYOF_UNICODE_ALL;
	1016	}
	1017	}
	1018	}
	1019
	1020	/* Take the union */
	1021	cl->flags \|= or_with->flags;
	1022	}
	1023	}
	1024
	/* Accessors for the list form of a trie state's transitions.  They all
	 * expect a variable named 'trie' to be in scope at the point of use.
	 * Element 0 of each list acts as a header: its .forid is read as the
	 * "current" index (TRIE_LIST_CUR) and its .newstate as the list length
	 * (TRIE_LIST_LEN).  TRIE_LIST_USED yields CUR - 1 for a state that has a
	 * list and 0 for one that has none.  Every macro argument is parenthesized,
	 * and TRIE_LIST_USED indexes with its own argument rather than relying on
	 * the caller having a variable that happens to be called 'state'. */
	#define TRIE_LIST_ITEM(state,idx) (trie->states[(state)].trans.list)[ (idx) ]
	#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( (state), 0 ).forid )
	#define TRIE_LIST_LEN(state)  ( TRIE_LIST_ITEM( (state), 0 ).newstate )
	#define TRIE_LIST_USED(idx)   ( trie->states[(idx)].trans.list ? (TRIE_LIST_CUR( (idx) ) - 1) : 0 )
	1029
	1030
	1031	#ifdef DEBUGGING
	1032	/*
	1033	dump_trie(trie,widecharmap,revcharmap)
	1034	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	1035	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	1036
	1037	These routines dump out a trie in a somewhat readable format.
	1038	The _interim_ variants are used for debugging the interim
	1039	tables that are used to generate the final compressed
	1040	representation which is what dump_trie expects.
	1041
	1042	Part of the reason for their existence is to provide a form
	1043	of documentation as to how the different representations function.
	1044
	1045	*/
	1046
	1047	/*
	1048	Dumps the final compressed table form of the trie to Perl_debug_log.
	1049	Used for debugging make_trie().
	1050	*/
	1051
	1052	STATIC void
	1053	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	1054	AV *revcharmap, U32 depth)
	1055	{
	1056	U32 state;
	1057	SV *sv=sv_newmortal();
	1058	int colwidth= widecharmap ? 6 : 4;
	1059	U16 word;
	1060	GET_RE_DEBUG_FLAGS_DECL;
	1061
	1062	PERL_ARGS_ASSERT_DUMP_TRIE;
	1063
	1064	PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
	1065	(int)depth * 2 + 2,"",
	1066	"Match","Base","Ofs" );
	1067
	1068	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	1069	SV ** const tmp = av_fetch( revcharmap, state, 0);
	1070	if ( tmp ) {
	1071	PerlIO_printf( Perl_debug_log, "%*s",
	1072	colwidth,
	1073	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1074	PL_colors[0], PL_colors[1],
	1075	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1076	PERL_PV_ESCAPE_FIRSTCHAR
	1077	)
	1078	);
	1079	}
	1080	}
	1081	PerlIO_printf( Perl_debug_log, "\n%*sState\|-----------------------",
	1082	(int)depth * 2 + 2,"");
	1083
	1084	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	1085	PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
	1086	PerlIO_printf( Perl_debug_log, "\n");
	1087
	1088	for( state = 1 ; state < trie->statecount ; state++ ) {
	1089	const U32 base = trie->states[ state ].trans.base;
	1090
	1091	PerlIO_printf( Perl_debug_log, "%s#%4"UVXf"\|", (int)depth 2 + 2,"", (UV)state);
	1092
	1093	if ( trie->states[ state ].wordnum ) {
	1094	PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
	1095	} else {
	1096	PerlIO_printf( Perl_debug_log, "%6s", "" );
	1097	}
	1098
	1099	PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
	1100
	1101	if ( base ) {
	1102	U32 ofs = 0;
	1103
	1104	while( ( base + ofs < trie->uniquecharcount ) \|\|
	1105	( base + ofs - trie->uniquecharcount < trie->lasttrans
	1106	&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
	1107	ofs++;
	1108
	1109	PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
	1110
	1111	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1112	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1113	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1114	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1115	{
	1116	PerlIO_printf( Perl_debug_log, "%*"UVXf,
	1117	colwidth,
	1118	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
	1119	} else {
	1120	PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
	1121	}
	1122	}
	1123
	1124	PerlIO_printf( Perl_debug_log, "]");
	1125
	1126	}
	1127	PerlIO_printf( Perl_debug_log, "\n" );
	1128	}
	1129	PerlIO_printf(Perl_debug_log, "%sword_info N:(prev,len)=", (int)depth2, "");
	1130	for (word=1; word <= trie->wordcount; word++) {
	1131	PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
	1132	(int)word, (int)(trie->wordinfo[word].prev),
	1133	(int)(trie->wordinfo[word].len));
	1134	}
	1135	PerlIO_printf(Perl_debug_log, "\n" );
	1136	}
	1137	/*
	1138	Dumps a fully constructed but uncompressed trie in list form.
	1139	List tries normally only are used for construction when the number of
	1140	possible chars (trie->uniquecharcount) is very high.
	1141	Used for debugging make_trie().
	1142	*/
	1143	STATIC void
	1144	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	1145	HV widecharmap, AV revcharmap, U32 next_alloc,
	1146	U32 depth)
	1147	{
	1148	U32 state;
	1149	SV *sv=sv_newmortal();
	1150	int colwidth= widecharmap ? 6 : 4;
	1151	GET_RE_DEBUG_FLAGS_DECL;
	1152
	1153	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	1154
	1155	/* print out the table precompression. */
	1156	PerlIO_printf( Perl_debug_log, "%sState :Word \| Transition Data\n%s%s",
	1157	(int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
	1158	"------:-----+-----------------\n" );
	1159
	1160	for( state=1 ; state < next_alloc ; state ++ ) {
	1161	U16 charid;
	1162
	1163	PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
	1164	(int)depth * 2 + 2,"", (UV)state );
	1165	if ( ! trie->states[ state ].wordnum ) {
	1166	PerlIO_printf( Perl_debug_log, "%5s\| ","");
	1167	} else {
	1168	PerlIO_printf( Perl_debug_log, "W%4x\| ",
	1169	trie->states[ state ].wordnum
	1170	);
	1171	}
	1172	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	1173	SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
	1174	if ( tmp ) {
	1175	PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" \| ",
	1176	colwidth,
	1177	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1178	PL_colors[0], PL_colors[1],
	1179	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1180	PERL_PV_ESCAPE_FIRSTCHAR
	1181	) ,
	1182	TRIE_LIST_ITEM(state,charid).forid,
	1183	(UV)TRIE_LIST_ITEM(state,charid).newstate
	1184	);
	1185	if (!(charid % 10))
	1186	PerlIO_printf(Perl_debug_log, "\n%*s\| ",
	1187	(int)((depth * 2) + 14), "");
	1188	}
	1189	}
	1190	PerlIO_printf( Perl_debug_log, "\n");
	1191	}
	1192	}
	1193
	1194	/*
	1195	Dumps a fully constructed but uncompressed trie in table form.
	1196	This is the normal DFA style state transition table, with a few
	1197	twists to facilitate compression later.
	1198	Used for debugging make_trie().
	1199	*/
	1200	STATIC void
	1201	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	1202	HV widecharmap, AV revcharmap, U32 next_alloc,
	1203	U32 depth)
	1204	{
	1205	U32 state;
	1206	U16 charid;
	1207	SV *sv=sv_newmortal();
	1208	int colwidth= widecharmap ? 6 : 4;
	1209	GET_RE_DEBUG_FLAGS_DECL;
	1210
	1211	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	1212
	1213	/*
	1214	print out the table precompression so that we can do a visual check
	1215	that they are identical.
	1216	*/
	1217
	1218	PerlIO_printf( Perl_debug_log, "%sChar : ",(int)depth 2 + 2,"" );
	1219
	1220	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1221	SV ** const tmp = av_fetch( revcharmap, charid, 0);
	1222	if ( tmp ) {
	1223	PerlIO_printf( Perl_debug_log, "%*s",
	1224	colwidth,
	1225	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	1226	PL_colors[0], PL_colors[1],
	1227	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1228	PERL_PV_ESCAPE_FIRSTCHAR
	1229	)
	1230	);
	1231	}
	1232	}
	1233
	1234	PerlIO_printf( Perl_debug_log, "\n%sState+-",(int)depth 2 + 2,"" );
	1235
	1236	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	1237	PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
	1238	}
	1239
	1240	PerlIO_printf( Perl_debug_log, "\n" );
	1241
	1242	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	1243
	1244	PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
	1245	(int)depth * 2 + 2,"",
	1246	(UV)TRIE_NODENUM( state ) );
	1247
	1248	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	1249	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	1250	if (v)
	1251	PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
	1252	else
	1253	PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
	1254	}
	1255	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	1256	PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
	1257	} else {
	1258	PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
	1259	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	1260	}
	1261	}
	1262	}
	1263
	1264	#endif
	1265
	1266
	1267	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	1268	startbranch: the first branch in the whole branch sequence
	1269	first : start branch of sequence of branch-exact nodes.
	1270	May be the same as startbranch
	1271	last : Thing following the last branch.
	1272	May be the same as tail.
	1273	tail : item following the branch sequence
	1274	count : words in the sequence
	1275	flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
	1276	depth : indent depth
	1277
	1278	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	1279
	1280	A trie is an N'ary tree where the branches are determined by digital
	1281	decomposition of the key. IE, at the root node you look up the 1st character and
	1282	follow that branch repeat until you find the end of the branches. Nodes can be
	1283	marked as "accepting" meaning they represent a complete word. Eg:
	1284
	1285	/he|she|his|hers/
	1286
	1287	would convert into the following structure. Numbers represent states, letters
	1288	following numbers represent valid transitions on the letter from that state, if
	1289	the number is in square brackets it represents an accepting state, otherwise it
	1290	will be in parenthesis.
	1291
	1292	      +-h->+-e->[3]-+-r->(8)-+-s->[9]
	1293	      |    |
	1294	      |   (2)
	1295	      |    |
	1296	     (1)   +-i->(6)-+-s->[7]
	1297	      |
	1298	      +-s->(3)-+-h->(4)-+-e->[5]
	1299
	1300	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	1301
	1302	This shows that when matching against the string 'hers' we will begin at state 1
	1303	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	1304	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	1305	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	1306	single traverse. We store a mapping from accepting to state to which word was
	1307	matched, and then when we have multiple possibilities we try to complete the
	1308	rest of the regex in the order in which they occurred in the alternation.
	1309
	1310	The only prior NFA like behaviour that would be changed by the TRIE support is
	1311	the silent ignoring of duplicate alternations which are of the form:
	1312
	1313	/ (DUPE|DUPE) X? (?{ ... }) Y /x
	1314
	1315	Thus EVAL blocks following a trie may be called a different number of times with
	1316	and without the optimisation. With the optimisations dupes will be silently
	1317	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	1318	the following demonstrates:
	1319
	1320	'words'=~/(word|word|word)(?{ print $1 })[xyz]/
	1321
	1322	which prints out 'word' three times, but
	1323
	1324	'words'=~/(word|word|word)(?{ print $1 })S/
	1325
	1326	which doesn't print it out at all. This is due to other optimisations kicking in.
	1327
	1328	Example of what happens on a structural level:
	1329
	1330	The regexp /(ac|ad|ab)+/ will produce the following debug output:
	1331
	1332	1: CURLYM[1] {1,32767}(18)
	1333	5: BRANCH(8)
	1334	6: EXACT <ac>(16)
	1335	8: BRANCH(11)
	1336	9: EXACT <ad>(16)
	1337	11: BRANCH(14)
	1338	12: EXACT <ab>(16)
	1339	16: SUCCEED(0)
	1340	17: NOTHING(18)
	1341	18: END(0)
	1342
	1343	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	1344	and should turn into:
	1345
	1346	1: CURLYM[1] {1,32767}(18)
	1347	5: TRIE(16)
	1348	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	1349	<ac>
	1350	<ad>
	1351	<ab>
	1352	16: SUCCEED(0)
	1353	17: NOTHING(18)
	1354	18: END(0)
	1355
	1356	Cases where tail != last would be like /(?:foo|bar)baz/:
	1357
	1358	1: BRANCH(4)
	1359	2: EXACT <foo>(8)
	1360	4: BRANCH(7)
	1361	5: EXACT <bar>(8)
	1362	7: TAIL(8)
	1363	8: EXACT <baz>(10)
	1364	10: END(0)
	1365
	1366	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	1367	and would end up looking like:
	1368
	1369	1: TRIE(8)
	1370	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	1371	<foo>
	1372	<bar>
	1373	7: TAIL(8)
	1374	8: EXACT <baz>(10)
	1375	10: END(0)
	1376
	1377	d = uvuni_to_utf8_flags(d, uv, 0);
	1378
	1379	is the recommended Unicode-aware way of saying
	1380
	1381	*(d++) = uv;
	1382	*/
	1383
	/* Append to 'revcharmap' a new SV holding the character 'val'.
	 * Under UTF the SV receives the bytes written by uvuni_to_utf8() and is
	 * flagged as UTF-8; otherwise it holds 'val' cast to a single char.
	 * Expects 'revcharmap' (and whatever the UTF macro reads) to be in scope
	 * where the macro is used. */
	#define TRIE_STORE_REVCHAR(val)                                            \
	    STMT_START {                                                           \
	        if (UTF) {                                                         \
	            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
	            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
	            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, (val)); \
	            /* length is however far the encoder advanced */               \
	            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
	            SvPOK_on(zlopp);                                               \
	            SvUTF8_on(zlopp);                                              \
	            av_push(revcharmap, zlopp);                                    \
	        } else {                                                           \
	            char ooooff = (char)(val);                                     \
	            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
	        }                                                                  \
	    } STMT_END
	1399
	/* Read the next character of the word being scanned into 'uvc' and set
	 * 'len' to the amount by which 'uc' should advance; also counts the
	 * character in 'wordlen'.  Works on the variables uc, uvc, len, wordlen,
	 * uniflags, folder, foldlen, foldbuf, scan and skiplen of the code that
	 * uses it. */
	#define TRIE_READ_CHAR STMT_START {                                          \
	    wordlen++;                                                               \
	    if ( !UTF && folder != PL_fold_latin1 ) {                                \
	        /* raw data, will be folded later if needed */                       \
	        len = 1;                                                             \
	        uvc = (U32)*uc;                                                      \
	    }                                                                        \
	    else if ( UTF ) {                                                        \
	        /* if it is UTF then it is either already folded, or does not need   \
	         * folding */                                                        \
	        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);  \
	    }                                                                        \
	    else {                                                                   \
	        /* with this folder we have to obey unicode rules on latin-1 data */ \
	        if ( foldlen > 0 ) {                                                 \
	            /* still draining the fold buffer: do not advance 'uc' */        \
	            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
	            foldlen -= len;                                                  \
	            scan += len;                                                     \
	            len = 0;                                                         \
	        }                                                                    \
	        else {                                                               \
	            /* fold one input byte; the rest of the fold is read later */    \
	            len = 1;                                                         \
	            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);          \
	            skiplen = UNISKIP(uvc);                                          \
	            foldlen -= skiplen;                                              \
	            scan = foldbuf + skiplen;                                        \
	        }                                                                    \
	    }                                                                        \
	} STMT_END
	1426
	1427
	1428
	/* Append the transition (fid -> ns) to the transition list of 'state'.
	 * When the current index has reached the recorded length, the recorded
	 * length is doubled and the list reallocated to that many entries first. */
	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                               \
	    if ( !( TRIE_LIST_CUR( state ) < TRIE_LIST_LEN( state ) ) ) {               \
	        U32 doubled = TRIE_LIST_LEN( state ) *= 2;                              \
	        Renew( trie->states[ state ].trans.list, doubled, reg_trie_trans_le );  \
	    }                                                                           \
	    /* fill the slot at the current index, then step past it */                 \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;                \
	    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;              \
	    TRIE_LIST_CUR( state )++;                                                   \
	} STMT_END
	1438
	/* Give 'state' a fresh, zero-filled transition list of 4 entries, recording
	 * a length of 4 and a current index of 1 in its header entry. */
	#define TRIE_LIST_NEW(state) STMT_START {                                   \
	    Newxz( trie->states[ state ].trans.list, 4, reg_trie_trans_le );        \
	    TRIE_LIST_LEN( state ) = 4;                                             \
	    TRIE_LIST_CUR( state ) = 1;                                             \
	} STMT_END
	1445
	/* Record that the word just scanned ends at 'state'.
	 * Bumps 'curword', fills in its wordinfo entry (prev, len, accept), sets up
	 * the jump table entry when the node after 'noper' lies before 'tail', and
	 * either registers the word as the one accepted by 'state' or, when the
	 * state already accepts a word, threads it into that word's .prev chain.
	 * Works on the variables trie, noper, cur, tail, convert, curword, wordlen,
	 * word_count, jumper, nextbranch and (under DEBUG_r) trie_words of the code
	 * that uses it. */
	#define TRIE_HANDLE_WORD(state) STMT_START {                                    \
	    const U16 earlier_word = trie->states[ state ].wordnum;                     \
	    regnode * const after_word = regnext( noper );                              \
	                                                                                \
	    DEBUG_r({                                                                   \
	        /* store the word for dumping */                                        \
	        SV * const dumped = (OP(noper) == NOTHING)                              \
	            ? newSVpvn_utf8( "", 0, UTF )                                       \
	            : newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);                \
	        av_push( trie_words, dumped );                                          \
	    });                                                                         \
	                                                                                \
	    curword++;                                                                  \
	    trie->wordinfo[curword].prev = 0;                                           \
	    trie->wordinfo[curword].len = wordlen;                                      \
	    trie->wordinfo[curword].accept = state;                                     \
	                                                                                \
	    if ( after_word < tail ) {                                                  \
	        if (!trie->jump) {                                                      \
	            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
	        }                                                                       \
	        trie->jump[curword] = (U16)(after_word - convert);                      \
	        if (!jumper) {                                                          \
	            jumper = after_word;                                                \
	        }                                                                       \
	        if (!nextbranch) {                                                      \
	            nextbranch= regnext(cur);                                           \
	        }                                                                       \
	    }                                                                           \
	                                                                                \
	    if ( !earlier_word ) {                                                      \
	        /* we haven't inserted this word yet. */                                \
	        trie->states[ state ].wordnum = curword;                                \
	    } else {                                                                    \
	        /* It's a dupe. Pre-insert into the wordinfo[].prev */                  \
	        /* chain, so that when the bits of chain are later */                   \
	        /* linked together, the dups appear in the chain */                     \
	        trie->wordinfo[curword].prev = trie->wordinfo[earlier_word].prev;       \
	        trie->wordinfo[earlier_word].prev = curword;                            \
	    }                                                                           \
	} STMT_END
	1486
	1487
	/* Look up the transition out of 'state' on 'charid' in the compressed
	 * transition table.  The slot examined is base - ucharcount + charid; it is
	 * used only when base + charid lies in [ucharcount, ubound), the slot's
	 * .check equals 'state' and its .next is non-zero, in which case .next is
	 * the result.  Otherwise the result is 'special' for state 1 and 0 for any
	 * other state.  Expects 'trie' and 'ubound' to be in scope where the macro
	 * is used.  Arguments are parenthesized so that expression arguments bind
	 * as a whole. */
	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)                  \
	    ( ( (base) + (charid) >= (ucharcount)                                       \
	        && (base) + (charid) < ubound                                           \
	        && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check     \
	        && trie->trans[ (base) - (ucharcount) + (charid) ].next )               \
	      ? trie->trans[ (base) - (ucharcount) + (charid) ].next                    \
	      : ( (state) == 1 ? (special) : 0 )                                        \
	    )
	1496
	/* Distinct single-bit values, so they can be combined with '|' */
	#define MADE_TRIE        (1 << 0)
	#define MADE_JUMP_TRIE   (1 << 1)
	#define MADE_EXACT_TRIE  (1 << 2)
	1500
	1501	STATIC I32
	1502	S_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch, regnode first, regnode last, regnode *tail, U32 word_count, U32 flags, U32 depth)
	1503	{
	1504	dVAR;
	1505	/* first pass, loop through and scan words */
	1506	reg_trie_data *trie;
	1507	HV *widecharmap = NULL;
	1508	AV *revcharmap = newAV();
	1509	regnode *cur;
	1510	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1511	STRLEN len = 0;
	1512	UV uvc = 0;
	1513	U16 curword = 0;
	1514	U32 next_alloc = 0;
	1515	regnode *jumper = NULL;
	1516	regnode *nextbranch = NULL;
	1517	regnode *convert = NULL;
	1518	U32 prev_states; / temp array mapping each state to previous one */
	1519	/* we just use folder as a flag in utf8 */
	1520	const U8 * folder = NULL;
	1521
	1522	#ifdef DEBUGGING
	1523	const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
	1524	AV *trie_words = NULL;
	1525	/* along with revcharmap, this only used during construction but both are
	1526	* useful during debugging so we store them in the struct when debugging.
	1527	*/
	1528	#else
	1529	const U32 data_slot = add_data( pRExC_state, 2, "tu" );
	1530	STRLEN trie_charcount=0;
	1531	#endif
	1532	SV *re_trie_maxbuff;
	1533	GET_RE_DEBUG_FLAGS_DECL;
	1534
	1535	PERL_ARGS_ASSERT_MAKE_TRIE;
	1536	#ifndef DEBUGGING
	1537	PERL_UNUSED_ARG(depth);
	1538	#endif
	1539
	1540	switch (flags) {
	1541	case EXACT: break;
	1542	case EXACTFA:
	1543	case EXACTFU_SS:
	1544	case EXACTFU_TRICKYFOLD:
	1545	case EXACTFU: folder = PL_fold_latin1; break;
	1546	case EXACTF: folder = PL_fold; break;
	1547	case EXACTFL: folder = PL_fold_locale; break;
	1548	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
	1549	}
	1550
	1551	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	1552	trie->refcount = 1;
	1553	trie->startstate = 1;
	1554	trie->wordcount = word_count;
	1555	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	1556	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	1557	if (flags == EXACT)
	1558	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	1559	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	1560	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	1561
	1562	DEBUG_r({
	1563	trie_words = newAV();
	1564	});
	1565
	1566	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1567	if (!SvIOK(re_trie_maxbuff)) {
	1568	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1569	}
	1570	DEBUG_TRIE_COMPILE_r({
	1571	PerlIO_printf( Perl_debug_log,
	1572	"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	1573	(int)depth * 2 + 2, "",
	1574	REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
	1575	REG_NODE_NUM(last), REG_NODE_NUM(tail),
	1576	(int)depth);
	1577	});
	1578
	1579	/* Find the node we are going to overwrite */
	1580	if ( first == startbranch && OP( last ) != BRANCH ) {
	1581	/* whole branch chain */
	1582	convert = first;
	1583	} else {
	1584	/* branch sub-chain */
	1585	convert = NEXTOPER( first );
	1586	}
	1587
	1588	/* -- First loop and Setup --
	1589
	1590	We first traverse the branches and scan each word to determine if it
	1591	contains widechars, and how many unique chars there are, this is
	1592	important as we have to build a table with at least as many columns as we
	1593	have unique chars.
	1594
	1595	We use an array of integers to represent the character codes 0..255
	1596	(trie->charmap) and we use a an HV* to store Unicode characters. We use the
	1597	native representation of the character value as the key and IV's for the
	1598	coded index.
	1599
	1600	TODO If we keep track of how many times each character is used we can
	1601	remap the columns so that the table compression later on is more
	1602	efficient in terms of memory by ensuring the most common value is in the
	1603	middle and the least common are on the outside. IMO this would be better
	1604	than a most to least common mapping as theres a decent chance the most
	1605	common letter will share a node with the least common, meaning the node
	1606	will not be compressible. With a middle is most common approach the worst
	1607	case is when we have the least common nodes twice.
	1608
	1609	*/
	1610
	1611	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1612	regnode *noper = NEXTOPER( cur );
	1613	const U8 uc = (U8)STRING( noper );
	1614	const U8 *e = uc + STR_LEN( noper );
	1615	STRLEN foldlen = 0;
	1616	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1617	STRLEN skiplen = 0;
	1618	const U8 scan = (U8)NULL;
	1619	U32 wordlen = 0; /* required init */
	1620	STRLEN chars = 0;
	1621	bool set_bit = trie->bitmap ? 1 : 0; /store the first char in the bitmap?/
	1622
	1623	if (OP(noper) == NOTHING) {
	1624	regnode *noper_next= regnext(noper);
	1625	if (noper_next != tail && OP(noper_next) == flags) {
	1626	noper = noper_next;
	1627	uc= (U8*)STRING(noper);
	1628	e= uc + STR_LEN(noper);
	1629	trie->minlen= STR_LEN(noper);
	1630	} else {
	1631	trie->minlen= 0;
	1632	continue;
	1633	}
	1634	}
	1635
	1636	if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
	1637	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	1638	regardless of encoding */
	1639	if (OP( noper ) == EXACTFU_SS) {
	1640	/* false positives are ok, so just set this */
	1641	TRIE_BITMAP_SET(trie,0xDF);
	1642	}
	1643	}
	1644	for ( ; uc < e ; uc += len ) {
	1645	TRIE_CHARCOUNT(trie)++;
	1646	TRIE_READ_CHAR;
	1647	chars++;
	1648	if ( uvc < 256 ) {
	1649	if ( folder ) {
	1650	U8 folded= folder[ (U8) uvc ];
	1651	if ( !trie->charmap[ folded ] ) {
	1652	trie->charmap[ folded ]=( ++trie->uniquecharcount );
	1653	TRIE_STORE_REVCHAR( folded );
	1654	}
	1655	}
	1656	if ( !trie->charmap[ uvc ] ) {
	1657	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	1658	TRIE_STORE_REVCHAR( uvc );
	1659	}
	1660	if ( set_bit ) {
	1661	/* store the codepoint in the bitmap, and its folded
	1662	* equivalent. */
	1663	TRIE_BITMAP_SET(trie, uvc);
	1664
	1665	/* store the folded codepoint */
	1666	if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
	1667
	1668	if ( !UTF ) {
	1669	/* store first byte of utf8 representation of
	1670	variant codepoints */
	1671	if (! UNI_IS_INVARIANT(uvc)) {
	1672	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
	1673	}
	1674	}
	1675	set_bit = 0; /* We've done our bit :-) */
	1676	}
	1677	} else {
	1678	SV** svpp;
	1679	if ( !widecharmap )
	1680	widecharmap = newHV();
	1681
	1682	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	1683
	1684	if ( !svpp )
	1685	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
	1686
	1687	if ( !SvTRUE( *svpp ) ) {
	1688	sv_setiv( *svpp, ++trie->uniquecharcount );
	1689	TRIE_STORE_REVCHAR(uvc);
	1690	}
	1691	}
	1692	}
	1693	if( cur == first ) {
	1694	trie->minlen = chars;
	1695	trie->maxlen = chars;
	1696	} else if (chars < trie->minlen) {
	1697	trie->minlen = chars;
	1698	} else if (chars > trie->maxlen) {
	1699	trie->maxlen = chars;
	1700	}
	1701	if (OP( noper ) == EXACTFU_SS) {
	1702	/* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
	1703	if (trie->minlen > 1)
	1704	trie->minlen= 1;
	1705	}
	1706	if (OP( noper ) == EXACTFU_TRICKYFOLD) {
	1707	/* XXX: workaround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
	1708	* - We assume that any such sequence might match a 2 byte string */
	1709	if (trie->minlen > 2 )
	1710	trie->minlen= 2;
	1711	}
	1712
	1713	} /* end first pass */
	1714	DEBUG_TRIE_COMPILE_r(
	1715	PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	1716	(int)depth * 2 + 2,"",
	1717	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	1718	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	1719	(int)trie->minlen, (int)trie->maxlen )
	1720	);
	1721
	1722	/*
	1723	We now know what we are dealing with in terms of unique chars and
	1724	string sizes so we can calculate how much memory a naive
	1725	representation using a flat table will take. If it's over a reasonable
	1726	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	1727	conservative but potentially much slower representation using an array
	1728	of lists.
	1729
	1730	At the end we convert both representations into the same compressed
	1731	form that will be used in regexec.c for matching with. The latter
	1732	is a form that cannot be used to construct with but has memory
	1733	properties similar to the list form and access properties similar
	1734	to the table form making it both suitable for fast searches and
	1735	small enough that it's feasible to store for the duration of a program.
	1736
	1737	See the comment in the code where the compressed table is produced
	1738	inplace from the flat table representation for an explanation of how
	1739	the compression works.
	1740
	1741	*/
	1742
	1743
	1744	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	1745	prev_states[1] = 0;
	1746
	1747	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
	1748	/*
	1749	Second Pass -- Array Of Lists Representation
	1750
	1751	Each state will be represented by a list of charid:state records
	1752	(reg_trie_trans_le) the first such element holds the CUR and LEN
	1753	points of the allocated array. (See defines above).
	1754
	1755	We build the initial structure using the lists, and then convert
	1756	it into the compressed table form which allows faster lookups
	1757	(but can't be modified once converted).
	1758	*/
	1759
	1760	STRLEN transcount = 1;
	1761
	1762	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1763	"%*sCompiling trie using list compiler\n",
	1764	(int)depth * 2 + 2, ""));
	1765
	1766	trie->states = (reg_trie_state *)
	1767	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1768	sizeof(reg_trie_state) );
	1769	TRIE_LIST_NEW(1);
	1770	next_alloc = 2;
	1771
	1772	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1773
	1774	regnode *noper = NEXTOPER( cur );
	1775	U8 uc = (U8)STRING( noper );
	1776	const U8 *e = uc + STR_LEN( noper );
	1777	U32 state = 1; /* required init */
	1778	U16 charid = 0; /* sanity init */
	1779	U8 scan = (U8)NULL; /* sanity init */
	1780	STRLEN foldlen = 0; /* required init */
	1781	U32 wordlen = 0; /* required init */
	1782	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1783	STRLEN skiplen = 0;
	1784
	1785	if (OP(noper) == NOTHING) {
	1786	regnode *noper_next= regnext(noper);
	1787	if (noper_next != tail && OP(noper_next) == flags) {
	1788	noper = noper_next;
	1789	uc= (U8*)STRING(noper);
	1790	e= uc + STR_LEN(noper);
	1791	}
	1792	}
	1793
	1794	if (OP(noper) != NOTHING) {
	1795	for ( ; uc < e ; uc += len ) {
	1796
	1797	TRIE_READ_CHAR;
	1798
	1799	if ( uvc < 256 ) {
	1800	charid = trie->charmap[ uvc ];
	1801	} else {
	1802	SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	1803	if ( !svpp ) {
	1804	charid = 0;
	1805	} else {
	1806	charid=(U16)SvIV( *svpp );
	1807	}
	1808	}
	1809	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	1810	if ( charid ) {
	1811
	1812	U16 check;
	1813	U32 newstate = 0;
	1814
	1815	charid--;
	1816	if ( !trie->states[ state ].trans.list ) {
	1817	TRIE_LIST_NEW( state );
	1818	}
	1819	for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
	1820	if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
	1821	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	1822	break;
	1823	}
	1824	}
	1825	if ( ! newstate ) {
	1826	newstate = next_alloc++;
	1827	prev_states[newstate] = state;
	1828	TRIE_LIST_PUSH( state, charid, newstate );
	1829	transcount++;
	1830	}
	1831	state = newstate;
	1832	} else {
	1833	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	1834	}
	1835	}
	1836	}
	1837	TRIE_HANDLE_WORD(state);
	1838
	1839	} /* end second pass */
	1840
	1841	/* next alloc is the NEXT state to be allocated */
	1842	trie->statecount = next_alloc;
	1843	trie->states = (reg_trie_state *)
	1844	PerlMemShared_realloc( trie->states,
	1845	next_alloc
	1846	* sizeof(reg_trie_state) );
	1847
	1848	/* and now dump it out before we compress it */
	1849	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	1850	revcharmap, next_alloc,
	1851	depth+1)
	1852	);
	1853
	1854	trie->trans = (reg_trie_trans *)
	1855	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	1856	{
	1857	U32 state;
	1858	U32 tp = 0;
	1859	U32 zp = 0;
	1860
	1861
	1862	for( state=1 ; state < next_alloc ; state ++ ) {
	1863	U32 base=0;
	1864
	1865	/*
	1866	DEBUG_TRIE_COMPILE_MORE_r(
	1867	PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
	1868	);
	1869	*/
	1870
	1871	if (trie->states[state].trans.list) {
	1872	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1873	U16 maxid=minid;
	1874	U16 idx;
	1875
	1876	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1877	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1878	if ( forid < minid ) {
	1879	minid=forid;
	1880	} else if ( forid > maxid ) {
	1881	maxid=forid;
	1882	}
	1883	}
	1884	if ( transcount < tp + maxid - minid + 1) {
	1885	transcount *= 2;
	1886	trie->trans = (reg_trie_trans *)
	1887	PerlMemShared_realloc( trie->trans,
	1888	transcount
	1889	* sizeof(reg_trie_trans) );
	1890	Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
	1891	}
	1892	base = trie->uniquecharcount + tp - minid;
	1893	if ( maxid == minid ) {
	1894	U32 set = 0;
	1895	for ( ; zp < tp ; zp++ ) {
	1896	if ( ! trie->trans[ zp ].next ) {
	1897	base = trie->uniquecharcount + zp - minid;
	1898	trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1899	trie->trans[ zp ].check = state;
	1900	set = 1;
	1901	break;
	1902	}
	1903	}
	1904	if ( !set ) {
	1905	trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
	1906	trie->trans[ tp ].check = state;
	1907	tp++;
	1908	zp = tp;
	1909	}
	1910	} else {
	1911	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1912	const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
	1913	trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
	1914	trie->trans[ tid ].check = state;
	1915	}
	1916	tp += ( maxid - minid + 1 );
	1917	}
	1918	Safefree(trie->states[ state ].trans.list);
	1919	}
	1920	/*
	1921	DEBUG_TRIE_COMPILE_MORE_r(
	1922	PerlIO_printf( Perl_debug_log, " base: %d\n",base);
	1923	);
	1924	*/
	1925	trie->states[ state ].trans.base=base;
	1926	}
	1927	trie->lasttrans = tp + 1;
	1928	}
	1929	} else {
	1930	/*
	1931	Second Pass -- Flat Table Representation.
	1932
	1933	we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
	1934	We know that we will need Charcount+1 trans at most to store the data
	1935	(one row per char at worst case) So we preallocate both structures
	1936	assuming worst case.
	1937
	1938	We then construct the trie using only the .next slots of the entry
	1939	structs.
	1940
	1941	We use the .check field of the first entry of the node temporarily to
	1942	make compression both faster and easier by keeping track of how many non
	1943	zero fields are in the node.
	1944
	1945	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1946	transition.
	1947
	1948	There are two terms at use here: state as a TRIE_NODEIDX() which is a
	1949	number representing the first entry of the node, and state as a
	1950	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
	1951	TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
	1952	are 2 entrys per node. eg:
	1953
	1954	A B A B
	1955	1. 2 4 1. 3 7
	1956	2. 0 3 3. 0 5
	1957	3. 0 0 5. 0 0
	1958	4. 0 0 7. 0 0
	1959
	1960	The table is internally in the right hand, idx form. However as we also
	1961	have to deal with the states array which is indexed by nodenum we have to
	1962	use TRIE_NODENUM() to convert.
	1963
	1964	*/
	1965	DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
	1966	"%*sCompiling trie using table compiler\n",
	1967	(int)depth * 2 + 2, ""));
	1968
	1969	trie->trans = (reg_trie_trans *)
	1970	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1971	* trie->uniquecharcount + 1,
	1972	sizeof(reg_trie_trans) );
	1973	trie->states = (reg_trie_state *)
	1974	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1975	sizeof(reg_trie_state) );
	1976	next_alloc = trie->uniquecharcount + 1;
	1977
	1978
	1979	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1980
	1981	regnode *noper = NEXTOPER( cur );
	1982	const U8 uc = (U8)STRING( noper );
	1983	const U8 *e = uc + STR_LEN( noper );
	1984
	1985	U32 state = 1; /* required init */
	1986
	1987	U16 charid = 0; /* sanity init */
	1988	U32 accept_state = 0; /* sanity init */
	1989	U8 scan = (U8)NULL; /* sanity init */
	1990
	1991	STRLEN foldlen = 0; /* required init */
	1992	U32 wordlen = 0; /* required init */
	1993	STRLEN skiplen = 0;
	1994	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1995
	1996	if (OP(noper) == NOTHING) {
	1997	regnode *noper_next= regnext(noper);
	1998	if (noper_next != tail && OP(noper_next) == flags) {
	1999	noper = noper_next;
	2000	uc= (U8*)STRING(noper);
	2001	e= uc + STR_LEN(noper);
	2002	}
	2003	}
	2004
	2005	if ( OP(noper) != NOTHING ) {
	2006	for ( ; uc < e ; uc += len ) {
	2007
	2008	TRIE_READ_CHAR;
	2009
	2010	if ( uvc < 256 ) {
	2011	charid = trie->charmap[ uvc ];
	2012	} else {
	2013	SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
	2014	charid = svpp ? (U16)SvIV(*svpp) : 0;
	2015	}
	2016	if ( charid ) {
	2017	charid--;
	2018	if ( !trie->trans[ state + charid ].next ) {
	2019	trie->trans[ state + charid ].next = next_alloc;
	2020	trie->trans[ state ].check++;
	2021	prev_states[TRIE_NODENUM(next_alloc)]
	2022	= TRIE_NODENUM(state);
	2023	next_alloc += trie->uniquecharcount;
	2024	}
	2025	state = trie->trans[ state + charid ].next;
	2026	} else {
	2027	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
	2028	}
	2029	/* charid is now 0 if we dont know the char read, or nonzero if we do */
	2030	}
	2031	}
	2032	accept_state = TRIE_NODENUM( state );
	2033	TRIE_HANDLE_WORD(accept_state);
	2034
	2035	} /* end second pass */
	2036
	2037	/* and now dump it out before we compress it */
	2038	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	2039	revcharmap,
	2040	next_alloc, depth+1));
	2041
	2042	{
	2043	/*
	2044	* Inplace compress the table.*
	2045
	2046	For sparse data sets the table constructed by the trie algorithm will
	2047	be mostly 0/FAIL transitions or to put it another way mostly empty.
	2048	(Note that leaf nodes will not contain any transitions.)
	2049
	2050	This algorithm compresses the tables by eliminating most such
	2051	transitions, at the cost of a modest bit of extra work during lookup:
	2052
	2053	- Each states[] entry contains a .base field which indicates the
	2054	index in the trans[] array where its transition data is stored.
	2055
	2056	- If .base is 0 there are no valid transitions from that node.
	2057
	2058	- If .base is nonzero then charid is added to it to find an entry in
	2059	the trans array.
	2060
	2061	-If trans[states[state].base+charid].check!=state then the
	2062	transition is taken to be a 0/Fail transition. Thus if there are fail
	2063	transitions at the front of the node then the .base offset will point
	2064	somewhere inside the previous nodes data (or maybe even into a node
	2065	even earlier), but the .check field determines if the transition is
	2066	valid.
	2067
	2068	XXX - wrong maybe?
	2069	The following process inplace converts the table to the compressed
	2070	table: We first do not compress the root node 1,and mark all its
	2071	.check pointers as 1 and set its .base pointer as 1 as well. This
	2072	allows us to do a DFA construction from the compressed table later,
	2073	and ensures that any .base pointers we calculate later are greater
	2074	than 0.
	2075
	2076	- We set 'pos' to indicate the first entry of the second node.
	2077
	2078	- We then iterate over the columns of the node, finding the first and
	2079	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	2080	and set the .check pointers accordingly, and advance pos
	2081	appropriately and repeat for the next node. Note that when we copy
	2082	the next pointers we have to convert them from the original
	2083	NODEIDX form to NODENUM form as the former is not valid post
	2084	compression.
	2085
	2086	- If a node has no transitions used we mark its base as 0 and do not
	2087	advance the pos pointer.
	2088
	2089	- If a node only has one transition we use a second pointer into the
	2090	structure to fill in allocated fail transitions from other states.
	2091	This pointer is independent of the main pointer and scans forward
	2092	looking for null transitions that are allocated to a state. When it
	2093	finds one it writes the single transition into the "hole". If the
	2094	pointer doesnt find one the single transition is appended as normal.
	2095
	2096	- Once compressed we can Renew/realloc the structures to release the
	2097	excess space.
	2098
	2099	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	2100	specifically Fig 3.47 and the associated pseudocode.
	2101
	2102	demq
	2103	*/
	2104	const U32 laststate = TRIE_NODENUM( next_alloc );
	2105	U32 state, charid;
	2106	U32 pos = 0, zp=0;
	2107	trie->statecount = laststate;
	2108
	2109	for ( state = 1 ; state < laststate ; state++ ) {
	2110	U8 flag = 0;
	2111	const U32 stateidx = TRIE_NODEIDX( state );
	2112	const U32 o_used = trie->trans[ stateidx ].check;
	2113	U32 used = trie->trans[ stateidx ].check;
	2114	trie->trans[ stateidx ].check = 0;
	2115
	2116	for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
	2117	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	2118	if ( trie->trans[ stateidx + charid ].next ) {
	2119	if (o_used == 1) {
	2120	for ( ; zp < pos ; zp++ ) {
	2121	if ( ! trie->trans[ zp ].next ) {
	2122	break;
	2123	}
	2124	}
	2125	trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
	2126	trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2127	trie->trans[ zp ].check = state;
	2128	if ( ++zp > pos ) pos = zp;
	2129	break;
	2130	}
	2131	used--;
	2132	}
	2133	if ( !flag ) {
	2134	flag = 1;
	2135	trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
	2136	}
	2137	trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
	2138	trie->trans[ pos ].check = state;
	2139	pos++;
	2140	}
	2141	}
	2142	}
	2143	trie->lasttrans = pos + 1;
	2144	trie->states = (reg_trie_state *)
	2145	PerlMemShared_realloc( trie->states, laststate
	2146	* sizeof(reg_trie_state) );
	2147	DEBUG_TRIE_COMPILE_MORE_r(
	2148	PerlIO_printf( Perl_debug_log,
	2149	"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
	2150	(int)depth * 2 + 2,"",
	2151	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
	2152	(IV)next_alloc,
	2153	(IV)pos,
	2154	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	2155	);
	2156
	2157	} /* end table compress */
	2158	}
	2159	DEBUG_TRIE_COMPILE_MORE_r(
	2160	PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
	2161	(int)depth * 2 + 2, "",
	2162	(UV)trie->statecount,
	2163	(UV)trie->lasttrans)
	2164	);
	2165	/* resize the trans array to remove unused space */
	2166	trie->trans = (reg_trie_trans *)
	2167	PerlMemShared_realloc( trie->trans, trie->lasttrans
	2168	* sizeof(reg_trie_trans) );
	2169
	2170	{ /* Modify the program and insert the new TRIE node */
	2171	U8 nodetype =(U8)(flags & 0xFF);
	2172	char *str=NULL;
	2173
	2174	#ifdef DEBUGGING
	2175	regnode *optimize = NULL;
	2176	#ifdef RE_TRACK_PATTERN_OFFSETS
	2177
	2178	U32 mjd_offset = 0;
	2179	U32 mjd_nodelen = 0;
	2180	#endif /* RE_TRACK_PATTERN_OFFSETS */
	2181	#endif /* DEBUGGING */
	2182	/*
	2183	This means we convert either the first branch or the first Exact,
	2184	depending on whether the thing following (in 'last') is a branch
	2185	or not and whether first is the startbranch (ie is it a sub part of
	2186	the alternation or is it the whole thing.)
	2187	Assuming its a sub part we convert the EXACT otherwise we convert
	2188	the whole branch sequence, including the first.
	2189	*/
	2190	/* Find the node we are going to overwrite */
	2191	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	2192	/* branch sub-chain */
	2193	NEXT_OFF( first ) = (U16)(last - first);
	2194	#ifdef RE_TRACK_PATTERN_OFFSETS
	2195	DEBUG_r({
	2196	mjd_offset= Node_Offset((convert));
	2197	mjd_nodelen= Node_Length((convert));
	2198	});
	2199	#endif
	2200	/* whole branch chain */
	2201	}
	2202	#ifdef RE_TRACK_PATTERN_OFFSETS
	2203	else {
	2204	DEBUG_r({
	2205	const regnode *nop = NEXTOPER( convert );
	2206	mjd_offset= Node_Offset((nop));
	2207	mjd_nodelen= Node_Length((nop));
	2208	});
	2209	}
	2210	DEBUG_OPTIMISE_r(
	2211	PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
	2212	(int)depth * 2 + 2, "",
	2213	(UV)mjd_offset, (UV)mjd_nodelen)
	2214	);
	2215	#endif
	2216	/* But first we check to see if there is a common prefix we can
	2217	split out as an EXACT and put in front of the TRIE node. */
	2218	trie->startstate= 1;
	2219	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	2220	U32 state;
	2221	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	2222	U32 ofs = 0;
	2223	I32 idx = -1;
	2224	U32 count = 0;
	2225	const U32 base = trie->states[ state ].trans.base;
	2226
	2227	if ( trie->states[state].wordnum )
	2228	count = 1;
	2229
	2230	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	2231	if ( ( base + ofs >= trie->uniquecharcount ) &&
	2232	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	2233	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	2234	{
	2235	if ( ++count > 1 ) {
	2236	SV **tmp = av_fetch( revcharmap, ofs, 0);
	2237	const U8 ch = (U8)SvPV_nolen_const( *tmp );
	2238	if ( state == 1 ) break;
	2239	if ( count == 2 ) {
	2240	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	2241	DEBUG_OPTIMISE_r(
	2242	PerlIO_printf(Perl_debug_log,
	2243	"%*sNew Start State=%"UVuf" Class: [",
	2244	(int)depth * 2 + 2, "",
	2245	(UV)state));
	2246	if (idx >= 0) {
	2247	SV ** const tmp = av_fetch( revcharmap, idx, 0);
	2248	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	2249
	2250	TRIE_BITMAP_SET(trie,*ch);
	2251	if ( folder )
	2252	TRIE_BITMAP_SET(trie, folder[ *ch ]);
	2253	DEBUG_OPTIMISE_r(
	2254	PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
	2255	);
	2256	}
	2257	}
	2258	TRIE_BITMAP_SET(trie,*ch);
	2259	if ( folder )
	2260	TRIE_BITMAP_SET(trie,folder[ *ch ]);
	2261	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
	2262	}
	2263	idx = ofs;
	2264	}
	2265	}
	2266	if ( count == 1 ) {
	2267	SV **tmp = av_fetch( revcharmap, idx, 0);
	2268	STRLEN len;
	2269	char ch = SvPV( tmp, len );
	2270	DEBUG_OPTIMISE_r({
	2271	SV *sv=sv_newmortal();
	2272	PerlIO_printf( Perl_debug_log,
	2273	"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
	2274	(int)depth * 2 + 2, "",
	2275	(UV)state, (UV)idx,
	2276	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	2277	PL_colors[0], PL_colors[1],
	2278	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	2279	PERL_PV_ESCAPE_FIRSTCHAR
	2280	)
	2281	);
	2282	});
	2283	if ( state==1 ) {
	2284	OP( convert ) = nodetype;
	2285	str=STRING(convert);
	2286	STR_LEN(convert)=0;
	2287	}
	2288	STR_LEN(convert) += len;
	2289	while (len--)
	2290	str++ = ch++;
	2291	} else {
	2292	#ifdef DEBUGGING
	2293	if (state>1)
	2294	DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
	2295	#endif
	2296	break;
	2297	}
	2298	}
	2299	trie->prefixlen = (state-1);
	2300	if (str) {
	2301	regnode *n = convert+NODE_SZ_STR(convert);
	2302	NEXT_OFF(convert) = NODE_SZ_STR(convert);
	2303	trie->startstate = state;
	2304	trie->minlen -= (state - 1);
	2305	trie->maxlen -= (state - 1);
	2306	#ifdef DEBUGGING
	2307	/* At least the UNICOS C compiler choked on this
	2308	* being argument to DEBUG_r(), so let's just have
	2309	* it right here. */
	2310	if (
	2311	#ifdef PERL_EXT_RE_BUILD
	2312	1
	2313	#else
	2314	DEBUG_r_TEST
	2315	#endif
	2316	) {
	2317	regnode *fix = convert;
	2318	U32 word = trie->wordcount;
	2319	mjd_nodelen++;
	2320	Set_Node_Offset_Length(convert, mjd_offset, state - 1);
	2321	while( ++fix < n ) {
	2322	Set_Node_Offset_Length(fix, 0, 0);
	2323	}
	2324	while (word--) {
	2325	SV ** const tmp = av_fetch( trie_words, word, 0 );
	2326	if (tmp) {
	2327	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	2328	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	2329	else
	2330	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	2331	}
	2332	}
	2333	}
	2334	#endif
	2335	if (trie->maxlen) {
	2336	convert = n;
	2337	} else {
	2338	NEXT_OFF(convert) = (U16)(tail - convert);
	2339	DEBUG_r(optimize= n);
	2340	}
	2341	}
	2342	}
	2343	if (!jumper)
	2344	jumper = last;
	2345	if ( trie->maxlen ) {
	2346	NEXT_OFF( convert ) = (U16)(tail - convert);
	2347	ARG_SET( convert, data_slot );
	2348	/* Store the offset to the first unabsorbed branch in
	2349	jump[0], which is otherwise unused by the jump logic.
	2350	We use this when dumping a trie and during optimisation. */
	2351	if (trie->jump)
	2352	trie->jump[0] = (U16)(nextbranch - convert);
	2353
	2354	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	2355	* and there is a bitmap
	2356	* and the first "jump target" node we found leaves enough room
	2357	* then convert the TRIE node into a TRIEC node, with the bitmap
	2358	* embedded inline in the opcode - this is hypothetically faster.
	2359	*/
	2360	if ( !trie->states[trie->startstate].wordnum
	2361	&& trie->bitmap
	2362	&& ( (char )jumper - (char )convert) >= (int)sizeof(struct regnode_charclass) )
	2363	{
	2364	OP( convert ) = TRIEC;
	2365	Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	2366	PerlMemShared_free(trie->bitmap);
	2367	trie->bitmap= NULL;
	2368	} else
	2369	OP( convert ) = TRIE;
	2370
	2371	/* store the type in the flags */
	2372	convert->flags = nodetype;
	2373	DEBUG_r({
	2374	optimize = convert
	2375	+ NODE_STEP_REGNODE
	2376	+ regarglen[ OP( convert ) ];
	2377	});
	2378	/* XXX We really should free up the resource in trie now,
	2379	as we won't use them - (which resources?) dmq */
	2380	}
	2381	/* needed for dumping*/
	2382	DEBUG_r(if (optimize) {
	2383	regnode *opt = convert;
	2384
	2385	while ( ++opt < optimize) {
	2386	Set_Node_Offset_Length(opt,0,0);
	2387	}
	2388	/*
	2389	Try to clean up some of the debris left after the
	2390	optimisation.
	2391	*/
	2392	while( optimize < jumper ) {
	2393	mjd_nodelen += Node_Length((optimize));
	2394	OP( optimize ) = OPTIMIZED;
	2395	Set_Node_Offset_Length(optimize,0,0);
	2396	optimize++;
	2397	}
	2398	Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
	2399	});
	2400	} /* end node insert */
	2401
	2402	/* Finish populating the prev field of the wordinfo array. Walk back
	2403	* from each accept state until we find another accept state, and if
	2404	* so, point the first word's .prev field at the second word. If the
	2405	* second already has a .prev field set, stop now. This will be the
	2406	* case either if we've already processed that word's accept state,
	2407	* or that state had multiple words, and the overspill words were
	2408	* already linked up earlier.
	2409	*/
	2410	{
	2411	U16 word;
	2412	U32 state;
	2413	U16 prev;
	2414
	2415	for (word=1; word <= trie->wordcount; word++) {
	2416	prev = 0;
	2417	if (trie->wordinfo[word].prev)
	2418	continue;
	2419	state = trie->wordinfo[word].accept;
	2420	while (state) {
	2421	state = prev_states[state];
	2422	if (!state)
	2423	break;
	2424	prev = trie->states[state].wordnum;
	2425	if (prev)
	2426	break;
	2427	}
	2428	trie->wordinfo[word].prev = prev;
	2429	}
	2430	Safefree(prev_states);
	2431	}
	2432
	2433
	2434	/* and now dump out the compressed format */
	2435	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	2436
	2437	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	2438	#ifdef DEBUGGING
	2439	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	2440	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	2441	#else
	2442	SvREFCNT_dec(revcharmap);
	2443	#endif
	2444	return trie->jump
	2445	? MADE_JUMP_TRIE
	2446	: trie->startstate>1
	2447	? MADE_EXACT_TRIE
	2448	: MADE_TRIE;
	2449	}
	2450
	2451	STATIC void
	2452	S_make_trie_failtable(pTHX_ RExC_state_t pRExC_state, regnode source, regnode *stclass, U32 depth)
	2453	{
	2454	/* The Trie is constructed and compressed now so we can build a fail array if it's needed
	2455
	2456	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
	2457	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
	2458	ISBN 0-201-10088-6
	2459
	2460	We find the fail state for each state in the trie, this state is the longest proper
	2461	suffix of the current state's 'word' that is also a proper prefix of another word in our
	2462	trie. State 1 represents the word '' and is thus the default fail state. This allows
	2463	the DFA not to have to restart after its tried and failed a word at a given point, it
	2464	simply continues as though it had been matching the other word in the first place.
	2465	Consider
	2466	'abcdgu'=~/abcdefg\|cdgu/
	2467	When we get to 'd' we are still matching the first word, we would encounter 'g' which would
	2468	fail, which would bring us to the state representing 'd' in the second word where we would
	2469	try 'g' and succeed, proceeding to match 'cdgu'.
	2470	*/
	2471	/* add a fail transition */
	2472	const U32 trie_offset = ARG(source);
	2473	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	2474	U32 *q;
	2475	const U32 ucharcount = trie->uniquecharcount;
	2476	const U32 numstates = trie->statecount;
	2477	const U32 ubound = trie->lasttrans + ucharcount;
	2478	U32 q_read = 0;
	2479	U32 q_write = 0;
	2480	U32 charid;
	2481	U32 base = trie->states[ 1 ].trans.base;
	2482	U32 *fail;
	2483	reg_ac_data *aho;
	2484	const U32 data_slot = add_data( pRExC_state, 1, "T" );
	2485	GET_RE_DEBUG_FLAGS_DECL;
	2486
	2487	PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
	2488	#ifndef DEBUGGING
	2489	PERL_UNUSED_ARG(depth);
	2490	#endif
	2491
	2492
	2493	ARG_SET( stclass, data_slot );
	2494	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	2495	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	2496	aho->trie=trie_offset;
	2497	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	2498	Copy( trie->states, aho->states, numstates, reg_trie_state );
	2499	Newxz( q, numstates, U32);
	2500	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	2501	aho->refcount = 1;
	2502	fail = aho->fail;
	2503	/* initialize fail[0..1] to be 1 so that we always have
	2504	a valid final fail state */
	2505	fail[ 0 ] = fail[ 1 ] = 1;
	2506
	2507	for ( charid = 0; charid < ucharcount ; charid++ ) {
	2508	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	2509	if ( newstate ) {
	2510	q[ q_write ] = newstate;
	2511	/* set to point at the root */
	2512	fail[ q[ q_write++ ] ]=1;
	2513	}
	2514	}
	2515	while ( q_read < q_write) {
	2516	const U32 cur = q[ q_read++ % numstates ];
	2517	base = trie->states[ cur ].trans.base;
	2518
	2519	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	2520	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	2521	if (ch_state) {
	2522	U32 fail_state = cur;
	2523	U32 fail_base;
	2524	do {
	2525	fail_state = fail[ fail_state ];
	2526	fail_base = aho->states[ fail_state ].trans.base;
	2527	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	2528
	2529	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	2530	fail[ ch_state ] = fail_state;
	2531	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	2532	{
	2533	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	2534	}
	2535	q[ q_write++ % numstates] = ch_state;
	2536	}
	2537	}
	2538	}
	2539	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	2540	when we fail in state 1, this allows us to use the
	2541	charclass scan to find a valid start char. This is based on the principle
	2542	that theres a good chance the string being searched contains lots of stuff
	2543	that cant be a start char.
	2544	*/
	2545	fail[ 0 ] = fail[ 1 ] = 0;
	2546	DEBUG_TRIE_COMPILE_r({
	2547	PerlIO_printf(Perl_debug_log,
	2548	"%*sStclass Failtable (%"UVuf" states): 0",
	2549	(int)(depth * 2), "", (UV)numstates
	2550	);
	2551	for( q_read=1; q_read<numstates; q_read++ ) {
	2552	PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
	2553	}
	2554	PerlIO_printf(Perl_debug_log, "\n");
	2555	});
	2556	Safefree(q);
	2557	/RExC_seen \|= REG_SEEN_TRIEDFA;/
	2558	}
	2559
	2560
	/*
	 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
	 * These need to be revisited when a newer toolchain becomes available.
	 *
	 * SPARC64_GCC_WORKAROUND is (re)defined to 1 only when compiling for
	 * sparc64 with a GCC older than 2.96; otherwise it is left untouched.
	 */
	#if defined(__sparc64__) && defined(__GNUC__)
	#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
	#       undef  SPARC64_GCC_WORKAROUND
	#       define SPARC64_GCC_WORKAROUND 1
	#   endif
	#endif
	2571
	/*
	 * DEBUG_PEEP(str, scan, depth)
	 *
	 * Debugging aid: when 'scan' is non-NULL, prints one line to Perl_debug_log
	 * consisting of (depth * 2) columns of padding, the label 'str', the node
	 * number of 'scan', its regprop() description and the node number of
	 * regnext(scan) (0 when there is none).
	 *
	 * 'str' is pasted into the format string, so it must be a string literal.
	 * 'depth' is parenthesised so that expression arguments (e.g. depth+1) are
	 * scaled as a whole. 'scan' is evaluated more than once; pass a
	 * side-effect-free expression.
	 *
	 * The whole body is wrapped in DEBUG_OPTIMISE_r(), so presumably it is only
	 * active when that class of debugging output is enabled -- confirm against
	 * the macro's definition. The trailing ';' is part of the macro and is kept
	 * so existing call sites are unaffected.
	 */
	#define DEBUG_PEEP(str,scan,depth) \
	    DEBUG_OPTIMISE_r({if (scan){ \
	       SV * const mysv=sv_newmortal(); \
	       regnode *Next = regnext(scan); \
	       regprop(RExC_rx, mysv, scan); \
	       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
	       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
	           Next ? (REG_NODE_NUM(Next)) : 0 ); \
	   }});
	2581
	2582
	2583	/* The below joins as many adjacent EXACTish nodes as possible into a single
	2584	* one, and looks for problematic sequences of characters whose folds vs.
	2585	* non-folds have sufficiently different lengths, that the optimizer would be
	2586	* fooled into rejecting legitimate matches of them, and the trie construction
	2587	* code needs to handle specially. The joining is only done if:
	2588	* 1) there is room in the current conglomerated node to entirely contain the
	2589	* next one.
	2590	* 2) they are the exact same node type
	2591	*
	2592	* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
	2593	* these get optimized out
	2594	*
	2595	* If there are problematic code sequences, *min_subtract is set to the delta
	2596	* that the minimum size of the node can be less than its actual size. And,
	2597	* the node type of the result is changed to reflect that it contains these
	2598	* sequences.
	2599	*
	2600	* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
	2601	* and contains LATIN SMALL LETTER SHARP S
	2602	*
	2603	* This is as good a place as any to discuss the design of handling these
	2604	* problematic sequences. It's been wrong in Perl for a very long time. There
	2605	* are three code points currently in Unicode whose folded lengths differ so
	2606	* much from the un-folded lengths that it causes problems for the optimizer
	2607	* and trie construction. Why only these are problematic, and not others where
	2608	* lengths also differ is something I (khw) do not understand. New versions of
	2609	* Unicode might add more such code points. Hopefully the logic in
	2610	* fold_grind.t that figures out what to test (in part by verifying that each
	2611	* size-combination gets tested) will catch any that do come along, so they can
	2612	* be added to the special handling below. The chances of new ones are
	2613	* actually rather small, as most, if not all, of the world's scripts that have
	2614	* casefolding have already been encoded by Unicode. Also, a number of
	2615	* Unicode's decisions were made to allow compatibility with pre-existing
	2616	* standards, and almost all of those have already been dealt with. These
	2617	* would otherwise be the most likely candidates for generating further tricky
	2618	* sequences. In other words, Unicode by itself is unlikely to add new ones
	2619	* unless it is for compatibility with pre-existing standards, and there aren't
	2620	* many of those left.
	2621	*
	2622	* The previous designs for dealing with these involved assigning a special
	2623	* node for them. This approach doesn't work, as evidenced by this example:
	2624	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	2625	* Both these fold to "sss", but if the pattern is parsed to create a node
	2626	* that would match just the \xDF, it won't be able to handle the case where a
	2627	* successful match would have to cross the node's boundary. The new approach
	2628	* that hopefully generally solves the problem generates an EXACTFU_SS node
	2629	* that is "sss".
	2630	*
	2631	* There are a number of components to the approach (a lot of work for just
	2632	* three code points!):
	2633	* 1) This routine examines each EXACTFish node that could contain the
	2634	* problematic sequences. It returns in *min_subtract how much to
	2635	* subtract from the actual length of the string to get a real minimum
	2636	* for one that could match it. This number is usually 0 except for the
	2637	* problematic sequences. This delta is used by the caller to adjust the
	2638	* min length of the match, and the delta between min and max, so that the
	2639	* optimizer doesn't reject these possibilities based on size constraints.
	2640	* 2) These sequences require special handling by the trie code, so this code
	2641	* changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
	2642	* EXACTFU_SS.
	2643	* 3) This is sufficient for the two Greek sequences (described below), but
	2644	* the one involving the Sharp s (\xDF) needs more. The node type
	2645	* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
	2646	* sequence in it. For non-UTF-8 patterns and strings, this is the only
	2647	* case where there is a possible fold length change. That means that a
	2648	* regular EXACTFU node without UTF-8 involvement doesn't have to concern
	2649	* itself with length changes, and so can be processed faster. regexec.c
	2650	* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
	2651	* is pre-folded by regcomp.c. This saves effort in regex matching.
	2652	* However, the pre-folding isn't done for non-UTF8 patterns because the
	2653	* fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
	2654	* nodes fold to isn't known until runtime. The fold possibilities for
	2655	* the non-UTF8 patterns are quite simple, except for the sharp s. All
	2656	* the ones that don't involve a UTF-8 target string are members of a
	2657	* fold-pair, and arrays are set up for all of them so that the other
	2658	* member of the pair can be found quickly. Code elsewhere in this file
	2659	* makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
	2660	* if the pattern isn't UTF-8. This avoids the issues described in the
	2661	* next item.
	2662	* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
	2663	* 'ss' or not is not knowable at compile time. It will match iff the
	2664	* target string is in UTF-8, unlike the EXACTFU nodes, where it always
	2665	* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
	2666	* it can't be folded to "ss" at compile time, unlike EXACTFU does (as
	2667	* described in item 3). An assumption that the optimizer part of
	2668	* regexec.c (probably unwittingly) makes is that a character in the
	2669	* pattern corresponds to at most a single character in the target string.
	2670	* (And I do mean character, and not byte here, unlike other parts of the
	2671	* documentation that have never been updated to account for multibyte
	2672	* Unicode.) This assumption is wrong only in this case, as all other
	2673	* cases are either 1-1 folds when no UTF-8 is involved; or is true by
	2674	* virtue of having this file pre-fold UTF-8 patterns. I'm
	2675	* reluctant to try to change this assumption, so instead the code punts.
	2676	* This routine examines EXACTF nodes for the sharp s, and returns a
	2677	* boolean indicating whether or not the node is an EXACTF node that
	2678	* contains a sharp s. When it is true, the caller sets a flag that later
	2679	* causes the optimizer in this file to not set values for the floating
	2680	* and fixed string lengths, and thus avoids the optimizer code in
	2681	* regexec.c that makes the invalid assumption. Thus, there is no
	2682	* optimization based on string lengths for EXACTF nodes that contain the
	2683	* sharp s. This only happens for /id rules (which means the pattern
	2684	* isn't in UTF-8).
	2685	*/
	2686
		/* Call join_exact() on 'scan', but only when its node kind is EXACT.
		* 'pRExC_state' and 'depth' are taken from the expansion site, not
		* from the macro arguments; NULL is passed for 'val' and depth+1 for
		* the depth.
		* NOTE(review): this expands to a bare 'if' without braces, so an
		* 'else' written directly after a use would bind to it. */
	2687	#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
	2688	if (PL_regkind[OP(scan)] == EXACT) \
	2689	join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
	2690
	2691	STATIC U32
	2692	S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
	2693	/* Merge several consecutive EXACTish nodes into one.
		*
		* scan: first node of the run; later nodes of the same
		* type are appended to its string, and NOTHING-kind
		* nodes in the chain are skipped over.
		* min_subtract: output; reset to 0 here, then increased for each
		* problematic fold sequence found in the joined string.
		* has_exactf_sharp_s: output; reset to FALSE here, set to TRUE only for
		* an EXACTF node containing LATIN SMALL LETTER SHARP S.
		* flags, val: used only under EXPERIMENTAL_INPLACESCAN.
		* depth: used only for DEBUGGING output.
		*
		* Returns 'stopnow', which is non-zero only when the
		* EXPERIMENTAL_INPLACESCAN code attached 'val'. */
	2694	regnode *n = regnext(scan);
	2695	U32 stringok = 1;
	2696	regnode *next = scan + NODE_SZ_STR(scan);
	2697	U32 merged = 0;
	2698	U32 stopnow = 0;
	2699	#ifdef DEBUGGING
	2700	regnode *stop = scan;
	2701	GET_RE_DEBUG_FLAGS_DECL;
	2702	#else
	2703	PERL_UNUSED_ARG(depth);
	2704	#endif
	2705
	2706	PERL_ARGS_ASSERT_JOIN_EXACT;
	2707	#ifndef EXPERIMENTAL_INPLACESCAN
	2708	PERL_UNUSED_ARG(flags);
	2709	PERL_UNUSED_ARG(val);
	2710	#endif
	2711	DEBUG_PEEP("join",scan,depth);
	2712
	2713	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	2714	* EXACT ones that are mergeable to the current one. */
	2715	while (n
	2716	&& (PL_regkind[OP(n)] == NOTHING
	2717	|| (stringok && OP(n) == OP(scan)))
	2718	&& NEXT_OFF(n)
	2719	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	2720	{
	2721
	2722	if (OP(n) == TAIL || n > next)
	2723	stringok = 0;
	2724	if (PL_regkind[OP(n)] == NOTHING) {
	2725	DEBUG_PEEP("skip:",n,depth);
	2726	NEXT_OFF(scan) += NEXT_OFF(n);
	2727	next = n + NODE_STEP_REGNODE;
	2728	#ifdef DEBUGGING
	2729	if (stringok)
	2730	stop = n;
	2731	#endif
	2732	n = regnext(n);
	2733	}
	2734	else if (stringok) {
	2735	const unsigned int oldl = STR_LEN(scan);
	2736	regnode * const nnext = regnext(n);
	2737
	2738	/* XXX I (khw) kind of doubt that this works on platforms where
	2739	* U8_MAX is above 255 because of lots of other assumptions */
	2740	if (oldl + STR_LEN(n) > U8_MAX)
	2741	break;
	2742
	2743	DEBUG_PEEP("merg",n,depth);
	2744	merged++;
	2745
	2746	NEXT_OFF(scan) += NEXT_OFF(n);
	2747	STR_LEN(scan) += STR_LEN(n);
	2748	next = n + NODE_SZ_STR(n);
	2749	/* Now we can overwrite *n : */
	2750	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	2751	#ifdef DEBUGGING
	2752	stop = next - 1;
	2753	#endif
	2754	n = nnext;
	2755	if (stopnow) break;
	2756	}
	2757
	2758	#ifdef EXPERIMENTAL_INPLACESCAN
	2759	if (flags && !NEXT_OFF(n)) {
	2760	DEBUG_PEEP("atch", val, depth);
	2761	if (reg_off_by_arg[OP(n)]) {
	2762	ARG_SET(n, val - n);
	2763	}
	2764	else {
	2765	NEXT_OFF(n) = val - n;
	2766	}
	2767	stopnow = 1;
	2768	}
	2769	#endif
	2770	}
	2771
	2772	*min_subtract = 0;
	2773	*has_exactf_sharp_s = FALSE;
	2774
	2775	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	2776	* can now analyze for sequences of problematic code points. (Prior to
	2777	* this final joining, sequences could have been split over boundaries, and
	2778	* hence missed). The sequences only happen in folding, hence for any
	2779	* non-EXACT EXACTish node */
	2780	if (OP(scan) != EXACT) {
	2781	U8 *s;
	2782	U8 * s0 = (U8*) STRING(scan);
	2783	U8 * const s_end = s0 + STR_LEN(scan);
	2784
	2785	/* The below is perhaps overboard, but this allows us to save a test
	2786	* each time through the loop at the expense of a mask. This is
	2787	* because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
	2788	* single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
	2789	* This uses an exclusive 'or' to find that bit and then inverts it to
	2790	* form a mask, with just a single 0, in the bit position where 'S' and
	2791	* 's' differ. */
	2792	const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
	2793	const U8 s_masked = 's' & S_or_s_mask;
	2794
	2795	/* One pass is made over the node's string looking for all the
	2796	* possibilities. To avoid some tests in the loop, there are two main
	2797	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	2798	* non-UTF-8 */
	2799	if (UTF) {
	2800
	2801	/* There are two problematic Greek code points in Unicode
	2802	* casefolding
	2803	*
	2804	* U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	2805	* U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
	2806	*
	2807	* which casefold to
	2808	*
	2809	* Unicode UTF-8
	2810	*
	2811	* U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
	2812	* U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
	2813	*
	2814	* This means that in case-insensitive matching (or "loose
	2815	* matching", as Unicode calls it), an EXACTF of length six (the
	2816	* UTF-8 encoded byte length of the above casefolded versions) can
	2817	* match a target string of length two (the byte length of UTF-8
	2818	* encoded U+0390 or U+03B0). This would rather mess up the
	2819	* minimum length computation. (there are other code points that
	2820	* also fold to these two sequences, but the delta is smaller)
	2821	*
	2822	* If these sequences are found, the minimum length is decreased by
	2823	* four (six minus two).
	2824	*
	2825	* Similarly, 'ss' may match the single char and byte LATIN SMALL
	2826	* LETTER SHARP S. We decrease the min length by 1 for each
	2827	* occurrence of 'ss' found */
	2828
	2829	#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE
	2830	#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE
	2831	const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL
	2832	COMBINING_DIAERESIS_UTF8
	2833	COMBINING_ACUTE_ACCENT_UTF8;
	2834	const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL
	2835	COMBINING_DIAERESIS_UTF8
	2836	COMBINING_ACUTE_ACCENT_UTF8;
	2837	const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
	2838	yields a net of 0 */
	2839	/* Examine the string for one of the problematic sequences */
	2840	for (s = s0;
	2841	s < s_end - 1; /* Can stop 1 before the end, as minimum length
	2842	* sequence we are looking for is 2 */
	2843	s += UTF8SKIP(s))
	2844	{
	2845
	2846	/* Look for the first byte in each problematic sequence */
	2847	switch (*s) {
	2848	/* We don't have to worry about other things that fold to
	2849	* 's' (such as the long s, U+017F), as all above-latin1
	2850	* code points have been pre-folded */
	2851	case 's':
	2852	case 'S':
	2853
	2854	/* Current character is an 's' or 'S'. If next one is
	2855	* as well, we have the dreaded sequence */
	2856	if (((*(s+1) & S_or_s_mask) == s_masked)
	2857	/* These two node types don't have special handling
	2858	* for 'ss' */
	2859	&& OP(scan) != EXACTFL && OP(scan) != EXACTFA)
	2860	{
	2861	*min_subtract += 1;
	2862	OP(scan) = EXACTFU_SS;
	2863	s++; /* No need to look at this character again */
	2864	}
	2865	break;
	2866
	2867	case U390_FIRST_BYTE:
	2868	if (s_end - s >= len
	2869
	2870	/* The 1's are because we are skipping comparing the
	2871	* first byte */
	2872	&& memEQ(s + 1, U390_tail, len - 1))
	2873	{
	2874	goto greek_sequence;
	2875	}
	2876	break;
	2877
	2878	case U3B0_FIRST_BYTE:
	2879	if (! (s_end - s >= len
	2880	&& memEQ(s + 1, U3B0_tail, len - 1)))
	2881	{
	2882	break;
	2883	}
	2884	greek_sequence:
	2885	*min_subtract += 4;
	2886
	2887	/* This requires special handling by trie's, so change
	2888	* the node type to indicate this. If EXACTFA and
	2889	* EXACTFL were ever to be handled by trie's, this
	2890	* would have to be changed. If this node has already
	2891	* been changed to EXACTFU_SS in this loop, leave it as
	2892	* is. (I (khw) think it doesn't matter in regexec.c
	2893	* for UTF patterns, but no need to change it */
	2894	if (OP(scan) == EXACTFU) {
	2895	OP(scan) = EXACTFU_TRICKYFOLD;
	2896	}
	2897	s += 4; /* We already know what this sequence is. Step onto
	2898	its final (2-byte) character only: the loop's
		own UTF8SKIP increment then lands just past the
		6-byte sequence. Advancing by the full 6 here
		would make that increment skip the following
		character too, hiding an 'ss' that starts there */
	2899	break;
	2900	}
	2901	}
	2902	}
	2903	else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
	2904
	2905	/* Here, the pattern is not UTF-8. We need to look only for the
	2906	* 'ss' sequence, and in the EXACTF case, the sharp s, which can be
	2907	* in the final position. Otherwise we can stop looking 1 byte
	2908	* earlier because we have to find both the first and second 's' */
	2909	const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
	2910
	2911	for (s = s0; s < upper; s++) {
	2912	switch (*s) {
	2913	case 'S':
	2914	case 's':
	2915	if (s_end - s > 1
	2916	&& ((*(s+1) & S_or_s_mask) == s_masked))
	2917	{
	2918	*min_subtract += 1;
	2919
	2920	/* EXACTF nodes need to know that the minimum
	2921	* length changed so that a sharp s in the string
	2922	* can match this ss in the pattern, but they
	2923	* remain EXACTF nodes, as they won't match this
	2924	* unless the target string is UTF-8, which we
	2925	* don't know until runtime */
	2926	if (OP(scan) != EXACTF) {
	2927	OP(scan) = EXACTFU_SS;
	2928	}
	2929	s++;
	2930	}
	2931	break;
	2932	case LATIN_SMALL_LETTER_SHARP_S:
	2933	if (OP(scan) == EXACTF) {
	2934	*has_exactf_sharp_s = TRUE;
	2935	}
	2936	break;
	2937	}
	2938	}
	2939	}
	2940	}
	2941
	2942	#ifdef DEBUGGING
	2943	/* Allow dumping but overwriting the collection of skipped
	2944	* ops and/or strings with fake optimized ops */
	2945	n = scan + NODE_SZ_STR(scan);
	2946	while (n <= stop) {
	2947	OP(n) = OPTIMIZED;
	2948	FLAGS(n) = 0;
	2949	NEXT_OFF(n) = 0;
	2950	n++;
	2951	}
	2952	#endif
	2953	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
	2954	return stopnow;
	2955	}
	2956
	2957	/* REx optimizer. Converts nodes into quicker variants "in place".
	2958	Finds fixed substrings. */
	2959
	2960	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	2961	to the position after last scanned or to NULL. */
	2962
		/* Allocate the 'and_withp' buffer in the enclosing scope: asserts that
		* 'and_withp' is currently NULL, allocates room for one
		* struct regnode_charclass_class with Newx(), and hands the pointer to
		* SAVEFREEPV().
		* NOTE(review): presumably SAVEFREEPV arranges for the buffer to be
		* freed when the current save scope unwinds -- confirm.
		* NOTE(review): this expands to several statements with no enclosing
		* block, so it should only be used where braces surround it. */
	2963	#define INIT_AND_WITHP \
	2964	assert(!and_withp); \
	2965	Newx(and_withp,1,struct regnode_charclass_class); \
	2966	SAVEFREEPV(and_withp)
	2967
	2968	/* This is a chain of data about sub-patterns we are processing that
	2969	need to be handled separately/specially in study_chunk. It's so
	2970	we can simulate recursion without losing state. */
	2971	struct scan_frame;
	2972	typedef struct scan_frame {
	2973	regnode *last; /* last node to process in this frame */
	2974	regnode *next; /* next node to process when last is reached */
	2975	struct scan_frame *prev; /* previous frame */
	2976	I32 stop; /* what stopparen do we use */
	2977	} scan_frame;
	2978
	2979
		/* Shorthand for scan_commit() that supplies 'is_inf' from the
		* expansion site as the fourth argument, so a variable of that name
		* must be in scope wherever this is used. */
	2980	#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
	2981
		/* Expands to two 'switch' cases, 'nAmE' and its complement 'N<nAmE>',
		* that update the bitmap of data->start_class for the code points
		* 0..255 using the predicate is_<nAmE>_cp():
		*
		* case nAmE, SCF_DO_STCLASS_AND set: clear every bit whose code
		* point fails the predicate
		* case nAmE, otherwise: set every bit whose code
		* point passes the predicate
		* case N<nAmE>: the same with the predicate inverted
		*
		* 'flags', 'value' and 'data' are taken from the expansion site. The
		* final 'break' has no semicolon, so the user of the macro supplies
		* it. */
	2982	#define CASE_SYNST_FNC(nAmE) \
	2983	case nAmE: \
	2984	if (flags & SCF_DO_STCLASS_AND) { \
	2985	for (value = 0; value < 256; value++) \
	2986	if (!is_ ## nAmE ## _cp(value)) \
	2987	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	2988	} \
	2989	else { \
	2990	for (value = 0; value < 256; value++) \
	2991	if (is_ ## nAmE ## _cp(value)) \
	2992	ANYOF_BITMAP_SET(data->start_class, value); \
	2993	} \
	2994	break; \
	2995	case N ## nAmE: \
	2996	if (flags & SCF_DO_STCLASS_AND) { \
	2997	for (value = 0; value < 256; value++) \
	2998	if (is_ ## nAmE ## _cp(value)) \
	2999	ANYOF_BITMAP_CLEAR(data->start_class, value); \
	3000	} \
	3001	else { \
	3002	for (value = 0; value < 256; value++) \
	3003	if (!is_ ## nAmE ## _cp(value)) \
	3004	ANYOF_BITMAP_SET(data->start_class, value); \
	3005	} \
	3006	break
	3007
	3008
	3009
	3010	STATIC I32
	3011	S_study_chunk(pTHX_ RExC_state_t pRExC_state, regnode *scanp,
	3012	I32 minlenp, I32 deltap,
	3013	regnode *last,
	3014	scan_data_t *data,
	3015	I32 stopparen,
	3016	U8* recursed,
	3017	struct regnode_charclass_class *and_withp,
	3018	U32 flags, U32 depth)
	3019	/* scanp: Start here (read-write). */
	3020	/* deltap: Write maxlen-minlen here. */
	3021	/* last: Stop before this one. */
	3022	/* data: string data about the pattern */
	3023	/* stopparen: treat close N as END */
	3024	/* recursed: which subroutines have we recursed into */
	3025	/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
	3026	{
	3027	dVAR;
	3028	I32 min = 0, pars = 0, code;
	3029	regnode scan = scanp, *next;
	3030	I32 delta = 0;
	3031	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	3032	int is_inf_internal = 0; /* The studied chunk is infinite */
	3033	I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
	3034	scan_data_t data_fake;
	3035	SV *re_trie_maxbuff = NULL;
	3036	regnode *first_non_open = scan;
	3037	I32 stopmin = I32_MAX;
	3038	scan_frame *frame = NULL;
	3039	GET_RE_DEBUG_FLAGS_DECL;
	3040
	3041	PERL_ARGS_ASSERT_STUDY_CHUNK;
	3042
	3043	#ifdef DEBUGGING
	3044	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3045	#endif
	3046
	3047	if ( depth == 0 ) {
	3048	while (first_non_open && OP(first_non_open) == OPEN)
	3049	first_non_open=regnext(first_non_open);
	3050	}
	3051
	3052
	3053	fake_study_recurse:
	3054	while ( scan && OP(scan) != END && scan < last ){
	3055	UV min_subtract = 0; /* How much to subtract from the minimum node
	3056	length to get a real minimum (because the
	3057	folded version may be shorter) */
	3058	bool has_exactf_sharp_s = FALSE;
	3059	/* Peephole optimizer: */
	3060	DEBUG_STUDYDATA("Peep:", data,depth);
	3061	DEBUG_PEEP("Peep",scan,depth);
	3062
	3063	/* Its not clear to khw or hv why this is done here, and not in the
	3064	* clauses that deal with EXACT nodes. khw's guess is that it's
	3065	* because of a previous design */
	3066	JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
	3067
	3068	/* Follow the next-chain of the current node and optimize
	3069	away all the NOTHINGs from it. */
	3070	if (OP(scan) != CURLYX) {
	3071	const int max = (reg_off_by_arg[OP(scan)]
	3072	? I32_MAX
	3073	/* I32 may be smaller than U16 on CRAYs! */
	3074	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	3075	int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
	3076	int noff;
	3077	regnode *n = scan;
	3078
	3079	/* Skip NOTHING and LONGJMP. */
	3080	while ((n = regnext(n))
	3081	&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
	3082	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n))))
	3083	&& off + noff < max)
	3084	off += noff;
	3085	if (reg_off_by_arg[OP(scan)])
	3086	ARG(scan) = off;
	3087	else
	3088	NEXT_OFF(scan) = off;
	3089	}
	3090
	3091
	3092
	3093	/* The principal pseudo-switch. Cannot be a switch, since we
	3094	look into several different things. */
	3095	if (OP(scan) == BRANCH \|\| OP(scan) == BRANCHJ
	3096	\|\| OP(scan) == IFTHEN) {
	3097	next = regnext(scan);
	3098	code = OP(scan);
	3099	/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
	3100
	3101	if (OP(next) == code \|\| code == IFTHEN) {
	3102	/* NOTE - There is similar code to this block below for handling
	3103	TRIE nodes on a re-study. If you change stuff here check there
	3104	too. */
	3105	I32 max1 = 0, min1 = I32_MAX, num = 0;
	3106	struct regnode_charclass_class accum;
	3107	regnode * const startbranch=scan;
	3108
	3109	if (flags & SCF_DO_SUBSTR)
	3110	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
	3111	if (flags & SCF_DO_STCLASS)
	3112	cl_init_zero(pRExC_state, &accum);
	3113
	3114	while (OP(scan) == code) {
	3115	I32 deltanext, minnext, f = 0, fake;
	3116	struct regnode_charclass_class this_class;
	3117
	3118	num++;
	3119	data_fake.flags = 0;
	3120	if (data) {
	3121	data_fake.whilem_c = data->whilem_c;
	3122	data_fake.last_closep = data->last_closep;
	3123	}
	3124	else
	3125	data_fake.last_closep = &fake;
	3126
	3127	data_fake.pos_delta = delta;
	3128	next = regnext(scan);
	3129	scan = NEXTOPER(scan);
	3130	if (code != BRANCH)
	3131	scan = NEXTOPER(scan);
	3132	if (flags & SCF_DO_STCLASS) {
	3133	cl_init(pRExC_state, &this_class);
	3134	data_fake.start_class = &this_class;
	3135	f = SCF_DO_STCLASS_AND;
	3136	}
	3137	if (flags & SCF_WHILEM_VISITED_POS)
	3138	f \|= SCF_WHILEM_VISITED_POS;
	3139
	3140	/* we suppose the run is continuous, last=next...*/
	3141	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3142	next, &data_fake,
	3143	stopparen, recursed, NULL, f,depth+1);
	3144	if (min1 > minnext)
	3145	min1 = minnext;
	3146	if (max1 < minnext + deltanext)
	3147	max1 = minnext + deltanext;
	3148	if (deltanext == I32_MAX)
	3149	is_inf = is_inf_internal = 1;
	3150	scan = next;
	3151	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3152	pars++;
	3153	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3154	if ( stopmin > minnext)
	3155	stopmin = min + min1;
	3156	flags &= ~SCF_DO_SUBSTR;
	3157	if (data)
	3158	data->flags \|= SCF_SEEN_ACCEPT;
	3159	}
	3160	if (data) {
	3161	if (data_fake.flags & SF_HAS_EVAL)
	3162	data->flags \|= SF_HAS_EVAL;
	3163	data->whilem_c = data_fake.whilem_c;
	3164	}
	3165	if (flags & SCF_DO_STCLASS)
	3166	cl_or(pRExC_state, &accum, &this_class);
	3167	}
	3168	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	3169	min1 = 0;
	3170	if (flags & SCF_DO_SUBSTR) {
	3171	data->pos_min += min1;
	3172	data->pos_delta += max1 - min1;
	3173	if (max1 != min1 \|\| is_inf)
	3174	data->longest = &(data->longest_float);
	3175	}
	3176	min += min1;
	3177	delta += max1 - min1;
	3178	if (flags & SCF_DO_STCLASS_OR) {
	3179	cl_or(pRExC_state, data->start_class, &accum);
	3180	if (min1) {
	3181	cl_and(data->start_class, and_withp);
	3182	flags &= ~SCF_DO_STCLASS;
	3183	}
	3184	}
	3185	else if (flags & SCF_DO_STCLASS_AND) {
	3186	if (min1) {
	3187	cl_and(data->start_class, &accum);
	3188	flags &= ~SCF_DO_STCLASS;
	3189	}
	3190	else {
	3191	/* Switch to OR mode: cache the old value of
	3192	* data->start_class */
	3193	INIT_AND_WITHP;
	3194	StructCopy(data->start_class, and_withp,
	3195	struct regnode_charclass_class);
	3196	flags &= ~SCF_DO_STCLASS_AND;
	3197	StructCopy(&accum, data->start_class,
	3198	struct regnode_charclass_class);
	3199	flags \|= SCF_DO_STCLASS_OR;
	3200	data->start_class->flags \|= ANYOF_EOS;
	3201	}
	3202	}
	3203
	3204	if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
	3205	/* demq.
	3206
	3207	Assuming this was/is a branch we are dealing with: 'scan' now
	3208	points at the item that follows the branch sequence, whatever
	3209	it is. We now start at the beginning of the sequence and look
	3210	for subsequences of
	3211
	3212	BRANCH->EXACT=>x1
	3213	BRANCH->EXACT=>x2
	3214	tail
	3215
	3216	which would be constructed from a pattern like /A\|LIST\|OF\|WORDS/
	3217
	3218	If we can find such a subsequence we need to turn the first
	3219	element into a trie and then add the subsequent branch exact
	3220	strings to the trie.
	3221
	3222	We have two cases
	3223
	3224	1. patterns where the whole set of branches can be converted.
	3225
	3226	2. patterns where only a subset can be converted.
	3227
	3228	In case 1 we can replace the whole set with a single regop
	3229	for the trie. In case 2 we need to keep the start and end
	3230	branches so
	3231
	3232	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	3233	becomes BRANCH TRIE; BRANCH X;
	3234
	3235	There is an additional case, that being where there is a
	3236	common prefix, which gets split out into an EXACT like node
	3237	preceding the TRIE node.
	3238
	3239	If x(1..n)==tail then we can do a simple trie, if not we make
	3240	a "jump" trie, such that when we match the appropriate word
	3241	we "jump" to the appropriate tail node. Essentially we turn
	3242	a nested if into a case structure of sorts.
	3243
	3244	*/
	3245
	3246	int made=0;
	3247	if (!re_trie_maxbuff) {
	3248	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	3249	if (!SvIOK(re_trie_maxbuff))
	3250	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	3251	}
	3252	if ( SvIV(re_trie_maxbuff)>=0 ) {
	3253	regnode *cur;
	3254	regnode first = (regnode )NULL;
	3255	regnode last = (regnode )NULL;
	3256	regnode *tail = scan;
	3257	U8 trietype = 0;
	3258	U32 count=0;
	3259
	3260	#ifdef DEBUGGING
	3261	SV * const mysv = sv_newmortal(); /* for dumping */
	3262	#endif
	3263	/* var tail is used because there may be a TAIL
	3264	regop in the way. Ie, the exacts will point to the
	3265	thing following the TAIL, but the last branch will
	3266	point at the TAIL. So we advance tail. If we
	3267	have nested (?:) we may have to move through several
	3268	tails.
	3269	*/
	3270
	3271	while ( OP( tail ) == TAIL ) {
	3272	/* this is the TAIL generated by (?:) */
	3273	tail = regnext( tail );
	3274	}
	3275
	3276
	3277	DEBUG_TRIE_COMPILE_r({
	3278	regprop(RExC_rx, mysv, tail );
	3279	PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
	3280	(int)depth * 2 + 2, "",
	3281	"Looking for TRIE'able sequences. Tail node is: ",
	3282	SvPV_nolen_const( mysv )
	3283	);
	3284	});
	3285
	3286	/*
	3287
	3288	Step through the branches
	3289	cur represents each branch,
	3290	noper is the first thing to be matched as part of that branch
	3291	noper_next is the regnext() of that node.
	3292
	3293	We normally handle a case like this /FOO[xyz]\|BAR[pqr]/
	3294	via a "jump trie" but we also support building with NOJUMPTRIE,
	3295	which restricts the trie logic to structures like /FOO\|BAR/.
	3296
	3297	If noper is a trieable nodetype then the branch is a possible optimization
	3298	target. If we are building under NOJUMPTRIE then we require that noper_next
	3299	is the same as scan (our current position in the regex program).
	3300
	3301	Once we have two or more consecutive such branches we can create a
	3302	trie of the EXACT's contents and stitch it in place into the program.
	3303
	3304	If the sequence represents all of the branches in the alternation we
	3305	replace the entire thing with a single TRIE node.
	3306
	3307	Otherwise when it is a subsequence we need to stitch it in place and
	3308	replace only the relevant branches. This means the first branch has
	3309	to remain as it is used by the alternation logic, and its next pointer,
	3310	and needs to be repointed at the item on the branch chain following
	3311	the last branch we have optimized away.
	3312
	3313	This could be either a BRANCH, in which case the subsequence is internal,
	3314	or it could be the item following the branch sequence in which case the
	3315	subsequence is at the end (which does not necessarily mean the first node
	3316	is the start of the alternation).
	3317
	3318	TRIE_TYPE(X) is a define which maps the optype to a trietype.
	3319
	3320	optype \| trietype
	3321	----------------+-----------
	3322	NOTHING \| NOTHING
	3323	EXACT \| EXACT
	3324	EXACTFU \| EXACTFU
	3325	EXACTFU_SS \| EXACTFU
	3326	EXACTFU_TRICKYFOLD \| EXACTFU
	3327	EXACTFA \| 0
	3328
	3329
	3330	*/
	3331	#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
	3332	( EXACT == (X) ) ? EXACT : \
	3333	( EXACTFU == (X) \|\| EXACTFU_SS == (X) \|\| EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
	3334	0 )
	3335
	3336	/* dont use tail as the end marker for this traverse */
	3337	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	3338	regnode * const noper = NEXTOPER( cur );
	3339	U8 noper_type = OP( noper );
	3340	U8 noper_trietype = TRIE_TYPE( noper_type );
	3341	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	3342	regnode * const noper_next = regnext( noper );
	3343	U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
	3344	U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
	3345	#endif
	3346
	3347	DEBUG_TRIE_COMPILE_r({
	3348	regprop(RExC_rx, mysv, cur);
	3349	PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
	3350	(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
	3351
	3352	regprop(RExC_rx, mysv, noper);
	3353	PerlIO_printf( Perl_debug_log, " -> %s",
	3354	SvPV_nolen_const(mysv));
	3355
	3356	if ( noper_next ) {
	3357	regprop(RExC_rx, mysv, noper_next );
	3358	PerlIO_printf( Perl_debug_log,"\t=> %s\t",
	3359	SvPV_nolen_const(mysv));
	3360	}
	3361	PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
	3362	REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
	3363	PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
	3364	);
	3365	});
	3366
	3367	/* Is noper a trieable nodetype that can be merged with the
	3368	* current trie (if there is one)? */
	3369	if ( noper_trietype
	3370	&&
	3371	(
	3372	( noper_trietype == NOTHING)
	3373	\|\| ( trietype == NOTHING )
	3374	\|\| ( trietype == noper_trietype )
	3375	)
	3376	#ifdef NOJUMPTRIE
	3377	&& noper_next == tail
	3378	#endif
	3379	&& count < U16_MAX)
	3380	{
	3381	/* Handle mergable triable node
	3382	* Either we are the first node in a new trieable sequence,
	3383	* in which case we do some bookkeeping, otherwise we update
	3384	* the end pointer. */
	3385	if ( !first ) {
	3386	first = cur;
	3387	if ( noper_trietype == NOTHING ) {
	3388	#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
	3389	regnode * const noper_next = regnext( noper );
	3390	U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
	3391	U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
	3392	#endif
	3393
	3394	if ( noper_next_trietype ) {
	3395	trietype = noper_next_trietype;
	3396	} else if (noper_next_type) {
	3397	/* a NOTHING regop is 1 regop wide. We need at least two
	3398	* for a trie so we can't merge this in */
	3399	first = NULL;
	3400	}
	3401	} else {
	3402	trietype = noper_trietype;
	3403	}
	3404	} else {
	3405	if ( trietype == NOTHING )
	3406	trietype = noper_trietype;
	3407	last = cur;
	3408	}
	3409	if (first)
	3410	count++;
	3411	} /* end handle mergable triable node */
	3412	else {
	3413	/* handle unmergable node -
	3414	* noper may either be a triable node which can not be tried
	3415	* together with the current trie, or a non triable node */
	3416	if ( last ) {
	3417	/* If last is set and trietype is not NOTHING then we have found
	3418	* at least two triable branch sequences in a row of a similar
	3419	* trietype so we can turn them into a trie. If/when we
	3420	* allow NOTHING to start a trie sequence this condition will be
	3421	* required, and it isn't expensive so we leave it in for now. */
	3422	if ( trietype != NOTHING )
	3423	make_trie( pRExC_state,
	3424	startbranch, first, cur, tail, count,
	3425	trietype, depth+1 );
	3426	last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
	3427	}
	3428	if ( noper_trietype
	3429	#ifdef NOJUMPTRIE
	3430	&& noper_next == tail
	3431	#endif
	3432	){
	3433	/* noper is triable, so we can start a new trie sequence */
	3434	count = 1;
	3435	first = cur;
	3436	trietype = noper_trietype;
	3437	} else if (first) {
	3438	/* if we already saw a first but the current node is not triable then we have
	3439	* to reset the first information. */
	3440	count = 0;
	3441	first = NULL;
	3442	trietype = 0;
	3443	}
	3444	} /* end handle unmergable node */
	3445	} /* loop over branches */
	3446	DEBUG_TRIE_COMPILE_r({
	3447	regprop(RExC_rx, mysv, cur);
	3448	PerlIO_printf( Perl_debug_log,
	3449	"%s- %s (%d) <SCAN FINISHED>\n", (int)depth 2 + 2,
	3450	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3451
	3452	});
	3453	if ( last ) {
	3454	if ( trietype != NOTHING ) {
	3455	/* the last branch of the sequence was part of a trie,
	3456	* so we have to construct it here outside of the loop
	3457	*/
	3458	made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
	3459	#ifdef TRIE_STUDY_OPT
	3460	if ( ((made == MADE_EXACT_TRIE &&
	3461	startbranch == first)
	3462	\|\| ( first_non_open == first )) &&
	3463	depth==0 ) {
	3464	flags \|= SCF_TRIE_RESTUDY;
	3465	if ( startbranch == first
	3466	&& scan == tail )
	3467	{
	3468	RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
	3469	}
	3470	}
	3471	#endif
	3472	} else {
	3473	/* at this point we know whatever we have is a NOTHING sequence/branch
	3474	* AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
	3475	*/
	3476	if ( startbranch == first ) {
	3477	regnode *opt;
	3478	/* the entire thing is a NOTHING sequence, something like this:
	3479	* (?:\|) So we can turn it into a plain NOTHING op. */
	3480	DEBUG_TRIE_COMPILE_r({
	3481	regprop(RExC_rx, mysv, cur);
	3482	PerlIO_printf( Perl_debug_log,
	3483	"%s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth 2 + 2,
	3484	"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
	3485
	3486	});
	3487	OP(startbranch)= NOTHING;
	3488	NEXT_OFF(startbranch)= tail - startbranch;
	3489	for ( opt= startbranch + 1; opt < tail ; opt++ )
	3490	OP(opt)= OPTIMIZED;
	3491	}
	3492	}
	3493	} /* end if ( last) */
	3494	} /* TRIE_MAXBUF is non zero */
	3495
	3496	} /* do trie */
	3497
	3498	}
	3499	else if ( code == BRANCHJ ) { /* single branch is optimized. */
	3500	scan = NEXTOPER(NEXTOPER(scan));
	3501	} else /* single branch is optimized. */
	3502	scan = NEXTOPER(scan);
	3503	continue;
	3504	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB \|\| OP(scan) == GOSTART) {
	3505	scan_frame *newframe = NULL;
	3506	I32 paren;
	3507	regnode *start;
	3508	regnode *end;
	3509
	3510	if (OP(scan) != SUSPEND) {
	3511	/* set the pointer */
	3512	if (OP(scan) == GOSUB) {
	3513	paren = ARG(scan);
	3514	RExC_recurse[ARG2L(scan)] = scan;
	3515	start = RExC_open_parens[paren-1];
	3516	end = RExC_close_parens[paren-1];
	3517	} else {
	3518	paren = 0;
	3519	start = RExC_rxi->program + 1;
	3520	end = RExC_opend;
	3521	}
	3522	if (!recursed) {
	3523	Newxz(recursed, (((RExC_npar)>>3) +1), U8);
	3524	SAVEFREEPV(recursed);
	3525	}
	3526	if (!PAREN_TEST(recursed,paren+1)) {
	3527	PAREN_SET(recursed,paren+1);
	3528	Newx(newframe,1,scan_frame);
	3529	} else {
	3530	if (flags & SCF_DO_SUBSTR) {
	3531	SCAN_COMMIT(pRExC_state,data,minlenp);
	3532	data->longest = &(data->longest_float);
	3533	}
	3534	is_inf = is_inf_internal = 1;
	3535	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3536	cl_anything(pRExC_state, data->start_class);
	3537	flags &= ~SCF_DO_STCLASS;
	3538	}
	3539	} else {
	3540	Newx(newframe,1,scan_frame);
	3541	paren = stopparen;
	3542	start = scan+2;
	3543	end = regnext(scan);
	3544	}
	3545	if (newframe) {
	3546	assert(start);
	3547	assert(end);
	3548	SAVEFREEPV(newframe);
	3549	newframe->next = regnext(scan);
	3550	newframe->last = last;
	3551	newframe->stop = stopparen;
	3552	newframe->prev = frame;
	3553
	3554	frame = newframe;
	3555	scan = start;
	3556	stopparen = paren;
	3557	last = end;
	3558
	3559	continue;
	3560	}
	3561	}
	3562	else if (OP(scan) == EXACT) {
	3563	I32 l = STR_LEN(scan);
	3564	UV uc;
	3565	if (UTF) {
	3566	const U8 * const s = (U8*)STRING(scan);
	3567	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3568	l = utf8_length(s, s + l);
	3569	} else {
	3570	uc = ((U8)STRING(scan));
	3571	}
	3572	min += l;
	3573	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	3574	/* The code below prefers earlier match for fixed
	3575	offset, later match for variable offset. */
	3576	if (data->last_end == -1) { /* Update the start info. */
	3577	data->last_start_min = data->pos_min;
	3578	data->last_start_max = is_inf
	3579	? I32_MAX : data->pos_min + data->pos_delta;
	3580	}
	3581	sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
	3582	if (UTF)
	3583	SvUTF8_on(data->last_found);
	3584	{
	3585	SV * const sv = data->last_found;
	3586	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	3587	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	3588	if (mg && mg->mg_len >= 0)
	3589	mg->mg_len += utf8_length((U8*)STRING(scan),
	3590	(U8*)STRING(scan)+STR_LEN(scan));
	3591	}
	3592	data->last_end = data->pos_min + l;
	3593	data->pos_min += l; /* As in the first entry. */
	3594	data->flags &= ~SF_BEFORE_EOL;
	3595	}
	3596	if (flags & SCF_DO_STCLASS_AND) {
	3597	/* Check whether it is compatible with what we know already! */
	3598	int compat = 1;
	3599
	3600
	3601	/* If compatible, we or it in below. It is compatible if it is
	3602	* in the bitmap and either 1) its bit or its fold is set, or 2)
	3603	* it's for a locale. Even if there isn't unicode semantics
	3604	* here, at runtime there may be because of matching against a
	3605	* utf8 string, so accept a possible false positive for
	3606	* latin1-range folds */
	3607	if (uc >= 0x100 \|\|
	3608	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3609	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3610	&& (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
	3611	\|\| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3612	)
	3613	{
	3614	compat = 0;
	3615	}
	3616	ANYOF_CLASS_ZERO(data->start_class);
	3617	ANYOF_BITMAP_ZERO(data->start_class);
	3618	if (compat)
	3619	ANYOF_BITMAP_SET(data->start_class, uc);
	3620	else if (uc >= 0x100) {
	3621	int i;
	3622
	3623	/* Some Unicode code points fold to the Latin1 range; as
	3624	* XXX temporary code, instead of figuring out if this is
	3625	* one, just assume it is and set all the start class bits
	3626	* that could be some such above 255 code point's fold
	3627	* which will generate false positives. As the code
	3628	* elsewhere that does compute the fold settles down, it
	3629	* can be extracted out and re-used here */
	3630	for (i = 0; i < 256; i++){
	3631	if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
	3632	ANYOF_BITMAP_SET(data->start_class, i);
	3633	}
	3634	}
	3635	}
	3636	data->start_class->flags &= ~ANYOF_EOS;
	3637	if (uc < 0x100)
	3638	data->start_class->flags &= ~ANYOF_UNICODE_ALL;
	3639	}
	3640	else if (flags & SCF_DO_STCLASS_OR) {
	3641	/* false positive possible if the class is case-folded */
	3642	if (uc < 0x100)
	3643	ANYOF_BITMAP_SET(data->start_class, uc);
	3644	else
	3645	data->start_class->flags \|= ANYOF_UNICODE_ALL;
	3646	data->start_class->flags &= ~ANYOF_EOS;
	3647	cl_and(data->start_class, and_withp);
	3648	}
	3649	flags &= ~SCF_DO_STCLASS;
	3650	}
	3651	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
	3652	I32 l = STR_LEN(scan);
	3653	UV uc = ((U8)STRING(scan));
	3654
	3655	/* Search for fixed substrings supports EXACT only. */
	3656	if (flags & SCF_DO_SUBSTR) {
	3657	assert(data);
	3658	SCAN_COMMIT(pRExC_state, data, minlenp);
	3659	}
	3660	if (UTF) {
	3661	const U8 * const s = (U8 *)STRING(scan);
	3662	uc = utf8_to_uvchr_buf(s, s + l, NULL);
	3663	l = utf8_length(s, s + l);
	3664	}
	3665	if (has_exactf_sharp_s) {
	3666	RExC_seen \|= REG_SEEN_EXACTF_SHARP_S;
	3667	}
	3668	min += l - min_subtract;
	3669	if (min < 0) {
	3670	min = 0;
	3671	}
	3672	delta += min_subtract;
	3673	if (flags & SCF_DO_SUBSTR) {
	3674	data->pos_min += l - min_subtract;
	3675	if (data->pos_min < 0) {
	3676	data->pos_min = 0;
	3677	}
	3678	data->pos_delta += min_subtract;
	3679	if (min_subtract) {
	3680	data->longest = &(data->longest_float);
	3681	}
	3682	}
	3683	if (flags & SCF_DO_STCLASS_AND) {
	3684	/* Check whether it is compatible with what we know already! */
	3685	int compat = 1;
	3686	if (uc >= 0x100 \|\|
	3687	(!(data->start_class->flags & (ANYOF_CLASS \| ANYOF_LOCALE))
	3688	&& !ANYOF_BITMAP_TEST(data->start_class, uc)
	3689	&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
	3690	{
	3691	compat = 0;
	3692	}
	3693	ANYOF_CLASS_ZERO(data->start_class);
	3694	ANYOF_BITMAP_ZERO(data->start_class);
	3695	if (compat) {
	3696	ANYOF_BITMAP_SET(data->start_class, uc);
	3697	data->start_class->flags &= ~ANYOF_EOS;
	3698	data->start_class->flags \|= ANYOF_LOC_NONBITMAP_FOLD;
	3699	if (OP(scan) == EXACTFL) {
	3700	/* XXX This set is probably no longer necessary, and
	3701	* probably wrong as LOCALE now is on in the initial
	3702	* state */
	3703	data->start_class->flags \|= ANYOF_LOCALE;
	3704	}
	3705	else {
	3706
	3707	/* Also set the other member of the fold pair. In case
	3708	* that unicode semantics is called for at runtime, use
	3709	* the full latin1 fold. (Can't do this for locale,
	3710	* because not known until runtime) */
	3711	ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
	3712
	3713	/* All other (EXACTFL handled above) folds except under
	3714	* /iaa that include s, S, and sharp_s also may include
	3715	* the others */
	3716	if (OP(scan) != EXACTFA) {
	3717	if (uc == 's' \|\| uc == 'S') {
	3718	ANYOF_BITMAP_SET(data->start_class,
	3719	LATIN_SMALL_LETTER_SHARP_S);
	3720	}
	3721	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3722	ANYOF_BITMAP_SET(data->start_class, 's');
	3723	ANYOF_BITMAP_SET(data->start_class, 'S');
	3724	}
	3725	}
	3726	}
	3727	}
	3728	else if (uc >= 0x100) {
	3729	int i;
	3730	for (i = 0; i < 256; i++){
	3731	if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
	3732	ANYOF_BITMAP_SET(data->start_class, i);
	3733	}
	3734	}
	3735	}
	3736	}
	3737	else if (flags & SCF_DO_STCLASS_OR) {
	3738	if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
	3739	/* false positive possible if the class is case-folded.
	3740	Assume that the locale settings are the same... */
	3741	if (uc < 0x100) {
	3742	ANYOF_BITMAP_SET(data->start_class, uc);
	3743	if (OP(scan) != EXACTFL) {
	3744
	3745	/* And set the other member of the fold pair, but
	3746	* can't do that in locale because not known until
	3747	* run-time */
	3748	ANYOF_BITMAP_SET(data->start_class,
	3749	PL_fold_latin1[uc]);
	3750
	3751	/* All folds except under /iaa that include s, S,
	3752	* and sharp_s also may include the others */
	3753	if (OP(scan) != EXACTFA) {
	3754	if (uc == 's' \|\| uc == 'S') {
	3755	ANYOF_BITMAP_SET(data->start_class,
	3756	LATIN_SMALL_LETTER_SHARP_S);
	3757	}
	3758	else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
	3759	ANYOF_BITMAP_SET(data->start_class, 's');
	3760	ANYOF_BITMAP_SET(data->start_class, 'S');
	3761	}
	3762	}
	3763	}
	3764	}
	3765	data->start_class->flags &= ~ANYOF_EOS;
	3766	}
	3767	cl_and(data->start_class, and_withp);
	3768	}
	3769	flags &= ~SCF_DO_STCLASS;
	3770	}
	3771	else if (REGNODE_VARIES(OP(scan))) {
	3772	I32 mincount, maxcount, minnext, deltanext, fl = 0;
	3773	I32 f = flags, pos_before = 0;
	3774	regnode * const oscan = scan;
	3775	struct regnode_charclass_class this_class;
	3776	struct regnode_charclass_class *oclass = NULL;
	3777	I32 next_is_eval = 0;
	3778
	3779	switch (PL_regkind[OP(scan)]) {
	3780	case WHILEM: /* End of (?:...)* . */
	3781	scan = NEXTOPER(scan);
	3782	goto finish;
	3783	case PLUS:
	3784	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	3785	next = NEXTOPER(scan);
	3786	if (OP(next) == EXACT \|\| (flags & SCF_DO_STCLASS)) {
	3787	mincount = 1;
	3788	maxcount = REG_INFTY;
	3789	next = regnext(scan);
	3790	scan = NEXTOPER(scan);
	3791	goto do_curly;
	3792	}
	3793	}
	3794	if (flags & SCF_DO_SUBSTR)
	3795	data->pos_min++;
	3796	min++;
	3797	/* Fall through. */
	3798	case STAR:
	3799	if (flags & SCF_DO_STCLASS) {
	3800	mincount = 0;
	3801	maxcount = REG_INFTY;
	3802	next = regnext(scan);
	3803	scan = NEXTOPER(scan);
	3804	goto do_curly;
	3805	}
	3806	is_inf = is_inf_internal = 1;
	3807	scan = regnext(scan);
	3808	if (flags & SCF_DO_SUBSTR) {
	3809	SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
	3810	data->longest = &(data->longest_float);
	3811	}
	3812	goto optimize_curly_tail;
	3813	case CURLY:
	3814	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	3815	&& (scan->flags == stopparen))
	3816	{
	3817	mincount = 1;
	3818	maxcount = 1;
	3819	} else {
	3820	mincount = ARG1(scan);
	3821	maxcount = ARG2(scan);
	3822	}
	3823	next = regnext(scan);
	3824	if (OP(scan) == CURLYX) {
	3825	I32 lp = (data ? *(data->last_closep) : 0);
	3826	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	3827	}
	3828	scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
	3829	next_is_eval = (OP(scan) == EVAL);
	3830	do_curly:
	3831	if (flags & SCF_DO_SUBSTR) {
	3832	if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
	3833	pos_before = data->pos_min;
	3834	}
	3835	if (data) {
	3836	fl = data->flags;
	3837	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	3838	if (is_inf)
	3839	data->flags \|= SF_IS_INF;
	3840	}
	3841	if (flags & SCF_DO_STCLASS) {
	3842	cl_init(pRExC_state, &this_class);
	3843	oclass = data->start_class;
	3844	data->start_class = &this_class;
	3845	f \|= SCF_DO_STCLASS_AND;
	3846	f &= ~SCF_DO_STCLASS_OR;
	3847	}
	3848	/* Exclude from super-linear cache processing any {n,m}
	3849	regops for which the combination of input pos and regex
	3850	pos is not enough information to determine if a match
	3851	will be possible.
	3852
	3853	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	3854	regex pos at the \s*, the prospects for a match depend not
	3855	only on the input position but also on how many (bar\s*)
	3856	repeats into the {4,8} we are. */
	3857	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	3858	f &= ~SCF_WHILEM_VISITED_POS;
	3859
	3860	/* This will finish on WHILEM, setting scan, or on NULL: */
	3861	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	3862	last, data, stopparen, recursed, NULL,
	3863	(mincount == 0
	3864	? (f & ~SCF_DO_SUBSTR) : f),depth+1);
	3865
	3866	if (flags & SCF_DO_STCLASS)
	3867	data->start_class = oclass;
	3868	if (mincount == 0 \|\| minnext == 0) {
	3869	if (flags & SCF_DO_STCLASS_OR) {
	3870	cl_or(pRExC_state, data->start_class, &this_class);
	3871	}
	3872	else if (flags & SCF_DO_STCLASS_AND) {
	3873	/* Switch to OR mode: cache the old value of
	3874	* data->start_class */
	3875	INIT_AND_WITHP;
	3876	StructCopy(data->start_class, and_withp,
	3877	struct regnode_charclass_class);
	3878	flags &= ~SCF_DO_STCLASS_AND;
	3879	StructCopy(&this_class, data->start_class,
	3880	struct regnode_charclass_class);
	3881	flags \|= SCF_DO_STCLASS_OR;
	3882	data->start_class->flags \|= ANYOF_EOS;
	3883	}
	3884	} else { /* Non-zero len */
	3885	if (flags & SCF_DO_STCLASS_OR) {
	3886	cl_or(pRExC_state, data->start_class, &this_class);
	3887	cl_and(data->start_class, and_withp);
	3888	}
	3889	else if (flags & SCF_DO_STCLASS_AND)
	3890	cl_and(data->start_class, &this_class);
	3891	flags &= ~SCF_DO_STCLASS;
	3892	}
	3893	if (!scan) /* It was not CURLYX, but CURLY. */
	3894	scan = next;
	3895	if ( /* ? quantifier ok, except for (?{ ... }) */
	3896	(next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	3897	&& (minnext == 0) && (deltanext == 0)
	3898	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	3899	&& maxcount <= REG_INFTY/3) /* Complement check for big count */
	3900	{
	3901	ckWARNreg(RExC_parse,
	3902	"Quantifier unexpected on zero-length expression");
	3903	}
	3904
	3905	min += minnext * mincount;
	3906	is_inf_internal \|= ((maxcount == REG_INFTY
	3907	&& (minnext + deltanext) > 0)
	3908	\|\| deltanext == I32_MAX);
	3909	is_inf \|= is_inf_internal;
	3910	delta += (minnext + deltanext) * maxcount - minnext * mincount;
	3911
	3912	/* Try powerful optimization CURLYX => CURLYN. */
	3913	if ( OP(oscan) == CURLYX && data
	3914	&& data->flags & SF_IN_PAR
	3915	&& !(data->flags & SF_HAS_EVAL)
	3916	&& !deltanext && minnext == 1 ) {
	3917	/* Try to optimize to CURLYN. */
	3918	regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
	3919	regnode * const nxt1 = nxt;
	3920	#ifdef DEBUGGING
	3921	regnode *nxt2;
	3922	#endif
	3923
	3924	/* Skip open. */
	3925	nxt = regnext(nxt);
	3926	if (!REGNODE_SIMPLE(OP(nxt))
	3927	&& !(PL_regkind[OP(nxt)] == EXACT
	3928	&& STR_LEN(nxt) == 1))
	3929	goto nogo;
	3930	#ifdef DEBUGGING
	3931	nxt2 = nxt;
	3932	#endif
	3933	nxt = regnext(nxt);
	3934	if (OP(nxt) != CLOSE)
	3935	goto nogo;
	3936	if (RExC_open_parens) {
	3937	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3938	RExC_close_parens[ARG(nxt1)-1]=nxt+2; /close->while/
	3939	}
	3940	/* Now we know that nxt2 is the only contents: */
	3941	oscan->flags = (U8)ARG(nxt);
	3942	OP(oscan) = CURLYN;
	3943	OP(nxt1) = NOTHING; /* was OPEN. */
	3944
	3945	#ifdef DEBUGGING
	3946	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3947	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	3948	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	3949	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3950	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3951	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	3952	#endif
	3953	}
	3954	nogo:
	3955
	3956	/* Try optimization CURLYX => CURLYM. */
	3957	if ( OP(oscan) == CURLYX && data
	3958	&& !(data->flags & SF_HAS_PAR)
	3959	&& !(data->flags & SF_HAS_EVAL)
	3960	&& !deltanext /* atom is fixed width */
	3961	&& minnext != 0 /* CURLYM can't handle zero width */
	3962	&& ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
	3963	) {
	3964	/* XXXX How to optimize if data == 0? */
	3965	/* Optimize to a simpler form. */
	3966	regnode nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN */
	3967	regnode *nxt2;
	3968
	3969	OP(oscan) = CURLYM;
	3970	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	3971	&& (OP(nxt2) != WHILEM))
	3972	nxt = nxt2;
	3973	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	3974	/* Need to optimize away parenths. */
	3975	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	3976	/* Set the parenth number. */
	3977	regnode nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; / OPEN*/
	3978
	3979	oscan->flags = (U8)ARG(nxt);
	3980	if (RExC_open_parens) {
	3981	RExC_open_parens[ARG(nxt1)-1]=oscan; /open->CURLYM/
	3982	RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /close->NOTHING/
	3983	}
	3984	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	3985	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	3986
	3987	#ifdef DEBUGGING
	3988	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	3989	OP(nxt + 1) = OPTIMIZED; /* was count. */
	3990	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	3991	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	3992	#endif
	3993	#if 0
	3994	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	3995	regnode *nnxt = regnext(nxt1);
	3996	if (nnxt == nxt) {
	3997	if (reg_off_by_arg[OP(nxt1)])
	3998	ARG_SET(nxt1, nxt2 - nxt1);
	3999	else if (nxt2 - nxt1 < U16_MAX)
	4000	NEXT_OFF(nxt1) = nxt2 - nxt1;
	4001	else
	4002	OP(nxt) = NOTHING; /* Cannot beautify */
	4003	}
	4004	nxt1 = nnxt;
	4005	}
	4006	#endif
	4007	/* Optimize again: */
	4008	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	4009	NULL, stopparen, recursed, NULL, 0,depth+1);
	4010	}
	4011	else
	4012	oscan->flags = 0;
	4013	}
	4014	else if ((OP(oscan) == CURLYX)
	4015	&& (flags & SCF_WHILEM_VISITED_POS)
	4016	/* See the comment on a similar expression above.
	4017	However, this time it's not a subexpression
	4018	we care about, but the expression itself. */
	4019	&& (maxcount == REG_INFTY)
	4020	&& data && ++data->whilem_c < 16) {
	4021	/* This stays as CURLYX, we can put the count/of pair. */
	4022	/* Find WHILEM (as in regexec.c) */
	4023	regnode *nxt = oscan + NEXT_OFF(oscan);
	4024
	4025	if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
	4026	nxt += ARG(nxt);
	4027	PREVOPER(nxt)->flags = (U8)(data->whilem_c
	4028	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	4029	}
	4030	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	4031	pars++;
	4032	if (flags & SCF_DO_SUBSTR) {
	4033	SV *last_str = NULL;
	4034	int counted = mincount != 0;
	4035
	4036	if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
	4037	#if defined(SPARC64_GCC_WORKAROUND)
	4038	I32 b = 0;
	4039	STRLEN l = 0;
	4040	const char *s = NULL;
	4041	I32 old = 0;
	4042
	4043	if (pos_before >= data->last_start_min)
	4044	b = pos_before;
	4045	else
	4046	b = data->last_start_min;
	4047
	4048	l = 0;
	4049	s = SvPV_const(data->last_found, l);
	4050	old = b - data->last_start_min;
	4051
	4052	#else
	4053	I32 b = pos_before >= data->last_start_min
	4054	? pos_before : data->last_start_min;
	4055	STRLEN l;
	4056	const char * const s = SvPV_const(data->last_found, l);
	4057	I32 old = b - data->last_start_min;
	4058	#endif
	4059
	4060	if (UTF)
	4061	old = utf8_hop((U8)s, old) - (U8)s;
	4062	l -= old;
	4063	/* Get the added string: */
	4064	last_str = newSVpvn_utf8(s + old, l, UTF);
	4065	if (deltanext == 0 && pos_before == b) {
	4066	/* What was added is a constant string */
	4067	if (mincount > 1) {
	4068	SvGROW(last_str, (mincount * l) + 1);
	4069	repeatcpy(SvPVX(last_str) + l,
	4070	SvPVX_const(last_str), l, mincount - 1);
	4071	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	4072	/* Add additional parts. */
	4073	SvCUR_set(data->last_found,
	4074	SvCUR(data->last_found) - l);
	4075	sv_catsv(data->last_found, last_str);
	4076	{
	4077	SV * sv = data->last_found;
	4078	MAGIC *mg =
	4079	SvUTF8(sv) && SvMAGICAL(sv) ?
	4080	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4081	if (mg && mg->mg_len >= 0)
	4082	mg->mg_len += CHR_SVLEN(last_str) - l;
	4083	}
	4084	data->last_end += l * (mincount - 1);
	4085	}
	4086	} else {
	4087	/* start offset must point into the last copy */
	4088	data->last_start_min += minnext * (mincount - 1);
	4089	data->last_start_max += is_inf ? I32_MAX
	4090	: (maxcount - 1) * (minnext + data->pos_delta);
	4091	}
	4092	}
	4093	/* It is counted once already... */
	4094	data->pos_min += minnext * (mincount - counted);
	4095	data->pos_delta += - counted * deltanext +
	4096	(minnext + deltanext) * maxcount - minnext * mincount;
	4097	if (mincount != maxcount) {
	4098	/* Cannot extend fixed substrings found inside
	4099	the group. */
	4100	SCAN_COMMIT(pRExC_state,data,minlenp);
	4101	if (mincount && last_str) {
	4102	SV * const sv = data->last_found;
	4103	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	4104	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	4105
	4106	if (mg)
	4107	mg->mg_len = -1;
	4108	sv_setsv(sv, last_str);
	4109	data->last_end = data->pos_min;
	4110	data->last_start_min =
	4111	data->pos_min - CHR_SVLEN(last_str);
	4112	data->last_start_max = is_inf
	4113	? I32_MAX
	4114	: data->pos_min + data->pos_delta
	4115	- CHR_SVLEN(last_str);
	4116	}
	4117	data->longest = &(data->longest_float);
	4118	}
	4119	SvREFCNT_dec(last_str);
	4120	}
	4121	if (data && (fl & SF_HAS_EVAL))
	4122	data->flags \|= SF_HAS_EVAL;
	4123	optimize_curly_tail:
	4124	if (OP(oscan) != CURLYX) {
	4125	while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
	4126	&& NEXT_OFF(next))
	4127	NEXT_OFF(oscan) += NEXT_OFF(next);
	4128	}
	4129	continue;
	4130	default: /* REF, ANYOFV, and CLUMP only? */
	4131	if (flags & SCF_DO_SUBSTR) {
	4132	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4133	data->longest = &(data->longest_float);
	4134	}
	4135	is_inf = is_inf_internal = 1;
	4136	if (flags & SCF_DO_STCLASS_OR)
	4137	cl_anything(pRExC_state, data->start_class);
	4138	flags &= ~SCF_DO_STCLASS;
	4139	break;
	4140	}
	4141	}
	4142	else if (OP(scan) == LNBREAK) {
	4143	if (flags & SCF_DO_STCLASS) {
	4144	int value = 0;
	4145	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4146	if (flags & SCF_DO_STCLASS_AND) {
	4147	for (value = 0; value < 256; value++)
	4148	if (!is_VERTWS_cp(value))
	4149	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4150	}
	4151	else {
	4152	for (value = 0; value < 256; value++)
	4153	if (is_VERTWS_cp(value))
	4154	ANYOF_BITMAP_SET(data->start_class, value);
	4155	}
	4156	if (flags & SCF_DO_STCLASS_OR)
	4157	cl_and(data->start_class, and_withp);
	4158	flags &= ~SCF_DO_STCLASS;
	4159	}
	4160	min += 1;
	4161	delta += 1;
	4162	if (flags & SCF_DO_SUBSTR) {
	4163	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4164	data->pos_min += 1;
	4165	data->pos_delta += 1;
	4166	data->longest = &(data->longest_float);
	4167	}
	4168	}
	4169	else if (REGNODE_SIMPLE(OP(scan))) {
	4170	int value = 0;
	4171
	4172	if (flags & SCF_DO_SUBSTR) {
	4173	SCAN_COMMIT(pRExC_state,data,minlenp);
	4174	data->pos_min++;
	4175	}
	4176	min++;
	4177	if (flags & SCF_DO_STCLASS) {
	4178	data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
	4179
	4180	/* Some of the logic below assumes that switching
	4181	locale on will only add false positives. */
	4182	switch (PL_regkind[OP(scan)]) {
	4183	case SANY:
	4184	default:
	4185	do_default:
	4186	/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
	4187	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4188	cl_anything(pRExC_state, data->start_class);
	4189	break;
	4190	case REG_ANY:
	4191	if (OP(scan) == SANY)
	4192	goto do_default;
	4193	if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
	4194	value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
	4195	\|\| ANYOF_CLASS_TEST_ANY_SET(data->start_class));
	4196	cl_anything(pRExC_state, data->start_class);
	4197	}
	4198	if (flags & SCF_DO_STCLASS_AND \|\| !value)
	4199	ANYOF_BITMAP_CLEAR(data->start_class,'\n');
	4200	break;
	4201	case ANYOF:
	4202	if (flags & SCF_DO_STCLASS_AND)
	4203	cl_and(data->start_class,
	4204	(struct regnode_charclass_class*)scan);
	4205	else
	4206	cl_or(pRExC_state, data->start_class,
	4207	(struct regnode_charclass_class*)scan);
	4208	break;
	4209	case ALNUM:
	4210	if (flags & SCF_DO_STCLASS_AND) {
	4211	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4212	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
	4213	if (OP(scan) == ALNUMU) {
	4214	for (value = 0; value < 256; value++) {
	4215	if (!isWORDCHAR_L1(value)) {
	4216	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4217	}
	4218	}
	4219	} else {
	4220	for (value = 0; value < 256; value++) {
	4221	if (!isALNUM(value)) {
	4222	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4223	}
	4224	}
	4225	}
	4226	}
	4227	}
	4228	else {
	4229	if (data->start_class->flags & ANYOF_LOCALE)
	4230	ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
	4231
	4232	/* Even if under locale, set the bits for non-locale
	4233	* in case it isn't a true locale-node. This will
	4234	* create false positives if it truly is locale */
	4235	if (OP(scan) == ALNUMU) {
	4236	for (value = 0; value < 256; value++) {
	4237	if (isWORDCHAR_L1(value)) {
	4238	ANYOF_BITMAP_SET(data->start_class, value);
	4239	}
	4240	}
	4241	} else {
	4242	for (value = 0; value < 256; value++) {
	4243	if (isALNUM(value)) {
	4244	ANYOF_BITMAP_SET(data->start_class, value);
	4245	}
	4246	}
	4247	}
	4248	}
	4249	break;
	4250	case NALNUM:
	4251	if (flags & SCF_DO_STCLASS_AND) {
	4252	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4253	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
	4254	if (OP(scan) == NALNUMU) {
	4255	for (value = 0; value < 256; value++) {
	4256	if (isWORDCHAR_L1(value)) {
	4257	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4258	}
	4259	}
	4260	} else {
	4261	for (value = 0; value < 256; value++) {
	4262	if (isALNUM(value)) {
	4263	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4264	}
	4265	}
	4266	}
	4267	}
	4268	}
	4269	else {
	4270	if (data->start_class->flags & ANYOF_LOCALE)
	4271	ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
	4272
	4273	/* Even if under locale, set the bits for non-locale in
	4274	* case it isn't a true locale-node. This will create
	4275	* false positives if it truly is locale */
	4276	if (OP(scan) == NALNUMU) {
	4277	for (value = 0; value < 256; value++) {
	4278	if (! isWORDCHAR_L1(value)) {
	4279	ANYOF_BITMAP_SET(data->start_class, value);
	4280	}
	4281	}
	4282	} else {
	4283	for (value = 0; value < 256; value++) {
	4284	if (! isALNUM(value)) {
	4285	ANYOF_BITMAP_SET(data->start_class, value);
	4286	}
	4287	}
	4288	}
	4289	}
	4290	break;
	4291	case SPACE:
	4292	if (flags & SCF_DO_STCLASS_AND) {
	4293	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4294	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
	4295	if (OP(scan) == SPACEU) {
	4296	for (value = 0; value < 256; value++) {
	4297	if (!isSPACE_L1(value)) {
	4298	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4299	}
	4300	}
	4301	} else {
	4302	for (value = 0; value < 256; value++) {
	4303	if (!isSPACE(value)) {
	4304	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4305	}
	4306	}
	4307	}
	4308	}
	4309	}
	4310	else {
	4311	if (data->start_class->flags & ANYOF_LOCALE) {
	4312	ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
	4313	}
	4314	if (OP(scan) == SPACEU) {
	4315	for (value = 0; value < 256; value++) {
	4316	if (isSPACE_L1(value)) {
	4317	ANYOF_BITMAP_SET(data->start_class, value);
	4318	}
	4319	}
	4320	} else {
	4321	for (value = 0; value < 256; value++) {
	4322	if (isSPACE(value)) {
	4323	ANYOF_BITMAP_SET(data->start_class, value);
	4324	}
	4325	}
	4326	}
	4327	}
	4328	break;
	4329	case NSPACE:
	4330	if (flags & SCF_DO_STCLASS_AND) {
	4331	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4332	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
	4333	if (OP(scan) == NSPACEU) {
	4334	for (value = 0; value < 256; value++) {
	4335	if (isSPACE_L1(value)) {
	4336	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4337	}
	4338	}
	4339	} else {
	4340	for (value = 0; value < 256; value++) {
	4341	if (isSPACE(value)) {
	4342	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4343	}
	4344	}
	4345	}
	4346	}
	4347	}
	4348	else {
	4349	if (data->start_class->flags & ANYOF_LOCALE)
	4350	ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
	4351	if (OP(scan) == NSPACEU) {
	4352	for (value = 0; value < 256; value++) {
	4353	if (!isSPACE_L1(value)) {
	4354	ANYOF_BITMAP_SET(data->start_class, value);
	4355	}
	4356	}
	4357	}
	4358	else {
	4359	for (value = 0; value < 256; value++) {
	4360	if (!isSPACE(value)) {
	4361	ANYOF_BITMAP_SET(data->start_class, value);
	4362	}
	4363	}
	4364	}
	4365	}
	4366	break;
	4367	case DIGIT:
	4368	if (flags & SCF_DO_STCLASS_AND) {
	4369	if (!(data->start_class->flags & ANYOF_LOCALE)) {
	4370	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
	4371	for (value = 0; value < 256; value++)
	4372	if (!isDIGIT(value))
	4373	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4374	}
	4375	}
	4376	else {
	4377	if (data->start_class->flags & ANYOF_LOCALE)
	4378	ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
	4379	for (value = 0; value < 256; value++)
	4380	if (isDIGIT(value))
	4381	ANYOF_BITMAP_SET(data->start_class, value);
	4382	}
	4383	break;
	4384	case NDIGIT:
	4385	if (flags & SCF_DO_STCLASS_AND) {
	4386	if (!(data->start_class->flags & ANYOF_LOCALE))
	4387	ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
	4388	for (value = 0; value < 256; value++)
	4389	if (isDIGIT(value))
	4390	ANYOF_BITMAP_CLEAR(data->start_class, value);
	4391	}
	4392	else {
	4393	if (data->start_class->flags & ANYOF_LOCALE)
	4394	ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
	4395	for (value = 0; value < 256; value++)
	4396	if (!isDIGIT(value))
	4397	ANYOF_BITMAP_SET(data->start_class, value);
	4398	}
	4399	break;
	4400	CASE_SYNST_FNC(VERTWS);
	4401	CASE_SYNST_FNC(HORIZWS);
	4402
	4403	}
	4404	if (flags & SCF_DO_STCLASS_OR)
	4405	cl_and(data->start_class, and_withp);
	4406	flags &= ~SCF_DO_STCLASS;
	4407	}
	4408	}
	4409	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
	4410	data->flags \|= (OP(scan) == MEOL
	4411	? SF_BEFORE_MEOL
	4412	: SF_BEFORE_SEOL);
	4413	SCAN_COMMIT(pRExC_state, data, minlenp);
	4414
	4415	}
	4416	else if ( PL_regkind[OP(scan)] == BRANCHJ
	4417	/* Lookbehind, or need to calculate parens/evals/stclass: */
	4418	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	4419	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM)) {
	4420	if ( OP(scan) == UNLESSM &&
	4421	scan->flags == 0 &&
	4422	OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
	4423	OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
	4424	) {
	4425	regnode *opt;
	4426	regnode *upto= regnext(scan);
	4427	DEBUG_PARSE_r({
	4428	SV * const mysv_val=sv_newmortal();
	4429	DEBUG_STUDYDATA("OPFAIL",data,depth);
	4430
	4431	/DEBUG_PARSE_MSG("opfail");/
	4432	regprop(RExC_rx, mysv_val, upto);
	4433	PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
	4434	SvPV_nolen_const(mysv_val),
	4435	(IV)REG_NODE_NUM(upto),
	4436	(IV)(upto - scan)
	4437	);
	4438	});
	4439	OP(scan) = OPFAIL;
	4440	NEXT_OFF(scan) = upto - scan;
	4441	for (opt= scan + 1; opt < upto ; opt++)
	4442	OP(opt) = OPTIMIZED;
	4443	scan= upto;
	4444	continue;
	4445	}
	4446	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4447	\|\| OP(scan) == UNLESSM )
	4448	{
	4449	/* Negative Lookahead/lookbehind
	4450	In this case we can't do fixed string optimisation.
	4451	*/
	4452
	4453	I32 deltanext, minnext, fake = 0;
	4454	regnode *nscan;
	4455	struct regnode_charclass_class intrnl;
	4456	int f = 0;
	4457
	4458	data_fake.flags = 0;
	4459	if (data) {
	4460	data_fake.whilem_c = data->whilem_c;
	4461	data_fake.last_closep = data->last_closep;
	4462	}
	4463	else
	4464	data_fake.last_closep = &fake;
	4465	data_fake.pos_delta = delta;
	4466	if ( flags & SCF_DO_STCLASS && !scan->flags
	4467	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4468	cl_init(pRExC_state, &intrnl);
	4469	data_fake.start_class = &intrnl;
	4470	f \|= SCF_DO_STCLASS_AND;
	4471	}
	4472	if (flags & SCF_WHILEM_VISITED_POS)
	4473	f \|= SCF_WHILEM_VISITED_POS;
	4474	next = regnext(scan);
	4475	nscan = NEXTOPER(NEXTOPER(scan));
	4476	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	4477	last, &data_fake, stopparen, recursed, NULL, f, depth+1);
	4478	if (scan->flags) {
	4479	if (deltanext) {
	4480	FAIL("Variable length lookbehind not implemented");
	4481	}
	4482	else if (minnext > (I32)U8_MAX) {
	4483	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4484	}
	4485	scan->flags = (U8)minnext;
	4486	}
	4487	if (data) {
	4488	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4489	pars++;
	4490	if (data_fake.flags & SF_HAS_EVAL)
	4491	data->flags \|= SF_HAS_EVAL;
	4492	data->whilem_c = data_fake.whilem_c;
	4493	}
	4494	if (f & SCF_DO_STCLASS_AND) {
	4495	if (flags & SCF_DO_STCLASS_OR) {
	4496	/* OR before, AND after: ideally we would recurse with
	4497	* data_fake to get the AND applied by study of the
	4498	* remainder of the pattern, and then derecurse;
	4499	* * HACK * for now just treat as "no information".
	4500	* See [perl #56690].
	4501	*/
	4502	cl_init(pRExC_state, data->start_class);
	4503	} else {
	4504	/* AND before and after: combine and continue */
	4505	const int was = (data->start_class->flags & ANYOF_EOS);
	4506
	4507	cl_and(data->start_class, &intrnl);
	4508	if (was)
	4509	data->start_class->flags \|= ANYOF_EOS;
	4510	}
	4511	}
	4512	}
	4513	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	4514	else {
	4515	/* Positive Lookahead/lookbehind
	4516	In this case we can do fixed string optimisation,
	4517	but we must be careful about it. Note in the case of
	4518	lookbehind the positions will be offset by the minimum
	4519	length of the pattern, something we won't know about
	4520	until after the recurse.
	4521	*/
	4522	I32 deltanext, fake = 0;
	4523	regnode *nscan;
	4524	struct regnode_charclass_class intrnl;
	4525	int f = 0;
	4526	/* We use SAVEFREEPV so that when the full compile
	4527	is finished perl will clean up the allocated
	4528	minlens when it's all done. This way we don't
	4529	have to worry about freeing them when we know
	4530	they wont be used, which would be a pain.
	4531	*/
	4532	I32 *minnextp;
	4533	Newx( minnextp, 1, I32 );
	4534	SAVEFREEPV(minnextp);
	4535
	4536	if (data) {
	4537	StructCopy(data, &data_fake, scan_data_t);
	4538	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	4539	f \|= SCF_DO_SUBSTR;
	4540	if (scan->flags)
	4541	SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
	4542	data_fake.last_found=newSVsv(data->last_found);
	4543	}
	4544	}
	4545	else
	4546	data_fake.last_closep = &fake;
	4547	data_fake.flags = 0;
	4548	data_fake.pos_delta = delta;
	4549	if (is_inf)
	4550	data_fake.flags \|= SF_IS_INF;
	4551	if ( flags & SCF_DO_STCLASS && !scan->flags
	4552	&& OP(scan) == IFMATCH ) { /* Lookahead */
	4553	cl_init(pRExC_state, &intrnl);
	4554	data_fake.start_class = &intrnl;
	4555	f \|= SCF_DO_STCLASS_AND;
	4556	}
	4557	if (flags & SCF_WHILEM_VISITED_POS)
	4558	f \|= SCF_WHILEM_VISITED_POS;
	4559	next = regnext(scan);
	4560	nscan = NEXTOPER(NEXTOPER(scan));
	4561
	4562	*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
	4563	last, &data_fake, stopparen, recursed, NULL, f,depth+1);
	4564	if (scan->flags) {
	4565	if (deltanext) {
	4566	FAIL("Variable length lookbehind not implemented");
	4567	}
	4568	else if (*minnextp > (I32)U8_MAX) {
	4569	FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
	4570	}
	4571	scan->flags = (U8)*minnextp;
	4572	}
	4573
	4574	*minnextp += min;
	4575
	4576	if (f & SCF_DO_STCLASS_AND) {
	4577	const int was = (data->start_class->flags & ANYOF_EOS);
	4578
	4579	cl_and(data->start_class, &intrnl);
	4580	if (was)
	4581	data->start_class->flags \|= ANYOF_EOS;
	4582	}
	4583	if (data) {
	4584	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4585	pars++;
	4586	if (data_fake.flags & SF_HAS_EVAL)
	4587	data->flags \|= SF_HAS_EVAL;
	4588	data->whilem_c = data_fake.whilem_c;
	4589	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	4590	if (RExC_rx->minlen<*minnextp)
	4591	RExC_rx->minlen=*minnextp;
	4592	SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
	4593	SvREFCNT_dec(data_fake.last_found);
	4594
	4595	if ( data_fake.minlen_fixed != minlenp )
	4596	{
	4597	data->offset_fixed= data_fake.offset_fixed;
	4598	data->minlen_fixed= data_fake.minlen_fixed;
	4599	data->lookbehind_fixed+= scan->flags;
	4600	}
	4601	if ( data_fake.minlen_float != minlenp )
	4602	{
	4603	data->minlen_float= data_fake.minlen_float;
	4604	data->offset_float_min=data_fake.offset_float_min;
	4605	data->offset_float_max=data_fake.offset_float_max;
	4606	data->lookbehind_float+= scan->flags;
	4607	}
	4608	}
	4609	}
	4610	}
	4611	#endif
	4612	}
	4613	else if (OP(scan) == OPEN) {
	4614	if (stopparen != (I32)ARG(scan))
	4615	pars++;
	4616	}
	4617	else if (OP(scan) == CLOSE) {
	4618	if (stopparen == (I32)ARG(scan)) {
	4619	break;
	4620	}
	4621	if ((I32)ARG(scan) == is_par) {
	4622	next = regnext(scan);
	4623
	4624	if ( next && (OP(next) != WHILEM) && next < last)
	4625	is_par = 0; /* Disable optimization */
	4626	}
	4627	if (data)
	4628	*(data->last_closep) = ARG(scan);
	4629	}
	4630	else if (OP(scan) == EVAL) {
	4631	if (data)
	4632	data->flags \|= SF_HAS_EVAL;
	4633	}
	4634	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
	4635	if (flags & SCF_DO_SUBSTR) {
	4636	SCAN_COMMIT(pRExC_state,data,minlenp);
	4637	flags &= ~SCF_DO_SUBSTR;
	4638	}
	4639	if (data && OP(scan)==ACCEPT) {
	4640	data->flags \|= SCF_SEEN_ACCEPT;
	4641	if (stopmin > min)
	4642	stopmin = min;
	4643	}
	4644	}
	4645	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	4646	{
	4647	if (flags & SCF_DO_SUBSTR) {
	4648	SCAN_COMMIT(pRExC_state,data,minlenp);
	4649	data->longest = &(data->longest_float);
	4650	}
	4651	is_inf = is_inf_internal = 1;
	4652	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	4653	cl_anything(pRExC_state, data->start_class);
	4654	flags &= ~SCF_DO_STCLASS;
	4655	}
	4656	else if (OP(scan) == GPOS) {
	4657	if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
	4658	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	4659	{
	4660	if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
	4661	RExC_rx->extflags \|= RXf_ANCH_GPOS;
	4662	if (RExC_rx->gofs < (U32)min)
	4663	RExC_rx->gofs = min;
	4664	} else {
	4665	RExC_rx->extflags \|= RXf_GPOS_FLOAT;
	4666	RExC_rx->gofs = 0;
	4667	}
	4668	}
	4669	#ifdef TRIE_STUDY_OPT
	4670	#ifdef FULL_TRIE_STUDY
	4671	else if (PL_regkind[OP(scan)] == TRIE) {
	4672	/* NOTE - There is similar code to this block above for handling
	4673	BRANCH nodes on the initial study. If you change stuff here
	4674	check there too. */
	4675	regnode *trie_node= scan;
	4676	regnode *tail= regnext(scan);
	4677	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4678	I32 max1 = 0, min1 = I32_MAX;
	4679	struct regnode_charclass_class accum;
	4680
	4681	if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
	4682	SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
	4683	if (flags & SCF_DO_STCLASS)
	4684	cl_init_zero(pRExC_state, &accum);
	4685
	4686	if (!trie->jump) {
	4687	min1= trie->minlen;
	4688	max1= trie->maxlen;
	4689	} else {
	4690	const regnode *nextbranch= NULL;
	4691	U32 word;
	4692
	4693	for ( word=1 ; word <= trie->wordcount ; word++)
	4694	{
	4695	I32 deltanext=0, minnext=0, f = 0, fake;
	4696	struct regnode_charclass_class this_class;
	4697
	4698	data_fake.flags = 0;
	4699	if (data) {
	4700	data_fake.whilem_c = data->whilem_c;
	4701	data_fake.last_closep = data->last_closep;
	4702	}
	4703	else
	4704	data_fake.last_closep = &fake;
	4705	data_fake.pos_delta = delta;
	4706	if (flags & SCF_DO_STCLASS) {
	4707	cl_init(pRExC_state, &this_class);
	4708	data_fake.start_class = &this_class;
	4709	f = SCF_DO_STCLASS_AND;
	4710	}
	4711	if (flags & SCF_WHILEM_VISITED_POS)
	4712	f \|= SCF_WHILEM_VISITED_POS;
	4713
	4714	if (trie->jump[word]) {
	4715	if (!nextbranch)
	4716	nextbranch = trie_node + trie->jump[0];
	4717	scan= trie_node + trie->jump[word];
	4718	/* We go from the jump point to the branch that follows
	4719	it. Note this means we need the vestigal unused branches
	4720	even though they arent otherwise used.
	4721	*/
	4722	minnext = study_chunk(pRExC_state, &scan, minlenp,
	4723	&deltanext, (regnode *)nextbranch, &data_fake,
	4724	stopparen, recursed, NULL, f,depth+1);
	4725	}
	4726	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	4727	nextbranch= regnext((regnode*)nextbranch);
	4728
	4729	if (min1 > (I32)(minnext + trie->minlen))
	4730	min1 = minnext + trie->minlen;
	4731	if (max1 < (I32)(minnext + deltanext + trie->maxlen))
	4732	max1 = minnext + deltanext + trie->maxlen;
	4733	if (deltanext == I32_MAX)
	4734	is_inf = is_inf_internal = 1;
	4735
	4736	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	4737	pars++;
	4738	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	4739	if ( stopmin > min + min1)
	4740	stopmin = min + min1;
	4741	flags &= ~SCF_DO_SUBSTR;
	4742	if (data)
	4743	data->flags \|= SCF_SEEN_ACCEPT;
	4744	}
	4745	if (data) {
	4746	if (data_fake.flags & SF_HAS_EVAL)
	4747	data->flags \|= SF_HAS_EVAL;
	4748	data->whilem_c = data_fake.whilem_c;
	4749	}
	4750	if (flags & SCF_DO_STCLASS)
	4751	cl_or(pRExC_state, &accum, &this_class);
	4752	}
	4753	}
	4754	if (flags & SCF_DO_SUBSTR) {
	4755	data->pos_min += min1;
	4756	data->pos_delta += max1 - min1;
	4757	if (max1 != min1 \|\| is_inf)
	4758	data->longest = &(data->longest_float);
	4759	}
	4760	min += min1;
	4761	delta += max1 - min1;
	4762	if (flags & SCF_DO_STCLASS_OR) {
	4763	cl_or(pRExC_state, data->start_class, &accum);
	4764	if (min1) {
	4765	cl_and(data->start_class, and_withp);
	4766	flags &= ~SCF_DO_STCLASS;
	4767	}
	4768	}
	4769	else if (flags & SCF_DO_STCLASS_AND) {
	4770	if (min1) {
	4771	cl_and(data->start_class, &accum);
	4772	flags &= ~SCF_DO_STCLASS;
	4773	}
	4774	else {
	4775	/* Switch to OR mode: cache the old value of
	4776	* data->start_class */
	4777	INIT_AND_WITHP;
	4778	StructCopy(data->start_class, and_withp,
	4779	struct regnode_charclass_class);
	4780	flags &= ~SCF_DO_STCLASS_AND;
	4781	StructCopy(&accum, data->start_class,
	4782	struct regnode_charclass_class);
	4783	flags \|= SCF_DO_STCLASS_OR;
	4784	data->start_class->flags \|= ANYOF_EOS;
	4785	}
	4786	}
	4787	scan= tail;
	4788	continue;
	4789	}
	4790	#else
	4791	else if (PL_regkind[OP(scan)] == TRIE) {
	4792	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	4793	U8*bang=NULL;
	4794
	4795	min += trie->minlen;
	4796	delta += (trie->maxlen - trie->minlen);
	4797	flags &= ~SCF_DO_STCLASS; /* xxx */
	4798	if (flags & SCF_DO_SUBSTR) {
	4799	SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
	4800	data->pos_min += trie->minlen;
	4801	data->pos_delta += (trie->maxlen - trie->minlen);
	4802	if (trie->maxlen != trie->minlen)
	4803	data->longest = &(data->longest_float);
	4804	}
	4805	if (trie->jump) /* no more substrings -- for now /grr*/
	4806	flags &= ~SCF_DO_SUBSTR;
	4807	}
	4808	#endif /* old or new */
	4809	#endif /* TRIE_STUDY_OPT */
	4810
	4811	/* Else: zero-length, ignore. */
	4812	scan = regnext(scan);
	4813	}
	4814	if (frame) {
	4815	last = frame->last;
	4816	scan = frame->next;
	4817	stopparen = frame->stop;
	4818	frame = frame->prev;
	4819	goto fake_study_recurse;
	4820	}
	4821
	4822	finish:
	4823	assert(!frame);
	4824	DEBUG_STUDYDATA("pre-fin:",data,depth);
	4825
	4826	*scanp = scan;
	4827	*deltap = is_inf_internal ? I32_MAX : delta;
	4828	if (flags & SCF_DO_SUBSTR && is_inf)
	4829	data->pos_delta = I32_MAX - data->pos_min;
	4830	if (is_par > (I32)U8_MAX)
	4831	is_par = 0;
	4832	if (is_par && pars==1 && data) {
	4833	data->flags \|= SF_IN_PAR;
	4834	data->flags &= ~SF_HAS_PAR;
	4835	}
	4836	else if (pars && data) {
	4837	data->flags \|= SF_HAS_PAR;
	4838	data->flags &= ~SF_IN_PAR;
	4839	}
	4840	if (flags & SCF_DO_STCLASS_OR)
	4841	cl_and(data->start_class, and_withp);
	4842	if (flags & SCF_TRIE_RESTUDY)
	4843	data->flags \|= SCF_TRIE_RESTUDY;
	4844
	4845	DEBUG_STUDYDATA("post-fin:",data,depth);
	4846
	4847	return min < stopmin ? min : stopmin;
	4848	}
	4849
	4850	STATIC U32
	4851	S_add_data(RExC_state_t pRExC_state, U32 n, const char s)
	4852	{
	4853	U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
	4854
	4855	PERL_ARGS_ASSERT_ADD_DATA;
	4856
	4857	Renewc(RExC_rxi->data,
	4858	sizeof(RExC_rxi->data) + sizeof(void) * (count + n - 1),
	4859	char, struct reg_data);
	4860	if(count)
	4861	Renew(RExC_rxi->data->what, count + n, U8);
	4862	else
	4863	Newx(RExC_rxi->data->what, n, U8);
	4864	RExC_rxi->data->count = count + n;
	4865	Copy(s, RExC_rxi->data->what + count, n, U8);
	4866	return count;
	4867	}
	4868
	4869	/*XXX: todo make this not included in a non debugging perl */
	4870	#ifndef PERL_IN_XSUB_RE
		/*
		 * Perl_reginitcolors - initialise the six PL_colors[] entries.
		 *
		 * If the environment variable PERL_RE_COLORS is set, a private copy of
		 * its value is split in place at tab characters: PL_colors[0] points at
		 * the start of the copy and each following entry points just past the
		 * next tab.  Once no further tab is found, that entry and all remaining
		 * ones are set to the empty string.  If the variable is not set, all six
		 * entries are the empty string.  PL_colorset is set to 1 in either case.
		 *
		 * The copy made by savepv() is not freed here; PL_colors[] keeps
		 * pointing into it.
		 */
	4871	void
	4872	Perl_reginitcolors(pTHX)
	4873	{
	4874	dVAR;
	4875	const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
	4876	if (s) {
	4877	char *t = savepv(s);
	4878	int i = 0;
	4879	PL_colors[0] = t;
	4880	while (++i < 6) {
	4881	t = strchr(t, '\t');
	4882	if (t) {
		/* terminate the previous field, then step past the tab */
	4883	*t = '\0';
	4884	PL_colors[i] = ++t;
	4885	}
	4886	else
		/* t now points at "", so every later strchr() also fails */
	4887	PL_colors[i] = t = (char *)"";
	4888	}
	4889	} else {
	4890	int i = 0;
	4891	while (i < 6)
	4892	PL_colors[i++] = (char *)"";
	4893	}
	4894	PL_colorset = 1;
	4895	}
	4896	#endif
	4897
	4898
		/*
		 * CHECK_RESTUDY_GOTO - jump to the reStudy label when a restudy was
		 * requested.
		 *
		 * With TRIE_STUDY_OPT defined, the jump is taken if data.flags has
		 * SCF_TRIE_RESTUDY set and 'restudied' is still zero.  'restudied' is
		 * post-incremented only when the flag is set, so the jump can fire at
		 * most once provided the counter starts at zero (its initialisation is
		 * not visible here).  The expansion site must have 'data', 'restudied'
		 * and a 'reStudy' label in scope.
		 *
		 * Without TRIE_STUDY_OPT the macro expands to nothing.
		 */
	4899	#ifdef TRIE_STUDY_OPT
	4900	#define CHECK_RESTUDY_GOTO \
	4901	if ( \
	4902	(data.flags & SCF_TRIE_RESTUDY) \
	4903	&& ! restudied++ \
	4904	) goto reStudy
	4905	#else
	4906	#define CHECK_RESTUDY_GOTO
	4907	#endif
	4908
	4909	/*
	4910	* pregcomp - compile a regular expression into internal code
	4911	*
	4912	* Decides which engine's compiler to call based on the hint currently in
	4913	* scope
	4914	*/
	4915
	4916	#ifndef PERL_IN_XSUB_RE
	4917
	4918	/* return the currently in-scope regex engine (or the default if none) */
	4919
	4920	regexp_engine const *
	4921	Perl_current_re_engine(pTHX)
	4922	{
	4923	dVAR;
	4924
	4925	if (IN_PERL_COMPILETIME) {
	4926	HV * const table = GvHV(PL_hintgv);
	4927	SV **ptr;
	4928
	4929	if (!table)
	4930	return &PL_core_reg_engine;
	4931	ptr = hv_fetchs(table, "regcomp", FALSE);
	4932	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4933	return &PL_core_reg_engine;
	4934	return INT2PTR(regexp_engine,SvIV(ptr));
	4935	}
	4936	else {
	4937	SV *ptr;
	4938	if (!PL_curcop->cop_hints_hash)
	4939	return &PL_core_reg_engine;
	4940	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
	4941	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
	4942	return &PL_core_reg_engine;
	4943	return INT2PTR(regexp_engine*,SvIV(ptr));
	4944	}
	4945	}
	4946
	4947
		/*
		 * Perl_pregcomp - compile 'pattern' with whichever regex engine is
		 * currently in scope.
		 *
		 * pattern  the pattern SV
		 * flags    passed through unchanged to the engine
		 *
		 * The engine is obtained from current_re_engine() and the compile
		 * request is forwarded to it through CALLREGCOMP_ENG; the result of
		 * that call is returned.  Under DEBUG_COMPILE_r the engine's address
		 * is printed to Perl_debug_log first.
		 */
	4948	REGEXP *
	4949	Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
	4950	{
	4951	dVAR;
	4952	regexp_engine const *eng = current_re_engine();
	4953	GET_RE_DEBUG_FLAGS_DECL;
	4954
	4955	PERL_ARGS_ASSERT_PREGCOMP;
	4956
	4957	/* Dispatch a request to compile a regexp to correct regexp engine. */
	4958	DEBUG_COMPILE_r({
	4959	PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
	4960	PTR2UV(eng));
	4961	});
	4962	return CALLREGCOMP_ENG(eng, pattern, flags);
	4963	}
	4964	#endif
	4965
	4966	/* public(ish) entry point for the perl core's own regex compiling code.
	4967	* It's actually a wrapper for Perl_re_op_compile that only takes an SV
	4968	* pattern rather than a list of OPs, and uses the internal engine rather
	4969	* than the current one */
	4970
	4971	REGEXP *
	4972	Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
	4973	{
	4974	SV pat = pattern; / defeat constness! */
	4975	PERL_ARGS_ASSERT_RE_COMPILE;
	4976	return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
	4977	#ifdef PERL_IN_XSUB_RE
	4978	&my_reg_engine,
	4979	#else
	4980	&PL_core_reg_engine,
	4981	#endif
	4982	NULL, NULL, rx_flags, 0);
	4983	}
	4984
	4985	/* see if there are any run-time code blocks in the pattern.
	4986	* False positives are allowed */
	4987
	4988	static bool
	4989	S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
	4990	U32 pm_flags, char *pat, STRLEN plen)
	4991	{
	4992	int n = 0;
	4993	STRLEN s;
	4994
	4995	/* avoid infinitely recursing when we recompile the pattern parcelled up
	4996	* as qr'...'. A single constant qr// string can't have have any
	4997	* run-time component in it, and thus, no runtime code. (A non-qr
	4998	* string, however, can, e.g. $x =~ '(?{})') */
	4999	if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
	5000	return 0;
	5001
	5002	for (s = 0; s < plen; s++) {
	5003	if (n < pRExC_state->num_code_blocks
	5004	&& s == pRExC_state->code_blocks[n].start)
	5005	{
	5006	s = pRExC_state->code_blocks[n].end;
	5007	n++;
	5008	continue;
	5009	}
	5010	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
	5011	* positives here */
	5012	if (pat[s] == '(' && pat[s+1] == '?' &&
	5013	(pat[s+2] == '{' \|\| (pat[s+2] == '?' && pat[s+3] == '{'))
	5014	)
	5015	return 1;
	5016	}
	5017	return 0;
	5018	}
	5019
	5020	/* Handle run-time code blocks. We will already have compiled any direct
	5021	* or indirect literal code blocks. Now, take the pattern 'pat' and make a
	5022	* copy of it, but with any literal code blocks blanked out and
	5023	* appropriate chars escaped; then feed it into
	5024	*
	5025	* eval "qr'modified_pattern'"
	5026	*
	5027	* For example,
	5028	*
	5029	* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
	5030	*
	5031	* becomes
	5032	*
	5033	* qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
	5034	*
	5035	* After eval_sv()-ing that, grab any new code blocks from the returned qr
	5036	* and merge them with any code blocks of the original regexp.
	5037	*
	5038	* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
	5039	* instead, just save the qr and return FALSE; this tells our caller that
	5040	* the original pattern needs upgrading to utf8.
	5041	*/
	5042
	5043	static bool
	5044	S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
	5045	char *pat, STRLEN plen)
	5046	{
	5047	SV *qr;
	5048
	5049	GET_RE_DEBUG_FLAGS_DECL;
	5050
	5051	if (pRExC_state->runtime_code_qr) {
	5052	/* this is the second time we've been called; this should
	5053	* only happen if the main pattern got upgraded to utf8
	5054	* during compilation; re-use the qr we compiled first time
	5055	* round (which should be utf8 too)
	5056	*/
	5057	qr = pRExC_state->runtime_code_qr;
	5058	pRExC_state->runtime_code_qr = NULL;
	5059	assert(RExC_utf8 && SvUTF8(qr));
	5060	}
	5061	else {
	5062	int n = 0;
	5063	STRLEN s;
	5064	char p, newpat;
	5065	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
	5066	SV sv, qr_ref;
	5067	dSP;
	5068
	5069	/* determine how many extra chars we need for ' and \ escaping */
	5070	for (s = 0; s < plen; s++) {
	5071	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5072	newlen++;
	5073	}
	5074
	5075	Newx(newpat, newlen, char);
	5076	p = newpat;
	5077	p++ = 'q'; p++ = 'r'; *p++ = '\'';
	5078
	5079	for (s = 0; s < plen; s++) {
	5080	if (n < pRExC_state->num_code_blocks
	5081	&& s == pRExC_state->code_blocks[n].start)
	5082	{
	5083	/* blank out literal code block */
	5084	assert(pat[s] == '(');
	5085	while (s <= pRExC_state->code_blocks[n].end) {
	5086	*p++ = ' ';
	5087	s++;
	5088	}
	5089	s--;
	5090	n++;
	5091	continue;
	5092	}
	5093	if (pat[s] == '\'' \|\| pat[s] == '\\')
	5094	*p++ = '\\';
	5095	*p++ = pat[s];
	5096	}
	5097	*p++ = '\'';
	5098	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
	5099	*p++ = 'x';
	5100	*p++ = '\0';
	5101	DEBUG_COMPILE_r({
	5102	PerlIO_printf(Perl_debug_log,
	5103	"%sre-parsing pattern for runtime code:%s %s\n",
	5104	PL_colors[4],PL_colors[5],newpat);
	5105	});
	5106
	5107	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
	5108	Safefree(newpat);
	5109
	5110	ENTER;
	5111	SAVETMPS;
	5112	save_re_context();
	5113	PUSHSTACKi(PERLSI_REQUIRE);
	5114	/* this causes the toker to collapse \\ into \ when parsing
	5115	* qr''; normally only q'' does this. It also alters hints
	5116	* handling */
	5117	PL_reg_state.re_reparsing = TRUE;
	5118	eval_sv(sv, G_SCALAR);
	5119	SvREFCNT_dec(sv);
	5120	SPAGAIN;
	5121	qr_ref = POPs;
	5122	PUTBACK;
	5123	if (SvTRUE(ERRSV))
	5124	Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
	5125	assert(SvROK(qr_ref));
	5126	qr = SvRV(qr_ref);
	5127	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
	5128	/* the leaving below frees the tmp qr_ref.
	5129	* Give qr a life of its own */
	5130	SvREFCNT_inc(qr);
	5131	POPSTACK;
	5132	FREETMPS;
	5133	LEAVE;
	5134
	5135	}
	5136
	5137	if (!RExC_utf8 && SvUTF8(qr)) {
	5138	/* first time through; the pattern got upgraded; save the
	5139	* qr for the next time through */
	5140	assert(!pRExC_state->runtime_code_qr);
	5141	pRExC_state->runtime_code_qr = qr;
	5142	return 0;
	5143	}
	5144
	5145
	5146	/* extract any code blocks within the returned qr// */
	5147
	5148
	5149	/* merge the main (r1) and run-time (r2) code blocks into one */
	5150	{
	5151	RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
	5152	struct reg_code_block new_block, dst;
	5153	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
	5154	int i1 = 0, i2 = 0;
	5155
	5156	if (!r2->num_code_blocks) /* we guessed wrong */
	5157	return 1;
	5158
	5159	Newx(new_block,
	5160	r1->num_code_blocks + r2->num_code_blocks,
	5161	struct reg_code_block);
	5162	dst = new_block;
	5163
	5164	while ( i1 < r1->num_code_blocks
	5165	\|\| i2 < r2->num_code_blocks)
	5166	{
	5167	struct reg_code_block *src;
	5168	bool is_qr = 0;
	5169
	5170	if (i1 == r1->num_code_blocks) {
	5171	src = &r2->code_blocks[i2++];
	5172	is_qr = 1;
	5173	}
	5174	else if (i2 == r2->num_code_blocks)
	5175	src = &r1->code_blocks[i1++];
	5176	else if ( r1->code_blocks[i1].start
	5177	< r2->code_blocks[i2].start)
	5178	{
	5179	src = &r1->code_blocks[i1++];
	5180	assert(src->end < r2->code_blocks[i2].start);
	5181	}
	5182	else {
	5183	assert( r1->code_blocks[i1].start
	5184	> r2->code_blocks[i2].start);
	5185	src = &r2->code_blocks[i2++];
	5186	is_qr = 1;
	5187	assert(src->end < r1->code_blocks[i1].start);
	5188	}
	5189
	5190	assert(pat[src->start] == '(');
	5191	assert(pat[src->end] == ')');
	5192	dst->start = src->start;
	5193	dst->end = src->end;
	5194	dst->block = src->block;
	5195	dst->src_regex = is_qr ? (REGEXP) SvREFCNT_inc( (SV) qr)
	5196	: src->src_regex;
	5197	dst++;
	5198	}
	5199	r1->num_code_blocks += r2->num_code_blocks;
	5200	Safefree(r1->code_blocks);
	5201	r1->code_blocks = new_block;
	5202	}
	5203
	5204	SvREFCNT_dec(qr);
	5205	return 1;
	5206	}
	5207
	5208
	5209	STATIC bool
	5210	S_setup_longest(pTHX_ RExC_state_t pRExC_state, SV sv_longest, SV rx_utf8, SV rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
	5211	{
	5212	/* This is the common code for setting up the floating and fixed length
	5213	* string data extracted from Perlre_op_compile() below. Returns a boolean
	5214	* as to whether succeeded or not */
	5215
	5216	I32 t,ml;
	5217
	5218	if (! (longest_length
	5219	\|\| (eol /* Can't have SEOL and MULTI */
	5220	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)))
	5221	)
	5222	/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
	5223	\|\| (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
	5224	{
	5225	return FALSE;
	5226	}
	5227
	5228	/* copy the information about the longest from the reg_scan_data
	5229	over to the program. */
	5230	if (SvUTF8(sv_longest)) {
	5231	*rx_utf8 = sv_longest;
	5232	*rx_substr = NULL;
	5233	} else {
	5234	*rx_substr = sv_longest;
	5235	*rx_utf8 = NULL;
	5236	}
	5237	/* end_shift is how many chars that must be matched that
	5238	follow this item. We calculate it ahead of time as once the
	5239	lookbehind offset is added in we lose the ability to correctly
	5240	calculate it.*/
	5241	ml = minlen ? *(minlen) : (I32)longest_length;
	5242	*rx_end_shift = ml - offset
	5243	- longest_length + (SvTAIL(sv_longest) != 0)
	5244	+ lookbehind;
	5245
	5246	t = (eol/* Can't have SEOL and MULTI */
	5247	&& (! meol \|\| (RExC_flags & RXf_PMf_MULTILINE)));
	5248	fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
	5249
	5250	return TRUE;
	5251	}
	5252
	5253	/*
	5254	* Perl_re_op_compile - the perl internal RE engine's function to compile a
	5255	* regular expression into internal code.
	5256	* The pattern may be passed either as:
	5257	* a list of SVs (patternp plus pat_count)
	5258	* a list of OPs (expr)
	5259	* If both are passed, the SV list is used, but the OP list indicates
	5260	* which SVs are actually pre-compiled code blocks
	5261	*
	5262	* The SVs in the list have magic and qr overloading applied to them (and
	5263	* the list may be modified in-place with replacement SVs in the latter
	5264	* case).
	5265	*
	5266	* If the pattern hasn't changed from old_re, then old_re will be
	5267	* returned.
	5268	*
	5269	* eng is the current engine. If that engine has an op_comp method, then
	5270	* handle directly (i.e. we assume that op_comp was us); otherwise, just
	5271	* do the initial concatenation of arguments and pass on to the external
	5272	* engine.
	5273	*
	5274	* If is_bare_re is not null, set it to a boolean indicating whether the
	5275	* arg list reduced (after overloading) to a single bare regex which has
	5276	* been returned (i.e. /$qr/).
	5277	*
	5278	* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
	5279	*
	5280	* pm_flags contains the PMf_* flags, typically based on those from the
	5281	* pm_flags field of the related PMOP. Currently we're only interested in
	5282	* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
	5283	*
	5284	* We can't allocate space until we know how big the compiled form will be,
	5285	* but we can't compile it (and thus know how big it is) until we've got a
	5286	* place to put the code. So we cheat: we compile it twice, once with code
	5287	* generation turned off and size counting turned on, and once "for real".
	5288	* This also means that we don't allocate space until we are sure that the
	5289	* thing really will compile successfully, and we never have to move the
	5290	* code and thus invalidate pointers into it. (Note that it has to be in
	5291	* one piece because free() must be able to free it all.) [NB: not true in perl]
	5292	*
	5293	* Beware that the optimization-preparation code in here knows about some
	5294	* of the structure of the compiled regexp. [I'll say.]
	5295	*/
	5296
	5297	REGEXP *
	5298	Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
	5299	OP expr, const regexp_engine eng, REGEXP *VOL old_re,
	5300	bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
	5301	{
	5302	dVAR;
	5303	REGEXP *rx;
	5304	struct regexp *r;
	5305	regexp_internal *ri;
	5306	STRLEN plen;
	5307	char * VOL exp;
	5308	char* xend;
	5309	regnode *scan;
	5310	I32 flags;
	5311	I32 minlen = 0;
	5312	U32 rx_flags;
	5313	SV * VOL pat;
	5314
	5315	/* these are all flags - maybe they should be turned
	5316	* into a single int with different bit masks */
	5317	I32 sawlookahead = 0;
	5318	I32 sawplus = 0;
	5319	I32 sawopen = 0;
	5320	bool used_setjump = FALSE;
	5321	regex_charset initial_charset = get_regex_charset(orig_rx_flags);
	5322	bool code_is_utf8 = 0;
	5323	bool VOL recompile = 0;
	5324	bool runtime_code = 0;
	5325	U8 jump_ret = 0;
	5326	dJMPENV;
	5327	scan_data_t data;
	5328	RExC_state_t RExC_state;
	5329	RExC_state_t * const pRExC_state = &RExC_state;
	5330	#ifdef TRIE_STUDY_OPT
	5331	int restudied;
	5332	RExC_state_t copyRExC_state;
	5333	#endif
	5334	GET_RE_DEBUG_FLAGS_DECL;
	5335
	5336	PERL_ARGS_ASSERT_RE_OP_COMPILE;
	5337
	5338	DEBUG_r(if (!PL_colorset) reginitcolors());
	5339
	5340	#ifndef PERL_IN_XSUB_RE
	5341	/* Initialize these here instead of as-needed, as is quick and avoids
	5342	* having to test them each time otherwise */
	5343	if (! PL_AboveLatin1) {
	5344	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
	5345	PL_ASCII = _new_invlist_C_array(ASCII_invlist);
	5346	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
	5347
	5348	PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
	5349	PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
	5350
	5351	PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
	5352	PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
	5353
	5354	PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
	5355	PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
	5356
	5357	PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
	5358
	5359	PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
	5360	PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
	5361
	5362	PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
	5363
	5364	PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
	5365	PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
	5366
	5367	PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
	5368	PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
	5369
	5370	PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
	5371	PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
	5372
	5373	PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
	5374	PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
	5375
	5376	PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
	5377	PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
	5378
	5379	PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
	5380	PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
	5381
	5382	PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
	5383	PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
	5384
	5385	PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
	5386
	5387	PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
	5388	PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
	5389
	5390	PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
	5391	PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
	5392	}
	5393	#endif
	5394
	5395	pRExC_state->code_blocks = NULL;
	5396	pRExC_state->num_code_blocks = 0;
	5397
	5398	if (is_bare_re)
	5399	*is_bare_re = FALSE;
	5400
	5401	if (expr && (expr->op_type == OP_LIST \|\|
	5402	(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
	5403
	5404	/* is the source UTF8, and how many code blocks are there? */
	5405	OP *o;
	5406	int ncode = 0;
	5407
	5408	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5409	if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
	5410	code_is_utf8 = 1;
	5411	else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
	5412	/* count of DO blocks */
	5413	ncode++;
	5414	}
	5415	if (ncode) {
	5416	pRExC_state->num_code_blocks = ncode;
	5417	Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
	5418	}
	5419	}
	5420
	5421	if (pat_count) {
	5422	/* handle a list of SVs */
	5423
	5424	SV **svp;
	5425
	5426	/* apply magic and RE overloading to each arg */
	5427	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5428	SV rx = svp;
	5429	SvGETMAGIC(rx);
	5430	if (SvROK(rx) && SvAMAGIC(rx)) {
	5431	SV *sv = AMG_CALLunary(rx, regexp_amg);
	5432	if (sv) {
	5433	if (SvROK(sv))
	5434	sv = SvRV(sv);
	5435	if (SvTYPE(sv) != SVt_REGEXP)
	5436	Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
	5437	*svp = sv;
	5438	}
	5439	}
	5440	}
	5441
	5442	if (pat_count > 1) {
	5443	/* concat multiple args and find any code block indexes */
	5444
	5445	OP *o = NULL;
	5446	int n = 0;
	5447	bool utf8 = 0;
	5448	STRLEN orig_patlen = 0;
	5449
	5450	if (pRExC_state->num_code_blocks) {
	5451	o = cLISTOPx(expr)->op_first;
	5452	assert(o->op_type == OP_PUSHMARK);
	5453	o = o->op_sibling;
	5454	}
	5455
	5456	pat = newSVpvn("", 0);
	5457	SAVEFREESV(pat);
	5458
	5459	/* determine if the pattern is going to be utf8 (needed
	5460	* in advance to align code block indices correctly).
	5461	* XXX This could fail to be detected for an arg with
	5462	* overloading but not concat overloading; but the main effect
	5463	* in this obscure case is to need a 'use re eval' for a
	5464	* literal code block */
	5465	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5466	if (SvUTF8(*svp))
	5467	utf8 = 1;
	5468	}
	5469	if (utf8)
	5470	SvUTF8_on(pat);
	5471
	5472	for (svp = patternp; svp < patternp + pat_count; svp++) {
	5473	SV sv, msv = *svp;
	5474	SV *rx;
	5475	bool code = 0;
	5476	if (o) {
	5477	if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
	5478	assert(n < pRExC_state->num_code_blocks);
	5479	pRExC_state->code_blocks[n].start = SvCUR(pat);
	5480	pRExC_state->code_blocks[n].block = o;
	5481	pRExC_state->code_blocks[n].src_regex = NULL;
	5482	n++;
	5483	code = 1;
	5484	o = o->op_sibling; /* skip CONST */
	5485	assert(o);
	5486	}
	5487	o = o->op_sibling;;
	5488	}
	5489
	5490	if ((SvAMAGIC(pat) \|\| SvAMAGIC(msv)) &&
	5491	(sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
	5492	{
	5493	sv_setsv(pat, sv);
	5494	/* overloading involved: all bets are off over literal
	5495	* code. Pretend we haven't seen it */
	5496	pRExC_state->num_code_blocks -= n;
	5497	n = 0;
	5498	rx = NULL;
	5499
	5500	}
	5501	else {
	5502	while (SvAMAGIC(msv)
	5503	&& (sv = AMG_CALLunary(msv, string_amg))
	5504	&& sv != msv
	5505	&& !( SvROK(msv)
	5506	&& SvROK(sv)
	5507	&& SvRV(msv) == SvRV(sv))
	5508	) {
	5509	msv = sv;
	5510	SvGETMAGIC(msv);
	5511	}
	5512	if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
	5513	msv = SvRV(msv);
	5514	orig_patlen = SvCUR(pat);
	5515	sv_catsv_nomg(pat, msv);
	5516	rx = msv;
	5517	if (code)
	5518	pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
	5519	}
	5520
	5521	/* extract any code blocks within any embedded qr//'s */
	5522	if (rx && SvTYPE(rx) == SVt_REGEXP
	5523	&& RX_ENGINE((REGEXP*)rx)->op_comp)
	5524	{
	5525
	5526	RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
	5527	if (ri->num_code_blocks) {
	5528	int i;
	5529	/* the presence of an embedded qr// with code means
	5530	* we should always recompile: the text of the
	5531	* qr// may not have changed, but it may be a
	5532	* different closure than last time */
	5533	recompile = 1;
	5534	Renew(pRExC_state->code_blocks,
	5535	pRExC_state->num_code_blocks + ri->num_code_blocks,
	5536	struct reg_code_block);
	5537	pRExC_state->num_code_blocks += ri->num_code_blocks;
	5538	for (i=0; i < ri->num_code_blocks; i++) {
	5539	struct reg_code_block src, dst;
	5540	STRLEN offset = orig_patlen
	5541	+ ((struct regexp *)SvANY(rx))->pre_prefix;
	5542	assert(n < pRExC_state->num_code_blocks);
	5543	src = &ri->code_blocks[i];
	5544	dst = &pRExC_state->code_blocks[n];
	5545	dst->start = src->start + offset;
	5546	dst->end = src->end + offset;
	5547	dst->block = src->block;
	5548	dst->src_regex = (REGEXP) SvREFCNT_inc( (SV)
	5549	src->src_regex
	5550	? src->src_regex
	5551	: (REGEXP*)rx);
	5552	n++;
	5553	}
	5554	}
	5555	}
	5556	}
	5557	SvSETMAGIC(pat);
	5558	}
	5559	else {
	5560	SV *sv;
	5561	pat = *patternp;
	5562	while (SvAMAGIC(pat)
	5563	&& (sv = AMG_CALLunary(pat, string_amg))
	5564	&& sv != pat)
	5565	{
	5566	pat = sv;
	5567	SvGETMAGIC(pat);
	5568	}
	5569	}
	5570
	5571	/* handle bare regex: foo =~ $re */
	5572	{
	5573	SV *re = pat;
	5574	if (SvROK(re))
	5575	re = SvRV(re);
	5576	if (SvTYPE(re) == SVt_REGEXP) {
	5577	if (is_bare_re)
	5578	*is_bare_re = TRUE;
	5579	SvREFCNT_inc(re);
	5580	Safefree(pRExC_state->code_blocks);
	5581	return (REGEXP*)re;
	5582	}
	5583	}
	5584	}
	5585	else {
	5586	/* not a list of SVs, so must be a list of OPs */
	5587	assert(expr);
	5588	if (expr->op_type == OP_LIST) {
	5589	int i = -1;
	5590	bool is_code = 0;
	5591	OP *o;
	5592
	5593	pat = newSVpvn("", 0);
	5594	SAVEFREESV(pat);
	5595	if (code_is_utf8)
	5596	SvUTF8_on(pat);
	5597
	5598	/* given a list of CONSTs and DO blocks in expr, append all
	5599	* the CONSTs to pat, and record the start and end of each
	5600	* code block in code_blocks[] (each DO{} op is followed by an
	5601	* OP_CONST containing the corresponding literal '(?{...})
	5602	* text)
	5603	*/
	5604	for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
	5605	if (o->op_type == OP_CONST) {
	5606	sv_catsv(pat, cSVOPo_sv);
	5607	if (is_code) {
	5608	pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
	5609	is_code = 0;
	5610	}
	5611	}
	5612	else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
	5613	assert(i+1 < pRExC_state->num_code_blocks);
	5614	pRExC_state->code_blocks[++i].start = SvCUR(pat);
	5615	pRExC_state->code_blocks[i].block = o;
	5616	pRExC_state->code_blocks[i].src_regex = NULL;
	5617	is_code = 1;
	5618	}
	5619	}
	5620	}
	5621	else {
	5622	assert(expr->op_type == OP_CONST);
	5623	pat = cSVOPx_sv(expr);
	5624	}
	5625	}
	5626
	5627	exp = SvPV_nomg(pat, plen);
	5628
	5629	if (!eng->op_comp) {
	5630	if ((SvUTF8(pat) && IN_BYTES)
	5631	\|\| SvGMAGICAL(pat) \|\| SvAMAGIC(pat))
	5632	{
	5633	/* make a temporary copy; either to convert to bytes,
	5634	* or to avoid repeating get-magic / overloaded stringify */
	5635	pat = newSVpvn_flags(exp, plen, SVs_TEMP \|
	5636	(IN_BYTES ? 0 : SvUTF8(pat)));
	5637	}
	5638	Safefree(pRExC_state->code_blocks);
	5639	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
	5640	}
	5641
	5642	/* ignore the utf8ness if the pattern is 0 length */
	5643	RExC_utf8 = RExC_orig_utf8 = (plen == 0 \|\| IN_BYTES) ? 0 : SvUTF8(pat);
	5644	RExC_uni_semantics = 0;
	5645	RExC_contains_locale = 0;
	5646	pRExC_state->runtime_code_qr = NULL;
	5647
	5648	/**************** LONG JUMP TARGET HERE*********************/
	5649	/* Longjmp back to here if have to switch in midstream to utf8 */
	5650	if (! RExC_orig_utf8) {
	5651	JMPENV_PUSH(jump_ret);
	5652	used_setjump = TRUE;
	5653	}
	5654
	5655	if (jump_ret == 0) { /* First time through */
	5656	xend = exp + plen;
	5657
	5658	DEBUG_COMPILE_r({
	5659	SV *dsv= sv_newmortal();
	5660	RE_PV_QUOTED_DECL(s, RExC_utf8,
	5661	dsv, exp, plen, 60);
	5662	PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
	5663	PL_colors[4],PL_colors[5],s);
	5664	});
	5665	}
	5666	else { /* longjumped back */
	5667	U8 src, dst;
	5668	int n=0;
	5669	STRLEN s = 0, d = 0;
	5670	bool do_end = 0;
	5671
	5672	/* If the cause for the longjmp was other than changing to utf8, pop
	5673	* our own setjmp, and longjmp to the correct handler */
	5674	if (jump_ret != UTF8_LONGJMP) {
	5675	JMPENV_POP;
	5676	JMPENV_JUMP(jump_ret);
	5677	}
	5678
	5679	GET_RE_DEBUG_FLAGS;
	5680
	5681	/* It's possible to write a regexp in ascii that represents Unicode
	5682	codepoints outside of the byte range, such as via \x{100}. If we
	5683	detect such a sequence we have to convert the entire pattern to utf8
	5684	and then recompile, as our sizing calculation will have been based
	5685	on 1 byte == 1 character, but we will need to use utf8 to encode
	5686	at least some part of the pattern, and therefore must convert the whole
	5687	thing.
	5688	-- dmq */
	5689	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
	5690	"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
	5691
	5692	/* upgrade pattern to UTF8, and if there are code blocks,
	5693	* recalculate the indices.
	5694	* This is essentially an unrolled Perl_bytes_to_utf8() */
	5695
	5696	src = (U8*)SvPV_nomg(pat, plen);
	5697	Newx(dst, plen * 2 + 1, U8);
	5698
	5699	while (s < plen) {
	5700	const UV uv = NATIVE_TO_ASCII(src[s]);
	5701	if (UNI_IS_INVARIANT(uv))
	5702	dst[d] = (U8)UTF_TO_NATIVE(uv);
	5703	else {
	5704	dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
	5705	dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
	5706	}
	5707	if (n < pRExC_state->num_code_blocks) {
	5708	if (!do_end && pRExC_state->code_blocks[n].start == s) {
	5709	pRExC_state->code_blocks[n].start = d;
	5710	assert(dst[d] == '(');
	5711	do_end = 1;
	5712	}
	5713	else if (do_end && pRExC_state->code_blocks[n].end == s) {
	5714	pRExC_state->code_blocks[n].end = d;
	5715	assert(dst[d] == ')');
	5716	do_end = 0;
	5717	n++;
	5718	}
	5719	}
	5720	s++;
	5721	d++;
	5722	}
	5723	dst[d] = '\0';
	5724	plen = d;
	5725	exp = (char*) dst;
	5726	xend = exp + plen;
	5727	SAVEFREEPV(exp);
	5728	RExC_orig_utf8 = RExC_utf8 = 1;
	5729	}
	5730
	5731	/* return old regex if pattern hasn't changed */
	5732
	5733	if ( old_re
	5734	&& !recompile
	5735	&& !!RX_UTF8(old_re) == !!RExC_utf8
	5736	&& RX_PRECOMP(old_re)
	5737	&& RX_PRELEN(old_re) == plen
	5738	&& memEQ(RX_PRECOMP(old_re), exp, plen))
	5739	{
	5740	/* with runtime code, always recompile */
	5741	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
	5742	exp, plen);
	5743	if (!runtime_code) {
	5744	if (used_setjump) {
	5745	JMPENV_POP;
	5746	}
	5747	Safefree(pRExC_state->code_blocks);
	5748	return old_re;
	5749	}
	5750	}
	5751	else if ((pm_flags & PMf_USE_RE_EVAL)
	5752	/* this second condition covers the non-regex literal case,
	5753	* i.e. $foo =~ '(?{})'. */
	5754	\|\| ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
	5755	&& (PL_hints & HINT_RE_EVAL))
	5756	)
	5757	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
	5758	exp, plen);
	5759
	5760	#ifdef TRIE_STUDY_OPT
	5761	restudied = 0;
	5762	#endif
	5763
	5764	rx_flags = orig_rx_flags;
	5765
	5766	if (initial_charset == REGEX_LOCALE_CHARSET) {
	5767	RExC_contains_locale = 1;
	5768	}
	5769	else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
	5770
	5771	/* Set to use unicode semantics if the pattern is in utf8 and has the
	5772	* 'depends' charset specified, as it means unicode when utf8 */
	5773	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5774	}
	5775
	5776	RExC_precomp = exp;
	5777	RExC_flags = rx_flags;
	5778	RExC_pm_flags = pm_flags;
	5779
	5780	if (runtime_code) {
	5781	if (PL_tainting && PL_tainted)
	5782	Perl_croak(aTHX_ "Eval-group in insecure regular expression");
	5783
	5784	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
	5785	/* whoops, we have a non-utf8 pattern, whilst run-time code
	5786	* got compiled as utf8. Try again with a utf8 pattern */
	5787	JMPENV_JUMP(UTF8_LONGJMP);
	5788	}
	5789	}
	5790	assert(!pRExC_state->runtime_code_qr);
	5791
	5792	RExC_sawback = 0;
	5793
	5794	RExC_seen = 0;
	5795	RExC_in_lookbehind = 0;
	5796	RExC_seen_zerolen = *exp == '^' ? -1 : 0;
	5797	RExC_extralen = 0;
	5798	RExC_override_recoding = 0;
	5799
	5800	/* First pass: determine size, legality. */
	5801	RExC_parse = exp;
	5802	RExC_start = exp;
	5803	RExC_end = xend;
	5804	RExC_naughty = 0;
	5805	RExC_npar = 1;
	5806	RExC_nestroot = 0;
	5807	RExC_size = 0L;
	5808	RExC_emit = &PL_regdummy;
	5809	RExC_whilem_seen = 0;
	5810	RExC_open_parens = NULL;
	5811	RExC_close_parens = NULL;
	5812	RExC_opend = NULL;
	5813	RExC_paren_names = NULL;
	5814	#ifdef DEBUGGING
	5815	RExC_paren_name_list = NULL;
	5816	#endif
	5817	RExC_recurse = NULL;
	5818	RExC_recurse_count = 0;
	5819	pRExC_state->code_index = 0;
	5820
	5821	#if 0 /* REGC() is (currently) a NOP at the first pass.
	5822	* Clever compilers notice this and complain. --jhi */
	5823	REGC((U8)REG_MAGIC, (char*)RExC_emit);
	5824	#endif
	5825	DEBUG_PARSE_r(
	5826	PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
	5827	RExC_lastnum=0;
	5828	RExC_lastparse=NULL;
	5829	);
	5830	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5831	RExC_precomp = NULL;
	5832	Safefree(pRExC_state->code_blocks);
	5833	return(NULL);
	5834	}
	5835
	5836	/* Here, finished first pass. Get rid of any added setjmp */
	5837	if (used_setjump) {
	5838	JMPENV_POP;
	5839	}
	5840
	5841	DEBUG_PARSE_r({
	5842	PerlIO_printf(Perl_debug_log,
	5843	"Required size %"IVdf" nodes\n"
	5844	"Starting second pass (creation)\n",
	5845	(IV)RExC_size);
	5846	RExC_lastnum=0;
	5847	RExC_lastparse=NULL;
	5848	});
	5849
	5850	/* The first pass could have found things that force Unicode semantics */
	5851	if ((RExC_utf8 \|\| RExC_uni_semantics)
	5852	&& get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
	5853	{
	5854	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
	5855	}
	5856
	5857	/* Small enough for pointer-storage convention?
	5858	If extralen==0, this means that we will not need long jumps. */
	5859	if (RExC_size >= 0x10000L && RExC_extralen)
	5860	RExC_size += RExC_extralen;
	5861	else
	5862	RExC_extralen = 0;
	5863	if (RExC_whilem_seen > 15)
	5864	RExC_whilem_seen = 15;
	5865
	5866	/* Allocate space and zero-initialize. Note, the two step process
	5867	of zeroing when in debug mode, thus anything assigned has to
	5868	happen after that */
	5869	rx = (REGEXP*) newSV_type(SVt_REGEXP);
	5870	r = (struct regexp*)SvANY(rx);
	5871	Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
	5872	char, regexp_internal);
	5873	if ( r == NULL \|\| ri == NULL )
	5874	FAIL("Regexp out of space");
	5875	#ifdef DEBUGGING
	5876	/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
	5877	Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
	5878	#else
	5879	/* bulk initialize base fields with 0. */
	5880	Zero(ri, sizeof(regexp_internal), char);
	5881	#endif
	5882
	5883	/* non-zero initialization begins here */
	5884	RXi_SET( r, ri );
	5885	r->engine= eng;
	5886	r->extflags = rx_flags;
	5887	if (pm_flags & PMf_IS_QR) {
	5888	ri->code_blocks = pRExC_state->code_blocks;
	5889	ri->num_code_blocks = pRExC_state->num_code_blocks;
	5890	}
	5891	else
	5892	SAVEFREEPV(pRExC_state->code_blocks);
	5893
	5894	{
	5895	bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
	5896	bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
	5897
	5898	/* The caret is output if there are any defaults: if not all the STD
	5899	* flags are set, or if no character set specifier is needed */
	5900	bool has_default =
	5901	(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
	5902	\|\| ! has_charset);
	5903	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
	5904	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
	5905	>> RXf_PMf_STD_PMMOD_SHIFT);
	5906	const char fptr = STD_PAT_MODS; /"msix"*/
	5907	char *p;
	5908	/* Allocate for the worst case, which is all the std flags are turned
	5909	* on. If more precision is desired, we could do a population count of
	5910	* the flags set. This could be done with a small lookup table, or by
	5911	* shifting, masking and adding, or even, when available, assembly
	5912	* language for a machine-language population count.
	5913	* We never output a minus, as all those are defaults, so are
	5914	* covered by the caret */
	5915	const STRLEN wraplen = plen + has_p + has_runon
	5916	+ has_default /* If needs a caret */
	5917
	5918	/* If needs a character set specifier */
	5919	+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
	5920	+ (sizeof(STD_PAT_MODS) - 1)
	5921	+ (sizeof("(?:)") - 1);
	5922
	5923	p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
	5924	SvPOK_on(rx);
	5925	if (RExC_utf8)
	5926	SvFLAGS(rx) \|= SVf_UTF8;
	5927	p++='('; p++='?';
	5928
	5929	/* If a default, cover it using the caret */
	5930	if (has_default) {
	5931	*p++= DEFAULT_PAT_MOD;
	5932	}
	5933	if (has_charset) {
	5934	STRLEN len;
	5935	const char* const name = get_regex_charset_name(r->extflags, &len);
	5936	Copy(name, p, len, char);
	5937	p += len;
	5938	}
	5939	if (has_p)
	5940	p++ = KEEPCOPY_PAT_MOD; /'p'*/
	5941	{
	5942	char ch;
	5943	while((ch = *fptr++)) {
	5944	if(reganch & 1)
	5945	*p++ = ch;
	5946	reganch >>= 1;
	5947	}
	5948	}
	5949
	5950	*p++ = ':';
	5951	Copy(RExC_precomp, p, plen, char);
	5952	assert ((RX_WRAPPED(rx) - p) < 16);
	5953	r->pre_prefix = p - RX_WRAPPED(rx);
	5954	p += plen;
	5955	if (has_runon)
	5956	*p++ = '\n';
	5957	*p++ = ')';
	5958	*p = 0;
	5959	SvCUR_set(rx, p - SvPVX_const(rx));
	5960	}
	5961
	5962	r->intflags = 0;
	5963	r->nparens = RExC_npar - 1; /* set early to validate backrefs */
	5964
	5965	if (RExC_seen & REG_SEEN_RECURSE) {
	5966	Newxz(RExC_open_parens, RExC_npar,regnode *);
	5967	SAVEFREEPV(RExC_open_parens);
	5968	Newxz(RExC_close_parens,RExC_npar,regnode *);
	5969	SAVEFREEPV(RExC_close_parens);
	5970	}
	5971
	5972	/* Useful during FAIL. */
	5973	#ifdef RE_TRACK_PATTERN_OFFSETS
	5974	Newxz(ri->u.offsets, 2RExC_size+1, U32); / MJD 20001228 */
	5975	DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
	5976	"%s %"UVuf" bytes for offset annotations.\n",
	5977	ri->u.offsets ? "Got" : "Couldn't get",
	5978	(UV)((2RExC_size+1) sizeof(U32))));
	5979	#endif
	5980	SetProgLen(ri,RExC_size);
	5981	RExC_rx_sv = rx;
	5982	RExC_rx = r;
	5983	RExC_rxi = ri;
	5984
	5985	/* Second pass: emit code. */
	5986	RExC_flags = rx_flags; /* don't let top level (?i) bleed */
	5987	RExC_pm_flags = pm_flags;
	5988	RExC_parse = exp;
	5989	RExC_end = xend;
	5990	RExC_naughty = 0;
	5991	RExC_npar = 1;
	5992	RExC_emit_start = ri->program;
	5993	RExC_emit = ri->program;
	5994	RExC_emit_bound = ri->program + RExC_size + 1;
	5995	pRExC_state->code_index = 0;
	5996
	5997	REGC((U8)REG_MAGIC, (char*) RExC_emit++);
	5998	if (reg(pRExC_state, 0, &flags,1) == NULL) {
	5999	ReREFCNT_dec(rx);
	6000	return(NULL);
	6001	}
	6002	/* XXXX To minimize changes to RE engine we always allocate
	6003	3-units-long substrs field. */
	6004	Newx(r->substrs, 1, struct reg_substr_data);
	6005	if (RExC_recurse_count) {
	6006	Newxz(RExC_recurse,RExC_recurse_count,regnode *);
	6007	SAVEFREEPV(RExC_recurse);
	6008	}
	6009
	6010	reStudy:
	6011	r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
	6012	Zero(r->substrs, 1, struct reg_substr_data);
	6013
	6014	#ifdef TRIE_STUDY_OPT
	6015	if (!restudied) {
	6016	StructCopy(&zero_scan_data, &data, scan_data_t);
	6017	copyRExC_state = RExC_state;
	6018	} else {
	6019	U32 seen=RExC_seen;
	6020	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
	6021
	6022	RExC_state = copyRExC_state;
	6023	if (seen & REG_TOP_LEVEL_BRANCHES)
	6024	RExC_seen \|= REG_TOP_LEVEL_BRANCHES;
	6025	else
	6026	RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
	6027	if (data.last_found) {
	6028	SvREFCNT_dec(data.longest_fixed);
	6029	SvREFCNT_dec(data.longest_float);
	6030	SvREFCNT_dec(data.last_found);
	6031	}
	6032	StructCopy(&zero_scan_data, &data, scan_data_t);
	6033	}
	6034	#else
	6035	StructCopy(&zero_scan_data, &data, scan_data_t);
	6036	#endif
	6037
	6038	/* Dig out information for optimizations. */
	6039	r->extflags = RExC_flags; /* was pm_op */
	6040	/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
	6041
	6042	if (UTF)
	6043	SvUTF8_on(rx); /* Unicode in it? */
	6044	ri->regstclass = NULL;
	6045	if (RExC_naughty >= 10) /* Probably an expensive pattern. */
	6046	r->intflags \|= PREGf_NAUGHTY;
	6047	scan = ri->program + 1; /* First BRANCH. */
	6048
	6049	/* testing for BRANCH here tells us whether there is "must appear"
	6050	data in the pattern. If there is then we can use it for optimisations */
	6051	if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
	6052	I32 fake;
	6053	STRLEN longest_float_length, longest_fixed_length;
	6054	struct regnode_charclass_class ch_class; /* pointed to by data */
	6055	int stclass_flag;
	6056	I32 last_close = 0; /* pointed to by data */
	6057	regnode *first= scan;
	6058	regnode *first_next= regnext(first);
	6059	/*
	6060	* Skip introductions and multiplicators >= 1
	6061	* so that we can extract the 'meat' of the pattern that must
	6062	* match in the large if() sequence following.
	6063	* NOTE that EXACT is NOT covered here, as it is normally
	6064	* picked up by the optimiser separately.
	6065	*
	6066	* This is unfortunate as the optimiser isn't handling lookahead
	6067	* properly currently.
	6068	*
	6069	*/
	6070	while ((OP(first) == OPEN && (sawopen = 1)) \|\|
	6071	/* An OR of one alternative - should not happen now. */
	6072	(OP(first) == BRANCH && OP(first_next) != BRANCH) \|\|
	6073	/* for now we can't handle lookbehind IFMATCH*/
	6074	(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) \|\|
	6075	(OP(first) == PLUS) \|\|
	6076	(OP(first) == MINMOD) \|\|
	6077	/* An {n,m} with n>0 */
	6078	(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) \|\|
	6079	(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
	6080	{
	6081	/*
	6082	* the only op that could be a regnode is PLUS, all the rest
	6083	* will be regnode_1 or regnode_2.
	6084	*
	6085	*/
	6086	if (OP(first) == PLUS)
	6087	sawplus = 1;
	6088	else
	6089	first += regarglen[OP(first)];
	6090
	6091	first = NEXTOPER(first);
	6092	first_next= regnext(first);
	6093	}
	6094
	6095	/* Starting-point info. */
	6096	again:
	6097	DEBUG_PEEP("first:",first,0);
	6098	/* Ignore EXACT as we deal with it later. */
	6099	if (PL_regkind[OP(first)] == EXACT) {
	6100	if (OP(first) == EXACT)
	6101	NOOP; /* Empty, get anchored substr later. */
	6102	else
	6103	ri->regstclass = first;
	6104	}
	6105	#ifdef TRIE_STCLASS
	6106	else if (PL_regkind[OP(first)] == TRIE &&
	6107	((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
	6108	{
	6109	regnode *trie_op;
	6110	/* this can happen only on restudy */
	6111	if ( OP(first) == TRIE ) {
	6112	struct regnode_1 trieop = (struct regnode_1 )
	6113	PerlMemShared_calloc(1, sizeof(struct regnode_1));
	6114	StructCopy(first,trieop,struct regnode_1);
	6115	trie_op=(regnode *)trieop;
	6116	} else {
	6117	struct regnode_charclass trieop = (struct regnode_charclass )
	6118	PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
	6119	StructCopy(first,trieop,struct regnode_charclass);
	6120	trie_op=(regnode *)trieop;
	6121	}
	6122	OP(trie_op)+=2;
	6123	make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
	6124	ri->regstclass = trie_op;
	6125	}
	6126	#endif
	6127	else if (REGNODE_SIMPLE(OP(first)))
	6128	ri->regstclass = first;
	6129	else if (PL_regkind[OP(first)] == BOUND \|\|
	6130	PL_regkind[OP(first)] == NBOUND)
	6131	ri->regstclass = first;
	6132	else if (PL_regkind[OP(first)] == BOL) {
	6133	r->extflags \|= (OP(first) == MBOL
	6134	? RXf_ANCH_MBOL
	6135	: (OP(first) == SBOL
	6136	? RXf_ANCH_SBOL
	6137	: RXf_ANCH_BOL));
	6138	first = NEXTOPER(first);
	6139	goto again;
	6140	}
	6141	else if (OP(first) == GPOS) {
	6142	r->extflags \|= RXf_ANCH_GPOS;
	6143	first = NEXTOPER(first);
	6144	goto again;
	6145	}
	6146	else if ((!sawopen \|\| !RExC_sawback) &&
	6147	(OP(first) == STAR &&
	6148	PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
	6149	!(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
	6150	{
	6151	/* turn .* into ^.* with an implied $*=1 */
	6152	const int type =
	6153	(OP(NEXTOPER(first)) == REG_ANY)
	6154	? RXf_ANCH_MBOL
	6155	: RXf_ANCH_SBOL;
	6156	r->extflags \|= type;
	6157	r->intflags \|= PREGf_IMPLICIT;
	6158	first = NEXTOPER(first);
	6159	goto again;
	6160	}
	6161	if (sawplus && !sawlookahead && (!sawopen \|\| !RExC_sawback)
	6162	&& !pRExC_state->num_code_blocks) /* May examine pos and $& */
	6163	/* x+ must match at the 1st pos of run of x's */
	6164	r->intflags \|= PREGf_SKIP;
	6165
	6166	/* Scan is after the zeroth branch, first is atomic matcher. */
	6167	#ifdef TRIE_STUDY_OPT
	6168	DEBUG_PARSE_r(
	6169	if (!restudied)
	6170	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6171	(IV)(first - scan + 1))
	6172	);
	6173	#else
	6174	DEBUG_PARSE_r(
	6175	PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
	6176	(IV)(first - scan + 1))
	6177	);
	6178	#endif
	6179
	6180
	6181	/*
	6182	* If there's something expensive in the r.e., find the
	6183	* longest literal string that must appear and make it the
	6184	* regmust. Resolve ties in favor of later strings, since
	6185	* the regstart check works with the beginning of the r.e.
	6186	* and avoiding duplication strengthens checking. Not a
	6187	* strong reason, but sufficient in the absence of others.
	6188	* [Now we resolve ties in favor of the earlier string if
	6189	* it happens that c_offset_min has been invalidated, since the
	6190	* earlier string may buy us something the later one won't.]
	6191	*/
	6192
	6193	data.longest_fixed = newSVpvs("");
	6194	data.longest_float = newSVpvs("");
	6195	data.last_found = newSVpvs("");
	6196	data.longest = &(data.longest_fixed);
	6197	first = scan;
	6198	if (!ri->regstclass) {
	6199	cl_init(pRExC_state, &ch_class);
	6200	data.start_class = &ch_class;
	6201	stclass_flag = SCF_DO_STCLASS_AND;
	6202	} else /* XXXX Check for BOUND? */
	6203	stclass_flag = 0;
	6204	data.last_closep = &last_close;
	6205
	6206	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
	6207	&data, -1, NULL, NULL,
	6208	SCF_DO_SUBSTR \| SCF_WHILEM_VISITED_POS \| stclass_flag,0);
	6209
	6210
	6211	CHECK_RESTUDY_GOTO;
	6212
	6213
	6214	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
	6215	&& data.last_start_min == 0 && data.last_end > 0
	6216	&& !RExC_seen_zerolen
	6217	&& !(RExC_seen & REG_SEEN_VERBARG)
	6218	&& (!(RExC_seen & REG_SEEN_GPOS) \|\| (r->extflags & RXf_ANCH_GPOS)))
	6219	r->extflags \|= RXf_CHECK_ALL;
	6220	scan_commit(pRExC_state, &data,&minlen,0);
	6221	SvREFCNT_dec(data.last_found);
	6222
	6223	longest_float_length = CHR_SVLEN(data.longest_float);
	6224
	6225	if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
	6226	&& data.offset_fixed == data.offset_float_min
	6227	&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
	6228	&& S_setup_longest (aTHX_ pRExC_state,
	6229	data.longest_float,
	6230	&(r->float_utf8),
	6231	&(r->float_substr),
	6232	&(r->float_end_shift),
	6233	data.lookbehind_float,
	6234	data.offset_float_min,
	6235	data.minlen_float,
	6236	longest_float_length,
	6237	data.flags & SF_FL_BEFORE_EOL,
	6238	data.flags & SF_FL_BEFORE_MEOL))
	6239	{
	6240	r->float_min_offset = data.offset_float_min - data.lookbehind_float;
	6241	r->float_max_offset = data.offset_float_max;
	6242	if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
	6243	r->float_max_offset -= data.lookbehind_float;
	6244	}
	6245	else {
	6246	r->float_substr = r->float_utf8 = NULL;
	6247	SvREFCNT_dec(data.longest_float);
	6248	longest_float_length = 0;
	6249	}
	6250
	6251	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
	6252
	6253	if (S_setup_longest (aTHX_ pRExC_state,
	6254	data.longest_fixed,
	6255	&(r->anchored_utf8),
	6256	&(r->anchored_substr),
	6257	&(r->anchored_end_shift),
	6258	data.lookbehind_fixed,
	6259	data.offset_fixed,
	6260	data.minlen_fixed,
	6261	longest_fixed_length,
	6262	data.flags & SF_FIX_BEFORE_EOL,
	6263	data.flags & SF_FIX_BEFORE_MEOL))
	6264	{
	6265	r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
	6266	}
	6267	else {
	6268	r->anchored_substr = r->anchored_utf8 = NULL;
	6269	SvREFCNT_dec(data.longest_fixed);
	6270	longest_fixed_length = 0;
	6271	}
	6272
	6273	if (ri->regstclass
	6274	&& (OP(ri->regstclass) == REG_ANY \|\| OP(ri->regstclass) == SANY))
	6275	ri->regstclass = NULL;
	6276
	6277	if ((!(r->anchored_substr \|\| r->anchored_utf8) \|\| r->anchored_offset)
	6278	&& stclass_flag
	6279	&& !(data.start_class->flags & ANYOF_EOS)
	6280	&& !cl_is_anything(data.start_class))
	6281	{
	6282	const U32 n = add_data(pRExC_state, 1, "f");
	6283	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	6284
	6285	Newx(RExC_rxi->data->data[n], 1,
	6286	struct regnode_charclass_class);
	6287	StructCopy(data.start_class,
	6288	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6289	struct regnode_charclass_class);
	6290	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6291	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6292	DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
	6293	regprop(r, sv, (regnode*)data.start_class);
	6294	PerlIO_printf(Perl_debug_log,
	6295	"synthetic stclass \"%s\".\n",
	6296	SvPVX_const(sv));});
	6297	}
	6298
	6299	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
	6300	if (longest_fixed_length > longest_float_length) {
	6301	r->check_end_shift = r->anchored_end_shift;
	6302	r->check_substr = r->anchored_substr;
	6303	r->check_utf8 = r->anchored_utf8;
	6304	r->check_offset_min = r->check_offset_max = r->anchored_offset;
	6305	if (r->extflags & RXf_ANCH_SINGLE)
	6306	r->extflags \|= RXf_NOSCAN;
	6307	}
	6308	else {
	6309	r->check_end_shift = r->float_end_shift;
	6310	r->check_substr = r->float_substr;
	6311	r->check_utf8 = r->float_utf8;
	6312	r->check_offset_min = r->float_min_offset;
	6313	r->check_offset_max = r->float_max_offset;
	6314	}
	6315	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
	6316	This should be changed ASAP! */
	6317	if ((r->check_substr \|\| r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
	6318	r->extflags \|= RXf_USE_INTUIT;
	6319	if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
	6320	r->extflags \|= RXf_INTUIT_TAIL;
	6321	}
	6322	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
	6323	if ( (STRLEN)minlen < longest_float_length )
	6324	minlen= longest_float_length;
	6325	if ( (STRLEN)minlen < longest_fixed_length )
	6326	minlen= longest_fixed_length;
	6327	*/
	6328	}
	6329	else {
	6330	/* Several toplevels. Best we can do is to set minlen. */
	6331	I32 fake;
	6332	struct regnode_charclass_class ch_class;
	6333	I32 last_close = 0;
	6334
	6335	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
	6336
	6337	scan = ri->program + 1;
	6338	cl_init(pRExC_state, &ch_class);
	6339	data.start_class = &ch_class;
	6340	data.last_closep = &last_close;
	6341
	6342
	6343	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
	6344	&data, -1, NULL, NULL, SCF_DO_STCLASS_AND\|SCF_WHILEM_VISITED_POS,0);
	6345
	6346	CHECK_RESTUDY_GOTO;
	6347
	6348	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
	6349	= r->float_substr = r->float_utf8 = NULL;
	6350
	6351	if (!(data.start_class->flags & ANYOF_EOS)
	6352	&& !cl_is_anything(data.start_class))
	6353	{
	6354	const U32 n = add_data(pRExC_state, 1, "f");
	6355	data.start_class->flags \|= ANYOF_IS_SYNTHETIC;
	6356
	6357	Newx(RExC_rxi->data->data[n], 1,
	6358	struct regnode_charclass_class);
	6359	StructCopy(data.start_class,
	6360	(struct regnode_charclass_class*)RExC_rxi->data->data[n],
	6361	struct regnode_charclass_class);
	6362	ri->regstclass = (regnode*)RExC_rxi->data->data[n];
	6363	r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
	6364	DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
	6365	regprop(r, sv, (regnode*)data.start_class);
	6366	PerlIO_printf(Perl_debug_log,
	6367	"synthetic stclass \"%s\".\n",
	6368	SvPVX_const(sv));});
	6369	}
	6370	}
	6371
	6372	/* Guard against an embedded (?=) or (?<=) with a longer minlen than
	6373	the "real" pattern. */
	6374	DEBUG_OPTIMISE_r({
	6375	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
	6376	(IV)minlen, (IV)r->minlen);
	6377	});
	6378	r->minlenret = minlen;
	6379	if (r->minlen < minlen)
	6380	r->minlen = minlen;
	6381
	6382	if (RExC_seen & REG_SEEN_GPOS)
	6383	r->extflags \|= RXf_GPOS_SEEN;
	6384	if (RExC_seen & REG_SEEN_LOOKBEHIND)
	6385	r->extflags \|= RXf_LOOKBEHIND_SEEN;
	6386	if (pRExC_state->num_code_blocks)
	6387	r->extflags \|= RXf_EVAL_SEEN;
	6388	if (RExC_seen & REG_SEEN_CANY)
	6389	r->extflags \|= RXf_CANY_SEEN;
	6390	if (RExC_seen & REG_SEEN_VERBARG)
	6391	r->intflags \|= PREGf_VERBARG_SEEN;
	6392	if (RExC_seen & REG_SEEN_CUTGROUP)
	6393	r->intflags \|= PREGf_CUTGROUP_SEEN;
	6394	if (pm_flags & PMf_USE_RE_EVAL)
	6395	r->intflags \|= PREGf_USE_RE_EVAL;
	6396	if (RExC_paren_names)
	6397	RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
	6398	else
	6399	RXp_PAREN_NAMES(r) = NULL;
	6400
	6401	#ifdef STUPID_PATTERN_CHECKS
	6402	if (RX_PRELEN(rx) == 0)
	6403	r->extflags \|= RXf_NULL;
	6404	if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
	6405	r->extflags \|= RXf_WHITE;
	6406	else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
	6407	r->extflags \|= RXf_START_ONLY;
	6408	#else
	6409	{
	6410	regnode *first = ri->program + 1;
	6411	U8 fop = OP(first);
	6412
	6413	if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
	6414	r->extflags \|= RXf_NULL;
	6415	else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
	6416	r->extflags \|= RXf_START_ONLY;
	6417	else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
	6418	&& OP(regnext(first)) == END)
	6419	r->extflags \|= RXf_WHITE;
	6420	}
	6421	#endif
	6422	#ifdef DEBUGGING
	6423	if (RExC_paren_names) {
	6424	ri->name_list_idx = add_data( pRExC_state, 1, "a" );
	6425	ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
	6426	} else
	6427	#endif
	6428	ri->name_list_idx = 0;
	6429
	6430	if (RExC_recurse_count) {
	6431	for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
	6432	const regnode *scan = RExC_recurse[RExC_recurse_count-1];
	6433	ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
	6434	}
	6435	}
	6436	Newxz(r->offs, RExC_npar, regexp_paren_pair);
	6437	/* assume we don't need to swap parens around before we match */
	6438
	6439	DEBUG_DUMP_r({
	6440	PerlIO_printf(Perl_debug_log,"Final program:\n");
	6441	regdump(r);
	6442	});
	6443	#ifdef RE_TRACK_PATTERN_OFFSETS
	6444	DEBUG_OFFSETS_r(if (ri->u.offsets) {
	6445	const U32 len = ri->u.offsets[0];
	6446	U32 i;
	6447	GET_RE_DEBUG_FLAGS_DECL;
	6448	PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
	6449	for (i = 1; i <= len; i++) {
	6450	if (ri->u.offsets[i2-1] \|\| ri->u.offsets[i2])
	6451	PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
	6452	(UV)i, (UV)ri->u.offsets[i2-1], (UV)ri->u.offsets[i2]);
	6453	}
	6454	PerlIO_printf(Perl_debug_log, "\n");
	6455	});
	6456	#endif
	6457	return rx;
	6458	}
	6459
	6460
	6461	SV*
	6462	Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
	6463	const U32 flags)
	6464	{
	6465	PERL_ARGS_ASSERT_REG_NAMED_BUFF;
	6466
	6467	PERL_UNUSED_ARG(value);
	6468
	6469	if (flags & RXapif_FETCH) {
	6470	return reg_named_buff_fetch(rx, key, flags);
	6471	} else if (flags & (RXapif_STORE \| RXapif_DELETE \| RXapif_CLEAR)) {
	6472	Perl_croak_no_modify(aTHX);
	6473	return NULL;
	6474	} else if (flags & RXapif_EXISTS) {
	6475	return reg_named_buff_exists(rx, key, flags)
	6476	? &PL_sv_yes
	6477	: &PL_sv_no;
	6478	} else if (flags & RXapif_REGNAMES) {
	6479	return reg_named_buff_all(rx, flags);
	6480	} else if (flags & (RXapif_SCALAR \| RXapif_REGNAMES_COUNT)) {
	6481	return reg_named_buff_scalar(rx, flags);
	6482	} else {
	6483	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
	6484	return NULL;
	6485	}
	6486	}
	6487
	6488	SV*
	6489	Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
	6490	const U32 flags)
	6491	{
	6492	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
	6493	PERL_UNUSED_ARG(lastkey);
	6494
	6495	if (flags & RXapif_FIRSTKEY)
	6496	return reg_named_buff_firstkey(rx, flags);
	6497	else if (flags & RXapif_NEXTKEY)
	6498	return reg_named_buff_nextkey(rx, flags);
	6499	else {
	6500	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
	6501	return NULL;
	6502	}
	6503	}
	6504
	6505	SV*
	6506	Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
	6507	const U32 flags)
	6508	{
	6509	AV *retarray = NULL;
	6510	SV *ret;
	6511	struct regexp const rx = (struct regexp )SvANY(r);
	6512
	6513	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
	6514
	6515	if (flags & RXapif_ALL)
	6516	retarray=newAV();
	6517
	6518	if (rx && RXp_PAREN_NAMES(rx)) {
	6519	HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
	6520	if (he_str) {
	6521	IV i;
	6522	SV* sv_dat=HeVAL(he_str);
	6523	I32 nums=(I32)SvPVX(sv_dat);
	6524	for ( i=0; i<SvIVX(sv_dat); i++ ) {
	6525	if ((I32)(rx->nparens) >= nums[i]
	6526	&& rx->offs[nums[i]].start != -1
	6527	&& rx->offs[nums[i]].end != -1)
	6528	{
	6529	ret = newSVpvs("");
	6530	CALLREG_NUMBUF_FETCH(r,nums[i],ret);
	6531	if (!retarray)
	6532	return ret;
	6533	} else {
	6534	if (retarray)
	6535	ret = newSVsv(&PL_sv_undef);
	6536	}
	6537	if (retarray)
	6538	av_push(retarray, ret);
	6539	}
	6540	if (retarray)
	6541	return newRV_noinc(MUTABLE_SV(retarray));
	6542	}
	6543	}
	6544	return NULL;
	6545	}
	6546
	6547	bool
	6548	Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
	6549	const U32 flags)
	6550	{
	6551	struct regexp const rx = (struct regexp )SvANY(r);
	6552
	6553	PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
	6554
	6555	if (rx && RXp_PAREN_NAMES(rx)) {
	6556	if (flags & RXapif_ALL) {
	6557	return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
	6558	} else {
	6559	SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
	6560	if (sv) {
	6561	SvREFCNT_dec(sv);
	6562	return TRUE;
	6563	} else {
	6564	return FALSE;
	6565	}
	6566	}
	6567	} else {
	6568	return FALSE;
	6569	}
	6570	}
	6571
	6572	SV*
	6573	Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
	6574	{
	6575	struct regexp const rx = (struct regexp )SvANY(r);
	6576
	6577	PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
	6578
	6579	if ( rx && RXp_PAREN_NAMES(rx) ) {
	6580	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
	6581
	6582	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
	6583	} else {
	6584	return FALSE;
	6585	}
	6586	}
	6587
	6588	SV*
	6589	Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
	6590	{
	6591	struct regexp const rx = (struct regexp )SvANY(r);
	6592	GET_RE_DEBUG_FLAGS_DECL;
	6593
	6594	PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
	6595
	6596	if (rx && RXp_PAREN_NAMES(rx)) {
	6597	HV *hv = RXp_PAREN_NAMES(rx);
	6598	HE *temphe;
	6599	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6600	IV i;
	6601	IV parno = 0;
	6602	SV* sv_dat = HeVAL(temphe);
	6603	I32 nums = (I32)SvPVX(sv_dat);
	6604	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6605	if ((I32)(rx->lastparen) >= nums[i] &&
	6606	rx->offs[nums[i]].start != -1 &&
	6607	rx->offs[nums[i]].end != -1)
	6608	{
	6609	parno = nums[i];
	6610	break;
	6611	}
	6612	}
	6613	if (parno \|\| flags & RXapif_ALL) {
	6614	return newSVhek(HeKEY_hek(temphe));
	6615	}
	6616	}
	6617	}
	6618	return NULL;
	6619	}
	6620
	6621	SV*
	6622	Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
	6623	{
	6624	SV *ret;
	6625	AV *av;
	6626	I32 length;
	6627	struct regexp const rx = (struct regexp )SvANY(r);
	6628
	6629	PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
	6630
	6631	if (rx && RXp_PAREN_NAMES(rx)) {
	6632	if (flags & (RXapif_ALL \| RXapif_REGNAMES_COUNT)) {
	6633	return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
	6634	} else if (flags & RXapif_ONE) {
	6635	ret = CALLREG_NAMED_BUFF_ALL(r, (flags \| RXapif_REGNAMES));
	6636	av = MUTABLE_AV(SvRV(ret));
	6637	length = av_len(av);
	6638	SvREFCNT_dec(ret);
	6639	return newSViv(length + 1);
	6640	} else {
	6641	Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
	6642	return NULL;
	6643	}
	6644	}
	6645	return &PL_sv_undef;
	6646	}
	6647
	6648	SV*
	6649	Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
	6650	{
	6651	struct regexp const rx = (struct regexp )SvANY(r);
	6652	AV *av = newAV();
	6653
	6654	PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
	6655
	6656	if (rx && RXp_PAREN_NAMES(rx)) {
	6657	HV *hv= RXp_PAREN_NAMES(rx);
	6658	HE *temphe;
	6659	(void)hv_iterinit(hv);
	6660	while ( (temphe = hv_iternext_flags(hv,0)) ) {
	6661	IV i;
	6662	IV parno = 0;
	6663	SV* sv_dat = HeVAL(temphe);
	6664	I32 nums = (I32)SvPVX(sv_dat);
	6665	for ( i = 0; i < SvIVX(sv_dat); i++ ) {
	6666	if ((I32)(rx->lastparen) >= nums[i] &&
	6667	rx->offs[nums[i]].start != -1 &&
	6668	rx->offs[nums[i]].end != -1)
	6669	{
	6670	parno = nums[i];
	6671	break;
	6672	}
	6673	}
	6674	if (parno \|\| flags & RXapif_ALL) {
	6675	av_push(av, newSVhek(HeKEY_hek(temphe)));
	6676	}
	6677	}
	6678	}
	6679
	6680	return newRV_noinc(MUTABLE_SV(av));
	6681	}
	6682
	6683	void
	6684	Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
	6685	SV * const sv)
	6686	{
	6687	struct regexp const rx = (struct regexp )SvANY(r);
	6688	char *s = NULL;
	6689	I32 i = 0;
	6690	I32 s1, t1;
	6691	I32 n = paren;
	6692
	6693	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
	6694
	6695	if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
	6696	\|\| n == RX_BUFF_IDX_CARET_FULLMATCH
	6697	\|\| n == RX_BUFF_IDX_CARET_POSTMATCH
	6698	)
	6699	&& !(rx->extflags & RXf_PMf_KEEPCOPY)
	6700	)
	6701	goto ret_undef;
	6702
	6703	if (!rx->subbeg)
	6704	goto ret_undef;
	6705
	6706	if (n == RX_BUFF_IDX_CARET_FULLMATCH)
	6707	/* no need to distinguish between them any more */
	6708	n = RX_BUFF_IDX_FULLMATCH;
	6709
	6710	if ((n == RX_BUFF_IDX_PREMATCH \|\| n == RX_BUFF_IDX_CARET_PREMATCH)
	6711	&& rx->offs[0].start != -1)
	6712	{
	6713	/* $`, ${^PREMATCH} */
	6714	i = rx->offs[0].start;
	6715	s = rx->subbeg;
	6716	}
	6717	else
	6718	if ((n == RX_BUFF_IDX_POSTMATCH \|\| n == RX_BUFF_IDX_CARET_POSTMATCH)
	6719	&& rx->offs[0].end != -1)
	6720	{
	6721	/* $', ${^POSTMATCH} */
	6722	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
	6723	i = rx->sublen + rx->suboffset - rx->offs[0].end;
	6724	}
	6725	else
	6726	if ( 0 <= n && n <= (I32)rx->nparens &&
	6727	(s1 = rx->offs[n].start) != -1 &&
	6728	(t1 = rx->offs[n].end) != -1)
	6729	{
	6730	/* $&, ${^MATCH}, $1 ... */
	6731	i = t1 - s1;
	6732	s = rx->subbeg + s1 - rx->suboffset;
	6733	} else {
	6734	goto ret_undef;
	6735	}
	6736
	6737	assert(s >= rx->subbeg);
	6738	assert(rx->sublen >= (s - rx->subbeg) + i );
	6739	if (i >= 0) {
	6740	const int oldtainted = PL_tainted;
	6741	TAINT_NOT;
	6742	sv_setpvn(sv, s, i);
	6743	PL_tainted = oldtainted;
	6744	if ( (rx->extflags & RXf_CANY_SEEN)
	6745	? (RXp_MATCH_UTF8(rx)
	6746	&& (!i \|\| is_utf8_string((U8*)s, i)))
	6747	: (RXp_MATCH_UTF8(rx)) )
	6748	{
	6749	SvUTF8_on(sv);
	6750	}
	6751	else
	6752	SvUTF8_off(sv);
	6753	if (PL_tainting) {
	6754	if (RXp_MATCH_TAINTED(rx)) {
	6755	if (SvTYPE(sv) >= SVt_PVMG) {
	6756	MAGIC* const mg = SvMAGIC(sv);
	6757	MAGIC* mgt;
	6758	PL_tainted = 1;
	6759	SvMAGIC_set(sv, mg->mg_moremagic);
	6760	SvTAINT(sv);
	6761	if ((mgt = SvMAGIC(sv))) {
	6762	mg->mg_moremagic = mgt;
	6763	SvMAGIC_set(sv, mg);
	6764	}
	6765	} else {
	6766	PL_tainted = 1;
	6767	SvTAINT(sv);
	6768	}
	6769	} else
	6770	SvTAINTED_off(sv);
	6771	}
	6772	} else {
	6773	ret_undef:
	6774	sv_setsv(sv,&PL_sv_undef);
	6775	return;
	6776	}
	6777	}
	6778
	6779	void
	6780	Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
	6781	SV const * const value)
	6782	{
	6783	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
	6784
	6785	PERL_UNUSED_ARG(rx);
	6786	PERL_UNUSED_ARG(paren);
	6787	PERL_UNUSED_ARG(value);
	6788
	6789	if (!PL_localizing)
	6790	Perl_croak_no_modify(aTHX);
	6791	}
	6792
	6793	I32
	6794	Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
	6795	const I32 paren)
	6796	{
	6797	struct regexp const rx = (struct regexp )SvANY(r);
	6798	I32 i;
	6799	I32 s1, t1;
	6800
	6801	PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
	6802
	6803	/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
	6804	switch (paren) {
	6805	case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
	6806	if (!(rx->extflags & RXf_PMf_KEEPCOPY))
	6807	goto warn_undef;
	6808	/FALLTHROUGH/
	6809
	6810	case RX_BUFF_IDX_PREMATCH: /* $` */
	6811	if (rx->offs[0].start != -1) {
	6812	i = rx->offs[0].start;
	6813	if (i > 0) {
	6814	s1 = 0;
	6815	t1 = i;
	6816	goto getlen;
	6817	}
	6818	}
	6819	return 0;
	6820
	6821	case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
	6822	if (!(rx->extflags & RXf_PMf_KEEPCOPY))
	6823	goto warn_undef;
	6824	case RX_BUFF_IDX_POSTMATCH: /* $' */
	6825	if (rx->offs[0].end != -1) {
	6826	i = rx->sublen - rx->offs[0].end;
	6827	if (i > 0) {
	6828	s1 = rx->offs[0].end;
	6829	t1 = rx->sublen;
	6830	goto getlen;
	6831	}
	6832	}
	6833	return 0;
	6834
	6835	case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
	6836	if (!(rx->extflags & RXf_PMf_KEEPCOPY))
	6837	goto warn_undef;
	6838	/FALLTHROUGH/
	6839
	6840	/* $& / ${^MATCH}, $1, $2, ... */
	6841	default:
	6842	if (paren <= (I32)rx->nparens &&
	6843	(s1 = rx->offs[paren].start) != -1 &&
	6844	(t1 = rx->offs[paren].end) != -1)
	6845	{
	6846	i = t1 - s1;
	6847	goto getlen;
	6848	} else {
	6849	warn_undef:
	6850	if (ckWARN(WARN_UNINITIALIZED))
	6851	report_uninit((const SV *)sv);
	6852	return 0;
	6853	}
	6854	}
	6855	getlen:
	6856	if (i > 0 && RXp_MATCH_UTF8(rx)) {
	6857	const char * const s = rx->subbeg - rx->suboffset + s1;
	6858	const U8 *ep;
	6859	STRLEN el;
	6860
	6861	i = t1 - s1;
	6862	if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
	6863	i = el;
	6864	}
	6865	return i;
	6866	}
	6867
	6868	SV*
	6869	Perl_reg_qr_package(pTHX_ REGEXP * const rx)
	6870	{
	6871	PERL_ARGS_ASSERT_REG_QR_PACKAGE;
	6872	PERL_UNUSED_ARG(rx);
	6873	if (0)
	6874	return NULL;
	6875	else
	6876	return newSVpvs("Regexp");
	6877	}
	6878
	6879	/* Scans the name of a named buffer from the pattern.
	6880	* If flags is REG_RSN_RETURN_NULL returns null.
	6881	* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
	6882	* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
	6883	* to the parsed name as looked up in the RExC_paren_names hash.
	6884	* If there is an error throws a vFAIL().. type exception.
	6885	*/
	6886
	6887	#define REG_RSN_RETURN_NULL 0
	6888	#define REG_RSN_RETURN_NAME 1
	6889	#define REG_RSN_RETURN_DATA 2
	6890
	6891	STATIC SV*
	6892	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
	6893	{
	6894	char *name_start = RExC_parse;
	6895
	6896	PERL_ARGS_ASSERT_REG_SCAN_NAME;
	6897
	6898	if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
	6899	/* skip IDFIRST by using do...while */
	6900	if (UTF)
	6901	do {
	6902	RExC_parse += UTF8SKIP(RExC_parse);
	6903	} while (isALNUM_utf8((U8*)RExC_parse));
	6904	else
	6905	do {
	6906	RExC_parse++;
	6907	} while (isALNUM(*RExC_parse));
	6908	} else {
	6909	RExC_parse++; /* so the <- from the vFAIL is after the offending character */
	6910	vFAIL("Group name must start with a non-digit word character");
	6911	}
	6912	if ( flags ) {
	6913	SV* sv_name
	6914	= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
	6915	SVs_TEMP \| (UTF ? SVf_UTF8 : 0));
	6916	if ( flags == REG_RSN_RETURN_NAME)
	6917	return sv_name;
	6918	else if (flags==REG_RSN_RETURN_DATA) {
	6919	HE *he_str = NULL;
	6920	SV *sv_dat = NULL;
	6921	if ( ! sv_name ) /* should not happen*/
	6922	Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
	6923	if (RExC_paren_names)
	6924	he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
	6925	if ( he_str )
	6926	sv_dat = HeVAL(he_str);
	6927	if ( ! sv_dat )
	6928	vFAIL("Reference to nonexistent named group");
	6929	return sv_dat;
	6930	}
	6931	else {
	6932	Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
	6933	(unsigned long) flags);
	6934	}
	6935	assert(0); /* NOT REACHED */
	6936	}
	6937	return NULL;
	6938	}
	6939
	/* Parse-tracing helpers; everything is wrapped in DEBUG_PARSE_r().
	 *
	 * DEBUG_PARSE_MSG(funcname) prints one line prefix to Perl_debug_log:
	 *   - at most 10 bytes of the pattern still to be parsed, followed by
	 *     "..." if there was more than that or "<" otherwise, padded so the
	 *     column width stays constant; blanks are printed instead when
	 *     RExC_parse has not moved since the previous message;
	 *   - the node number (RExC_size + 1 under SIZE_ONLY, otherwise
	 *     REG_NODE_NUM(RExC_emit)), or blanks if unchanged since last time;
	 *   - <funcname>, indented by two columns per level of 'depth'.
	 * 'depth' is not a macro parameter: it must be a variable in scope at the
	 * point of use.  RExC_lastnum and RExC_lastparse are updated afterwards.
	 *
	 * The first format takes its precision and width from the argument list,
	 * hence "%.*s%-*s" with four arguments after it. */
	#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
	    int rem=(int)(RExC_end - RExC_parse);                       \
	    int cut;                                                    \
	    int num;                                                    \
	    int iscut=0;                                                \
	    if (rem>10) {                                               \
	        rem=10;                                                 \
	        iscut=1;                                                \
	    }                                                           \
	    cut=10-rem;                                                 \
	    if (RExC_lastparse!=RExC_parse)                             \
	        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
	            rem, RExC_parse,                                    \
	            cut + 4,                                            \
	            iscut ? "..." : "<"                                 \
	        );                                                      \
	    else                                                        \
	        PerlIO_printf(Perl_debug_log,"%16s","");                \
	                                                                \
	    if (SIZE_ONLY)                                              \
	       num = RExC_size + 1;                                     \
	    else                                                        \
	       num=REG_NODE_NUM(RExC_emit);                             \
	    if (RExC_lastnum!=num)                                      \
	       PerlIO_printf(Perl_debug_log,"|%4d",num);                \
	    else                                                        \
	       PerlIO_printf(Perl_debug_log,"|%4s","");                 \
	    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
	        (int)((depth*2)), "",                                   \
	        (funcname)                                              \
	    );                                                          \
	    RExC_lastnum=num;                                           \
	    RExC_lastparse=RExC_parse;                                  \
	})



	/* DEBUG_PARSE(funcname): the prefix above followed by a newline. */
	#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
	    DEBUG_PARSE_MSG((funcname));                            \
	    PerlIO_printf(Perl_debug_log,"%4s","\n");               \
	})
	/* DEBUG_PARSE_FMT(funcname,fmt,args): the prefix above followed by <args>
	 * printed with <fmt> and a newline.  <fmt> must be a string literal, as
	 * it is pasted against "\n". */
	#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
	    DEBUG_PARSE_MSG((funcname));                            \
	    PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
	})
	6985
	6986	/* This section of code defines the inversion list object and its methods. The
	6987	* interfaces are highly subject to change, so as much as possible is static to
	6988	* this file. An inversion list is here implemented as a malloc'd C UV array
	6989	* with some added info that is placed as UVs at the beginning in a header
	6990	* portion. An inversion list for Unicode is an array of code points, sorted
	6991	* by ordinal number. The zeroth element is the first code point in the list.
	6992	* The 1th element is the first element beyond that not in the list. In other
	6993	* words, the first range is
	6994	* invlist[0]..(invlist[1]-1)
	6995	* The other ranges follow. Thus every element whose index is divisible by two
	6996	* marks the beginning of a range that is in the list, and every element not
	6997	* divisible by two marks the beginning of a range not in the list. A single
	6998	* element inversion list that contains the single code point N generally
	6999	* consists of two elements
	7000	* invlist[0] == N
	7001	* invlist[1] == N+1
	7002	* (The exception is when N is the highest representable value on the
	7003	* machine, in which case the list containing just it would be a single
	7004	* element, itself. By extension, if the last range in the list extends to
	7005	* infinity, then the first element of that range will be in the inversion list
	7006	* at a position that is divisible by two, and is the final element in the
	7007	* list.)
	7008	* Taking the complement (inverting) an inversion list is quite simple, if the
	7009	* first element is 0, remove it; otherwise add a 0 element at the beginning.
	7010	* This implementation reserves an element at the beginning of each inversion
	7011	* list to contain 0 when the list contains 0, and contains 1 otherwise. The
	7012	* actual beginning of the list is either that element if 0, or the next one if
	7013	* 1.
	7014	*
	7015	* More about inversion lists can be found in "Unicode Demystified"
	7016	* Chapter 13 by Richard Gillam, published by Addison-Wesley.
	7017	* More will be coming when functionality is added later.
	7018	*
	7019	* The inversion list data structure is currently implemented as an SV pointing
	7020	* to an array of UVs that the SV thinks are bytes. This allows us to have an
	7021	* array of UV whose memory management is automatically handled by the existing
	7022	* facilities for SV's.
	7023	*
	7024	* Some of the methods should always be private to the implementation, and some
	7025	* should eventually be made public */
	7026
	7027	/* The header definitions are in F<inline_invlist.c> */
	7028
	/* Convert between a count of inversion-list elements and the byte size of
	 * the buffer that holds them plus the HEADER_LENGTH header UVs.  The
	 * argument is parenthesized so that an expression such as 'a + b' or
	 * 'a << b' converts as a whole. */
	#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
	#define FROM_INTERNAL_SIZE(x) (((x) / sizeof(UV)) - HEADER_LENGTH)

	/* Element count used by _new_invlist() when the caller passes a negative
	 * size */
	#define INVLIST_INITIAL_LEN 10
	7033
	7034	PERL_STATIC_INLINE UV*
	7035	S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
	7036	{
	7037	/* Returns a pointer to the first element in the inversion list's array.
	7038	* This is called upon initialization of an inversion list. Where the
	7039	* array begins depends on whether the list has the code point U+0000
	7040	* in it or not. The other parameter tells it whether the code that
	7041	* follows this call is about to put a 0 in the inversion list or not.
	7042	* The first element is either the element with 0, if 0, or the next one,
	7043	* if 1 */
	7044
	7045	UV* zero = get_invlist_zero_addr(invlist);
	7046
	7047	PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
	7048
	7049	/* Must be empty */
	7050	assert(! *_get_invlist_len_addr(invlist));
	7051
	7052	/* 1^1 = 0; 1^0 = 1 */
	7053	*zero = 1 ^ will_have_0;
	7054	return zero + *zero;
	7055	}
	7056
	7057	PERL_STATIC_INLINE UV*
	7058	S_invlist_array(pTHX_ SV* const invlist)
	7059	{
	7060	/* Returns the pointer to the inversion list's array. Every time the
	7061	* length changes, this needs to be called in case malloc or realloc moved
	7062	* it */
	7063
	7064	PERL_ARGS_ASSERT_INVLIST_ARRAY;
	7065
	7066	/* Must not be empty. If these fail, you probably didn't check for <len>
	7067	* being non-zero before trying to get the array */
	7068	assert(*_get_invlist_len_addr(invlist));
	7069	assert(*get_invlist_zero_addr(invlist) == 0
	7070	\|\| *get_invlist_zero_addr(invlist) == 1);
	7071
	7072	/* The array begins either at the element reserved for zero if the
	7073	* list contains 0 (that element will be set to 0), or otherwise the next
	7074	* element (in which case the reserved element will be set to 1). */
	7075	return (UV *) (get_invlist_zero_addr(invlist)
	7076	+ *get_invlist_zero_addr(invlist));
	7077	}
	7078
	7079	PERL_STATIC_INLINE void
	7080	S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
	7081	{
	7082	/* Sets the current number of elements stored in the inversion list */
	7083
	7084	PERL_ARGS_ASSERT_INVLIST_SET_LEN;
	7085
	7086	*_get_invlist_len_addr(invlist) = len;
	7087
	7088	assert(len <= SvLEN(invlist));
	7089
	7090	SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
	7091	/* If the list contains U+0000, that element is part of the header,
	7092	* and should not be counted as part of the array. It will contain
	7093	* 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
	7094	* subtract:
	7095	* SvCUR_set(invlist,
	7096	* TO_INTERNAL_SIZE(len
	7097	* - (*get_invlist_zero_addr(inv_list) ^ 1)));
	7098	* But, this is only valid if len is not 0. The consequences of not doing
	7099	* this is that the memory allocation code may think that 1 more UV is
	7100	* being used than actually is, and so might do an unnecessary grow. That
	7101	* seems worth not bothering to make this the precise amount.
	7102	*
	7103	* Note that when inverting, SvCUR shouldn't change */
	7104	}
	7105
	7106	PERL_STATIC_INLINE IV*
	7107	S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
	7108	{
	7109	/* Return the address of the UV that is reserved to hold the cached index
	7110	* */
	7111
	7112	PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
	7113
	7114	return (IV ) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET sizeof (UV)));
	7115	}
	7116
	7117	PERL_STATIC_INLINE IV
	7118	S_invlist_previous_index(pTHX_ SV* const invlist)
	7119	{
	7120	/* Returns cached index of previous search */
	7121
	7122	PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
	7123
	7124	return *get_invlist_previous_index_addr(invlist);
	7125	}
	7126
	7127	PERL_STATIC_INLINE void
	7128	S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
	7129	{
	7130	/* Caches <index> for later retrieval */
	7131
	7132	PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
	7133
	7134	assert(index == 0 \|\| index < (int) _invlist_len(invlist));
	7135
	7136	*get_invlist_previous_index_addr(invlist) = index;
	7137	}
	7138
	7139	PERL_STATIC_INLINE UV
	7140	S_invlist_max(pTHX_ SV* const invlist)
	7141	{
	7142	/* Returns the maximum number of elements storable in the inversion list's
	7143	* array, without having to realloc() */
	7144
	7145	PERL_ARGS_ASSERT_INVLIST_MAX;
	7146
	7147	return FROM_INTERNAL_SIZE(SvLEN(invlist));
	7148	}
	7149
	7150	PERL_STATIC_INLINE UV*
	7151	S_get_invlist_zero_addr(pTHX_ SV* invlist)
	7152	{
	7153	/* Return the address of the UV that is reserved to hold 0 if the inversion
	7154	* list contains 0. This has to be the last element of the heading, as the
	7155	* list proper starts with either it if 0, or the next element if not.
	7156	* (But we force it to contain either 0 or 1) */
	7157
	7158	PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
	7159
	7160	return (UV ) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET sizeof (UV)));
	7161	}
	7162
	7163	#ifndef PERL_IN_XSUB_RE
	7164	SV*
	7165	Perl__new_invlist(pTHX_ IV initial_size)
	7166	{
	7167
	7168	/* Return a pointer to a newly constructed inversion list, with enough
	7169	* space to store 'initial_size' elements. If that number is negative, a
	7170	* system default is used instead */
	7171
	7172	SV* new_list;
	7173
	7174	if (initial_size < 0) {
	7175	initial_size = INVLIST_INITIAL_LEN;
	7176	}
	7177
	7178	/* Allocate the initial space */
	7179	new_list = newSV(TO_INTERNAL_SIZE(initial_size));
	7180	invlist_set_len(new_list, 0);
	7181
	7182	/* Force iterinit() to be used to get iteration to work */
	7183	*get_invlist_iter_addr(new_list) = UV_MAX;
	7184
	7185	/* This should force a segfault if a method doesn't initialize this
	7186	* properly */
	7187	*get_invlist_zero_addr(new_list) = UV_MAX;
	7188
	7189	*get_invlist_previous_index_addr(new_list) = 0;
	7190	*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
	7191	#if HEADER_LENGTH != 5
	7192	# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
	7193	#endif
	7194
	7195	return new_list;
	7196	}
	7197	#endif
	7198
	7199	STATIC SV*
	7200	S__new_invlist_C_array(pTHX_ UV* list)
	7201	{
	7202	/* Return a pointer to a newly constructed inversion list, initialized to
	7203	* point to <list>, which has to be in the exact correct inversion list
	7204	* form, including internal fields. Thus this is a dangerous routine that
	7205	* should not be used in the wrong hands */
	7206
	7207	SV* invlist = newSV_type(SVt_PV);
	7208
	7209	PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
	7210
	7211	SvPV_set(invlist, (char *) list);
	7212	SvLEN_set(invlist, 0); /* Means we own the contents, and the system
	7213	shouldn't touch it */
	7214	SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
	7215
	7216	if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
	7217	Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
	7218	}
	7219
	7220	return invlist;
	7221	}
	7222
	7223	STATIC void
	7224	S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
	7225	{
	7226	/* Grow the maximum size of an inversion list */
	7227
	7228	PERL_ARGS_ASSERT_INVLIST_EXTEND;
	7229
	7230	SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
	7231	}
	7232
	7233	PERL_STATIC_INLINE void
	7234	S_invlist_trim(pTHX_ SV* const invlist)
	7235	{
	7236	PERL_ARGS_ASSERT_INVLIST_TRIM;
	7237
	7238	/* Change the length of the inversion list to how many entries it currently
	7239	* has */
	7240
	7241	SvPV_shrink_to_cur((SV *) invlist);
	7242	}
	7243
	/* Shorthand for _invlist_union_maybe_complement_2nd() with its third
	 * argument fixed to TRUE */
	#define _invlist_union_complement_2nd(a, b, output) \
	    _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
	7245
	7246	STATIC void
	7247	S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
	7248	{
	7249	/* Subject to change or removal. Append the range from 'start' to 'end' at
	7250	* the end of the inversion list. The range must be above any existing
	7251	* ones. */
	7252
	7253	UV* array;
	7254	UV max = invlist_max(invlist);
	7255	UV len = _invlist_len(invlist);
	7256
	7257	PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
	7258
	7259	if (len == 0) { /* Empty lists must be initialized */
	7260	array = _invlist_array_init(invlist, start == 0);
	7261	}
	7262	else {
	7263	/* Here, the existing list is non-empty. The current max entry in the
	7264	* list is generally the first value not in the set, except when the
	7265	* set extends to the end of permissible values, in which case it is
	7266	* the first entry in that final set, and so this call is an attempt to
	7267	* append out-of-order */
	7268
	7269	UV final_element = len - 1;
	7270	array = invlist_array(invlist);
	7271	if (array[final_element] > start
	7272	\|\| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
	7273	{
	7274	Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
	7275	array[final_element], start,
	7276	ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
	7277	}
	7278
	7279	/* Here, it is a legal append. If the new range begins with the first
	7280	* value not in the set, it is extending the set, so the new first
	7281	* value not in the set is one greater than the newly extended range.
	7282	* */
	7283	if (array[final_element] == start) {
	7284	if (end != UV_MAX) {
	7285	array[final_element] = end + 1;
	7286	}
	7287	else {
	7288	/* But if the end is the maximum representable on the machine,
	7289	* just let the range that this would extend to have no end */
	7290	invlist_set_len(invlist, len - 1);
	7291	}
	7292	return;
	7293	}
	7294	}
	7295
	7296	/* Here the new range doesn't extend any existing set. Add it */
	7297
	7298	len += 2; /* Includes an element each for the start and end of range */
	7299
	7300	/* If overflows the existing space, extend, which may cause the array to be
	7301	* moved */
	7302	if (max < len) {
	7303	invlist_extend(invlist, len);
	7304	invlist_set_len(invlist, len); /* Have to set len here to avoid assert
	7305	failure in invlist_array() */
	7306	array = invlist_array(invlist);
	7307	}
	7308	else {
	7309	invlist_set_len(invlist, len);
	7310	}
	7311
	7312	/* The next item on the list starts the range, the one after that is
	7313	* one past the new range. */
	7314	array[len - 2] = start;
	7315	if (end != UV_MAX) {
	7316	array[len - 1] = end + 1;
	7317	}
	7318	else {
	7319	/* But if the end is the maximum representable on the machine, just let
	7320	* the range have no end */
	7321	invlist_set_len(invlist, len - 1);
	7322	}
	7323	}
	7324
	7325	#ifndef PERL_IN_XSUB_RE
	7326
	7327	IV
	7328	Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
	7329	{
	7330	/* Searches the inversion list for the entry that contains the input code
	7331	* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
	7332	* return value is the index into the list's array of the range that
	7333	* contains <cp> */
	7334
	7335	IV low = 0;
	7336	IV mid;
	7337	IV high = _invlist_len(invlist);
	7338	const IV highest_element = high - 1;
	7339	const UV* array;
	7340
	7341	PERL_ARGS_ASSERT__INVLIST_SEARCH;
	7342
	7343	/* If list is empty, return failure. */
	7344	if (high == 0) {
	7345	return -1;
	7346	}
	7347
	7348	/* If the code point is before the first element, return failure. (We
	7349	* can't combine this with the test above, because we can't get the array
	7350	* unless we know the list is non-empty) */
	7351	array = invlist_array(invlist);
	7352
	7353	mid = invlist_previous_index(invlist);
	7354	assert(mid >=0 && mid <= highest_element);
	7355
	7356	/* <mid> contains the cache of the result of the previous call to this
	7357	* function (0 the first time). See if this call is for the same result,
	7358	* or if it is for mid-1. This is under the theory that calls to this
	7359	* function will often be for related code points that are near each other.
	7360	* And benchmarks show that caching gives better results. We also test
	7361	* here if the code point is within the bounds of the list. These tests
	7362	* replace others that would have had to be made anyway to make sure that
	7363	* the array bounds were not exceeded, and give us extra information at the
	7364	* same time */
	7365	if (cp >= array[mid]) {
	7366	if (cp >= array[highest_element]) {
	7367	return highest_element;
	7368	}
	7369
	7370	/* Here, array[mid] <= cp < array[highest_element]. This means that
	7371	* the final element is not the answer, so can exclude it; it also
	7372	* means that <mid> is not the final element, so can refer to 'mid + 1'
	7373	* safely */
	7374	if (cp < array[mid + 1]) {
	7375	return mid;
	7376	}
	7377	high--;
	7378	low = mid + 1;
	7379	}
	7380	else { /* cp < aray[mid] */
	7381	if (cp < array[0]) { /* Fail if outside the array */
	7382	return -1;
	7383	}
	7384	high = mid;
	7385	if (cp >= array[mid - 1]) {
	7386	goto found_entry;
	7387	}
	7388	}
	7389
	7390	/* Binary search. What we are looking for is <i> such that
	7391	* array[i] <= cp < array[i+1]
	7392	* The loop below converges on the i+1. Note that there may not be an
	7393	* (i+1)th element in the array, and things work nonetheless */
	7394	while (low < high) {
	7395	mid = (low + high) / 2;
	7396	assert(mid <= highest_element);
	7397	if (array[mid] <= cp) { /* cp >= array[mid] */
	7398	low = mid + 1;
	7399
	7400	/* We could do this extra test to exit the loop early.
	7401	if (cp < array[low]) {
	7402	return mid;
	7403	}
	7404	*/
	7405	}
	7406	else { /* cp < array[mid] */
	7407	high = mid;
	7408	}
	7409	}
	7410
	7411	found_entry:
	7412	high--;
	7413	invlist_set_previous_index(invlist, high);
	7414	return high;
	7415	}
	7416
	7417	void
	7418	Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
	7419	{
	7420	/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
	7421	* but is used when the swash has an inversion list. This makes this much
	7422	* faster, as it uses a binary search instead of a linear one. This is
	7423	* intimately tied to that function, and perhaps should be in utf8.c,
	7424	* except it is intimately tied to inversion lists as well. It assumes
	7425	* that <swatch> is all 0's on input */
	7426
	7427	UV current = start;
	7428	const IV len = _invlist_len(invlist);
	7429	IV i;
	7430	const UV * array;
	7431
	7432	PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
	7433
	7434	if (len == 0) { /* Empty inversion list */
	7435	return;
	7436	}
	7437
	7438	array = invlist_array(invlist);
	7439
	7440	/* Find which element it is */
	7441	i = _invlist_search(invlist, start);
	7442
	7443	/* We populate from <start> to <end> */
	7444	while (current < end) {
	7445	UV upper;
	7446
	7447	/* The inversion list gives the results for every possible code point
	7448	* after the first one in the list. Only those ranges whose index is
	7449	* even are ones that the inversion list matches. For the odd ones,
	7450	* and if the initial code point is not in the list, we have to skip
	7451	* forward to the next element */
	7452	if (i == -1 \|\| ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
	7453	i++;
	7454	if (i >= len) { /* Finished if beyond the end of the array */
	7455	return;
	7456	}
	7457	current = array[i];
	7458	if (current >= end) { /* Finished if beyond the end of what we
	7459	are populating */
	7460	if (LIKELY(end < UV_MAX)) {
	7461	return;
	7462	}
	7463
	7464	/* We get here when the upper bound is the maximum
	7465	* representable on the machine, and we are looking for just
	7466	* that code point. Have to special case it */
	7467	i = len;
	7468	goto join_end_of_list;
	7469	}
	7470	}
	7471	assert(current >= start);
	7472
	7473	/* The current range ends one below the next one, except don't go past
	7474	* <end> */
	7475	i++;
	7476	upper = (i < len && array[i] < end) ? array[i] : end;
	7477
	7478	/* Here we are in a range that matches. Populate a bit in the 3-bit U8
	7479	* for each code point in it */
	7480	for (; current < upper; current++) {
	7481	const STRLEN offset = (STRLEN)(current - start);
	7482	swatch[offset >> 3] \|= 1 << (offset & 7);
	7483	}
	7484
	7485	join_end_of_list:
	7486
	7487	/* Quit if at the end of the list */
	7488	if (i >= len) {
	7489
	7490	/* But first, have to deal with the highest possible code point on
	7491	* the platform. The previous code assumes that <end> is one
	7492	* beyond where we want to populate, but that is impossible at the
	7493	* platform's infinity, so have to handle it specially */
	7494	if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
	7495	{
	7496	const STRLEN offset = (STRLEN)(end - start);
	7497	swatch[offset >> 3] \|= 1 << (offset & 7);
	7498	}
	7499	return;
	7500	}
	7501
	7502	/* Advance to the next range, which will be for code points not in the
	7503	* inversion list */
	7504	current = array[i];
	7505	}
	7506
	7507	return;
	7508	}
	7509
	7510	void
	7511	Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
	7512	{
	7513	/* Take the union of two inversion lists and point <output> to it. *output
	7514	* should be defined upon input, and if it points to one of the two lists,
	7515	* the reference count to that list will be decremented. The first list,
	7516	* <a>, may be NULL, in which case a copy of the second list is returned.
	7517	* If <complement_b> is TRUE, the union is taken of the complement
	7518	* (inversion) of <b> instead of b itself.
	7519	*
	7520	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7521	* Richard Gillam, published by Addison-Wesley, and explained at some
	7522	* length there. The preface says to incorporate its examples into your
	7523	* code at your own risk.
	7524	*
	7525	* The algorithm is like a merge sort.
	7526	*
	7527	* XXX A potential performance improvement is to keep track as we go along
	7528	* if only one of the inputs contributes to the result, meaning the other
	7529	* is a subset of that one. In that case, we can skip the final copy and
	7530	* return the larger of the input lists, but then outside code might need
	7531	* to keep track of whether to free the input list or not */
	7532
	7533	UV* array_a; /* a's array */
	7534	UV* array_b;
	7535	UV len_a; /* length of a's array */
	7536	UV len_b;
	7537
	7538	SV* u; /* the resulting union */
	7539	UV* array_u;
	7540	UV len_u;
	7541
	7542	UV i_a = 0; /* current index into a's array */
	7543	UV i_b = 0;
	7544	UV i_u = 0;
	7545
	7546	/* running count, as explained in the algorithm source book; items are
	7547	* stopped accumulating and are output when the count changes to/from 0.
	7548	* The count is incremented when we start a range that's in the set, and
	7549	* decremented when we start a range that's not in the set. So its range
	7550	* is 0 to 2. Only when the count is zero is something not in the set.
	7551	*/
	7552	UV count = 0;
	7553
	7554	PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
	7555	assert(a != b);
	7556
	7557	/* If either one is empty, the union is the other one */
	7558	if (a == NULL \|\| ((len_a = _invlist_len(a)) == 0)) {
	7559	if (*output == a) {
	7560	if (a != NULL) {
	7561	SvREFCNT_dec(a);
	7562	}
	7563	}
	7564	if (*output != b) {
	7565	*output = invlist_clone(b);
	7566	if (complement_b) {
	7567	_invlist_invert(*output);
	7568	}
	7569	} /* else output already = b; /
	7570	return;
	7571	}
	7572	else if ((len_b = _invlist_len(b)) == 0) {
	7573	if (*output == b) {
	7574	SvREFCNT_dec(b);
	7575	}
	7576
	7577	/* The complement of an empty list is a list that has everything in it,
	7578	* so the union with <a> includes everything too */
	7579	if (complement_b) {
	7580	if (a == *output) {
	7581	SvREFCNT_dec(a);
	7582	}
	7583	*output = _new_invlist(1);
	7584	_append_range_to_invlist(*output, 0, UV_MAX);
	7585	}
	7586	else if (*output != a) {
	7587	*output = invlist_clone(a);
	7588	}
	7589	/* else output already = a; /
	7590	return;
	7591	}
	7592
	7593	/* Here both lists exist and are non-empty */
	7594	array_a = invlist_array(a);
	7595	array_b = invlist_array(b);
	7596
	7597	/* If are to take the union of 'a' with the complement of b, set it
	7598	* up so are looking at b's complement. */
	7599	if (complement_b) {
	7600
	7601	/* To complement, we invert: if the first element is 0, remove it. To
	7602	* do this, we just pretend the array starts one later, and clear the
	7603	* flag as we don't have to do anything else later */
	7604	if (array_b[0] == 0) {
	7605	array_b++;
	7606	len_b--;
	7607	complement_b = FALSE;
	7608	}
	7609	else {
	7610
	7611	/* But if the first element is not zero, we unshift a 0 before the
	7612	* array. The data structure reserves a space for that 0 (which
	7613	* should be a '1' right now), so physical shifting is unneeded,
	7614	* but temporarily change that element to 0. Before exiting the
	7615	* routine, we must restore the element to '1' */
	7616	array_b--;
	7617	len_b++;
	7618	array_b[0] = 0;
	7619	}
	7620	}
	7621
	7622	/* Size the union for the worst case: that the sets are completely
	7623	* disjoint */
	7624	u = _new_invlist(len_a + len_b);
	7625
	7626	/* Will contain U+0000 if either component does */
	7627	array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
	7628	\|\| (len_b > 0 && array_b[0] == 0));
	7629
	7630	/* Go through each list item by item, stopping when exhausted one of
	7631	* them */
	7632	while (i_a < len_a && i_b < len_b) {
	7633	UV cp; /* The element to potentially add to the union's array */
	7634	bool cp_in_set; /* is it in the the input list's set or not */
	7635
	7636	/* We need to take one or the other of the two inputs for the union.
	7637	* Since we are merging two sorted lists, we take the smaller of the
	7638	* next items. In case of a tie, we take the one that is in its set
	7639	* first. If we took one not in the set first, it would decrement the
	7640	* count, possibly to 0 which would cause it to be output as ending the
	7641	* range, and the next time through we would take the same number, and
	7642	* output it again as beginning the next range. By doing it the
	7643	* opposite way, there is no possibility that the count will be
	7644	* momentarily decremented to 0, and thus the two adjoining ranges will
	7645	* be seamlessly merged. (In a tie and both are in the set or both not
	7646	* in the set, it doesn't matter which we take first.) */
	7647	if (array_a[i_a] < array_b[i_b]
	7648	\|\| (array_a[i_a] == array_b[i_b]
	7649	&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7650	{
	7651	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7652	cp= array_a[i_a++];
	7653	}
	7654	else {
	7655	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7656	cp= array_b[i_b++];
	7657	}
	7658
	7659	/* Here, have chosen which of the two inputs to look at. Only output
	7660	* if the running count changes to/from 0, which marks the
	7661	* beginning/end of a range in that's in the set */
	7662	if (cp_in_set) {
	7663	if (count == 0) {
	7664	array_u[i_u++] = cp;
	7665	}
	7666	count++;
	7667	}
	7668	else {
	7669	count--;
	7670	if (count == 0) {
	7671	array_u[i_u++] = cp;
	7672	}
	7673	}
	7674	}
	7675
	7676	/* Here, we are finished going through at least one of the lists, which
	7677	* means there is something remaining in at most one. We check if the list
	7678	* that hasn't been exhausted is positioned such that we are in the middle
	7679	* of a range in its set or not. (i_a and i_b point to the element beyond
	7680	* the one we care about.) If in the set, we decrement 'count'; if 0, there
	7681	* is potentially more to output.
	7682	* There are four cases:
	7683	* 1) Both weren't in their sets, count is 0, and remains 0. What's left
	7684	* in the union is entirely from the non-exhausted set.
	7685	* 2) Both were in their sets, count is 2. Nothing further should
	7686	* be output, as everything that remains will be in the exhausted
	7687	* list's set, hence in the union; decrementing to 1 but not 0 insures
	7688	* that
	7689	* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
	7690	* Nothing further should be output because the union includes
	7691	* everything from the exhausted set. Not decrementing ensures that.
	7692	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
	7693	* decrementing to 0 insures that we look at the remainder of the
	7694	* non-exhausted set */
	7695	if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7696	\|\| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7697	{
	7698	count--;
	7699	}
	7700
	7701	/* The final length is what we've output so far, plus what else is about to
	7702	* be output. (If 'count' is non-zero, then the input list we exhausted
	7703	* has everything remaining up to the machine's limit in its set, and hence
	7704	* in the union, so there will be no further output. */
	7705	len_u = i_u;
	7706	if (count == 0) {
	7707	/* At most one of the subexpressions will be non-zero */
	7708	len_u += (len_a - i_a) + (len_b - i_b);
	7709	}
	7710
	7711	/* Set result to final length, which can change the pointer to array_u, so
	7712	* re-find it */
	7713	if (len_u != _invlist_len(u)) {
	7714	invlist_set_len(u, len_u);
	7715	invlist_trim(u);
	7716	array_u = invlist_array(u);
	7717	}
	7718
	7719	/* When 'count' is 0, the list that was exhausted (if one was shorter than
	7720	* the other) ended with everything above it not in its set. That means
	7721	* that the remaining part of the union is precisely the same as the
	7722	* non-exhausted list, so can just copy it unchanged. (If both list were
	7723	* exhausted at the same time, then the operations below will be both 0.)
	7724	*/
	7725	if (count == 0) {
	7726	IV copy_count; /* At most one will have a non-zero copy count */
	7727	if ((copy_count = len_a - i_a) > 0) {
	7728	Copy(array_a + i_a, array_u + i_u, copy_count, UV);
	7729	}
	7730	else if ((copy_count = len_b - i_b) > 0) {
	7731	Copy(array_b + i_b, array_u + i_u, copy_count, UV);
	7732	}
	7733	}
	7734
	7735	/* We may be removing a reference to one of the inputs */
	7736	if (a == output \|\| b == output) {
	7737	SvREFCNT_dec(*output);
	7738	}
	7739
	7740	/* If we've changed b, restore it */
	7741	if (complement_b) {
	7742	array_b[0] = 1;
	7743	}
	7744
	7745	*output = u;
	7746	return;
	7747	}
	7748
	7749	void
	7750	Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
	7751	{
	7752	/* Take the intersection of two inversion lists and point <i> to it. *i
	7753	* should be defined upon input, and if it points to one of the two lists,
	7754	* the reference count to that list will be decremented.
	7755	* If <complement_b> is TRUE, the result will be the intersection of <a>
	7756	* and the complement (or inversion) of <b> instead of <b> directly.
	7757	*
	7758	* The basis for this comes from "Unicode Demystified" Chapter 13 by
	7759	* Richard Gillam, published by Addison-Wesley, and explained at some
	7760	* length there. The preface says to incorporate its examples into your
	7761	* code at your own risk. In fact, it had bugs
	7762	*
	7763	* The algorithm is like a merge sort, and is essentially the same as the
	7764	* union above
	7765	*/
	7766
	7767	UV* array_a; /* a's array */
	7768	UV* array_b;
	7769	UV len_a; /* length of a's array */
	7770	UV len_b;
	7771
	7772	SV* r; /* the resulting intersection */
	7773	UV* array_r;
	7774	UV len_r;
	7775
	7776	UV i_a = 0; /* current index into a's array */
	7777	UV i_b = 0;
	7778	UV i_r = 0;
	7779
	7780	/* running count, as explained in the algorithm source book; items are
	7781	* stopped accumulating and are output when the count changes to/from 2.
	7782	* The count is incremented when we start a range that's in the set, and
	7783	* decremented when we start a range that's not in the set. So its range
	7784	* is 0 to 2. Only when the count is 2 is something in the intersection.
	7785	*/
	7786	UV count = 0;
	7787
	7788	PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
	7789	assert(a != b);
	7790
	7791	/* Special case if either one is empty */
	7792	len_a = _invlist_len(a);
	7793	if ((len_a == 0) \|\| ((len_b = _invlist_len(b)) == 0)) {
	7794
	7795	if (len_a != 0 && complement_b) {
	7796
	7797	/* Here, 'a' is not empty, therefore from the above 'if', 'b' must
	7798	* be empty. Here, also we are using 'b's complement, which hence
	7799	* must be every possible code point. Thus the intersection is
	7800	* simply 'a'. */
	7801	if (*i != a) {
	7802	*i = invlist_clone(a);
	7803
	7804	if (*i == b) {
	7805	SvREFCNT_dec(b);
	7806	}
	7807	}
	7808	/* else i is already 'a' /
	7809	return;
	7810	}
	7811
	7812	/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
	7813	* intersection must be empty */
	7814	if (*i == a) {
	7815	SvREFCNT_dec(a);
	7816	}
	7817	else if (*i == b) {
	7818	SvREFCNT_dec(b);
	7819	}
	7820	*i = _new_invlist(0);
	7821	return;
	7822	}
	7823
	7824	/* Here both lists exist and are non-empty */
	7825	array_a = invlist_array(a);
	7826	array_b = invlist_array(b);
	7827
	7828	/* If are to take the intersection of 'a' with the complement of b, set it
	7829	* up so are looking at b's complement. */
	7830	if (complement_b) {
	7831
	7832	/* To complement, we invert: if the first element is 0, remove it. To
	7833	* do this, we just pretend the array starts one later, and clear the
	7834	* flag as we don't have to do anything else later */
	7835	if (array_b[0] == 0) {
	7836	array_b++;
	7837	len_b--;
	7838	complement_b = FALSE;
	7839	}
	7840	else {
	7841
	7842	/* But if the first element is not zero, we unshift a 0 before the
	7843	* array. The data structure reserves a space for that 0 (which
	7844	* should be a '1' right now), so physical shifting is unneeded,
	7845	* but temporarily change that element to 0. Before exiting the
	7846	* routine, we must restore the element to '1' */
	7847	array_b--;
	7848	len_b++;
	7849	array_b[0] = 0;
	7850	}
	7851	}
	7852
	7853	/* Size the intersection for the worst case: that the intersection ends up
	7854	* fragmenting everything to be completely disjoint */
	7855	r= _new_invlist(len_a + len_b);
	7856
	7857	/* Will contain U+0000 iff both components do */
	7858	array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
	7859	&& len_b > 0 && array_b[0] == 0);
	7860
	7861	/* Go through each list item by item, stopping when exhausted one of
	7862	* them */
	7863	while (i_a < len_a && i_b < len_b) {
	7864	UV cp; /* The element to potentially add to the intersection's
	7865	array */
	7866	bool cp_in_set; /* Is it in the input list's set or not */
	7867
	7868	/* We need to take one or the other of the two inputs for the
	7869	* intersection. Since we are merging two sorted lists, we take the
	7870	* smaller of the next items. In case of a tie, we take the one that
	7871	* is not in its set first (a difference from the union algorithm). If
	7872	* we took one in the set first, it would increment the count, possibly
	7873	* to 2 which would cause it to be output as starting a range in the
	7874	* intersection, and the next time through we would take that same
	7875	* number, and output it again as ending the set. By doing it the
	7876	* opposite of this, there is no possibility that the count will be
	7877	* momentarily incremented to 2. (In a tie and both are in the set or
	7878	* both not in the set, it doesn't matter which we take first.) */
	7879	if (array_a[i_a] < array_b[i_b]
	7880	\|\| (array_a[i_a] == array_b[i_b]
	7881	&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
	7882	{
	7883	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
	7884	cp= array_a[i_a++];
	7885	}
	7886	else {
	7887	cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
	7888	cp= array_b[i_b++];
	7889	}
	7890
	7891	/* Here, have chosen which of the two inputs to look at. Only output
	7892	* if the running count changes to/from 2, which marks the
	7893	* beginning/end of a range that's in the intersection */
	7894	if (cp_in_set) {
	7895	count++;
	7896	if (count == 2) {
	7897	array_r[i_r++] = cp;
	7898	}
	7899	}
	7900	else {
	7901	if (count == 2) {
	7902	array_r[i_r++] = cp;
	7903	}
	7904	count--;
	7905	}
	7906	}
	7907
	7908	/* Here, we are finished going through at least one of the lists, which
	7909	* means there is something remaining in at most one. We check if the list
	7910	* that has been exhausted is positioned such that we are in the middle
	7911	* of a range in its set or not. (i_a and i_b point to elements 1 beyond
	7912	* the ones we care about.) There are four cases:
	7913	* 1) Both weren't in their sets, count is 0, and remains 0. There's
	7914	* nothing left in the intersection.
	7915	* 2) Both were in their sets, count is 2 and perhaps is incremented to
	7916	* above 2. What should be output is exactly that which is in the
	7917	* non-exhausted set, as everything it has is also in the intersection
	7918	* set, and everything it doesn't have can't be in the intersection
	7919	* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
	7920	* gets incremented to 2. Like the previous case, the intersection is
	7921	* everything that remains in the non-exhausted set.
	7922	* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
	7923	* remains 1. And the intersection has nothing more. */
	7924	if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
	7925	\|\| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
	7926	{
	7927	count++;
	7928	}
	7929
	7930	/* The final length is what we've output so far plus what else is in the
	7931	* intersection. At most one of the subexpressions below will be non-zero */
	7932	len_r = i_r;
	7933	if (count >= 2) {
	7934	len_r += (len_a - i_a) + (len_b - i_b);
	7935	}
	7936
	7937	/* Set result to final length, which can change the pointer to array_r, so
	7938	* re-find it */
	7939	if (len_r != _invlist_len(r)) {
	7940	invlist_set_len(r, len_r);
	7941	invlist_trim(r);
	7942	array_r = invlist_array(r);
	7943	}
	7944
	7945	/* Finish outputting any remaining */
	7946	if (count >= 2) { /* At most one will have a non-zero copy count */
	7947	IV copy_count;
	7948	if ((copy_count = len_a - i_a) > 0) {
	7949	Copy(array_a + i_a, array_r + i_r, copy_count, UV);
	7950	}
	7951	else if ((copy_count = len_b - i_b) > 0) {
	7952	Copy(array_b + i_b, array_r + i_r, copy_count, UV);
	7953	}
	7954	}
	7955
	7956	/* We may be removing a reference to one of the inputs */
	7957	if (a == i \|\| b == i) {
	7958	SvREFCNT_dec(*i);
	7959	}
	7960
	7961	/* If we've changed b, restore it */
	7962	if (complement_b) {
	7963	array_b[0] = 1;
	7964	}
	7965
	7966	*i = r;
	7967	return;
	7968	}
	7969
	7970	SV*
	7971	Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
	7972	{
	7973	/* Add the range from 'start' to 'end' inclusive to the inversion list's
	7974	* set. A pointer to the inversion list is returned. This may actually be
	7975	* a new list, in which case the passed in one has been destroyed. The
	7976	* passed in inversion list can be NULL, in which case a new one is created
	7977	* with just the one range in it */
	7978
	7979	SV* range_invlist;
	7980	UV len;
	7981
	7982	if (invlist == NULL) {
	7983	invlist = _new_invlist(2);
	7984	len = 0;
	7985	}
	7986	else {
	7987	len = _invlist_len(invlist);
	7988	}
	7989
	7990	/* If comes after the final entry, can just append it to the end */
	7991	if (len == 0
	7992	\|\| start >= invlist_array(invlist)
	7993	[_invlist_len(invlist) - 1])
	7994	{
	7995	_append_range_to_invlist(invlist, start, end);
	7996	return invlist;
	7997	}
	7998
	7999	/* Here, can't just append things, create and return a new inversion list
	8000	* which is the union of this range and the existing inversion list */
	8001	range_invlist = _new_invlist(2);
	8002	_append_range_to_invlist(range_invlist, start, end);
	8003
	8004	_invlist_union(invlist, range_invlist, &invlist);
	8005
	8006	/* The temporary can be freed */
	8007	SvREFCNT_dec(range_invlist);
	8008
	8009	return invlist;
	8010	}
	8011
	8012	#endif
	8013
	8014	PERL_STATIC_INLINE SV*
	8015	S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
	8016	return _add_range_to_invlist(invlist, cp, cp);
	8017	}
	8018
	8019	#ifndef PERL_IN_XSUB_RE
	8020	void
	8021	Perl__invlist_invert(pTHX_ SV* const invlist)
	8022	{
	8023	/* Complement the input inversion list. This adds a 0 if the list didn't
	8024	* have a zero; removes it otherwise. As described above, the data
	8025	* structure is set up so that this is very efficient */
	8026
	8027	UV* len_pos = _get_invlist_len_addr(invlist);
	8028
	8029	PERL_ARGS_ASSERT__INVLIST_INVERT;
	8030
	8031	/* The inverse of matching nothing is matching everything */
	8032	if (*len_pos == 0) {
	8033	_append_range_to_invlist(invlist, 0, UV_MAX);
	8034	return;
	8035	}
	8036
	8037	/* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
	8038	* zero element was a 0, so it is being removed, so the length decrements
	8039	* by 1; and vice-versa. SvCUR is unaffected */
	8040	if (*get_invlist_zero_addr(invlist) ^= 1) {
	8041	(*len_pos)--;
	8042	}
	8043	else {
	8044	(*len_pos)++;
	8045	}
	8046	}
	8047
	8048	void
	8049	Perl__invlist_invert_prop(pTHX_ SV* const invlist)
	8050	{
	8051	/* Complement the input inversion list (which must be a Unicode property,
	8052	* all of which don't match above the Unicode maximum code point.) And
	8053	* Perl has chosen to not have the inversion match above that either. This
	8054	* adds a 0x110000 if the list didn't end with it, and removes it if it did
	8055	*/
	8056
	8057	UV len;
	8058	UV* array;
	8059
	8060	PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
	8061
	8062	_invlist_invert(invlist);
	8063
	8064	len = _invlist_len(invlist);
	8065
	8066	if (len != 0) { /* If empty do nothing */
	8067	array = invlist_array(invlist);
	8068	if (array[len - 1] != PERL_UNICODE_MAX + 1) {
	8069	/* Add 0x110000. First, grow if necessary */
	8070	len++;
	8071	if (invlist_max(invlist) < len) {
	8072	invlist_extend(invlist, len);
	8073	array = invlist_array(invlist);
	8074	}
	8075	invlist_set_len(invlist, len);
	8076	array[len - 1] = PERL_UNICODE_MAX + 1;
	8077	}
	8078	else { /* Remove the 0x110000 */
	8079	invlist_set_len(invlist, len - 1);
	8080	}
	8081	}
	8082
	8083	return;
	8084	}
	8085	#endif
	8086
	8087	PERL_STATIC_INLINE SV*
	8088	S_invlist_clone(pTHX_ SV* const invlist)
	8089	{
	8090
	8091	/* Return a new inversion list that is a copy of the input one, which is
	8092	* unchanged */
	8093
	8094	/* Need to allocate extra space to accommodate Perl's addition of a
	8095	* trailing NUL to SvPV's, since it thinks they are always strings */
	8096	SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
	8097	STRLEN length = SvCUR(invlist);
	8098
	8099	PERL_ARGS_ASSERT_INVLIST_CLONE;
	8100
	8101	SvCUR_set(new_invlist, length); /* This isn't done automatically */
	8102	Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
	8103
	8104	return new_invlist;
	8105	}
	8106
	8107	PERL_STATIC_INLINE UV*
	8108	S_get_invlist_iter_addr(pTHX_ SV* invlist)
	8109	{
	8110	/* Return the address of the UV that contains the current iteration
	8111	* position */
	8112
	8113	PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
	8114
	8115	return (UV ) (SvPVX(invlist) + (INVLIST_ITER_OFFSET sizeof (UV)));
	8116	}
	8117
	8118	PERL_STATIC_INLINE UV*
	8119	S_get_invlist_version_id_addr(pTHX_ SV* invlist)
	8120	{
	8121	/* Return the address of the UV that contains the version id. */
	8122
	8123	PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
	8124
	8125	return (UV ) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET sizeof (UV)));
	8126	}
	8127
	8128	PERL_STATIC_INLINE void
	8129	S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
	8130	{
	8131	PERL_ARGS_ASSERT_INVLIST_ITERINIT;
	8132
	8133	*get_invlist_iter_addr(invlist) = 0;
	8134	}
	8135
	8136	STATIC bool
	8137	S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
	8138	{
	8139	/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
	8140	* This call sets in <start> and <end>, the next range in <invlist>.
	8141	* Returns <TRUE> if successful and the next call will return the next
	8142	* range; <FALSE> if was already at the end of the list. If the latter,
	8143	* <start> and <end> are unchanged, and the next call to this function
	8144	* will start over at the beginning of the list */
	8145
	8146	UV* pos = get_invlist_iter_addr(invlist);
	8147	UV len = _invlist_len(invlist);
	8148	UV *array;
	8149
	8150	PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
	8151
	8152	if (*pos >= len) {
	8153	pos = UV_MAX; / Force iternit() to be required next time */
	8154	return FALSE;
	8155	}
	8156
	8157	array = invlist_array(invlist);
	8158
	8159	start = array[(pos)++];
	8160
	8161	if (*pos >= len) {
	8162	*end = UV_MAX;
	8163	}
	8164	else {
	8165	end = array[(pos)++] - 1;
	8166	}
	8167
	8168	return TRUE;
	8169	}
	8170
	8171	PERL_STATIC_INLINE UV
	8172	S_invlist_highest(pTHX_ SV* const invlist)
	8173	{
	8174	/* Returns the highest code point that matches an inversion list. This API
	8175	* has an ambiguity, as it returns 0 under either the highest is actually
	8176	* 0, or if the list is empty. If this distinction matters to you, check
	8177	* for emptiness before calling this function */
	8178
	8179	UV len = _invlist_len(invlist);
	8180	UV *array;
	8181
	8182	PERL_ARGS_ASSERT_INVLIST_HIGHEST;
	8183
	8184	if (len == 0) {
	8185	return 0;
	8186	}
	8187
	8188	array = invlist_array(invlist);
	8189
	8190	/* The last element in the array in the inversion list always starts a
	8191	* range that goes to infinity. That range may be for code points that are
	8192	* matched in the inversion list, or it may be for ones that aren't
	8193	* matched. In the latter case, the highest code point in the set is one
	8194	* less than the beginning of this range; otherwise it is the final element
	8195	* of this range: infinity */
	8196	return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
	8197	? UV_MAX
	8198	: array[len - 1] - 1;
	8199	}
	8200
	8201	#ifndef PERL_IN_XSUB_RE
	8202	SV *
	8203	Perl__invlist_contents(pTHX_ SV* const invlist)
	8204	{
	8205	/* Get the contents of an inversion list into a string SV so that they can
	8206	* be printed out. It uses the format traditionally done for debug tracing
	8207	*/
	8208
	8209	UV start, end;
	8210	SV* output = newSVpvs("\n");
	8211
	8212	PERL_ARGS_ASSERT__INVLIST_CONTENTS;
	8213
	8214	invlist_iterinit(invlist);
	8215	while (invlist_iternext(invlist, &start, &end)) {
	8216	if (end == UV_MAX) {
	8217	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
	8218	}
	8219	else if (end != start) {
	8220	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
	8221	start, end);
	8222	}
	8223	else {
	8224	Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
	8225	}
	8226	}
	8227
	8228	return output;
	8229	}
	8230	#endif
	8231
	8232	#if 0
	8233	void
	8234	S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
	8235	{
	8236	/* Dumps out the ranges in an inversion list. The string 'header'
	8237	* if present is output on a line before the first range */
	8238
	8239	UV start, end;
	8240
	8241	if (header && strlen(header)) {
	8242	PerlIO_printf(Perl_debug_log, "%s\n", header);
	8243	}
	8244	invlist_iterinit(invlist);
	8245	while (invlist_iternext(invlist, &start, &end)) {
	8246	if (end == UV_MAX) {
	8247	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
	8248	}
	8249	else {
	8250	PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
	8251	}
	8252	}
	8253	}
	8254	#endif
	8255
	8256	#if 0
	8257	bool
	8258	S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
	8259	{
	8260	/* Return a boolean as to if the two passed in inversion lists are
	8261	* identical. The final argument, if TRUE, says to take the complement of
	8262	* the second inversion list before doing the comparison.
		*
		* a, b: the inversion lists to compare. When complement_b is TRUE and
		* b does not start with 0, the slot just before b's array is
		* temporarily overwritten with 0 and set back to 1 before returning.
		*
		* Two lists that are both empty (after any complementing) compare as
		* identical. (This function is currently compiled out by the
		* surrounding #if 0.) */
	8263
	8264	UV* array_a = invlist_array(a);
	8265	UV* array_b = invlist_array(b);
	8266	UV len_a = _invlist_len(a);
	8267	UV len_b = _invlist_len(b);
	8268
	8269	UV i = 0; /* current index into the arrays */
	8270	bool retval = TRUE; /* Assume are identical until proven otherwise */
	8271
	8272	PERL_ARGS_ASSERT__INVLISTEQ;
	8273
	8274	/* If are to compare 'a' with the complement of b, set it
	8275	* up so are looking at b's complement. */
	8276	if (complement_b) {
	8277
	8278	/* The complement of nothing is everything, so <a> would have to have
	8279	* just one element, starting at zero (ending at infinity) */
	8280	if (len_b == 0) {
	8281	return (len_a == 1 && array_a[0] == 0);
	8282	}
	8283	else if (array_b[0] == 0) {
	8284
	8285	/* Otherwise, to complement, we invert. Here, the first element is
	8286	* 0, just remove it. To do this, we just pretend the array starts
	8287	* one later, and clear the flag as we don't have to do anything
	8288	* else later */
	8289
	8290	array_b++;
	8291	len_b--;
	8292	complement_b = FALSE;
	8293	}
	8294	else {
	8295
	8296	/* But if the first element is not zero, we unshift a 0 before the
	8297	* array. The data structure reserves a space for that 0 (which
	8298	* should be a '1' right now), so physical shifting is unneeded,
	8299	* but temporarily change that element to 0. Before exiting the
	8300	* routine, we must restore the element to '1' */
	8301	array_b--;
	8302	len_b++;
	8303	array_b[0] = 0;
	8304	}
	8305	}
	8306
	8307	/* Make sure that the lengths are the same, as well as the final element
	8308	* before looping through the remainder. (Thus we test the length, final,
	8309	* and first elements right off the bat).
		*
		* The lengths are unsigned, so the final element may only be looked at
		* when there is one: with len_a == 0, 'len_a-1' would wrap around to a
		* huge index. */
	8310	if (len_a != len_b || (len_a > 0 && array_a[len_a-1] != array_b[len_a-1])) {
	8311	retval = FALSE;
	8312	}
		/* 'i + 1 < len_a' rather than 'i < len_a - 1' so that an empty list
		* runs the loop zero times instead of wrapping the bound */
	8313	else for (i = 0; i + 1 < len_a; i++) {
	8314	if (array_a[i] != array_b[i]) {
	8315	retval = FALSE;
	8316	break;
	8317	}
	8318	}
	8319
		/* Still set only in the "unshifted a 0" case above: undo that change */
	8320	if (complement_b) {
	8321	array_b[0] = 1;
	8322	}
	8323	return retval;
	8324	}
	8325	#endif
	8326
		/* Undefine the macros listed here so that none of them remains defined
		* for the rest of the file.
		* NOTE(review): presumably these are the inversion list code's private
		* helpers; their definitions are earlier in the file - confirm. */
	8327	#undef HEADER_LENGTH
	8328	#undef INVLIST_INITIAL_LENGTH
	8329	#undef TO_INTERNAL_SIZE
	8330	#undef FROM_INTERNAL_SIZE
	8331	#undef INVLIST_LEN_OFFSET
	8332	#undef INVLIST_ZERO_OFFSET
	8333	#undef INVLIST_ITER_OFFSET
	8334	#undef INVLIST_VERSION_ID
	8335
	8336	/* End of inversion list object */
	8337
	8338	/*
	8339	- reg - regular expression, i.e. main body or parenthesized thing
	8340	*
	8341	* Caller must absorb opening parenthesis.
	8342	*
	8343	* Combining parenthesis handling with the base level of regular expression
	8344	* is a trifle forced, but the need to tie the tails of the branches to what
	8345	* follows makes it hard to avoid.
	8346	*/
	8347	#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
	8348	#ifdef DEBUGGING
	8349	#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
	8350	#else
	8351	#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
	8352	#endif
	8353
	8354	STATIC regnode *
	8355	S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
	8356	/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
		/* Parse one group - or the whole pattern when paren is 0 - and emit the
		* nodes for it.
		*
		* pRExC_state: the compilation state, handed on to every helper called.
		* paren: 0 at top level, non-zero when the caller has consumed a '('.
		*   For "(?X..." constructs it is overwritten below with the character
		*   that identifies the construct (':', '=', '!', '>', and ',' for
		*   "(?<!"); it is reset to 1 for a named capture.
		* flagp: out-parameter. Cleared on entry; HASWIDTH, SPSTART and
		*   POSTPONED (and SIMPLE for an unbranched "(?:...)") may be OR-ed in.
		*   Set to TRYAGAIN when NULL is returned for "(?#...)" or for a
		*   flags-only group such as "(?i)".
		* depth: nesting depth; callees are given depth+1.
		*
		* Returns the head node of the group. Returns NULL with *flagp set to
		* TRYAGAIN in the two cases above, and also when regbranch() returns
		* NULL. Syntax errors are reported through the FAIL/vFAIL macros. */
	8357	{
	8358	dVAR;
	8359	regnode *ret; /* Will be the head of the group. */
	8360	regnode *br;
	8361	regnode *lastbr;
	8362	regnode *ender = NULL;
	8363	I32 parno = 0;
	8364	I32 flags;
	8365	U32 oregflags = RExC_flags;
	8366	bool have_branch = 0;
	8367	bool is_open = 0;
	8368	I32 freeze_paren = 0;
	8369	I32 after_freeze = 0;
	8370
	8371	/* for (?g), (?gc), and (?o) warnings; warning
	8372	about (?c) will warn about (?g) -- japhy */
	8373
	8374	#define WASTED_O 0x01
	8375	#define WASTED_G 0x02
	8376	#define WASTED_C 0x04
	8377	#define WASTED_GC (0x02|0x04)
	8378	I32 wastedflags = 0x00;
	8379
	8380	char * parse_start = RExC_parse; /* MJD */
	8381	char * const oregcomp_parse = RExC_parse;
	8382
	8383	GET_RE_DEBUG_FLAGS_DECL;
	8384
	8385	PERL_ARGS_ASSERT_REG;
	8386	DEBUG_PARSE("reg ");
	8387
	8388	*flagp = 0; /* Tentatively. */
	8389
	8390
	8391	/* Make an OPEN node, if parenthesized. */
	8392	if (paren) {
	8393	if ( *RExC_parse == '*') { /* (*VERB:ARG) */
	8394	char *start_verb = RExC_parse;
	8395	STRLEN verb_len = 0;
	8396	char *start_arg = NULL;
	8397	unsigned char op = 0;
	8398	int argok = 1;
	8399	int internal_argval = 0; /* internal_argval is only useful if !argok */
	8400	while ( *RExC_parse && *RExC_parse != ')' ) {
	8401	if ( *RExC_parse == ':' ) {
	8402	start_arg = RExC_parse + 1;
	8403	break;
	8404	}
	8405	RExC_parse++;
	8406	}
	8407	++start_verb;
	8408	verb_len = RExC_parse - start_verb;
	8409	if ( start_arg ) {
	8410	RExC_parse++;
	8411	while ( *RExC_parse && *RExC_parse != ')' )
	8412	RExC_parse++;
	8413	if ( *RExC_parse != ')' )
	8414	vFAIL("Unterminated verb pattern argument");
	8415	if ( RExC_parse == start_arg )
	8416	start_arg = NULL;
	8417	} else {
	8418	if ( *RExC_parse != ')' )
	8419	vFAIL("Unterminated verb pattern");
	8420	}
	8421
	8422	switch ( *start_verb ) {
	8423	case 'A': /* (*ACCEPT) */
	8424	if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
	8425	op = ACCEPT;
	8426	internal_argval = RExC_nestroot;
	8427	}
	8428	break;
	8429	case 'C': /* (*COMMIT) */
	8430	if ( memEQs(start_verb,verb_len,"COMMIT") )
	8431	op = COMMIT;
	8432	break;
	8433	case 'F': /* (*FAIL) */
	8434	if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
	8435	op = OPFAIL;
	8436	argok = 0;
	8437	}
	8438	break;
	8439	case ':': /* (*:NAME) */
	8440	case 'M': /* (*MARK:NAME) */
	8441	if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
	8442	op = MARKPOINT;
	8443	argok = -1;
	8444	}
	8445	break;
	8446	case 'P': /* (*PRUNE) */
	8447	if ( memEQs(start_verb,verb_len,"PRUNE") )
	8448	op = PRUNE;
	8449	break;
	8450	case 'S': /* (*SKIP) */
	8451	if ( memEQs(start_verb,verb_len,"SKIP") )
	8452	op = SKIP;
	8453	break;
	8454	case 'T': /* (*THEN) */
	8455	/* [19:06] <TimToady> :: is then */
	8456	if ( memEQs(start_verb,verb_len,"THEN") ) {
	8457	op = CUTGROUP;
	8458	RExC_seen |= REG_SEEN_CUTGROUP;
	8459	}
	8460	break;
	8461	}
	8462	if ( ! op ) {
	8463	RExC_parse++;
	8464	vFAIL3("Unknown verb pattern '%.*s'",
	8465	verb_len, start_verb);
	8466	}
	8467	if ( argok ) {
	8468	if ( start_arg && internal_argval ) {
	8469	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8470	verb_len, start_verb);
	8471	} else if ( argok < 0 && !start_arg ) {
	8472	vFAIL3("Verb pattern '%.*s' has a mandatory argument",
	8473	verb_len, start_verb);
	8474	} else {
	8475	ret = reganode(pRExC_state, op, internal_argval);
	8476	if ( ! internal_argval && ! SIZE_ONLY ) {
	8477	if (start_arg) {
	8478	SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
	8479	ARG(ret) = add_data( pRExC_state, 1, "S" );
	8480	RExC_rxi->data->data[ARG(ret)]=(void*)sv;
	8481	ret->flags = 0;
	8482	} else {
	8483	ret->flags = 1;
	8484	}
	8485	}
	8486	}
	8487	if (!internal_argval)
	8488	RExC_seen |= REG_SEEN_VERBARG;
	8489	} else if ( start_arg ) {
	8490	vFAIL3("Verb pattern '%.*s' may not have an argument",
	8491	verb_len, start_verb);
	8492	} else {
	8493	ret = reg_node(pRExC_state, op);
	8494	}
	8495	nextchar(pRExC_state);
	8496	return ret;
	8497	} else
	8498	if (*RExC_parse == '?') { /* (?...) */
	8499	bool is_logical = 0;
	8500	const char * const seqstart = RExC_parse;
	8501	bool has_use_defaults = FALSE;
	8502
	8503	RExC_parse++;
	8504	paren = *RExC_parse++;
	8505	ret = NULL; /* For look-ahead/behind. */
	8506	switch (paren) {
	8507
	8508	case 'P': /* (?P...) variants for those used to PCRE/Python */
	8509	paren = *RExC_parse++;
	8510	if ( paren == '<') /* (?P<...>) named capture */
	8511	goto named_capture;
	8512	else if (paren == '>') { /* (?P>name) named recursion */
	8513	goto named_recursion;
	8514	}
	8515	else if (paren == '=') { /* (?P=...) named backref */
	8516	/* this pretty much dupes the code for \k<NAME> in regatom(), if
	8517	you change this make sure you change that */
	8518	char* name_start = RExC_parse;
	8519	U32 num = 0;
	8520	SV *sv_dat = reg_scan_name(pRExC_state,
	8521	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8522	if (RExC_parse == name_start || *RExC_parse != ')')
	8523	vFAIL2("Sequence %.3s... not terminated",parse_start);
	8524
	8525	if (!SIZE_ONLY) {
	8526	num = add_data( pRExC_state, 1, "S" );
	8527	RExC_rxi->data->data[num]=(void*)sv_dat;
	8528	SvREFCNT_inc_simple_void(sv_dat);
	8529	}
	8530	RExC_sawback = 1;
	8531	ret = reganode(pRExC_state,
	8532	((! FOLD)
	8533	? NREF
	8534	: (ASCII_FOLD_RESTRICTED)
	8535	? NREFFA
	8536	: (AT_LEAST_UNI_SEMANTICS)
	8537	? NREFFU
	8538	: (LOC)
	8539	? NREFFL
	8540	: NREFF),
	8541	num);
	8542	*flagp |= HASWIDTH;
	8543
	8544	Set_Node_Offset(ret, parse_start+1);
	8545	Set_Node_Cur_Length(ret); /* MJD */
	8546
	8547	nextchar(pRExC_state);
	8548	return ret;
	8549	}
	8550	RExC_parse++;
	8551	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8552	/*NOTREACHED*/
	8553	case '<': /* (?<...) */
	8554	if (*RExC_parse == '!')
	8555	paren = ',';
	8556	else if (*RExC_parse != '=')
	8557	named_capture:
	8558	{ /* (?<...>) */
	8559	char *name_start;
	8560	SV *svname;
	8561	paren= '>';
	8562	case '\'': /* (?'...') */
	8563	name_start= RExC_parse;
	8564	svname = reg_scan_name(pRExC_state,
	8565	SIZE_ONLY ? /* reverse test from the others */
	8566	REG_RSN_RETURN_NAME :
	8567	REG_RSN_RETURN_NULL);
	8568	if (RExC_parse == name_start) {
	8569	RExC_parse++;
	8570	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8571	/*NOTREACHED*/
	8572	}
	8573	if (*RExC_parse != paren)
	8574	vFAIL2("Sequence (?%c... not terminated",
	8575	paren=='>' ? '<' : paren);
	8576	if (SIZE_ONLY) {
	8577	HE *he_str;
	8578	SV *sv_dat = NULL;
	8579	if (!svname) /* shouldn't happen */
	8580	Perl_croak(aTHX_
	8581	"panic: reg_scan_name returned NULL");
	8582	if (!RExC_paren_names) {
	8583	RExC_paren_names= newHV();
	8584	sv_2mortal(MUTABLE_SV(RExC_paren_names));
	8585	#ifdef DEBUGGING
	8586	RExC_paren_name_list= newAV();
	8587	sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
	8588	#endif
	8589	}
	8590	he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
	8591	if ( he_str )
	8592	sv_dat = HeVAL(he_str);
	8593	if ( ! sv_dat ) {
	8594	/* croak baby croak */
	8595	Perl_croak(aTHX_
	8596	"panic: paren_name hash element allocation failed");
	8597	} else if ( SvPOK(sv_dat) ) {
	8598	/* (?|...) can mean we have dupes so scan to check
	8599	its already been stored. Maybe a flag indicating
	8600	we are inside such a construct would be useful,
	8601	but the arrays are likely to be quite small, so
	8602	for now we punt -- dmq */
	8603	IV count = SvIV(sv_dat);
	8604	I32 *pv = (I32*)SvPVX(sv_dat);
	8605	IV i;
	8606	for ( i = 0 ; i < count ; i++ ) {
	8607	if ( pv[i] == RExC_npar ) {
	8608	count = 0;
	8609	break;
	8610	}
	8611	}
	8612	if ( count ) {
	8613	pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
	8614	SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
	8615	pv[count] = RExC_npar;
	8616	SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
	8617	}
	8618	} else {
	8619	(void)SvUPGRADE(sv_dat,SVt_PVNV);
	8620	sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
	8621	SvIOK_on(sv_dat);
	8622	SvIV_set(sv_dat, 1);
	8623	}
	8624	#ifdef DEBUGGING
	8625	/* Yes this does cause a memory leak in debugging Perls */
	8626	if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
	8627	SvREFCNT_dec(svname);
	8628	#endif
	8629
	8630	/*sv_dump(sv_dat);*/
	8631	}
	8632	nextchar(pRExC_state);
	8633	paren = 1;
	8634	goto capturing_parens;
	8635	}
	8636	RExC_seen |= REG_SEEN_LOOKBEHIND;
	8637	RExC_in_lookbehind++;
	8638	RExC_parse++;
	8639	case '=': /* (?=...) */
	8640	RExC_seen_zerolen++;
	8641	break;
	8642	case '!': /* (?!...) */
	8643	RExC_seen_zerolen++;
	8644	if (*RExC_parse == ')') {
	8645	ret=reg_node(pRExC_state, OPFAIL);
	8646	nextchar(pRExC_state);
	8647	return ret;
	8648	}
	8649	break;
	8650	case '|': /* (?|...) */
	8651	/* branch reset, behave like a (?:...) except that
	8652	buffers in alternations share the same numbers */
	8653	paren = ':';
	8654	after_freeze = freeze_paren = RExC_npar;
	8655	break;
	8656	case ':': /* (?:...) */
	8657	case '>': /* (?>...) */
	8658	break;
	8659	case '$': /* (?$...) */
	8660	case '@': /* (?@...) */
	8661	vFAIL2("Sequence (?%c...) not implemented", (int)paren);
	8662	break;
	8663	case '#': /* (?#...) */
	8664	while (*RExC_parse && *RExC_parse != ')')
	8665	RExC_parse++;
	8666	if (*RExC_parse != ')')
	8667	FAIL("Sequence (?#... not terminated");
	8668	nextchar(pRExC_state);
	8669	*flagp = TRYAGAIN;
	8670	return NULL;
	8671	case '0' : /* (?0) */
	8672	case 'R' : /* (?R) */
	8673	if (*RExC_parse != ')')
	8674	FAIL("Sequence (?R) not terminated");
	8675	ret = reg_node(pRExC_state, GOSTART);
	8676	*flagp |= POSTPONED;
	8677	nextchar(pRExC_state);
	8678	return ret;
	8679	/*notreached*/
	8680	{ /* named and numeric backreferences */
	8681	I32 num;
	8682	case '&': /* (?&NAME) */
	8683	parse_start = RExC_parse - 1;
	8684	named_recursion:
	8685	{
	8686	SV *sv_dat = reg_scan_name(pRExC_state,
	8687	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8688	num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
	8689	}
	8690	goto gen_recurse_regop;
	8691	assert(0); /* NOT REACHED */
	8692	case '+':
	8693	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	8694	RExC_parse++;
	8695	vFAIL("Illegal pattern");
	8696	}
	8697	goto parse_recursion;
	8698	/* NOT REACHED*/
	8699	case '-': /* (?-1) */
	8700	if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
	8701	RExC_parse--; /* rewind to let it be handled later */
	8702	goto parse_flags;
	8703	}
	8704	/*FALLTHROUGH */
	8705	case '1': case '2': case '3': case '4': /* (?1) */
	8706	case '5': case '6': case '7': case '8': case '9':
	8707	RExC_parse--;
	8708	parse_recursion:
	8709	num = atoi(RExC_parse);
	8710	parse_start = RExC_parse - 1; /* MJD */
	8711	if (*RExC_parse == '-')
	8712	RExC_parse++;
	8713	while (isDIGIT(*RExC_parse))
	8714	RExC_parse++;
	8715	if (*RExC_parse!=')')
	8716	vFAIL("Expecting close bracket");
	8717
	8718	gen_recurse_regop:
	8719	if ( paren == '-' ) {
	8720	/*
	8721	Diagram of capture buffer numbering.
	8722	Top line is the normal capture buffer numbers
	8723	Bottom line is the negative indexing as from
	8724	the X (the (?-2))
	8725
	8726	+   1 2    3 4 5 X          6 7
	8727	   /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
	8728	-   5 4    3 2 1 X          x x
	8729
	8730	*/
	8731	num = RExC_npar + num;
	8732	if (num < 1) {
	8733	RExC_parse++;
	8734	vFAIL("Reference to nonexistent group");
	8735	}
	8736	} else if ( paren == '+' ) {
	8737	num = RExC_npar + num - 1;
	8738	}
	8739
	8740	ret = reganode(pRExC_state, GOSUB, num);
	8741	if (!SIZE_ONLY) {
	8742	if (num > (I32)RExC_rx->nparens) {
	8743	RExC_parse++;
	8744	vFAIL("Reference to nonexistent group");
	8745	}
	8746	ARG2L_SET( ret, RExC_recurse_count++);
	8747	RExC_emit++;
	8748	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	8749	"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
	8750	} else {
	8751	RExC_size++;
	8752	}
	8753	RExC_seen |= REG_SEEN_RECURSE;
	8754	Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
	8755	Set_Node_Offset(ret, parse_start); /* MJD */
	8756
	8757	*flagp |= POSTPONED;
	8758	nextchar(pRExC_state);
	8759	return ret;
	8760	} /* named and numeric backreferences */
	8761	assert(0); /* NOT REACHED */
	8762
	8763	case '?': /* (??...) */
	8764	is_logical = 1;
	8765	if (*RExC_parse != '{') {
	8766	RExC_parse++;
	8767	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	8768	/*NOTREACHED*/
	8769	}
	8770	*flagp |= POSTPONED;
	8771	paren = *RExC_parse++;
	8772	/* FALL THROUGH */
	8773	case '{': /* (?{...}) */
	8774	{
	8775	U32 n = 0;
	8776	struct reg_code_block *cb;
	8777
	8778	RExC_seen_zerolen++;
	8779
	8780	if ( !pRExC_state->num_code_blocks
	8781	|| pRExC_state->code_index >= pRExC_state->num_code_blocks
	8782	|| pRExC_state->code_blocks[pRExC_state->code_index].start
	8783	!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
	8784	- RExC_start)
	8785	) {
	8786	if (RExC_pm_flags & PMf_USE_RE_EVAL)
	8787	FAIL("panic: Sequence (?{...}): no code block found\n");
	8788	FAIL("Eval-group not allowed at runtime, use re 'eval'");
	8789	}
	8790	/* this is a pre-compiled code block (?{...}) */
	8791	cb = &pRExC_state->code_blocks[pRExC_state->code_index];
	8792	RExC_parse = RExC_start + cb->end;
	8793	if (!SIZE_ONLY) {
	8794	OP *o = cb->block;
	8795	if (cb->src_regex) {
	8796	n = add_data(pRExC_state, 2, "rl");
	8797	RExC_rxi->data->data[n] =
	8798	(void*)SvREFCNT_inc((SV*)cb->src_regex);
	8799	RExC_rxi->data->data[n+1] = (void*)o;
	8800	}
	8801	else {
	8802	n = add_data(pRExC_state, 1,
	8803	(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
	8804	RExC_rxi->data->data[n] = (void*)o;
	8805	}
	8806	}
	8807	pRExC_state->code_index++;
	8808	nextchar(pRExC_state);
	8809
	8810	if (is_logical) {
	8811	regnode *eval;
	8812	ret = reg_node(pRExC_state, LOGICAL);
	8813	eval = reganode(pRExC_state, EVAL, n);
	8814	if (!SIZE_ONLY) {
	8815	ret->flags = 2;
	8816	/* for later propagation into (??{}) return value */
	8817	eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
	8818	}
	8819	REGTAIL(pRExC_state, ret, eval);
	8820	/* deal with the length of this later - MJD */
	8821	return ret;
	8822	}
	8823	ret = reganode(pRExC_state, EVAL, n);
	8824	Set_Node_Length(ret, RExC_parse - parse_start + 1);
	8825	Set_Node_Offset(ret, parse_start);
	8826	return ret;
	8827	}
	8828	case '(': /* (?(?{...})...) and (?(?=...)...) */
	8829	{
	8830	int is_define= 0;
	8831	if (RExC_parse[0] == '?') { /* (?(?...)) */
	8832	if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
	8833	|| RExC_parse[1] == '<'
	8834	|| RExC_parse[1] == '{') { /* Lookahead or eval. */
	8835	I32 flag;
	8836
	8837	ret = reg_node(pRExC_state, LOGICAL);
	8838	if (!SIZE_ONLY)
	8839	ret->flags = 1;
	8840	REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
	8841	goto insert_if;
	8842	}
	8843	}
	8844	else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
	8845	|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
	8846	{
	8847	char ch = RExC_parse[0] == '<' ? '>' : '\'';
	8848	char *name_start= RExC_parse++;
	8849	U32 num = 0;
	8850	SV *sv_dat=reg_scan_name(pRExC_state,
	8851	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8852	if (RExC_parse == name_start || *RExC_parse != ch)
	8853	vFAIL2("Sequence (?(%c... not terminated",
	8854	(ch == '>' ? '<' : ch));
	8855	RExC_parse++;
	8856	if (!SIZE_ONLY) {
	8857	num = add_data( pRExC_state, 1, "S" );
	8858	RExC_rxi->data->data[num]=(void*)sv_dat;
	8859	SvREFCNT_inc_simple_void(sv_dat);
	8860	}
	8861	ret = reganode(pRExC_state,NGROUPP,num);
	8862	goto insert_if_check_paren;
	8863	}
	8864	else if (RExC_parse[0] == 'D' &&
	8865	RExC_parse[1] == 'E' &&
	8866	RExC_parse[2] == 'F' &&
	8867	RExC_parse[3] == 'I' &&
	8868	RExC_parse[4] == 'N' &&
	8869	RExC_parse[5] == 'E')
	8870	{
	8871	ret = reganode(pRExC_state,DEFINEP,0);
	8872	RExC_parse +=6 ;
	8873	is_define = 1;
	8874	goto insert_if_check_paren;
	8875	}
	8876	else if (RExC_parse[0] == 'R') {
	8877	RExC_parse++;
	8878	parno = 0;
	8879	if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	8880	parno = atoi(RExC_parse++);
	8881	while (isDIGIT(*RExC_parse))
	8882	RExC_parse++;
	8883	} else if (RExC_parse[0] == '&') {
	8884	SV *sv_dat;
	8885	RExC_parse++;
	8886	sv_dat = reg_scan_name(pRExC_state,
	8887	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	8888	parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
	8889	}
	8890	ret = reganode(pRExC_state,INSUBP,parno);
	8891	goto insert_if_check_paren;
	8892	}
	8893	else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
	8894	/* (?(1)...) */
	8895	char c;
	8896	parno = atoi(RExC_parse++);
	8897
	8898	while (isDIGIT(*RExC_parse))
	8899	RExC_parse++;
	8900	ret = reganode(pRExC_state, GROUPP, parno);
	8901
	8902	insert_if_check_paren:
	8903	if ((c = *nextchar(pRExC_state)) != ')')
	8904	vFAIL("Switch condition not recognized");
	8905	insert_if:
	8906	REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
	8907	br = regbranch(pRExC_state, &flags, 1,depth+1);
	8908	if (br == NULL)
	8909	br = reganode(pRExC_state, LONGJMP, 0);
	8910	else
	8911	REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
	8912	c = *nextchar(pRExC_state);
	8913	if (flags&HASWIDTH)
	8914	*flagp |= HASWIDTH;
	8915	if (c == '|') {
	8916	if (is_define)
	8917	vFAIL("(?(DEFINE)....) does not allow branches");
	8918	lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
	8919	regbranch(pRExC_state, &flags, 1,depth+1);
	8920	REGTAIL(pRExC_state, ret, lastbr);
	8921	if (flags&HASWIDTH)
	8922	*flagp |= HASWIDTH;
	8923	c = *nextchar(pRExC_state);
	8924	}
	8925	else
	8926	lastbr = NULL;
	8927	if (c != ')')
	8928	vFAIL("Switch (?(condition)... contains too many branches");
	8929	ender = reg_node(pRExC_state, TAIL);
	8930	REGTAIL(pRExC_state, br, ender);
	8931	if (lastbr) {
	8932	REGTAIL(pRExC_state, lastbr, ender);
	8933	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
	8934	}
	8935	else
	8936	REGTAIL(pRExC_state, ret, ender);
	8937	RExC_size++; /* XXX WHY do we need this?!!
	8938	For large programs it seems to be required
	8939	but I can't figure out why. -- dmq*/
	8940	return ret;
	8941	}
	8942	else {
	8943	vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
	8944	}
	8945	}
	8946	case 0:
	8947	RExC_parse--; /* for vFAIL to print correctly */
	8948	vFAIL("Sequence (? incomplete");
	8949	break;
	8950	case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
	8951	that follow */
	8952	has_use_defaults = TRUE;
	8953	STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
	8954	set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
	8955	? REGEX_UNICODE_CHARSET
	8956	: REGEX_DEPENDS_CHARSET);
	8957	goto parse_flags;
	8958	default:
	8959	--RExC_parse;
	8960	parse_flags: /* (?i) */
	8961	{
	8962	U32 posflags = 0, negflags = 0;
	8963	U32 *flagsp = &posflags;
	8964	char has_charset_modifier = '\0';
	8965	regex_charset cs = get_regex_charset(RExC_flags);
	8966	if (cs == REGEX_DEPENDS_CHARSET
	8967	&& (RExC_utf8 || RExC_uni_semantics))
	8968	{
	8969	cs = REGEX_UNICODE_CHARSET;
	8970	}
	8971
	8972	while (*RExC_parse) {
	8973	/* && strchr("iogcmsx", *RExC_parse) */
	8974	/* (?g), (?gc) and (?o) are useless here
	8975	and must be globally applied -- japhy */
	8976	switch (*RExC_parse) {
	8977	CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
	8978	case LOCALE_PAT_MOD:
	8979	if (has_charset_modifier) {
	8980	goto excess_modifier;
	8981	}
	8982	else if (flagsp == &negflags) {
	8983	goto neg_modifier;
	8984	}
	8985	cs = REGEX_LOCALE_CHARSET;
	8986	has_charset_modifier = LOCALE_PAT_MOD;
	8987	RExC_contains_locale = 1;
	8988	break;
	8989	case UNICODE_PAT_MOD:
	8990	if (has_charset_modifier) {
	8991	goto excess_modifier;
	8992	}
	8993	else if (flagsp == &negflags) {
	8994	goto neg_modifier;
	8995	}
	8996	cs = REGEX_UNICODE_CHARSET;
	8997	has_charset_modifier = UNICODE_PAT_MOD;
	8998	break;
	8999	case ASCII_RESTRICT_PAT_MOD:
	9000	if (flagsp == &negflags) {
	9001	goto neg_modifier;
	9002	}
	9003	if (has_charset_modifier) {
	9004	if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
	9005	goto excess_modifier;
	9006	}
	9007	/* Doubled modifier implies more restricted */
	9008	cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
	9009	}
	9010	else {
	9011	cs = REGEX_ASCII_RESTRICTED_CHARSET;
	9012	}
	9013	has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
	9014	break;
	9015	case DEPENDS_PAT_MOD:
	9016	if (has_use_defaults) {
	9017	goto fail_modifiers;
	9018	}
	9019	else if (flagsp == &negflags) {
	9020	goto neg_modifier;
	9021	}
	9022	else if (has_charset_modifier) {
	9023	goto excess_modifier;
	9024	}
	9025
	9026	/* The dual charset means unicode semantics if the
	9027	* pattern (or target, not known until runtime) are
	9028	* utf8, or something in the pattern indicates unicode
	9029	* semantics */
	9030	cs = (RExC_utf8 || RExC_uni_semantics)
	9031	? REGEX_UNICODE_CHARSET
	9032	: REGEX_DEPENDS_CHARSET;
	9033	has_charset_modifier = DEPENDS_PAT_MOD;
	9034	break;
	9035	excess_modifier:
	9036	RExC_parse++;
	9037	if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
	9038	vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
	9039	}
	9040	else if (has_charset_modifier == *(RExC_parse - 1)) {
	9041	vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
	9042	}
	9043	else {
	9044	vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
	9045	}
	9046	/*NOTREACHED*/
	9047	neg_modifier:
	9048	RExC_parse++;
	9049	vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
	9050	/*NOTREACHED*/
	9051	case ONCE_PAT_MOD: /* 'o' */
	9052	case GLOBAL_PAT_MOD: /* 'g' */
	9053	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	9054	const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
	9055	if (! (wastedflags & wflagbit) ) {
	9056	wastedflags |= wflagbit;
	9057	vWARN5(
	9058	RExC_parse + 1,
	9059	"Useless (%s%c) - %suse /%c modifier",
	9060	flagsp == &negflags ? "?-" : "?",
	9061	*RExC_parse,
	9062	flagsp == &negflags ? "don't " : "",
	9063	*RExC_parse
	9064	);
	9065	}
	9066	}
	9067	break;
	9068
	9069	case CONTINUE_PAT_MOD: /* 'c' */
	9070	if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
	9071	if (! (wastedflags & WASTED_C) ) {
	9072	wastedflags |= WASTED_GC;
	9073	vWARN3(
	9074	RExC_parse + 1,
	9075	"Useless (%sc) - %suse /gc modifier",
	9076	flagsp == &negflags ? "?-" : "?",
	9077	flagsp == &negflags ? "don't " : ""
	9078	);
	9079	}
	9080	}
	9081	break;
	9082	case KEEPCOPY_PAT_MOD: /* 'p' */
	9083	if (flagsp == &negflags) {
	9084	if (SIZE_ONLY)
	9085	ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
	9086	} else {
	9087	*flagsp |= RXf_PMf_KEEPCOPY;
	9088	}
	9089	break;
	9090	case '-':
	9091	/* A flag is a default iff it is following a minus, so
	9092	* if there is a minus, it means will be trying to
	9093	* re-specify a default which is an error */
	9094	if (has_use_defaults || flagsp == &negflags) {
	9095	fail_modifiers:
	9096	RExC_parse++;
	9097	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	9098	/*NOTREACHED*/
	9099	}
	9100	flagsp = &negflags;
	9101	wastedflags = 0; /* reset so (?g-c) warns twice */
	9102	break;
	9103	case ':':
	9104	paren = ':';
	9105	/*FALLTHROUGH*/
	9106	case ')':
	9107	RExC_flags |= posflags;
	9108	RExC_flags &= ~negflags;
	9109	set_regex_charset(&RExC_flags, cs);
	9110	if (paren != ':') {
	9111	oregflags |= posflags;
	9112	oregflags &= ~negflags;
	9113	set_regex_charset(&oregflags, cs);
	9114	}
	9115	nextchar(pRExC_state);
	9116	if (paren != ':') {
	9117	*flagp = TRYAGAIN;
	9118	return NULL;
	9119	} else {
	9120	ret = NULL;
	9121	goto parse_rest;
	9122	}
	9123	/*NOTREACHED*/
	9124	default:
	9125	RExC_parse++;
	9126	vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
	9127	/*NOTREACHED*/
	9128	}
	9129	++RExC_parse;
	9130	}
	9131	}} /* one for the default block, one for the switch */
	9132	}
	9133	else { /* (...) */
	9134	capturing_parens:
	9135	parno = RExC_npar;
	9136	RExC_npar++;
	9137
	9138	ret = reganode(pRExC_state, OPEN, parno);
	9139	if (!SIZE_ONLY ){
	9140	if (!RExC_nestroot)
	9141	RExC_nestroot = parno;
	9142	if (RExC_seen & REG_SEEN_RECURSE
	9143	&& !RExC_open_parens[parno-1])
	9144	{
	9145	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9146	"Setting open paren #%"IVdf" to %d\n",
	9147	(IV)parno, REG_NODE_NUM(ret)));
	9148	RExC_open_parens[parno-1]= ret;
	9149	}
	9150	}
	9151	Set_Node_Length(ret, 1); /* MJD */
	9152	Set_Node_Offset(ret, RExC_parse); /* MJD */
	9153	is_open = 1;
	9154	}
	9155	}
	9156	else /* ! paren */
	9157	ret = NULL;
	9158
	9159	parse_rest:
	9160	/* Pick up the branches, linking them together. */
	9161	parse_start = RExC_parse; /* MJD */
	9162	br = regbranch(pRExC_state, &flags, 1,depth+1);
	9163
	9164	/* branch_len = (paren != 0); */
	9165
	9166	if (br == NULL)
	9167	return(NULL);
	9168	if (*RExC_parse == '|') {
	9169	if (!SIZE_ONLY && RExC_extralen) {
	9170	reginsert(pRExC_state, BRANCHJ, br, depth+1);
	9171	}
	9172	else { /* MJD */
	9173	reginsert(pRExC_state, BRANCH, br, depth+1);
	9174	Set_Node_Length(br, paren != 0);
	9175	Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
	9176	}
	9177	have_branch = 1;
	9178	if (SIZE_ONLY)
	9179	RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
	9180	}
	9181	else if (paren == ':') {
	9182	*flagp |= flags&SIMPLE;
	9183	}
	9184	if (is_open) { /* Starts with OPEN. */
	9185	REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
	9186	}
	9187	else if (paren != '?') /* Not Conditional */
	9188	ret = br;
	9189	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
	9190	lastbr = br;
	9191	while (*RExC_parse == '|') {
	9192	if (!SIZE_ONLY && RExC_extralen) {
	9193	ender = reganode(pRExC_state, LONGJMP,0);
	9194	REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
	9195	}
	9196	if (SIZE_ONLY)
	9197	RExC_extralen += 2; /* Account for LONGJMP. */
	9198	nextchar(pRExC_state);
	9199	if (freeze_paren) {
	9200	if (RExC_npar > after_freeze)
	9201	after_freeze = RExC_npar;
	9202	RExC_npar = freeze_paren;
	9203	}
	9204	br = regbranch(pRExC_state, &flags, 0, depth+1);
	9205
	9206	if (br == NULL)
	9207	return(NULL);
	9208	REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
	9209	lastbr = br;
	9210	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
	9211	}
	9212
	9213	if (have_branch || paren != ':') {
	9214	/* Make a closing node, and hook it on the end. */
	9215	switch (paren) {
	9216	case ':':
	9217	ender = reg_node(pRExC_state, TAIL);
	9218	break;
	9219	case 1:
	9220	ender = reganode(pRExC_state, CLOSE, parno);
	9221	if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
	9222	DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
	9223	"Setting close paren #%"IVdf" to %d\n",
	9224	(IV)parno, REG_NODE_NUM(ender)));
	9225	RExC_close_parens[parno-1]= ender;
	9226	if (RExC_nestroot == parno)
	9227	RExC_nestroot = 0;
	9228	}
	9229	Set_Node_Offset(ender,RExC_parse+1); /* MJD */
	9230	Set_Node_Length(ender,1); /* MJD */
	9231	break;
	9232	case '<':
	9233	case ',':
	9234	case '=':
	9235	case '!':
	9236	*flagp &= ~HASWIDTH;
	9237	/* FALL THROUGH */
	9238	case '>':
	9239	ender = reg_node(pRExC_state, SUCCEED);
	9240	break;
	9241	case 0:
	9242	ender = reg_node(pRExC_state, END);
	9243	if (!SIZE_ONLY) {
	9244	assert(!RExC_opend); /* there can only be one! */
	9245	RExC_opend = ender;
	9246	}
	9247	break;
	9248	}
	9249	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9250	SV * const mysv_val1=sv_newmortal();
	9251	SV * const mysv_val2=sv_newmortal();
	9252	DEBUG_PARSE_MSG("lsbr");
	9253	regprop(RExC_rx, mysv_val1, lastbr);
	9254	regprop(RExC_rx, mysv_val2, ender);
	9255	PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9256	SvPV_nolen_const(mysv_val1),
	9257	(IV)REG_NODE_NUM(lastbr),
	9258	SvPV_nolen_const(mysv_val2),
	9259	(IV)REG_NODE_NUM(ender),
	9260	(IV)(ender - lastbr)
	9261	);
	9262	});
	9263	REGTAIL(pRExC_state, lastbr, ender);
	9264
	9265	if (have_branch && !SIZE_ONLY) {
	9266	char is_nothing= 1;
	9267	if (depth==1)
	9268	RExC_seen |= REG_TOP_LEVEL_BRANCHES;
	9269
	9270	/* Hook the tails of the branches to the closing node. */
	9271	for (br = ret; br; br = regnext(br)) {
	9272	const U8 op = PL_regkind[OP(br)];
	9273	if (op == BRANCH) {
	9274	REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
	9275	if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
	9276	is_nothing= 0;
	9277	}
	9278	else if (op == BRANCHJ) {
	9279	REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
	9280	/* for now we always disable this optimisation * /
	9281	if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
	9282	*/
	9283	is_nothing= 0;
	9284	}
	9285	}
	9286	if (is_nothing) {
	9287	br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
	9288	DEBUG_PARSE_r(if (!SIZE_ONLY) {
	9289	SV * const mysv_val1=sv_newmortal();
	9290	SV * const mysv_val2=sv_newmortal();
	9291	DEBUG_PARSE_MSG("NADA");
	9292	regprop(RExC_rx, mysv_val1, ret);
	9293	regprop(RExC_rx, mysv_val2, ender);
	9294	PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
	9295	SvPV_nolen_const(mysv_val1),
	9296	(IV)REG_NODE_NUM(ret),
	9297	SvPV_nolen_const(mysv_val2),
	9298	(IV)REG_NODE_NUM(ender),
	9299	(IV)(ender - ret)
	9300	);
	9301	});
	9302	OP(br)= NOTHING;
	9303	if (OP(ender) == TAIL) {
	9304	NEXT_OFF(br)= 0;
	9305	RExC_emit= br + 1;
	9306	} else {
	9307	regnode *opt;
	9308	for ( opt= br + 1; opt < ender ; opt++ )
	9309	OP(opt)= OPTIMIZED;
	9310	NEXT_OFF(br)= ender - br;
	9311	}
	9312	}
	9313	}
	9314	}
	9315
	9316	{
	9317	const char *p;
	9318	static const char parens[] = "=!<,>";
	9319
	9320	if (paren && (p = strchr(parens, paren))) {
	9321	U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
	9322	int flag = (p - parens) > 1;
	9323
	9324	if (paren == '>')
	9325	node = SUSPEND, flag = 0;
	9326	reginsert(pRExC_state, node,ret, depth+1);
	9327	Set_Node_Cur_Length(ret);
	9328	Set_Node_Offset(ret, parse_start + 1);
	9329	ret->flags = flag;
	9330	REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
	9331	}
	9332	}
	9333
	9334	/* Check for proper termination. */
	9335	if (paren) {
	9336	RExC_flags = oregflags;
	9337	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
	9338	RExC_parse = oregcomp_parse;
	9339	vFAIL("Unmatched (");
	9340	}
	9341	}
	9342	else if (!paren && RExC_parse < RExC_end) {
	9343	if (*RExC_parse == ')') {
	9344	RExC_parse++;
	9345	vFAIL("Unmatched )");
	9346	}
	9347	else
	9348	FAIL("Junk on end of regexp"); /* "Can't happen". */
	9349	assert(0); /* NOTREACHED */
	9350	}
	9351
	9352	if (RExC_in_lookbehind) {
	9353	RExC_in_lookbehind--;
	9354	}
	9355	if (after_freeze > RExC_npar)
	9356	RExC_npar = after_freeze;
	9357	return(ret);
	9358	}
	9359
	9360	/*
	9361	- regbranch - one alternative of an \| operator
	9362	*
	9363	* Implements the concatenation operator.
	9364	*/
	9365	STATIC regnode *
	9366	S_regbranch(pTHX_ RExC_state_t pRExC_state, I32 flagp, I32 first, U32 depth)
	9367	{
	9368	dVAR;
	9369	regnode *ret;
	9370	regnode *chain = NULL;
	9371	regnode *latest;
	9372	I32 flags = 0, c = 0;
	9373	GET_RE_DEBUG_FLAGS_DECL;
	9374
	9375	PERL_ARGS_ASSERT_REGBRANCH;
	9376
	9377	DEBUG_PARSE("brnc");
	9378
	9379	if (first)
	9380	ret = NULL;
	9381	else {
	9382	if (!SIZE_ONLY && RExC_extralen)
	9383	ret = reganode(pRExC_state, BRANCHJ,0);
	9384	else {
	9385	ret = reg_node(pRExC_state, BRANCH);
	9386	Set_Node_Length(ret, 1);
	9387	}
	9388	}
	9389
	9390	if (!first && SIZE_ONLY)
	9391	RExC_extralen += 1; /* BRANCHJ */
	9392
	9393	flagp = WORST; / Tentatively. */
	9394
	9395	RExC_parse--;
	9396	nextchar(pRExC_state);
	9397	while (RExC_parse < RExC_end && RExC_parse != '\|' && RExC_parse != ')') {
	9398	flags &= ~TRYAGAIN;
	9399	latest = regpiece(pRExC_state, &flags,depth+1);
	9400	if (latest == NULL) {
	9401	if (flags & TRYAGAIN)
	9402	continue;
	9403	return(NULL);
	9404	}
	9405	else if (ret == NULL)
	9406	ret = latest;
	9407	*flagp \|= flags&(HASWIDTH\|POSTPONED);
	9408	if (chain == NULL) /* First piece. */
	9409	*flagp \|= flags&SPSTART;
	9410	else {
	9411	RExC_naughty++;
	9412	REGTAIL(pRExC_state, chain, latest);
	9413	}
	9414	chain = latest;
	9415	c++;
	9416	}
	9417	if (chain == NULL) { /* Loop ran zero times. */
	9418	chain = reg_node(pRExC_state, NOTHING);
	9419	if (ret == NULL)
	9420	ret = chain;
	9421	}
	9422	if (c == 1) {
	9423	*flagp \|= flags&SIMPLE;
	9424	}
	9425
	9426	return ret;
	9427	}
	9428
	9429	/*
	9430	- regpiece - something followed by possible [*+?]
	9431	*
	9432	* Note that the branching code sequences used for ? and the general cases
	9433	* of * and + are somewhat optimized: they use the same NOTHING node as
	9434	* both the endmarker for their branch list and the body of the last branch.
	9435	* It might seem that this node could be dispensed with entirely, but the
	9436	* endmarker role is not redundant.
	9437	*/
	9438	STATIC regnode *
	9439	S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	9440	{
	9441	dVAR;
	9442	regnode *ret;
	9443	char op;
	9444	char *next;
	9445	I32 flags;
	9446	const char * const origparse = RExC_parse;
	9447	I32 min;
	9448	I32 max = REG_INFTY;
	9449	#ifdef RE_TRACK_PATTERN_OFFSETS
	9450	char *parse_start;
	9451	#endif
	9452	const char *maxpos = NULL;
	9453
	9454	/* Save the original in case we change the emitted regop to a FAIL. */
	9455	regnode * const orig_emit = RExC_emit;
	9456
	9457	GET_RE_DEBUG_FLAGS_DECL;
	9458
	9459	PERL_ARGS_ASSERT_REGPIECE;
	9460
	9461	DEBUG_PARSE("piec");
	9462
	9463	ret = regatom(pRExC_state, &flags,depth+1);
	9464	if (ret == NULL) {
	9465	if (flags & TRYAGAIN)
	9466	*flagp \|= TRYAGAIN;
	9467	return(NULL);
	9468	}
	9469
	9470	op = *RExC_parse;
	9471
	9472	if (op == '{' && regcurly(RExC_parse)) {
	9473	maxpos = NULL;
	9474	#ifdef RE_TRACK_PATTERN_OFFSETS
	9475	parse_start = RExC_parse; /* MJD */
	9476	#endif
	9477	next = RExC_parse + 1;
	9478	while (isDIGIT(next) \|\| next == ',') {
	9479	if (*next == ',') {
	9480	if (maxpos)
	9481	break;
	9482	else
	9483	maxpos = next;
	9484	}
	9485	next++;
	9486	}
	9487	if (next == '}') { / got one */
	9488	if (!maxpos)
	9489	maxpos = next;
	9490	RExC_parse++;
	9491	min = atoi(RExC_parse);
	9492	if (*maxpos == ',')
	9493	maxpos++;
	9494	else
	9495	maxpos = RExC_parse;
	9496	max = atoi(maxpos);
	9497	if (!max && *maxpos != '0')
	9498	max = REG_INFTY; /* meaning "infinity" */
	9499	else if (max >= REG_INFTY)
	9500	vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
	9501	RExC_parse = next;
	9502	nextchar(pRExC_state);
	9503	if (max < min) { /* If can't match, warn and optimize to fail
	9504	unconditionally */
	9505	if (SIZE_ONLY) {
	9506	ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
	9507
	9508	/* We can't back off the size because we have to reserve
	9509	* enough space for all the things we are about to throw
	9510	* away, but we can shrink it by the ammount we are about
	9511	* to re-use here */
	9512	RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
	9513	}
	9514	else {
	9515	RExC_emit = orig_emit;
	9516	}
	9517	ret = reg_node(pRExC_state, OPFAIL);
	9518	return ret;
	9519	}
	9520
	9521	do_curly:
	9522	if ((flags&SIMPLE)) {
	9523	RExC_naughty += 2 + RExC_naughty / 2;
	9524	reginsert(pRExC_state, CURLY, ret, depth+1);
	9525	Set_Node_Offset(ret, parse_start+1); /* MJD */
	9526	Set_Node_Cur_Length(ret);
	9527	}
	9528	else {
	9529	regnode * const w = reg_node(pRExC_state, WHILEM);
	9530
	9531	w->flags = 0;
	9532	REGTAIL(pRExC_state, ret, w);
	9533	if (!SIZE_ONLY && RExC_extralen) {
	9534	reginsert(pRExC_state, LONGJMP,ret, depth+1);
	9535	reginsert(pRExC_state, NOTHING,ret, depth+1);
	9536	NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
	9537	}
	9538	reginsert(pRExC_state, CURLYX,ret, depth+1);
	9539	/* MJD hk */
	9540	Set_Node_Offset(ret, parse_start+1);
	9541	Set_Node_Length(ret,
	9542	op == '{' ? (RExC_parse - parse_start) : 1);
	9543
	9544	if (!SIZE_ONLY && RExC_extralen)
	9545	NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
	9546	REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
	9547	if (SIZE_ONLY)
	9548	RExC_whilem_seen++, RExC_extralen += 3;
	9549	RExC_naughty += 4 + RExC_naughty; /* compound interest */
	9550	}
	9551	ret->flags = 0;
	9552
	9553	if (min > 0)
	9554	*flagp = WORST;
	9555	if (max > 0)
	9556	*flagp \|= HASWIDTH;
	9557	if (!SIZE_ONLY) {
	9558	ARG1_SET(ret, (U16)min);
	9559	ARG2_SET(ret, (U16)max);
	9560	}
	9561
	9562	goto nest_check;
	9563	}
	9564	}
	9565
	9566	if (!ISMULT1(op)) {
	9567	*flagp = flags;
	9568	return(ret);
	9569	}
	9570
	9571	#if 0 /* Now runtime fix should be reliable. */
	9572
	9573	/* if this is reinstated, don't forget to put this back into perldiag:
	9574
	9575	=item Regexp *+ operand could be empty at {#} in regex m/%s/
	9576
	9577	(F) The part of the regexp subject to either the * or + quantifier
	9578	could match an empty string. The {#} shows in the regular
	9579	expression about where the problem was discovered.
	9580
	9581	*/
	9582
	9583	if (!(flags&HASWIDTH) && op != '?')
	9584	vFAIL("Regexp *+ operand could be empty");
	9585	#endif
	9586
	9587	#ifdef RE_TRACK_PATTERN_OFFSETS
	9588	parse_start = RExC_parse;
	9589	#endif
	9590	nextchar(pRExC_state);
	9591
	9592	*flagp = (op != '+') ? (WORST\|SPSTART\|HASWIDTH) : (WORST\|HASWIDTH);
	9593
	9594	if (op == '*' && (flags&SIMPLE)) {
	9595	reginsert(pRExC_state, STAR, ret, depth+1);
	9596	ret->flags = 0;
	9597	RExC_naughty += 4;
	9598	}
	9599	else if (op == '*') {
	9600	min = 0;
	9601	goto do_curly;
	9602	}
	9603	else if (op == '+' && (flags&SIMPLE)) {
	9604	reginsert(pRExC_state, PLUS, ret, depth+1);
	9605	ret->flags = 0;
	9606	RExC_naughty += 3;
	9607	}
	9608	else if (op == '+') {
	9609	min = 1;
	9610	goto do_curly;
	9611	}
	9612	else if (op == '?') {
	9613	min = 0; max = 1;
	9614	goto do_curly;
	9615	}
	9616	nest_check:
	9617	if (!SIZE_ONLY && !(flags&(HASWIDTH\|POSTPONED)) && max > REG_INFTY/3) {
	9618	ckWARN3reg(RExC_parse,
	9619	"%.*s matches null string many times",
	9620	(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
	9621	origparse);
	9622	}
	9623
	9624	if (RExC_parse < RExC_end && *RExC_parse == '?') {
	9625	nextchar(pRExC_state);
	9626	reginsert(pRExC_state, MINMOD, ret, depth+1);
	9627	REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
	9628	}
	9629	#ifndef REG_ALLOW_MINMOD_SUSPEND
	9630	else
	9631	#endif
	9632	if (RExC_parse < RExC_end && *RExC_parse == '+') {
	9633	regnode *ender;
	9634	nextchar(pRExC_state);
	9635	ender = reg_node(pRExC_state, SUCCEED);
	9636	REGTAIL(pRExC_state, ret, ender);
	9637	reginsert(pRExC_state, SUSPEND, ret, depth+1);
	9638	ret->flags = 0;
	9639	ender = reg_node(pRExC_state, TAIL);
	9640	REGTAIL(pRExC_state, ret, ender);
	9641	/ret= ender;/
	9642	}
	9643
	9644	if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
	9645	RExC_parse++;
	9646	vFAIL("Nested quantifiers");
	9647	}
	9648
	9649	return(ret);
	9650	}
	9651
	9652	STATIC bool
	9653	S_grok_bslash_N(pTHX_ RExC_state_t pRExC_state, regnode* node_p, UV valuep, I32 flagp, U32 depth, bool in_char_class)
	9654	{
	9655
	9656	/* This is expected to be called by a parser routine that has recognized '\N'
	9657	and needs to handle the rest. RExC_parse is expected to point at the first
	9658	char following the N at the time of the call. On successful return,
	9659	RExC_parse has been updated to point to just after the sequence identified
	9660	by this routine, and <*flagp> has been updated.
	9661
	9662	The \N may be inside (indicated by the boolean <in_char_class>) or outside a
	9663	character class.
	9664
	9665	\N may begin either a named sequence, or if outside a character class, mean
	9666	to match a non-newline. For non single-quoted regexes, the tokenizer has
	9667	attempted to decide which, and in the case of a named sequence, converted it
	9668	into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
	9669	where c1... are the characters in the sequence. For single-quoted regexes,
	9670	the tokenizer passes the \N sequence through unchanged; this code will not
	9671	attempt to determine this nor expand those, instead raising a syntax error.
	9672	The net effect is that if the beginning of the passed-in pattern isn't '{U+'
	9673	or there is no '}', it signals that this \N occurrence means to match a
	9674	non-newline.
	9675
	9676	Only the \N{U+...} form should occur in a character class, for the same
	9677	reason that '.' inside a character class means to just match a period: it
	9678	just doesn't make sense.
	9679
	9680	The function raises an error (via vFAIL), and doesn't return for various
	9681	syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
	9682	success; it returns FALSE otherwise.
	9683
	9684	If <valuep> is non-null, it means the caller can accept an input sequence
	9685	consisting of a just a single code point; <*valuep> is set to that value
	9686	if the input is such.
	9687
	9688	If <node_p> is non-null it signifies that the caller can accept any other
	9689	legal sequence (i.e., one that isn't just a single code point). <*node_p>
	9690	is set as follows:
	9691	1) \N means not-a-NL: points to a newly created REG_ANY node;
	9692	2) \N{}: points to a new NOTHING node;
	9693	3) otherwise: points to a new EXACT node containing the resolved
	9694	string.
	9695	Note that FALSE is returned for single code point sequences if <valuep> is
	9696	null.
	9697	*/
	9698
	9699	char * endbrace; /* '}' following the name */
	9700	char* p;
	9701	char endchar; / Points to '.' or '}' ending cur char in the input
	9702	stream */
	9703	bool has_multiple_chars; /* true if the input stream contains a sequence of
	9704	more than one character */
	9705
	9706	GET_RE_DEBUG_FLAGS_DECL;
	9707
	9708	PERL_ARGS_ASSERT_GROK_BSLASH_N;
	9709
	9710	GET_RE_DEBUG_FLAGS;
	9711
	9712	assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
	9713
	9714	/* The [^\n] meaning of \N ignores spaces and comments under the /x
	9715	* modifier. The other meaning does not */
	9716	p = (RExC_flags & RXf_PMf_EXTENDED)
	9717	? regwhite( pRExC_state, RExC_parse )
	9718	: RExC_parse;
	9719
	9720	/* Disambiguate between \N meaning a named character versus \N meaning
	9721	* [^\n]. The former is assumed when it can't be the latter. */
	9722	if (*p != '{' \|\| regcurly(p)) {
	9723	RExC_parse = p;
	9724	if (! node_p) {
	9725	/* no bare \N in a charclass */
	9726	if (in_char_class) {
	9727	vFAIL("\\N in a character class must be a named character: \\N{...}");
	9728	}
	9729	return FALSE;
	9730	}
	9731	nextchar(pRExC_state);
	9732	*node_p = reg_node(pRExC_state, REG_ANY);
	9733	*flagp \|= HASWIDTH\|SIMPLE;
	9734	RExC_naughty++;
	9735	RExC_parse--;
	9736	Set_Node_Length(node_p, 1); / MJD */
	9737	return TRUE;
	9738	}
	9739
	9740	/* Here, we have decided it should be a named character or sequence */
	9741
	9742	/* The test above made sure that the next real character is a '{', but
	9743	* under the /x modifier, it could be separated by space (or a comment and
	9744	* \n) and this is not allowed (for consistency with \x{...} and the
	9745	* tokenizer handling of \N{NAME}). */
	9746	if (*RExC_parse != '{') {
	9747	vFAIL("Missing braces on \\N{}");
	9748	}
	9749
	9750	RExC_parse++; /* Skip past the '{' */
	9751
	9752	if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
	9753	\|\| ! (endbrace == RExC_parse /* nothing between the {} */
	9754	\|\| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
	9755	&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
	9756	{
	9757	if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
	9758	vFAIL("\\N{NAME} must be resolved by the lexer");
	9759	}
	9760
	9761	if (endbrace == RExC_parse) { /* empty: \N{} */
	9762	bool ret = TRUE;
	9763	if (node_p) {
	9764	*node_p = reg_node(pRExC_state,NOTHING);
	9765	}
	9766	else if (in_char_class) {
	9767	if (SIZE_ONLY && in_char_class) {
	9768	ckWARNreg(RExC_parse,
	9769	"Ignoring zero length \\N{} in character class"
	9770	);
	9771	}
	9772	ret = FALSE;
	9773	}
	9774	else {
	9775	return FALSE;
	9776	}
	9777	nextchar(pRExC_state);
	9778	return ret;
	9779	}
	9780
	9781	RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
	9782	RExC_parse += 2; /* Skip past the 'U+' */
	9783
	9784	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	9785
	9786	/* Code points are separated by dots. If none, there is only one code
	9787	* point, and is terminated by the brace */
	9788	has_multiple_chars = (endchar < endbrace);
	9789
	9790	if (valuep && (! has_multiple_chars \|\| in_char_class)) {
	9791	/* We only pay attention to the first char of
	9792	multichar strings being returned in char classes. I kinda wonder
	9793	if this makes sense as it does change the behaviour
	9794	from earlier versions, OTOH that behaviour was broken
	9795	as well. XXX Solution is to recharacterize as
	9796	[rest-of-class]\|multi1\|multi2... */
	9797
	9798	STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
	9799	I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
	9800	\| PERL_SCAN_DISALLOW_PREFIX
	9801	\| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
	9802
	9803	*valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
	9804
	9805	/* The tokenizer should have guaranteed validity, but it's possible to
	9806	* bypass it by using single quoting, so check */
	9807	if (length_of_hex == 0
	9808	\|\| length_of_hex != (STRLEN)(endchar - RExC_parse) )
	9809	{
	9810	RExC_parse += length_of_hex; /* Includes all the valid */
	9811	RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
	9812	? UTF8SKIP(RExC_parse)
	9813	: 1;
	9814	/* Guard against malformed utf8 */
	9815	if (RExC_parse >= endchar) {
	9816	RExC_parse = endchar;
	9817	}
	9818	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	9819	}
	9820
	9821	if (in_char_class && has_multiple_chars) {
	9822	ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
	9823	}
	9824	RExC_parse = endbrace + 1;
	9825	}
	9826	else if (! node_p \|\| ! has_multiple_chars) {
	9827
	9828	/* Here, the input is legal, but not according to the caller's
	9829	* options. We fail without advancing the parse, so that the
	9830	* caller can try again */
	9831	RExC_parse = p;
	9832	return FALSE;
	9833	}
	9834	else {
	9835
	9836	/* What is done here is to convert this to a sub-pattern of the form
	9837	* (?:\x{char1}\x{char2}...)
	9838	* and then call reg recursively. That way, it retains its atomicness,
	9839	* while not having to worry about special handling that some code
	9840	* points may have. toke.c has converted the original Unicode values
	9841	* to native, so that we can just pass on the hex values unchanged. We
	9842	* do have to set a flag to keep recoding from happening in the
	9843	* recursion */
	9844
	9845	SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8\|SVs_TEMP);
	9846	STRLEN len;
	9847	char *orig_end = RExC_end;
	9848	I32 flags;
	9849
	9850	while (RExC_parse < endbrace) {
	9851
	9852	/* Convert to notation the rest of the code understands */
	9853	sv_catpv(substitute_parse, "\\x{");
	9854	sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
	9855	sv_catpv(substitute_parse, "}");
	9856
	9857	/* Point to the beginning of the next character in the sequence. */
	9858	RExC_parse = endchar + 1;
	9859	endchar = RExC_parse + strcspn(RExC_parse, ".}");
	9860	}
	9861	sv_catpv(substitute_parse, ")");
	9862
	9863	RExC_parse = SvPV(substitute_parse, len);
	9864
	9865	/* Don't allow empty number */
	9866	if (len < 8) {
	9867	vFAIL("Invalid hexadecimal number in \\N{U+...}");
	9868	}
	9869	RExC_end = RExC_parse + len;
	9870
	9871	/* The values are Unicode, and therefore not subject to recoding */
	9872	RExC_override_recoding = 1;
	9873
	9874	*node_p = reg(pRExC_state, 1, &flags, depth+1);
	9875	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	9876
	9877	RExC_parse = endbrace;
	9878	RExC_end = orig_end;
	9879	RExC_override_recoding = 0;
	9880
	9881	nextchar(pRExC_state);
	9882	}
	9883
	9884	return TRUE;
	9885	}
	9886
	9887
	9888	/*
	9889	* reg_recode
	9890	*
	9891	* It returns the code point in utf8 for the value in *encp.
	9892	* value: a code value in the source encoding
	9893	* encp: a pointer to an Encode object
	9894	*
	9895	* If the result from Encode is not a single character,
	9896	* it returns U+FFFD (Replacement character) and sets *encp to NULL.
	9897	*/
	9898	STATIC UV
	9899	S_reg_recode(pTHX_ const char value, SV **encp)
	9900	{
	9901	STRLEN numlen = 1;
	9902	SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
	9903	const char * const s = encp ? sv_recode_to_utf8(sv, encp) : SvPVX(sv);
	9904	const STRLEN newlen = SvCUR(sv);
	9905	UV uv = UNICODE_REPLACEMENT;
	9906
	9907	PERL_ARGS_ASSERT_REG_RECODE;
	9908
	9909	if (newlen)
	9910	uv = SvUTF8(sv)
	9911	? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	9912	: (U8)s;
	9913
	9914	if (!newlen \|\| numlen != newlen) {
	9915	uv = UNICODE_REPLACEMENT;
	9916	*encp = NULL;
	9917	}
	9918	return uv;
	9919	}
	9920
	9921	PERL_STATIC_INLINE U8
	9922	S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
	9923	{
	9924	U8 op;
	9925
	9926	PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
	9927
	9928	if (! FOLD) {
	9929	return EXACT;
	9930	}
	9931
	9932	op = get_regex_charset(RExC_flags);
	9933	if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
	9934	op--; /* /a is same as /u, and map /aa's offset to what /a's would have
	9935	been, so there is no hole */
	9936	}
	9937
	9938	return op + EXACTF;
	9939	}
	9940
	9941	PERL_STATIC_INLINE void
	9942	S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t pRExC_state, regnode node, I32* flagp, STRLEN len, UV code_point)
	9943	{
	9944	/* This knows the details about sizing an EXACTish node, setting flags for
	9945	* it (by setting <*flagp>, and potentially populating it with a single
	9946	* character.
	9947	*
	9948	* If <len> is non-zero, this function assumes that the node has already
	9949	* been populated, and just does the sizing. In this case <code_point>
	9950	* should be the final code point that has already been placed into the
	9951	* node. This value will be ignored except that under some circumstances
	9952	* <*flagp> is set based on it.
	9953	*
	9954	* If <len is zero, the function assumes that the node is to contain only
	9955	* the single character given by <code_point> and calculates what <len>
	9956	* should be. In pass 1, it sizes the node appropriately. In pass 2, it
	9957	* additionally will populate the node's STRING with <code_point>, if <len>
	9958	* is 0. In both cases <*flagp> is appropriately set
	9959	*
	9960	* It knows that under FOLD, UTF characters and the Latin Sharp S must be
	9961	* folded (the latter only when the rules indicate it can match 'ss') */
	9962
	9963	bool len_passed_in = cBOOL(len != 0);
	9964	U8 character[UTF8_MAXBYTES_CASE+1];
	9965
	9966	PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
	9967
	9968	if (! len_passed_in) {
	9969	if (UTF) {
	9970	if (FOLD) {
	9971	to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
	9972	}
	9973	else {
	9974	uvchr_to_utf8( character, code_point);
	9975	len = UTF8SKIP(character);
	9976	}
	9977	}
	9978	else if (! FOLD
	9979	\|\| code_point != LATIN_SMALL_LETTER_SHARP_S
	9980	\|\| ASCII_FOLD_RESTRICTED
	9981	\|\| ! AT_LEAST_UNI_SEMANTICS)
	9982	{
	9983	*character = (U8) code_point;
	9984	len = 1;
	9985	}
	9986	else {
	9987	*character = 's';
	9988	*(character + 1) = 's';
	9989	len = 2;
	9990	}
	9991	}
	9992
	9993	if (SIZE_ONLY) {
	9994	RExC_size += STR_SZ(len);
	9995	}
	9996	else {
	9997	RExC_emit += STR_SZ(len);
	9998	STR_LEN(node) = len;
	9999	if (! len_passed_in) {
	10000	Copy((char *) character, STRING(node), len, char);
	10001	}
	10002	}
	10003
	10004	*flagp \|= HASWIDTH;
	10005	if (len == 1 && UNI_IS_INVARIANT(code_point))
	10006	*flagp \|= SIMPLE;
	10007	}
	10008
	10009	/*
	10010	- regatom - the lowest level
	10011
	10012	Try to identify anything special at the start of the pattern. If there
	10013	is, then handle it as required. This may involve generating a single regop,
	10014	such as for an assertion; or it may involve recursing, such as to
	10015	handle a () structure.
	10016
	10017	If the string doesn't start with something special then we gobble up
	10018	as much literal text as we can.
	10019
	10020	Once we have been able to handle whatever type of thing started the
	10021	sequence, we return.
	10022
	10023	Note: we have to be careful with escapes, as they can be both literal
	10024	and special, and in the case of \10 and friends, context determines which.
	10025
	10026	A summary of the code structure is:
	10027
	10028	switch (first_byte) {
	10029	cases for each special:
	10030	handle this special;
	10031	break;
	10032	case '\\':
	10033	switch (2nd byte) {
	10034	cases for each unambiguous special:
	10035	handle this special;
	10036	break;
	10037	cases for each ambiguous special/literal:
	10038	disambiguate;
	10039	if (special) handle here
	10040	else goto defchar;
	10041	default: // unambiguously literal:
	10042	goto defchar;
	10043	}
	10044	default: // is a literal char
	10045	// FALL THROUGH
	10046	defchar:
	10047	create EXACTish node for literal;
	10048	while (more input and node isn't full) {
	10049	switch (input_byte) {
	10050	cases for each special;
	10051	make sure parse pointer is set so that the next call to
	10052	regatom will see this special first
	10053	goto loopdone; // EXACTish node terminated by prev. char
	10054	default:
	10055	append char to EXACTISH node;
	10056	}
	10057	get next input byte;
	10058	}
	10059	loopdone:
	10060	}
	10061	return the generated node;
	10062
	10063	Specifically there are two separate switches for handling
	10064	escape sequences, with the one for handling literal escapes requiring
	10065	a dummy entry for all of the special escapes that are actually handled
	10066	by the other.
	10067	*/
	10068
	10069	STATIC regnode *
	10070	S_regatom(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	10071	{
	10072	dVAR;
	10073	regnode *ret = NULL;
	10074	I32 flags;
	10075	char *parse_start = RExC_parse;
	10076	U8 op;
	10077	GET_RE_DEBUG_FLAGS_DECL;
	10078	DEBUG_PARSE("atom");
	10079	flagp = WORST; / Tentatively. */
	10080
	10081	PERL_ARGS_ASSERT_REGATOM;
	10082
	10083	tryagain:
	10084	switch ((U8)*RExC_parse) {
	10085	case '^':
	10086	RExC_seen_zerolen++;
	10087	nextchar(pRExC_state);
	10088	if (RExC_flags & RXf_PMf_MULTILINE)
	10089	ret = reg_node(pRExC_state, MBOL);
	10090	else if (RExC_flags & RXf_PMf_SINGLELINE)
	10091	ret = reg_node(pRExC_state, SBOL);
	10092	else
	10093	ret = reg_node(pRExC_state, BOL);
	10094	Set_Node_Length(ret, 1); /* MJD */
	10095	break;
	10096	case '$':
	10097	nextchar(pRExC_state);
	10098	if (*RExC_parse)
	10099	RExC_seen_zerolen++;
	10100	if (RExC_flags & RXf_PMf_MULTILINE)
	10101	ret = reg_node(pRExC_state, MEOL);
	10102	else if (RExC_flags & RXf_PMf_SINGLELINE)
	10103	ret = reg_node(pRExC_state, SEOL);
	10104	else
	10105	ret = reg_node(pRExC_state, EOL);
	10106	Set_Node_Length(ret, 1); /* MJD */
	10107	break;
	10108	case '.':
	10109	nextchar(pRExC_state);
	10110	if (RExC_flags & RXf_PMf_SINGLELINE)
	10111	ret = reg_node(pRExC_state, SANY);
	10112	else
	10113	ret = reg_node(pRExC_state, REG_ANY);
	10114	*flagp \|= HASWIDTH\|SIMPLE;
	10115	RExC_naughty++;
	10116	Set_Node_Length(ret, 1); /* MJD */
	10117	break;
	10118	case '[':
	10119	{
	10120	char * const oregcomp_parse = ++RExC_parse;
	10121	ret = regclass(pRExC_state, flagp,depth+1);
	10122	if (*RExC_parse != ']') {
	10123	RExC_parse = oregcomp_parse;
	10124	vFAIL("Unmatched [");
	10125	}
	10126	nextchar(pRExC_state);
	10127	Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
	10128	break;
	10129	}
	10130	case '(':
	10131	nextchar(pRExC_state);
	10132	ret = reg(pRExC_state, 1, &flags,depth+1);
	10133	if (ret == NULL) {
	10134	if (flags & TRYAGAIN) {
	10135	if (RExC_parse == RExC_end) {
	10136	/* Make parent create an empty node if needed. */
	10137	*flagp \|= TRYAGAIN;
	10138	return(NULL);
	10139	}
	10140	goto tryagain;
	10141	}
	10142	return(NULL);
	10143	}
	10144	*flagp \|= flags&(HASWIDTH\|SPSTART\|SIMPLE\|POSTPONED);
	10145	break;
	10146	case '\|':
	10147	case ')':
	10148	if (flags & TRYAGAIN) {
	10149	*flagp \|= TRYAGAIN;
	10150	return NULL;
	10151	}
	10152	vFAIL("Internal urp");
	10153	/* Supposed to be caught earlier. */
	10154	break;
	10155	case '?':
	10156	case '+':
	10157	case '*':
	10158	RExC_parse++;
	10159	vFAIL("Quantifier follows nothing");
	10160	break;
	10161	case '\\':
	10162	/* Special Escapes
	10163
	10164	This switch handles escape sequences that resolve to some kind
	10165	of special regop and not to literal text. Escape sequnces that
	10166	resolve to literal text are handled below in the switch marked
	10167	"Literal Escapes".
	10168
	10169	Every entry in this switch must have a corresponding entry
	10170	in the literal escape switch. However, the opposite is not
	10171	required, as the default for this switch is to jump to the
	10172	literal text handling code.
	10173	*/
	10174	switch ((U8)*++RExC_parse) {
	10175	/* Special Escapes */
	10176	case 'A':
	10177	RExC_seen_zerolen++;
	10178	ret = reg_node(pRExC_state, SBOL);
	10179	*flagp \|= SIMPLE;
	10180	goto finish_meta_pat;
	10181	case 'G':
	10182	ret = reg_node(pRExC_state, GPOS);
	10183	RExC_seen \|= REG_SEEN_GPOS;
	10184	*flagp \|= SIMPLE;
	10185	goto finish_meta_pat;
	10186	case 'K':
	10187	RExC_seen_zerolen++;
	10188	ret = reg_node(pRExC_state, KEEPS);
	10189	*flagp \|= SIMPLE;
	10190	/* XXX:dmq : disabling in-place substitution seems to
	10191	* be necessary here to avoid cases of memory corruption, as
	10192	* with: C<$_="x" x 80; s/x\K/y/> -- rgs
	10193	*/
	10194	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10195	goto finish_meta_pat;
	10196	case 'Z':
	10197	ret = reg_node(pRExC_state, SEOL);
	10198	*flagp \|= SIMPLE;
	10199	RExC_seen_zerolen++; /* Do not optimize RE away */
	10200	goto finish_meta_pat;
	10201	case 'z':
	10202	ret = reg_node(pRExC_state, EOS);
	10203	*flagp \|= SIMPLE;
	10204	RExC_seen_zerolen++; /* Do not optimize RE away */
	10205	goto finish_meta_pat;
	10206	case 'C':
	10207	ret = reg_node(pRExC_state, CANY);
	10208	RExC_seen \|= REG_SEEN_CANY;
	10209	*flagp \|= HASWIDTH\|SIMPLE;
	10210	goto finish_meta_pat;
	10211	case 'X':
	10212	ret = reg_node(pRExC_state, CLUMP);
	10213	*flagp \|= HASWIDTH;
	10214	goto finish_meta_pat;
	10215	case 'w':
	10216	op = ALNUM + get_regex_charset(RExC_flags);
	10217	if (op > ALNUMA) { /* /aa is same as /a */
	10218	op = ALNUMA;
	10219	}
	10220	ret = reg_node(pRExC_state, op);
	10221	*flagp \|= HASWIDTH\|SIMPLE;
	10222	goto finish_meta_pat;
	10223	case 'W':
	10224	op = NALNUM + get_regex_charset(RExC_flags);
	10225	if (op > NALNUMA) { /* /aa is same as /a */
	10226	op = NALNUMA;
	10227	}
	10228	ret = reg_node(pRExC_state, op);
	10229	*flagp \|= HASWIDTH\|SIMPLE;
	10230	goto finish_meta_pat;
	10231	case 'b':
	10232	RExC_seen_zerolen++;
	10233	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10234	op = BOUND + get_regex_charset(RExC_flags);
	10235	if (op > BOUNDA) { /* /aa is same as /a */
	10236	op = BOUNDA;
	10237	}
	10238	ret = reg_node(pRExC_state, op);
	10239	FLAGS(ret) = get_regex_charset(RExC_flags);
	10240	*flagp \|= SIMPLE;
	10241	goto finish_meta_pat;
	10242	case 'B':
	10243	RExC_seen_zerolen++;
	10244	RExC_seen \|= REG_SEEN_LOOKBEHIND;
	10245	op = NBOUND + get_regex_charset(RExC_flags);
	10246	if (op > NBOUNDA) { /* /aa is same as /a */
	10247	op = NBOUNDA;
	10248	}
	10249	ret = reg_node(pRExC_state, op);
	10250	FLAGS(ret) = get_regex_charset(RExC_flags);
	10251	*flagp \|= SIMPLE;
	10252	goto finish_meta_pat;
	10253	case 's':
	10254	op = SPACE + get_regex_charset(RExC_flags);
	10255	if (op > SPACEA) { /* /aa is same as /a */
	10256	op = SPACEA;
	10257	}
	10258	ret = reg_node(pRExC_state, op);
	10259	*flagp \|= HASWIDTH\|SIMPLE;
	10260	goto finish_meta_pat;
	10261	case 'S':
	10262	op = NSPACE + get_regex_charset(RExC_flags);
	10263	if (op > NSPACEA) { /* /aa is same as /a */
	10264	op = NSPACEA;
	10265	}
	10266	ret = reg_node(pRExC_state, op);
	10267	*flagp \|= HASWIDTH\|SIMPLE;
	10268	goto finish_meta_pat;
	10269	case 'D':
	10270	op = NDIGIT;
	10271	goto join_D_and_d;
	10272	case 'd':
	10273	op = DIGIT;
	10274	join_D_and_d:
	10275	{
	10276	U8 offset = get_regex_charset(RExC_flags);
	10277	if (offset == REGEX_UNICODE_CHARSET) {
	10278	offset = REGEX_DEPENDS_CHARSET;
	10279	}
	10280	else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
	10281	offset = REGEX_ASCII_RESTRICTED_CHARSET;
	10282	}
	10283	op += offset;
	10284	}
	10285	ret = reg_node(pRExC_state, op);
	10286	*flagp \|= HASWIDTH\|SIMPLE;
	10287	goto finish_meta_pat;
	10288	case 'R':
	10289	ret = reg_node(pRExC_state, LNBREAK);
	10290	*flagp \|= HASWIDTH\|SIMPLE;
	10291	goto finish_meta_pat;
	10292	case 'h':
	10293	ret = reg_node(pRExC_state, HORIZWS);
	10294	*flagp \|= HASWIDTH\|SIMPLE;
	10295	goto finish_meta_pat;
	10296	case 'H':
	10297	ret = reg_node(pRExC_state, NHORIZWS);
	10298	*flagp \|= HASWIDTH\|SIMPLE;
	10299	goto finish_meta_pat;
	10300	case 'v':
	10301	ret = reg_node(pRExC_state, VERTWS);
	10302	*flagp \|= HASWIDTH\|SIMPLE;
	10303	goto finish_meta_pat;
	10304	case 'V':
	10305	ret = reg_node(pRExC_state, NVERTWS);
	10306	*flagp \|= HASWIDTH\|SIMPLE;
	10307	finish_meta_pat:
	10308	nextchar(pRExC_state);
	10309	Set_Node_Length(ret, 2); /* MJD */
	10310	break;
	10311	case 'p':
	10312	case 'P':
	10313	{
	10314	char* const oldregxend = RExC_end;
	10315	#ifdef DEBUGGING
	10316	char* parse_start = RExC_parse - 2;
	10317	#endif
	10318
	10319	if (RExC_parse[1] == '{') {
	10320	/* a lovely hack--pretend we saw [\pX] instead */
	10321	RExC_end = strchr(RExC_parse, '}');
	10322	if (!RExC_end) {
	10323	const U8 c = (U8)*RExC_parse;
	10324	RExC_parse += 2;
	10325	RExC_end = oldregxend;
	10326	vFAIL2("Missing right brace on \\%c{}", c);
	10327	}
	10328	RExC_end++;
	10329	}
	10330	else {
	10331	RExC_end = RExC_parse + 2;
	10332	if (RExC_end > oldregxend)
	10333	RExC_end = oldregxend;
	10334	}
	10335	RExC_parse--;
	10336
	10337	ret = regclass(pRExC_state, flagp,depth+1);
	10338
	10339	RExC_end = oldregxend;
	10340	RExC_parse--;
	10341
	10342	Set_Node_Offset(ret, parse_start + 2);
	10343	Set_Node_Cur_Length(ret);
	10344	nextchar(pRExC_state);
	10345	}
	10346	break;
	10347	case 'N':
	10348	/* Handle \N and \N{NAME} with multiple code points here and not
	10349	* below because it can be multicharacter. join_exact() will join
	10350	* them up later on. Also this makes sure that things like
	10351	* /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
	10352	* The options to the grok function call causes it to fail if the
	10353	* sequence is just a single code point. We then go treat it as
	10354	* just another character in the current EXACT node, and hence it
	10355	* gets uniform treatment with all the other characters. The
	10356	* special treatment for quantifiers is not needed for such single
	10357	* character sequences */
	10358	++RExC_parse;
	10359	if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
	10360	RExC_parse--;
	10361	goto defchar;
	10362	}
	10363	break;
	10364	case 'k': /* Handle \k<NAME> and \k'NAME' */
	10365	parse_named_seq:
	10366	{
	10367	char ch= RExC_parse[1];
	10368	if (ch != '<' && ch != '\'' && ch != '{') {
	10369	RExC_parse++;
	10370	vFAIL2("Sequence %.2s... not terminated",parse_start);
	10371	} else {
	10372	/* this pretty much dupes the code for (?P=...) in reg(), if
	10373	you change this make sure you change that */
	10374	char* name_start = (RExC_parse += 2);
	10375	U32 num = 0;
	10376	SV *sv_dat = reg_scan_name(pRExC_state,
	10377	SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
	10378	ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
	10379	if (RExC_parse == name_start \|\| *RExC_parse != ch)
	10380	vFAIL2("Sequence %.3s... not terminated",parse_start);
	10381
	10382	if (!SIZE_ONLY) {
	10383	num = add_data( pRExC_state, 1, "S" );
	10384	RExC_rxi->data->data[num]=(void*)sv_dat;
	10385	SvREFCNT_inc_simple_void(sv_dat);
	10386	}
	10387
	10388	RExC_sawback = 1;
	10389	ret = reganode(pRExC_state,
	10390	((! FOLD)
	10391	? NREF
	10392	: (ASCII_FOLD_RESTRICTED)
	10393	? NREFFA
	10394	: (AT_LEAST_UNI_SEMANTICS)
	10395	? NREFFU
	10396	: (LOC)
	10397	? NREFFL
	10398	: NREFF),
	10399	num);
	10400	*flagp \|= HASWIDTH;
	10401
	10402	/* override incorrect value set in reganode MJD */
	10403	Set_Node_Offset(ret, parse_start+1);
	10404	Set_Node_Cur_Length(ret); /* MJD */
	10405	nextchar(pRExC_state);
	10406
	10407	}
	10408	break;
	10409	}
	10410	case 'g':
	10411	case '1': case '2': case '3': case '4':
	10412	case '5': case '6': case '7': case '8': case '9':
	10413	{
	10414	I32 num;
	10415	bool isg = *RExC_parse == 'g';
	10416	bool isrel = 0;
	10417	bool hasbrace = 0;
	10418	if (isg) {
	10419	RExC_parse++;
	10420	if (*RExC_parse == '{') {
	10421	RExC_parse++;
	10422	hasbrace = 1;
	10423	}
	10424	if (*RExC_parse == '-') {
	10425	RExC_parse++;
	10426	isrel = 1;
	10427	}
	10428	if (hasbrace && !isDIGIT(*RExC_parse)) {
	10429	if (isrel) RExC_parse--;
	10430	RExC_parse -= 2;
	10431	goto parse_named_seq;
	10432	} }
	10433	num = atoi(RExC_parse);
	10434	if (isg && num == 0)
	10435	vFAIL("Reference to invalid group 0");
	10436	if (isrel) {
	10437	num = RExC_npar - num;
	10438	if (num < 1)
	10439	vFAIL("Reference to nonexistent or unclosed group");
	10440	}
	10441	if (!isg && num > 9 && num >= RExC_npar)
	10442	/* Probably a character specified in octal, e.g. \35 */
	10443	goto defchar;
	10444	else {
	10445	char * const parse_start = RExC_parse - 1; /* MJD */
	10446	while (isDIGIT(*RExC_parse))
	10447	RExC_parse++;
	10448	if (parse_start == RExC_parse - 1)
	10449	vFAIL("Unterminated \\g... pattern");
	10450	if (hasbrace) {
	10451	if (*RExC_parse != '}')
	10452	vFAIL("Unterminated \\g{...} pattern");
	10453	RExC_parse++;
	10454	}
	10455	if (!SIZE_ONLY) {
	10456	if (num > (I32)RExC_rx->nparens)
	10457	vFAIL("Reference to nonexistent group");
	10458	}
	10459	RExC_sawback = 1;
	10460	ret = reganode(pRExC_state,
	10461	((! FOLD)
	10462	? REF
	10463	: (ASCII_FOLD_RESTRICTED)
	10464	? REFFA
	10465	: (AT_LEAST_UNI_SEMANTICS)
	10466	? REFFU
	10467	: (LOC)
	10468	? REFFL
	10469	: REFF),
	10470	num);
	10471	*flagp \|= HASWIDTH;
	10472
	10473	/* override incorrect value set in reganode MJD */
	10474	Set_Node_Offset(ret, parse_start+1);
	10475	Set_Node_Cur_Length(ret); /* MJD */
	10476	RExC_parse--;
	10477	nextchar(pRExC_state);
	10478	}
	10479	}
	10480	break;
	10481	case '\0':
	10482	if (RExC_parse >= RExC_end)
	10483	FAIL("Trailing \\");
	10484	/* FALL THROUGH */
	10485	default:
	10486	/* Do not generate "unrecognized" warnings here, we fall
	10487	back into the quick-grab loop below */
	10488	parse_start--;
	10489	goto defchar;
	10490	}
	10491	break;
	10492
	10493	case '#':
	10494	if (RExC_flags & RXf_PMf_EXTENDED) {
	10495	if ( reg_skipcomment( pRExC_state ) )
	10496	goto tryagain;
	10497	}
	10498	/* FALL THROUGH */
	10499
	10500	default:
	10501
	10502	parse_start = RExC_parse - 1;
	10503
	10504	RExC_parse++;
	10505
	10506	defchar: {
	10507	STRLEN len = 0;
	10508	UV ender;
	10509	char *p;
	10510	char *s;
	10511	#define MAX_NODE_STRING_SIZE 127
	10512	char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
	10513	char *s0;
	10514	U8 upper_parse = MAX_NODE_STRING_SIZE;
	10515	STRLEN foldlen;
	10516	U8 node_type;
	10517	bool next_is_quantifier;
	10518	char * oldp = NULL;
	10519
	10520	ender = 0;
	10521	node_type = compute_EXACTish(pRExC_state);
	10522	ret = reg_node(pRExC_state, node_type);
	10523
	10524	/* In pass1, folded, we use a temporary buffer instead of the
	10525	* actual node, as the node doesn't exist yet */
	10526	s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
	10527
	10528	s0 = s;
	10529
	10530	reparse:
	10531
	10532	/* XXX The node can hold up to 255 bytes, yet this only goes to
	10533	* 127. I (khw) do not know why. Keeping it somewhat less than
	10534	* 255 allows us to not have to worry about overflow due to
	10535	* converting to utf8 and fold expansion, but that value is
	10536	* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
	10537	* split up by this limit into a single one using the real max of
	10538	* 255. Even at 127, this breaks under rare circumstances. If
	10539	* folding, we do not want to split a node at a character that is a
	10540	* non-final in a multi-char fold, as an input string could just
	10541	* happen to want to match across the node boundary. The join
	10542	* would solve that problem if the join actually happens. But a
	10543	* series of more than two nodes in a row each of 127 would cause
	10544	* the first join to succeed to get to 254, but then there wouldn't
	10545	* be room for the next one, which could be one of those split
	10546	* multi-char folds. I don't know of any fool-proof solution. One
	10547	* could back off to end with only a code point that isn't such a
	10548	* non-final, but it is possible for there not to be any in the
	10549	* entire node. */
	10550	for (p = RExC_parse - 1;
	10551	len < upper_parse && p < RExC_end;
	10552	len++)
	10553	{
	10554	oldp = p;
	10555
	10556	if (RExC_flags & RXf_PMf_EXTENDED)
	10557	p = regwhite( pRExC_state, p );
	10558	switch ((U8)*p) {
	10559	case '^':
	10560	case '$':
	10561	case '.':
	10562	case '[':
	10563	case '(':
	10564	case ')':
	10565	case '\|':
	10566	goto loopdone;
	10567	case '\\':
	10568	/* Literal Escapes Switch
	10569
	10570	This switch is meant to handle escape sequences that
	10571	resolve to a literal character.
	10572
	10573	Every escape sequence that represents something
	10574	else, like an assertion or a char class, is handled
	10575	in the switch marked 'Special Escapes' above in this
	10576	routine, but also has an entry here as anything that
	10577	isn't explicitly mentioned here will be treated as
	10578	an unescaped equivalent literal.
	10579	*/
	10580
	10581	switch ((U8)*++p) {
	10582	/* These are all the special escapes. */
	10583	case 'A': /* Start assertion */
	10584	case 'b': case 'B': /* Word-boundary assertion*/
	10585	case 'C': /* Single char !DANGEROUS! */
	10586	case 'd': case 'D': /* digit class */
	10587	case 'g': case 'G': /* generic-backref, pos assertion */
	10588	case 'h': case 'H': /* HORIZWS */
	10589	case 'k': case 'K': /* named backref, keep marker */
	10590	case 'p': case 'P': /* Unicode property */
	10591	case 'R': /* LNBREAK */
	10592	case 's': case 'S': /* space class */
	10593	case 'v': case 'V': /* VERTWS */
	10594	case 'w': case 'W': /* word class */
	10595	case 'X': /* eXtended Unicode "combining character sequence" */
	10596	case 'z': case 'Z': /* End of line/string assertion */
	10597	--p;
	10598	goto loopdone;
	10599
	10600	/* Anything after here is an escape that resolves to a
	10601	literal. (Except digits, which may or may not)
	10602	*/
	10603	case 'n':
	10604	ender = '\n';
	10605	p++;
	10606	break;
	10607	case 'N': /* Handle a single-code point named character. */
	10608	/* The options cause it to fail if a multiple code
	10609	* point sequence. Handle those in the switch() above
	10610	* */
	10611	RExC_parse = p + 1;
	10612	if (! grok_bslash_N(pRExC_state, NULL, &ender,
	10613	flagp, depth, FALSE))
	10614	{
	10615	RExC_parse = p = oldp;
	10616	goto loopdone;
	10617	}
	10618	p = RExC_parse;
	10619	if (ender > 0xff) {
	10620	REQUIRE_UTF8;
	10621	}
	10622	break;
	10623	case 'r':
	10624	ender = '\r';
	10625	p++;
	10626	break;
	10627	case 't':
	10628	ender = '\t';
	10629	p++;
	10630	break;
	10631	case 'f':
	10632	ender = '\f';
	10633	p++;
	10634	break;
	10635	case 'e':
	10636	ender = ASCII_TO_NATIVE('\033');
	10637	p++;
	10638	break;
	10639	case 'a':
	10640	ender = ASCII_TO_NATIVE('\007');
	10641	p++;
	10642	break;
	10643	case 'o':
	10644	{
	10645	STRLEN brace_len = len;
	10646	UV result;
	10647	const char* error_msg;
	10648
	10649	bool valid = grok_bslash_o(p,
	10650	&result,
	10651	&brace_len,
	10652	&error_msg,
	10653	1);
	10654	p += brace_len;
	10655	if (! valid) {
	10656	RExC_parse = p; /* going to die anyway; point
	10657	to exact spot of failure */
	10658	vFAIL(error_msg);
	10659	}
	10660	else
	10661	{
	10662	ender = result;
	10663	}
	10664	if (PL_encoding && ender < 0x100) {
	10665	goto recode_encoding;
	10666	}
	10667	if (ender > 0xff) {
	10668	REQUIRE_UTF8;
	10669	}
	10670	break;
	10671	}
	10672	case 'x':
	10673	{
	10674	STRLEN brace_len = len;
	10675	UV result;
	10676	const char* error_msg;
	10677
	10678	bool valid = grok_bslash_x(p,
	10679	&result,
	10680	&brace_len,
	10681	&error_msg,
	10682	1);
	10683	p += brace_len;
	10684	if (! valid) {
	10685	RExC_parse = p; /* going to die anyway; point
	10686	to exact spot of failure */
	10687	vFAIL(error_msg);
	10688	}
	10689	else {
	10690	ender = result;
	10691	}
	10692	if (PL_encoding && ender < 0x100) {
	10693	goto recode_encoding;
	10694	}
	10695	if (ender > 0xff) {
	10696	REQUIRE_UTF8;
	10697	}
	10698	break;
	10699	}
	10700	case 'c':
	10701	p++;
	10702	ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
	10703	break;
	10704	case '0': case '1': case '2': case '3':case '4':
	10705	case '5': case '6': case '7':
	10706	if (*p == '0' \|\|
	10707	(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
	10708	{
	10709	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	10710	STRLEN numlen = 3;
	10711	ender = grok_oct(p, &numlen, &flags, NULL);
	10712	if (ender > 0xff) {
	10713	REQUIRE_UTF8;
	10714	}
	10715	p += numlen;
	10716	}
	10717	else {
	10718	--p;
	10719	goto loopdone;
	10720	}
	10721	if (PL_encoding && ender < 0x100)
	10722	goto recode_encoding;
	10723	break;
	10724	recode_encoding:
	10725	if (! RExC_override_recoding) {
	10726	SV* enc = PL_encoding;
	10727	ender = reg_recode((const char)(U8)ender, &enc);
	10728	if (!enc && SIZE_ONLY)
	10729	ckWARNreg(p, "Invalid escape in the specified encoding");
	10730	REQUIRE_UTF8;
	10731	}
	10732	break;
	10733	case '\0':
	10734	if (p >= RExC_end)
	10735	FAIL("Trailing \\");
	10736	/* FALL THROUGH */
	10737	default:
	10738	if (!SIZE_ONLY&& isALNUMC(*p)) {
	10739	ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
	10740	}
	10741	goto normal_default;
	10742	}
	10743	break;
	10744	case '{':
	10745	/* Currently we don't warn when the lbrace is at the start
	10746	* of a construct. This catches it in the middle of a
	10747	* literal string, or when its the first thing after
	10748	* something like "\b" */
	10749	if (! SIZE_ONLY
	10750	&& (len \|\| (p > RExC_start && isALPHA_A(*(p -1)))))
	10751	{
	10752	ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
	10753	}
	10754	/* FALLTHROUGH */
	10755	default:
	10756	normal_default:
	10757	if (UTF8_IS_START(*p) && UTF) {
	10758	STRLEN numlen;
	10759	ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
	10760	&numlen, UTF8_ALLOW_DEFAULT);
	10761	p += numlen;
	10762	}
	10763	else
	10764	ender = (U8) *p++;
	10765	break;
	10766	} /* End of switch on the literal */
	10767
	10768	/* Here, have looked at the literal character and <ender>
	10769	* contains its ordinal, <p> points to the character after it
	10770	*/
	10771
	10772	if ( RExC_flags & RXf_PMf_EXTENDED)
	10773	p = regwhite( pRExC_state, p );
	10774
	10775	/* If the next thing is a quantifier, it applies to this
	10776	* character only, which means that this character has to be in
	10777	* its own node and can't just be appended to the string in an
	10778	* existing node, so if there are already other characters in
	10779	* the node, close the node with just them, and set up to do
	10780	* this character again next time through, when it will be the
	10781	* only thing in its new node */
	10782	if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
	10783	{
	10784	p = oldp;
	10785	goto loopdone;
	10786	}
	10787
	10788	if (FOLD) {
	10789	if (UTF
	10790	/* See comments for join_exact() as to why we fold
	10791	* this non-UTF at compile time */
	10792	\|\| (node_type == EXACTFU
	10793	&& ender == LATIN_SMALL_LETTER_SHARP_S))
	10794	{
	10795
	10796
	10797	/* Prime the casefolded buffer. Locale rules, which
	10798	* apply only to code points < 256, aren't known until
	10799	* execution, so for them, just output the original
	10800	* character using utf8. If we start to fold non-UTF
	10801	* patterns, be sure to update join_exact() */
	10802	if (LOC && ender < 256) {
	10803	if (UNI_IS_INVARIANT(ender)) {
	10804	*s = (U8) ender;
	10805	foldlen = 1;
	10806	} else {
	10807	*s = UTF8_TWO_BYTE_HI(ender);
	10808	*(s + 1) = UTF8_TWO_BYTE_LO(ender);
	10809	foldlen = 2;
	10810	}
	10811	}
	10812	else {
	10813	ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
	10814	FOLD_FLAGS_FULL
	10815	\| ((LOC) ? FOLD_FLAGS_LOCALE
	10816	: (ASCII_FOLD_RESTRICTED)
	10817	? FOLD_FLAGS_NOMIX_ASCII
	10818	: 0)
	10819	);
	10820	}
	10821	s += foldlen;
	10822
	10823	/* The loop increments <len> each time, as all but this
	10824	* path (and the one just below for UTF) through it add
	10825	* a single byte to the EXACTish node. But this one
	10826	* has changed len to be the correct final value, so
	10827	* subtract one to cancel out the increment that
	10828	* follows */
	10829	len += foldlen - 1;
	10830	}
	10831	else {
	10832	*(s++) = ender;
	10833	}
	10834	}
	10835	else if (UTF) {
	10836	const STRLEN unilen = reguni(pRExC_state, ender, s);
	10837	if (unilen > 0) {
	10838	s += unilen;
	10839	len += unilen;
	10840	}
	10841
	10842	/* See comment just above for - 1 */
	10843	len--;
	10844	}
	10845	else {
	10846	REGC((char)ender, s++);
	10847	}
	10848
	10849	if (next_is_quantifier) {
	10850
	10851	/* Here, the next input is a quantifier, and to get here,
	10852	* the current character is the only one in the node.
	10853	* Also, here <len> doesn't include the final byte for this
	10854	* character */
	10855	len++;
	10856	goto loopdone;
	10857	}
	10858
	10859	} /* End of loop through literal characters */
	10860
	10861	/* Here we have either exhausted the input or ran out of room in
	10862	* the node. (If we encountered a character that can't be in the
	10863	* node, transfer is made directly to <loopdone>, and so we
	10864	* wouldn't have fallen off the end of the loop.) In the latter
	10865	* case, we artificially have to split the node into two, because
	10866	* we just don't have enough space to hold everything. This
	10867	* creates a problem if the final character participates in a
	10868	* multi-character fold in the non-final position, as a match that
	10869	* should have occurred won't, due to the way nodes are matched,
	10870	* and our artificial boundary. So back off until we find a non-
	10871	* problematic character -- one that isn't at the beginning or
	10872	* middle of such a fold. (Either it doesn't participate in any
	10873	* folds, or appears only in the final position of all the folds it
	10874	* does participate in.) A better solution with far fewer false
	10875	* positives, and that would fill the nodes more completely, would
	10876	* be to actually have available all the multi-character folds to
	10877	* test against, and to back-off only far enough to be sure that
	10878	* this node isn't ending with a partial one. <upper_parse> is set
	10879	* further below (if we need to reparse the node) to include just
	10880	* up through that final non-problematic character that this code
	10881	* identifies, so when it is set to less than the full node, we can
	10882	* skip the rest of this */
	10883	if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
	10884
	10885	const STRLEN full_len = len;
	10886
	10887	assert(len >= MAX_NODE_STRING_SIZE);
	10888
	10889	/* Here, <s> points to the final byte of the final character.
	10890	* Look backwards through the string until find a non-
	10891	* problematic character */
	10892
	10893	if (! UTF) {
	10894
	10895	/* These two have no multi-char folds to non-UTF characters
	10896	*/
	10897	if (ASCII_FOLD_RESTRICTED \|\| LOC) {
	10898	goto loopdone;
	10899	}
	10900
	10901	while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
	10902	len = s - s0 + 1;
	10903	}
	10904	else {
	10905	if (! PL_NonL1NonFinalFold) {
	10906	PL_NonL1NonFinalFold = _new_invlist_C_array(
	10907	NonL1_Perl_Non_Final_Folds_invlist);
	10908	}
	10909
	10910	/* Point to the first byte of the final character */
	10911	s = (char ) utf8_hop((U8 ) s, -1);
	10912
	10913	while (s >= s0) { /* Search backwards until find
	10914	non-problematic char */
	10915	if (UTF8_IS_INVARIANT(*s)) {
	10916
	10917	/* There are no ascii characters that participate
	10918	* in multi-char folds under /aa. In EBCDIC, the
	10919	* non-ascii invariants are all control characters,
	10920	* so don't ever participate in any folds. */
	10921	if (ASCII_FOLD_RESTRICTED
	10922	\|\| ! IS_NON_FINAL_FOLD(*s))
	10923	{
	10924	break;
	10925	}
	10926	}
	10927	else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
	10928
	10929	/* No Latin1 characters participate in multi-char
	10930	* folds under /l */
	10931	if (LOC
	10932	\|\| ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
	10933	s, (s+1))))
	10934	{
	10935	break;
	10936	}
	10937	}
	10938	else if (! _invlist_contains_cp(
	10939	PL_NonL1NonFinalFold,
	10940	valid_utf8_to_uvchr((U8 *) s, NULL)))
	10941	{
	10942	break;
	10943	}
	10944
	10945	/* Here, the current character is problematic in that
	10946	* it does occur in the non-final position of some
	10947	* fold, so try the character before it, but have to
	10948	* special case the very first byte in the string, so
	10949	* we don't read outside the string */
	10950	s = (s == s0) ? s -1 : (char ) utf8_hop((U8 ) s, -1);
	10951	} /* End of loop backwards through the string */
	10952
	10953	/* If there were only problematic characters in the string,
	10954	* <s> will point to before s0, in which case the length
	10955	* should be 0, otherwise include the length of the
	10956	* non-problematic character just found */
	10957	len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
	10958	}
	10959
	10960	/* Here, have found the final character, if any, that is
	10961	* non-problematic as far as ending the node without splitting
	10962	* it across a potential multi-char fold. <len> contains the
	10963	* number of bytes in the node up-to and including that
	10964	* character, or is 0 if there is no such character, meaning
	10965	* the whole node contains only problematic characters. In
	10966	* this case, give up and just take the node as-is. We can't
	10967	* do any better */
	10968	if (len == 0) {
	10969	len = full_len;
	10970	} else {
	10971
	10972	/* Here, the node does contain some characters that aren't
	10973	* problematic. If one such is the final character in the
	10974	* node, we are done */
	10975	if (len == full_len) {
	10976	goto loopdone;
	10977	}
	10978	else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
	10979
	10980	/* If the final character is problematic, but the
	10981	* penultimate is not, back-off that last character to
	10982	* later start a new node with it */
	10983	p = oldp;
	10984	goto loopdone;
	10985	}
	10986
	10987	/* Here, the final non-problematic character is earlier
	10988	* in the input than the penultimate character. What we do
	10989	* is reparse from the beginning, going up only as far as
	10990	* this final ok one, thus guaranteeing that the node ends
	10991	* in an acceptable character. The reason we reparse is
	10992	* that we know how far in the character is, but we don't
	10993	* know how to correlate its position with the input parse.
	10994	* An alternate implementation would be to build that
	10995	* correlation as we go along during the original parse,
	10996	* but that would entail extra work for every node, whereas
	10997	* this code gets executed only when the string is too
	10998	* large for the node, and the final two characters are
	10999	* problematic, an infrequent occurrence. Yet another
	11000	* possible strategy would be to save the tail of the
	11001	* string, and the next time regatom is called, initialize
	11002	* with that. The problem with this is that unless you
	11003	* back off one more character, you won't be guaranteed
	11004	* regatom will get called again, unless regbranch,
	11005	* regpiece ... are also changed. If you do back off that
	11006	* extra character, so that there is input guaranteed to
	11007	* force calling regatom, you can't handle the case where
	11008	* just the first character in the node is acceptable. I
	11009	* (khw) decided to try this method which doesn't have that
	11010	* pitfall; if performance issues are found, we can do a
	11011	* combination of the current approach plus that one */
	11012	upper_parse = len;
	11013	len = 0;
	11014	s = s0;
	11015	goto reparse;
	11016	}
	11017	} /* End of verifying node ends with an appropriate char */
	11018
	11019	loopdone: /* Jumped to when encounters something that shouldn't be in
	11020	the node */
	11021
	11022	/* I (khw) don't know if you can get here with zero length, but the
	11023	* old code handled this situation by creating a zero-length EXACT
	11024	* node. Might as well be NOTHING instead */
	11025	if (len == 0) {
	11026	OP(ret) = NOTHING;
	11027	}
	11028	else{
	11029	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
	11030	}
	11031
	11032	RExC_parse = p - 1;
	11033	Set_Node_Cur_Length(ret); /* MJD */
	11034	nextchar(pRExC_state);
	11035	{
	11036	/* len is STRLEN which is unsigned, need to copy to signed */
	11037	IV iv = len;
	11038	if (iv < 0)
	11039	vFAIL("Internal disaster");
	11040	}
	11041
	11042	} /* End of label 'defchar:' */
	11043	break;
	11044	} /* End of giant switch on input character */
	11045
	11046	return(ret);
	11047	}
	11048
	11049	STATIC char *
	11050	S_regwhite( RExC_state_t pRExC_state, char p )
	11051	{
	11052	const char *e = RExC_end;
	11053
	11054	PERL_ARGS_ASSERT_REGWHITE;
	11055
	11056	while (p < e) {
	11057	if (isSPACE(*p))
	11058	++p;
	11059	else if (*p == '#') {
	11060	bool ended = 0;
	11061	do {
	11062	if (*p++ == '\n') {
	11063	ended = 1;
	11064	break;
	11065	}
	11066	} while (p < e);
	11067	if (!ended)
	11068	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	11069	}
	11070	else
	11071	break;
	11072	}
	11073	return p;
	11074	}
	11075
	11076	/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
	11077	Character classes ([:foo:]) can also be negated ([:^foo:]).
	11078	Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
	11079	Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
	11080	but trigger failures because they are currently unimplemented. */
	11081
	11082	#define POSIXCC_DONE(c) ((c) == ':')
	11083	#define POSIXCC_NOTYET(c) ((c) == '=' \|\| (c) == '.')
	11084	#define POSIXCC(c) (POSIXCC_DONE(c) \|\| POSIXCC_NOTYET(c))
	11085
	11086	STATIC I32
	11087	S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
	11088	{
	11089	dVAR;
	11090	I32 namedclass = OOB_NAMEDCLASS;
	11091
	11092	PERL_ARGS_ASSERT_REGPPOSIXCC;
	11093
	11094	if (value == '[' && RExC_parse + 1 < RExC_end &&
	11095	/* I smell either [: or [= or [. -- POSIX has been here, right? */
	11096	POSIXCC(UCHARAT(RExC_parse))) {
	11097	const char c = UCHARAT(RExC_parse);
	11098	char* const s = RExC_parse++;
	11099
	11100	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
	11101	RExC_parse++;
	11102	if (RExC_parse == RExC_end)
	11103	/* Grandfather lone [:, [=, [. */
	11104	RExC_parse = s;
	11105	else {
	11106	const char* const t = RExC_parse++; /* skip over the c */
	11107	assert(*t == c);
	11108
	11109	if (UCHARAT(RExC_parse) == ']') {
	11110	const char *posixcc = s + 1;
	11111	RExC_parse++; /* skip over the ending ] */
	11112
	11113	if (*s == ':') {
	11114	const I32 complement = posixcc == '^' ? posixcc++ : 0;
	11115	const I32 skip = t - posixcc;
	11116
	11117	/* Initially switch on the length of the name. */
	11118	switch (skip) {
	11119	case 4:
	11120	if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
	11121	namedclass = ANYOF_ALNUM;
	11122	break;
	11123	case 5:
	11124	/* Names all of length 5. */
	11125	/* alnum alpha ascii blank cntrl digit graph lower
	11126	print punct space upper */
	11127	/* Offset 4 gives the best switch position. */
	11128	switch (posixcc[4]) {
	11129	case 'a':
	11130	if (memEQ(posixcc, "alph", 4)) /* alpha */
	11131	namedclass = ANYOF_ALPHA;
	11132	break;
	11133	case 'e':
	11134	if (memEQ(posixcc, "spac", 4)) /* space */
	11135	namedclass = ANYOF_PSXSPC;
	11136	break;
	11137	case 'h':
	11138	if (memEQ(posixcc, "grap", 4)) /* graph */
	11139	namedclass = ANYOF_GRAPH;
	11140	break;
	11141	case 'i':
	11142	if (memEQ(posixcc, "asci", 4)) /* ascii */
	11143	namedclass = ANYOF_ASCII;
	11144	break;
	11145	case 'k':
	11146	if (memEQ(posixcc, "blan", 4)) /* blank */
	11147	namedclass = ANYOF_BLANK;
	11148	break;
	11149	case 'l':
	11150	if (memEQ(posixcc, "cntr", 4)) /* cntrl */
	11151	namedclass = ANYOF_CNTRL;
	11152	break;
	11153	case 'm':
	11154	if (memEQ(posixcc, "alnu", 4)) /* alnum */
	11155	namedclass = ANYOF_ALNUMC;
	11156	break;
	11157	case 'r':
	11158	if (memEQ(posixcc, "lowe", 4)) /* lower */
	11159	namedclass = ANYOF_LOWER;
	11160	else if (memEQ(posixcc, "uppe", 4)) /* upper */
	11161	namedclass = ANYOF_UPPER;
	11162	break;
	11163	case 't':
	11164	if (memEQ(posixcc, "digi", 4)) /* digit */
	11165	namedclass = ANYOF_DIGIT;
	11166	else if (memEQ(posixcc, "prin", 4)) /* print */
	11167	namedclass = ANYOF_PRINT;
	11168	else if (memEQ(posixcc, "punc", 4)) /* punct */
	11169	namedclass = ANYOF_PUNCT;
	11170	break;
	11171	}
	11172	break;
	11173	case 6:
	11174	if (memEQ(posixcc, "xdigit", 6))
	11175	namedclass = ANYOF_XDIGIT;
	11176	break;
	11177	}
	11178
	11179	if (namedclass == OOB_NAMEDCLASS)
	11180	Simple_vFAIL3("POSIX class [:%.*s:] unknown",
	11181	t - s - 1, s + 1);
	11182
	11183	/* The #defines are structured so each complement is +1 to
	11184	* the normal one */
	11185	if (complement) {
	11186	namedclass++;
	11187	}
	11188	assert (posixcc[skip] == ':');
	11189	assert (posixcc[skip+1] == ']');
	11190	} else if (!SIZE_ONLY) {
	11191	/* [[=foo=]] and [[.foo.]] are still future. */
	11192
	11193	/* adjust RExC_parse so the warning shows after
	11194	the class closes */
	11195	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
	11196	RExC_parse++;
	11197	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	11198	}
	11199	} else {
	11200	/* Maternal grandfather:
	11201	* "[:" ending in ":" but not in ":]" */
	11202	RExC_parse = s;
	11203	}
	11204	}
	11205	}
	11206
	11207	return namedclass;
	11208	}
	11209
	11210	STATIC void
	11211	S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
	11212	{
	11213	dVAR;
	11214
	11215	PERL_ARGS_ASSERT_CHECKPOSIXCC;
	11216
	11217	if (POSIXCC(UCHARAT(RExC_parse))) {
	11218	const char *s = RExC_parse;
	11219	const char c = *s++;
	11220
	11221	while (isALNUM(*s))
	11222	s++;
	11223	if (s && c == s && s[1] == ']') {
	11224	ckWARN3reg(s+2,
	11225	"POSIX syntax [%c %c] belongs inside character classes",
	11226	c, c);
	11227
	11228	/* [[=foo=]] and [[.foo.]] are still future. */
	11229	if (POSIXCC_NOTYET(c)) {
	11230	/* adjust RExC_parse so the error shows after
	11231	the class closes */
	11232	while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
	11233	NOOP;
	11234	Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
	11235	}
	11236	}
	11237	}
	11238	}
	11239
	11240	/* Generate the code to add a full posix character <class> to the bracketed
	11241	* character class given by <node>. (<node> is needed only under locale rules)
	11242	* destlist is the inversion list for non-locale rules that this class is
	11243	* to be added to
	11244	* sourcelist is the ASCII-range inversion list to add under /a rules
	11245	* Xsourcelist is the full Unicode range list to use otherwise. */
	11246	#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	11247	if (LOC) { \
	11248	SV* scratch_list = NULL; \
	11249	\
	11250	/* Set this class in the node for runtime matching */ \
	11251	ANYOF_CLASS_SET(node, class); \
	11252	\
	11253	/* For above Latin1 code points, we use the full Unicode range */ \
	11254	_invlist_intersection(PL_AboveLatin1, \
	11255	Xsourcelist, \
	11256	&scratch_list); \
	11257	/* And set the output to it, adding instead if there already is an \
	11258	* output. Checking if <destlist> is NULL first saves an extra \
	11259	* clone. Its reference count will be decremented at the next \
	11260	* union, etc, or if this is the only instance, at the end of the \
	11261	* routine */ \
	11262	if (! destlist) { \
	11263	destlist = scratch_list; \
	11264	} \
	11265	else { \
	11266	_invlist_union(destlist, scratch_list, &destlist); \
	11267	SvREFCNT_dec(scratch_list); \
	11268	} \
	11269	} \
	11270	else { \
	11271	/* For non-locale, just add it to any existing list */ \
	11272	_invlist_union(destlist, \
	11273	(AT_LEAST_ASCII_RESTRICTED) \
	11274	? sourcelist \
	11275	: Xsourcelist, \
	11276	&destlist); \
	11277	}
	11278
	11279	/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
	11280	*/
	11281	#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
	11282	if (LOC) { \
	11283	SV* scratch_list = NULL; \
	11284	ANYOF_CLASS_SET(node, class); \
	11285	_invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
	11286	if (! destlist) { \
	11287	destlist = scratch_list; \
	11288	} \
	11289	else { \
	11290	_invlist_union(destlist, scratch_list, &destlist); \
	11291	SvREFCNT_dec(scratch_list); \
	11292	} \
	11293	} \
	11294	else { \
	11295	_invlist_union_complement_2nd(destlist, \
	11296	(AT_LEAST_ASCII_RESTRICTED) \
	11297	? sourcelist \
	11298	: Xsourcelist, \
	11299	&destlist); \
	11300	/* Under /d, everything in the upper half of the Latin1 range \
	11301	* matches this complement */ \
	11302	if (DEPENDS_SEMANTICS) { \
	11303	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	11304	} \
	11305	}
	11306
	11307	/* Generate the code to add a posix character <class> to the bracketed
	11308	* character class given by <node>. (<node> is needed only under locale rules)
	11309	* destlist is the inversion list for non-locale rules that this class is
	11310	* to be added to
	11311	* sourcelist is the ASCII-range inversion list to add under /a rules
	11312	* l1_sourcelist is the Latin1 range list to use otherwise.
	11313	* Xpropertyname is the name to add to <run_time_list> of the property to
	11314	* specify the code points above Latin1 that will have to be
	11315	* determined at run-time
	11316	* run_time_list is a SV* that contains text names of properties that are to
	11317	* be computed at run time. This concatenates <Xpropertyname>
	11318	* to it, appropriately
	11319	* This is essentially DO_POSIX, but we know only the Latin1 values at compile
	11320	* time */
	11321	#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	11322	l1_sourcelist, Xpropertyname, run_time_list) \
	11323	/* First, resolve whether to use the ASCII-only list or the L1 \
	11324	* list */ \
	11325	DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
	11326	((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
	11327	Xpropertyname, run_time_list)
	11328
	11329	#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
	11330	Xpropertyname, run_time_list) \
	11331	/* If not /a matching, there are going to be code points we will have \
	11332	* to defer to runtime to look-up */ \
	11333	if (! AT_LEAST_ASCII_RESTRICTED) { \
	11334	Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
	11335	} \
	11336	if (LOC) { \
	11337	ANYOF_CLASS_SET(node, class); \
	11338	} \
	11339	else { \
	11340	_invlist_union(destlist, sourcelist, &destlist); \
	11341	}
	11342
	11343	/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
	11344	* this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
	11345	* otherwise */
	11346	#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
	11347	l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
	11348	if (AT_LEAST_ASCII_RESTRICTED) { \
	11349	_invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
	11350	} \
	11351	else { \
	11352	Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
	11353	matches_above_unicode = TRUE; \
	11354	if (LOC) { \
	11355	ANYOF_CLASS_SET(node, namedclass); \
	11356	} \
	11357	else { \
	11358	SV* scratch_list = NULL; \
	11359	_invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
	11360	if (! destlist) { \
	11361	destlist = scratch_list; \
	11362	} \
	11363	else { \
	11364	_invlist_union(destlist, scratch_list, &destlist); \
	11365	SvREFCNT_dec(scratch_list); \
	11366	} \
	11367	if (DEPENDS_SEMANTICS) { \
	11368	ANYOF_FLAGS(node) \|= ANYOF_NON_UTF8_LATIN1_ALL; \
	11369	} \
	11370	} \
	11371	}
	11372
	11373	STATIC void
	11374	S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
	11375	{
	11376	/* Adds input 'string' with length 'len' to the ANYOF node's unicode
	11377	* alternate list, pointed to by 'alternate_ptr'. This is an array of
	11378	* the multi-character folds of characters in the node */
	11379	SV *sv;
	11380
	11381	PERL_ARGS_ASSERT_ADD_ALTERNATE;
	11382
	11383	if (! *alternate_ptr) {
	11384	*alternate_ptr = newAV();
	11385	}
	11386	sv = newSVpvn_utf8((char*)string, len, TRUE);
	11387	av_push(*alternate_ptr, sv);
	11388	return;
	11389	}
	11390
	11391	/* The names of properties whose definitions are not known at compile time are
	11392	* stored in this SV, after a constant heading. So if the length has been
	11393	* changed since initialization, then there is a run-time definition. */
	11394	#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
	11395
	11396	/* This converts the named class defined in regcomp.h to its equivalent class
	11397	* number defined in handy.h. */
	11398	#define namedclass_to_classnum(class) ((class) / 2)
	11399
	11400	/*
	11401	parse a class specification and produce either an ANYOF node that
	11402	matches the pattern or perhaps will be optimized into an EXACTish node
	11403	instead. The node contains a bit map for the first 256 characters, with the
	11404	corresponding bit set if that character is in the list. For characters
	11405	above 255, a range list is used */
	11406
	11407	STATIC regnode *
	11408	S_regclass(pTHX_ RExC_state_t pRExC_state, I32 flagp, U32 depth)
	11409	{
	11410	dVAR;
	11411	UV nextvalue;
	11412	UV prevvalue = OOB_UNICODE;
	11413	IV range = 0;
	11414	UV value = 0;
	11415	regnode *ret;
	11416	STRLEN numlen;
	11417	IV namedclass = OOB_NAMEDCLASS;
	11418	char *rangebegin = NULL;
	11419	bool need_class = 0;
	11420	bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
	11421	SV *listsv = NULL;
	11422	STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
	11423	than just initialized. */
	11424	SV* properties = NULL; /* Code points that match \p{} \P{} */
	11425	SV* posixes = NULL; /* Code points that match classes like, [:word:],
	11426	extended beyond the Latin1 range */
	11427	UV element_count = 0; /* Number of distinct elements in the class.
	11428	Optimizations may be possible if this is tiny */
	11429	UV n;
	11430
	11431	/* Unicode properties are stored in a swash; this holds the current one
	11432	* being parsed. If this swash is the only above-latin1 component of the
	11433	* character class, an optimization is to pass it directly on to the
	11434	* execution engine. Otherwise, it is set to NULL to indicate that there
	11435	* are other things in the class that have to be dealt with at execution
	11436	* time */
	11437	SV* swash = NULL; /* Code points that match \p{} \P{} */
	11438
	11439	/* Set if a component of this character class is user-defined; just passed
	11440	* on to the engine */
	11441	bool has_user_defined_property = FALSE;
	11442
	11443	/* inversion list of code points this node matches only when the target
	11444	* string is in UTF-8. (Because is under /d) */
	11445	SV* depends_list = NULL;
	11446
	11447	/* inversion list of code points this node matches. For much of the
	11448	* function, it includes only those that match regardless of the utf8ness
	11449	* of the target string */
	11450	SV* cp_list = NULL;
	11451
	11452	/* List of multi-character folds that are matched by this node */
	11453	AV* unicode_alternate = NULL;
	11454	#ifdef EBCDIC
	11455	/* In a range, counts how many 0-2 of the ends of it came from literals,
	11456	* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
	11457	UV literal_endpoint = 0;
	11458	#endif
	11459	bool invert = FALSE; /* Is this class to be complemented */
	11460
	11461	/* Is there any thing like \W or [:^digit:] that matches above the legal
	11462	* Unicode range? */
	11463	bool runtime_posix_matches_above_Unicode = FALSE;
	11464
	11465	regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
	11466	case we need to change the emitted regop to an EXACT. */
	11467	const char * orig_parse = RExC_parse;
	11468	const I32 orig_size = RExC_size;
	11469	GET_RE_DEBUG_FLAGS_DECL;
	11470
	11471	PERL_ARGS_ASSERT_REGCLASS;
	11472	#ifndef DEBUGGING
	11473	PERL_UNUSED_ARG(depth);
	11474	#endif
	11475
	11476	DEBUG_PARSE("clas");
	11477
	11478	/* Assume we are going to generate an ANYOF node. */
	11479	ret = reganode(pRExC_state, ANYOF, 0);
	11480
	11481
	11482	if (!SIZE_ONLY) {
	11483	ANYOF_FLAGS(ret) = 0;
	11484	}
	11485
	11486	if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
	11487	RExC_naughty++;
	11488	RExC_parse++;
	11489	invert = TRUE;
	11490
	11491	/* We have decided to not allow multi-char folds in inverted character
	11492	* classes, due to the confusion that can happen, especially with
	11493	* classes that are designed for a non-Unicode world: You have the
	11494	* peculiar case that:
	11495	"s s" =~ /^[^\xDF]+$/i => Y
	11496	"ss" =~ /^[^\xDF]+$/i => N
	11497	*
	11498	* See [perl #89750] */
	11499	allow_full_fold = FALSE;
	11500	}
	11501
	11502	if (SIZE_ONLY) {
	11503	RExC_size += ANYOF_SKIP;
	11504	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
	11505	}
	11506	else {
	11507	RExC_emit += ANYOF_SKIP;
	11508	if (LOC) {
	11509	ANYOF_FLAGS(ret) \|= ANYOF_LOCALE;
	11510	}
	11511	listsv = newSVpvs("# comment\n");
	11512	initial_listsv_len = SvCUR(listsv);
	11513	}
	11514
	11515	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	11516
	11517	if (!SIZE_ONLY && POSIXCC(nextvalue))
	11518	checkposixcc(pRExC_state);
	11519
	11520	/* allow 1st char to be ] (allowing it to be - is dealt with later) */
	11521	if (UCHARAT(RExC_parse) == ']')
	11522	goto charclassloop;
	11523
	11524	parseit:
	11525	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
	11526
	11527	charclassloop:
	11528
	11529	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
	11530
	11531	if (!range) {
	11532	rangebegin = RExC_parse;
	11533	element_count++;
	11534	}
	11535	if (UTF) {
	11536	value = utf8n_to_uvchr((U8*)RExC_parse,
	11537	RExC_end - RExC_parse,
	11538	&numlen, UTF8_ALLOW_DEFAULT);
	11539	RExC_parse += numlen;
	11540	}
	11541	else
	11542	value = UCHARAT(RExC_parse++);
	11543
	11544	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
	11545	if (value == '[' && POSIXCC(nextvalue))
	11546	namedclass = regpposixcc(pRExC_state, value);
	11547	else if (value == '\\') {
	11548	if (UTF) {
	11549	value = utf8n_to_uvchr((U8*)RExC_parse,
	11550	RExC_end - RExC_parse,
	11551	&numlen, UTF8_ALLOW_DEFAULT);
	11552	RExC_parse += numlen;
	11553	}
	11554	else
	11555	value = UCHARAT(RExC_parse++);
	11556	/* Some compilers cannot handle switching on 64-bit integer
	11557	* values, therefore value cannot be an UV. Yes, this will
	11558	* be a problem later if we want switch on Unicode.
	11559	* A similar issue a little bit later when switching on
	11560	* namedclass. --jhi */
	11561	switch ((I32)value) {
	11562	case 'w': namedclass = ANYOF_ALNUM; break;
	11563	case 'W': namedclass = ANYOF_NALNUM; break;
	11564	case 's': namedclass = ANYOF_SPACE; break;
	11565	case 'S': namedclass = ANYOF_NSPACE; break;
	11566	case 'd': namedclass = ANYOF_DIGIT; break;
	11567	case 'D': namedclass = ANYOF_NDIGIT; break;
	11568	case 'v': namedclass = ANYOF_VERTWS; break;
	11569	case 'V': namedclass = ANYOF_NVERTWS; break;
	11570	case 'h': namedclass = ANYOF_HORIZWS; break;
	11571	case 'H': namedclass = ANYOF_NHORIZWS; break;
	11572	case 'N': /* Handle \N{NAME} in class */
	11573	{
	11574	/* We only pay attention to the first char of
	11575	multichar strings being returned. I kinda wonder
	11576	if this makes sense as it does change the behaviour
	11577	from earlier versions, OTOH that behaviour was broken
	11578	as well. */
	11579	if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
	11580	TRUE /* => charclass */))
	11581	{
	11582	goto parseit;
	11583	}
	11584	}
	11585	break;
	11586	case 'p':
	11587	case 'P':
	11588	{
	11589	char *e;
	11590
	11591	/* This routine will handle any undefined properties */
	11592	U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
	11593
	11594	if (RExC_parse >= RExC_end)
	11595	vFAIL2("Empty \\%c{}", (U8)value);
	11596	if (*RExC_parse == '{') {
	11597	const U8 c = (U8)value;
	11598	e = strchr(RExC_parse++, '}');
	11599	if (!e)
	11600	vFAIL2("Missing right brace on \\%c{}", c);
	11601	while (isSPACE(UCHARAT(RExC_parse)))
	11602	RExC_parse++;
	11603	if (e == RExC_parse)
	11604	vFAIL2("Empty \\%c{}", c);
	11605	n = e - RExC_parse;
	11606	while (isSPACE(UCHARAT(RExC_parse + n - 1)))
	11607	n--;
	11608	}
	11609	else {
	11610	e = RExC_parse;
	11611	n = 1;
	11612	}
	11613	if (!SIZE_ONLY) {
	11614	SV* invlist;
	11615	char* name;
	11616
	11617	if (UCHARAT(RExC_parse) == '^') {
	11618	RExC_parse++;
	11619	n--;
	11620	value = value == 'p' ? 'P' : 'p'; /* toggle */
	11621	while (isSPACE(UCHARAT(RExC_parse))) {
	11622	RExC_parse++;
	11623	n--;
	11624	}
	11625	}
	11626	/* Try to get the definition of the property into
	11627	* <invlist>. If /i is in effect, the effective property
	11628	* will have its name be <__NAME_i>. The design is
	11629	* discussed in commit
	11630	* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
	11631	Newx(name, n + sizeof("_i__\n"), char);
	11632
	11633	sprintf(name, "%s%.*s%s\n",
	11634	(FOLD) ? "__" : "",
	11635	(int)n,
	11636	RExC_parse,
	11637	(FOLD) ? "_i" : ""
	11638	);
	11639
	11640	/* Look up the property name, and get its swash and
	11641	* inversion list, if the property is found */
	11642	if (swash) {
	11643	SvREFCNT_dec(swash);
	11644	}
	11645	swash = _core_swash_init("utf8", name, &PL_sv_undef,
	11646	1, /* binary */
	11647	0, /* not tr/// */
	11648	NULL, /* No inversion list */
	11649	&swash_init_flags
	11650	);
	11651	if (! swash \|\| ! (invlist = _get_swash_invlist(swash))) {
	11652	if (swash) {
	11653	SvREFCNT_dec(swash);
	11654	swash = NULL;
	11655	}
	11656
	11657	/* Here didn't find it. It could be a user-defined
	11658	* property that will be available at run-time. Add it
	11659	* to the list to look up then */
	11660	Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
	11661	(value == 'p' ? '+' : '!'),
	11662	name);
	11663	has_user_defined_property = TRUE;
	11664
	11665	/* We don't know yet, so have to assume that the
	11666	* property could match something in the Latin1 range,
	11667	* hence something that isn't utf8. Note that this
	11668	* would cause things in <depends_list> to match
	11669	* inappropriately, except that any \p{}, including
	11670	* this one forces Unicode semantics, which means there
	11671	* is <no depends_list> */
	11672	ANYOF_FLAGS(ret) \|= ANYOF_NONBITMAP_NON_UTF8;
	11673	}
	11674	else {
	11675
	11676	/* Here, did get the swash and its inversion list. If
	11677	* the swash is from a user-defined property, then this
	11678	* whole character class should be regarded as such */
	11679	has_user_defined_property =
	11680	(swash_init_flags
	11681	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
	11682
	11683	/* Invert if asking for the complement */
	11684	if (value == 'P') {
	11685	_invlist_union_complement_2nd(properties,
	11686	invlist,
	11687	&properties);
	11688
	11689	/* The swash can't be used as-is, because we've
	11690	* inverted things; delay removing it to here after
	11691	* have copied its invlist above */
	11692	SvREFCNT_dec(swash);
	11693	swash = NULL;
	11694	}
	11695	else {
	11696	_invlist_union(properties, invlist, &properties);
	11697	}
	11698	}
	11699	Safefree(name);
	11700	}
	11701	RExC_parse = e + 1;
	11702	namedclass = ANYOF_MAX; /* no official name, but it's named */
	11703
	11704	/* \p means they want Unicode semantics */
	11705	RExC_uni_semantics = 1;
	11706	}
	11707	break;
	11708	case 'n': value = '\n'; break;
	11709	case 'r': value = '\r'; break;
	11710	case 't': value = '\t'; break;
	11711	case 'f': value = '\f'; break;
	11712	case 'b': value = '\b'; break;
	11713	case 'e': value = ASCII_TO_NATIVE('\033');break;
	11714	case 'a': value = ASCII_TO_NATIVE('\007');break;
	11715	case 'o':
	11716	RExC_parse--; /* function expects to be pointed at the 'o' */
	11717	{
	11718	const char* error_msg;
	11719	bool valid = grok_bslash_o(RExC_parse,
	11720	&value,
	11721	&numlen,
	11722	&error_msg,
	11723	SIZE_ONLY);
	11724	RExC_parse += numlen;
	11725	if (! valid) {
	11726	vFAIL(error_msg);
	11727	}
	11728	}
	11729	if (PL_encoding && value < 0x100) {
	11730	goto recode_encoding;
	11731	}
	11732	break;
	11733	case 'x':
	11734	RExC_parse--; /* function expects to be pointed at the 'x' */
	11735	{
	11736	const char* error_msg;
	11737	bool valid = grok_bslash_x(RExC_parse,
	11738	&value,
	11739	&numlen,
	11740	&error_msg,
	11741	1);
	11742	RExC_parse += numlen;
	11743	if (! valid) {
	11744	vFAIL(error_msg);
	11745	}
	11746	}
	11747	if (PL_encoding && value < 0x100)
	11748	goto recode_encoding;
	11749	break;
	11750	case 'c':
	11751	value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
	11752	break;
	11753	case '0': case '1': case '2': case '3': case '4':
	11754	case '5': case '6': case '7':
	11755	{
	11756	/* Take 1-3 octal digits */
	11757	I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
	11758	numlen = 3;
	11759	value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	11760	RExC_parse += numlen;
	11761	if (PL_encoding && value < 0x100)
	11762	goto recode_encoding;
	11763	break;
	11764	}
	11765	recode_encoding:
	11766	if (! RExC_override_recoding) {
	11767	SV* enc = PL_encoding;
	11768	value = reg_recode((const char)(U8)value, &enc);
	11769	if (!enc && SIZE_ONLY)
	11770	ckWARNreg(RExC_parse,
	11771	"Invalid escape in the specified encoding");
	11772	break;
	11773	}
	11774	default:
	11775	/* Allow \_ to not give an error */
	11776	if (!SIZE_ONLY && isALNUM(value) && value != '_') {
	11777	ckWARN2reg(RExC_parse,
	11778	"Unrecognized escape \\%c in character class passed through",
	11779	(int)value);
	11780	}
	11781	break;
	11782	}
	11783	} /* end of \blah */
	11784	#ifdef EBCDIC
	11785	else
	11786	literal_endpoint++;
	11787	#endif
	11788
	11789	/* What matches in a locale is not known until runtime. This
	11790	* includes what the Posix classes (like \w, [:space:]) match.
	11791	* Room must be reserved (one time per class) to store such
	11792	* classes, either if Perl is compiled so that locale nodes always
	11793	* should have this space, or if there is such class info to be
	11794	* stored. The space will contain a bit for each named class that
	11795	* is to be matched against. This isn't needed for \p{} and
	11796	* pseudo-classes, as they are not affected by locale, and hence
	11797	* are dealt with separately */
	11798	if (LOC
	11799	&& ! need_class
	11800	&& (ANYOF_LOCALE == ANYOF_CLASS
	11801	\|\| (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
	11802	{
	11803	need_class = 1;
	11804	if (SIZE_ONLY) {
	11805	RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	11806	}
	11807	else {
	11808	RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
	11809	ANYOF_CLASS_ZERO(ret);
	11810	}
	11811	ANYOF_FLAGS(ret) \|= ANYOF_CLASS;
	11812	}
	11813
	11814	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
	11815
	11816	/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
	11817	* literal, as is the character that began the false range, i.e.
	11818	* the 'a' in the examples */
	11819	if (range) {
	11820	if (!SIZE_ONLY) {
	11821	const int w =
	11822	RExC_parse >= rangebegin ?
	11823	RExC_parse - rangebegin : 0;
	11824	ckWARN4reg(RExC_parse,
	11825	"False [] range \"%.s\"",
	11826	w, w, rangebegin);
	11827	cp_list = add_cp_to_invlist(cp_list, '-');
	11828	cp_list = add_cp_to_invlist(cp_list, prevvalue);
	11829	}
	11830
	11831	range = 0; /* this was not a true range */
	11832	element_count += 2; /* So counts for three values */
	11833	}
	11834
	11835	if (! SIZE_ONLY) {
	11836	switch ((I32)namedclass) {
	11837
	11838	case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
	11839	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11840	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
	11841	break;
	11842	case ANYOF_NALNUMC:
	11843	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11844	PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
	11845	runtime_posix_matches_above_Unicode);
	11846	break;
	11847	case ANYOF_ALPHA:
	11848	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11849	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
	11850	break;
	11851	case ANYOF_NALPHA:
	11852	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11853	PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
	11854	runtime_posix_matches_above_Unicode);
	11855	break;
	11856	case ANYOF_ASCII:
	11857	if (LOC) {
	11858	ANYOF_CLASS_SET(ret, namedclass);
	11859	}
	11860	else {
	11861	_invlist_union(posixes, PL_ASCII, &posixes);
	11862	}
	11863	break;
	11864	case ANYOF_NASCII:
	11865	if (LOC) {
	11866	ANYOF_CLASS_SET(ret, namedclass);
	11867	}
	11868	else {
	11869	_invlist_union_complement_2nd(posixes,
	11870	PL_ASCII, &posixes);
	11871	if (DEPENDS_SEMANTICS) {
	11872	ANYOF_FLAGS(ret) \|= ANYOF_NON_UTF8_LATIN1_ALL;
	11873	}
	11874	}
	11875	break;
	11876	case ANYOF_BLANK:
	11877	DO_POSIX(ret, namedclass, posixes,
	11878	PL_PosixBlank, PL_XPosixBlank);
	11879	break;
	11880	case ANYOF_NBLANK:
	11881	DO_N_POSIX(ret, namedclass, posixes,
	11882	PL_PosixBlank, PL_XPosixBlank);
	11883	break;
	11884	case ANYOF_CNTRL:
	11885	DO_POSIX(ret, namedclass, posixes,
	11886	PL_PosixCntrl, PL_XPosixCntrl);
	11887	break;
	11888	case ANYOF_NCNTRL:
	11889	DO_N_POSIX(ret, namedclass, posixes,
	11890	PL_PosixCntrl, PL_XPosixCntrl);
	11891	break;
	11892	case ANYOF_DIGIT:
	11893	/* There are no digits in the Latin1 range outside of
	11894	* ASCII, so call the macro that doesn't have to resolve
	11895	* them */
	11896	DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
	11897	PL_PosixDigit, "XPosixDigit", listsv);
	11898	break;
	11899	case ANYOF_NDIGIT:
	11900	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11901	PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
	11902	runtime_posix_matches_above_Unicode);
	11903	break;
	11904	case ANYOF_GRAPH:
	11905	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11906	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
	11907	break;
	11908	case ANYOF_NGRAPH:
	11909	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11910	PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
	11911	runtime_posix_matches_above_Unicode);
	11912	break;
	11913	case ANYOF_HORIZWS:
	11914	/* For these, we use the cp_list, as /d doesn't make a
	11915	* difference in what these match. There would be problems
	11916	* if these characters had folds other than themselves, as
	11917	* cp_list is subject to folding. It turns out that \h
	11918	* is just a synonym for XPosixBlank */
	11919	_invlist_union(cp_list, PL_XPosixBlank, &cp_list);
	11920	break;
	11921	case ANYOF_NHORIZWS:
	11922	_invlist_union_complement_2nd(cp_list,
	11923	PL_XPosixBlank, &cp_list);
	11924	break;
	11925	case ANYOF_LOWER:
	11926	case ANYOF_NLOWER:
	11927	{ /* These require special handling, as they differ under
	11928	folding, matching Cased there (which in the ASCII range
	11929	is the same as Alpha */
	11930
	11931	SV* ascii_source;
	11932	SV* l1_source;
	11933	const char *Xname;
	11934
	11935	if (FOLD && ! LOC) {
	11936	ascii_source = PL_PosixAlpha;
	11937	l1_source = PL_L1Cased;
	11938	Xname = "Cased";
	11939	}
	11940	else {
	11941	ascii_source = PL_PosixLower;
	11942	l1_source = PL_L1PosixLower;
	11943	Xname = "XPosixLower";
	11944	}
	11945	if (namedclass == ANYOF_LOWER) {
	11946	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11947	ascii_source, l1_source, Xname, listsv);
	11948	}
	11949	else {
	11950	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	11951	posixes, ascii_source, l1_source, Xname, listsv,
	11952	runtime_posix_matches_above_Unicode);
	11953	}
	11954	break;
	11955	}
	11956	case ANYOF_PRINT:
	11957	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11958	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
	11959	break;
	11960	case ANYOF_NPRINT:
	11961	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11962	PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
	11963	runtime_posix_matches_above_Unicode);
	11964	break;
	11965	case ANYOF_PUNCT:
	11966	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11967	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
	11968	break;
	11969	case ANYOF_NPUNCT:
	11970	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	11971	PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
	11972	runtime_posix_matches_above_Unicode);
	11973	break;
	11974	case ANYOF_PSXSPC:
	11975	DO_POSIX(ret, namedclass, posixes,
	11976	PL_PosixSpace, PL_XPosixSpace);
	11977	break;
	11978	case ANYOF_NPSXSPC:
	11979	DO_N_POSIX(ret, namedclass, posixes,
	11980	PL_PosixSpace, PL_XPosixSpace);
	11981	break;
	11982	case ANYOF_SPACE:
	11983	DO_POSIX(ret, namedclass, posixes,
	11984	PL_PerlSpace, PL_XPerlSpace);
	11985	break;
	11986	case ANYOF_NSPACE:
	11987	DO_N_POSIX(ret, namedclass, posixes,
	11988	PL_PerlSpace, PL_XPerlSpace);
	11989	break;
	11990	case ANYOF_UPPER: /* Same as LOWER, above */
	11991	case ANYOF_NUPPER:
	11992	{
	11993	SV* ascii_source;
	11994	SV* l1_source;
	11995	const char *Xname;
	11996
	11997	if (FOLD && ! LOC) {
	11998	ascii_source = PL_PosixAlpha;
	11999	l1_source = PL_L1Cased;
	12000	Xname = "Cased";
	12001	}
	12002	else {
	12003	ascii_source = PL_PosixUpper;
	12004	l1_source = PL_L1PosixUpper;
	12005	Xname = "XPosixUpper";
	12006	}
	12007	if (namedclass == ANYOF_UPPER) {
	12008	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	12009	ascii_source, l1_source, Xname, listsv);
	12010	}
	12011	else {
	12012	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
	12013	posixes, ascii_source, l1_source, Xname, listsv,
	12014	runtime_posix_matches_above_Unicode);
	12015	}
	12016	break;
	12017	}
	12018	case ANYOF_ALNUM: /* Really is 'Word' */
	12019	DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	12020	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
	12021	break;
	12022	case ANYOF_NALNUM:
	12023	DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
	12024	PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
	12025	runtime_posix_matches_above_Unicode);
	12026	break;
	12027	case ANYOF_VERTWS:
	12028	/* For these, we use the cp_list, as /d doesn't make a
	12029	* difference in what these match. There would be problems
	12030	* if these characters had folds other than themselves, as
	12031	* cp_list is subject to folding */
	12032	_invlist_union(cp_list, PL_VertSpace, &cp_list);
	12033	break;
	12034	case ANYOF_NVERTWS:
	12035	_invlist_union_complement_2nd(cp_list,
	12036	PL_VertSpace, &cp_list);
	12037	break;
	12038	case ANYOF_XDIGIT:
	12039	DO_POSIX(ret, namedclass, posixes,
	12040	PL_PosixXDigit, PL_XPosixXDigit);
	12041	break;
	12042	case ANYOF_NXDIGIT:
	12043	DO_N_POSIX(ret, namedclass, posixes,
	12044	PL_PosixXDigit, PL_XPosixXDigit);
	12045	break;
	12046	case ANYOF_MAX:
	12047	/* this is to handle \p and \P */
	12048	break;
	12049	default:
	12050	vFAIL("Invalid [::] class");
	12051	break;
	12052	}
	12053
	12054	continue; /* Go get next character */
	12055	}
	12056	} /* end of namedclass \blah */
	12057
	12058	if (range) {
	12059	if (prevvalue > value) /* b-a */ {
	12060	const int w = RExC_parse - rangebegin;
	12061	Simple_vFAIL4("Invalid [] range \"%.s\"", w, w, rangebegin);
	12062	range = 0; /* not a valid range */
	12063	}
	12064	}
	12065	else {
	12066	prevvalue = value; /* save the beginning of the potential range */
	12067	if (RExC_parse+1 < RExC_end
	12068	&& *RExC_parse == '-'
	12069	&& RExC_parse[1] != ']')
	12070	{
	12071	RExC_parse++;
	12072
	12073	/* a bad range like \w-, [:word:]- ? */
	12074	if (namedclass > OOB_NAMEDCLASS) {
	12075	if (ckWARN(WARN_REGEXP)) {
	12076	const int w =
	12077	RExC_parse >= rangebegin ?
	12078	RExC_parse - rangebegin : 0;
	12079	vWARN4(RExC_parse,
	12080	"False [] range \"%.s\"",
	12081	w, w, rangebegin);
	12082	}
	12083	if (!SIZE_ONLY) {
	12084	cp_list = add_cp_to_invlist(cp_list, '-');
	12085	}
	12086	element_count++;
	12087	} else
	12088	range = 1; /* yeah, it's a range! */
	12089	continue; /* but do it the next time */
	12090	}
	12091	}
	12092
	12093	/* Here, <prevvalue> is the beginning of the range, if any; or <value>
	12094	* if not */
	12095
	12096	/* non-Latin1 code point implies unicode semantics. Must be set in
	12097	* pass1 so is there for the whole of pass 2 */
	12098	if (value > 255) {
	12099	RExC_uni_semantics = 1;
	12100	}
	12101
	12102	/* Ready to process either the single value, or the completed range */
	12103	if (!SIZE_ONLY) {
	12104	#ifndef EBCDIC
	12105	cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
	12106	#else
	12107	UV* this_range = _new_invlist(1);
	12108	_append_range_to_invlist(this_range, prevvalue, value);
	12109
	12110	/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
	12111	* If this range was specified using something like 'i-j', we want
	12112	* to include only the 'i' and the 'j', and not anything in
	12113	* between, so exclude non-ASCII, non-alphabetics from it.
	12114	* However, if the range was specified with something like
	12115	* [\x89-\x91] or [\x89-j], all code points within it should be
	12116	* included. literal_endpoint==2 means both ends of the range used
	12117	* a literal character, not \x{foo} */
	12118	if (literal_endpoint == 2
	12119	&& (prevvalue >= 'a' && value <= 'z')
	12120	\|\| (prevvalue >= 'A' && value <= 'Z'))
	12121	{
	12122	_invlist_intersection(this_range, PL_ASCII, &this_range, );
	12123	_invlist_intersection(this_range, PL_Alpha, &this_range, );
	12124	}
	12125	_invlist_union(cp_list, this_range, &cp_list);
	12126	literal_endpoint = 0;
	12127	#endif
	12128	}
	12129
	12130	range = 0; /* this range (if it was one) is done now */
	12131	} /* End of loop through all the text within the brackets */
	12132
	12133	/* If the character class contains only a single element, it may be
	12134	* optimizable into another node type which is smaller and runs faster.
	12135	* Check if this is the case for this class */
	12136	if (element_count == 1) {
	12137	U8 op = END;
	12138	U8 arg = 0;
	12139
	12140	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
	12141	[:digit:] or \p{foo} */
	12142
	12143	/* Certain named classes have equivalents that can appear outside a
	12144	* character class, e.g. \w, \H. We use these instead of a
	12145	* character class. */
	12146	switch ((I32)namedclass) {
	12147	U8 offset;
	12148
	12149	/* The first group is for node types that depend on the charset
	12150	* modifier to the regex. We first calculate the base node
	12151	* type, and if it should be inverted */
	12152
	12153	case ANYOF_NALNUM:
	12154	invert = ! invert;
	12155	/* FALLTHROUGH */
	12156	case ANYOF_ALNUM:
	12157	op = ALNUM;
	12158	goto join_charset_classes;
	12159
	12160	case ANYOF_NSPACE:
	12161	invert = ! invert;
	12162	/* FALLTHROUGH */
	12163	case ANYOF_SPACE:
	12164	op = SPACE;
	12165	goto join_charset_classes;
	12166
	12167	case ANYOF_NDIGIT:
	12168	invert = ! invert;
	12169	/* FALLTHROUGH */
	12170	case ANYOF_DIGIT:
	12171	op = DIGIT;
	12172
	12173	join_charset_classes:
	12174
	12175	/* Now that we have the base node type, we take advantage
	12176	* of the enum ordering of the charset modifiers to get the
	12177	* exact node type, For example the base SPACE also has
	12178	* SPACEL, SPACEU, and SPACEA */
	12179
	12180	offset = get_regex_charset(RExC_flags);
	12181
	12182	/* /aa is the same as /a for these */
	12183	if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
	12184	offset = REGEX_ASCII_RESTRICTED_CHARSET;
	12185	}
	12186	else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
	12187	offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
	12188	}
	12189
	12190	op += offset;
	12191
	12192	/* The number of varieties of each of these is the same,
	12193	* hence, so is the delta between the normal and
	12194	* complemented nodes */
	12195	if (invert) {
	12196	op += NALNUM - ALNUM;
	12197	}
	12198	*flagp \|= HASWIDTH\|SIMPLE;
	12199	break;
	12200
	12201	/* The second group doesn't depend of the charset modifiers.
	12202	* We just have normal and complemented */
	12203	case ANYOF_NHORIZWS:
	12204	invert = ! invert;
	12205	/* FALLTHROUGH */
	12206	case ANYOF_HORIZWS:
	12207	is_horizws:
	12208	op = (invert) ? NHORIZWS : HORIZWS;
	12209	*flagp \|= HASWIDTH\|SIMPLE;
	12210	break;
	12211
	12212	case ANYOF_NVERTWS:
	12213	invert = ! invert;
	12214	/* FALLTHROUGH */
	12215	case ANYOF_VERTWS:
	12216	op = (invert) ? NVERTWS : VERTWS;
	12217	*flagp \|= HASWIDTH\|SIMPLE;
	12218	break;
	12219
	12220	case ANYOF_MAX:
	12221	break;
	12222
	12223	case ANYOF_NBLANK:
	12224	invert = ! invert;
	12225	/* FALLTHROUGH */
	12226	case ANYOF_BLANK:
	12227	if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
	12228	goto is_horizws;
	12229	}
	12230	/* FALLTHROUGH */
	12231	default:
	12232	/* A generic posix class. All the /a ones can be handled
	12233	* by the POSIXA opcode. And all are closed under folding
	12234	* in the ASCII range, so FOLD doesn't matter */
	12235	if (AT_LEAST_ASCII_RESTRICTED
	12236	\|\| (! LOC && namedclass == ANYOF_ASCII))
	12237	{
	12238	/* The odd numbered ones are the complements of the
	12239	* next-lower even number one */
	12240	if (namedclass % 2 == 1) {
	12241	invert = ! invert;
	12242	namedclass--;
	12243	}
	12244	arg = namedclass_to_classnum(namedclass);
	12245	op = (invert) ? NPOSIXA : POSIXA;
	12246	}
	12247	break;
	12248	}
	12249	}
	12250	else if (value == prevvalue) {
	12251
	12252	/* Here, the class consists of just a single code point */
	12253
	12254	if (invert) {
	12255	if (! LOC && value == '\n') {
	12256	op = REG_ANY; /* Optimize [^\n] */
	12257	*flagp \|= HASWIDTH\|SIMPLE;
	12258	RExC_naughty++;
	12259	}
	12260	}
	12261	else if (value < 256 \|\| UTF) {
	12262
	12263	/* Optimize a single value into an EXACTish node, but not if it
	12264	* would require converting the pattern to UTF-8. */
	12265	op = compute_EXACTish(pRExC_state);
	12266	}
	12267	} /* Otherwise is a range */
	12268	else if (! LOC) { /* locale could vary these */
	12269	if (prevvalue == '0') {
	12270	if (value == '9') {
	12271	op = (invert) ? NDIGITA : DIGITA;
	12272	*flagp \|= HASWIDTH\|SIMPLE;
	12273	}
	12274	}
	12275	}
	12276
	12277	/* Here, we have changed <op> away from its initial value iff we found
	12278	* an optimization */
	12279	if (op != END) {
	12280
	12281	/* Throw away this ANYOF regnode, and emit the calculated one,
	12282	* which should correspond to the beginning, not current, state of
	12283	* the parse */
	12284	const char * cur_parse = RExC_parse;
	12285	RExC_parse = (char *)orig_parse;
	12286	if ( SIZE_ONLY) {
	12287	if (! LOC) {
	12288
	12289	/* To get locale nodes to not use the full ANYOF size would
	12290	* require moving the code above that writes the portions
	12291	* of it that aren't in other nodes to after this point.
	12292	* e.g. ANYOF_CLASS_SET */
	12293	RExC_size = orig_size;
	12294	}
	12295	}
	12296	else {
	12297	RExC_emit = (regnode *)orig_emit;
	12298	}
	12299
	12300	ret = reg_node(pRExC_state, op);
	12301
	12302	if (PL_regkind[op] == POSIXD) {
	12303	if (! SIZE_ONLY) {
	12304	FLAGS(ret) = arg;
	12305	}
	12306	*flagp \|= HASWIDTH\|SIMPLE;
	12307	}
	12308	else if (PL_regkind[op] == EXACT) {
	12309	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
	12310	}
	12311
	12312	RExC_parse = (char *) cur_parse;
	12313
	12314	SvREFCNT_dec(listsv);
	12315	return ret;
	12316	}
	12317	}
	12318
	12319	if (SIZE_ONLY)
	12320	return ret;
	12321	/**** !SIZE_ONLY (Pass 2) AFTER HERE *******/
	12322
	12323	/* If folding, we calculate all characters that could fold to or from the
	12324	* ones already on the list */
	12325	if (FOLD && cp_list) {
	12326	UV start, end; /* End points of code point ranges */
	12327
	12328	SV* fold_intersection = NULL;
	12329
	12330	/* In the Latin1 range, the characters that can be folded-to or -from
	12331	* are precisely the alphabetic characters. If the highest code point
	12332	* is within Latin1, we can use the compiled-in list, and not have to
	12333	* go out to disk. */
	12334	if (invlist_highest(cp_list) < 256) {
	12335	_invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
	12336	}
	12337	else {
	12338
	12339	/* Here, there are non-Latin1 code points, so we will have to go
	12340	* fetch the list of all the characters that participate in folds
	12341	*/
	12342	if (! PL_utf8_foldable) {
	12343	SV* swash = swash_init("utf8", "_Perl_Any_Folds",
	12344	&PL_sv_undef, 1, 0);
	12345	PL_utf8_foldable = _get_swash_invlist(swash);
	12346	SvREFCNT_dec(swash);
	12347	}
	12348
	12349	/* This is a hash that for a particular fold gives all characters
	12350	* that are involved in it */
	12351	if (! PL_utf8_foldclosures) {
	12352
	12353	/* If we were unable to find any folds, then we likely won't be
	12354	* able to find the closures. So just create an empty list.
	12355	* Folding will effectively be restricted to the non-Unicode
	12356	* rules hard-coded into Perl. (This case happens legitimately
	12357	* during compilation of Perl itself before the Unicode tables
	12358	* are generated) */
	12359	if (_invlist_len(PL_utf8_foldable) == 0) {
	12360	PL_utf8_foldclosures = newHV();
	12361	}
	12362	else {
	12363	/* If the folds haven't been read in, call a fold function
	12364	* to force that */
	12365	if (! PL_utf8_tofold) {
	12366	U8 dummy[UTF8_MAXBYTES+1];
	12367	STRLEN dummy_len;
	12368
	12369	/* This string is just a short named one above \xff */
	12370	to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
	12371	assert(PL_utf8_tofold); /* Verify that worked */
	12372	}
	12373	PL_utf8_foldclosures =
	12374	_swash_inversion_hash(PL_utf8_tofold);
	12375	}
	12376	}
	12377
	12378	/* Only the characters in this class that participate in folds need
	12379	* be checked. Get the intersection of this class and all the
	12380	* possible characters that are foldable. This can quickly narrow
	12381	* down a large class */
	12382	_invlist_intersection(PL_utf8_foldable, cp_list,
	12383	&fold_intersection);
	12384	}
	12385
	12386	/* Now look at the foldable characters in this class individually */
	12387	invlist_iterinit(fold_intersection);
	12388	while (invlist_iternext(fold_intersection, &start, &end)) {
	12389	UV j;
	12390
	12391	/* Locale folding for Latin1 characters is deferred until runtime */
	12392	if (LOC && start < 256) {
	12393	start = 256;
	12394	}
	12395
	12396	/* Look at every character in the range */
	12397	for (j = start; j <= end; j++) {
	12398
	12399	U8 foldbuf[UTF8_MAXBYTES_CASE+1];
	12400	STRLEN foldlen;
	12401	UV f;
	12402
	12403	if (j < 256) {
	12404
	12405	/* We have the latin1 folding rules hard-coded here so that
	12406	* an innocent-looking character class, like /[ks]/i won't
	12407	* have to go out to disk to find the possible matches.
	12408	* XXX It would be better to generate these via regen, in
	12409	* case a new version of the Unicode standard adds new
	12410	* mappings, though that is not really likely, and may be
	12411	* caught by the default: case of the switch below. */
	12412
	12413	if (PL_fold_latin1[j] != j) {
	12414
	12415	/* ASCII is always matched; non-ASCII is matched only
	12416	* under Unicode rules */
	12417	if (isASCII(j) \|\| AT_LEAST_UNI_SEMANTICS) {
	12418	cp_list =
	12419	add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
	12420	}
	12421	else {
	12422	depends_list =
	12423	add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
	12424	}
	12425	}
	12426
	12427	if (HAS_NONLATIN1_FOLD_CLOSURE(j)
	12428	&& (! isASCII(j) \|\| ! ASCII_FOLD_RESTRICTED))
	12429	{
	12430	/* Certain Latin1 characters have matches outside
	12431	* Latin1, or are multi-character. To get here, 'j' is
	12432	* one of those characters. None of these matches is
	12433	* valid for ASCII characters under /aa, which is why
	12434	* the 'if' just above excludes those. The matches
	12435	* fall into three categories:
	12436	* 1) They are singly folded-to or -from an above 255
	12437	* character, e.g., LATIN SMALL LETTER Y WITH
	12438	* DIAERESIS and LATIN CAPITAL LETTER Y WITH
	12439	* DIAERESIS;
	12440	* 2) They are part of a multi-char fold with another
	12441	* latin1 character; only LATIN SMALL LETTER
	12442	* SHARP S => "ss" fits this;
	12443	* 3) They are part of a multi-char fold with a
	12444	* character outside of Latin1, such as various
	12445	* ligatures.
	12446	* We aren't dealing fully with multi-char folds, except
	12447	* we do deal with the pattern containing a character
	12448	* that has a multi-char fold (not so much the inverse).
	12449	* For types 1) and 3), the matches only happen when the
	12450	* target string is utf8; that's not true for 2), and we
	12451	* set a flag for it.
	12452	*
	12453	* The code below adds the single fold closures for 'j'
	12454	* to the inversion list. */
	12455	switch (j) {
	12456	case 'k':
	12457	case 'K':
	12458	cp_list =
	12459	add_cp_to_invlist(cp_list, KELVIN_SIGN);
	12460	break;
	12461	case 's':
	12462	case 'S':
	12463	cp_list = add_cp_to_invlist(cp_list,
	12464	LATIN_SMALL_LETTER_LONG_S);
	12465	break;
	12466	case MICRO_SIGN:
	12467	cp_list = add_cp_to_invlist(cp_list,
	12468	GREEK_CAPITAL_LETTER_MU);
	12469	cp_list = add_cp_to_invlist(cp_list,
	12470	GREEK_SMALL_LETTER_MU);
	12471	break;
	12472	case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
	12473	case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
	12474	cp_list =
	12475	add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
	12476	break;
	12477	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	12478	cp_list = add_cp_to_invlist(cp_list,
	12479	LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
	12480	break;
	12481	case LATIN_SMALL_LETTER_SHARP_S:
	12482	cp_list = add_cp_to_invlist(cp_list,
	12483	LATIN_CAPITAL_LETTER_SHARP_S);
	12484
	12485	/* Under /a, /d, and /u, this can match the two
	12486	* chars "ss" */
	12487	if (! ASCII_FOLD_RESTRICTED) {
	12488	add_alternate(&unicode_alternate,
	12489	(U8 *) "ss", 2);
	12490
	12491	/* And under /u or /a, it can match even if
	12492	* the target is not utf8 */
	12493	if (AT_LEAST_UNI_SEMANTICS) {
	12494	ANYOF_FLAGS(ret) \|=
	12495	ANYOF_NONBITMAP_NON_UTF8;
	12496	}
	12497	}
	12498	break;
	12499	case 'F': case 'f':
	12500	case 'I': case 'i':
	12501	case 'L': case 'l':
	12502	case 'T': case 't':
	12503	case 'A': case 'a':
	12504	case 'H': case 'h':
	12505	case 'J': case 'j':
	12506	case 'N': case 'n':
	12507	case 'W': case 'w':
	12508	case 'Y': case 'y':
	12509	/* These all are targets of multi-character
	12510	* folds from code points that require UTF8 to
	12511	* express, so they can't match unless the
	12512	* target string is in UTF-8, so no action here
	12513	* is necessary, as regexec.c properly handles
	12514	* the general case for UTF-8 matching */
	12515	break;
	12516	default:
	12517	/* Use deprecated warning to increase the
	12518	* chances of this being output */
	12519	ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
	12520	break;
	12521	}
	12522	}
	12523	continue;
	12524	}
	12525
	12526	/* Here is an above Latin1 character. We don't have the rules
	12527	* hard-coded for it. First, get its fold */
	12528	f = _to_uni_fold_flags(j, foldbuf, &foldlen,
	12529	((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
	12530	\| ((LOC)
	12531	? FOLD_FLAGS_LOCALE
	12532	: (ASCII_FOLD_RESTRICTED)
	12533	? FOLD_FLAGS_NOMIX_ASCII
	12534	: 0));
	12535
	12536	if (foldlen > (STRLEN)UNISKIP(f)) {
	12537
	12538	/* Any multicharacter foldings (disallowed in lookbehind
	12539	* patterns) require the following transform: [ABCDEF] ->
	12540	* (?:[ABCabcDEFd]\|pq\|rst) where E folds into "pq" and F
	12541	* folds into "rst", all other characters fold to single
	12542	* characters. We save away these multicharacter foldings,
	12543	* to be later saved as part of the additional "s" data. */
	12544	if (! RExC_in_lookbehind) {
	12545	U8* loc = foldbuf;
	12546	U8* e = foldbuf + foldlen;
	12547
	12548	/* If any of the folded characters of this are in the
	12549	* Latin1 range, tell the regex engine that this can
	12550	* match a non-utf8 target string. */
	12551	while (loc < e) {
	12552	if (UTF8_IS_INVARIANT(*loc)
	12553	\|\| UTF8_IS_DOWNGRADEABLE_START(*loc))
	12554	{
	12555	ANYOF_FLAGS(ret)
	12556	\|= ANYOF_NONBITMAP_NON_UTF8;
	12557	break;
	12558	}
	12559	loc += UTF8SKIP(loc);
	12560	}
	12561
	12562	add_alternate(&unicode_alternate, foldbuf, foldlen);
	12563	}
	12564	}
	12565	else {
	12566	/* Single character fold of above Latin1. Add everything
	12567	* in its fold closure to the list that this node should
	12568	* match */
	12569	SV** listp;
	12570
	12571	/* The fold closures data structure is a hash with the keys
	12572	* being every character that is folded to, like 'k', and
	12573	* the values each an array of everything that folds to its
	12574	* key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
	12575	if ((listp = hv_fetch(PL_utf8_foldclosures,
	12576	(char *) foldbuf, foldlen, FALSE)))
	12577	{
	12578	AV* list = (AV) listp;
	12579	IV k;
	12580	for (k = 0; k <= av_len(list); k++) {
	12581	SV** c_p = av_fetch(list, k, FALSE);
	12582	UV c;
	12583	if (c_p == NULL) {
	12584	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	12585	}
	12586	c = SvUV(*c_p);
	12587
	12588	/* /aa doesn't allow folds between ASCII and non-;
	12589	* /l doesn't allow them between above and below
	12590	* 256 */
	12591	if ((ASCII_FOLD_RESTRICTED
	12592	&& (isASCII(c) != isASCII(j)))
	12593	\|\| (LOC && ((c < 256) != (j < 256))))
	12594	{
	12595	continue;
	12596	}
	12597
	12598	/* Folds involving non-ascii Latin1 characters
	12599	* under /d are added to a separate list */
	12600	if (isASCII(c) \|\| c > 255 \|\| AT_LEAST_UNI_SEMANTICS)
	12601	{
	12602	cp_list = add_cp_to_invlist(cp_list, c);
	12603	}
	12604	else {
	12605	depends_list = add_cp_to_invlist(depends_list, c);
	12606	}
	12607	}
	12608	}
	12609	}
	12610	}
	12611	}
	12612	SvREFCNT_dec(fold_intersection);
	12613	}
	12614
	12615	/* And combine the result (if any) with any inversion list from posix
	12616	* classes. The lists are kept separate up to now because we don't want to
	12617	* fold the classes (folding of those is automatically handled by the swash
	12618	* fetching code) */
	12619	if (posixes) {
	12620	if (! DEPENDS_SEMANTICS) {
	12621	if (cp_list) {
	12622	_invlist_union(cp_list, posixes, &cp_list);
	12623	SvREFCNT_dec(posixes);
	12624	}
	12625	else {
	12626	cp_list = posixes;
	12627	}
	12628	}
	12629	else {
	12630	/* Under /d, we put into a separate list the Latin1 things that
	12631	* match only when the target string is utf8 */
	12632	SV* nonascii_but_latin1_properties = NULL;
	12633	_invlist_intersection(posixes, PL_Latin1,
	12634	&nonascii_but_latin1_properties);
	12635	_invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
	12636	&nonascii_but_latin1_properties);
	12637	_invlist_subtract(posixes, nonascii_but_latin1_properties,
	12638	&posixes);
	12639	if (cp_list) {
	12640	_invlist_union(cp_list, posixes, &cp_list);
	12641	SvREFCNT_dec(posixes);
	12642	}
	12643	else {
	12644	cp_list = posixes;
	12645	}
	12646
	12647	if (depends_list) {
	12648	_invlist_union(depends_list, nonascii_but_latin1_properties,
	12649	&depends_list);
	12650	SvREFCNT_dec(nonascii_but_latin1_properties);
	12651	}
	12652	else {
	12653	depends_list = nonascii_but_latin1_properties;
	12654	}
	12655	}
	12656	}
	12657
	12658	/* And combine the result (if any) with any inversion list from properties.
	12659	* The lists are kept separate up to now so that we can distinguish the two
	12660	* in regards to matching above-Unicode. A run-time warning is generated
	12661	* if a Unicode property is matched against a non-Unicode code point. But,
	12662	* we allow user-defined properties to match anything, without any warning,
	12663	* and we also suppress the warning if there is a portion of the character
	12664	* class that isn't a Unicode property, and which matches above Unicode, \W
	12665	* or [\x{110000}] for example.
	12666	* (Note that in this case, unlike the Posix one above, there is no
	12667	* <depends_list>, because having a Unicode property forces Unicode
	12668	* semantics */
	12669	if (properties) {
	12670	bool warn_super = ! has_user_defined_property;
	12671	if (cp_list) {
	12672
	12673	/* If it matters to the final outcome, see if a non-property
	12674	* component of the class matches above Unicode. If so, the
	12675	* warning gets suppressed. This is true even if just a single
	12676	* such code point is specified, as though not strictly correct if
	12677	* another such code point is matched against, the fact that they
	12678	* are using above-Unicode code points indicates they should know
	12679	* the issues involved */
	12680	if (warn_super) {
	12681	bool non_prop_matches_above_Unicode =
	12682	runtime_posix_matches_above_Unicode
	12683	\| (invlist_highest(cp_list) > PERL_UNICODE_MAX);
	12684	if (invert) {
	12685	non_prop_matches_above_Unicode =
	12686	! non_prop_matches_above_Unicode;
	12687	}
	12688	warn_super = ! non_prop_matches_above_Unicode;
	12689	}
	12690
	12691	_invlist_union(properties, cp_list, &cp_list);
	12692	SvREFCNT_dec(properties);
	12693	}
	12694	else {
	12695	cp_list = properties;
	12696	}
	12697
	12698	if (warn_super) {
	12699	ANYOF_FLAGS(ret) \|= ANYOF_WARN_SUPER;
	12700	}
	12701	}
	12702
	12703	/* Here, we have calculated what code points should be in the character
	12704	* class.
	12705	*
	12706	* Now we can see about various optimizations. Fold calculation (which we
	12707	* did above) needs to take place before inversion. Otherwise /[^k]/i
	12708	* would invert to include K, which under /i would match k, which it
	12709	* shouldn't. Therefore we can't invert folded locale now, as it won't be
	12710	* folded until runtime */
	12711
	12712	/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
	12713	* at compile time. Besides not inverting folded locale now, we can't invert
	12714	* if there are things such as \w, which aren't known until runtime */
	12715	if (invert
	12716	&& ! (LOC && (FOLD \|\| (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
	12717	&& ! depends_list
	12718	&& ! unicode_alternate
	12719	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	12720	{
	12721	_invlist_invert(cp_list);
	12722
	12723	/* Any swash can't be used as-is, because we've inverted things */
	12724	if (swash) {
	12725	SvREFCNT_dec(swash);
	12726	swash = NULL;
	12727	}
	12728
	12729	/* Clear the invert flag since have just done it here */
	12730	invert = FALSE;
	12731	}
	12732
	12733	/* If we didn't do folding, it's because some information isn't available
	12734	* until runtime; set the run-time fold flag for these. (We don't have to
	12735	* worry about properties folding, as that is taken care of by the swash
	12736	* fetching) */
	12737	if (FOLD && (LOC \|\| unicode_alternate))
	12738	{
	12739	ANYOF_FLAGS(ret) \|= ANYOF_LOC_NONBITMAP_FOLD;
	12740	}
	12741
	12742	/* Some character classes are equivalent to other nodes. Such nodes take
	12743	* up less room and generally fewer operations to execute than ANYOF nodes.
	12744	* Above, we checked for and optimized into some such equivalents for
	12745	* certain common classes that are easy to test. Getting to this point in
	12746	* the code means that the class didn't get optimized there. Since this
	12747	* code is only executed in Pass 2, it is too late to save space--it has
	12748	* been allocated in Pass 1, and currently isn't given back. But turning
	12749	* things into an EXACTish node can allow the optimizer to join it to any
	12750	* adjacent such nodes. And if the class is equivalent to things like /./,
	12751	* expensive run-time swashes can be avoided. Now that we have more
	12752	* complete information, we can find things necessarily missed by the
	12753	* earlier code. I (khw) am not sure how much to look for here. It would
	12754	* be easy, but perhaps too slow, to check any candidates against all the
	12755	* node types they could possibly match using _invlistEQ(). */
	12756
	12757	if (cp_list
	12758	&& ! unicode_alternate
	12759	&& ! invert
	12760	&& ! depends_list
	12761	&& ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
	12762	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	12763	{
	12764	UV start, end;
	12765	U8 op = END; /* The optimzation node-type */
	12766	const char * cur_parse= RExC_parse;
	12767
	12768	invlist_iterinit(cp_list);
	12769	if (! invlist_iternext(cp_list, &start, &end)) {
	12770
	12771	/* Here, the list is empty. This happens, for example, when a
	12772	* Unicode property is the only thing in the character class, and
	12773	* it doesn't match anything. (perluniprops.pod notes such
	12774	* properties) */
	12775	op = OPFAIL;
	12776	*flagp \|= HASWIDTH\|SIMPLE;
	12777	}
	12778	else if (start == end) { /* The range is a single code point */
	12779	if (! invlist_iternext(cp_list, &start, &end)
	12780
	12781	/* Don't do this optimization if it would require changing
	12782	* the pattern to UTF-8 */
	12783	&& (start < 256 \|\| UTF))
	12784	{
	12785	/* Here, the list contains a single code point. Can optimize
	12786	* into an EXACT node */
	12787
	12788	value = start;
	12789
	12790	if (! FOLD) {
	12791	op = EXACT;
	12792	}
	12793	else if (LOC) {
	12794
	12795	/* A locale node under folding with one code point can be
	12796	* an EXACTFL, as its fold won't be calculated until
	12797	* runtime */
	12798	op = EXACTFL;
	12799	}
	12800	else {
	12801
	12802	/* Here, we are generally folding, but there is only one
	12803	* code point to match. If we have to, we use an EXACT
	12804	* node, but it would be better for joining with adjacent
	12805	* nodes in the optimization pass if we used the same
	12806	* EXACTFish node that any such are likely to be. We can
	12807	* do this iff the code point doesn't participate in any
	12808	* folds. For example, an EXACTF of a colon is the same as
	12809	* an EXACT one, since nothing folds to or from a colon.
	12810	* In the Latin1 range, being an alpha means that the
	12811	* character participates in a fold (except for the
	12812	* feminine and masculine ordinals, which I (khw) don't
	12813	* think are worrying about optimizing for). */
	12814	if (value < 256) {
	12815	if (isALPHA_L1(value)) {
	12816	op = EXACT;
	12817	}
	12818	}
	12819	else {
	12820	if (! PL_utf8_foldable) {
	12821	SV* swash = swash_init("utf8", "_Perl_Any_Folds",
	12822	&PL_sv_undef, 1, 0);
	12823	PL_utf8_foldable = _get_swash_invlist(swash);
	12824	SvREFCNT_dec(swash);
	12825	}
	12826	if (_invlist_contains_cp(PL_utf8_foldable, value)) {
	12827	op = EXACT;
	12828	}
	12829	}
	12830
	12831	/* If we haven't found the node type, above, it means we
	12832	* can use the prevailing one */
	12833	if (op == END) {
	12834	op = compute_EXACTish(pRExC_state);
	12835	}
	12836	}
	12837	}
	12838	}
	12839	else if (start == 0) {
	12840	if (end == UV_MAX) {
	12841	op = SANY;
	12842	*flagp \|= HASWIDTH\|SIMPLE;
	12843	RExC_naughty++;
	12844	}
	12845	else if (end == '\n' - 1
	12846	&& invlist_iternext(cp_list, &start, &end)
	12847	&& start == '\n' + 1 && end == UV_MAX)
	12848	{
	12849	op = REG_ANY;
	12850	*flagp \|= HASWIDTH\|SIMPLE;
	12851	RExC_naughty++;
	12852	}
	12853	}
	12854
	12855	if (op != END) {
	12856	RExC_parse = (char *)orig_parse;
	12857	RExC_emit = (regnode *)orig_emit;
	12858
	12859	ret = reg_node(pRExC_state, op);
	12860
	12861	RExC_parse = (char *)cur_parse;
	12862
	12863	if (PL_regkind[op] == EXACT) {
	12864	alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
	12865	}
	12866
	12867	SvREFCNT_dec(listsv);
	12868	return ret;
	12869	}
	12870	}
	12871
	12872	/* Here, <cp_list> contains all the code points we can determine at
	12873	* compile time that match under all conditions. Go through it, and
	12874	* for things that belong in the bitmap, put them there, and delete from
	12875	* <cp_list>. While we are at it, see if everything above 255 is in the
	12876	* list, and if so, set a flag to speed up execution */
	12877	ANYOF_BITMAP_ZERO(ret);
	12878	if (cp_list) {
	12879
	12880	/* This gets set if we actually need to modify things */
	12881	bool change_invlist = FALSE;
	12882
	12883	UV start, end;
	12884
	12885	/* Start looking through <cp_list> */
	12886	invlist_iterinit(cp_list);
	12887	while (invlist_iternext(cp_list, &start, &end)) {
	12888	UV high;
	12889	int i;
	12890
	12891	if (end == UV_MAX && start <= 256) {
	12892	ANYOF_FLAGS(ret) \|= ANYOF_UNICODE_ALL;
	12893	}
	12894
	12895	/* Quit if are above what we should change */
	12896	if (start > 255) {
	12897	break;
	12898	}
	12899
	12900	change_invlist = TRUE;
	12901
	12902	/* Set all the bits in the range, up to the max that we are doing */
	12903	high = (end < 255) ? end : 255;
	12904	for (i = start; i <= (int) high; i++) {
	12905	if (! ANYOF_BITMAP_TEST(ret, i)) {
	12906	ANYOF_BITMAP_SET(ret, i);
	12907	prevvalue = value;
	12908	value = i;
	12909	}
	12910	}
	12911	}
	12912
	12913	/* Done with loop; remove any code points that are in the bitmap from
	12914	* <cp_list> */
	12915	if (change_invlist) {
	12916	_invlist_subtract(cp_list, PL_Latin1, &cp_list);
	12917	}
	12918
	12919	/* If have completely emptied it, remove it completely */
	12920	if (_invlist_len(cp_list) == 0) {
	12921	SvREFCNT_dec(cp_list);
	12922	cp_list = NULL;
	12923	}
	12924	}
	12925
	12926	if (invert) {
	12927	ANYOF_FLAGS(ret) \|= ANYOF_INVERT;
	12928	}
	12929
	12930	/* Here, the bitmap has been populated with all the Latin1 code points that
	12931	* always match. Can now add to the overall list those that match only
	12932	* when the target string is UTF-8 (<depends_list>). */
	12933	if (depends_list) {
	12934	if (cp_list) {
	12935	_invlist_union(cp_list, depends_list, &cp_list);
	12936	SvREFCNT_dec(depends_list);
	12937	}
	12938	else {
	12939	cp_list = depends_list;
	12940	}
	12941	}
	12942
	12943	/* If there is a swash and more than one element, we can't use the swash in
	12944	* the optimization below. */
	12945	if (swash && element_count > 1) {
	12946	SvREFCNT_dec(swash);
	12947	swash = NULL;
	12948	}
	12949
	12950	if (! cp_list
	12951	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	12952	&& ! unicode_alternate)
	12953	{
	12954	ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
	12955	SvREFCNT_dec(listsv);
	12956	SvREFCNT_dec(unicode_alternate);
	12957	}
	12958	else {
	12959	/* av[0] stores the character class description in its textual form:
	12960	* used later (regexec.c:Perl_regclass_swash()) to initialize the
	12961	* appropriate swash, and is also useful for dumping the regnode.
	12962	* av[1] if NULL, is a placeholder to later contain the swash computed
	12963	* from av[0]. But if no further computation need be done, the
	12964	* swash is stored there now.
	12965	* av[2] stores the multicharacter foldings, used later in
	12966	* regexec.c:S_reginclass().
	12967	* av[3] stores the cp_list inversion list for use in addition or
	12968	* instead of av[0]; used only if av[1] is NULL
	12969	* av[4] is set if any component of the class is from a user-defined
	12970	* property; used only if av[1] is NULL */
	12971	AV * const av = newAV();
	12972	SV *rv;
	12973
	12974	av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
	12975	? listsv
	12976	: &PL_sv_undef);
	12977	if (swash) {
	12978	av_store(av, 1, swash);
	12979	SvREFCNT_dec(cp_list);
	12980	}
	12981	else {
	12982	av_store(av, 1, NULL);
	12983	if (cp_list) {
	12984	av_store(av, 3, cp_list);
	12985	av_store(av, 4, newSVuv(has_user_defined_property));
	12986	}
	12987	}
	12988
	12989	/* Store any computed multi-char folds only if we are allowing
	12990	* them */
	12991	if (allow_full_fold) {
	12992	av_store(av, 2, MUTABLE_SV(unicode_alternate));
	12993	if (unicode_alternate) { /* This node is variable length */
	12994	OP(ret) = ANYOFV;
	12995	}
	12996	}
	12997	else {
	12998	av_store(av, 2, NULL);
	12999	}
	13000	rv = newRV_noinc(MUTABLE_SV(av));
	13001	n = add_data(pRExC_state, 1, "s");
	13002	RExC_rxi->data->data[n] = (void*)rv;
	13003	ARG_SET(ret, n);
	13004	}
	13005
	13006	*flagp \|= HASWIDTH\|SIMPLE;
	13007	return ret;
	13008	}
	13009	#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
	13010
	13011
	13012	/* reg_skipcomment()
	13013
	13014	Absorbs an /x style # comments from the input stream.
	13015	Returns true if there is more text remaining in the stream.
	13016	Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
	13017	terminates the pattern without including a newline.
	13018
	13019	Note its the callers responsibility to ensure that we are
	13020	actually in /x mode
	13021
	13022	*/
	13023
	13024	STATIC bool
	13025	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
	13026	{
	13027	bool ended = 0;
	13028
	13029	PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
	13030
	13031	while (RExC_parse < RExC_end)
	13032	if (*RExC_parse++ == '\n') {
	13033	ended = 1;
	13034	break;
	13035	}
	13036	if (!ended) {
	13037	/* we ran off the end of the pattern without ending
	13038	the comment, so we have to add an \n when wrapping */
	13039	RExC_seen \|= REG_SEEN_RUN_ON_COMMENT;
	13040	return 0;
	13041	} else
	13042	return 1;
	13043	}
	13044
	13045	/* nextchar()
	13046
	13047	Advances the parse position, and optionally absorbs
	13048	"whitespace" from the inputstream.
	13049
	13050	Without /x "whitespace" means (?#...) style comments only,
	13051	with /x this means (?#...) and # comments and whitespace proper.
	13052
	13053	Returns the RExC_parse point from BEFORE the scan occurs.
	13054
	13055	This is the /x friendly way of saying RExC_parse++.
	13056	*/
	13057
	13058	STATIC char*
	13059	S_nextchar(pTHX_ RExC_state_t *pRExC_state)
	13060	{
	13061	char* const retval = RExC_parse++;
	13062
	13063	PERL_ARGS_ASSERT_NEXTCHAR;
	13064
	13065	for (;;) {
	13066	if (RExC_end - RExC_parse >= 3
	13067	&& *RExC_parse == '('
	13068	&& RExC_parse[1] == '?'
	13069	&& RExC_parse[2] == '#')
	13070	{
	13071	while (*RExC_parse != ')') {
	13072	if (RExC_parse == RExC_end)
	13073	FAIL("Sequence (?#... not terminated");
	13074	RExC_parse++;
	13075	}
	13076	RExC_parse++;
	13077	continue;
	13078	}
	13079	if (RExC_flags & RXf_PMf_EXTENDED) {
	13080	if (isSPACE(*RExC_parse)) {
	13081	RExC_parse++;
	13082	continue;
	13083	}
	13084	else if (*RExC_parse == '#') {
	13085	if ( reg_skipcomment( pRExC_state ) )
	13086	continue;
	13087	}
	13088	}
	13089	return retval;
	13090	}
	13091	}
	13092
	13093	/*
	13094	- reg_node - emit a node
	13095	*/
	13096	STATIC regnode * /* Location. */
	13097	S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
	13098	{
	13099	dVAR;
	13100	regnode *ptr;
	13101	regnode * const ret = RExC_emit;
	13102	GET_RE_DEBUG_FLAGS_DECL;
	13103
	13104	PERL_ARGS_ASSERT_REG_NODE;
	13105
	13106	if (SIZE_ONLY) {
	13107	SIZE_ALIGN(RExC_size);
	13108	RExC_size += 1;
	13109	return(ret);
	13110	}
	13111	if (RExC_emit >= RExC_emit_bound)
	13112	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	13113	op, RExC_emit, RExC_emit_bound);
	13114
	13115	NODE_ALIGN_FILL(ret);
	13116	ptr = ret;
	13117	FILL_ADVANCE_NODE(ptr, op);
	13118	#ifdef RE_TRACK_PATTERN_OFFSETS
	13119	if (RExC_offsets) { /* MJD */
	13120	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
	13121	"reg_node", __LINE__,
	13122	PL_reg_name[op],
	13123	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
	13124	? "Overwriting end of array!\n" : "OK",
	13125	(UV)(RExC_emit - RExC_emit_start),
	13126	(UV)(RExC_parse - RExC_start),
	13127	(UV)RExC_offsets[0]));
	13128	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
	13129	}
	13130	#endif
	13131	RExC_emit = ptr;
	13132	return(ret);
	13133	}
	13134
	13135	/*
	13136	- reganode - emit a node with an argument
	13137	*/
	13138	STATIC regnode * /* Location. */
	13139	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
	13140	{
	13141	dVAR;
	13142	regnode *ptr;
	13143	regnode * const ret = RExC_emit;
	13144	GET_RE_DEBUG_FLAGS_DECL;
	13145
	13146	PERL_ARGS_ASSERT_REGANODE;
	13147
	13148	if (SIZE_ONLY) {
	13149	SIZE_ALIGN(RExC_size);
	13150	RExC_size += 2;
	13151	/*
	13152	We can't do this:
	13153
	13154	assert(2==regarglen[op]+1);
	13155
	13156	Anything larger than this has to allocate the extra amount.
	13157	If we changed this to be:
	13158
	13159	RExC_size += (1 + regarglen[op]);
	13160
	13161	then it wouldn't matter. Its not clear what side effect
	13162	might come from that so its not done so far.
	13163	-- dmq
	13164	*/
	13165	return(ret);
	13166	}
	13167	if (RExC_emit >= RExC_emit_bound)
	13168	Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
	13169	op, RExC_emit, RExC_emit_bound);
	13170
	13171	NODE_ALIGN_FILL(ret);
	13172	ptr = ret;
	13173	FILL_ADVANCE_NODE_ARG(ptr, op, arg);
	13174	#ifdef RE_TRACK_PATTERN_OFFSETS
	13175	if (RExC_offsets) { /* MJD */
	13176	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	13177	"reganode",
	13178	__LINE__,
	13179	PL_reg_name[op],
	13180	(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
	13181	"Overwriting end of array!\n" : "OK",
	13182	(UV)(RExC_emit - RExC_emit_start),
	13183	(UV)(RExC_parse - RExC_start),
	13184	(UV)RExC_offsets[0]));
	13185	Set_Cur_Node_Offset;
	13186	}
	13187	#endif
	13188	RExC_emit = ptr;
	13189	return(ret);
	13190	}
	13191
	13192	/*
	13193	- reguni - emit (if appropriate) a Unicode character
	13194	*/
	13195	STATIC STRLEN
	13196	S_reguni(pTHX_ const RExC_state_t pRExC_state, UV uv, char s)
	13197	{
	13198	dVAR;
	13199
	13200	PERL_ARGS_ASSERT_REGUNI;
	13201
	13202	return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8)s, uv) - (U8)s);
	13203	}
	13204
	13205	/*
	13206	- reginsert - insert an operator in front of already-emitted operand
	13207	*
	13208	* Means relocating the operand.
	13209	*/
	13210	STATIC void
	13211	S_reginsert(pTHX_ RExC_state_t pRExC_state, U8 op, regnode opnd, U32 depth)
	13212	{
	13213	dVAR;
	13214	regnode *src;
	13215	regnode *dst;
	13216	regnode *place;
	13217	const int offset = regarglen[(U8)op];
	13218	const int size = NODE_STEP_REGNODE + offset;
	13219	GET_RE_DEBUG_FLAGS_DECL;
	13220
	13221	PERL_ARGS_ASSERT_REGINSERT;
	13222	PERL_UNUSED_ARG(depth);
	13223	/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
	13224	DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
	13225	if (SIZE_ONLY) {
	13226	RExC_size += size;
	13227	return;
	13228	}
	13229
	13230	src = RExC_emit;
	13231	RExC_emit += size;
	13232	dst = RExC_emit;
	13233	if (RExC_open_parens) {
	13234	int paren;
	13235	/DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);/
	13236	for ( paren=0 ; paren < RExC_npar ; paren++ ) {
	13237	if ( RExC_open_parens[paren] >= opnd ) {
	13238	/DEBUG_PARSE_FMT("open"," - %d",size);/
	13239	RExC_open_parens[paren] += size;
	13240	} else {
	13241	/DEBUG_PARSE_FMT("open"," - %s","ok");/
	13242	}
	13243	if ( RExC_close_parens[paren] >= opnd ) {
	13244	/DEBUG_PARSE_FMT("close"," - %d",size);/
	13245	RExC_close_parens[paren] += size;
	13246	} else {
	13247	/DEBUG_PARSE_FMT("close"," - %s","ok");/
	13248	}
	13249	}
	13250	}
	13251
	13252	while (src > opnd) {
	13253	StructCopy(--src, --dst, regnode);
	13254	#ifdef RE_TRACK_PATTERN_OFFSETS
	13255	if (RExC_offsets) { /* MJD 20010112 */
	13256	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
	13257	"reg_insert",
	13258	__LINE__,
	13259	PL_reg_name[op],
	13260	(UV)(dst - RExC_emit_start) > RExC_offsets[0]
	13261	? "Overwriting end of array!\n" : "OK",
	13262	(UV)(src - RExC_emit_start),
	13263	(UV)(dst - RExC_emit_start),
	13264	(UV)RExC_offsets[0]));
	13265	Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
	13266	Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
	13267	}
	13268	#endif
	13269	}
	13270
	13271
	13272	place = opnd; /* Op node, where operand used to be. */
	13273	#ifdef RE_TRACK_PATTERN_OFFSETS
	13274	if (RExC_offsets) { /* MJD */
	13275	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
	13276	"reginsert",
	13277	__LINE__,
	13278	PL_reg_name[op],
	13279	(UV)(place - RExC_emit_start) > RExC_offsets[0]
	13280	? "Overwriting end of array!\n" : "OK",
	13281	(UV)(place - RExC_emit_start),
	13282	(UV)(RExC_parse - RExC_start),
	13283	(UV)RExC_offsets[0]));
	13284	Set_Node_Offset(place, RExC_parse);
	13285	Set_Node_Length(place, 1);
	13286	}
	13287	#endif
	13288	src = NEXTOPER(place);
	13289	FILL_ADVANCE_NODE(place, op);
	13290	Zero(src, offset, regnode);
	13291	}
	13292
	13293	/*
	13294	- regtail - set the next-pointer at the end of a node chain of p to val.
	13295	- SEE ALSO: regtail_study
	13296	*/
	13297	/* TODO: All three parms should be const */
	13298	STATIC void
	13299	S_regtail(pTHX_ RExC_state_t pRExC_state, regnode p, const regnode *val,U32 depth)
	13300	{
	13301	dVAR;
	13302	regnode *scan;
	13303	GET_RE_DEBUG_FLAGS_DECL;
	13304
	13305	PERL_ARGS_ASSERT_REGTAIL;
	13306	#ifndef DEBUGGING
	13307	PERL_UNUSED_ARG(depth);
	13308	#endif
	13309
	13310	if (SIZE_ONLY)
	13311	return;
	13312
	13313	/* Find last node. */
	13314	scan = p;
	13315	for (;;) {
	13316	regnode * const temp = regnext(scan);
	13317	DEBUG_PARSE_r({
	13318	SV * const mysv=sv_newmortal();
	13319	DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
	13320	regprop(RExC_rx, mysv, scan);
	13321	PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
	13322	SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
	13323	(temp == NULL ? "->" : ""),
	13324	(temp == NULL ? PL_reg_name[OP(val)] : "")
	13325	);
	13326	});
	13327	if (temp == NULL)
	13328	break;
	13329	scan = temp;
	13330	}
	13331
	13332	if (reg_off_by_arg[OP(scan)]) {
	13333	ARG_SET(scan, val - scan);
	13334	}
	13335	else {
	13336	NEXT_OFF(scan) = val - scan;
	13337	}
	13338	}
	13339
	#ifdef DEBUGGING
	/*
	 - regtail_study - set the next-pointer at the end of a node chain of p to val.
	 - Look for optimizable sequences at the same time.
	 - currently only looks for EXACT chains.

	    This is experimental code. The idea is to use this routine to perform
	    in place optimizations on branches and groups as they are constructed,
	    with the long term intention of removing optimization from study_chunk so
	    that it is purely analytical.

	    Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
	    to control which is which.

	    Return value: PSEUDO when in the SIZE_ONLY pass or when no EXACT-type
	    node was met on the chain (only NOTHING nodes); the opcode shared by all
	    EXACT-type nodes on the chain when they agree; 0 when they differ or any
	    other kind of node is present.  Under EXPERIMENTAL_INPLACESCAN, EXACT is
	    returned early if join_exact() reports success.
	*/
	/* TODO: All four parms should be const */

	STATIC U8
	S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
	{
	    dVAR;
	    regnode *scan;
	    U8 exact = PSEUDO;
	#ifdef EXPERIMENTAL_INPLACESCAN
	    I32 min = 0;
	#endif
	    GET_RE_DEBUG_FLAGS_DECL;

	    PERL_ARGS_ASSERT_REGTAIL_STUDY;


	    if (SIZE_ONLY)
	        return exact;

	    /* Find last node. */

	    scan = p;
	    for (;;) {
	        regnode * const temp = regnext(scan);
	#ifdef EXPERIMENTAL_INPLACESCAN
	        if (PL_regkind[OP(scan)] == EXACT) {
	            bool has_exactf_sharp_s;	/* Unexamined in this routine */
	            if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
	                return EXACT;
	        }
	#endif
	        /* Once 'exact' has dropped to 0 it stays there. */
	        if ( exact ) {
	            switch (OP(scan)) {
	                case EXACT:
	                case EXACTF:
	                case EXACTFA:
	                case EXACTFU:
	                case EXACTFU_SS:
	                case EXACTFU_TRICKYFOLD:
	                case EXACTFL:
	                        if( exact == PSEUDO )
	                            exact= OP(scan);
	                        else if ( exact != OP(scan) )
	                            exact= 0;
	                        /* FALLTHROUGH */
	                case NOTHING:
	                    break;
	                default:
	                    exact= 0;
	            }
	        }
	        DEBUG_PARSE_r({
	            SV * const mysv=sv_newmortal();
	            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
	            regprop(RExC_rx, mysv, scan);
	            PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
	                SvPV_nolen_const(mysv),
	                REG_NODE_NUM(scan),
	                PL_reg_name[exact]);
	        });
	        if (temp == NULL)
	            break;
	        scan = temp;
	    }
	    DEBUG_PARSE_r({
	        SV * const mysv_val=sv_newmortal();
	        DEBUG_PARSE_MSG("");
	        regprop(RExC_rx, mysv_val, val);
	        PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
	                      SvPV_nolen_const(mysv_val),
	                      (IV)REG_NODE_NUM(val),
	                      (IV)(val - scan)
	        );
	    });
	    /* Same linkage step as regtail(): the offset lives either in the
	     * argument field or in NEXT_OFF, depending on the opcode. */
	    if (reg_off_by_arg[OP(scan)]) {
	        ARG_SET(scan, val - scan);
	    }
	    else {
	        NEXT_OFF(scan) = val - scan;
	    }

	    return exact;
	}
	#endif
	13438
	13439	/*
	13440	- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
	13441	*/
	#ifdef DEBUGGING
	/* Print to Perl_debug_log the names of the bits set in 'flags', taken from
	 * PL_reg_extflags_name[], followed by the name of the character set that
	 * get_regex_charset() extracts from them (nothing is printed for
	 * REGEX_DEPENDS_CHARSET).  If 'lead' is non-NULL it is printed once before
	 * the first item and the line is terminated; when nothing at all was set,
	 * "<lead>[none-set]" is printed instead.  With a NULL 'lead' no prefix and
	 * no newline are emitted. */
	static void
	S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
	{
	    int bit;
	    int set=0;
	    regex_charset cs;

	    for (bit=0; bit<32; bit++) {
	        /* Use an unsigned 1: shifting a signed int 1 left by 31 is
	         * undefined behaviour. */
	        if (flags & (1U<<bit)) {
	            if ((1U<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
	                continue;
	            }
	            if (!set++ && lead)
	                PerlIO_printf(Perl_debug_log, "%s",lead);
	            PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
	        }
	    }
	    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
	        if (!set++ && lead) {
	            PerlIO_printf(Perl_debug_log, "%s",lead);
	        }
	        switch (cs) {
	            case REGEX_UNICODE_CHARSET:
	                PerlIO_printf(Perl_debug_log, "UNICODE");
	                break;
	            case REGEX_LOCALE_CHARSET:
	                PerlIO_printf(Perl_debug_log, "LOCALE");
	                break;
	            case REGEX_ASCII_RESTRICTED_CHARSET:
	                PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
	                break;
	            case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	                PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
	                break;
	            default:
	                PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
	                break;
	        }
	    }
	    if (lead)  {
	        if (set)
	            PerlIO_printf(Perl_debug_log, "\n");
	        else
	            PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
	    }
	}
	#endif
	13490
	13491	void
	13492	Perl_regdump(pTHX_ const regexp *r)
	13493	{
	13494	#ifdef DEBUGGING
	13495	dVAR;
	13496	SV * const sv = sv_newmortal();
	13497	SV *dsv= sv_newmortal();
	13498	RXi_GET_DECL(r,ri);
	13499	GET_RE_DEBUG_FLAGS_DECL;
	13500
	13501	PERL_ARGS_ASSERT_REGDUMP;
	13502
	13503	(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
	13504
	13505	/* Header fields of interest. */
	13506	if (r->anchored_substr) {
	13507	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
	13508	RE_SV_DUMPLEN(r->anchored_substr), 30);
	13509	PerlIO_printf(Perl_debug_log,
	13510	"anchored %s%s at %"IVdf" ",
	13511	s, RE_SV_TAIL(r->anchored_substr),
	13512	(IV)r->anchored_offset);
	13513	} else if (r->anchored_utf8) {
	13514	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
	13515	RE_SV_DUMPLEN(r->anchored_utf8), 30);
	13516	PerlIO_printf(Perl_debug_log,
	13517	"anchored utf8 %s%s at %"IVdf" ",
	13518	s, RE_SV_TAIL(r->anchored_utf8),
	13519	(IV)r->anchored_offset);
	13520	}
	13521	if (r->float_substr) {
	13522	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
	13523	RE_SV_DUMPLEN(r->float_substr), 30);
	13524	PerlIO_printf(Perl_debug_log,
	13525	"floating %s%s at %"IVdf"..%"UVuf" ",
	13526	s, RE_SV_TAIL(r->float_substr),
	13527	(IV)r->float_min_offset, (UV)r->float_max_offset);
	13528	} else if (r->float_utf8) {
	13529	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
	13530	RE_SV_DUMPLEN(r->float_utf8), 30);
	13531	PerlIO_printf(Perl_debug_log,
	13532	"floating utf8 %s%s at %"IVdf"..%"UVuf" ",
	13533	s, RE_SV_TAIL(r->float_utf8),
	13534	(IV)r->float_min_offset, (UV)r->float_max_offset);
	13535	}
	13536	if (r->check_substr \|\| r->check_utf8)
	13537	PerlIO_printf(Perl_debug_log,
	13538	(const char *)
	13539	(r->check_substr == r->float_substr
	13540	&& r->check_utf8 == r->float_utf8
	13541	? "(checking floating" : "(checking anchored"));
	13542	if (r->extflags & RXf_NOSCAN)
	13543	PerlIO_printf(Perl_debug_log, " noscan");
	13544	if (r->extflags & RXf_CHECK_ALL)
	13545	PerlIO_printf(Perl_debug_log, " isall");
	13546	if (r->check_substr \|\| r->check_utf8)
	13547	PerlIO_printf(Perl_debug_log, ") ");
	13548
	13549	if (ri->regstclass) {
	13550	regprop(r, sv, ri->regstclass);
	13551	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
	13552	}
	13553	if (r->extflags & RXf_ANCH) {
	13554	PerlIO_printf(Perl_debug_log, "anchored");
	13555	if (r->extflags & RXf_ANCH_BOL)
	13556	PerlIO_printf(Perl_debug_log, "(BOL)");
	13557	if (r->extflags & RXf_ANCH_MBOL)
	13558	PerlIO_printf(Perl_debug_log, "(MBOL)");
	13559	if (r->extflags & RXf_ANCH_SBOL)
	13560	PerlIO_printf(Perl_debug_log, "(SBOL)");
	13561	if (r->extflags & RXf_ANCH_GPOS)
	13562	PerlIO_printf(Perl_debug_log, "(GPOS)");
	13563	PerlIO_putc(Perl_debug_log, ' ');
	13564	}
	13565	if (r->extflags & RXf_GPOS_SEEN)
	13566	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
	13567	if (r->intflags & PREGf_SKIP)
	13568	PerlIO_printf(Perl_debug_log, "plus ");
	13569	if (r->intflags & PREGf_IMPLICIT)
	13570	PerlIO_printf(Perl_debug_log, "implicit ");
	13571	PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
	13572	if (r->extflags & RXf_EVAL_SEEN)
	13573	PerlIO_printf(Perl_debug_log, "with eval ");
	13574	PerlIO_printf(Perl_debug_log, "\n");
	13575	DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
	13576	#else
	13577	PERL_ARGS_ASSERT_REGDUMP;
	13578	PERL_UNUSED_CONTEXT;
	13579	PERL_UNUSED_ARG(r);
	13580	#endif /* DEBUGGING */
	13581	}
	13582
	13583	/*
	13584	- regprop - printable representation of opcode
	13585	*/
	/* If do_sep is true, append "][" (wrapped in the PL_colors[1]/PL_colors[0]
	 * strings) to sv, re-emit "^" when flags has ANYOF_INVERT set, and reset
	 * do_sep to 0.  do_sep must be a modifiable lvalue. */
	#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
	STMT_START { \
	    if (do_sep) {                           \
	        Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
	        if ((flags) & ANYOF_INVERT)         \
	            /*make sure the invert info is in each */ \
	            sv_catpvs(sv, "^");             \
	        do_sep = 0;                         \
	    }                                       \
	} STMT_END
	13596
	13597	void
	13598	Perl_regprop(pTHX_ const regexp prog, SV sv, const regnode *o)
	13599	{
	13600	#ifdef DEBUGGING
	13601	dVAR;
	13602	int k;
	13603
	13604	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
	13605	static const char * const anyofs[] = {
	13606	"\\w",
	13607	"\\W",
	13608	"\\s",
	13609	"\\S",
	13610	"\\d",
	13611	"\\D",
	13612	"[:alnum:]",
	13613	"[:^alnum:]",
	13614	"[:alpha:]",
	13615	"[:^alpha:]",
	13616	"[:ascii:]",
	13617	"[:^ascii:]",
	13618	"[:cntrl:]",
	13619	"[:^cntrl:]",
	13620	"[:graph:]",
	13621	"[:^graph:]",
	13622	"[:lower:]",
	13623	"[:^lower:]",
	13624	"[:print:]",
	13625	"[:^print:]",
	13626	"[:punct:]",
	13627	"[:^punct:]",
	13628	"[:upper:]",
	13629	"[:^upper:]",
	13630	"[:xdigit:]",
	13631	"[:^xdigit:]",
	13632	"[:space:]",
	13633	"[:^space:]",
	13634	"[:blank:]",
	13635	"[:^blank:]"
	13636	};
	13637	RXi_GET_DECL(prog,progi);
	13638	GET_RE_DEBUG_FLAGS_DECL;
	13639
	13640	PERL_ARGS_ASSERT_REGPROP;
	13641
	13642	sv_setpvs(sv, "");
	13643
	13644	if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
	13645	/* It would be nice to FAIL() here, but this may be called from
	13646	regexec.c, and it would be hard to supply pRExC_state. */
	13647	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
	13648	sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
	13649
	13650	k = PL_regkind[OP(o)];
	13651
	13652	if (k == EXACT) {
	13653	sv_catpvs(sv, " ");
	13654	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
	13655	* is a crude hack but it may be the best for now since
	13656	* we have no flag "this EXACTish node was UTF-8"
	13657	* --jhi */
	13658	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
	13659	PERL_PV_ESCAPE_UNI_DETECT \|
	13660	PERL_PV_ESCAPE_NONASCII \|
	13661	PERL_PV_PRETTY_ELLIPSES \|
	13662	PERL_PV_PRETTY_LTGT \|
	13663	PERL_PV_PRETTY_NOCLEAR
	13664	);
	13665	} else if (k == TRIE) {
	13666	/* print the details of the trie in dumpuntil instead, as
	13667	* progi->data isn't available here */
	13668	const char op = OP(o);
	13669	const U32 n = ARG(o);
	13670	const reg_ac_data * const ac = IS_TRIE_AC(op) ?
	13671	(reg_ac_data *)progi->data->data[n] :
	13672	NULL;
	13673	const reg_trie_data * const trie
	13674	= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
	13675
	13676	Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
	13677	DEBUG_TRIE_COMPILE_r(
	13678	Perl_sv_catpvf(aTHX_ sv,
	13679	"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
	13680	(UV)trie->startstate,
	13681	(IV)trie->statecount-1, /* -1 because of the unused 0 element */
	13682	(UV)trie->wordcount,
	13683	(UV)trie->minlen,
	13684	(UV)trie->maxlen,
	13685	(UV)TRIE_CHARCOUNT(trie),
	13686	(UV)trie->uniquecharcount
	13687	)
	13688	);
	13689	if ( IS_ANYOF_TRIE(op) \|\| trie->bitmap ) {
	13690	int i;
	13691	int rangestart = -1;
	13692	U8* bitmap = IS_ANYOF_TRIE(op) ? (U8)ANYOF_BITMAP(o) : (U8)TRIE_BITMAP(trie);
	13693	sv_catpvs(sv, "[");
	13694	for (i = 0; i <= 256; i++) {
	13695	if (i < 256 && BITMAP_TEST(bitmap,i)) {
	13696	if (rangestart == -1)
	13697	rangestart = i;
	13698	} else if (rangestart != -1) {
	13699	if (i <= rangestart + 3)
	13700	for (; rangestart < i; rangestart++)
	13701	put_byte(sv, rangestart);
	13702	else {
	13703	put_byte(sv, rangestart);
	13704	sv_catpvs(sv, "-");
	13705	put_byte(sv, i - 1);
	13706	}
	13707	rangestart = -1;
	13708	}
	13709	}
	13710	sv_catpvs(sv, "]");
	13711	}
	13712
	13713	} else if (k == CURLY) {
	13714	if (OP(o) == CURLYM \|\| OP(o) == CURLYN \|\| OP(o) == CURLYX)
	13715	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
	13716	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
	13717	}
	13718	else if (k == WHILEM && o->flags) /* Ordinal/of */
	13719	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
	13720	else if (k == REF \|\| k == OPEN \|\| k == CLOSE \|\| k == GROUPP \|\| OP(o)==ACCEPT) {
	13721	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
	13722	if ( RXp_PAREN_NAMES(prog) ) {
	13723	if ( k != REF \|\| (OP(o) < NREF)) {
	13724	AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
	13725	SV **name= av_fetch(list, ARG(o), 0 );
	13726	if (name)
	13727	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	13728	}
	13729	else {
	13730	AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
	13731	SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
	13732	I32 nums=(I32)SvPVX(sv_dat);
	13733	SV **name= av_fetch(list, nums[0], 0 );
	13734	I32 n;
	13735	if (name) {
	13736	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	13737	Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
	13738	(n ? "," : ""), (IV)nums[n]);
	13739	}
	13740	Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
	13741	}
	13742	}
	13743	}
	13744	} else if (k == GOSUB)
	13745	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
	13746	else if (k == VERB) {
	13747	if (!o->flags)
	13748	Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
	13749	SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
	13750	} else if (k == LOGICAL)
	13751	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
	13752	else if (k == ANYOF) {
	13753	int i, rangestart = -1;
	13754	const U8 flags = ANYOF_FLAGS(o);
	13755	int do_sep = 0;
	13756
	13757
	13758	if (flags & ANYOF_LOCALE)
	13759	sv_catpvs(sv, "{loc}");
	13760	if (flags & ANYOF_LOC_NONBITMAP_FOLD)
	13761	sv_catpvs(sv, "{i}");
	13762	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
	13763	if (flags & ANYOF_INVERT)
	13764	sv_catpvs(sv, "^");
	13765
	13766	/* output what the standard cp 0-255 bitmap matches */
	13767	for (i = 0; i <= 256; i++) {
	13768	if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
	13769	if (rangestart == -1)
	13770	rangestart = i;
	13771	} else if (rangestart != -1) {
	13772	if (i <= rangestart + 3)
	13773	for (; rangestart < i; rangestart++)
	13774	put_byte(sv, rangestart);
	13775	else {
	13776	put_byte(sv, rangestart);
	13777	sv_catpvs(sv, "-");
	13778	put_byte(sv, i - 1);
	13779	}
	13780	do_sep = 1;
	13781	rangestart = -1;
	13782	}
	13783	}
	13784
	13785	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	13786	/* output any special charclass tests (used entirely under use locale) */
	13787	if (ANYOF_CLASS_TEST_ANY_SET(o))
	13788	for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
	13789	if (ANYOF_CLASS_TEST(o,i)) {
	13790	sv_catpv(sv, anyofs[i]);
	13791	do_sep = 1;
	13792	}
	13793
	13794	EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
	13795
	13796	if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
	13797	sv_catpvs(sv, "{non-utf8-latin1-all}");
	13798	}
	13799
	13800	/* output information about the unicode matching */
	13801	if (flags & ANYOF_UNICODE_ALL)
	13802	sv_catpvs(sv, "{unicode_all}");
	13803	else if (ANYOF_NONBITMAP(o))
	13804	sv_catpvs(sv, "{unicode}");
	13805	if (flags & ANYOF_NONBITMAP_NON_UTF8)
	13806	sv_catpvs(sv, "{outside bitmap}");
	13807
	13808	if (ANYOF_NONBITMAP(o)) {
	13809	SV lv; / Set if there is something outside the bit map */
	13810	SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
	13811	bool byte_output = FALSE; /* If something in the bitmap has been
	13812	output */
	13813
	13814	if (lv && lv != &PL_sv_undef) {
	13815	if (sw) {
	13816	U8 s[UTF8_MAXBYTES_CASE+1];
	13817
	13818	for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
	13819	uvchr_to_utf8(s, i);
	13820
	13821	if (i < 256
	13822	&& ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
	13823	things already
	13824	output as part
	13825	of the bitmap */
	13826	&& swash_fetch(sw, s, TRUE))
	13827	{
	13828	if (rangestart == -1)
	13829	rangestart = i;
	13830	} else if (rangestart != -1) {
	13831	byte_output = TRUE;
	13832	if (i <= rangestart + 3)
	13833	for (; rangestart < i; rangestart++) {
	13834	put_byte(sv, rangestart);
	13835	}
	13836	else {
	13837	put_byte(sv, rangestart);
	13838	sv_catpvs(sv, "-");
	13839	put_byte(sv, i-1);
	13840	}
	13841	rangestart = -1;
	13842	}
	13843	}
	13844	}
	13845
	13846	{
	13847	char *s = savesvpv(lv);
	13848	char * const origs = s;
	13849
	13850	while (s && s != '\n')
	13851	s++;
	13852
	13853	if (*s == '\n') {
	13854	const char * const t = ++s;
	13855
	13856	if (byte_output) {
	13857	sv_catpvs(sv, " ");
	13858	}
	13859
	13860	while (*s) {
	13861	if (*s == '\n') {
	13862
	13863	/* Truncate very long output */
	13864	if (s - origs > 256) {
	13865	Perl_sv_catpvf(aTHX_ sv,
	13866	"%.*s...",
	13867	(int) (s - origs - 1),
	13868	t);
	13869	goto out_dump;
	13870	}
	13871	*s = ' ';
	13872	}
	13873	else if (*s == '\t') {
	13874	*s = '-';
	13875	}
	13876	s++;
	13877	}
	13878	if (s[-1] == ' ')
	13879	s[-1] = 0;
	13880
	13881	sv_catpv(sv, t);
	13882	}
	13883
	13884	out_dump:
	13885
	13886	Safefree(origs);
	13887	}
	13888	SvREFCNT_dec(lv);
	13889	}
	13890	}
	13891
	13892	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
	13893	}
	13894	else if (k == POSIXD) {
	13895	U8 index = FLAGS(o) * 2;
	13896	if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
	13897	Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
	13898	}
	13899	else {
	13900	sv_catpv(sv, anyofs[index]);
	13901	}
	13902	}
	13903	else if (k == BRANCHJ && (OP(o) == UNLESSM \|\| OP(o) == IFMATCH))
	13904	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
	13905	#else
	13906	PERL_UNUSED_CONTEXT;
	13907	PERL_UNUSED_ARG(sv);
	13908	PERL_UNUSED_ARG(o);
	13909	PERL_UNUSED_ARG(prog);
	13910	#endif /* DEBUGGING */
	13911	}
	13912
	13913	SV *
	13914	Perl_re_intuit_string(pTHX_ REGEXP * const r)
	13915	{ /* Assume that RE_INTUIT is set */
	13916	dVAR;
	13917	struct regexp const prog = (struct regexp )SvANY(r);
	13918	GET_RE_DEBUG_FLAGS_DECL;
	13919
	13920	PERL_ARGS_ASSERT_RE_INTUIT_STRING;
	13921	PERL_UNUSED_CONTEXT;
	13922
	13923	DEBUG_COMPILE_r(
	13924	{
	13925	const char * const s = SvPV_nolen_const(prog->check_substr
	13926	? prog->check_substr : prog->check_utf8);
	13927
	13928	if (!PL_colorset) reginitcolors();
	13929	PerlIO_printf(Perl_debug_log,
	13930	"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
	13931	PL_colors[4],
	13932	prog->check_substr ? "" : "utf8 ",
	13933	PL_colors[5],PL_colors[0],
	13934	s,
	13935	PL_colors[1],
	13936	(strlen(s) > 60 ? "..." : ""));
	13937	} );
	13938
	13939	return prog->check_substr ? prog->check_substr : prog->check_utf8;
	13940	}
	13941
	13942	/*
	13943	pregfree()
	13944
	13945	handles refcounting and freeing the perl core regexp structure. When
	13946	it is necessary to actually free the structure the first thing it
	13947	does is call the 'free' method of the regexp_engine associated to
	13948	the regexp, allowing the handling of the void *pprivate; member
	13949	first. (This routine is not overridable by extensions, which is why
	13950	the extensions free is called first.)
	13951
	13952	See regdupe and regdupe_internal if you change anything here.
	13953	*/
	13954	#ifndef PERL_IN_XSUB_RE
	13955	void
	13956	Perl_pregfree(pTHX_ REGEXP *r)
	13957	{
	13958	SvREFCNT_dec(r);
	13959	}
	13960
	13961	void
	13962	Perl_pregfree2(pTHX_ REGEXP *rx)
	13963	{
	13964	dVAR;
	13965	struct regexp const r = (struct regexp )SvANY(rx);
	13966	GET_RE_DEBUG_FLAGS_DECL;
	13967
	13968	PERL_ARGS_ASSERT_PREGFREE2;
	13969
	13970	if (r->mother_re) {
	13971	ReREFCNT_dec(r->mother_re);
	13972	} else {
	13973	CALLREGFREE_PVT(rx); /* free the private data */
	13974	SvREFCNT_dec(RXp_PAREN_NAMES(r));
	13975	}
	13976	if (r->substrs) {
	13977	SvREFCNT_dec(r->anchored_substr);
	13978	SvREFCNT_dec(r->anchored_utf8);
	13979	SvREFCNT_dec(r->float_substr);
	13980	SvREFCNT_dec(r->float_utf8);
	13981	Safefree(r->substrs);
	13982	}
	13983	RX_MATCH_COPY_FREE(rx);
	13984	#ifdef PERL_OLD_COPY_ON_WRITE
	13985	SvREFCNT_dec(r->saved_copy);
	13986	#endif
	13987	Safefree(r->offs);
	13988	SvREFCNT_dec(r->qr_anoncv);
	13989	}
	13990
	13991	/* reg_temp_copy()
	13992
	13993	This is a hacky workaround to the structural issue of match results
	13994	being stored in the regexp structure which is in turn stored in
	13995	PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
	13996	could be PL_curpm in multiple contexts, and could require multiple
	13997	result sets being associated with the pattern simultaneously, such
	13998	as when doing a recursive match with (??{$qr})
	13999
	14000	The solution is to make a lightweight copy of the regexp structure
	14001	when a qr// is returned from the code executed by (??{$qr}) this
	14002	lightweight copy doesn't actually own any of its data except for
	14003	the starp/end and the actual regexp structure itself.
	14004
	14005	*/
	14006
	14007
	14008	REGEXP *
	14009	Perl_reg_temp_copy (pTHX_ REGEXP ret_x, REGEXP rx)
	14010	{
	14011	struct regexp *ret;
	14012	struct regexp const r = (struct regexp )SvANY(rx);
	14013
	14014	PERL_ARGS_ASSERT_REG_TEMP_COPY;
	14015
	14016	if (!ret_x)
	14017	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
	14018	ret = (struct regexp *)SvANY(ret_x);
	14019
	14020	(void)ReREFCNT_inc(rx);
	14021	/* We can take advantage of the existing "copied buffer" mechanism in SVs
	14022	by pointing directly at the buffer, but flagging that the allocated
	14023	space in the copy is zero. As we've just done a struct copy, it's now
	14024	a case of zero-ing that, rather than copying the current length. */
	14025	SvPV_set(ret_x, RX_WRAPPED(rx));
	14026	SvFLAGS(ret_x) \|= SvFLAGS(rx) & (SVf_POK\|SVp_POK\|SVf_UTF8);
	14027	memcpy(&(ret->xpv_cur), &(r->xpv_cur),
	14028	sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
	14029	SvLEN_set(ret_x, 0);
	14030	SvSTASH_set(ret_x, NULL);
	14031	SvMAGIC_set(ret_x, NULL);
	14032	if (r->offs) {
	14033	const I32 npar = r->nparens+1;
	14034	Newx(ret->offs, npar, regexp_paren_pair);
	14035	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	14036	}
	14037	if (r->substrs) {
	14038	Newx(ret->substrs, 1, struct reg_substr_data);
	14039	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	14040
	14041	SvREFCNT_inc_void(ret->anchored_substr);
	14042	SvREFCNT_inc_void(ret->anchored_utf8);
	14043	SvREFCNT_inc_void(ret->float_substr);
	14044	SvREFCNT_inc_void(ret->float_utf8);
	14045
	14046	/* check_substr and check_utf8, if non-NULL, point to either their
	14047	anchored or float namesakes, and don't hold a second reference. */
	14048	}
	14049	RX_MATCH_COPIED_off(ret_x);
	14050	#ifdef PERL_OLD_COPY_ON_WRITE
	14051	ret->saved_copy = NULL;
	14052	#endif
	14053	ret->mother_re = rx;
	14054	SvREFCNT_inc_void(ret->qr_anoncv);
	14055
	14056	return ret_x;
	14057	}
	14058	#endif
	14059
	14060	/* regfree_internal()
	14061
	14062	Free the private data in a regexp. This is overloadable by
	14063	extensions. Perl takes care of the regexp structure in pregfree(),
	14064	this covers the *pprivate pointer which technically perl doesn't
	14065	know about, however of course we have to handle the
	14066	regexp_internal structure when no extension is in use.
	14067
	14068	Note this is called before freeing anything in the regexp
	14069	structure.
	14070	*/
	14071
	14072	void
	14073	Perl_regfree_internal(pTHX_ REGEXP * const rx)
	14074	{
	14075	dVAR;
	14076	struct regexp const r = (struct regexp )SvANY(rx);
	14077	RXi_GET_DECL(r,ri);
	14078	GET_RE_DEBUG_FLAGS_DECL;
	14079
	14080	PERL_ARGS_ASSERT_REGFREE_INTERNAL;
	14081
	14082	DEBUG_COMPILE_r({
	14083	if (!PL_colorset)
	14084	reginitcolors();
	14085	{
	14086	SV *dsv= sv_newmortal();
	14087	RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
	14088	dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
	14089	PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
	14090	PL_colors[4],PL_colors[5],s);
	14091	}
	14092	});
	14093	#ifdef RE_TRACK_PATTERN_OFFSETS
	14094	if (ri->u.offsets)
	14095	Safefree(ri->u.offsets); /* 20010421 MJD */
	14096	#endif
	14097	if (ri->code_blocks) {
	14098	int n;
	14099	for (n = 0; n < ri->num_code_blocks; n++)
	14100	SvREFCNT_dec(ri->code_blocks[n].src_regex);
	14101	Safefree(ri->code_blocks);
	14102	}
	14103
	14104	if (ri->data) {
	14105	int n = ri->data->count;
	14106
	14107	while (--n >= 0) {
	14108	/* If you add a ->what type here, update the comment in regcomp.h */
	14109	switch (ri->data->what[n]) {
	14110	case 'a':
	14111	case 'r':
	14112	case 's':
	14113	case 'S':
	14114	case 'u':
	14115	SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
	14116	break;
	14117	case 'f':
	14118	Safefree(ri->data->data[n]);
	14119	break;
	14120	case 'l':
	14121	case 'L':
	14122	break;
	14123	case 'T':
	14124	{ /* Aho Corasick add-on structure for a trie node.
	14125	Used in stclass optimization only */
	14126	U32 refcount;
	14127	reg_ac_data aho=(reg_ac_data)ri->data->data[n];
	14128	OP_REFCNT_LOCK;
	14129	refcount = --aho->refcount;
	14130	OP_REFCNT_UNLOCK;
	14131	if ( !refcount ) {
	14132	PerlMemShared_free(aho->states);
	14133	PerlMemShared_free(aho->fail);
	14134	/* do this last!!!! */
	14135	PerlMemShared_free(ri->data->data[n]);
	14136	PerlMemShared_free(ri->regstclass);
	14137	}
	14138	}
	14139	break;
	14140	case 't':
	14141	{
	14142	/* trie structure. */
	14143	U32 refcount;
	14144	reg_trie_data trie=(reg_trie_data)ri->data->data[n];
	14145	OP_REFCNT_LOCK;
	14146	refcount = --trie->refcount;
	14147	OP_REFCNT_UNLOCK;
	14148	if ( !refcount ) {
	14149	PerlMemShared_free(trie->charmap);
	14150	PerlMemShared_free(trie->states);
	14151	PerlMemShared_free(trie->trans);
	14152	if (trie->bitmap)
	14153	PerlMemShared_free(trie->bitmap);
	14154	if (trie->jump)
	14155	PerlMemShared_free(trie->jump);
	14156	PerlMemShared_free(trie->wordinfo);
	14157	/* do this last!!!! */
	14158	PerlMemShared_free(ri->data->data[n]);
	14159	}
	14160	}
	14161	break;
	14162	default:
	14163	Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
	14164	}
	14165	}
	14166	Safefree(ri->data->what);
	14167	Safefree(ri->data);
	14168	}
	14169
	14170	Safefree(ri);
	14171	}
	14172
	/* Typed wrappers round sv_dup_inc() for AVs and HVs, and a savepvn() that
	 * yields NULL for a NULL pointer.  Note that SAVEPVN evaluates p twice. */
	#define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
	#define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
	#define SAVEPVN(p,n)	((p) ? savepvn((p),(n)) : NULL)
	14176
	14177	/*
	14178	re_dup - duplicate a regexp.
	14179
	14180	This routine is expected to clone a given regexp structure. It is only
	14181	compiled under USE_ITHREADS.
	14182
	14183	After all of the core data stored in struct regexp is duplicated
	14184	the regexp_engine.dupe method is used to copy any private data
	14185	stored in the *pprivate pointer. This allows extensions to handle
	14186	any duplication it needs to do.
	14187
	14188	See pregfree() and regfree_internal() if you change anything here.
	14189	*/
	14190	#if defined(USE_ITHREADS)
	14191	#ifndef PERL_IN_XSUB_RE
	14192	void
	14193	Perl_re_dup_guts(pTHX_ const REGEXP sstr, REGEXP dstr, CLONE_PARAMS *param)
	14194	{
	14195	dVAR;
	14196	I32 npar;
	14197	const struct regexp r = (const struct regexp )SvANY(sstr);
	14198	struct regexp ret = (struct regexp )SvANY(dstr);
	14199
	14200	PERL_ARGS_ASSERT_RE_DUP_GUTS;
	14201
	14202	npar = r->nparens+1;
	14203	Newx(ret->offs, npar, regexp_paren_pair);
	14204	Copy(r->offs, ret->offs, npar, regexp_paren_pair);
	14205	if(ret->swap) {
	14206	/* no need to copy these */
	14207	Newx(ret->swap, npar, regexp_paren_pair);
	14208	}
	14209
	14210	if (ret->substrs) {
	14211	/* Do it this way to avoid reading from *r after the StructCopy().
	14212	That way, if any of the sv_dup_inc()s dislodge *r from the L1
	14213	cache, it doesn't matter. */
	14214	const bool anchored = r->check_substr
	14215	? r->check_substr == r->anchored_substr
	14216	: r->check_utf8 == r->anchored_utf8;
	14217	Newx(ret->substrs, 1, struct reg_substr_data);
	14218	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
	14219
	14220	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
	14221	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
	14222	ret->float_substr = sv_dup_inc(ret->float_substr, param);
	14223	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
	14224
	14225	/* check_substr and check_utf8, if non-NULL, point to either their
	14226	anchored or float namesakes, and don't hold a second reference. */
	14227
	14228	if (ret->check_substr) {
	14229	if (anchored) {
	14230	assert(r->check_utf8 == r->anchored_utf8);
	14231	ret->check_substr = ret->anchored_substr;
	14232	ret->check_utf8 = ret->anchored_utf8;
	14233	} else {
	14234	assert(r->check_substr == r->float_substr);
	14235	assert(r->check_utf8 == r->float_utf8);
	14236	ret->check_substr = ret->float_substr;
	14237	ret->check_utf8 = ret->float_utf8;
	14238	}
	14239	} else if (ret->check_utf8) {
	14240	if (anchored) {
	14241	ret->check_utf8 = ret->anchored_utf8;
	14242	} else {
	14243	ret->check_utf8 = ret->float_utf8;
	14244	}
	14245	}
	14246	}
	14247
	14248	RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
	14249	ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
	14250
	14251	if (ret->pprivate)
	14252	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
	14253
	14254	if (RX_MATCH_COPIED(dstr))
	14255	ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
	14256	else
	14257	ret->subbeg = NULL;
	14258	#ifdef PERL_OLD_COPY_ON_WRITE
	14259	ret->saved_copy = NULL;
	14260	#endif
	14261
	14262	if (ret->mother_re) {
	14263	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
	14264	/* Our storage points directly to our mother regexp, but that's
	14265	1: a buffer in a different thread
	14266	2: something we no longer hold a reference on
	14267	so we need to copy it locally. */
	14268	/* Note we need to use SvCUR(), rather than
	14269	SvLEN(), on our mother_re, because it, in
	14270	turn, may well be pointing to its own mother_re. */
	14271	SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
	14272	SvCUR(ret->mother_re)+1));
	14273	SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
	14274	}
	14275	ret->mother_re = NULL;
	14276	}
	14277	ret->gofs = 0;
	14278	}
	14279	#endif /* PERL_IN_XSUB_RE */
	14280
	14281	/*
	14282	regdupe_internal()
	14283
	14284	This is the internal complement to regdupe() which is used to copy
	14285	the structure pointed to by the *pprivate pointer in the regexp.
	14286	This is the core version of the extension overridable cloning hook.
	14287	The regexp structure being duplicated will be copied by perl prior
	14288	to this and will be provided as the regexp *r argument, however
	14289	with the /old/ structures pprivate pointer value. Thus this routine
	14290	may override any copying normally done by perl.
	14291
	14292	It returns a pointer to the new regexp_internal structure.
	14293	*/
	14294
	14295	void *
	14296	Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
	14297	{
	14298	dVAR;
	14299	struct regexp const r = (struct regexp )SvANY(rx);
	14300	regexp_internal *reti;
	14301	int len;
	14302	RXi_GET_DECL(r,ri);
	14303
	14304	PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
	14305
	14306	len = ProgLen(ri);
	14307
	14308	Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
	14309	Copy(ri->program, reti->program, len+1, regnode);
	14310
	14311	reti->num_code_blocks = ri->num_code_blocks;
	14312	if (ri->code_blocks) {
	14313	int n;
	14314	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
	14315	struct reg_code_block);
	14316	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
	14317	struct reg_code_block);
	14318	for (n = 0; n < ri->num_code_blocks; n++)
	14319	reti->code_blocks[n].src_regex = (REGEXP*)
	14320	sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
	14321	}
	14322	else
	14323	reti->code_blocks = NULL;
	14324
	14325	reti->regstclass = NULL;
	14326
	14327	if (ri->data) {
	14328	struct reg_data *d;
	14329	const int count = ri->data->count;
	14330	int i;
	14331
	14332	Newxc(d, sizeof(struct reg_data) + countsizeof(void ),
	14333	char, struct reg_data);
	14334	Newx(d->what, count, U8);
	14335
	14336	d->count = count;
	14337	for (i = 0; i < count; i++) {
	14338	d->what[i] = ri->data->what[i];
	14339	switch (d->what[i]) {
	14340	/* see also regcomp.h and regfree_internal() */
	14341	case 'a': /* actually an AV, but the dup function is identical. */
	14342	case 'r':
	14343	case 's':
	14344	case 'S':
	14345	case 'u': /* actually an HV, but the dup function is identical. */
	14346	d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
	14347	break;
	14348	case 'f':
	14349	/* This is cheating. */
	14350	Newx(d->data[i], 1, struct regnode_charclass_class);
	14351	StructCopy(ri->data->data[i], d->data[i],
	14352	struct regnode_charclass_class);
	14353	reti->regstclass = (regnode*)d->data[i];
	14354	break;
	14355	case 'T':
	14356	/* Trie stclasses are readonly and can thus be shared
	14357	* without duplication. We free the stclass in pregfree
	14358	* when the corresponding reg_ac_data struct is freed.
	14359	*/
	14360	reti->regstclass= ri->regstclass;
	14361	/* Fall through */
	14362	case 't':
	14363	OP_REFCNT_LOCK;
	14364	((reg_trie_data*)ri->data->data[i])->refcount++;
	14365	OP_REFCNT_UNLOCK;
	14366	/* Fall through */
	14367	case 'l':
	14368	case 'L':
	14369	d->data[i] = ri->data->data[i];
	14370	break;
	14371	default:
	14372	Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
	14373	}
	14374	}
	14375
	14376	reti->data = d;
	14377	}
	14378	else
	14379	reti->data = NULL;
	14380
	14381	reti->name_list_idx = ri->name_list_idx;
	14382
	14383	#ifdef RE_TRACK_PATTERN_OFFSETS
	14384	if (ri->u.offsets) {
	14385	Newx(reti->u.offsets, 2*len+1, U32);
	14386	Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
	14387	}
	14388	#else
	14389	SetProgLen(reti,len);
	14390	#endif
	14391
	14392	return (void*)reti;
	14393	}
	14394
	14395	#endif /* USE_ITHREADS */
	14396
	14397	#ifndef PERL_IN_XSUB_RE
	14398
	14399	/*
	14400	- regnext - dig the "next" pointer out of a node
	14401	*/
	14402	regnode *
	14403	Perl_regnext(pTHX_ register regnode *p)
	14404	{
	14405	dVAR;
	14406	I32 offset;
	14407
	14408	if (!p)
	14409	return(NULL);
	14410
	14411	if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
	14412	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
	14413	}
	14414
	14415	offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
	14416	if (offset == 0)
	14417	return(NULL);
	14418
	14419	return(p+offset);
	14420	}
	14421	#endif
	14422
	14423	STATIC void
	14424	S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
	14425	{
	14426	va_list args;
	14427	STRLEN l1 = strlen(pat1);
	14428	STRLEN l2 = strlen(pat2);
	14429	char buf[512];
	14430	SV *msv;
	14431	const char *message;
	14432
	14433	PERL_ARGS_ASSERT_RE_CROAK2;
	14434
	14435	if (l1 > 510)
	14436	l1 = 510;
	14437	if (l1 + l2 > 510)
	14438	l2 = 510 - l1;
	14439	Copy(pat1, buf, l1 , char);
	14440	Copy(pat2, buf + l1, l2 , char);
	14441	buf[l1 + l2] = '\n';
	14442	buf[l1 + l2 + 1] = '\0';
	14443	#ifdef I_STDARG
	14444	/* ANSI variant takes additional second argument */
	14445	va_start(args, pat2);
	14446	#else
	14447	va_start(args);
	14448	#endif
	14449	msv = vmess(buf, &args);
	14450	va_end(args);
	14451	message = SvPV_const(msv,l1);
	14452	if (l1 > 512)
	14453	l1 = 512;
	14454	Copy(message, buf, l1 , char);
	14455	buf[l1-1] = '\0'; /* Overwrite \n */
	14456	Perl_croak(aTHX_ "%s", buf);
	14457	}
	14458
	14459	/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
	14460
	14461	#ifndef PERL_IN_XSUB_RE
	14462	void
	14463	Perl_save_re_context(pTHX)
	14464	{
	14465	dVAR;
	14466
	14467	struct re_save_state *state;
	14468
	14469	SAVEVPTR(PL_curcop);
	14470	SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
	14471
	14472	state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
	14473	PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
	14474	SSPUSHUV(SAVEt_RE_STATE);
	14475
	14476	Copy(&PL_reg_state, state, 1, struct re_save_state);
	14477
	14478	PL_reg_oldsaved = NULL;
	14479	PL_reg_oldsavedlen = 0;
	14480	PL_reg_oldsavedoffset = 0;
	14481	PL_reg_oldsavedcoffset = 0;
	14482	PL_reg_maxiter = 0;
	14483	PL_reg_leftiter = 0;
	14484	PL_reg_poscache = NULL;
	14485	PL_reg_poscache_size = 0;
	14486	#ifdef PERL_OLD_COPY_ON_WRITE
	14487	PL_nrs = NULL;
	14488	#endif
	14489
	14490	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
	14491	if (PL_curpm) {
	14492	const REGEXP * const rx = PM_GETRE(PL_curpm);
	14493	if (rx) {
	14494	U32 i;
	14495	for (i = 1; i <= RX_NPARENS(rx); i++) {
	14496	char digits[TYPE_CHARS(long)];
	14497	const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
	14498	GV const const gvp
	14499	= (GV**)hv_fetch(PL_defstash, digits, len, 0);
	14500
	14501	if (gvp) {
	14502	GV * const gv = *gvp;
	14503	if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
	14504	save_scalar(gv);
	14505	}
	14506	}
	14507	}
	14508	}
	14509	}
	14510	#endif
	14511
	14512	static void
	14513	clear_re(pTHX_ void *r)
	14514	{
	14515	dVAR;
	14516	ReREFCNT_dec((REGEXP *)r);
	14517	}
	14518
	14519	#ifdef DEBUGGING
	14520
	14521	STATIC void
	14522	S_put_byte(pTHX_ SV *sv, int c)
	14523	{
	14524	PERL_ARGS_ASSERT_PUT_BYTE;
	14525
	14526	/* Our definition of isPRINT() ignores locales, so only bytes that are
	14527	not part of UTF-8 are considered printable. I assume that the same
	14528	holds for UTF-EBCDIC.
	14529	Also, code point 255 is not printable in either (it's E0 in EBCDIC,
	14530	which Wikipedia says:
	14531
	14532	EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
	14533	ones (binary 1111 1111, hexadecimal FF). It is similar, but not
	14534	identical, to the ASCII delete (DEL) or rubout control character.
	14535	) So the old condition can be simplified to !isPRINT(c) */
	14536	if (!isPRINT(c)) {
	14537	if (c < 256) {
	14538	Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
	14539	}
	14540	else {
	14541	Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
	14542	}
	14543	}
	14544	else {
	14545	const char string = c;
	14546	if (c == '-' \|\| c == ']' \|\| c == '\\' \|\| c == '^')
	14547	sv_catpvs(sv, "\\");
	14548	sv_catpvn(sv, &string, 1);
	14549	}
	14550	}
	14551
	14552
	14553	#define CLEAR_OPTSTART \
	14554	if (optstart) STMT_START { \
	14555	DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
	14556	optstart=NULL; \
	14557	} STMT_END
	14558
	14559	#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
	14560
	14561	STATIC const regnode *
	14562	S_dumpuntil(pTHX_ const regexp r, const regnode start, const regnode *node,
	14563	const regnode last, const regnode plast,
	14564	SV* sv, I32 indent, U32 depth)
	14565	{
	14566	dVAR;
	14567	U8 op = PSEUDO; /* Arbitrary non-END op. */
	14568	const regnode *next;
	14569	const regnode *optstart= NULL;
	14570
	14571	RXi_GET_DECL(r,ri);
	14572	GET_RE_DEBUG_FLAGS_DECL;
	14573
	14574	PERL_ARGS_ASSERT_DUMPUNTIL;
	14575
	14576	#ifdef DEBUG_DUMPUNTIL
	14577	PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
	14578	last ? last-start : 0,plast ? plast-start : 0);
	14579	#endif
	14580
	14581	if (plast && plast < last)
	14582	last= plast;
	14583
	14584	while (PL_regkind[op] != END && (!last \|\| node < last)) {
	14585	/* While that wasn't END last time... */
	14586	NODE_ALIGN(node);
	14587	op = OP(node);
	14588	if (op == CLOSE \|\| op == WHILEM)
	14589	indent--;
	14590	next = regnext((regnode *)node);
	14591
	14592	/* Where, what. */
	14593	if (OP(node) == OPTIMIZED) {
	14594	if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
	14595	optstart = node;
	14596	else
	14597	goto after_print;
	14598	} else
	14599	CLEAR_OPTSTART;
	14600
	14601	regprop(r, sv, node);
	14602	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
	14603	(int)(2*indent + 1), "", SvPVX_const(sv));
	14604
	14605	if (OP(node) != OPTIMIZED) {
	14606	if (next == NULL) /* Next ptr. */
	14607	PerlIO_printf(Perl_debug_log, " (0)");
	14608	else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
	14609	PerlIO_printf(Perl_debug_log, " (FAIL)");
	14610	else
	14611	PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
	14612	(void)PerlIO_putc(Perl_debug_log, '\n');
	14613	}
	14614
	14615	after_print:
	14616	if (PL_regkind[(U8)op] == BRANCHJ) {
	14617	assert(next);
	14618	{
	14619	const regnode *nnode = (OP(next) == LONGJMP
	14620	? regnext((regnode *)next)
	14621	: next);
	14622	if (last && nnode > last)
	14623	nnode = last;
	14624	DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
	14625	}
	14626	}
	14627	else if (PL_regkind[(U8)op] == BRANCH) {
	14628	assert(next);
	14629	DUMPUNTIL(NEXTOPER(node), next);
	14630	}
	14631	else if ( PL_regkind[(U8)op] == TRIE ) {
	14632	const regnode *this_trie = node;
	14633	const char op = OP(node);
	14634	const U32 n = ARG(node);
	14635	const reg_ac_data * const ac = op>=AHOCORASICK ?
	14636	(reg_ac_data *)ri->data->data[n] :
	14637	NULL;
	14638	const reg_trie_data * const trie =
	14639	(reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
	14640	#ifdef DEBUGGING
	14641	AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
	14642	#endif
	14643	const regnode *nextbranch= NULL;
	14644	I32 word_idx;
	14645	sv_setpvs(sv, "");
	14646	for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
	14647	SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
	14648
	14649	PerlIO_printf(Perl_debug_log, "%*s%s ",
	14650	(int)(2*(indent+3)), "",
	14651	elem_ptr ? pv_pretty(sv, SvPV_nolen_const(elem_ptr), SvCUR(elem_ptr), 60,
	14652	PL_colors[0], PL_colors[1],
	14653	(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) \|
	14654	PERL_PV_PRETTY_ELLIPSES \|
	14655	PERL_PV_PRETTY_LTGT
	14656	)
	14657	: "???"
	14658	);
	14659	if (trie->jump) {
	14660	U16 dist= trie->jump[word_idx+1];
	14661	PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
	14662	(UV)((dist ? this_trie + dist : next) - start));
	14663	if (dist) {
	14664	if (!nextbranch)
	14665	nextbranch= this_trie + trie->jump[0];
	14666	DUMPUNTIL(this_trie + dist, nextbranch);
	14667	}
	14668	if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
	14669	nextbranch= regnext((regnode *)nextbranch);
	14670	} else {
	14671	PerlIO_printf(Perl_debug_log, "\n");
	14672	}
	14673	}
	14674	if (last && next > last)
	14675	node= last;
	14676	else
	14677	node= next;
	14678	}
	14679	else if ( op == CURLY ) { /* "next" might be very big: optimizer */
	14680	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
	14681	NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
	14682	}
	14683	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
	14684	assert(next);
	14685	DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
	14686	}
	14687	else if ( op == PLUS \|\| op == STAR) {
	14688	DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
	14689	}
	14690	else if (PL_regkind[(U8)op] == ANYOF) {
	14691	/* arglen 1 + class block */
	14692	node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
	14693	? ANYOF_CLASS_SKIP : ANYOF_SKIP);
	14694	node = NEXTOPER(node);
	14695	}
	14696	else if (PL_regkind[(U8)op] == EXACT) {
	14697	/* Literal string, where present. */
	14698	node += NODE_SZ_STR(node) - 1;
	14699	node = NEXTOPER(node);
	14700	}
	14701	else {
	14702	node = NEXTOPER(node);
	14703	node += regarglen[(U8)op];
	14704	}
	14705	if (op == CURLYX \|\| op == OPEN)
	14706	indent++;
	14707	}
	14708	CLEAR_OPTSTART;
	14709	#ifdef DEBUG_DUMPUNTIL
	14710	PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
	14711	#endif
	14712	return node;
	14713	}
	14714
	14715	#endif /* DEBUGGING */
	14716
	14717	/*
	14718	* Local variables:
	14719	* c-indentation-style: bsd
	14720	* c-basic-offset: 4
	14721	* indent-tabs-mode: nil
	14722	* End:
	14723	*
	14724	* ex: set ts=8 sts=4 sw=4 et:
	14725	*/