perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#ifdef PERL_EXT_RE_BUILD
	2	#include "re_top.h"
	3	#endif
	4
	5	#include "EXTERN.h"
	6	#define PERL_IN_REGEX_ENGINE
	7	#define PERL_IN_REGCOMP_ANY
	8	#define PERL_IN_REGCOMP_STUDY_C
	9	#include "perl.h"
	10
	11	#ifdef PERL_IN_XSUB_RE
	12	# include "re_comp.h"
	13	#else
	14	# include "regcomp.h"
	15	#endif
	16
	17	#include "invlist_inline.h"
	18	#include "unicode_constants.h"
	19	#include "regcomp_internal.h"
	20
	21	#define INIT_AND_WITHP \
	22	assert(!and_withp); \
	23	Newx(and_withp, 1, regnode_ssc); \
	24	SAVEFREEPV(and_withp)
	25
	26
	27	STATIC void
	28	S_unwind_scan_frames(pTHX_ const void *p)
	29	{
	30	PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
	31	scan_frame f= (scan_frame )p;
	32	do {
	33	scan_frame *n= f->next_frame;
	34	Safefree(f);
	35	f= n;
	36	} while (f);
	37	}
	38
	39	/* Follow the next-chain of the current node and optimize away
	40	all the NOTHINGs from it.
	41	*/
	42	STATIC void
	43	S_rck_elide_nothing(pTHX_ regnode *node)
	44	{
	45	PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
	46
	47	if (OP(node) != CURLYX) {
	48	const int max = (REGNODE_OFF_BY_ARG(OP(node))
	49	? I32_MAX
	50	/* I32 may be smaller than U16 on CRAYs! */
	51	: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
	52	int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node));
	53	int noff;
	54	regnode *n = node;
	55
	56	/* Skip NOTHING and LONGJMP. */
	57	while (
	58	(n = regnext(n))
	59	&& (
	60	(REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
	61	\|\| ((OP(n) == LONGJMP) && (noff = ARG(n)))
	62	)
	63	&& off + noff < max
	64	) {
	65	off += noff;
	66	}
	67	if (REGNODE_OFF_BY_ARG(OP(node)))
	68	ARG(node) = off;
	69	else
	70	NEXT_OFF(node) = off;
	71	}
	72	return;
	73	}
	74
	75
	76	/*
	77	* As best we can, determine the characters that can match the start of
	78	* the given EXACTF-ish node. This is for use in creating ssc nodes, so there
	79	* can be false positive matches
	80	*
	81	* Returns the invlist as a new SV*; it is the caller's responsibility to
	82	* call SvREFCNT_dec() when done with it.
	83	*/
	84	STATIC SV*
	85	S_make_exactf_invlist(pTHX_ RExC_state_t pRExC_state, regnode node)
	86	{
	87	const U8 * s = (U8*)STRING(node);
	88	SSize_t bytelen = STR_LEN(node);
	89	UV uc;
	90	/* Start out big enough for 2 separate code points */
	91	SV* invlist = _new_invlist(4);
	92
	93	PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
	94
	95	if (! UTF) {
	96	uc = *s;
	97
	98	/* We punt and assume can match anything if the node begins
	99	* with a multi-character fold. Things are complicated. For
	100	* example, /ffi/i could match any of:
	101	* "\N{LATIN SMALL LIGATURE FFI}"
	102	* "\N{LATIN SMALL LIGATURE FF}I"
	103	* "F\N{LATIN SMALL LIGATURE FI}"
	104	* plus several other things; and making sure we have all the
	105	* possibilities is hard. */
	106	if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
	107	invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
	108	}
	109	else {
	110	/* Any Latin1 range character can potentially match any
	111	* other depending on the locale, and in Turkic locales, 'I' and
	112	* 'i' can match U+130 and U+131 */
	113	if (OP(node) == EXACTFL) {
	114	_invlist_union(invlist, PL_Latin1, &invlist);
	115	if (isALPHA_FOLD_EQ(uc, 'I')) {
	116	invlist = add_cp_to_invlist(invlist,
	117	LATIN_SMALL_LETTER_DOTLESS_I);
	118	invlist = add_cp_to_invlist(invlist,
	119	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
	120	}
	121	}
	122	else {
	123	/* But otherwise, it matches at least itself. We can
	124	* quickly tell if it has a distinct fold, and if so,
	125	* it matches that as well */
	126	invlist = add_cp_to_invlist(invlist, uc);
	127	if (IS_IN_SOME_FOLD_L1(uc))
	128	invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
	129	}
	130
	131	/* Some characters match above-Latin1 ones under /i. This
	132	* is true of EXACTFL ones when the locale is UTF-8 */
	133	if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
	134	&& (! isASCII(uc) \|\| ! inRANGE(OP(node), EXACTFAA,
	135	EXACTFAA_NO_TRIE)))
	136	{
	137	add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
	138	}
	139	}
	140	}
	141	else { /* Pattern is UTF-8 */
	142	U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
	143	const U8* e = s + bytelen;
	144	IV fc;
	145
	146	fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
	147
	148	/* The only code points that aren't folded in a UTF EXACTFish
	149	* node are the problematic ones in EXACTFL nodes */
	150	if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
	151	/* We need to check for the possibility that this EXACTFL
	152	* node begins with a multi-char fold. Therefore we fold
	153	* the first few characters of it so that we can make that
	154	* check */
	155	U8 *d = folded;
	156	int i;
	157
	158	fc = -1;
	159	for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
	160	if (isASCII(*s)) {
	161	(d++) = (U8) toFOLD(s);
	162	if (fc < 0) { /* Save the first fold */
	163	fc = *(d-1);
	164	}
	165	s++;
	166	}
	167	else {
	168	STRLEN len;
	169	UV fold = toFOLD_utf8_safe(s, e, d, &len);
	170	if (fc < 0) { /* Save the first fold */
	171	fc = fold;
	172	}
	173	d += len;
	174	s += UTF8SKIP(s);
	175	}
	176	}
	177
	178	/* And set up so the code below that looks in this folded
	179	* buffer instead of the node's string */
	180	e = d;
	181	s = folded;
	182	}
	183
	184	/* When we reach here 's' points to the fold of the first
	185	* character(s) of the node; and 'e' points to far enough along
	186	* the folded string to be just past any possible multi-char
	187	* fold.
	188	*
	189	* Like the non-UTF case above, we punt if the node begins with a
	190	* multi-char fold */
	191
	192	if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
	193	invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
	194	}
	195	else { /* Single char fold */
	196	unsigned int k;
	197	U32 first_fold;
	198	const U32 * remaining_folds;
	199	Size_t folds_count;
	200
	201	/* It matches itself */
	202	invlist = add_cp_to_invlist(invlist, fc);
	203
	204	/* ... plus all the things that fold to it, which are found in
	205	* PL_utf8_foldclosures */
	206	folds_count = _inverse_folds(fc, &first_fold,
	207	&remaining_folds);
	208	for (k = 0; k < folds_count; k++) {
	209	UV c = (k == 0) ? first_fold : remaining_folds[k-1];
	210
	211	/* /aa doesn't allow folds between ASCII and non- */
	212	if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
	213	&& isASCII(c) != isASCII(fc))
	214	{
	215	continue;
	216	}
	217
	218	invlist = add_cp_to_invlist(invlist, c);
	219	}
	220
	221	if (OP(node) == EXACTFL) {
	222
	223	/* If either [iI] are present in an EXACTFL node the above code
	224	* should have added its normal case pair, but under a Turkish
	225	* locale they could match instead the case pairs from it. Add
	226	* those as potential matches as well */
	227	if (isALPHA_FOLD_EQ(fc, 'I')) {
	228	invlist = add_cp_to_invlist(invlist,
	229	LATIN_SMALL_LETTER_DOTLESS_I);
	230	invlist = add_cp_to_invlist(invlist,
	231	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
	232	}
	233	else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
	234	invlist = add_cp_to_invlist(invlist, 'I');
	235	}
	236	else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	237	invlist = add_cp_to_invlist(invlist, 'i');
	238	}
	239	}
	240	}
	241	}
	242
	243	return invlist;
	244	}
	245
	246
	247	/* Mark that we cannot extend a found fixed substring at this point.
	248	Update the longest found anchored substring or the longest found
	249	floating substrings if needed. */
	250
	251	void
	252	Perl_scan_commit(pTHX_ const RExC_state_t pRExC_state, scan_data_t data,
	253	SSize_t *minlenp, int is_inf)
	254	{
	255	const STRLEN l = CHR_SVLEN(data->last_found);
	256	SV * const longest_sv = data->substrs[data->cur_is_floating].str;
	257	const STRLEN old_l = CHR_SVLEN(longest_sv);
	258	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	259
	260	PERL_ARGS_ASSERT_SCAN_COMMIT;
	261
	262	if ((l >= old_l) && ((l > old_l) \|\| (data->flags & SF_BEFORE_EOL))) {
	263	const U8 i = data->cur_is_floating;
	264	SvSetMagicSV(longest_sv, data->last_found);
	265	data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
	266
	267	if (!i) /* fixed */
	268	data->substrs[0].max_offset = data->substrs[0].min_offset;
	269	else { /* float */
	270	data->substrs[1].max_offset =
	271	(is_inf)
	272	? OPTIMIZE_INFTY
	273	: (l
	274	? data->last_start_max
	275	: (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
	276	? OPTIMIZE_INFTY
	277	: data->pos_min + data->pos_delta));
	278	}
	279
	280	data->substrs[i].flags &= ~SF_BEFORE_EOL;
	281	data->substrs[i].flags \|= data->flags & SF_BEFORE_EOL;
	282	data->substrs[i].minlenp = minlenp;
	283	data->substrs[i].lookbehind = 0;
	284	}
	285
	286	SvCUR_set(data->last_found, 0);
	287	{
	288	SV * const sv = data->last_found;
	289	if (SvUTF8(sv) && SvMAGICAL(sv)) {
	290	MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
	291	if (mg)
	292	mg->mg_len = 0;
	293	}
	294	}
	295	data->last_end = -1;
	296	data->flags &= ~SF_BEFORE_EOL;
	297	DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
	298	}
	299
	300	/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
	301	* list that describes which code points it matches */
	302
	303	STATIC void
	304	S_ssc_anything(pTHX_ regnode_ssc *ssc)
	305	{
	306	/* Set the SSC 'ssc' to match an empty string or any code point */
	307
	308	PERL_ARGS_ASSERT_SSC_ANYTHING;
	309
	310	assert(is_ANYOF_SYNTHETIC(ssc));
	311
	312	/* mortalize so won't leak */
	313	ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
	314	ANYOF_FLAGS(ssc) \|= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
	315	}
	316
	317	STATIC int
	318	S_ssc_is_anything(const regnode_ssc *ssc)
	319	{
	320	/* Returns TRUE if the SSC 'ssc' can match the empty string and any code
	321	* point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
	322	* us anything: if the function returns TRUE, 'ssc' hasn't been restricted
	323	* in any way, so there's no point in using it */
	324
	325	UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
	326	bool ret;
	327
	328	PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
	329
	330	assert(is_ANYOF_SYNTHETIC(ssc));
	331
	332	if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
	333	return FALSE;
	334	}
	335
	336	/* See if the list consists solely of the range 0 - Infinity */
	337	invlist_iterinit(ssc->invlist);
	338	ret = invlist_iternext(ssc->invlist, &start, &end)
	339	&& start == 0
	340	&& end == UV_MAX;
	341
	342	invlist_iterfinish(ssc->invlist);
	343
	344	if (ret) {
	345	return TRUE;
	346	}
	347
	348	/* If e.g., both \w and \W are set, matches everything */
	349	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	350	int i;
	351	for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
	352	if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
	353	return TRUE;
	354	}
	355	}
	356	}
	357
	358	return FALSE;
	359	}
	360
	361	void
	362	Perl_ssc_init(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc)
	363	{
	364	/* Initializes the SSC 'ssc'. This includes setting it to match an empty
	365	* string, any code point, or any posix class under locale */
	366
	367	PERL_ARGS_ASSERT_SSC_INIT;
	368
	369	Zero(ssc, 1, regnode_ssc);
	370	set_ANYOF_SYNTHETIC(ssc);
	371	ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
	372	ssc_anything(ssc);
	373
	374	/* If any portion of the regex is to operate under locale rules that aren't
	375	* fully known at compile time, initialization includes it. The reason
	376	* this isn't done for all regexes is that the optimizer was written under
	377	* the assumption that locale was all-or-nothing. Given the complexity and
	378	* lack of documentation in the optimizer, and that there are inadequate
	379	* test cases for locale, many parts of it may not work properly, it is
	380	* safest to avoid locale unless necessary. */
	381	if (RExC_contains_locale) {
	382	ANYOF_POSIXL_SETALL(ssc);
	383	}
	384	else {
	385	ANYOF_POSIXL_ZERO(ssc);
	386	}
	387	}
	388
	389	STATIC int
	390	S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
	391	const regnode_ssc *ssc)
	392	{
	393	/* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
	394	* to the list of code points matched, and locale posix classes; hence does
	395	* not check its flags) */
	396
	397	UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
	398	bool ret;
	399
	400	PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
	401
	402	assert(is_ANYOF_SYNTHETIC(ssc));
	403
	404	invlist_iterinit(ssc->invlist);
	405	ret = invlist_iternext(ssc->invlist, &start, &end)
	406	&& start == 0
	407	&& end == UV_MAX;
	408
	409	invlist_iterfinish(ssc->invlist);
	410
	411	if (! ret) {
	412	return FALSE;
	413	}
	414
	415	if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
	416	return FALSE;
	417	}
	418
	419	return TRUE;
	420	}
	421
	422
	423	STATIC SV*
	424	S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
	425	const regnode_charclass* const node)
	426	{
	427	/* Returns a mortal inversion list defining which code points are matched
	428	* by 'node', which is of ANYOF-ish type . Handles complementing the
	429	* result if appropriate. If some code points aren't knowable at this
	430	* time, the returned list must, and will, contain every code point that is
	431	* a possibility. */
	432
	433	SV* invlist = NULL;
	434	SV* only_utf8_locale_invlist = NULL;
	435	bool new_node_has_latin1 = FALSE;
	436	const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
	437	? ANYOF_FLAGS(node)
	438	: 0;
	439
	440	PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
	441
	442	/* Look at the data structure created by S_set_ANYOF_arg() */
	443	if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
	444	invlist = sv_2mortal(_new_invlist(1));
	445	invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
	446	}
	447	else if (ANYOF_HAS_AUX(node)) {
	448	const U32 n = ARG(node);
	449	SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
	450	AV * const av = MUTABLE_AV(SvRV(rv));
	451	SV **const ary = AvARRAY(av);
	452
	453	if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
	454
	455	/* Here there are things that won't be known until runtime -- we
	456	* have to assume it could be anything */
	457	invlist = sv_2mortal(_new_invlist(1));
	458	return _add_range_to_invlist(invlist, 0, UV_MAX);
	459	}
	460	else if (ary[INVLIST_INDEX]) {
	461
	462	/* Use the node's inversion list */
	463	invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
	464	}
	465
	466	/* Get the code points valid only under UTF-8 locales */
	467	if ( (flags & ANYOFL_FOLD)
	468	&& av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
	469	{
	470	only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
	471	}
	472	}
	473
	474	if (! invlist) {
	475	invlist = sv_2mortal(_new_invlist(0));
	476	}
	477
	478	/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
	479	* code points, and an inversion list for the others, but if there are code
	480	* points that should match only conditionally on the target string being
	481	* UTF-8, those are placed in the inversion list, and not the bitmap.
	482	* Since there are circumstances under which they could match, they are
	483	* included in the SSC. But if the ANYOF node is to be inverted, we have
	484	* to exclude them here, so that when we invert below, the end result
	485	* actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
	486	* have to do this here before we add the unconditionally matched code
	487	* points */
	488	if (flags & ANYOF_INVERT) {
	489	_invlist_intersection_complement_2nd(invlist,
	490	PL_UpperLatin1,
	491	&invlist);
	492	}
	493
	494	/* Add in the points from the bit map */
	495	if (REGNODE_TYPE(OP(node)) == ANYOF){
	496	for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
	497	if (ANYOF_BITMAP_TEST(node, i)) {
	498	unsigned int start = i++;
	499
	500	for (; i < NUM_ANYOF_CODE_POINTS
	501	&& ANYOF_BITMAP_TEST(node, i); ++i)
	502	{
	503	/* empty */
	504	}
	505	invlist = _add_range_to_invlist(invlist, start, i-1);
	506	new_node_has_latin1 = TRUE;
	507	}
	508	}
	509	}
	510
	511	/* If this can match all upper Latin1 code points, have to add them
	512	* as well. But don't add them if inverting, as when that gets done below,
	513	* it would exclude all these characters, including the ones it shouldn't
	514	* that were added just above */
	515	if ( ! (flags & ANYOF_INVERT)
	516	&& OP(node) == ANYOFD
	517	&& (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
	518	{
	519	_invlist_union(invlist, PL_UpperLatin1, &invlist);
	520	}
	521
	522	/* Similarly for these */
	523	if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
	524	_invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
	525	}
	526
	527	if (flags & ANYOF_INVERT) {
	528	_invlist_invert(invlist);
	529	}
	530	else if (flags & ANYOFL_FOLD) {
	531	if (new_node_has_latin1) {
	532
	533	/* These folds are potential in Turkic locales */
	534	if (_invlist_contains_cp(invlist, 'i')) {
	535	invlist = add_cp_to_invlist(invlist,
	536	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
	537	}
	538	if (_invlist_contains_cp(invlist, 'I')) {
	539	invlist = add_cp_to_invlist(invlist,
	540	LATIN_SMALL_LETTER_DOTLESS_I);
	541	}
	542
	543	/* Under /li, any 0-255 could fold to any other 0-255, depending on
	544	* the locale. We can skip this if there are no 0-255 at all. */
	545	_invlist_union(invlist, PL_Latin1, &invlist);
	546	}
	547	else {
	548	if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
	549	invlist = add_cp_to_invlist(invlist, 'I');
	550	}
	551	if (_invlist_contains_cp(invlist,
	552	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
	553	{
	554	invlist = add_cp_to_invlist(invlist, 'i');
	555	}
	556	}
	557	}
	558
	559	/* Similarly add the UTF-8 locale possible matches. These have to be
	560	* deferred until after the non-UTF-8 locale ones are taken care of just
	561	* above, or it leads to wrong results under ANYOF_INVERT */
	562	if (only_utf8_locale_invlist) {
	563	_invlist_union_maybe_complement_2nd(invlist,
	564	only_utf8_locale_invlist,
	565	flags & ANYOF_INVERT,
	566	&invlist);
	567	}
	568
	569	return invlist;
	570	}
	571
	572	/* 'AND' a given class with another one. Can create false positives. 'ssc'
	573	* should not be inverted. */
	574
	575	STATIC void
	576	S_ssc_and(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc,
	577	const regnode_charclass *and_with)
	578	{
	579	/* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
	580	* another SSC or a regular ANYOF class. Can create false positives. */
	581
	582	SV* anded_cp_list;
	583	U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
	584	? ANYOF_FLAGS(and_with)
	585	: 0;
	586	U8 anded_flags;
	587
	588	PERL_ARGS_ASSERT_SSC_AND;
	589
	590	assert(is_ANYOF_SYNTHETIC(ssc));
	591
	592	/* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
	593	* the code point inversion list and just the relevant flags */
	594	if (is_ANYOF_SYNTHETIC(and_with)) {
	595	anded_cp_list = ((regnode_ssc *)and_with)->invlist;
	596	anded_flags = and_with_flags;
	597
	598	/* XXX This is a kludge around what appears to be deficiencies in the
	599	* optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
	600	* there are paths through the optimizer where it doesn't get weeded
	601	* out when it should. And if we don't make some extra provision for
	602	* it like the code just below, it doesn't get added when it should.
	603	* This solution is to add it only when AND'ing, which is here, and
	604	* only when what is being AND'ed is the pristine, original node
	605	* matching anything. Thus it is like adding it to ssc_anything() but
	606	* only when the result is to be AND'ed. Probably the same solution
	607	* could be adopted for the same problem we have with /l matching,
	608	* which is solved differently in S_ssc_init(), and that would lead to
	609	* fewer false positives than that solution has. But if this solution
	610	* creates bugs, the consequences are only that a warning isn't raised
	611	* that should be; while the consequences for having /l bugs is
	612	* incorrect matches */
	613	if (ssc_is_anything((regnode_ssc *)and_with)) {
	614	anded_flags \|= ANYOF_WARN_SUPER__shared;
	615	}
	616	}
	617	else {
	618	anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
	619	if (OP(and_with) == ANYOFD) {
	620	anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
	621	}
	622	else {
	623	anded_flags = and_with_flags
	624	& ( ANYOF_COMMON_FLAGS
	625	\|ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
	626	\|ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
	627	if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
	628	anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	629	}
	630	}
	631	}
	632
	633	ANYOF_FLAGS(ssc) &= anded_flags;
	634
	635	/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
	636	* C2 is the list of code points in 'and-with'; P2, its posix classes.
	637	* 'and_with' may be inverted. When not inverted, we have the situation of
	638	* computing:
	639	* (C1 \| P1) & (C2 \| P2)
	640	* = (C1 & (C2 \| P2)) \| (P1 & (C2 \| P2))
	641	* = ((C1 & C2) \| (C1 & P2)) \| ((P1 & C2) \| (P1 & P2))
	642	* <= ((C1 & C2) \| P2)) \| ( P1 \| (P1 & P2))
	643	* <= ((C1 & C2) \| P1 \| P2)
	644	* Alternatively, the last few steps could be:
	645	* = ((C1 & C2) \| (C1 & P2)) \| ((P1 & C2) \| (P1 & P2))
	646	* <= ((C1 & C2) \| C1 ) \| ( C2 \| (P1 & P2))
	647	* <= (C1 \| C2 \| (P1 & P2))
	648	* We favor the second approach if either P1 or P2 is non-empty. This is
	649	* because these components are a barrier to doing optimizations, as what
	650	* they match cannot be known until the moment of matching as they are
	651	* dependent on the current locale, 'AND"ing them likely will reduce or
	652	* eliminate them.
	653	* But we can do better if we know that C1,P1 are in their initial state (a
	654	* frequent occurrence), each matching everything:
	655	* (<everything>) & (C2 \| P2) = C2 \| P2
	656	* Similarly, if C2,P2 are in their initial state (again a frequent
	657	* occurrence), the result is a no-op
	658	* (C1 \| P1) & (<everything>) = C1 \| P1
	659	*
	660	* Inverted, we have
	661	* (C1 \| P1) & ~(C2 \| P2) = (C1 \| P1) & (~C2 & ~P2)
	662	* = (C1 & (~C2 & ~P2)) \| (P1 & (~C2 & ~P2))
	663	* <= (C1 & ~C2) \| (P1 & ~P2)
	664	* */
	665
	666	if ((and_with_flags & ANYOF_INVERT)
	667	&& ! is_ANYOF_SYNTHETIC(and_with))
	668	{
	669	unsigned int i;
	670
	671	ssc_intersection(ssc,
	672	anded_cp_list,
	673	FALSE /* Has already been inverted */
	674	);
	675
	676	/* If either P1 or P2 is empty, the intersection will be also; can skip
	677	* the loop */
	678	if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
	679	ANYOF_POSIXL_ZERO(ssc);
	680	}
	681	else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	682
	683	/* Note that the Posix class component P from 'and_with' actually
	684	* looks like:
	685	* P = Pa \| Pb \| ... \| Pn
	686	* where each component is one posix class, such as in [\w\s].
	687	* Thus
	688	* ~P = ~(Pa \| Pb \| ... \| Pn)
	689	* = ~Pa & ~Pb & ... & ~Pn
	690	* <= ~Pa \| ~Pb \| ... \| ~Pn
	691	* The last is something we can easily calculate, but unfortunately
	692	* is likely to have many false positives. We could do better
	693	* in some (but certainly not all) instances if two classes in
	694	* P have known relationships. For example
	695	* :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
	696	* So
	697	* :lower: & :print: = :lower:
	698	* And similarly for classes that must be disjoint. For example,
	699	* since \s and \w can have no elements in common based on rules in
	700	* the POSIX standard,
	701	* \w & ^\S = nothing
	702	* Unfortunately, some vendor locales do not meet the Posix
	703	* standard, in particular almost everything by Microsoft.
	704	* The loop below just changes e.g., \w into \W and vice versa */
	705
	706	regnode_charclass_posixl temp;
	707	int add = 1; /* To calculate the index of the complement */
	708
	709	Zero(&temp, 1, regnode_charclass_posixl);
	710	ANYOF_POSIXL_ZERO(&temp);
	711	for (i = 0; i < ANYOF_MAX; i++) {
	712	assert(i % 2 != 0
	713	\|\| ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
	714	\|\| ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
	715
	716	if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
	717	ANYOF_POSIXL_SET(&temp, i + add);
	718	}
	719	add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
	720	}
	721	ANYOF_POSIXL_AND(&temp, ssc);
	722
	723	} /* else ssc already has no posixes */
	724	} /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
	725	in its initial state */
	726	else if (! is_ANYOF_SYNTHETIC(and_with)
	727	\|\| ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
	728	{
	729	/* But if 'ssc' is in its initial state, the result is just 'and_with';
	730	* copy it over 'ssc' */
	731	if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
	732	if (is_ANYOF_SYNTHETIC(and_with)) {
	733	StructCopy(and_with, ssc, regnode_ssc);
	734	}
	735	else {
	736	ssc->invlist = anded_cp_list;
	737	ANYOF_POSIXL_ZERO(ssc);
	738	if (and_with_flags & ANYOF_MATCHES_POSIXL) {
	739	ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
	740	}
	741	}
	742	}
	743	else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
	744	\|\| (and_with_flags & ANYOF_MATCHES_POSIXL))
	745	{
	746	/* One or the other of P1, P2 is non-empty. */
	747	if (and_with_flags & ANYOF_MATCHES_POSIXL) {
	748	ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
	749	}
	750	ssc_union(ssc, anded_cp_list, FALSE);
	751	}
	752	else { /* P1 = P2 = empty */
	753	ssc_intersection(ssc, anded_cp_list, FALSE);
	754	}
	755	}
	756	}
	757
	758	STATIC void
	759	S_ssc_or(pTHX_ const RExC_state_t pRExC_state, regnode_ssc ssc,
	760	const regnode_charclass *or_with)
	761	{
	762	/* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
	763	* another SSC or a regular ANYOF class. Can create false positives if
	764	* 'or_with' is to be inverted. */
	765
	766	SV* ored_cp_list;
	767	U8 ored_flags;
	768	U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
	769	? ANYOF_FLAGS(or_with)
	770	: 0;
	771
	772	PERL_ARGS_ASSERT_SSC_OR;
	773
	774	assert(is_ANYOF_SYNTHETIC(ssc));
	775
	776	/* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
	777	* the code point inversion list and just the relevant flags */
	778	if (is_ANYOF_SYNTHETIC(or_with)) {
	779	ored_cp_list = ((regnode_ssc*) or_with)->invlist;
	780	ored_flags = or_with_flags;
	781	}
	782	else {
	783	ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
	784	ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
	785	if (OP(or_with) != ANYOFD) {
	786	ored_flags \|=
	787	or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
	788	\|ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
	789	if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
	790	ored_flags \|= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
	791	}
	792	}
	793	}
	794
	795	ANYOF_FLAGS(ssc) \|= ored_flags;
	796
	797	/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
	798	* C2 is the list of code points in 'or-with'; P2, its posix classes.
	799	* 'or_with' may be inverted. When not inverted, we have the simple
	800	* situation of computing:
	801	* (C1 \| P1) \| (C2 \| P2) = (C1 \| C2) \| (P1 \| P2)
	802	* If P1\|P2 yields a situation with both a class and its complement are
	803	* set, like having both \w and \W, this matches all code points, and we
	804	* can delete these from the P component of the ssc going forward. XXX We
	805	* might be able to delete all the P components, but I (khw) am not certain
	806	* about this, and it is better to be safe.
	807	*
	808	* Inverted, we have
	809	* (C1 \| P1) \| ~(C2 \| P2) = (C1 \| P1) \| (~C2 & ~P2)
	810	* <= (C1 \| P1) \| ~C2
	811	* <= (C1 \| ~C2) \| P1
	812	* (which results in actually simpler code than the non-inverted case)
	813	* */
	814
	815	if ((or_with_flags & ANYOF_INVERT)
	816	&& ! is_ANYOF_SYNTHETIC(or_with))
	817	{
	818	/* We ignore P2, leaving P1 going forward */
	819	} /* else Not inverted */
	820	else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
	821	ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
	822	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	823	unsigned int i;
	824	for (i = 0; i < ANYOF_MAX; i += 2) {
	825	if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
	826	{
	827	ssc_match_all_cp(ssc);
	828	ANYOF_POSIXL_CLEAR(ssc, i);
	829	ANYOF_POSIXL_CLEAR(ssc, i+1);
	830	}
	831	}
	832	}
	833	}
	834
	835	ssc_union(ssc,
	836	ored_cp_list,
	837	FALSE /* Already has been inverted */
	838	);
	839	}
	840
	841	STATIC void
	842	S_ssc_union(pTHX_ regnode_ssc ssc, SV const invlist, const bool invert2nd)
	843	{
	844	PERL_ARGS_ASSERT_SSC_UNION;
	845
	846	assert(is_ANYOF_SYNTHETIC(ssc));
	847
	848	_invlist_union_maybe_complement_2nd(ssc->invlist,
	849	invlist,
	850	invert2nd,
	851	&ssc->invlist);
	852	}
	853
	854	STATIC void
	855	S_ssc_intersection(pTHX_ regnode_ssc *ssc,
	856	SV* const invlist,
	857	const bool invert2nd)
	858	{
	859	PERL_ARGS_ASSERT_SSC_INTERSECTION;
	860
	861	assert(is_ANYOF_SYNTHETIC(ssc));
	862
	863	_invlist_intersection_maybe_complement_2nd(ssc->invlist,
	864	invlist,
	865	invert2nd,
	866	&ssc->invlist);
	867	}
	868
	869	STATIC void
	870	S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
	871	{
	872	PERL_ARGS_ASSERT_SSC_ADD_RANGE;
	873
	874	assert(is_ANYOF_SYNTHETIC(ssc));
	875
	876	ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
	877	}
	878
	879	STATIC void
	880	S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
	881	{
	882	/* AND just the single code point 'cp' into the SSC 'ssc' */
	883
	884	SV* cp_list = _new_invlist(2);
	885
	886	PERL_ARGS_ASSERT_SSC_CP_AND;
	887
	888	assert(is_ANYOF_SYNTHETIC(ssc));
	889
	890	cp_list = add_cp_to_invlist(cp_list, cp);
	891	ssc_intersection(ssc, cp_list,
	892	FALSE /* Not inverted */
	893	);
	894	SvREFCNT_dec_NN(cp_list);
	895	}
	896
	897	STATIC void
	898	S_ssc_clear_locale(regnode_ssc *ssc)
	899	{
	900	/* Set the SSC 'ssc' to not match any locale things */
	901	PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
	902
	903	assert(is_ANYOF_SYNTHETIC(ssc));
	904
	905	ANYOF_POSIXL_ZERO(ssc);
	906	ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
	907	}
	908
	909	bool
	910	Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
	911	{
	912	/* The synthetic start class is used to hopefully quickly winnow down
	913	* places where a pattern could start a match in the target string. If it
	914	* doesn't really narrow things down that much, there isn't much point to
	915	* having the overhead of using it. This function uses some very crude
	916	* heuristics to decide if to use the ssc or not.
	917	*
	918	* It returns TRUE if 'ssc' rules out more than half what it considers to
	919	* be the "likely" possible matches, but of course it doesn't know what the
	920	* actual things being matched are going to be; these are only guesses
	921	*
	922	* For /l matches, it assumes that the only likely matches are going to be
	923	* in the 0-255 range, uniformly distributed, so half of that is 127
	924	* For /a and /d matches, it assumes that the likely matches will be just
	925	* the ASCII range, so half of that is 63
	926	* For /u and there isn't anything matching above the Latin1 range, it
	927	* assumes that that is the only range likely to be matched, and uses
	928	* half that as the cut-off: 127. If anything matches above Latin1,
	929	* it assumes that all of Unicode could match (uniformly), except for
	930	* non-Unicode code points and things in the General Category "Other"
	931	* (unassigned, private use, surrogates, controls and formats). This
	932	* is a much large number. */
	933
	934	U32 count = 0; /* Running total of number of code points matched by
	935	'ssc' */
	936	UV start, end; /* Start and end points of current range in inversion
	937	XXX outdated. UTF-8 locales are common, what about invert? list */
	938	const U32 max_code_points = (LOC)
	939	? 256
	940	: (( ! UNI_SEMANTICS
	941	\|\| invlist_highest(ssc->invlist) < 256)
	942	? 128
	943	: NON_OTHER_COUNT);
	944	const U32 max_match = max_code_points / 2;
	945
	946	PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
	947
	948	invlist_iterinit(ssc->invlist);
	949	while (invlist_iternext(ssc->invlist, &start, &end)) {
	950	if (start >= max_code_points) {
	951	break;
	952	}
	953	end = MIN(end, max_code_points - 1);
	954	count += end - start + 1;
	955	if (count >= max_match) {
	956	invlist_iterfinish(ssc->invlist);
	957	return FALSE;
	958	}
	959	}
	960
	961	return TRUE;
	962	}
	963
	964
	965	void
	966	Perl_ssc_finalize(pTHX_ RExC_state_t pRExC_state, regnode_ssc ssc)
	967	{
	968	/* The inversion list in the SSC is marked mortal; now we need a more
	969	* permanent copy, which is stored the same way that is done in a regular
	970	* ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
	971	* map */
	972
	973	SV* invlist = invlist_clone(ssc->invlist, NULL);
	974
	975	PERL_ARGS_ASSERT_SSC_FINALIZE;
	976
	977	assert(is_ANYOF_SYNTHETIC(ssc));
	978
	979	/* The code in this file assumes that all but these flags aren't relevant
	980	* to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
	981	* by the time we reach here */
	982	assert(! (ANYOF_FLAGS(ssc)
	983	& ~( ANYOF_COMMON_FLAGS
	984	\|ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
	985	\|ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
	986
	987	populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
	988
	989	set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
	990	SvREFCNT_dec(invlist);
	991
	992	/* Make sure is clone-safe */
	993	ssc->invlist = NULL;
	994
	995	if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
	996	ANYOF_FLAGS(ssc) \|= ANYOF_MATCHES_POSIXL;
	997	OP(ssc) = ANYOFPOSIXL;
	998	}
	999	else if (RExC_contains_locale) {
	1000	OP(ssc) = ANYOFL;
	1001	}
	1002
	1003	assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) \|\| RExC_contains_locale);
	1004	}
	1005
	1006	/* The below joins as many adjacent EXACTish nodes as possible into a single
	1007	* one. The regop may be changed if the node(s) contain certain sequences that
	1008	* require special handling. The joining is only done if:
	1009	* 1) there is room in the current conglomerated node to entirely contain the
	1010	* next one.
	1011	* 2) they are compatible node types
	1012	*
	1013	* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
	1014	* these get optimized out
	1015	*
	1016	* XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
	1017	* as possible, even if that means splitting an existing node so that its first
	1018	* part is moved to the preceding node. This would maximise the efficiency of
	1019	* memEQ during matching.
	1020	*
	1021	* If a node is to match under /i (folded), the number of characters it matches
	1022	* can be different than its character length if it contains a multi-character
	1023	* fold. *min_subtract is set to the total delta number of characters of the
	1024	* input nodes.
	1025	*
	1026	* And *unfolded_multi_char is set to indicate whether or not the node contains
	1027	* an unfolded multi-char fold. This happens when it won't be known until
	1028	* runtime whether the fold is valid or not; namely
	1029	* 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
	1030	* target string being matched against turns out to be UTF-8 is that fold
	1031	* valid; or
	1032	* 2) for EXACTFL nodes whose folding rules depend on the locale in force at
	1033	* runtime.
	1034	* (Multi-char folds whose components are all above the Latin1 range are not
	1035	* run-time locale dependent, and have already been folded by the time this
	1036	* function is called.)
	1037	*
	1038	* This is as good a place as any to discuss the design of handling these
	1039	* multi-character fold sequences. It's been wrong in Perl for a very long
	1040	* time. There are three code points in Unicode whose multi-character folds
	1041	* were long ago discovered to mess things up. The previous designs for
	1042	* dealing with these involved assigning a special node for them. This
	1043	* approach doesn't always work, as evidenced by this example:
	1044	* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
	1045	* Both sides fold to "sss", but if the pattern is parsed to create a node that
	1046	* would match just the \xDF, it won't be able to handle the case where a
	1047	* successful match would have to cross the node's boundary. The new approach
	1048	* that hopefully generally solves the problem generates an EXACTFUP node
	1049	* that is "sss" in this case.
	1050	*
	1051	* It turns out that there are problems with all multi-character folds, and not
	1052	* just these three. Now the code is general, for all such cases. The
	1053	* approach taken is:
	1054	* 1) This routine examines each EXACTFish node that could contain multi-
	1055	* character folded sequences. Since a single character can fold into
	1056	* such a sequence, the minimum match length for this node is less than
	1057	* the number of characters in the node. This routine returns in
	1058	* *min_subtract how many characters to subtract from the actual
	1059	* length of the string to get a real minimum match length; it is 0 if
	1060	* there are no multi-char foldeds. This delta is used by the caller to
	1061	* adjust the min length of the match, and the delta between min and max,
	1062	* so that the optimizer doesn't reject these possibilities based on size
	1063	* constraints.
	1064	*
	1065	* 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
	1066	* under /u, we fold it to 'ss' in regatom(), and in this routine, after
	1067	* joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
	1068	* EXACTFU nodes. The node type of such nodes is then changed to
	1069	* EXACTFUP, indicating it is problematic, and needs careful handling.
	1070	* (The procedures in step 1) above are sufficient to handle this case in
	1071	* UTF-8 encoded nodes.) The reason this is problematic is that this is
	1072	* the only case where there is a possible fold length change in non-UTF-8
	1073	* patterns. By reserving a special node type for problematic cases, the
	1074	* far more common regular EXACTFU nodes can be processed faster.
	1075	* regexec.c takes advantage of this.
	1076	*
	1077	* EXACTFUP has been created as a grab-bag for (hopefully uncommon)
	1078	* problematic cases. These all only occur when the pattern is not
	1079	* UTF-8. In addition to the 'ss' sequence where there is a possible fold
	1080	* length change, it handles the situation where the string cannot be
	1081	* entirely folded. The strings in an EXACTFish node are folded as much
	1082	* as possible during compilation in regcomp.c. This saves effort in
	1083	* regex matching. By using an EXACTFUP node when it is not possible to
	1084	* fully fold at compile time, regexec.c can know that everything in an
	1085	* EXACTFU node is folded, so folding can be skipped at runtime. The only
	1086	* case where folding in EXACTFU nodes can't be done at compile time is
	1087	* the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
	1088	* is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
	1089	* handle two very different cases. Alternatively, there could have been
	1090	* a node type where there are length changes, one for unfolded, and one
	1091	* for both. If yet another special case needed to be created, the number
	1092	* of required node types would have to go to 7. khw figures that even
	1093	* though there are plenty of node types to spare, that the maintenance
	1094	* cost wasn't worth the small speedup of doing it that way, especially
	1095	* since he thinks the MICRO SIGN is rarely encountered in practice.
	1096	*
	1097	* There are other cases where folding isn't done at compile time, but
	1098	* none of them are under /u, and hence not for EXACTFU nodes. The folds
	1099	* in EXACTFL nodes aren't known until runtime, and vary as the locale
	1100	* changes. Some folds in EXACTF depend on if the runtime target string
	1101	* is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
	1102	* when no fold in it depends on the UTF-8ness of the target string.)
	1103	*
	1104	* 3) A problem remains for unfolded multi-char folds. (These occur when the
	1105	* validity of the fold won't be known until runtime, and so must remain
	1106	* unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
	1107	* nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
	1108	* be an EXACTF node with a UTF-8 pattern.) They also occur for various
	1109	* folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
	1110	* The reason this is a problem is that the optimizer part of regexec.c
	1111	* (probably unwittingly, in Perl_regexec_flags()) makes an assumption
	1112	* that a character in the pattern corresponds to at most a single
	1113	* character in the target string. (And I do mean character, and not byte
	1114	* here, unlike other parts of the documentation that have never been
	1115	* updated to account for multibyte Unicode.) Sharp s in EXACTF and
	1116	* EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
	1117	* nodes it can match "\x{17F}\x{17F}". These, along with other ones in
	1118	* EXACTFL nodes, violate the assumption, and they are the only instances
	1119	* where it is violated. I'm reluctant to try to change the assumption,
	1120	* as the code involved is impenetrable to me (khw), so instead the code
	1121	* here punts. This routine examines EXACTFL nodes, and (when the pattern
	1122	* isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
	1123	* boolean indicating whether or not the node contains such a fold. When
	1124	* it is true, the caller sets a flag that later causes the optimizer in
	1125	* this file to not set values for the floating and fixed string lengths,
	1126	* and thus avoids the optimizer code in regexec.c that makes the invalid
	1127	* assumption. Thus, there is no optimization based on string lengths for
	1128	* EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
	1129	* EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
	1130	* assumption is wrong only in these cases is that all other non-UTF-8
	1131	* folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
	1132	* their expanded versions. (Again, we can't prefold sharp s to 'ss' in
	1133	* EXACTF nodes because we don't know at compile time if it actually
	1134	* matches 'ss' or not. For EXACTF nodes it will match iff the target
	1135	* string is in UTF-8. This is in contrast to EXACTFU nodes, where it
	1136	* always matches; and EXACTFAA where it never does. In an EXACTFAA node
	1137	* in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
	1138	* problem; but in a non-UTF8 pattern, folding it to that above-Latin1
	1139	* string would require the pattern to be forced into UTF-8, the overhead
	1140	* of which we want to avoid. Similarly the unfolded multi-char folds in
	1141	* EXACTFL nodes will match iff the locale at the time of match is a UTF-8
	1142	* locale.)
	1143	*
	1144	* Similarly, the code that generates tries doesn't currently handle
	1145	* not-already-folded multi-char folds, and it looks like a pain to change
	1146	* that. Therefore, trie generation of EXACTFAA nodes with the sharp s
	1147	* doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
	1148	* EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
	1149	* using /iaa matching will be doing so almost entirely with ASCII
	1150	* strings, so this should rarely be encountered in practice */
	1151
	1152	U32
	1153	Perl_join_exact(pTHX_ RExC_state_t pRExC_state, regnode scan,
	1154	UV min_subtract, bool unfolded_multi_char,
	1155	U32 flags, regnode *val, U32 depth)
	1156	{
	1157	/* Merge several consecutive EXACTish nodes into one. */
	1158
	1159	regnode *n = regnext(scan);
	1160	U32 stringok = 1;
	1161	regnode *next = REGNODE_AFTER_varies(scan);
	1162	U32 merged = 0;
	1163	U32 stopnow = 0;
	1164	#ifdef DEBUGGING
	1165	regnode *stop = scan;
	1166	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	1167	#else
	1168	PERL_UNUSED_ARG(depth);
	1169	#endif
	1170
	1171	PERL_ARGS_ASSERT_JOIN_EXACT;
	1172	#ifndef EXPERIMENTAL_INPLACESCAN
	1173	PERL_UNUSED_ARG(flags);
	1174	PERL_UNUSED_ARG(val);
	1175	#endif
	1176	DEBUG_PEEP("join", scan, depth, 0);
	1177
	1178	assert(REGNODE_TYPE(OP(scan)) == EXACT);
	1179
	1180	/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
	1181	* EXACT ones that are mergeable to the current one. */
	1182	while ( n
	1183	&& ( REGNODE_TYPE(OP(n)) == NOTHING
	1184	\|\| (stringok && REGNODE_TYPE(OP(n)) == EXACT))
	1185	&& NEXT_OFF(n)
	1186	&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
	1187	{
	1188
	1189	if (OP(n) == TAIL \|\| n > next)
	1190	stringok = 0;
	1191	if (REGNODE_TYPE(OP(n)) == NOTHING) {
	1192	DEBUG_PEEP("skip:", n, depth, 0);
	1193	NEXT_OFF(scan) += NEXT_OFF(n);
	1194	next = n + NODE_STEP_REGNODE;
	1195	#ifdef DEBUGGING
	1196	if (stringok)
	1197	stop = n;
	1198	#endif
	1199	n = regnext(n);
	1200	}
	1201	else if (stringok) {
	1202	const unsigned int oldl = STR_LEN(scan);
	1203	regnode * const nnext = regnext(n);
	1204
	1205	/* XXX I (khw) kind of doubt that this works on platforms (should
	1206	* Perl ever run on one) where U8_MAX is above 255 because of lots
	1207	* of other assumptions */
	1208	/* Don't join if the sum can't fit into a single node */
	1209	if (oldl + STR_LEN(n) > U8_MAX)
	1210	break;
	1211
	1212	/* Joining something that requires UTF-8 with something that
	1213	* doesn't, means the result requires UTF-8. */
	1214	if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
	1215	OP(scan) = EXACT_REQ8;
	1216	}
	1217	else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
	1218	; /* join is compatible, no need to change OP */
	1219	}
	1220	else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
	1221	OP(scan) = EXACTFU_REQ8;
	1222	}
	1223	else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
	1224	; /* join is compatible, no need to change OP */
	1225	}
	1226	else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
	1227	; /* join is compatible, no need to change OP */
	1228	}
	1229	else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
	1230
	1231	/* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
	1232	* which can join with EXACTFU ones. We check for this case
	1233	* here. These need to be resolved to either EXACTFU or
	1234	* EXACTF at joining time. They have nothing in them that
	1235	* would forbid them from being the more desirable EXACTFU
	1236	* nodes except that they begin and/or end with a single [Ss].
	1237	* The reason this is problematic is because they could be
	1238	* joined in this loop with an adjacent node that ends and/or
	1239	* begins with [Ss] which would then form the sequence 'ss',
	1240	* which matches differently under /di than /ui, in which case
	1241	* EXACTFU can't be used. If the 'ss' sequence doesn't get
	1242	* formed, the nodes get absorbed into any adjacent EXACTFU
	1243	* node. And if the only adjacent node is EXACTF, they get
	1244	* absorbed into that, under the theory that a longer node is
	1245	* better than two shorter ones, even if one is EXACTFU. Note
	1246	* that EXACTFU_REQ8 is generated only for UTF-8 patterns,
	1247	* and the EXACTFU_S_EDGE ones only for non-UTF-8. */
	1248
	1249	if (STRING(n)[STR_LEN(n)-1] == 's') {
	1250
	1251	/* Here the joined node would end with 's'. If the node
	1252	* following the combination is an EXACTF one, it's better to
	1253	* join this trailing edge 's' node with that one, leaving the
	1254	* current one in 'scan' be the more desirable EXACTFU */
	1255	if (OP(nnext) == EXACTF) {
	1256	break;
	1257	}
	1258
	1259	OP(scan) = EXACTFU_S_EDGE;
	1260
	1261	} /* Otherwise, the beginning 's' of the 2nd node just
	1262	becomes an interior 's' in 'scan' */
	1263	}
	1264	else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
	1265	; /* join is compatible, no need to change OP */
	1266	}
	1267	else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
	1268
	1269	/* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
	1270	* nodes. But the latter nodes can be also joined with EXACTFU
	1271	* ones, and that is a better outcome, so if the node following
	1272	* 'n' is EXACTFU, quit now so that those two can be joined
	1273	* later */
	1274	if (OP(nnext) == EXACTFU) {
	1275	break;
	1276	}
	1277
	1278	/* The join is compatible, and the combined node will be
	1279	* EXACTF. (These don't care if they begin or end with 's' */
	1280	}
	1281	else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
	1282	if ( STRING(scan)[STR_LEN(scan)-1] == 's'
	1283	&& STRING(n)[0] == 's')
	1284	{
	1285	/* When combined, we have the sequence 'ss', which means we
	1286	* have to remain /di */
	1287	OP(scan) = EXACTF;
	1288	}
	1289	}
	1290	else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
	1291	if (STRING(n)[0] == 's') {
	1292	; /* Here the join is compatible and the combined node
	1293	starts with 's', no need to change OP */
	1294	}
	1295	else { /* Now the trailing 's' is in the interior */
	1296	OP(scan) = EXACTFU;
	1297	}
	1298	}
	1299	else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
	1300
	1301	/* The join is compatible, and the combined node will be
	1302	* EXACTF. (These don't care if they begin or end with 's' */
	1303	OP(scan) = EXACTF;
	1304	}
	1305	else if (OP(scan) != OP(n)) {
	1306
	1307	/* The only other compatible joinings are the same node type */
	1308	break;
	1309	}
	1310
	1311	DEBUG_PEEP("merg", n, depth, 0);
	1312	merged++;
	1313
	1314	next = REGNODE_AFTER_varies(n);
	1315	NEXT_OFF(scan) += NEXT_OFF(n);
	1316	assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
	1317	setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
	1318	/* Now we can overwrite n : /
	1319	Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
	1320	#ifdef DEBUGGING
	1321	stop = next - 1;
	1322	#endif
	1323	n = nnext;
	1324	if (stopnow) break;
	1325	}
	1326
	1327	#ifdef EXPERIMENTAL_INPLACESCAN
	1328	if (flags && !NEXT_OFF(n)) {
	1329	DEBUG_PEEP("atch", val, depth, 0);
	1330	if (REGNODE_OFF_BY_ARG(OP(n))) {
	1331	ARG_SET(n, val - n);
	1332	}
	1333	else {
	1334	NEXT_OFF(n) = val - n;
	1335	}
	1336	stopnow = 1;
	1337	}
	1338	#endif
	1339	}
	1340
	1341	/* This temporary node can now be turned into EXACTFU, and must, as
	1342	* regexec.c doesn't handle it */
	1343	if (OP(scan) == EXACTFU_S_EDGE) {
	1344	OP(scan) = EXACTFU;
	1345	}
	1346
	1347	*min_subtract = 0;
	1348	*unfolded_multi_char = FALSE;
	1349
	1350	/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
	1351	* can now analyze for sequences of problematic code points. (Prior to
	1352	* this final joining, sequences could have been split over boundaries, and
	1353	* hence missed). The sequences only happen in folding, hence for any
	1354	* non-EXACT EXACTish node */
	1355	if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
	1356	U8* s0 = (U8*) STRING(scan);
	1357	U8* s = s0;
	1358	U8* s_end = s0 + STR_LEN(scan);
	1359
	1360	int total_count_delta = 0; /* Total delta number of characters that
	1361	multi-char folds expand to */
	1362
	1363	/* One pass is made over the node's string looking for all the
	1364	* possibilities. To avoid some tests in the loop, there are two main
	1365	* cases, for UTF-8 patterns (which can't have EXACTF nodes) and
	1366	* non-UTF-8 */
	1367	if (UTF) {
	1368	U8* folded = NULL;
	1369
	1370	if (OP(scan) == EXACTFL) {
	1371	U8 *d;
	1372
	1373	/* An EXACTFL node would already have been changed to another
	1374	* node type unless there is at least one character in it that
	1375	* is problematic; likely a character whose fold definition
	1376	* won't be known until runtime, and so has yet to be folded.
	1377	* For all but the UTF-8 locale, folds are 1-1 in length, but
	1378	* to handle the UTF-8 case, we need to create a temporary
	1379	* folded copy using UTF-8 locale rules in order to analyze it.
	1380	* This is because our macros that look to see if a sequence is
	1381	* a multi-char fold assume everything is folded (otherwise the
	1382	* tests in those macros would be too complicated and slow).
	1383	* Note that here, the non-problematic folds will have already
	1384	* been done, so we can just copy such characters. We actually
	1385	* don't completely fold the EXACTFL string. We skip the
	1386	* unfolded multi-char folds, as that would just create work
	1387	* below to figure out the size they already are */
	1388
	1389	Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
	1390	d = folded;
	1391	while (s < s_end) {
	1392	STRLEN s_len = UTF8SKIP(s);
	1393	if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
	1394	Copy(s, d, s_len, U8);
	1395	d += s_len;
	1396	}
	1397	else if (is_FOLDS_TO_MULTI_utf8(s)) {
	1398	*unfolded_multi_char = TRUE;
	1399	Copy(s, d, s_len, U8);
	1400	d += s_len;
	1401	}
	1402	else if (isASCII(*s)) {
	1403	(d++) = toFOLD(s);
	1404	}
	1405	else {
	1406	STRLEN len;
	1407	_toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
	1408	d += len;
	1409	}
	1410	s += s_len;
	1411	}
	1412
	1413	/* Point the remainder of the routine to look at our temporary
	1414	* folded copy */
	1415	s = folded;
	1416	s_end = d;
	1417	} /* End of creating folded copy of EXACTFL string */
	1418
	1419	/* Examine the string for a multi-character fold sequence. UTF-8
	1420	* patterns have all characters pre-folded by the time this code is
	1421	* executed */
	1422	while (s < s_end - 1) /* Can stop 1 before the end, as minimum
	1423	length sequence we are looking for is 2 */
	1424	{
	1425	int count = 0; /* How many characters in a multi-char fold */
	1426	int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
	1427	if (! len) { /* Not a multi-char fold: get next char */
	1428	s += UTF8SKIP(s);
	1429	continue;
	1430	}
	1431
	1432	{ /* Here is a generic multi-char fold. */
	1433	U8* multi_end = s + len;
	1434
	1435	/* Count how many characters are in it. In the case of
	1436	* /aa, no folds which contain ASCII code points are
	1437	* allowed, so check for those, and skip if found. */
	1438	if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
	1439	count = utf8_length(s, multi_end);
	1440	s = multi_end;
	1441	}
	1442	else {
	1443	while (s < multi_end) {
	1444	if (isASCII(*s)) {
	1445	s++;
	1446	goto next_iteration;
	1447	}
	1448	else {
	1449	s += UTF8SKIP(s);
	1450	}
	1451	count++;
	1452	}
	1453	}
	1454	}
	1455
	1456	/* The delta is how long the sequence is minus 1 (1 is how long
	1457	* the character that folds to the sequence is) */
	1458	total_count_delta += count - 1;
	1459	next_iteration: ;
	1460	}
	1461
	1462	/* We created a temporary folded copy of the string in EXACTFL
	1463	* nodes. Therefore we need to be sure it doesn't go below zero,
	1464	* as the real string could be shorter */
	1465	if (OP(scan) == EXACTFL) {
	1466	int total_chars = utf8_length((U8*) STRING(scan),
	1467	(U8*) STRING(scan) + STR_LEN(scan));
	1468	if (total_count_delta > total_chars) {
	1469	total_count_delta = total_chars;
	1470	}
	1471	}
	1472
	1473	*min_subtract += total_count_delta;
	1474	Safefree(folded);
	1475	}
	1476	else if (OP(scan) == EXACTFAA) {
	1477
	1478	/* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
	1479	* fold to the ASCII range (and there are no existing ones in the
	1480	* upper latin1 range). But, as outlined in the comments preceding
	1481	* this function, we need to flag any occurrences of the sharp s.
	1482	* This character forbids trie formation (because of added
	1483	* complexity) */
	1484	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	1485	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	1486	\|\| UNICODE_DOT_DOT_VERSION > 0)
	1487	while (s < s_end) {
	1488	if (*s == LATIN_SMALL_LETTER_SHARP_S) {
	1489	OP(scan) = EXACTFAA_NO_TRIE;
	1490	*unfolded_multi_char = TRUE;
	1491	break;
	1492	}
	1493	s++;
	1494	}
	1495	}
	1496	else if (OP(scan) != EXACTFAA_NO_TRIE) {
	1497
	1498	/* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
	1499	* folds that are all Latin1. As explained in the comments
	1500	* preceding this function, we look also for the sharp s in EXACTF
	1501	* and EXACTFL nodes; it can be in the final position. Otherwise
	1502	* we can stop looking 1 byte earlier because have to find at least
	1503	* two characters for a multi-fold */
	1504	const U8* upper = (OP(scan) == EXACTF \|\| OP(scan) == EXACTFL)
	1505	? s_end
	1506	: s_end -1;
	1507
	1508	while (s < upper) {
	1509	int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
	1510	if (! len) { /* Not a multi-char fold. */
	1511	if (*s == LATIN_SMALL_LETTER_SHARP_S
	1512	&& (OP(scan) == EXACTF \|\| OP(scan) == EXACTFL))
	1513	{
	1514	*unfolded_multi_char = TRUE;
	1515	}
	1516	s++;
	1517	continue;
	1518	}
	1519
	1520	if (len == 2
	1521	&& isALPHA_FOLD_EQ(*s, 's')
	1522	&& isALPHA_FOLD_EQ(*(s+1), 's'))
	1523	{
	1524
	1525	/* EXACTF nodes need to know that the minimum length
	1526	* changed so that a sharp s in the string can match this
	1527	* ss in the pattern, but they remain EXACTF nodes, as they
	1528	* won't match this unless the target string is in UTF-8,
	1529	* which we don't know until runtime. EXACTFL nodes can't
	1530	* transform into EXACTFU nodes */
	1531	if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
	1532	OP(scan) = EXACTFUP;
	1533	}
	1534	}
	1535
	1536	*min_subtract += len - 1;
	1537	s += len;
	1538	}
	1539	#endif
	1540	}
	1541	}
	1542
	1543	#ifdef DEBUGGING
	1544	/* Allow dumping but overwriting the collection of skipped
	1545	* ops and/or strings with fake optimized ops */
	1546	n = REGNODE_AFTER_varies(scan);
	1547	while (n <= stop) {
	1548	OP(n) = OPTIMIZED;
	1549	FLAGS(n) = 0;
	1550	NEXT_OFF(n) = 0;
	1551	n++;
	1552	}
	1553	#endif
	1554	DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
	1555	return stopnow;
	1556	}
	1557
	1558	/* REx optimizer. Converts nodes into quicker variants "in place".
	1559	Finds fixed substrings. */
	1560
	1561
	1562	/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
	1563	to the position after last scanned or to NULL. */
	1564
	1565	/* the return from this sub is the minimum length that could possibly match */
	1566	SSize_t
	1567	Perl_study_chunk(pTHX_
	1568	RExC_state_t *pRExC_state,
	1569	regnode *scanp, / Start here (read-write). */
	1570	SSize_t minlenp, / used for the minlen of substrings? */
	1571	SSize_t deltap, / Write maxlen-minlen here. */
	1572	regnode last, / Stop before this one. */
	1573	scan_data_t data, / string data about the pattern */
	1574	I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
	1575	U32 recursed_depth, /* how deep have we recursed via GOSUB */
	1576	regnode_ssc and_withp, / Valid if flags & SCF_DO_STCLASS_OR */
	1577	U32 flags, /* flags controlling this call, see SCF_ flags */
	1578	U32 depth, /* how deep have we recursed period */
	1579	bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
	1580	FALSE only if the caller (recursively) was
	1581	prohibited from modifying the regops, because
	1582	a higher caller is holding a ptr to them. */
	1583	)
	1584	{
	1585	/* vars about the regnodes we are working with */
	1586	regnode scan = scanp; /* the current opcode we are inspecting */
	1587	regnode next = NULL; / the next opcode beyond scan, tmp var */
	1588	regnode first_non_open = scan; / FIXME: should this init to NULL?
	1589	the first non open regop, if the init
	1590	val IS an OPEN then we will skip past
	1591	it just after the var decls section */
	1592	I32 code = 0; /* temp var used to hold the optype of a regop */
	1593
	1594	/* vars about the min and max length of the pattern */
	1595	SSize_t min = 0; /* min length of this part of the pattern */
	1596	SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
	1597	this is adjusted down if we find
	1598	an ACCEPT */
	1599	SSize_t delta = 0; /* difference between min and max length
	1600	(not accounting for stopmin) */
	1601
	1602	/* vars about capture buffers in the pattern */
	1603	I32 pars = 0; /* count of OPEN opcodes */
	1604	I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
	1605
	1606	/* vars about whether this pattern contains something that can match
	1607	* infinitely long strings, eg, X* or X+ */
	1608	int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
	1609	int is_inf_internal = 0; /* The studied chunk is infinite */
	1610
	1611	/* scan_data_t (struct) is used to hold information about the substrings
	1612	* and start class we have extracted from the string */
	1613	scan_data_t data_fake; /* temp var used for recursing in some cases */
	1614
	1615	SV re_trie_maxbuff = NULL; / temp var used to hold whether we can do
	1616	trie optimizations */
	1617
	1618	scan_frame frame = NULL; / used as part of fake recursion */
	1619
	1620	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	1621
	1622	PERL_ARGS_ASSERT_STUDY_CHUNK;
	1623	RExC_study_started= 1;
	1624
	1625	Zero(&data_fake, 1, scan_data_t);
	1626
	1627	if ( depth == 0 ) {
	1628	while (first_non_open && OP(first_non_open) == OPEN)
	1629	first_non_open=regnext(first_non_open);
	1630	}
	1631
	1632	fake_study_recurse:
	1633	DEBUG_r(
	1634	RExC_study_chunk_recursed_count++;
	1635	);
	1636	DEBUG_OPTIMISE_MORE_r(
	1637	{
	1638	Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
	1639	depth, (long)stopparen,
	1640	(unsigned long)RExC_study_chunk_recursed_count,
	1641	(unsigned long)depth, (unsigned long)recursed_depth,
	1642	scan,
	1643	last);
	1644	if (recursed_depth) {
	1645	U32 i;
	1646	U32 j;
	1647	for ( j = 0 ; j < recursed_depth ; j++ ) {
	1648	for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
	1649	if (PAREN_TEST(j, i) && (!j \|\| !PAREN_TEST(j - 1, i))) {
	1650	Perl_re_printf( aTHX_ " %d",(int)i);
	1651	break;
	1652	}
	1653	}
	1654	if ( j + 1 < recursed_depth ) {
	1655	Perl_re_printf( aTHX_ ",");
	1656	}
	1657	}
	1658	}
	1659	Perl_re_printf( aTHX_ "\n");
	1660	}
	1661	);
	1662	while ( scan && OP(scan) != END && scan < last ){
	1663	UV min_subtract = 0; /* How mmany chars to subtract from the minimum
	1664	node length to get a real minimum (because
	1665	the folded version may be shorter) */
	1666	bool unfolded_multi_char = FALSE;
	1667	/* avoid mutating ops if we are anywhere within the recursed or
	1668	* enframed handling for a GOSUB: the outermost level will handle it.
	1669	*/
	1670	bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
	1671	/* Peephole optimizer: */
	1672	DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
	1673	DEBUG_PEEP("Peep", scan, depth, flags);
	1674
	1675
	1676	/* The reason we do this here is that we need to deal with things like
	1677	* /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
	1678	* parsing code, as each (?:..) is handled by a different invocation of
	1679	* reg() -- Yves
	1680	*/
	1681	if (REGNODE_TYPE(OP(scan)) == EXACT
	1682	&& OP(scan) != LEXACT
	1683	&& OP(scan) != LEXACT_REQ8
	1684	&& mutate_ok
	1685	) {
	1686	join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
	1687	0, NULL, depth + 1);
	1688	}
	1689
	1690	/* Follow the next-chain of the current node and optimize
	1691	away all the NOTHINGs from it.
	1692	*/
	1693	rck_elide_nothing(scan);
	1694
	1695	/* The principal pseudo-switch. Cannot be a switch, since we look into
	1696	* several different things. */
	1697	if ( OP(scan) == DEFINEP ) {
	1698	SSize_t minlen = 0;
	1699	SSize_t deltanext = 0;
	1700	SSize_t fake_last_close = 0;
	1701	regnode *fake_last_close_op = NULL;
	1702	U32 f = SCF_IN_DEFINE \| (flags & SCF_TRIE_DOING_RESTUDY);
	1703
	1704	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	1705	scan = regnext(scan);
	1706	assert( OP(scan) == IFTHEN );
	1707	DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
	1708
	1709	data_fake.last_closep= &fake_last_close;
	1710	data_fake.last_close_opp= &fake_last_close_op;
	1711	minlen = *minlenp;
	1712	next = regnext(scan);
	1713	scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
	1714	DEBUG_PEEP("scan", scan, depth, flags);
	1715	DEBUG_PEEP("next", next, depth, flags);
	1716
	1717	/* we suppose the run is continuous, last=next...
	1718	* NOTE we dont use the return here! */
	1719	/* DEFINEP study_chunk() recursion */
	1720	(void)study_chunk(pRExC_state, &scan, &minlen,
	1721	&deltanext, next, &data_fake, stopparen,
	1722	recursed_depth, NULL, f, depth+1, mutate_ok);
	1723
	1724	scan = next;
	1725	} else
	1726	if (
	1727	OP(scan) == BRANCH \|\|
	1728	OP(scan) == BRANCHJ \|\|
	1729	OP(scan) == IFTHEN
	1730	) {
	1731	next = regnext(scan);
	1732	code = OP(scan);
	1733
	1734	/* The op(next)==code check below is to see if we
	1735	* have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
	1736	* IFTHEN is special as it might not appear in pairs.
	1737	* Not sure whether BRANCH-BRANCHJ is possible, regardless
	1738	* we dont handle it cleanly. */
	1739	if (OP(next) == code \|\| code == IFTHEN) {
	1740	/* NOTE - There is similar code to this block below for
	1741	* handling TRIE nodes on a re-study. If you change stuff here
	1742	* check there too. */
	1743	SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
	1744	regnode_ssc accum;
	1745	regnode * const startbranch=scan;
	1746
	1747	if (flags & SCF_DO_SUBSTR) {
	1748	/* Cannot merge strings after this. */
	1749	scan_commit(pRExC_state, data, minlenp, is_inf);
	1750	}
	1751
	1752	if (flags & SCF_DO_STCLASS)
	1753	ssc_init_zero(pRExC_state, &accum);
	1754
	1755	while (OP(scan) == code) {
	1756	SSize_t deltanext, minnext, fake_last_close = 0;
	1757	regnode *fake_last_close_op = NULL;
	1758	U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
	1759	regnode_ssc this_class;
	1760
	1761	DEBUG_PEEP("Branch", scan, depth, flags);
	1762
	1763	num++;
	1764	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	1765	if (data) {
	1766	data_fake.whilem_c = data->whilem_c;
	1767	data_fake.last_closep = data->last_closep;
	1768	data_fake.last_close_opp = data->last_close_opp;
	1769	}
	1770	else {
	1771	data_fake.last_closep = &fake_last_close;
	1772	data_fake.last_close_opp = &fake_last_close_op;
	1773	}
	1774
	1775	data_fake.pos_delta = delta;
	1776	next = regnext(scan);
	1777
	1778	scan = REGNODE_AFTER_opcode(scan, code);
	1779
	1780	if (flags & SCF_DO_STCLASS) {
	1781	ssc_init(pRExC_state, &this_class);
	1782	data_fake.start_class = &this_class;
	1783	f \|= SCF_DO_STCLASS_AND;
	1784	}
	1785	if (flags & SCF_WHILEM_VISITED_POS)
	1786	f \|= SCF_WHILEM_VISITED_POS;
	1787
	1788	/* we suppose the run is continuous, last=next...*/
	1789	/* recurse study_chunk() for each BRANCH in an alternation */
	1790	minnext = study_chunk(pRExC_state, &scan, minlenp,
	1791	&deltanext, next, &data_fake, stopparen,
	1792	recursed_depth, NULL, f, depth+1,
	1793	mutate_ok);
	1794
	1795	if (min1 > minnext)
	1796	min1 = minnext;
	1797	if (deltanext == OPTIMIZE_INFTY) {
	1798	is_inf = is_inf_internal = 1;
	1799	max1 = OPTIMIZE_INFTY;
	1800	} else if (max1 < minnext + deltanext)
	1801	max1 = minnext + deltanext;
	1802	scan = next;
	1803	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	1804	pars++;
	1805	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	1806	if ( stopmin > minnext)
	1807	stopmin = min + min1;
	1808	flags &= ~SCF_DO_SUBSTR;
	1809	if (data)
	1810	data->flags \|= SCF_SEEN_ACCEPT;
	1811	}
	1812	if (data) {
	1813	if (data_fake.flags & SF_HAS_EVAL)
	1814	data->flags \|= SF_HAS_EVAL;
	1815	data->whilem_c = data_fake.whilem_c;
	1816	}
	1817	if (flags & SCF_DO_STCLASS)
	1818	ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
	1819	DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
	1820	}
	1821	if (code == IFTHEN && num < 2) /* Empty ELSE branch */
	1822	min1 = 0;
	1823	if (flags & SCF_DO_SUBSTR) {
	1824	data->pos_min += min1;
	1825	if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
	1826	data->pos_delta = OPTIMIZE_INFTY;
	1827	else
	1828	data->pos_delta += max1 - min1;
	1829	if (max1 != min1 \|\| is_inf)
	1830	data->cur_is_floating = 1;
	1831	}
	1832	min += min1;
	1833	if (delta == OPTIMIZE_INFTY
	1834	\|\| OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
	1835	delta = OPTIMIZE_INFTY;
	1836	else
	1837	delta += max1 - min1;
	1838	if (flags & SCF_DO_STCLASS_OR) {
	1839	ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
	1840	if (min1) {
	1841	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	1842	flags &= ~SCF_DO_STCLASS;
	1843	}
	1844	}
	1845	else if (flags & SCF_DO_STCLASS_AND) {
	1846	if (min1) {
	1847	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	1848	flags &= ~SCF_DO_STCLASS;
	1849	}
	1850	else {
	1851	/* Switch to OR mode: cache the old value of
	1852	* data->start_class */
	1853	INIT_AND_WITHP;
	1854	StructCopy(data->start_class, and_withp, regnode_ssc);
	1855	flags &= ~SCF_DO_STCLASS_AND;
	1856	StructCopy(&accum, data->start_class, regnode_ssc);
	1857	flags \|= SCF_DO_STCLASS_OR;
	1858	}
	1859	}
	1860	DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
	1861
	1862	if (PERL_ENABLE_TRIE_OPTIMISATION
	1863	&& OP(startbranch) == BRANCH
	1864	&& mutate_ok
	1865	) {
	1866	/* demq.
	1867
	1868	Assuming this was/is a branch we are dealing with: 'scan'
	1869	now points at the item that follows the branch sequence,
	1870	whatever it is. We now start at the beginning of the
	1871	sequence and look for subsequences of
	1872
	1873	BRANCH->EXACT=>x1
	1874	BRANCH->EXACT=>x2
	1875	tail
	1876
	1877	which would be constructed from a pattern like
	1878	/A\|LIST\|OF\|WORDS/
	1879
	1880	If we can find such a subsequence we need to turn the first
	1881	element into a trie and then add the subsequent branch exact
	1882	strings to the trie.
	1883
	1884	We have two cases
	1885
	1886	1. patterns where the whole set of branches can be
	1887	converted.
	1888
	1889	2. patterns where only a subset can be converted.
	1890
	1891	In case 1 we can replace the whole set with a single regop
	1892	for the trie. In case 2 we need to keep the start and end
	1893	branches so
	1894
	1895	'BRANCH EXACT; BRANCH EXACT; BRANCH X'
	1896	becomes BRANCH TRIE; BRANCH X;
	1897
	1898	There is an additional case, that being where there is a
	1899	common prefix, which gets split out into an EXACT like node
	1900	preceding the TRIE node.
	1901
	1902	If X(1..n)==tail then we can do a simple trie, if not we make
	1903	a "jump" trie, such that when we match the appropriate word
	1904	we "jump" to the appropriate tail node. Essentially we turn
	1905	a nested if into a case structure of sorts.
	1906
	1907	*/
	1908
	1909	int made=0;
	1910	if (!re_trie_maxbuff) {
	1911	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
	1912	if (!SvIOK(re_trie_maxbuff))
	1913	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	1914	}
	1915	if ( SvIV(re_trie_maxbuff)>=0 ) {
	1916	regnode *cur;
	1917	regnode first = (regnode )NULL;
	1918	regnode prev = (regnode )NULL;
	1919	regnode *tail = scan;
	1920	U8 trietype = 0;
	1921	U32 count=0;
	1922
	1923	/* var tail is used because there may be a TAIL
	1924	regop in the way. Ie, the exacts will point to the
	1925	thing following the TAIL, but the last branch will
	1926	point at the TAIL. So we advance tail. If we
	1927	have nested (?:) we may have to move through several
	1928	tails.
	1929	*/
	1930
	1931	while ( OP( tail ) == TAIL ) {
	1932	/* this is the TAIL generated by (?:) */
	1933	tail = regnext( tail );
	1934	}
	1935
	1936
	1937	DEBUG_TRIE_COMPILE_r({
	1938	regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
	1939	Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
	1940	depth+1,
	1941	"Looking for TRIE'able sequences. Tail node is ",
	1942	(UV) REGNODE_OFFSET(tail),
	1943	SvPV_nolen_const( RExC_mysv )
	1944	);
	1945	});
	1946
	1947	/*
	1948
	1949	Step through the branches
	1950	cur represents each branch,
	1951	noper is the first thing to be matched as part
	1952	of that branch
	1953	noper_next is the regnext() of that node.
	1954
	1955	We normally handle a case like this
	1956	/FOO[xyz]\|BAR[pqr]/ via a "jump trie" but we also
	1957	support building with NOJUMPTRIE, which restricts
	1958	the trie logic to structures like /FOO\|BAR/.
	1959
	1960	If noper is a trieable nodetype then the branch is
	1961	a possible optimization target. If we are building
	1962	under NOJUMPTRIE then we require that noper_next is
	1963	the same as scan (our current position in the regex
	1964	program).
	1965
	1966	Once we have two or more consecutive such branches
	1967	we can create a trie of the EXACT's contents and
	1968	stitch it in place into the program.
	1969
	1970	If the sequence represents all of the branches in
	1971	the alternation we replace the entire thing with a
	1972	single TRIE node.
	1973
	1974	Otherwise when it is a subsequence we need to
	1975	stitch it in place and replace only the relevant
	1976	branches. This means the first branch has to remain
	1977	as it is used by the alternation logic, and its
	1978	next pointer, and needs to be repointed at the item
	1979	on the branch chain following the last branch we
	1980	have optimized away.
	1981
	1982	This could be either a BRANCH, in which case the
	1983	subsequence is internal, or it could be the item
	1984	following the branch sequence in which case the
	1985	subsequence is at the end (which does not
	1986	necessarily mean the first node is the start of the
	1987	alternation).
	1988
	1989	TRIE_TYPE(X) is a define which maps the optype to a
	1990	trietype.
	1991
	1992	optype \| trietype
	1993	----------------+-----------
	1994	NOTHING \| NOTHING
	1995	EXACT \| EXACT
	1996	EXACT_REQ8 \| EXACT
	1997	EXACTFU \| EXACTFU
	1998	EXACTFU_REQ8 \| EXACTFU
	1999	EXACTFUP \| EXACTFU
	2000	EXACTFAA \| EXACTFAA
	2001	EXACTL \| EXACTL
	2002	EXACTFLU8 \| EXACTFLU8
	2003
	2004
	2005	*/
	2006	#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
	2007	? NOTHING \
	2008	: ( EXACT == (X) \|\| EXACT_REQ8 == (X) ) \
	2009	? EXACT \
	2010	: ( EXACTFU == (X) \
	2011	\|\| EXACTFU_REQ8 == (X) \
	2012	\|\| EXACTFUP == (X) ) \
	2013	? EXACTFU \
	2014	: ( EXACTFAA == (X) ) \
	2015	? EXACTFAA \
	2016	: ( EXACTL == (X) ) \
	2017	? EXACTL \
	2018	: ( EXACTFLU8 == (X) ) \
	2019	? EXACTFLU8 \
	2020	: 0 )
	2021
	2022	/* dont use tail as the end marker for this traverse */
	2023	for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
	2024	regnode * const noper = REGNODE_AFTER( cur );
	2025	U8 noper_type = OP( noper );
	2026	U8 noper_trietype = TRIE_TYPE( noper_type );
	2027	#if defined(DEBUGGING) \|\| defined(NOJUMPTRIE)
	2028	regnode * const noper_next = regnext( noper );
	2029	U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
	2030	U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
	2031	#endif
	2032
	2033	DEBUG_TRIE_COMPILE_r({
	2034	regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
	2035	Perl_re_indentf( aTHX_ "- %d:%s (%d)",
	2036	depth+1,
	2037	REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
	2038
	2039	regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
	2040	Perl_re_printf( aTHX_ " -> %d:%s",
	2041	REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
	2042
	2043	if ( noper_next ) {
	2044	regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
	2045	Perl_re_printf( aTHX_ "\t=> %d:%s\t",
	2046	REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
	2047	}
	2048	Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
	2049	REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
	2050	REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
	2051	);
	2052	});
	2053
	2054	/* Is noper a trieable nodetype that can be merged
	2055	* with the current trie (if there is one)? */
	2056	if ( noper_trietype
	2057	&&
	2058	(
	2059	( noper_trietype == NOTHING )
	2060	\|\| ( trietype == NOTHING )
	2061	\|\| ( trietype == noper_trietype )
	2062	)
	2063	#ifdef NOJUMPTRIE
	2064	&& noper_next >= tail
	2065	#endif
	2066	&& count < U16_MAX)
	2067	{
	2068	/* Handle mergable triable node Either we are
	2069	* the first node in a new trieable sequence,
	2070	* in which case we do some bookkeeping,
	2071	* otherwise we update the end pointer. */
	2072	if ( !first ) {
	2073	first = cur;
	2074	if ( noper_trietype == NOTHING ) {
	2075	#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
	2076	regnode * const noper_next = regnext( noper );
	2077	U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
	2078	U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
	2079	#endif
	2080
	2081	if ( noper_next_trietype ) {
	2082	trietype = noper_next_trietype;
	2083	} else if (noper_next_type) {
	2084	/* a NOTHING regop is 1 regop wide.
	2085	* We need at least two for a trie
	2086	* so we can't merge this in */
	2087	first = NULL;
	2088	}
	2089	} else {
	2090	trietype = noper_trietype;
	2091	}
	2092	} else {
	2093	if ( trietype == NOTHING )
	2094	trietype = noper_trietype;
	2095	prev = cur;
	2096	}
	2097	if (first)
	2098	count++;
	2099	} /* end handle mergable triable node */
	2100	else {
	2101	/* handle unmergable node -
	2102	* noper may either be a triable node which can
	2103	* not be tried together with the current trie,
	2104	* or a non triable node */
	2105	if ( prev ) {
	2106	/* If last is set and trietype is not
	2107	* NOTHING then we have found at least two
	2108	* triable branch sequences in a row of a
	2109	* similar trietype so we can turn them
	2110	* into a trie. If/when we allow NOTHING to
	2111	* start a trie sequence this condition
	2112	* will be required, and it isn't expensive
	2113	* so we leave it in for now. */
	2114	if ( trietype && trietype != NOTHING )
	2115	make_trie( pRExC_state,
	2116	startbranch, first, cur, tail,
	2117	count, trietype, depth+1 );
	2118	prev = NULL; /* note: we clear/update
	2119	first, trietype etc below,
	2120	so we dont do it here */
	2121	}
	2122	if ( noper_trietype
	2123	#ifdef NOJUMPTRIE
	2124	&& noper_next >= tail
	2125	#endif
	2126	){
	2127	/* noper is triable, so we can start a new
	2128	* trie sequence */
	2129	count = 1;
	2130	first = cur;
	2131	trietype = noper_trietype;
	2132	} else if (first) {
	2133	/* if we already saw a first but the
	2134	* current node is not triable then we have
	2135	* to reset the first information. */
	2136	count = 0;
	2137	first = NULL;
	2138	trietype = 0;
	2139	}
	2140	} /* end handle unmergable node */
	2141	} /* loop over branches */
	2142	DEBUG_TRIE_COMPILE_r({
	2143	regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
	2144	Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
	2145	depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
	2146	Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
	2147	REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
	2148	REGNODE_NAME(trietype)
	2149	);
	2150
	2151	});
	2152	if ( prev && trietype ) {
	2153	if ( trietype != NOTHING ) {
	2154	/* the last branch of the sequence was part of
	2155	* a trie, so we have to construct it here
	2156	* outside of the loop */
	2157	made= make_trie( pRExC_state, startbranch,
	2158	first, scan, tail, count,
	2159	trietype, depth+1 );
	2160	#ifdef TRIE_STUDY_OPT
	2161	if ( ((made == MADE_EXACT_TRIE &&
	2162	startbranch == first)
	2163	\|\| ( first_non_open == first )) &&
	2164	depth==0 ) {
	2165	flags \|= SCF_TRIE_RESTUDY;
	2166	if ( startbranch == first
	2167	&& scan >= tail )
	2168	{
	2169	RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
	2170	}
	2171	}
	2172	#endif
	2173	} else {
	2174	/* at this point we know whatever we have is a
	2175	* NOTHING sequence/branch AND if 'startbranch'
	2176	* is 'first' then we can turn the whole thing
	2177	* into a NOTHING
	2178	*/
	2179	if ( startbranch == first ) {
	2180	regnode *opt;
	2181	/* the entire thing is a NOTHING sequence,
	2182	* something like this: (?:\|) So we can
	2183	* turn it into a plain NOTHING op. */
	2184	DEBUG_TRIE_COMPILE_r({
	2185	regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
	2186	Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
	2187	depth+1,
	2188	SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
	2189
	2190	});
	2191	OP(startbranch)= NOTHING;
	2192	NEXT_OFF(startbranch)= tail - startbranch;
	2193	for ( opt= startbranch + 1; opt < tail ; opt++ )
	2194	OP(opt)= OPTIMIZED;
	2195	}
	2196	}
	2197	} /* end if ( prev) */
	2198	} /* TRIE_MAXBUF is non zero */
	2199	} /* do trie */
	2200	DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
	2201	}
	2202	else
	2203	scan = REGNODE_AFTER_opcode(scan,code);
	2204	continue;
	2205	} else if (OP(scan) == SUSPEND \|\| OP(scan) == GOSUB) {
	2206	I32 paren = 0;
	2207	regnode *start = NULL;
	2208	regnode *end = NULL;
	2209	U32 my_recursed_depth= recursed_depth;
	2210
	2211	if (OP(scan) != SUSPEND) { /* GOSUB */
	2212	/* Do setup, note this code has side effects beyond
	2213	* the rest of this block. Specifically setting
	2214	* RExC_recurse[] must happen at least once during
	2215	* study_chunk(). */
	2216	paren = ARG(scan);
	2217	RExC_recurse[ARG2L(scan)] = scan;
	2218	start = REGNODE_p(RExC_open_parens[paren]);
	2219	end = REGNODE_p(RExC_close_parens[paren]);
	2220
	2221	/* NOTE we MUST always execute the above code, even
	2222	* if we do nothing with a GOSUB */
	2223	if (
	2224	( flags & SCF_IN_DEFINE )
	2225	\|\|
	2226	(
	2227	(is_inf_internal \|\| is_inf \|\| (data && data->flags & SF_IS_INF))
	2228	&&
	2229	( (flags & (SCF_DO_STCLASS \| SCF_DO_SUBSTR)) == 0 )
	2230	)
	2231	) {
	2232	/* no need to do anything here if we are in a define. */
	2233	/* or we are after some kind of infinite construct
	2234	* so we can skip recursing into this item.
	2235	* Since it is infinite we will not change the maxlen
	2236	* or delta, and if we miss something that might raise
	2237	* the minlen it will merely pessimise a little.
	2238	*
	2239	* Iow /(?(DEFINE)(?<foo>foo\|food))a+(?&foo)/
	2240	* might result in a minlen of 1 and not of 4,
	2241	* but this doesn't make us mismatch, just try a bit
	2242	* harder than we should.
	2243	*
	2244	* However we must assume this GOSUB is infinite, to
	2245	* avoid wrongly applying other optimizations in the
	2246	* enclosing scope - see GH 18096, for example.
	2247	*/
	2248	is_inf = is_inf_internal = 1;
	2249	scan= regnext(scan);
	2250	continue;
	2251	}
	2252
	2253	if (
	2254	!recursed_depth
	2255	\|\| !PAREN_TEST(recursed_depth - 1, paren)
	2256	) {
	2257	/* it is quite possible that there are more efficient ways
	2258	* to do this. We maintain a bitmap per level of recursion
	2259	* of which patterns we have entered so we can detect if a
	2260	* pattern creates a possible infinite loop. When we
	2261	* recurse down a level we copy the previous levels bitmap
	2262	* down. When we are at recursion level 0 we zero the top
	2263	* level bitmap. It would be nice to implement a different
	2264	* more efficient way of doing this. In particular the top
	2265	* level bitmap may be unnecessary.
	2266	*/
	2267	if (!recursed_depth) {
	2268	Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
	2269	} else {
	2270	Copy(PAREN_OFFSET(recursed_depth - 1),
	2271	PAREN_OFFSET(recursed_depth),
	2272	RExC_study_chunk_recursed_bytes, U8);
	2273	}
	2274	/* we havent recursed into this paren yet, so recurse into it */
	2275	DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
	2276	PAREN_SET(recursed_depth, paren);
	2277	my_recursed_depth= recursed_depth + 1;
	2278	} else {
	2279	DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
	2280	/* some form of infinite recursion, assume infinite length
	2281	* */
	2282	if (flags & SCF_DO_SUBSTR) {
	2283	scan_commit(pRExC_state, data, minlenp, is_inf);
	2284	data->cur_is_floating = 1;
	2285	}
	2286	is_inf = is_inf_internal = 1;
	2287	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	2288	ssc_anything(data->start_class);
	2289	flags &= ~SCF_DO_STCLASS;
	2290
	2291	start= NULL; /* reset start so we dont recurse later on. */
	2292	}
	2293	} else {
	2294	paren = stopparen;
	2295	start = scan + 2;
	2296	end = regnext(scan);
	2297	}
	2298	if (start) {
	2299	scan_frame *newframe;
	2300	assert(end);
	2301	if (!RExC_frame_last) {
	2302	Newxz(newframe, 1, scan_frame);
	2303	SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
	2304	RExC_frame_head= newframe;
	2305	RExC_frame_count++;
	2306	} else if (!RExC_frame_last->next_frame) {
	2307	Newxz(newframe, 1, scan_frame);
	2308	RExC_frame_last->next_frame= newframe;
	2309	newframe->prev_frame= RExC_frame_last;
	2310	RExC_frame_count++;
	2311	} else {
	2312	newframe= RExC_frame_last->next_frame;
	2313	}
	2314	RExC_frame_last= newframe;
	2315
	2316	newframe->next_regnode = regnext(scan);
	2317	newframe->last_regnode = last;
	2318	newframe->stopparen = stopparen;
	2319	newframe->prev_recursed_depth = recursed_depth;
	2320	newframe->this_prev_frame= frame;
	2321	newframe->in_gosub = (
	2322	(frame && frame->in_gosub) \|\| OP(scan) == GOSUB
	2323	);
	2324
	2325	DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
	2326	DEBUG_PEEP("fnew", scan, depth, flags);
	2327
	2328	frame = newframe;
	2329	scan = start;
	2330	stopparen = paren;
	2331	last = end;
	2332	depth = depth + 1;
	2333	recursed_depth= my_recursed_depth;
	2334
	2335	continue;
	2336	}
	2337	}
	2338	else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
	2339	SSize_t bytelen = STR_LEN(scan), charlen;
	2340	UV uc;
	2341	assert(bytelen);
	2342	if (UTF) {
	2343	const U8 * const s = (U8*)STRING(scan);
	2344	uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
	2345	charlen = utf8_length(s, s + bytelen);
	2346	} else {
	2347	uc = ((U8)STRING(scan));
	2348	charlen = bytelen;
	2349	}
	2350	min += charlen;
	2351	if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
	2352	/* The code below prefers earlier match for fixed
	2353	offset, later match for variable offset. */
	2354	if (data->last_end == -1) { /* Update the start info. */
	2355	data->last_start_min = data->pos_min;
	2356	data->last_start_max =
	2357	is_inf ? OPTIMIZE_INFTY
	2358	: (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
	2359	? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
	2360	}
	2361	sv_catpvn(data->last_found, STRING(scan), bytelen);
	2362	if (UTF)
	2363	SvUTF8_on(data->last_found);
	2364	{
	2365	SV * const sv = data->last_found;
	2366	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	2367	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	2368	if (mg && mg->mg_len >= 0)
	2369	mg->mg_len += charlen;
	2370	}
	2371	data->last_end = data->pos_min + charlen;
	2372	data->pos_min += charlen; /* As in the first entry. */
	2373	data->flags &= ~SF_BEFORE_EOL;
	2374	}
	2375
	2376	/* ANDing the code point leaves at most it, and not in locale, and
	2377	* can't match null string */
	2378	if (flags & SCF_DO_STCLASS_AND) {
	2379	ssc_cp_and(data->start_class, uc);
	2380	ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
	2381	ssc_clear_locale(data->start_class);
	2382	}
	2383	else if (flags & SCF_DO_STCLASS_OR) {
	2384	ssc_add_cp(data->start_class, uc);
	2385	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	2386
	2387	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	2388	ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
	2389	}
	2390	flags &= ~SCF_DO_STCLASS;
	2391	DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
	2392	}
	2393	else if (REGNODE_TYPE(OP(scan)) == EXACT) {
	2394	/* But OP != EXACT!, so is EXACTFish */
	2395	SSize_t bytelen = STR_LEN(scan), charlen;
	2396	const U8 * s = (U8*)STRING(scan);
	2397
	2398	/* Replace a length 1 ASCII fold pair node with an ANYOFM node,
	2399	* with the mask set to the complement of the bit that differs
	2400	* between upper and lower case, and the lowest code point of the
	2401	* pair (which the '&' forces) */
	2402	if ( bytelen == 1
	2403	&& isALPHA_A(*s)
	2404	&& ( OP(scan) == EXACTFAA
	2405	\|\| ( OP(scan) == EXACTFU
	2406	&& ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
	2407	&& mutate_ok
	2408	) {
	2409	U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
	2410
	2411	OP(scan) = ANYOFM;
	2412	ARG_SET(scan, *s & mask);
	2413	FLAGS(scan) = mask;
	2414	/* We're not EXACTFish any more, so restudy.
	2415	* Search for "restudy" in this file to find
	2416	* a comment with details. */
	2417	continue;
	2418	}
	2419
	2420	/* Search for fixed substrings supports EXACT only. */
	2421	if (flags & SCF_DO_SUBSTR) {
	2422	assert(data);
	2423	scan_commit(pRExC_state, data, minlenp, is_inf);
	2424	}
	2425	charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
	2426	if (unfolded_multi_char) {
	2427	RExC_seen \|= REG_UNFOLDED_MULTI_SEEN;
	2428	}
	2429	min += charlen - min_subtract;
	2430	assert (min >= 0);
	2431	if ((SSize_t)min_subtract < OPTIMIZE_INFTY
	2432	&& delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
	2433	) {
	2434	delta += min_subtract;
	2435	} else {
	2436	delta = OPTIMIZE_INFTY;
	2437	}
	2438	if (flags & SCF_DO_SUBSTR) {
	2439	data->pos_min += charlen - min_subtract;
	2440	if (data->pos_min < 0) {
	2441	data->pos_min = 0;
	2442	}
	2443	if ((SSize_t)min_subtract < OPTIMIZE_INFTY
	2444	&& data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
	2445	) {
	2446	data->pos_delta += min_subtract;
	2447	} else {
	2448	data->pos_delta = OPTIMIZE_INFTY;
	2449	}
	2450	if (min_subtract) {
	2451	data->cur_is_floating = 1; /* float */
	2452	}
	2453	}
	2454
	2455	if (flags & SCF_DO_STCLASS) {
	2456	SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
	2457
	2458	assert(EXACTF_invlist);
	2459	if (flags & SCF_DO_STCLASS_AND) {
	2460	if (OP(scan) != EXACTFL)
	2461	ssc_clear_locale(data->start_class);
	2462	ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
	2463	ANYOF_POSIXL_ZERO(data->start_class);
	2464	ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
	2465	}
	2466	else { /* SCF_DO_STCLASS_OR */
	2467	ssc_union(data->start_class, EXACTF_invlist, FALSE);
	2468	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	2469
	2470	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	2471	ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
	2472	}
	2473	flags &= ~SCF_DO_STCLASS;
	2474	SvREFCNT_dec(EXACTF_invlist);
	2475	}
	2476	DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
	2477	}
	2478	else if (REGNODE_VARIES(OP(scan))) {
	2479	SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
	2480	I32 fl = 0;
	2481	U32 f = flags;
	2482	regnode * const oscan = scan;
	2483	regnode_ssc this_class;
	2484	regnode_ssc *oclass = NULL;
	2485	I32 next_is_eval = 0;
	2486
	2487	switch (REGNODE_TYPE(OP(scan))) {
	2488	case WHILEM: /* End of (?:...)* . */
	2489	scan = REGNODE_AFTER(scan);
	2490	goto finish;
	2491	case PLUS:
	2492	if (flags & (SCF_DO_SUBSTR \| SCF_DO_STCLASS)) {
	2493	next = REGNODE_AFTER(scan);
	2494	if ( ( REGNODE_TYPE(OP(next)) == EXACT
	2495	&& ! isEXACTFish(OP(next)))
	2496	\|\| (flags & SCF_DO_STCLASS))
	2497	{
	2498	mincount = 1;
	2499	maxcount = REG_INFTY;
	2500	next = regnext(scan);
	2501	scan = REGNODE_AFTER(scan);
	2502	goto do_curly;
	2503	}
	2504	}
	2505	if (flags & SCF_DO_SUBSTR)
	2506	data->pos_min++;
	2507	/* This will bypass the formal 'min += minnext * mincount'
	2508	* calculation in the do_curly path, so assumes min width
	2509	* of the PLUS payload is exactly one. */
	2510	min++;
	2511	/* FALLTHROUGH */
	2512	case STAR:
	2513	next = REGNODE_AFTER(scan);
	2514
	2515	/* This temporary node can now be turned into EXACTFU, and
	2516	* must, as regexec.c doesn't handle it */
	2517	if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
	2518	OP(next) = EXACTFU;
	2519	}
	2520
	2521	if ( STR_LEN(next) == 1
	2522	&& isALPHA_A(* STRING(next))
	2523	&& ( OP(next) == EXACTFAA
	2524	\|\| ( OP(next) == EXACTFU
	2525	&& ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
	2526	&& mutate_ok
	2527	) {
	2528	/* These differ in just one bit */
	2529	U8 mask = ~ ('A' ^ 'a');
	2530
	2531	assert(isALPHA_A(* STRING(next)));
	2532
	2533	/* Then replace it by an ANYOFM node, with
	2534	* the mask set to the complement of the
	2535	* bit that differs between upper and lower
	2536	* case, and the lowest code point of the
	2537	* pair (which the '&' forces) */
	2538	OP(next) = ANYOFM;
	2539	ARG_SET(next, *STRING(next) & mask);
	2540	FLAGS(next) = mask;
	2541	}
	2542
	2543	if (flags & SCF_DO_STCLASS) {
	2544	mincount = 0;
	2545	maxcount = REG_INFTY;
	2546	next = regnext(scan);
	2547	scan = REGNODE_AFTER(scan);
	2548	goto do_curly;
	2549	}
	2550	if (flags & SCF_DO_SUBSTR) {
	2551	scan_commit(pRExC_state, data, minlenp, is_inf);
	2552	/* Cannot extend fixed substrings */
	2553	data->cur_is_floating = 1; /* float */
	2554	}
	2555	is_inf = is_inf_internal = 1;
	2556	scan = regnext(scan);
	2557	goto optimize_curly_tail;
	2558	case CURLY:
	2559	if (stopparen>0 && (OP(scan)==CURLYN \|\| OP(scan)==CURLYM)
	2560	&& (scan->flags == stopparen))
	2561	{
	2562	mincount = 1;
	2563	maxcount = 1;
	2564	} else {
	2565	mincount = ARG1(scan);
	2566	maxcount = ARG2(scan);
	2567	}
	2568	next = regnext(scan);
	2569	if (OP(scan) == CURLYX) {
	2570	I32 lp = (data ? *(data->last_closep) : 0);
	2571	scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
	2572	}
	2573	scan = REGNODE_AFTER(scan);
	2574	next_is_eval = (OP(scan) == EVAL);
	2575	do_curly:
	2576	if (flags & SCF_DO_SUBSTR) {
	2577	if (mincount == 0)
	2578	scan_commit(pRExC_state, data, minlenp, is_inf);
	2579	/* Cannot extend fixed substrings */
	2580	pos_before = data->pos_min;
	2581	}
	2582	if (data) {
	2583	fl = data->flags;
	2584	data->flags &= ~(SF_HAS_PAR\|SF_IN_PAR\|SF_HAS_EVAL);
	2585	if (is_inf)
	2586	data->flags \|= SF_IS_INF;
	2587	}
	2588	if (flags & SCF_DO_STCLASS) {
	2589	ssc_init(pRExC_state, &this_class);
	2590	oclass = data->start_class;
	2591	data->start_class = &this_class;
	2592	f \|= SCF_DO_STCLASS_AND;
	2593	f &= ~SCF_DO_STCLASS_OR;
	2594	}
	2595	/* Exclude from super-linear cache processing any {n,m}
	2596	regops for which the combination of input pos and regex
	2597	pos is not enough information to determine if a match
	2598	will be possible.
	2599
	2600	For example, in the regex /foo(bar\s*){4,8}baz/ with the
	2601	regex pos at the \s*, the prospects for a match depend not
	2602	only on the input position but also on how many (bar\s*)
	2603	repeats into the {4,8} we are. */
	2604	if ((mincount > 1) \|\| (maxcount > 1 && maxcount != REG_INFTY))
	2605	f &= ~SCF_WHILEM_VISITED_POS;
	2606
	2607	/* This will finish on WHILEM, setting scan, or on NULL: */
	2608	/* recurse study_chunk() on loop bodies */
	2609	minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
	2610	last, data, stopparen, recursed_depth, NULL,
	2611	(mincount == 0
	2612	? (f & ~SCF_DO_SUBSTR)
	2613	: f)
	2614	, depth+1, mutate_ok);
	2615
	2616	if (data && data->flags & SCF_SEEN_ACCEPT) {
	2617	if (mincount > 1)
	2618	mincount = 1;
	2619	}
	2620
	2621	if (flags & SCF_DO_STCLASS)
	2622	data->start_class = oclass;
	2623	if (mincount == 0 \|\| minnext == 0) {
	2624	if (flags & SCF_DO_STCLASS_OR) {
	2625	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	2626	}
	2627	else if (flags & SCF_DO_STCLASS_AND) {
	2628	/* Switch to OR mode: cache the old value of
	2629	* data->start_class */
	2630	INIT_AND_WITHP;
	2631	StructCopy(data->start_class, and_withp, regnode_ssc);
	2632	flags &= ~SCF_DO_STCLASS_AND;
	2633	StructCopy(&this_class, data->start_class, regnode_ssc);
	2634	flags \|= SCF_DO_STCLASS_OR;
	2635	ANYOF_FLAGS(data->start_class)
	2636	\|= SSC_MATCHES_EMPTY_STRING;
	2637	}
	2638	} else { /* Non-zero len */
	2639	if (flags & SCF_DO_STCLASS_OR) {
	2640	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	2641	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	2642	}
	2643	else if (flags & SCF_DO_STCLASS_AND)
	2644	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
	2645	flags &= ~SCF_DO_STCLASS;
	2646	}
	2647	if (!scan) /* It was not CURLYX, but CURLY. */
	2648	scan = next;
	2649	if (((flags & (SCF_TRIE_DOING_RESTUDY\|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
	2650	/* ? quantifier ok, except for (?{ ... }) */
	2651	&& (next_is_eval \|\| !(mincount == 0 && maxcount == 1))
	2652	&& (minnext == 0) && (deltanext == 0)
	2653	&& data && !(data->flags & (SF_HAS_PAR\|SF_IN_PAR))
	2654	&& maxcount <= REG_INFTY/3) /* Complement check for big
	2655	count */
	2656	{
	2657	_WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
	2658	Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
	2659	"Quantifier unexpected on zero-length expression "
	2660	"in regex m/%" UTF8f "/",
	2661	UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
	2662	RExC_precomp)));
	2663	}
	2664
	2665	if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
	2666	\|\| min >= SSize_t_MAX - minnext * mincount )
	2667	{
	2668	FAIL("Regexp out of space");
	2669	}
	2670
	2671	min += minnext * mincount;
	2672	is_inf_internal \|= deltanext == OPTIMIZE_INFTY
	2673	\|\| (maxcount == REG_INFTY && minnext + deltanext > 0);
	2674	is_inf \|= is_inf_internal;
	2675	if (is_inf) {
	2676	delta = OPTIMIZE_INFTY;
	2677	} else {
	2678	delta += (minnext + deltanext) * maxcount
	2679	- minnext * mincount;
	2680	}
	2681
	2682	if (data && data->flags & SCF_SEEN_ACCEPT) {
	2683	if (flags & SCF_DO_SUBSTR) {
	2684	scan_commit(pRExC_state, data, minlenp, is_inf);
	2685	flags &= ~SCF_DO_SUBSTR;
	2686	}
	2687	if (stopmin > min)
	2688	stopmin = min;
	2689	DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
	2690	}
	2691	DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
	2692	/* Try powerful optimization CURLYX => CURLYN. */
	2693	if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
	2694	&& OP(oscan) == CURLYX
	2695	&& data
	2696	&& !pRExC_state->code_blocks /* XXX: for now disable whenever eval
	2697	is seen anywhere. We need a better
	2698	way. */
	2699	&& ( ( data->flags & (SF_IN_PAR\|SF_HAS_EVAL) ) == SF_IN_PAR )
	2700	&& !deltanext
	2701	&& minnext == 1
	2702	&& mutate_ok
	2703	) {
	2704	DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
	2705	/* Try to optimize to CURLYN. */
	2706	regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
	2707	regnode * const nxt1 = nxt;
	2708	#ifdef DEBUGGING
	2709	regnode *nxt2;
	2710	#endif
	2711	/* Skip open. */
	2712	nxt = regnext(nxt);
	2713	if (!REGNODE_SIMPLE(OP(nxt))
	2714	&& !(REGNODE_TYPE(OP(nxt)) == EXACT
	2715	&& STR_LEN(nxt) == 1))
	2716	goto nogo;
	2717	#ifdef DEBUGGING
	2718	nxt2 = nxt;
	2719	#endif
	2720	nxt = regnext(nxt);
	2721	if (OP(nxt) != CLOSE)
	2722	goto nogo;
	2723	if (RExC_open_parens) {
	2724
	2725	/open->CURLYM/
	2726	RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
	2727
	2728	/close->while/
	2729	RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
	2730	}
	2731	/* Now we know that nxt2 is the only contents: */
	2732	oscan->flags = (U8)PARNO(nxt);
	2733	OP(oscan) = CURLYN;
	2734	OP(nxt1) = NOTHING; /* was OPEN. */
	2735
	2736	#ifdef DEBUGGING
	2737	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	2738	NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
	2739	NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
	2740	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	2741	OP(nxt + 1) = OPTIMIZED; /* was count. */
	2742	NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
	2743	#endif
	2744	}
	2745	nogo:
	2746
	2747	DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
	2748
	2749	/* Try optimization CURLYX => CURLYM. */
	2750	if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
	2751	&& OP(oscan) == CURLYX
	2752	&& data
	2753	&& !pRExC_state->code_blocks /* XXX: for now disable whenever eval
	2754	is seen anywhere. We need a better
	2755	way. */
	2756	&& !(data->flags & (SF_HAS_PAR\|SF_HAS_EVAL))
	2757	&& !deltanext /* atom is fixed width */
	2758	&& minnext != 0 /* CURLYM can't handle zero width */
	2759	/* Nor characters whose fold at run-time may be
	2760	* multi-character */
	2761	&& !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
	2762	&& mutate_ok
	2763	) {
	2764	DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
	2765	/* XXXX How to optimize if data == 0? */
	2766	/* Optimize to a simpler form. */
	2767	regnode nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); / OPEN */
	2768	regnode *nxt2;
	2769
	2770	OP(oscan) = CURLYM;
	2771	while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
	2772	&& (OP(nxt2) != WHILEM))
	2773	nxt = nxt2;
	2774	OP(nxt2) = SUCCEED; /* Whas WHILEM */
	2775	/* Need to optimize away parenths. */
	2776	if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
	2777	/* Set the parenth number. */
	2778	/* note that we have changed the type of oscan to CURLYM here */
	2779	regnode nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); / OPEN*/
	2780
	2781	oscan->flags = (U8)PARNO(nxt);
	2782	if (RExC_open_parens) {
	2783	/open->CURLYM/
	2784	RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
	2785
	2786	/close->NOTHING/
	2787	RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
	2788	+ 1;
	2789	}
	2790	OP(nxt1) = OPTIMIZED; /* was OPEN. */
	2791	OP(nxt) = OPTIMIZED; /* was CLOSE. */
	2792
	2793	#ifdef DEBUGGING
	2794	OP(nxt1 + 1) = OPTIMIZED; /* was count. */
	2795	OP(nxt + 1) = OPTIMIZED; /* was count. */
	2796	NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
	2797	NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
	2798	#endif
	2799	#if 0
	2800	while ( nxt1 && (OP(nxt1) != WHILEM)) {
	2801	regnode *nnxt = regnext(nxt1);
	2802	if (nnxt == nxt) {
	2803	if (REGNODE_OFF_BY_ARG(OP(nxt1)))
	2804	ARG_SET(nxt1, nxt2 - nxt1);
	2805	else if (nxt2 - nxt1 < U16_MAX)
	2806	NEXT_OFF(nxt1) = nxt2 - nxt1;
	2807	else
	2808	OP(nxt) = NOTHING; /* Cannot beautify */
	2809	}
	2810	nxt1 = nnxt;
	2811	}
	2812	#endif
	2813	/* Optimize again: */
	2814	/* recurse study_chunk() on optimised CURLYX => CURLYM */
	2815	study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
	2816	NULL, stopparen, recursed_depth, NULL, 0,
	2817	depth+1, mutate_ok);
	2818	}
	2819	else
	2820	oscan->flags = 0;
	2821	}
	2822	else if ((OP(oscan) == CURLYX)
	2823	&& (flags & SCF_WHILEM_VISITED_POS)
	2824	/* See the comment on a similar expression above.
	2825	However, this time it's not a subexpression
	2826	we care about, but the expression itself. */
	2827	&& (maxcount == REG_INFTY)
	2828	&& data) {
	2829	/* This stays as CURLYX, we can put the count/of pair. */
	2830	/* Find WHILEM (as in regexec.c) */
	2831	regnode *nxt = oscan + NEXT_OFF(oscan);
	2832
	2833	if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
	2834	nxt += ARG(nxt);
	2835	nxt = REGNODE_BEFORE(nxt);
	2836	if (nxt->flags & 0xf) {
	2837	/* we've already set whilem count on this node */
	2838	} else if (++data->whilem_c < 16) {
	2839	assert(data->whilem_c <= RExC_whilem_seen);
	2840	nxt->flags = (U8)(data->whilem_c
	2841	\| (RExC_whilem_seen << 4)); /* On WHILEM */
	2842	}
	2843	}
	2844	if (data && fl & (SF_HAS_PAR\|SF_IN_PAR))
	2845	pars++;
	2846	if (flags & SCF_DO_SUBSTR) {
	2847	SV *last_str = NULL;
	2848	STRLEN last_chrs = 0;
	2849	int counted = mincount != 0;
	2850
	2851	if (data->last_end > 0 && mincount != 0) { /* Ends with a
	2852	string. */
	2853	SSize_t b = pos_before >= data->last_start_min
	2854	? pos_before : data->last_start_min;
	2855	STRLEN l;
	2856	const char * const s = SvPV_const(data->last_found, l);
	2857	SSize_t old = b - data->last_start_min;
	2858	assert(old >= 0);
	2859
	2860	if (UTF)
	2861	old = utf8_hop_forward((U8*)s, old,
	2862	(U8 *) SvEND(data->last_found))
	2863	- (U8*)s;
	2864	l -= old;
	2865	/* Get the added string: */
	2866	last_str = newSVpvn_utf8(s + old, l, UTF);
	2867	last_chrs = UTF ? utf8_length((U8*)(s + old),
	2868	(U8*)(s + old + l)) : l;
	2869	if (deltanext == 0 && pos_before == b) {
	2870	/* What was added is a constant string */
	2871	if (mincount > 1) {
	2872
	2873	SvGROW(last_str, (mincount * l) + 1);
	2874	repeatcpy(SvPVX(last_str) + l,
	2875	SvPVX_const(last_str), l,
	2876	mincount - 1);
	2877	SvCUR_set(last_str, SvCUR(last_str) * mincount);
	2878	/* Add additional parts. */
	2879	SvCUR_set(data->last_found,
	2880	SvCUR(data->last_found) - l);
	2881	sv_catsv(data->last_found, last_str);
	2882	{
	2883	SV * sv = data->last_found;
	2884	MAGIC *mg =
	2885	SvUTF8(sv) && SvMAGICAL(sv) ?
	2886	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	2887	if (mg && mg->mg_len >= 0)
	2888	mg->mg_len += last_chrs * (mincount-1);
	2889	}
	2890	last_chrs *= mincount;
	2891	data->last_end += l * (mincount - 1);
	2892	}
	2893	} else {
	2894	/* start offset must point into the last copy */
	2895	data->last_start_min += minnext * (mincount - 1);
	2896	data->last_start_max =
	2897	is_inf
	2898	? OPTIMIZE_INFTY
	2899	: data->last_start_max +
	2900	(maxcount - 1) * (minnext + data->pos_delta);
	2901	}
	2902	}
	2903	/* It is counted once already... */
	2904	data->pos_min += minnext * (mincount - counted);
	2905	#if 0
	2906	Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
	2907	" OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
	2908	" maxcount=%" UVuf " mincount=%" UVuf
	2909	" data->pos_delta=%" UVuf "\n",
	2910	(UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
	2911	(UV)maxcount, (UV)mincount, (UV)data->pos_delta);
	2912	if (deltanext != OPTIMIZE_INFTY)
	2913	Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
	2914	(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
	2915	- minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
	2916	#endif
	2917	if (deltanext == OPTIMIZE_INFTY
	2918	\|\| data->pos_delta == OPTIMIZE_INFTY
	2919	\|\| -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
	2920	data->pos_delta = OPTIMIZE_INFTY;
	2921	else
	2922	data->pos_delta += - counted * deltanext +
	2923	(minnext + deltanext) * maxcount - minnext * mincount;
	2924	if (mincount != maxcount) {
	2925	/* Cannot extend fixed substrings found inside
	2926	the group. */
	2927	scan_commit(pRExC_state, data, minlenp, is_inf);
	2928	if (mincount && last_str) {
	2929	SV * const sv = data->last_found;
	2930	MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
	2931	mg_find(sv, PERL_MAGIC_utf8) : NULL;
	2932
	2933	if (mg)
	2934	mg->mg_len = -1;
	2935	sv_setsv(sv, last_str);
	2936	data->last_end = data->pos_min;
	2937	data->last_start_min = data->pos_min - last_chrs;
	2938	data->last_start_max = is_inf
	2939	? OPTIMIZE_INFTY
	2940	: data->pos_min + data->pos_delta - last_chrs;
	2941	}
	2942	data->cur_is_floating = 1; /* float */
	2943	}
	2944	SvREFCNT_dec(last_str);
	2945	}
	2946	if (data && (fl & SF_HAS_EVAL))
	2947	data->flags \|= SF_HAS_EVAL;
	2948	optimize_curly_tail:
	2949	rck_elide_nothing(oscan);
	2950	continue;
	2951
	2952	default:
	2953	Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
	2954	OP(scan));
	2955	case REF:
	2956	case CLUMP:
	2957	if (flags & SCF_DO_SUBSTR) {
	2958	/* Cannot expect anything... */
	2959	scan_commit(pRExC_state, data, minlenp, is_inf);
	2960	data->cur_is_floating = 1; /* float */
	2961	}
	2962	is_inf = is_inf_internal = 1;
	2963	if (flags & SCF_DO_STCLASS_OR) {
	2964	if (OP(scan) == CLUMP) {
	2965	/* Actually is any start char, but very few code points
	2966	* aren't start characters */
	2967	ssc_match_all_cp(data->start_class);
	2968	}
	2969	else {
	2970	ssc_anything(data->start_class);
	2971	}
	2972	}
	2973	flags &= ~SCF_DO_STCLASS;
	2974	break;
	2975	}
	2976	}
	2977	else if (OP(scan) == LNBREAK) {
	2978	if (flags & SCF_DO_STCLASS) {
	2979	if (flags & SCF_DO_STCLASS_AND) {
	2980	ssc_intersection(data->start_class,
	2981	PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
	2982	ssc_clear_locale(data->start_class);
	2983	ANYOF_FLAGS(data->start_class)
	2984	&= ~SSC_MATCHES_EMPTY_STRING;
	2985	}
	2986	else if (flags & SCF_DO_STCLASS_OR) {
	2987	ssc_union(data->start_class,
	2988	PL_XPosix_ptrs[CC_VERTSPACE_],
	2989	FALSE);
	2990	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	2991
	2992	/* See commit msg for
	2993	* 749e076fceedeb708a624933726e7989f2302f6a */
	2994	ANYOF_FLAGS(data->start_class)
	2995	&= ~SSC_MATCHES_EMPTY_STRING;
	2996	}
	2997	flags &= ~SCF_DO_STCLASS;
	2998	}
	2999	min++;
	3000	if (delta != OPTIMIZE_INFTY)
	3001	delta++; /* Because of the 2 char string cr-lf */
	3002	if (flags & SCF_DO_SUBSTR) {
	3003	/* Cannot expect anything... */
	3004	scan_commit(pRExC_state, data, minlenp, is_inf);
	3005	data->pos_min += 1;
	3006	if (data->pos_delta != OPTIMIZE_INFTY) {
	3007	data->pos_delta += 1;
	3008	}
	3009	data->cur_is_floating = 1; /* float */
	3010	}
	3011	}
	3012	else if (REGNODE_SIMPLE(OP(scan))) {
	3013
	3014	if (flags & SCF_DO_SUBSTR) {
	3015	scan_commit(pRExC_state, data, minlenp, is_inf);
	3016	data->pos_min++;
	3017	}
	3018	min++;
	3019	if (flags & SCF_DO_STCLASS) {
	3020	bool invert = 0;
	3021	SV* my_invlist = NULL;
	3022	U8 namedclass;
	3023
	3024	/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
	3025	ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
	3026
	3027	/* Some of the logic below assumes that switching
	3028	locale on will only add false positives. */
	3029	switch (OP(scan)) {
	3030
	3031	default:
	3032	#ifdef DEBUGGING
	3033	Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
	3034	OP(scan));
	3035	#endif
	3036	case SANY:
	3037	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3038	ssc_match_all_cp(data->start_class);
	3039	break;
	3040
	3041	case REG_ANY:
	3042	{
	3043	SV* REG_ANY_invlist = _new_invlist(2);
	3044	REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
	3045	'\n');
	3046	if (flags & SCF_DO_STCLASS_OR) {
	3047	ssc_union(data->start_class,
	3048	REG_ANY_invlist,
	3049	TRUE /* TRUE => invert, hence all but \n
	3050	*/
	3051	);
	3052	}
	3053	else if (flags & SCF_DO_STCLASS_AND) {
	3054	ssc_intersection(data->start_class,
	3055	REG_ANY_invlist,
	3056	TRUE /* TRUE => invert */
	3057	);
	3058	ssc_clear_locale(data->start_class);
	3059	}
	3060	SvREFCNT_dec_NN(REG_ANY_invlist);
	3061	}
	3062	break;
	3063
	3064	case ANYOFD:
	3065	case ANYOFL:
	3066	case ANYOFPOSIXL:
	3067	case ANYOFH:
	3068	case ANYOFHb:
	3069	case ANYOFHr:
	3070	case ANYOFHs:
	3071	case ANYOF:
	3072	if (flags & SCF_DO_STCLASS_AND)
	3073	ssc_and(pRExC_state, data->start_class,
	3074	(regnode_charclass *) scan);
	3075	else
	3076	ssc_or(pRExC_state, data->start_class,
	3077	(regnode_charclass *) scan);
	3078	break;
	3079
	3080	case ANYOFHbbm:
	3081	{
	3082	SV* cp_list = get_ANYOFHbbm_contents(scan);
	3083
	3084	if (flags & SCF_DO_STCLASS_OR) {
	3085	ssc_union(data->start_class, cp_list, invert);
	3086	}
	3087	else if (flags & SCF_DO_STCLASS_AND) {
	3088	ssc_intersection(data->start_class, cp_list, invert);
	3089	}
	3090
	3091	SvREFCNT_dec_NN(cp_list);
	3092	break;
	3093	}
	3094
	3095	case NANYOFM: /* NANYOFM already contains the inversion of the
	3096	input ANYOF data, so, unlike things like
	3097	NPOSIXA, don't change 'invert' to TRUE */
	3098	/* FALLTHROUGH */
	3099	case ANYOFM:
	3100	{
	3101	SV* cp_list = get_ANYOFM_contents(scan);
	3102
	3103	if (flags & SCF_DO_STCLASS_OR) {
	3104	ssc_union(data->start_class, cp_list, invert);
	3105	}
	3106	else if (flags & SCF_DO_STCLASS_AND) {
	3107	ssc_intersection(data->start_class, cp_list, invert);
	3108	}
	3109
	3110	SvREFCNT_dec_NN(cp_list);
	3111	break;
	3112	}
	3113
	3114	case ANYOFR:
	3115	case ANYOFRb:
	3116	{
	3117	SV* cp_list = NULL;
	3118
	3119	cp_list = _add_range_to_invlist(cp_list,
	3120	ANYOFRbase(scan),
	3121	ANYOFRbase(scan) + ANYOFRdelta(scan));
	3122
	3123	if (flags & SCF_DO_STCLASS_OR) {
	3124	ssc_union(data->start_class, cp_list, invert);
	3125	}
	3126	else if (flags & SCF_DO_STCLASS_AND) {
	3127	ssc_intersection(data->start_class, cp_list, invert);
	3128	}
	3129
	3130	SvREFCNT_dec_NN(cp_list);
	3131	break;
	3132	}
	3133
	3134	case NPOSIXL:
	3135	invert = 1;
	3136	/* FALLTHROUGH */
	3137
	3138	case POSIXL:
	3139	namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
	3140	if (flags & SCF_DO_STCLASS_AND) {
	3141	bool was_there = cBOOL(
	3142	ANYOF_POSIXL_TEST(data->start_class,
	3143	namedclass));
	3144	ANYOF_POSIXL_ZERO(data->start_class);
	3145	if (was_there) { /* Do an AND */
	3146	ANYOF_POSIXL_SET(data->start_class, namedclass);
	3147	}
	3148	/* No individual code points can now match */
	3149	data->start_class->invlist
	3150	= sv_2mortal(_new_invlist(0));
	3151	}
	3152	else {
	3153	int complement = namedclass + ((invert) ? -1 : 1);
	3154
	3155	assert(flags & SCF_DO_STCLASS_OR);
	3156
	3157	/* If the complement of this class was already there,
	3158	* the result is that they match all code points,
	3159	* (\d + \D == everything). Remove the classes from
	3160	* future consideration. Locale is not relevant in
	3161	* this case */
	3162	if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
	3163	ssc_match_all_cp(data->start_class);
	3164	ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
	3165	ANYOF_POSIXL_CLEAR(data->start_class, complement);
	3166	}
	3167	else { /* The usual case; just add this class to the
	3168	existing set */
	3169	ANYOF_POSIXL_SET(data->start_class, namedclass);
	3170	}
	3171	}
	3172	break;
	3173
	3174	case NPOSIXA: /* For these, we always know the exact set of
	3175	what's matched */
	3176	invert = 1;
	3177	/* FALLTHROUGH */
	3178	case POSIXA:
	3179	my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
	3180	goto join_posix_and_ascii;
	3181
	3182	case NPOSIXD:
	3183	case NPOSIXU:
	3184	invert = 1;
	3185	/* FALLTHROUGH */
	3186	case POSIXD:
	3187	case POSIXU:
	3188	my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
	3189
	3190	/* NPOSIXD matches all upper Latin1 code points unless the
	3191	* target string being matched is UTF-8, which is
	3192	* unknowable until match time. Since we are going to
	3193	* invert, we want to get rid of all of them so that the
	3194	* inversion will match all */
	3195	if (OP(scan) == NPOSIXD) {
	3196	_invlist_subtract(my_invlist, PL_UpperLatin1,
	3197	&my_invlist);
	3198	}
	3199
	3200	join_posix_and_ascii:
	3201
	3202	if (flags & SCF_DO_STCLASS_AND) {
	3203	ssc_intersection(data->start_class, my_invlist, invert);
	3204	ssc_clear_locale(data->start_class);
	3205	}
	3206	else {
	3207	assert(flags & SCF_DO_STCLASS_OR);
	3208	ssc_union(data->start_class, my_invlist, invert);
	3209	}
	3210	SvREFCNT_dec(my_invlist);
	3211	}
	3212	if (flags & SCF_DO_STCLASS_OR)
	3213	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	3214	flags &= ~SCF_DO_STCLASS;
	3215	}
	3216	}
	3217	else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
	3218	data->flags \|= (OP(scan) == MEOL
	3219	? SF_BEFORE_MEOL
	3220	: SF_BEFORE_SEOL);
	3221	scan_commit(pRExC_state, data, minlenp, is_inf);
	3222
	3223	}
	3224	else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
	3225	/* Lookbehind, or need to calculate parens/evals/stclass: */
	3226	&& (scan->flags \|\| data \|\| (flags & SCF_DO_STCLASS))
	3227	&& (OP(scan) == IFMATCH \|\| OP(scan) == UNLESSM))
	3228	{
	3229	if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	3230	\|\| OP(scan) == UNLESSM )
	3231	{
	3232	/* Negative Lookahead/lookbehind
	3233	In this case we can't do fixed string optimisation.
	3234	*/
	3235
	3236	bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
	3237	SSize_t deltanext, minnext;
	3238	SSize_t fake_last_close = 0;
	3239	regnode *fake_last_close_op = NULL;
	3240	regnode *cur_last_close_op;
	3241	regnode *nscan;
	3242	regnode_ssc intrnl;
	3243	U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
	3244
	3245	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3246	if (data) {
	3247	data_fake.whilem_c = data->whilem_c;
	3248	data_fake.last_closep = data->last_closep;
	3249	data_fake.last_close_opp = data->last_close_opp;
	3250	}
	3251	else {
	3252	data_fake.last_closep = &fake_last_close;
	3253	data_fake.last_close_opp = &fake_last_close_op;
	3254	}
	3255
	3256	/* remember the last_close_op we saw so we can see if
	3257	* we are dealing with variable length lookbehind that
	3258	* contains capturing buffers, which are considered
	3259	* experimental */
	3260	cur_last_close_op= *(data_fake.last_close_opp);
	3261
	3262	data_fake.pos_delta = delta;
	3263	if ( flags & SCF_DO_STCLASS && !scan->flags
	3264	&& OP(scan) == IFMATCH ) { /* Lookahead */
	3265	ssc_init(pRExC_state, &intrnl);
	3266	data_fake.start_class = &intrnl;
	3267	f \|= SCF_DO_STCLASS_AND;
	3268	}
	3269	if (flags & SCF_WHILEM_VISITED_POS)
	3270	f \|= SCF_WHILEM_VISITED_POS;
	3271	next = regnext(scan);
	3272	nscan = REGNODE_AFTER(scan);
	3273
	3274	/* recurse study_chunk() for lookahead body */
	3275	minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
	3276	last, &data_fake, stopparen,
	3277	recursed_depth, NULL, f, depth+1,
	3278	mutate_ok);
	3279
	3280	if (scan->flags) {
	3281	if ( deltanext < 0
	3282	\|\| deltanext > (I32) U8_MAX
	3283	\|\| minnext > (I32)U8_MAX
	3284	\|\| minnext + deltanext > (I32)U8_MAX)
	3285	{
	3286	FAIL2("Lookbehind longer than %" UVuf " not implemented",
	3287	(UV)U8_MAX);
	3288	}
	3289
	3290	/* The 'next_off' field has been repurposed to count the
	3291	* additional starting positions to try beyond the initial
	3292	* one. (This leaves it at 0 for non-variable length
	3293	* matches to avoid breakage for those not using this
	3294	* extension) */
	3295	if (deltanext) {
	3296	scan->next_off = deltanext;
	3297	if (
	3298	/* See a CLOSE op inside this lookbehind? */
	3299	cur_last_close_op != *(data_fake.last_close_opp)
	3300	/* and not doing restudy. see: restudied */
	3301	&& !(flags & SCF_TRIE_DOING_RESTUDY)
	3302	) {
	3303	/* this is positive variable length lookbehind with
	3304	* capture buffers inside of it */
	3305	ckWARNexperimental_with_arg(RExC_parse,
	3306	WARN_EXPERIMENTAL__VLB,
	3307	"Variable length %s lookbehind with capturing is experimental",
	3308	is_positive ? "positive" : "negative");
	3309	}
	3310	}
	3311	scan->flags = (U8)minnext + deltanext;
	3312	}
	3313	if (data) {
	3314	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3315	pars++;
	3316	if (data_fake.flags & SF_HAS_EVAL)
	3317	data->flags \|= SF_HAS_EVAL;
	3318	data->whilem_c = data_fake.whilem_c;
	3319	}
	3320	if (f & SCF_DO_STCLASS_AND) {
	3321	if (flags & SCF_DO_STCLASS_OR) {
	3322	/* OR before, AND after: ideally we would recurse with
	3323	* data_fake to get the AND applied by study of the
	3324	* remainder of the pattern, and then derecurse;
	3325	* * HACK * for now just treat as "no information".
	3326	* See [perl #56690].
	3327	*/
	3328	ssc_init(pRExC_state, data->start_class);
	3329	} else {
	3330	/* AND before and after: combine and continue. These
	3331	* assertions are zero-length, so can match an EMPTY
	3332	* string */
	3333	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
	3334	ANYOF_FLAGS(data->start_class)
	3335	\|= SSC_MATCHES_EMPTY_STRING;
	3336	}
	3337	}
	3338	DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
	3339	}
	3340	#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
	3341	else {
	3342	/* Positive Lookahead/lookbehind
	3343	In this case we can do fixed string optimisation,
	3344	but we must be careful about it. Note in the case of
	3345	lookbehind the positions will be offset by the minimum
	3346	length of the pattern, something we won't know about
	3347	until after the recurse.
	3348	*/
	3349	SSize_t deltanext, fake_last_close = 0;
	3350	regnode *last_close_op = NULL;
	3351	regnode *nscan;
	3352	regnode_ssc intrnl;
	3353	U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
	3354	/* We use SAVEFREEPV so that when the full compile
	3355	is finished perl will clean up the allocated
	3356	minlens when it's all done. This way we don't
	3357	have to worry about freeing them when we know
	3358	they wont be used, which would be a pain.
	3359	*/
	3360	SSize_t *minnextp;
	3361	Newx( minnextp, 1, SSize_t );
	3362	SAVEFREEPV(minnextp);
	3363
	3364	if (data) {
	3365	StructCopy(data, &data_fake, scan_data_t);
	3366	if ((flags & SCF_DO_SUBSTR) && data->last_found) {
	3367	f \|= SCF_DO_SUBSTR;
	3368	if (scan->flags)
	3369	scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
	3370	data_fake.last_found=newSVsv(data->last_found);
	3371	}
	3372	}
	3373	else {
	3374	data_fake.last_closep = &fake_last_close;
	3375	data_fake.last_close_opp = &fake_last_close_opp;
	3376	}
	3377	data_fake.flags = 0;
	3378	data_fake.substrs[0].flags = 0;
	3379	data_fake.substrs[1].flags = 0;
	3380	data_fake.pos_delta = delta;
	3381	if (is_inf)
	3382	data_fake.flags \|= SF_IS_INF;
	3383	if ( flags & SCF_DO_STCLASS && !scan->flags
	3384	&& OP(scan) == IFMATCH ) { /* Lookahead */
	3385	ssc_init(pRExC_state, &intrnl);
	3386	data_fake.start_class = &intrnl;
	3387	f \|= SCF_DO_STCLASS_AND;
	3388	}
	3389	if (flags & SCF_WHILEM_VISITED_POS)
	3390	f \|= SCF_WHILEM_VISITED_POS;
	3391	next = regnext(scan);
	3392	nscan = REGNODE_AFTER(scan);
	3393
	3394	/* positive lookahead study_chunk() recursion */
	3395	*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
	3396	&deltanext, last, &data_fake,
	3397	stopparen, recursed_depth, NULL,
	3398	f, depth+1, mutate_ok);
	3399	if (scan->flags) {
	3400	assert(0); /* This code has never been tested since this
	3401	is normally not compiled */
	3402	if ( deltanext < 0
	3403	\|\| deltanext > (I32) U8_MAX
	3404	\|\| *minnextp > (I32)U8_MAX
	3405	\|\| *minnextp + deltanext > (I32)U8_MAX)
	3406	{
	3407	FAIL2("Lookbehind longer than %" UVuf " not implemented",
	3408	(UV)U8_MAX);
	3409	}
	3410
	3411	if (deltanext) {
	3412	scan->next_off = deltanext;
	3413	}
	3414	scan->flags = (U8)*minnextp + deltanext;
	3415	}
	3416
	3417	*minnextp += min;
	3418
	3419	if (f & SCF_DO_STCLASS_AND) {
	3420	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
	3421	ANYOF_FLAGS(data->start_class) \|= SSC_MATCHES_EMPTY_STRING;
	3422	}
	3423	if (data) {
	3424	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3425	pars++;
	3426	if (data_fake.flags & SF_HAS_EVAL)
	3427	data->flags \|= SF_HAS_EVAL;
	3428	data->whilem_c = data_fake.whilem_c;
	3429	if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
	3430	int i;
	3431	if (RExC_rx->minlen < *minnextp)
	3432	RExC_rx->minlen = *minnextp;
	3433	scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
	3434	SvREFCNT_dec_NN(data_fake.last_found);
	3435
	3436	for (i = 0; i < 2; i++) {
	3437	if (data_fake.substrs[i].minlenp != minlenp) {
	3438	data->substrs[i].min_offset =
	3439	data_fake.substrs[i].min_offset;
	3440	data->substrs[i].max_offset =
	3441	data_fake.substrs[i].max_offset;
	3442	data->substrs[i].minlenp =
	3443	data_fake.substrs[i].minlenp;
	3444	data->substrs[i].lookbehind += scan->flags;
	3445	}
	3446	}
	3447	}
	3448	}
	3449	}
	3450	#endif
	3451	}
	3452	else if (OP(scan) == OPEN) {
	3453	if (stopparen != (I32)PARNO(scan))
	3454	pars++;
	3455	}
	3456	else if (OP(scan) == CLOSE) {
	3457	if (stopparen == (I32)PARNO(scan)) {
	3458	break;
	3459	}
	3460	if ((I32)PARNO(scan) == is_par) {
	3461	next = regnext(scan);
	3462
	3463	if ( next && (OP(next) != WHILEM) && next < last)
	3464	is_par = 0; /* Disable optimization */
	3465	}
	3466	if (data) {
	3467	*(data->last_closep) = PARNO(scan);
	3468	*(data->last_close_opp) = scan;
	3469	}
	3470	}
	3471	else if (OP(scan) == EVAL) {
	3472	if (data)
	3473	data->flags \|= SF_HAS_EVAL;
	3474	}
	3475	else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
	3476	if (flags & SCF_DO_SUBSTR) {
	3477	scan_commit(pRExC_state, data, minlenp, is_inf);
	3478	flags &= ~SCF_DO_SUBSTR;
	3479	}
	3480	if (OP(scan)==ACCEPT) {
	3481	/* m{(ACCEPT)x} does not have to start with 'x' /
	3482	flags &= ~SCF_DO_STCLASS;
	3483	if (data)
	3484	data->flags \|= SCF_SEEN_ACCEPT;
	3485	if (stopmin > min)
	3486	stopmin = min;
	3487	}
	3488	}
	3489	else if (OP(scan) == COMMIT) {
	3490	/* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
	3491	* must not end up with "abcxyz" as a fixed substring else we'll
	3492	* skip straight to attempting to match at offset 4.
	3493	*/
	3494	if (flags & SCF_DO_SUBSTR) {
	3495	scan_commit(pRExC_state, data, minlenp, is_inf);
	3496	flags &= ~SCF_DO_SUBSTR;
	3497	}
	3498	}
	3499	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
	3500	{
	3501	if (flags & SCF_DO_SUBSTR) {
	3502	scan_commit(pRExC_state, data, minlenp, is_inf);
	3503	data->cur_is_floating = 1; /* float */
	3504	}
	3505	is_inf = is_inf_internal = 1;
	3506	if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
	3507	ssc_anything(data->start_class);
	3508	flags &= ~SCF_DO_STCLASS;
	3509	}
	3510	else if (OP(scan) == GPOS) {
	3511	if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
	3512	!(delta \|\| is_inf \|\| (data && data->pos_delta)))
	3513	{
	3514	if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
	3515	RExC_rx->intflags \|= PREGf_ANCH_GPOS;
	3516	if (RExC_rx->gofs < (STRLEN)min)
	3517	RExC_rx->gofs = min;
	3518	} else {
	3519	RExC_rx->intflags \|= PREGf_GPOS_FLOAT;
	3520	RExC_rx->gofs = 0;
	3521	}
	3522	}
	3523	#ifdef TRIE_STUDY_OPT
	3524	#ifdef FULL_TRIE_STUDY
	3525	else if (REGNODE_TYPE(OP(scan)) == TRIE) {
	3526	/* NOTE - There is similar code to this block above for handling
	3527	BRANCH nodes on the initial study. If you change stuff here
	3528	check there too. */
	3529	regnode *trie_node= scan;
	3530	regnode *tail= regnext(scan);
	3531	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	3532	SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
	3533	regnode_ssc accum;
	3534
	3535	if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
	3536	/* Cannot merge strings after this. */
	3537	scan_commit(pRExC_state, data, minlenp, is_inf);
	3538	}
	3539	if (flags & SCF_DO_STCLASS)
	3540	ssc_init_zero(pRExC_state, &accum);
	3541
	3542	if (!trie->jump) {
	3543	min1= trie->minlen;
	3544	max1= trie->maxlen;
	3545	} else {
	3546	const regnode *nextbranch= NULL;
	3547	U32 word;
	3548
	3549	for ( word=1 ; word <= trie->wordcount ; word++)
	3550	{
	3551	SSize_t deltanext = 0, minnext = 0;
	3552	U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
	3553	SSize_t fake_last_close = 0;
	3554	regnode *fake_last_close_op = NULL;
	3555	regnode_ssc this_class;
	3556
	3557	StructCopy(&zero_scan_data, &data_fake, scan_data_t);
	3558	if (data) {
	3559	data_fake.whilem_c = data->whilem_c;
	3560	data_fake.last_closep = data->last_closep;
	3561	data_fake.last_close_opp = data->last_close_opp;
	3562	}
	3563	else {
	3564	data_fake.last_closep = &fake_last_close;
	3565	data_fake.last_close_opp = &fake_last_close_op;
	3566	}
	3567	data_fake.pos_delta = delta;
	3568	if (flags & SCF_DO_STCLASS) {
	3569	ssc_init(pRExC_state, &this_class);
	3570	data_fake.start_class = &this_class;
	3571	f \|= SCF_DO_STCLASS_AND;
	3572	}
	3573	if (flags & SCF_WHILEM_VISITED_POS)
	3574	f \|= SCF_WHILEM_VISITED_POS;
	3575
	3576	if (trie->jump[word]) {
	3577	if (!nextbranch)
	3578	nextbranch = trie_node + trie->jump[0];
	3579	scan= trie_node + trie->jump[word];
	3580	/* We go from the jump point to the branch that follows
	3581	it. Note this means we need the vestigal unused
	3582	branches even though they arent otherwise used. */
	3583	/* optimise study_chunk() for TRIE */
	3584	minnext = study_chunk(pRExC_state, &scan, minlenp,
	3585	&deltanext, (regnode *)nextbranch, &data_fake,
	3586	stopparen, recursed_depth, NULL, f, depth+1,
	3587	mutate_ok);
	3588	}
	3589	if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
	3590	nextbranch= regnext((regnode*)nextbranch);
	3591
	3592	if (min1 > (SSize_t)(minnext + trie->minlen))
	3593	min1 = minnext + trie->minlen;
	3594	if (deltanext == OPTIMIZE_INFTY) {
	3595	is_inf = is_inf_internal = 1;
	3596	max1 = OPTIMIZE_INFTY;
	3597	} else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
	3598	max1 = minnext + deltanext + trie->maxlen;
	3599
	3600	if (data_fake.flags & (SF_HAS_PAR\|SF_IN_PAR))
	3601	pars++;
	3602	if (data_fake.flags & SCF_SEEN_ACCEPT) {
	3603	if ( stopmin > min + min1)
	3604	stopmin = min + min1;
	3605	flags &= ~SCF_DO_SUBSTR;
	3606	if (data)
	3607	data->flags \|= SCF_SEEN_ACCEPT;
	3608	}
	3609	if (data) {
	3610	if (data_fake.flags & SF_HAS_EVAL)
	3611	data->flags \|= SF_HAS_EVAL;
	3612	data->whilem_c = data_fake.whilem_c;
	3613	}
	3614	if (flags & SCF_DO_STCLASS)
	3615	ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
	3616	}
	3617	DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
	3618	}
	3619	if (flags & SCF_DO_SUBSTR) {
	3620	data->pos_min += min1;
	3621	data->pos_delta += max1 - min1;
	3622	if (max1 != min1 \|\| is_inf)
	3623	data->cur_is_floating = 1; /* float */
	3624	}
	3625	min += min1;
	3626	if (delta != OPTIMIZE_INFTY) {
	3627	if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
	3628	delta += max1 - min1;
	3629	else
	3630	delta = OPTIMIZE_INFTY;
	3631	}
	3632	if (flags & SCF_DO_STCLASS_OR) {
	3633	ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	3634	if (min1) {
	3635	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	3636	flags &= ~SCF_DO_STCLASS;
	3637	}
	3638	}
	3639	else if (flags & SCF_DO_STCLASS_AND) {
	3640	if (min1) {
	3641	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
	3642	flags &= ~SCF_DO_STCLASS;
	3643	}
	3644	else {
	3645	/* Switch to OR mode: cache the old value of
	3646	* data->start_class */
	3647	INIT_AND_WITHP;
	3648	StructCopy(data->start_class, and_withp, regnode_ssc);
	3649	flags &= ~SCF_DO_STCLASS_AND;
	3650	StructCopy(&accum, data->start_class, regnode_ssc);
	3651	flags \|= SCF_DO_STCLASS_OR;
	3652	}
	3653	}
	3654	scan= tail;
	3655	DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
	3656	continue;
	3657	}
	3658	#else
	3659	else if (REGNODE_TYPE(OP(scan)) == TRIE) {
	3660	reg_trie_data trie = (reg_trie_data)RExC_rxi->data->data[ ARG(scan) ];
	3661	U8*bang=NULL;
	3662
	3663	min += trie->minlen;
	3664	delta += (trie->maxlen - trie->minlen);
	3665	flags &= ~SCF_DO_STCLASS; /* xxx */
	3666	if (flags & SCF_DO_SUBSTR) {
	3667	/* Cannot expect anything... */
	3668	scan_commit(pRExC_state, data, minlenp, is_inf);
	3669	data->pos_min += trie->minlen;
	3670	data->pos_delta += (trie->maxlen - trie->minlen);
	3671	if (trie->maxlen != trie->minlen)
	3672	data->cur_is_floating = 1; /* float */
	3673	}
	3674	if (trie->jump) /* no more substrings -- for now /grr*/
	3675	flags &= ~SCF_DO_SUBSTR;
	3676	}
	3677
	3678	#endif /* old or new */
	3679	#endif /* TRIE_STUDY_OPT */
	3680
	3681	else if (OP(scan) == REGEX_SET) {
	3682	Perl_croak(aTHX_ "panic: %s regnode should be resolved"
	3683	" before optimization", REGNODE_NAME(REGEX_SET));
	3684	}
	3685
	3686	/* Else: zero-length, ignore. */
	3687	scan = regnext(scan);
	3688	}
	3689
	3690	finish:
	3691	if (frame) {
	3692	/* we need to unwind recursion. */
	3693	depth = depth - 1;
	3694
	3695	DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
	3696	DEBUG_PEEP("fend", scan, depth, flags);
	3697
	3698	/* restore previous context */
	3699	last = frame->last_regnode;
	3700	scan = frame->next_regnode;
	3701	stopparen = frame->stopparen;
	3702	recursed_depth = frame->prev_recursed_depth;
	3703
	3704	RExC_frame_last = frame->prev_frame;
	3705	frame = frame->this_prev_frame;
	3706	goto fake_study_recurse;
	3707	}
	3708
	3709	assert(!frame);
	3710	DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
	3711
	3712	/* is this pattern infinite? Eg, consider /(a\|b+)/ */
	3713	if (is_inf_internal)
	3714	delta = OPTIMIZE_INFTY;
	3715
	3716	/* deal with (ACCEPT), Eg, consider /(foo(ACCEPT)\|bop)bar/ */
	3717	if (min > stopmin) {
	3718	/*
	3719	At this point 'min' represents the minimum length string we can
	3720	match while ignoring the implication of ACCEPT, and 'delta'
	3721	represents the difference between the minimum length and maximum
	3722	length, and if the pattern matches an infinitely long string
	3723	(consider the + and * quantifiers) then we use the special delta
	3724	value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
	3725	minimum length that can be matched and accepted.
	3726
	3727	A pattern is accepted when matching was successful and
	3728	complete, and thus there is no further matching needing to be
	3729	done, no backtracking to occur, etc. Prior to the introduction
	3730	of ACCEPT the only opcode that signaled acceptance was the END
	3731	opcode, which is always the very last opcode in a regex program.
	3732	ACCEPT is thus conceptually an early successful return out of
	3733	the matching process. stopmin starts out as OPTIMIZE_INFTY to
	3734	represent "the entire pattern", and is ratched down to the
	3735	"current min" if necessary when an ACCEPT opcode is encountered.
	3736
	3737	Thus stopmin might be smaller than min if we saw an (*ACCEPT),
	3738	and we now need to account for it in both min and delta.
	3739	Consider that in a pattern /AB/ normally the min length it can
	3740	match can be computed as min(A)+min(B). But (*ACCEPT) means
	3741	that it might be something else, not even neccesarily min(A) at
	3742	all. Consider
	3743
	3744	A = /(foo(*ACCEPT)\|x+)/
	3745	B = /whop/
	3746	AB = /(foo(*ACCEPT)\|x+)whop/
	3747
	3748	The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
	3749	for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
	3750	"whop", and the delta of 0 as the pattern is of fixed length, the
	3751	stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
	3752	When handling AB we expect to see a min of 5 for "xwhop", and a
	3753	delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
	3754	for "foo". This should result in a final min of 3 for "foo", and
	3755	a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
	3756
	3757	In something like /(dude(*ACCEPT)\|irk)x{3,7}/ we would have a
	3758	min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
	3759	stop min would be 4 for "dude". This should result in a final
	3760	min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
	3761
	3762	When min is smaller than stopmin then we can ignore it. In the
	3763	fragment /(x{10,20}(*ACCEPT)\|a)b+/, we would have a min of 2,
	3764	and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
	3765	the ACCEPT doesn't reduce the minimum length of the string that
	3766	might be matched, nor affect the maximum length.
	3767
	3768	In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
	3769	for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
	3770	"foo". We currently turn this into a min of 3 for "foo" and a
	3771	delta of 3 for "foobar" even though technically "foobar" isn't
	3772	possible. ACCEPT affects some aspects of the optimizer, like
	3773	length computations and mandatory substring optimizations, but
	3774	there are other optimzations this routine perfoms that are not
	3775	affected and this compromise simplifies implementation.
	3776
	3777	It might be helpful to consider that this C function is called
	3778	recursively on the pattern in a bottom up fashion, and that the
	3779	min returned by a nested call may be marked as coming from an
	3780	ACCEPT, causing its callers to treat the returned min as a
	3781	stopmin as the recursion unwinds. Thus a single ACCEPT can affect
	3782	multiple calls into this function in different ways.
	3783	*/
	3784
	3785	if (OPTIMIZE_INFTY - delta >= min - stopmin)
	3786	delta += min - stopmin;
	3787	else
	3788	delta = OPTIMIZE_INFTY;
	3789	min = stopmin;
	3790	}
	3791
	3792	*scanp = scan;
	3793	*deltap = delta;
	3794
	3795	if (flags & SCF_DO_SUBSTR && is_inf)
	3796	data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
	3797	if (is_par > (I32)U8_MAX)
	3798	is_par = 0;
	3799	if (is_par && pars==1 && data) {
	3800	data->flags \|= SF_IN_PAR;
	3801	data->flags &= ~SF_HAS_PAR;
	3802	}
	3803	else if (pars && data) {
	3804	data->flags \|= SF_HAS_PAR;
	3805	data->flags &= ~SF_IN_PAR;
	3806	}
	3807	if (flags & SCF_DO_STCLASS_OR)
	3808	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
	3809	if (flags & SCF_TRIE_RESTUDY)
	3810	data->flags \|= SCF_TRIE_RESTUDY;
	3811
	3812
	3813	if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
	3814	if (min > OPTIMIZE_INFTY - delta)
	3815	RExC_maxlen = OPTIMIZE_INFTY;
	3816	else if (RExC_maxlen < min + delta)
	3817	RExC_maxlen = min + delta;
	3818	}
	3819	DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
	3820	return min;
	3821	}