perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* regexec.c
	2	*/
	3
	4	/*
	5	* One Ring to rule them all, One Ring to find them
	6	&
	7	* [p.v of _The Lord of the Rings_, opening poem]
	8	* [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
	9	* [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
	10	*/
	11
	12	/* This file contains functions for executing a regular expression. See
	13	* also regcomp.c which funnily enough, contains functions for compiling
	14	* a regular expression.
	15	*
	16	* This file is also copied at build time to ext/re/re_exec.c, where
	17	* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
	18	* This causes the main functions to be compiled under new names and with
	19	* debugging support added, which makes "use re 'debug'" work.
	20	*/
	21
	22	/* NOTE: this is derived from Henry Spencer's regexp code, and should not
	23	* confused with the original package (see point 3 below). Thanks, Henry!
	24	*/
	25
	26	/* Additional note: this code is very heavily munged from Henry's version
	27	* in places. In some spots I've traded clarity for efficiency, so don't
	28	* blame Henry for some of the lack of readability.
	29	*/
	30
	31	/* The names of the functions have been changed from regcomp and
	32	* regexec to pregcomp and pregexec in order to avoid conflicts
	33	* with the POSIX routines of the same names.
	34	*/
	35
	36	#ifdef PERL_EXT_RE_BUILD
	37	#include "re_top.h"
	38	#endif
	39
	40	/* At least one required character in the target string is expressible only in
	41	* UTF-8. */
	42	static const char* const non_utf8_target_but_utf8_required
	43	= "Can't match, because target string needs to be in UTF-8\n";
	44
	45	#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
	46	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
	47	goto target; \
	48	} STMT_END
	49
	50	/*
	51	* pregcomp and pregexec -- regsub and regerror are not used in perl
	52	*
	53	* Copyright (c) 1986 by University of Toronto.
	54	* Written by Henry Spencer. Not derived from licensed software.
	55	*
	56	* Permission is granted to anyone to use this software for any
	57	* purpose on any computer system, and to redistribute it freely,
	58	* subject to the following restrictions:
	59	*
	60	* 1. The author is not responsible for the consequences of use of
	61	* this software, no matter how awful, even if they arise
	62	* from defects in it.
	63	*
	64	* 2. The origin of this software must not be misrepresented, either
	65	* by explicit claim or by omission.
	66	*
	67	* 3. Altered versions must be plainly marked as such, and must not
	68	* be misrepresented as being the original software.
	69	*
	70	**** Alterations to Henry's code are...
	71	****
	72	**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
	73	**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	74	**** by Larry Wall and others
	75	****
	76	**** You may distribute under the terms of either the GNU General Public
	77	**** License or the Artistic License, as specified in the README file.
	78	*
	79	* Beware that some of this code is subtly aware of the way operator
	80	* precedence is structured in regular expressions. Serious changes in
	81	* regular-expression syntax might require a total rethink.
	82	*/
	83	#include "EXTERN.h"
	84	#define PERL_IN_REGEXEC_C
	85	#include "perl.h"
	86
	87	#ifdef PERL_IN_XSUB_RE
	88	# include "re_comp.h"
	89	#else
	90	# include "regcomp.h"
	91	#endif
	92
	93	#include "inline_invlist.c"
	94	#include "unicode_constants.h"
	95
	96	#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
	97
	98	#ifndef STATIC
	99	#define STATIC static
	100	#endif
	101
	102	/* Valid for non-utf8 strings: avoids the reginclass
	103	* call if there are no complications: i.e., if everything matchable is
	104	* straight forward in the bitmap */
	105	#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
	106	: ANYOF_BITMAP_TEST(p,*(c)))
	107
	108	/*
	109	* Forwards.
	110	*/
	111
	112	#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
	113	#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
	114
	115	#define HOPc(pos,off) \
	116	(char *)(reginfo->is_utf8_target \
	117	? reghop3((U8*)pos, off, \
	118	(U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
	119	: (U8*)(pos + off))
	120	#define HOPBACKc(pos, off) \
	121	(char*)(reginfo->is_utf8_target \
	122	? reghopmaybe3((U8)pos, -off, (U8)(reginfo->strbeg)) \
	123	: (pos - off >= reginfo->strbeg) \
	124	? (U8*)pos - off \
	125	: NULL)
	126
	127	#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8)(pos), off, (U8)(lim)) : (U8*)(pos + off))
	128	#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
	129
	130
	131	#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
	132	#define NEXTCHR_IS_EOS (nextchr < 0)
	133
	134	#define SET_nextchr \
	135	nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
	136
	137	#define SET_locinput(p) \
	138	locinput = (p); \
	139	SET_nextchr
	140
	141
	142	#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
	143	if (!swash_ptr) { \
	144	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
	145	swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
	146	1, 0, NULL, &flags); \
	147	assert(swash_ptr); \
	148	} \
	149	} STMT_END
	150
	151	/* If in debug mode, we test that a known character properly matches */
	152	#ifdef DEBUGGING
	153	# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
	154	property_name, \
	155	utf8_char_in_property) \
	156	LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
	157	assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
	158	#else
	159	# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
	160	property_name, \
	161	utf8_char_in_property) \
	162	LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
	163	#endif
	164
	165	#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
	166	PL_utf8_swash_ptrs[_CC_WORDCHAR], \
	167	swash_property_names[_CC_WORDCHAR], \
	168	GREEK_SMALL_LETTER_IOTA_UTF8)
	169
	170	#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
	171	STMT_START { \
	172	LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
	173	"_X_regular_begin", \
	174	GREEK_SMALL_LETTER_IOTA_UTF8); \
	175	LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
	176	"_X_extend", \
	177	COMBINING_GRAVE_ACCENT_UTF8); \
	178	} STMT_END
	179
	180	#define PLACEHOLDER /* Something for the preprocessor to grab onto */
	181	/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
	182
	183	/* for use after a quantifier and before an EXACT-like node -- japhy */
	184	/* it would be nice to rework regcomp.sym to generate this stuff. sigh
	185	*
	186	* NOTE that nothing that affects backtracking should be in here, specifically
	187	* VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
	188	* node that is in between two EXACT like nodes when ascertaining what the required
	189	* "follow" character is. This should probably be moved to regex compile time
	190	* although it may be done at run time beause of the REF possibility - more
	191	* investigation required. -- demerphq
	192	*/
	193	#define JUMPABLE(rn) ( \
	194	OP(rn) == OPEN \|\| \
	195	(OP(rn) == CLOSE && (!cur_eval \|\| cur_eval->u.eval.close_paren != ARG(rn))) \|\| \
	196	OP(rn) == EVAL \|\| \
	197	OP(rn) == SUSPEND \|\| OP(rn) == IFMATCH \|\| \
	198	OP(rn) == PLUS \|\| OP(rn) == MINMOD \|\| \
	199	OP(rn) == KEEPS \|\| \
	200	(PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
	201	)
	202	#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
	203
	204	#define HAS_TEXT(rn) ( IS_EXACT(rn) \|\| PL_regkind[OP(rn)] == REF )
	205
	206	#if 0
	207	/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
	208	we don't need this definition. */
	209	#define IS_TEXT(rn) ( OP(rn)==EXACT \|\| OP(rn)==REF \|\| OP(rn)==NREF )
	210	#define IS_TEXTF(rn) ( OP(rn)==EXACTFU \|\| OP(rn)==EXACTFU_SS \|\| OP(rn)==EXACTFU_TRICKYFOLD \|\| OP(rn)==EXACTFA \|\| OP(rn)==EXACTF \|\| OP(rn)==REFF \|\| OP(rn)==NREFF )
	211	#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL \|\| OP(rn)==REFFL \|\| OP(rn)==NREFFL )
	212
	213	#else
	214	/* ... so we use this as its faster. */
	215	#define IS_TEXT(rn) ( OP(rn)==EXACT )
	216	#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU \|\| OP(rn)==EXACTFU_SS \|\| OP(rn)==EXACTFU_TRICKYFOLD \|\| OP(rn) == EXACTFA)
	217	#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
	218	#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
	219
	220	#endif
	221
	222	/*
	223	Search for mandatory following text node; for lookahead, the text must
	224	follow but for lookbehind (rn->flags != 0) we skip to the next step.
	225	*/
	226	#define FIND_NEXT_IMPT(rn) STMT_START { \
	227	while (JUMPABLE(rn)) { \
	228	const OPCODE type = OP(rn); \
	229	if (type == SUSPEND \|\| PL_regkind[type] == CURLY) \
	230	rn = NEXTOPER(NEXTOPER(rn)); \
	231	else if (type == PLUS) \
	232	rn = NEXTOPER(rn); \
	233	else if (type == IFMATCH) \
	234	rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
	235	else rn += NEXT_OFF(rn); \
	236	} \
	237	} STMT_END
	238
	239	/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
	240	* These are for the pre-composed Hangul syllables, which are all in a
	241	* contiguous block and arranged there in such a way so as to facilitate
	242	* alorithmic determination of their characteristics. As such, they don't need
	243	* a swash, but can be determined by simple arithmetic. Almost all are
	244	* GCB=LVT, but every 28th one is a GCB=LV */
	245	#define SBASE 0xAC00 /* Start of block */
	246	#define SCount 11172 /* Length of block */
	247	#define TCount 28
	248
	249	#define SLAB_FIRST(s) (&(s)->states[0])
	250	#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
	251
	252	static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
	253	static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
	254	static regmatch_state * S_push_slab(pTHX);
	255
	256	#define REGCP_PAREN_ELEMS 3
	257	#define REGCP_OTHER_ELEMS 3
	258	#define REGCP_FRAME_ELEMS 1
	259	/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
	260	* are needed for the regexp context stack bookkeeping. */
	261
	262	STATIC CHECKPOINT
	263	S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
	264	{
	265	dVAR;
	266	const int retval = PL_savestack_ix;
	267	const int paren_elems_to_push =
	268	(maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
	269	const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
	270	const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
	271	I32 p;
	272	GET_RE_DEBUG_FLAGS_DECL;
	273
	274	PERL_ARGS_ASSERT_REGCPPUSH;
	275
	276	if (paren_elems_to_push < 0)
	277	Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
	278	paren_elems_to_push);
	279
	280	if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
	281	Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
	282	" out of range (%lu-%ld)",
	283	total_elems,
	284	(unsigned long)maxopenparen,
	285	(long)parenfloor);
	286
	287	SSGROW(total_elems + REGCP_FRAME_ELEMS);
	288
	289	DEBUG_BUFFERS_r(
	290	if ((int)maxopenparen > (int)parenfloor)
	291	PerlIO_printf(Perl_debug_log,
	292	"rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
	293	PTR2UV(rex),
	294	PTR2UV(rex->offs)
	295	);
	296	);
	297	for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
	298	/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
	299	SSPUSHINT(rex->offs[p].end);
	300	SSPUSHINT(rex->offs[p].start);
	301	SSPUSHINT(rex->offs[p].start_tmp);
	302	DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
	303	" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
	304	(UV)p,
	305	(IV)rex->offs[p].start,
	306	(IV)rex->offs[p].start_tmp,
	307	(IV)rex->offs[p].end
	308	));
	309	}
	310	/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
	311	SSPUSHINT(maxopenparen);
	312	SSPUSHINT(rex->lastparen);
	313	SSPUSHINT(rex->lastcloseparen);
	314	SSPUSHUV(SAVEt_REGCONTEXT \| elems_shifted); /* Magic cookie. */
	315
	316	return retval;
	317	}
	318
	319	/* These are needed since we do not localize EVAL nodes: */
	320	#define REGCP_SET(cp) \
	321	DEBUG_STATE_r( \
	322	PerlIO_printf(Perl_debug_log, \
	323	" Setting an EVAL scope, savestack=%"IVdf"\n", \
	324	(IV)PL_savestack_ix)); \
	325	cp = PL_savestack_ix
	326
	327	#define REGCP_UNWIND(cp) \
	328	DEBUG_STATE_r( \
	329	if (cp != PL_savestack_ix) \
	330	PerlIO_printf(Perl_debug_log, \
	331	" Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
	332	(IV)(cp), (IV)PL_savestack_ix)); \
	333	regcpblow(cp)
	334
	335	#define UNWIND_PAREN(lp, lcp) \
	336	for (n = rex->lastparen; n > lp; n--) \
	337	rex->offs[n].end = -1; \
	338	rex->lastparen = n; \
	339	rex->lastcloseparen = lcp;
	340
	341
	342	STATIC void
	343	S_regcppop(pTHX_ regexp rex, U32 maxopenparen_p)
	344	{
	345	dVAR;
	346	UV i;
	347	U32 paren;
	348	GET_RE_DEBUG_FLAGS_DECL;
	349
	350	PERL_ARGS_ASSERT_REGCPPOP;
	351
	352	/* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
	353	i = SSPOPUV;
	354	assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
	355	i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
	356	rex->lastcloseparen = SSPOPINT;
	357	rex->lastparen = SSPOPINT;
	358	*maxopenparen_p = SSPOPINT;
	359
	360	i -= REGCP_OTHER_ELEMS;
	361	/* Now restore the parentheses context. */
	362	DEBUG_BUFFERS_r(
	363	if (i \|\| rex->lastparen + 1 <= rex->nparens)
	364	PerlIO_printf(Perl_debug_log,
	365	"rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
	366	PTR2UV(rex),
	367	PTR2UV(rex->offs)
	368	);
	369	);
	370	paren = *maxopenparen_p;
	371	for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
	372	I32 tmps;
	373	rex->offs[paren].start_tmp = SSPOPINT;
	374	rex->offs[paren].start = SSPOPINT;
	375	tmps = SSPOPINT;
	376	if (paren <= rex->lastparen)
	377	rex->offs[paren].end = tmps;
	378	DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
	379	" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
	380	(UV)paren,
	381	(IV)rex->offs[paren].start,
	382	(IV)rex->offs[paren].start_tmp,
	383	(IV)rex->offs[paren].end,
	384	(paren > rex->lastparen ? "(skipped)" : ""));
	385	);
	386	paren--;
	387	}
	388	#if 1
	389	/* It would seem that the similar code in regtry()
	390	* already takes care of this, and in fact it is in
	391	* a better location to since this code can #if 0-ed out
	392	* but the code in regtry() is needed or otherwise tests
	393	* requiring null fields (pat.t#187 and split.t#{13,14}
	394	* (as of patchlevel 7877) will fail. Then again,
	395	* this code seems to be necessary or otherwise
	396	* this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
	397	* --jhi updated by dapm */
	398	for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
	399	if (i > *maxopenparen_p)
	400	rex->offs[i].start = -1;
	401	rex->offs[i].end = -1;
	402	DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
	403	" \\%"UVuf": %s ..-1 undeffing\n",
	404	(UV)i,
	405	(i > *maxopenparen_p) ? "-1" : " "
	406	));
	407	}
	408	#endif
	409	}
	410
	411	/* restore the parens and associated vars at savestack position ix,
	412	* but without popping the stack */
	413
	414	STATIC void
	415	S_regcp_restore(pTHX_ regexp rex, I32 ix, U32 maxopenparen_p)
	416	{
	417	I32 tmpix = PL_savestack_ix;
	418	PL_savestack_ix = ix;
	419	regcppop(rex, maxopenparen_p);
	420	PL_savestack_ix = tmpix;
	421	}
	422
	423	#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
	424
	425	STATIC bool
	426	S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
	427	{
	428	/* Returns a boolean as to whether or not 'character' is a member of the
	429	* Posix character class given by 'classnum' that should be equivalent to a
	430	* value in the typedef '_char_class_number'.
	431	*
	432	* Ideally this could be replaced by a just an array of function pointers
	433	* to the C library functions that implement the macros this calls.
	434	* However, to compile, the precise function signatures are required, and
	435	* these may vary from platform to to platform. To avoid having to figure
	436	* out what those all are on each platform, I (khw) am using this method,
	437	* which adds an extra layer of function call overhead (unless the C
	438	* optimizer strips it away). But we don't particularly care about
	439	* performance with locales anyway. */
	440
	441	switch ((_char_class_number) classnum) {
	442	case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
	443	case _CC_ENUM_ALPHA: return isALPHA_LC(character);
	444	case _CC_ENUM_ASCII: return isASCII_LC(character);
	445	case _CC_ENUM_BLANK: return isBLANK_LC(character);
	446	case _CC_ENUM_CASED: return isLOWER_LC(character)
	447	\|\| isUPPER_LC(character);
	448	case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
	449	case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
	450	case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
	451	case _CC_ENUM_LOWER: return isLOWER_LC(character);
	452	case _CC_ENUM_PRINT: return isPRINT_LC(character);
	453	case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
	454	case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
	455	case _CC_ENUM_SPACE: return isSPACE_LC(character);
	456	case _CC_ENUM_UPPER: return isUPPER_LC(character);
	457	case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
	458	case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
	459	default: /* VERTSPACE should never occur in locales */
	460	Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
	461	}
	462
	463	assert(0); /* NOTREACHED */
	464	return FALSE;
	465	}
	466
	467	STATIC bool
	468	S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
	469	{
	470	/* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
	471	* 'character' is a member of the Posix character class given by 'classnum'
	472	* that should be equivalent to a value in the typedef
	473	* '_char_class_number'.
	474	*
	475	* This just calls isFOO_lc on the code point for the character if it is in
	476	* the range 0-255. Outside that range, all characters avoid Unicode
	477	* rules, ignoring any locale. So use the Unicode function if this class
	478	* requires a swash, and use the Unicode macro otherwise. */
	479
	480	PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
	481
	482	if (UTF8_IS_INVARIANT(*character)) {
	483	return isFOO_lc(classnum, *character);
	484	}
	485	else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
	486	return isFOO_lc(classnum,
	487	TWO_BYTE_UTF8_TO_UNI(character, (character + 1)));
	488	}
	489
	490	if (classnum < _FIRST_NON_SWASH_CC) {
	491
	492	/* Initialize the swash unless done already */
	493	if (! PL_utf8_swash_ptrs[classnum]) {
	494	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	495	PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
	496	swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
	497	}
	498
	499	return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
	500	character,
	501	TRUE /* is UTF */ ));
	502	}
	503
	504	switch ((_char_class_number) classnum) {
	505	case _CC_ENUM_SPACE:
	506	case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
	507
	508	case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
	509	case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
	510	case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
	511	default: return 0; /* Things like CNTRL are always
	512	below 256 */
	513	}
	514
	515	assert(0); /* NOTREACHED */
	516	return FALSE;
	517	}
	518
	519	/*
	520	* pregexec and friends
	521	*/
	522
	523	#ifndef PERL_IN_XSUB_RE
	524	/*
	525	- pregexec - match a regexp against a string
	526	*/
	527	I32
	528	Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
	529	char strbeg, I32 minend, SV screamer, U32 nosave)
	530	/* stringarg: the point in the string at which to begin matching */
	531	/* strend: pointer to null at end of string */
	532	/* strbeg: real beginning of string */
	533	/* minend: end of match must be >= minend bytes after stringarg. */
	534	/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
	535	* itself is accessed via the pointers above */
	536	/* nosave: For optimizations. */
	537	{
	538	PERL_ARGS_ASSERT_PREGEXEC;
	539
	540	return
	541	regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
	542	nosave ? 0 : REXEC_COPY_STR);
	543	}
	544	#endif
	545
	546	/*
	547	* Need to implement the following flags for reg_anch:
	548	*
	549	* USE_INTUIT_NOML - Useful to call re_intuit_start() first
	550	* USE_INTUIT_ML
	551	* INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
	552	* INTUIT_AUTORITATIVE_ML
	553	* INTUIT_ONCE_NOML - Intuit can match in one location only.
	554	* INTUIT_ONCE_ML
	555	*
	556	* Another flag for this function: SECOND_TIME (so that float substrs
	557	* with giant delta may be not rechecked).
	558	*/
	559
	560	/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
	561
	562	/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
	563	Otherwise, only SvCUR(sv) is used to get strbeg. */
	564
	565	/* XXXX We assume that strpos is strbeg unless sv. */
	566
	567	/* XXXX Some places assume that there is a fixed substring.
	568	An update may be needed if optimizer marks as "INTUITable"
	569	RExen without fixed substrings. Similarly, it is assumed that
	570	lengths of all the strings are no more than minlen, thus they
	571	cannot come from lookahead.
	572	(Or minlen should take into account lookahead.)
	573	NOTE: Some of this comment is not correct. minlen does now take account
	574	of lookahead/behind. Further research is required. -- demerphq
	575
	576	*/
	577
	578	/* A failure to find a constant substring means that there is no need to make
	579	an expensive call to REx engine, thus we celebrate a failure. Similarly,
	580	finding a substring too deep into the string means that fewer calls to
	581	regtry() should be needed.
	582
	583	REx compiler's optimizer found 4 possible hints:
	584	a) Anchored substring;
	585	b) Fixed substring;
	586	c) Whether we are anchored (beginning-of-line or \G);
	587	d) First node (of those at offset 0) which may distinguish positions;
	588	We use a)b)d) and multiline-part of c), and try to find a position in the
	589	string which does not contradict any of them.
	590	*/
	591
	592	/* Most of decisions we do here should have been done at compile time.
	593	The nodes of the REx which we used for the search should have been
	594	deleted from the finite automaton. */
	595
	596	/* args:
	597	* rx: the regex to match against
	598	* sv: the SV being matched: only used for utf8 flag; the string
	599	* itself is accessed via the pointers below. Note that on
	600	* something like an overloaded SV, SvPOK(sv) may be false
	601	* and the string pointers may point to something unrelated to
	602	* the SV itself.
	603	* strbeg: real beginning of string
	604	* strpos: the point in the string at which to begin matching
	605	* strend: pointer to the byte following the last char of the string
	606	* flags currently unused; set to 0
	607	* data: currently unused; set to NULL
	608	*/
	609
	610	char *
	611	Perl_re_intuit_start(pTHX_
	612	REGEXP * const rx,
	613	SV *sv,
	614	const char * const strbeg,
	615	char *strpos,
	616	char *strend,
	617	const U32 flags,
	618	re_scream_pos_data *data)
	619	{
	620	dVAR;
	621	struct regexp *const prog = ReANY(rx);
	622	I32 start_shift = 0;
	623	/* Should be nonnegative! */
	624	I32 end_shift = 0;
	625	char *s;
	626	SV *check;
	627	char *t;
	628	const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
	629	I32 ml_anch;
	630	char other_last = NULL; / other substr checked before this */
	631	char check_at = NULL; / check substr found at this pos */
	632	char checked_upto = NULL; / how far into the string we have already checked using find_byclass*/
	633	const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
	634	RXi_GET_DECL(prog,progi);
	635	regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
	636	regmatch_info *const reginfo = &reginfo_buf;
	637	#ifdef DEBUGGING
	638	const char * const i_strpos = strpos;
	639	#endif
	640	GET_RE_DEBUG_FLAGS_DECL;
	641
	642	PERL_ARGS_ASSERT_RE_INTUIT_START;
	643	PERL_UNUSED_ARG(flags);
	644	PERL_UNUSED_ARG(data);
	645
	646	/* CHR_DIST() would be more correct here but it makes things slow. */
	647	if (prog->minlen > strend - strpos) {
	648	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	649	"String too short... [re_intuit_start]\n"));
	650	goto fail;
	651	}
	652
	653	reginfo->is_utf8_target = cBOOL(utf8_target);
	654	reginfo->info_aux = NULL;
	655	reginfo->strbeg = strbeg;
	656	reginfo->strend = strend;
	657	reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
	658	reginfo->intuit = 1;
	659	/* not actually used within intuit, but zero for safety anyway */
	660	reginfo->poscache_maxiter = 0;
	661
	662	if (utf8_target) {
	663	if (!prog->check_utf8 && prog->check_substr)
	664	to_utf8_substr(prog);
	665	check = prog->check_utf8;
	666	} else {
	667	if (!prog->check_substr && prog->check_utf8) {
	668	if (! to_byte_substr(prog)) {
	669	NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
	670	}
	671	}
	672	check = prog->check_substr;
	673	}
	674	if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
	675	ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
	676	\|\| ( (prog->extflags & RXf_ANCH_BOL)
	677	&& !multiline ) ); /* Check after \n? */
	678
	679	if (!ml_anch) {
	680	if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
	681	&& !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
	682	&& (strpos != strbeg)) {
	683	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
	684	goto fail;
	685	}
	686	if (prog->check_offset_min == prog->check_offset_max
	687	&& !(prog->extflags & RXf_CANY_SEEN)
	688	&& ! multiline) /* /m can cause \n's to match that aren't
	689	accounted for in the string max length.
	690	See [perl #115242] */
	691	{
	692	/* Substring at constant offset from beg-of-str... */
	693	I32 slen;
	694
	695	s = HOP3c(strpos, prog->check_offset_min, strend);
	696
	697	if (SvTAIL(check)) {
	698	slen = SvCUR(check); /* >= 1 */
	699
	700	if ( strend - s > slen \|\| strend - s < slen - 1
	701	\|\| (strend - s == slen && strend[-1] != '\n')) {
	702	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
	703	goto fail_finish;
	704	}
	705	/* Now should match s[0..slen-2] */
	706	slen--;
	707	if (slen && (SvPVX_const(check) != s
	708	\|\| (slen > 1
	709	&& memNE(SvPVX_const(check), s, slen)))) {
	710	report_neq:
	711	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
	712	goto fail_finish;
	713	}
	714	}
	715	else if (SvPVX_const(check) != s
	716	\|\| ((slen = SvCUR(check)) > 1
	717	&& memNE(SvPVX_const(check), s, slen)))
	718	goto report_neq;
	719	check_at = s;
	720	goto success_at_start;
	721	}
	722	}
	723	/* Match is anchored, but substr is not anchored wrt beg-of-str. */
	724	s = strpos;
	725	start_shift = prog->check_offset_min; /* okay to underestimate on CC */
	726	end_shift = prog->check_end_shift;
	727
	728	if (!ml_anch) {
	729	const I32 end = prog->check_offset_max + CHR_SVLEN(check)
	730	- (SvTAIL(check) != 0);
	731	const I32 eshift = CHR_DIST((U8)strend, (U8)s) - end;
	732
	733	if (end_shift < eshift)
	734	end_shift = eshift;
	735	}
	736	}
	737	else { /* Can match at random position */
	738	ml_anch = 0;
	739	s = strpos;
	740	start_shift = prog->check_offset_min; /* okay to underestimate on CC */
	741	end_shift = prog->check_end_shift;
	742
	743	/* end shift should be non negative here */
	744	}
	745
	746	#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
	747	if (end_shift < 0)
	748	Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
	749	(IV)end_shift, RX_PRECOMP(prog));
	750	#endif
	751
	752	restart:
	753	/* Find a possible match in the region s..strend by looking for
	754	the "check" substring in the region corrected by start/end_shift. */
	755
	756	{
	757	I32 srch_start_shift = start_shift;
	758	I32 srch_end_shift = end_shift;
	759	U8* start_point;
	760	U8* end_point;
	761	if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
	762	srch_end_shift -= ((strbeg - s) - srch_start_shift);
	763	srch_start_shift = strbeg - s;
	764	}
	765	DEBUG_OPTIMISE_MORE_r({
	766	PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
	767	(IV)prog->check_offset_min,
	768	(IV)srch_start_shift,
	769	(IV)srch_end_shift,
	770	(IV)prog->check_end_shift);
	771	});
	772
	773	if (prog->extflags & RXf_CANY_SEEN) {
	774	start_point= (U8*)(s + srch_start_shift);
	775	end_point= (U8*)(strend - srch_end_shift);
	776	} else {
	777	start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
	778	end_point= HOP3(strend, -srch_end_shift, strbeg);
	779	}
	780	DEBUG_OPTIMISE_MORE_r({
	781	PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
	782	(int)(end_point - start_point),
	783	(int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
	784	start_point);
	785	});
	786
	787	s = fbm_instr( start_point, end_point,
	788	check, multiline ? FBMrf_MULTILINE : 0);
	789	}
	790	/* Update the count-of-usability, remove useless subpatterns,
	791	unshift s. */
	792
	793	DEBUG_EXECUTE_r({
	794	RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
	795	SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
	796	PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
	797	(s ? "Found" : "Did not find"),
	798	(check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
	799	? "anchored" : "floating"),
	800	quoted,
	801	RE_SV_TAIL(check),
	802	(s ? " at offset " : "...\n") );
	803	});
	804
	805	if (!s)
	806	goto fail_finish;
	807	/* Finish the diagnostic message */
	808	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
	809
	810	/* XXX dmq: first branch is for positive lookbehind...
	811	Our check string is offset from the beginning of the pattern.
	812	So we need to do any stclass tests offset forward from that
	813	point. I think. :-(
	814	*/
	815
	816
	817
	818	check_at=s;
	819
	820
	821	/* Got a candidate. Check MBOL anchoring, and the other substr.
	822	Start with the other substr.
	823	XXXX no SCREAM optimization yet - and a very coarse implementation
	824	XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
	825	always match. Probably should be marked during compile...
	826	Probably it is right to do no SCREAM here...
	827	*/
	828
	829	if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
	830	: (prog->float_substr && prog->anchored_substr))
	831	{
	832	/* Take into account the "other" substring. */
	833	/* XXXX May be hopelessly wrong for UTF... */
	834	if (!other_last)
	835	other_last = strpos;
	836	if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
	837	do_other_anchored:
	838	{
	839	char * const last = HOP3c(s, -start_shift, strbeg);
	840	char last1, last2;
	841	char * const saved_s = s;
	842	SV* must;
	843
	844	t = s - prog->check_offset_max;
	845	if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
	846	&& (!utf8_target
	847	\|\| ((t = (char)reghopmaybe3((U8)s, -(prog->check_offset_max), (U8*)strpos))
	848	&& t > strpos)))
	849	NOOP;
	850	else
	851	t = strpos;
	852	t = HOP3c(t, prog->anchored_offset, strend);
	853	if (t < other_last) /* These positions already checked */
	854	t = other_last;
	855	last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
	856	if (last < last1)
	857	last1 = last;
	858	/* XXXX It is not documented what units *_offsets are in.
	859	We assume bytes, but this is clearly wrong.
	860	Meaning this code needs to be carefully reviewed for errors.
	861	dmq.
	862	*/
	863
	864	/* On end-of-str: see comment below. */
	865	must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
	866	if (must == &PL_sv_undef) {
	867	s = (char*)NULL;
	868	DEBUG_r(must = prog->anchored_utf8); /* for debug */
	869	}
	870	else
	871	s = fbm_instr(
	872	(unsigned char*)t,
	873	HOP3(HOP3(last1, prog->anchored_offset, strend)
	874	+ SvCUR(must), -(SvTAIL(must)!=0), strbeg),
	875	must,
	876	multiline ? FBMrf_MULTILINE : 0
	877	);
	878	DEBUG_EXECUTE_r({
	879	RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
	880	SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
	881	PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
	882	(s ? "Found" : "Contradicts"),
	883	quoted, RE_SV_TAIL(must));
	884	});
	885
	886
	887	if (!s) {
	888	if (last1 >= last2) {
	889	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	890	", giving up...\n"));
	891	goto fail_finish;
	892	}
	893	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	894	", trying floating at offset %ld...\n",
	895	(long)(HOP3c(saved_s, 1, strend) - i_strpos)));
	896	other_last = HOP3c(last1, prog->anchored_offset+1, strend);
	897	s = HOP3c(last, 1, strend);
	898	goto restart;
	899	}
	900	else {
	901	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
	902	(long)(s - i_strpos)));
	903	t = HOP3c(s, -prog->anchored_offset, strbeg);
	904	other_last = HOP3c(s, 1, strend);
	905	s = saved_s;
	906	if (t == strpos)
	907	goto try_at_start;
	908	goto try_at_offset;
	909	}
	910	}
	911	}
	912	else { /* Take into account the floating substring. */
	913	char last, last1;
	914	char * const saved_s = s;
	915	SV* must;
	916
	917	t = HOP3c(s, -start_shift, strbeg);
	918	last1 = last =
	919	HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
	920	if (CHR_DIST((U8)last, (U8)t) > prog->float_max_offset)
	921	last = HOP3c(t, prog->float_max_offset, strend);
	922	s = HOP3c(t, prog->float_min_offset, strend);
	923	if (s < other_last)
	924	s = other_last;
	925	/* XXXX It is not documented what units _offsets are in. Assume bytes. /
	926	must = utf8_target ? prog->float_utf8 : prog->float_substr;
	927	/* fbm_instr() takes into account exact value of end-of-str
	928	if the check is SvTAIL(ed). Since false positives are OK,
	929	and end-of-str is not later than strend we are OK. */
	930	if (must == &PL_sv_undef) {
	931	s = (char*)NULL;
	932	DEBUG_r(must = prog->float_utf8); /* for debug message */
	933	}
	934	else
	935	s = fbm_instr((unsigned char*)s,
	936	(unsigned char*)last + SvCUR(must)
	937	- (SvTAIL(must)!=0),
	938	must, multiline ? FBMrf_MULTILINE : 0);
	939	DEBUG_EXECUTE_r({
	940	RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
	941	SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
	942	PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
	943	(s ? "Found" : "Contradicts"),
	944	quoted, RE_SV_TAIL(must));
	945	});
	946	if (!s) {
	947	if (last1 == last) {
	948	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	949	", giving up...\n"));
	950	goto fail_finish;
	951	}
	952	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	953	", trying anchored starting at offset %ld...\n",
	954	(long)(saved_s + 1 - i_strpos)));
	955	other_last = last;
	956	s = HOP3c(t, 1, strend);
	957	goto restart;
	958	}
	959	else {
	960	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
	961	(long)(s - i_strpos)));
	962	other_last = s; /* Fix this later. --Hugo */
	963	s = saved_s;
	964	if (t == strpos)
	965	goto try_at_start;
	966	goto try_at_offset;
	967	}
	968	}
	969	}
	970
	971
	972	t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
	973
	974	DEBUG_OPTIMISE_MORE_r(
	975	PerlIO_printf(Perl_debug_log,
	976	"Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
	977	(IV)prog->check_offset_min,
	978	(IV)prog->check_offset_max,
	979	(IV)(s-strpos),
	980	(IV)(t-strpos),
	981	(IV)(t-s),
	982	(IV)(strend-strpos)
	983	)
	984	);
	985
	986	if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
	987	&& (!utf8_target
	988	\|\| ((t = (char)reghopmaybe3((U8)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
	989	&& t > strpos)))
	990	{
	991	/* Fixed substring is found far enough so that the match
	992	cannot start at strpos. */
	993	try_at_offset:
	994	if (ml_anch && t[-1] != '\n') {
	995	/* Eventually fbm_*() should handle this, but often
	996	anchored_offset is not 0, so this check will not be wasted. */
	997	/* XXXX In the code below we prefer to look for "^" even in
	998	presence of anchored substrings. And we search even
	999	beyond the found float position. These pessimizations
	1000	are historical artefacts only. */
	1001	find_anchor:
	1002	while (t < strend - prog->minlen) {
	1003	if (*t == '\n') {
	1004	if (t < check_at - prog->check_offset_min) {
	1005	if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
	1006	/* Since we moved from the found position,
	1007	we definitely contradict the found anchored
	1008	substr. Due to the above check we do not
	1009	contradict "check" substr.
	1010	Thus we can arrive here only if check substr
	1011	is float. Redo checking for "other"=="fixed".
	1012	*/
	1013	strpos = t + 1;
	1014	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
	1015	PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
	1016	goto do_other_anchored;
	1017	}
	1018	/* We don't contradict the found floating substring. */
	1019	/* XXXX Why not check for STCLASS? */
	1020	s = t + 1;
	1021	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
	1022	PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
	1023	goto set_useful;
	1024	}
	1025	/* Position contradicts check-string */
	1026	/* XXXX probably better to look for check-string
	1027	than for "\n", so one should lower the limit for t? */
	1028	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
	1029	PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
	1030	other_last = strpos = s = t + 1;
	1031	goto restart;
	1032	}
	1033	t++;
	1034	}
	1035	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
	1036	PL_colors[0], PL_colors[1]));
	1037	goto fail_finish;
	1038	}
	1039	else {
	1040	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
	1041	PL_colors[0], PL_colors[1]));
	1042	}
	1043	s = t;
	1044	set_useful:
	1045	++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
	1046	}
	1047	else {
	1048	/* The found string does not prohibit matching at strpos,
	1049	- no optimization of calling REx engine can be performed,
	1050	unless it was an MBOL and we are not after MBOL,
	1051	or a future STCLASS check will fail this. */
	1052	try_at_start:
	1053	/* Even in this situation we may use MBOL flag if strpos is offset
	1054	wrt the start of the string. */
	1055	if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
	1056	/* May be due to an implicit anchor of m{.foo} /
	1057	&& !(prog->intflags & PREGf_IMPLICIT))
	1058	{
	1059	t = strpos;
	1060	goto find_anchor;
	1061	}
	1062	DEBUG_EXECUTE_r( if (ml_anch)
	1063	PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
	1064	(long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
	1065	);
	1066	success_at_start:
	1067	if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
	1068	&& (utf8_target ? (
	1069	prog->check_utf8 /* Could be deleted already */
	1070	&& --BmUSEFUL(prog->check_utf8) < 0
	1071	&& (prog->check_utf8 == prog->float_utf8)
	1072	) : (
	1073	prog->check_substr /* Could be deleted already */
	1074	&& --BmUSEFUL(prog->check_substr) < 0
	1075	&& (prog->check_substr == prog->float_substr)
	1076	)))
	1077	{
	1078	/* If flags & SOMETHING - do not do it many times on the same match */
	1079	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
	1080	/* XXX Does the destruction order has to change with utf8_target? */
	1081	SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
	1082	SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
	1083	prog->check_substr = prog->check_utf8 = NULL; /* disable */
	1084	prog->float_substr = prog->float_utf8 = NULL; /* clear */
	1085	check = NULL; /* abort */
	1086	s = strpos;
	1087	/* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
	1088	see http://bugs.activestate.com/show_bug.cgi?id=87173 */
	1089	if (prog->intflags & PREGf_IMPLICIT)
	1090	prog->extflags &= ~RXf_ANCH_MBOL;
	1091	/* XXXX This is a remnant of the old implementation. It
	1092	looks wasteful, since now INTUIT can use many
	1093	other heuristics. */
	1094	prog->extflags &= ~RXf_USE_INTUIT;
	1095	/* XXXX What other flags might need to be cleared in this branch? */
	1096	}
	1097	else
	1098	s = strpos;
	1099	}
	1100
	1101	/* Last resort... */
	1102	/* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
	1103	/* trie stclasses are too expensive to use here, we are better off to
	1104	leave it to regmatch itself */
	1105	if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
	1106	/* minlen == 0 is possible if regstclass is \b or \B,
	1107	and the fixed substr is ''$.
	1108	Since minlen is already taken into account, s+1 is before strend;
	1109	accidentally, minlen >= 1 guaranties no false positives at s + 1
	1110	even for \b or \B. But (minlen? 1 : 0) below assumes that
	1111	regstclass does not come from lookahead... */
	1112	/* If regstclass takes bytelength more than 1: If charlength==1, OK.
	1113	This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
	1114	const U8* const str = (U8*)STRING(progi->regstclass);
	1115	const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
	1116	? CHR_DIST(str+STR_LEN(progi->regstclass), str)
	1117	: 1);
	1118	char * endpos;
	1119	if (prog->anchored_substr \|\| prog->anchored_utf8 \|\| ml_anch)
	1120	endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
	1121	else if (prog->float_substr \|\| prog->float_utf8)
	1122	endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
	1123	else
	1124	endpos= strend;
	1125
	1126	if (checked_upto < s)
	1127	checked_upto = s;
	1128	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
	1129	(IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
	1130
	1131	t = s;
	1132	s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
	1133	reginfo);
	1134	if (s) {
	1135	checked_upto = s;
	1136	} else {
	1137	#ifdef DEBUGGING
	1138	const char *what = NULL;
	1139	#endif
	1140	if (endpos == strend) {
	1141	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1142	"Could not match STCLASS...\n") );
	1143	goto fail;
	1144	}
	1145	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1146	"This position contradicts STCLASS...\n") );
	1147	if ((prog->extflags & RXf_ANCH) && !ml_anch)
	1148	goto fail;
	1149	checked_upto = HOPBACKc(endpos, start_shift);
	1150	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
	1151	(IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
	1152	/* Contradict one of substrings */
	1153	if (prog->anchored_substr \|\| prog->anchored_utf8) {
	1154	if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
	1155	DEBUG_EXECUTE_r( what = "anchored" );
	1156	hop_and_restart:
	1157	s = HOP3c(t, 1, strend);
	1158	if (s + start_shift + end_shift > strend) {
	1159	/* XXXX Should be taken into account earlier? */
	1160	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1161	"Could not match STCLASS...\n") );
	1162	goto fail;
	1163	}
	1164	if (!check)
	1165	goto giveup;
	1166	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1167	"Looking for %s substr starting at offset %ld...\n",
	1168	what, (long)(s + start_shift - i_strpos)) );
	1169	goto restart;
	1170	}
	1171	/* Have both, check_string is floating */
	1172	if (t + start_shift >= check_at) /* Contradicts floating=check */
	1173	goto retry_floating_check;
	1174	/* Recheck anchored substring, but not floating... */
	1175	s = check_at;
	1176	if (!check)
	1177	goto giveup;
	1178	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1179	"Looking for anchored substr starting at offset %ld...\n",
	1180	(long)(other_last - i_strpos)) );
	1181	goto do_other_anchored;
	1182	}
	1183	/* Another way we could have checked stclass at the
	1184	current position only: */
	1185	if (ml_anch) {
	1186	s = t = t + 1;
	1187	if (!check)
	1188	goto giveup;
	1189	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	1190	"Looking for /%s^%s/m starting at offset %ld...\n",
	1191	PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
	1192	goto try_at_offset;
	1193	}
	1194	if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
	1195	goto fail;
	1196	/* Check is floating substring. */
	1197	retry_floating_check:
	1198	t = check_at - start_shift;
	1199	DEBUG_EXECUTE_r( what = "floating" );
	1200	goto hop_and_restart;
	1201	}
	1202	if (t != s) {
	1203	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	1204	"By STCLASS: moving %ld --> %ld\n",
	1205	(long)(t - i_strpos), (long)(s - i_strpos))
	1206	);
	1207	}
	1208	else {
	1209	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	1210	"Does not contradict STCLASS...\n");
	1211	);
	1212	}
	1213	}
	1214	giveup:
	1215	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
	1216	PL_colors[4], (check ? "Guessed" : "Giving up"),
	1217	PL_colors[5], (long)(s - i_strpos)) );
	1218	return s;
	1219
	1220	fail_finish: /* Substring not found */
	1221	if (prog->check_substr \|\| prog->check_utf8) /* could be removed already */
	1222	BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
	1223	fail:
	1224	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
	1225	PL_colors[4], PL_colors[5]));
	1226	return NULL;
	1227	}
	1228
	1229	#define DECL_TRIE_TYPE(scan) \
	1230	const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
	1231	trie_type = ((scan->flags == EXACT) \
	1232	? (utf8_target ? trie_utf8 : trie_plain) \
	1233	: (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
	1234
	1235	#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
	1236	STMT_START { \
	1237	STRLEN skiplen; \
	1238	switch (trie_type) { \
	1239	case trie_utf8_fold: \
	1240	if ( foldlen>0 ) { \
	1241	uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
	1242	foldlen -= len; \
	1243	uscan += len; \
	1244	len=0; \
	1245	} else { \
	1246	uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
	1247	len = UTF8SKIP(uc); \
	1248	skiplen = UNISKIP( uvc ); \
	1249	foldlen -= skiplen; \
	1250	uscan = foldbuf + skiplen; \
	1251	} \
	1252	break; \
	1253	case trie_latin_utf8_fold: \
	1254	if ( foldlen>0 ) { \
	1255	uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
	1256	foldlen -= len; \
	1257	uscan += len; \
	1258	len=0; \
	1259	} else { \
	1260	len = 1; \
	1261	uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
	1262	skiplen = UNISKIP( uvc ); \
	1263	foldlen -= skiplen; \
	1264	uscan = foldbuf + skiplen; \
	1265	} \
	1266	break; \
	1267	case trie_utf8: \
	1268	uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
	1269	break; \
	1270	case trie_plain: \
	1271	uvc = (UV)*uc; \
	1272	len = 1; \
	1273	} \
	1274	if (uvc < 256) { \
	1275	charid = trie->charmap[ uvc ]; \
	1276	} \
	1277	else { \
	1278	charid = 0; \
	1279	if (widecharmap) { \
	1280	SV** const svpp = hv_fetch(widecharmap, \
	1281	(char*)&uvc, sizeof(UV), 0); \
	1282	if (svpp) \
	1283	charid = (U16)SvIV(*svpp); \
	1284	} \
	1285	} \
	1286	} STMT_END
	1287
	1288	#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
	1289	STMT_START { \
	1290	while (s <= e) { \
	1291	if ( (CoNd) \
	1292	&& (ln == 1 \|\| folder(s, pat_string, ln)) \
	1293	&& (reginfo->intuit \|\| regtry(reginfo, &s)) )\
	1294	goto got_it; \
	1295	s++; \
	1296	} \
	1297	} STMT_END
	1298
	1299	#define REXEC_FBC_UTF8_SCAN(CoDe) \
	1300	STMT_START { \
	1301	while (s < strend) { \
	1302	CoDe \
	1303	s += UTF8SKIP(s); \
	1304	} \
	1305	} STMT_END
	1306
	1307	#define REXEC_FBC_SCAN(CoDe) \
	1308	STMT_START { \
	1309	while (s < strend) { \
	1310	CoDe \
	1311	s++; \
	1312	} \
	1313	} STMT_END
	1314
	1315	#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
	1316	REXEC_FBC_UTF8_SCAN( \
	1317	if (CoNd) { \
	1318	if (tmp && (reginfo->intuit \|\| regtry(reginfo, &s))) \
	1319	goto got_it; \
	1320	else \
	1321	tmp = doevery; \
	1322	} \
	1323	else \
	1324	tmp = 1; \
	1325	)
	1326
	1327	#define REXEC_FBC_CLASS_SCAN(CoNd) \
	1328	REXEC_FBC_SCAN( \
	1329	if (CoNd) { \
	1330	if (tmp && (reginfo->intuit \|\| regtry(reginfo, &s))) \
	1331	goto got_it; \
	1332	else \
	1333	tmp = doevery; \
	1334	} \
	1335	else \
	1336	tmp = 1; \
	1337	)
	1338
	1339	#define REXEC_FBC_TRYIT \
	1340	if ((reginfo->intuit \|\| regtry(reginfo, &s))) \
	1341	goto got_it
	1342
	1343	#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
	1344	if (utf8_target) { \
	1345	REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
	1346	} \
	1347	else { \
	1348	REXEC_FBC_CLASS_SCAN(CoNd); \
	1349	}
	1350
	1351	#define DUMP_EXEC_POS(li,s,doutf8) \
	1352	dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
	1353	startpos, doutf8)
	1354
	1355
	1356	#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
	1357	tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
	1358	tmp = TEST_NON_UTF8(tmp); \
	1359	REXEC_FBC_UTF8_SCAN( \
	1360	if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
	1361	tmp = !tmp; \
	1362	IF_SUCCESS; \
	1363	} \
	1364	else { \
	1365	IF_FAIL; \
	1366	} \
	1367	); \
	1368
	1369	#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
	1370	if (s == reginfo->strbeg) { \
	1371	tmp = '\n'; \
	1372	} \
	1373	else { \
	1374	U8 * const r = reghop3((U8)s, -1, (U8)reginfo->strbeg); \
	1375	tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
	1376	} \
	1377	tmp = TeSt1_UtF8; \
	1378	LOAD_UTF8_CHARCLASS_ALNUM(); \
	1379	REXEC_FBC_UTF8_SCAN( \
	1380	if (tmp == ! (TeSt2_UtF8)) { \
	1381	tmp = !tmp; \
	1382	IF_SUCCESS; \
	1383	} \
	1384	else { \
	1385	IF_FAIL; \
	1386	} \
	1387	); \
	1388
	1389	/* The only difference between the BOUND and NBOUND cases is that
	1390	* REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
	1391	* NBOUND. This is accomplished by passing it in either the if or else clause,
	1392	* with the other one being empty */
	1393	#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
	1394	FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
	1395
	1396	#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
	1397	FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
	1398
	1399	#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
	1400	FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
	1401
	1402	#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
	1403	FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
	1404
	1405
	1406	/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
	1407	* be passed in completely with the variable name being tested, which isn't
	1408	* such a clean interface, but this is easier to read than it was before. We
	1409	* are looking for the boundary (or non-boundary between a word and non-word
	1410	* character. The utf8 and non-utf8 cases have the same logic, but the details
	1411	* must be different. Find the "wordness" of the character just prior to this
	1412	* one, and compare it with the wordness of this one. If they differ, we have
	1413	* a boundary. At the beginning of the string, pretend that the previous
	1414	* character was a new-line */
	1415	#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
	1416	if (utf8_target) { \
	1417	UTF8_CODE \
	1418	} \
	1419	else { /* Not utf8 */ \
	1420	tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
	1421	tmp = TEST_NON_UTF8(tmp); \
	1422	REXEC_FBC_SCAN( \
	1423	if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
	1424	tmp = !tmp; \
	1425	IF_SUCCESS; \
	1426	} \
	1427	else { \
	1428	IF_FAIL; \
	1429	} \
	1430	); \
	1431	} \
	1432	if ((!prog->minlen && tmp) && (reginfo->intuit \|\| regtry(reginfo, &s))) \
	1433	goto got_it;
	1434
	1435	/* We know what class REx starts with. Try to find this position... */
	1436	/* if reginfo->intuit, its a dryrun */
	1437	/* annoyingly all the vars in this routine have different names from their counterparts
	1438	in regmatch. /grrr */
	1439
	1440	STATIC char *
	1441	S_find_byclass(pTHX_ regexp * prog, const regnode c, char s,
	1442	const char strend, regmatch_info reginfo)
	1443	{
	1444	dVAR;
	1445	const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
	1446	char pat_string; / The pattern's exactish string */
	1447	char pat_end; / ptr to end char of pat_string */
	1448	re_fold_t folder; /* Function for computing non-utf8 folds */
	1449	const U8 fold_array; / array for folding ords < 256 */
	1450	STRLEN ln;
	1451	STRLEN lnc;
	1452	U8 c1;
	1453	U8 c2;
	1454	char *e;
	1455	I32 tmp = 1; /* Scratch variable? */
	1456	const bool utf8_target = reginfo->is_utf8_target;
	1457	UV utf8_fold_flags = 0;
	1458	const bool is_utf8_pat = reginfo->is_utf8_pat;
	1459	bool to_complement = FALSE; /* Invert the result? Taking the xor of this
	1460	with a result inverts that result, as 0^1 =
	1461	1 and 1^1 = 0 */
	1462	_char_class_number classnum;
	1463
	1464	RXi_GET_DECL(prog,progi);
	1465
	1466	PERL_ARGS_ASSERT_FIND_BYCLASS;
	1467
	1468	/* We know what class it must start with. */
	1469	switch (OP(c)) {
	1470	case ANYOF:
	1471	case ANYOF_SYNTHETIC:
	1472	case ANYOF_WARN_SUPER:
	1473	if (utf8_target) {
	1474	REXEC_FBC_UTF8_CLASS_SCAN(
	1475	reginclass(prog, c, (U8*)s, utf8_target));
	1476	}
	1477	else {
	1478	REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
	1479	}
	1480	break;
	1481	case CANY:
	1482	REXEC_FBC_SCAN(
	1483	if (tmp && (reginfo->intuit \|\| regtry(reginfo, &s)))
	1484	goto got_it;
	1485	else
	1486	tmp = doevery;
	1487	);
	1488	break;
	1489
	1490	case EXACTFA:
	1491	if (is_utf8_pat \|\| utf8_target) {
	1492	utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
	1493	goto do_exactf_utf8;
	1494	}
	1495	fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
	1496	folder = foldEQ_latin1; /* /a, except the sharp s one which */
	1497	goto do_exactf_non_utf8; /* isn't dealt with by these */
	1498
	1499	case EXACTF:
	1500	if (utf8_target) {
	1501
	1502	/* regcomp.c already folded this if pattern is in UTF-8 */
	1503	utf8_fold_flags = 0;
	1504	goto do_exactf_utf8;
	1505	}
	1506	fold_array = PL_fold;
	1507	folder = foldEQ;
	1508	goto do_exactf_non_utf8;
	1509
	1510	case EXACTFL:
	1511	if (is_utf8_pat \|\| utf8_target) {
	1512	utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
	1513	goto do_exactf_utf8;
	1514	}
	1515	fold_array = PL_fold_locale;
	1516	folder = foldEQ_locale;
	1517	goto do_exactf_non_utf8;
	1518
	1519	case EXACTFU_SS:
	1520	if (is_utf8_pat) {
	1521	utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
	1522	}
	1523	goto do_exactf_utf8;
	1524
	1525	case EXACTFU_TRICKYFOLD:
	1526	case EXACTFU:
	1527	if (is_utf8_pat \|\| utf8_target) {
	1528	utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
	1529	goto do_exactf_utf8;
	1530	}
	1531
	1532	/* Any 'ss' in the pattern should have been replaced by regcomp,
	1533	* so we don't have to worry here about this single special case
	1534	* in the Latin1 range */
	1535	fold_array = PL_fold_latin1;
	1536	folder = foldEQ_latin1;
	1537
	1538	/* FALL THROUGH */
	1539
	1540	do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
	1541	are no glitches with fold-length differences
	1542	between the target string and pattern */
	1543
	1544	/* The idea in the non-utf8 EXACTF* cases is to first find the
	1545	* first character of the EXACTF* node and then, if necessary,
	1546	* case-insensitively compare the full text of the node. c1 is the
	1547	* first character. c2 is its fold. This logic will not work for
	1548	* Unicode semantics and the german sharp ss, which hence should
	1549	* not be compiled into a node that gets here. */
	1550	pat_string = STRING(c);
	1551	ln = STR_LEN(c); /* length to match in octets/bytes */
	1552
	1553	/* We know that we have to match at least 'ln' bytes (which is the
	1554	* same as characters, since not utf8). If we have to match 3
	1555	* characters, and there are only 2 availabe, we know without
	1556	* trying that it will fail; so don't start a match past the
	1557	* required minimum number from the far end */
	1558	e = HOP3c(strend, -((I32)ln), s);
	1559
	1560	if (reginfo->intuit && e < s) {
	1561	e = s; /* Due to minlen logic of intuit() */
	1562	}
	1563
	1564	c1 = *pat_string;
	1565	c2 = fold_array[c1];
	1566	if (c1 == c2) { /* If char and fold are the same */
	1567	REXEC_FBC_EXACTISH_SCAN((U8)s == c1);
	1568	}
	1569	else {
	1570	REXEC_FBC_EXACTISH_SCAN((U8)s == c1 \|\| (U8)s == c2);
	1571	}
	1572	break;
	1573
	1574	do_exactf_utf8:
	1575	{
	1576	unsigned expansion;
	1577
	1578	/* If one of the operands is in utf8, we can't use the simpler folding
	1579	* above, due to the fact that many different characters can have the
	1580	* same fold, or portion of a fold, or different- length fold */
	1581	pat_string = STRING(c);
	1582	ln = STR_LEN(c); /* length to match in octets/bytes */
	1583	pat_end = pat_string + ln;
	1584	lnc = is_utf8_pat /* length to match in characters */
	1585	? utf8_length((U8 ) pat_string, (U8 ) pat_end)
	1586	: ln;
	1587
	1588	/* We have 'lnc' characters to match in the pattern, but because of
	1589	* multi-character folding, each character in the target can match
	1590	* up to 3 characters (Unicode guarantees it will never exceed
	1591	* this) if it is utf8-encoded; and up to 2 if not (based on the
	1592	* fact that the Latin 1 folds are already determined, and the
	1593	* only multi-char fold in that range is the sharp-s folding to
	1594	* 'ss'. Thus, a pattern character can match as little as 1/3 of a
	1595	* string character. Adjust lnc accordingly, rounding up, so that
	1596	* if we need to match at least 4+1/3 chars, that really is 5. */
	1597	expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
	1598	lnc = (lnc + expansion - 1) / expansion;
	1599
	1600	/* As in the non-UTF8 case, if we have to match 3 characters, and
	1601	* only 2 are left, it's guaranteed to fail, so don't start a
	1602	* match that would require us to go beyond the end of the string
	1603	*/
	1604	e = HOP3c(strend, -((I32)lnc), s);
	1605
	1606	if (reginfo->intuit && e < s) {
	1607	e = s; /* Due to minlen logic of intuit() */
	1608	}
	1609
	1610	/* XXX Note that we could recalculate e to stop the loop earlier,
	1611	* as the worst case expansion above will rarely be met, and as we
	1612	* go along we would usually find that e moves further to the left.
	1613	* This would happen only after we reached the point in the loop
	1614	* where if there were no expansion we should fail. Unclear if
	1615	* worth the expense */
	1616
	1617	while (s <= e) {
	1618	char my_strend= (char )strend;
	1619	if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
	1620	pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
	1621	&& (reginfo->intuit \|\| regtry(reginfo, &s)) )
	1622	{
	1623	goto got_it;
	1624	}
	1625	s += (utf8_target) ? UTF8SKIP(s) : 1;
	1626	}
	1627	break;
	1628	}
	1629	case BOUNDL:
	1630	RXp_MATCH_TAINTED_on(prog);
	1631	FBC_BOUND(isWORDCHAR_LC,
	1632	isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
	1633	isWORDCHAR_LC_utf8((U8*)s));
	1634	break;
	1635	case NBOUNDL:
	1636	RXp_MATCH_TAINTED_on(prog);
	1637	FBC_NBOUND(isWORDCHAR_LC,
	1638	isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
	1639	isWORDCHAR_LC_utf8((U8*)s));
	1640	break;
	1641	case BOUND:
	1642	FBC_BOUND(isWORDCHAR,
	1643	isWORDCHAR_uni(tmp),
	1644	cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
	1645	break;
	1646	case BOUNDA:
	1647	FBC_BOUND_NOLOAD(isWORDCHAR_A,
	1648	isWORDCHAR_A(tmp),
	1649	isWORDCHAR_A((U8*)s));
	1650	break;
	1651	case NBOUND:
	1652	FBC_NBOUND(isWORDCHAR,
	1653	isWORDCHAR_uni(tmp),
	1654	cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
	1655	break;
	1656	case NBOUNDA:
	1657	FBC_NBOUND_NOLOAD(isWORDCHAR_A,
	1658	isWORDCHAR_A(tmp),
	1659	isWORDCHAR_A((U8*)s));
	1660	break;
	1661	case BOUNDU:
	1662	FBC_BOUND(isWORDCHAR_L1,
	1663	isWORDCHAR_uni(tmp),
	1664	cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
	1665	break;
	1666	case NBOUNDU:
	1667	FBC_NBOUND(isWORDCHAR_L1,
	1668	isWORDCHAR_uni(tmp),
	1669	cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
	1670	break;
	1671	case LNBREAK:
	1672	REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
	1673	is_LNBREAK_latin1_safe(s, strend)
	1674	);
	1675	break;
	1676
	1677	/* The argument to all the POSIX node types is the class number to pass to
	1678	* _generic_isCC() to build a mask for searching in PL_charclass[] */
	1679
	1680	case NPOSIXL:
	1681	to_complement = 1;
	1682	/* FALLTHROUGH */
	1683
	1684	case POSIXL:
	1685	RXp_MATCH_TAINTED_on(prog);
	1686	REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
	1687	to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
	1688	break;
	1689
	1690	case NPOSIXD:
	1691	to_complement = 1;
	1692	/* FALLTHROUGH */
	1693
	1694	case POSIXD:
	1695	if (utf8_target) {
	1696	goto posix_utf8;
	1697	}
	1698	goto posixa;
	1699
	1700	case NPOSIXA:
	1701	if (utf8_target) {
	1702	/* The complement of something that matches only ASCII matches all
	1703	* UTF-8 variant code points, plus everything in ASCII that isn't
	1704	* in the class */
	1705	REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
	1706	\|\| ! _generic_isCC_A(*s, FLAGS(c)));
	1707	break;
	1708	}
	1709
	1710	to_complement = 1;
	1711	/* FALLTHROUGH */
	1712
	1713	case POSIXA:
	1714	posixa:
	1715	/* Don't need to worry about utf8, as it can match only a single
	1716	* byte invariant character. */
	1717	REXEC_FBC_CLASS_SCAN(
	1718	to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
	1719	break;
	1720
	1721	case NPOSIXU:
	1722	to_complement = 1;
	1723	/* FALLTHROUGH */
	1724
	1725	case POSIXU:
	1726	if (! utf8_target) {
	1727	REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
	1728	FLAGS(c))));
	1729	}
	1730	else {
	1731
	1732	posix_utf8:
	1733	classnum = (_char_class_number) FLAGS(c);
	1734	if (classnum < _FIRST_NON_SWASH_CC) {
	1735	while (s < strend) {
	1736
	1737	/* We avoid loading in the swash as long as possible, but
	1738	* should we have to, we jump to a separate loop. This
	1739	* extra 'if' statement is what keeps this code from being
	1740	* just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
	1741	if (UTF8_IS_ABOVE_LATIN1(*s)) {
	1742	goto found_above_latin1;
	1743	}
	1744	if ((UTF8_IS_INVARIANT(*s)
	1745	&& to_complement ^ cBOOL(_generic_isCC((U8) *s,
	1746	classnum)))
	1747	\|\| (UTF8_IS_DOWNGRADEABLE_START(*s)
	1748	&& to_complement ^ cBOOL(
	1749	_generic_isCC(TWO_BYTE_UTF8_TO_UNI(s, (s + 1)),
	1750	classnum))))
	1751	{
	1752	if (tmp && (reginfo->intuit \|\| regtry(reginfo, &s)))
	1753	goto got_it;
	1754	else {
	1755	tmp = doevery;
	1756	}
	1757	}
	1758	else {
	1759	tmp = 1;
	1760	}
	1761	s += UTF8SKIP(s);
	1762	}
	1763	}
	1764	else switch (classnum) { /* These classes are implemented as
	1765	macros */
	1766	case _CC_ENUM_SPACE: /* XXX would require separate code if we
	1767	revert the change of \v matching this */
	1768	/* FALL THROUGH */
	1769
	1770	case _CC_ENUM_PSXSPC:
	1771	REXEC_FBC_UTF8_CLASS_SCAN(
	1772	to_complement ^ cBOOL(isSPACE_utf8(s)));
	1773	break;
	1774
	1775	case _CC_ENUM_BLANK:
	1776	REXEC_FBC_UTF8_CLASS_SCAN(
	1777	to_complement ^ cBOOL(isBLANK_utf8(s)));
	1778	break;
	1779
	1780	case _CC_ENUM_XDIGIT:
	1781	REXEC_FBC_UTF8_CLASS_SCAN(
	1782	to_complement ^ cBOOL(isXDIGIT_utf8(s)));
	1783	break;
	1784
	1785	case _CC_ENUM_VERTSPACE:
	1786	REXEC_FBC_UTF8_CLASS_SCAN(
	1787	to_complement ^ cBOOL(isVERTWS_utf8(s)));
	1788	break;
	1789
	1790	case _CC_ENUM_CNTRL:
	1791	REXEC_FBC_UTF8_CLASS_SCAN(
	1792	to_complement ^ cBOOL(isCNTRL_utf8(s)));
	1793	break;
	1794
	1795	default:
	1796	Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
	1797	assert(0); /* NOTREACHED */
	1798	}
	1799	}
	1800	break;
	1801
	1802	found_above_latin1: /* Here we have to load a swash to get the result
	1803	for the current code point */
	1804	if (! PL_utf8_swash_ptrs[classnum]) {
	1805	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	1806	PL_utf8_swash_ptrs[classnum] =
	1807	_core_swash_init("utf8", swash_property_names[classnum],
	1808	&PL_sv_undef, 1, 0, NULL, &flags);
	1809	}
	1810
	1811	/* This is a copy of the loop above for swash classes, though using the
	1812	* FBC macro instead of being expanded out. Since we've loaded the
	1813	* swash, we don't have to check for that each time through the loop */
	1814	REXEC_FBC_UTF8_CLASS_SCAN(
	1815	to_complement ^ cBOOL(_generic_utf8(
	1816	classnum,
	1817	s,
	1818	swash_fetch(PL_utf8_swash_ptrs[classnum],
	1819	(U8 *) s, TRUE))));
	1820	break;
	1821
	1822	case AHOCORASICKC:
	1823	case AHOCORASICK:
	1824	{
	1825	DECL_TRIE_TYPE(c);
	1826	/* what trie are we using right now */
	1827	reg_ac_data aho = (reg_ac_data)progi->data->data[ ARG( c ) ];
	1828	reg_trie_data trie = (reg_trie_data)progi->data->data[ aho->trie ];
	1829	HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
	1830
	1831	const char *last_start = strend - trie->minlen;
	1832	#ifdef DEBUGGING
	1833	const char *real_start = s;
	1834	#endif
	1835	STRLEN maxlen = trie->maxlen;
	1836	SV *sv_points;
	1837	U8 *points; / map of where we were in the input string
	1838	when reading a given char. For ASCII this
	1839	is unnecessary overhead as the relationship
	1840	is always 1:1, but for Unicode, especially
	1841	case folded Unicode this is not true. */
	1842	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	1843	U8 *bitmap=NULL;
	1844
	1845
	1846	GET_RE_DEBUG_FLAGS_DECL;
	1847
	1848	/* We can't just allocate points here. We need to wrap it in
	1849	* an SV so it gets freed properly if there is a croak while
	1850	* running the match */
	1851	ENTER;
	1852	SAVETMPS;
	1853	sv_points=newSV(maxlen * sizeof(U8 *));
	1854	SvCUR_set(sv_points,
	1855	maxlen * sizeof(U8 *));
	1856	SvPOK_on(sv_points);
	1857	sv_2mortal(sv_points);
	1858	points=(U8**)SvPV_nolen(sv_points );
	1859	if ( trie_type != trie_utf8_fold
	1860	&& (trie->bitmap \|\| OP(c)==AHOCORASICKC) )
	1861	{
	1862	if (trie->bitmap)
	1863	bitmap=(U8*)trie->bitmap;
	1864	else
	1865	bitmap=(U8*)ANYOF_BITMAP(c);
	1866	}
	1867	/* this is the Aho-Corasick algorithm modified a touch
	1868	to include special handling for long "unknown char" sequences.
	1869	The basic idea being that we use AC as long as we are dealing
	1870	with a possible matching char, when we encounter an unknown char
	1871	(and we have not encountered an accepting state) we scan forward
	1872	until we find a legal starting char.
	1873	AC matching is basically that of trie matching, except that when
	1874	we encounter a failing transition, we fall back to the current
	1875	states "fail state", and try the current char again, a process
	1876	we repeat until we reach the root state, state 1, or a legal
	1877	transition. If we fail on the root state then we can either
	1878	terminate if we have reached an accepting state previously, or
	1879	restart the entire process from the beginning if we have not.
	1880
	1881	*/
	1882	while (s <= last_start) {
	1883	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	1884	U8 uc = (U8)s;
	1885	U16 charid = 0;
	1886	U32 base = 1;
	1887	U32 state = 1;
	1888	UV uvc = 0;
	1889	STRLEN len = 0;
	1890	STRLEN foldlen = 0;
	1891	U8 uscan = (U8)NULL;
	1892	U8 *leftmost = NULL;
	1893	#ifdef DEBUGGING
	1894	U32 accepted_word= 0;
	1895	#endif
	1896	U32 pointpos = 0;
	1897
	1898	while ( state && uc <= (U8*)strend ) {
	1899	int failed=0;
	1900	U32 word = aho->states[ state ].wordnum;
	1901
	1902	if( state==1 ) {
	1903	if ( bitmap ) {
	1904	DEBUG_TRIE_EXECUTE_r(
	1905	if ( uc <= (U8)last_start && !BITMAP_TEST(bitmap,uc) ) {
	1906	dump_exec_pos( (char *)uc, c, strend, real_start,
	1907	(char *)uc, utf8_target );
	1908	PerlIO_printf( Perl_debug_log,
	1909	" Scanning for legal start char...\n");
	1910	}
	1911	);
	1912	if (utf8_target) {
	1913	while ( uc <= (U8)last_start && !BITMAP_TEST(bitmap,uc) ) {
	1914	uc += UTF8SKIP(uc);
	1915	}
	1916	} else {
	1917	while ( uc <= (U8)last_start && !BITMAP_TEST(bitmap,uc) ) {
	1918	uc++;
	1919	}
	1920	}
	1921	s= (char *)uc;
	1922	}
	1923	if (uc >(U8*)last_start) break;
	1924	}
	1925
	1926	if ( word ) {
	1927	U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
	1928	if (!leftmost \|\| lpos < leftmost) {
	1929	DEBUG_r(accepted_word=word);
	1930	leftmost= lpos;
	1931	}
	1932	if (base==0) break;
	1933
	1934	}
	1935	points[pointpos++ % maxlen]= uc;
	1936	if (foldlen \|\| uc < (U8*)strend) {
	1937	REXEC_TRIE_READ_CHAR(trie_type, trie,
	1938	widecharmap, uc,
	1939	uscan, len, uvc, charid, foldlen,
	1940	foldbuf, uniflags);
	1941	DEBUG_TRIE_EXECUTE_r({
	1942	dump_exec_pos( (char *)uc, c, strend,
	1943	real_start, s, utf8_target);
	1944	PerlIO_printf(Perl_debug_log,
	1945	" Charid:%3u CP:%4"UVxf" ",
	1946	charid, uvc);
	1947	});
	1948	}
	1949	else {
	1950	len = 0;
	1951	charid = 0;
	1952	}
	1953
	1954
	1955	do {
	1956	#ifdef DEBUGGING
	1957	word = aho->states[ state ].wordnum;
	1958	#endif
	1959	base = aho->states[ state ].trans.base;
	1960
	1961	DEBUG_TRIE_EXECUTE_r({
	1962	if (failed)
	1963	dump_exec_pos( (char *)uc, c, strend, real_start,
	1964	s, utf8_target );
	1965	PerlIO_printf( Perl_debug_log,
	1966	"%sState: %4"UVxf", word=%"UVxf,
	1967	failed ? " Fail transition to " : "",
	1968	(UV)state, (UV)word);
	1969	});
	1970	if ( base ) {
	1971	U32 tmp;
	1972	I32 offset;
	1973	if (charid &&
	1974	( ((offset = base + charid
	1975	- 1 - trie->uniquecharcount)) >= 0)
	1976	&& ((U32)offset < trie->lasttrans)
	1977	&& trie->trans[offset].check == state
	1978	&& (tmp=trie->trans[offset].next))
	1979	{
	1980	DEBUG_TRIE_EXECUTE_r(
	1981	PerlIO_printf( Perl_debug_log," - legal\n"));
	1982	state = tmp;
	1983	break;
	1984	}
	1985	else {
	1986	DEBUG_TRIE_EXECUTE_r(
	1987	PerlIO_printf( Perl_debug_log," - fail\n"));
	1988	failed = 1;
	1989	state = aho->fail[state];
	1990	}
	1991	}
	1992	else {
	1993	/* we must be accepting here */
	1994	DEBUG_TRIE_EXECUTE_r(
	1995	PerlIO_printf( Perl_debug_log," - accepting\n"));
	1996	failed = 1;
	1997	break;
	1998	}
	1999	} while(state);
	2000	uc += len;
	2001	if (failed) {
	2002	if (leftmost)
	2003	break;
	2004	if (!state) state = 1;
	2005	}
	2006	}
	2007	if ( aho->states[ state ].wordnum ) {
	2008	U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
	2009	if (!leftmost \|\| lpos < leftmost) {
	2010	DEBUG_r(accepted_word=aho->states[ state ].wordnum);
	2011	leftmost = lpos;
	2012	}
	2013	}
	2014	if (leftmost) {
	2015	s = (char*)leftmost;
	2016	DEBUG_TRIE_EXECUTE_r({
	2017	PerlIO_printf(
	2018	Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
	2019	(UV)accepted_word, (IV)(s - real_start)
	2020	);
	2021	});
	2022	if (reginfo->intuit \|\| regtry(reginfo, &s)) {
	2023	FREETMPS;
	2024	LEAVE;
	2025	goto got_it;
	2026	}
	2027	s = HOPc(s,1);
	2028	DEBUG_TRIE_EXECUTE_r({
	2029	PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
	2030	});
	2031	} else {
	2032	DEBUG_TRIE_EXECUTE_r(
	2033	PerlIO_printf( Perl_debug_log,"No match.\n"));
	2034	break;
	2035	}
	2036	}
	2037	FREETMPS;
	2038	LEAVE;
	2039	}
	2040	break;
	2041	default:
	2042	Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
	2043	break;
	2044	}
	2045	return 0;
	2046	got_it:
	2047	return s;
	2048	}
	2049
	2050
	2051	/*
	2052	- regexec_flags - match a regexp against a string
	2053	*/
	2054	I32
	2055	Perl_regexec_flags(pTHX_ REGEXP * const rx, char stringarg, char strend,
	2056	char strbeg, I32 minend, SV sv, void *data, U32 flags)
	2057	/* stringarg: the point in the string at which to begin matching */
	2058	/* strend: pointer to null at end of string */
	2059	/* strbeg: real beginning of string */
	2060	/* minend: end of match must be >= minend bytes after stringarg. */
	2061	/* sv: SV being matched: only used for utf8 flag, pos() etc; string
	2062	* itself is accessed via the pointers above */
	2063	/* data: May be used for some additional optimizations.
	2064	Currently its only used, with a U32 cast, for transmitting
	2065	the ganch offset when doing a /g match. This will change */
	2066	/* nosave: For optimizations. */
	2067
	2068	{
	2069	dVAR;
	2070	struct regexp *const prog = ReANY(rx);
	2071	char *s;
	2072	regnode *c;
	2073	char *startpos = stringarg;
	2074	I32 minlen; /* must match at least this many chars */
	2075	I32 dontbother = 0; /* how many characters not to try at end */
	2076	I32 end_shift = 0; /* Same for the end. / / CC */
	2077	I32 scream_pos = -1; /* Internal iterator of scream. */
	2078	char *scream_olds = NULL;
	2079	const bool utf8_target = cBOOL(DO_UTF8(sv));
	2080	I32 multiline;
	2081	RXi_GET_DECL(prog,progi);
	2082	regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
	2083	regmatch_info *const reginfo = &reginfo_buf;
	2084	regexp_paren_pair *swap = NULL;
	2085	I32 oldsave;
	2086	GET_RE_DEBUG_FLAGS_DECL;
	2087
	2088	PERL_ARGS_ASSERT_REGEXEC_FLAGS;
	2089	PERL_UNUSED_ARG(data);
	2090
	2091	/* Be paranoid... */
	2092	if (prog == NULL \|\| startpos == NULL) {
	2093	Perl_croak(aTHX_ "NULL regexp parameter");
	2094	return 0;
	2095	}
	2096
	2097	DEBUG_EXECUTE_r(
	2098	debug_start_match(rx, utf8_target, startpos, strend,
	2099	"Matching");
	2100	);
	2101
	2102
	2103	/* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
	2104	* which will call destuctors to reset PL_regmatch_state, free higher
	2105	* PL_regmatch_slabs, and clean up regmatch_info_aux and
	2106	* regmatch_info_aux_eval */
	2107
	2108	oldsave = PL_savestack_ix;
	2109
	2110	multiline = prog->extflags & RXf_PMf_MULTILINE;
	2111	minlen = prog->minlen;
	2112
	2113	if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
	2114	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	2115	"String too short [regexec_flags]...\n"));
	2116	goto phooey;
	2117	}
	2118
	2119	/* Check validity of program. */
	2120	if (UCHARAT(progi->program) != REG_MAGIC) {
	2121	Perl_croak(aTHX_ "corrupted regexp program");
	2122	}
	2123
	2124	RX_MATCH_TAINTED_off(rx);
	2125
	2126	reginfo->prog = rx; /* Yes, sorry that this is confusing. */
	2127	reginfo->intuit = 0;
	2128	reginfo->is_utf8_target = cBOOL(utf8_target);
	2129	reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
	2130	reginfo->warned = FALSE;
	2131	reginfo->strbeg = strbeg;
	2132	reginfo->sv = sv;
	2133	reginfo->poscache_maxiter = 0; /* not yet started a countdown */
	2134	reginfo->strend = strend;
	2135	/* see how far we have to get to not match where we matched before */
	2136	reginfo->till = startpos+minend;
	2137
	2138	/* reserve next 2 or 3 slots in PL_regmatch_state:
	2139	* slot N+0: may currently be in use: skip it
	2140	* slot N+1: use for regmatch_info_aux struct
	2141	* slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
	2142	* slot N+3: ready for use by regmatch()
	2143	*/
	2144
	2145	{
	2146	regmatch_state *old_regmatch_state;
	2147	regmatch_slab *old_regmatch_slab;
	2148	int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
	2149
	2150	/* on first ever match, allocate first slab */
	2151	if (!PL_regmatch_slab) {
	2152	Newx(PL_regmatch_slab, 1, regmatch_slab);
	2153	PL_regmatch_slab->prev = NULL;
	2154	PL_regmatch_slab->next = NULL;
	2155	PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
	2156	}
	2157
	2158	old_regmatch_state = PL_regmatch_state;
	2159	old_regmatch_slab = PL_regmatch_slab;
	2160
	2161	for (i=0; i <= max; i++) {
	2162	if (i == 1)
	2163	reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
	2164	else if (i ==2)
	2165	reginfo->info_aux_eval =
	2166	reginfo->info_aux->info_aux_eval =
	2167	&(PL_regmatch_state->u.info_aux_eval);
	2168
	2169	if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
	2170	PL_regmatch_state = S_push_slab(aTHX);
	2171	}
	2172
	2173	/* note initial PL_regmatch_state position; at end of match we'll
	2174	* pop back to there and free any higher slabs */
	2175
	2176	reginfo->info_aux->old_regmatch_state = old_regmatch_state;
	2177	reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
	2178	reginfo->info_aux->poscache = NULL;
	2179
	2180	SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
	2181
	2182	if ((prog->extflags & RXf_EVAL_SEEN))
	2183	S_setup_eval_state(aTHX_ reginfo);
	2184	else
	2185	reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
	2186	}
	2187
	2188	/* If there is a "must appear" string, look for it. */
	2189	s = startpos;
	2190
	2191	if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
	2192	MAGIC *mg;
	2193	if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
	2194	reginfo->ganch = startpos + prog->gofs;
	2195	DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
	2196	"GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
	2197	} else if (sv && (mg = mg_find_mglob(sv))
	2198	&& mg->mg_len >= 0) {
	2199	reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
	2200	DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
	2201	"GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
	2202
	2203	if (prog->extflags & RXf_ANCH_GPOS) {
	2204	if (s > reginfo->ganch)
	2205	goto phooey;
	2206	s = reginfo->ganch - prog->gofs;
	2207	DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
	2208	"GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
	2209	if (s < strbeg)
	2210	goto phooey;
	2211	}
	2212	}
	2213	else if (data) {
	2214	reginfo->ganch = strbeg + PTR2UV(data);
	2215	DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
	2216	"GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
	2217
	2218	} else { /* pos() not defined */
	2219	reginfo->ganch = strbeg;
	2220	DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
	2221	"GPOS: reginfo->ganch = strbeg\n"));
	2222	}
	2223	}
	2224	if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
	2225	/* We have to be careful. If the previous successful match
	2226	was from this regex we don't want a subsequent partially
	2227	successful match to clobber the old results.
	2228	So when we detect this possibility we add a swap buffer
	2229	to the re, and switch the buffer each match. If we fail,
	2230	we switch it back; otherwise we leave it swapped.
	2231	*/
	2232	swap = prog->offs;
	2233	/* do we need a save destructor here for eval dies? */
	2234	Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
	2235	DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
	2236	"rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
	2237	PTR2UV(prog),
	2238	PTR2UV(swap),
	2239	PTR2UV(prog->offs)
	2240	));
	2241	}
	2242	if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL \|\| prog->check_utf8 != NULL)) {
	2243	re_scream_pos_data d;
	2244
	2245	d.scream_olds = &scream_olds;
	2246	d.scream_pos = &scream_pos;
	2247	s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
	2248	if (!s) {
	2249	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
	2250	goto phooey; /* not present */
	2251	}
	2252	}
	2253
	2254
	2255
	2256	/* Simplest case: anchored match need be tried only once. */
	2257	/* [unless only anchor is BOL and multiline is set] */
	2258	if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
	2259	if (s == startpos && regtry(reginfo, &startpos))
	2260	goto got_it;
	2261	else if (multiline \|\| (prog->intflags & PREGf_IMPLICIT)
	2262	\|\| (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
	2263	{
	2264	char *end;
	2265
	2266	if (minlen)
	2267	dontbother = minlen - 1;
	2268	end = HOP3c(strend, -dontbother, strbeg) - 1;
	2269	/* for multiline we only have to try after newlines */
	2270	if (prog->check_substr \|\| prog->check_utf8) {
	2271	/* because of the goto we can not easily reuse the macros for bifurcating the
	2272	unicode/non-unicode match modes here like we do elsewhere - demerphq */
	2273	if (utf8_target) {
	2274	if (s == startpos)
	2275	goto after_try_utf8;
	2276	while (1) {
	2277	if (regtry(reginfo, &s)) {
	2278	goto got_it;
	2279	}
	2280	after_try_utf8:
	2281	if (s > end) {
	2282	goto phooey;
	2283	}
	2284	if (prog->extflags & RXf_USE_INTUIT) {
	2285	s = re_intuit_start(rx, sv, strbeg,
	2286	s + UTF8SKIP(s), strend, flags, NULL);
	2287	if (!s) {
	2288	goto phooey;
	2289	}
	2290	}
	2291	else {
	2292	s += UTF8SKIP(s);
	2293	}
	2294	}
	2295	} /* end search for check string in unicode */
	2296	else {
	2297	if (s == startpos) {
	2298	goto after_try_latin;
	2299	}
	2300	while (1) {
	2301	if (regtry(reginfo, &s)) {
	2302	goto got_it;
	2303	}
	2304	after_try_latin:
	2305	if (s > end) {
	2306	goto phooey;
	2307	}
	2308	if (prog->extflags & RXf_USE_INTUIT) {
	2309	s = re_intuit_start(rx, sv, strbeg,
	2310	s + 1, strend, flags, NULL);
	2311	if (!s) {
	2312	goto phooey;
	2313	}
	2314	}
	2315	else {
	2316	s++;
	2317	}
	2318	}
	2319	} /* end search for check string in latin*/
	2320	} /* end search for check string */
	2321	else { /* search for newline */
	2322	if (s > startpos) {
	2323	/XXX: The s-- is almost definitely wrong here under unicode - demeprhq/
	2324	s--;
	2325	}
	2326	/* We can use a more efficient search as newlines are the same in unicode as they are in latin */
	2327	while (s <= end) { /* note it could be possible to match at the end of the string */
	2328	if (s++ == '\n') { / don't need PL_utf8skip here */
	2329	if (regtry(reginfo, &s))
	2330	goto got_it;
	2331	}
	2332	}
	2333	} /* end search for newline */
	2334	} /* end anchored/multiline check string search */
	2335	goto phooey;
	2336	} else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
	2337	{
	2338	/* the warning about reginfo->ganch being used without initialization
	2339	is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
	2340	and we only enter this block when the same bit is set. */
	2341	char *tmp_s = reginfo->ganch - prog->gofs;
	2342
	2343	if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
	2344	goto got_it;
	2345	goto phooey;
	2346	}
	2347
	2348	/* Messy cases: unanchored match. */
	2349	if ((prog->anchored_substr \|\| prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
	2350	/* we have /x+whatever/ */
	2351	/* it must be a one character string (XXXX Except is_utf8_pat?) */
	2352	char ch;
	2353	#ifdef DEBUGGING
	2354	int did_match = 0;
	2355	#endif
	2356	if (utf8_target) {
	2357	if (! prog->anchored_utf8) {
	2358	to_utf8_substr(prog);
	2359	}
	2360	ch = SvPVX_const(prog->anchored_utf8)[0];
	2361	REXEC_FBC_SCAN(
	2362	if (*s == ch) {
	2363	DEBUG_EXECUTE_r( did_match = 1 );
	2364	if (regtry(reginfo, &s)) goto got_it;
	2365	s += UTF8SKIP(s);
	2366	while (s < strend && *s == ch)
	2367	s += UTF8SKIP(s);
	2368	}
	2369	);
	2370
	2371	}
	2372	else {
	2373	if (! prog->anchored_substr) {
	2374	if (! to_byte_substr(prog)) {
	2375	NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
	2376	}
	2377	}
	2378	ch = SvPVX_const(prog->anchored_substr)[0];
	2379	REXEC_FBC_SCAN(
	2380	if (*s == ch) {
	2381	DEBUG_EXECUTE_r( did_match = 1 );
	2382	if (regtry(reginfo, &s)) goto got_it;
	2383	s++;
	2384	while (s < strend && *s == ch)
	2385	s++;
	2386	}
	2387	);
	2388	}
	2389	DEBUG_EXECUTE_r(if (!did_match)
	2390	PerlIO_printf(Perl_debug_log,
	2391	"Did not find anchored character...\n")
	2392	);
	2393	}
	2394	else if (prog->anchored_substr != NULL
	2395	\|\| prog->anchored_utf8 != NULL
	2396	\|\| ((prog->float_substr != NULL \|\| prog->float_utf8 != NULL)
	2397	&& prog->float_max_offset < strend - s)) {
	2398	SV *must;
	2399	I32 back_max;
	2400	I32 back_min;
	2401	char *last;
	2402	char last1; / Last position checked before */
	2403	#ifdef DEBUGGING
	2404	int did_match = 0;
	2405	#endif
	2406	if (prog->anchored_substr \|\| prog->anchored_utf8) {
	2407	if (utf8_target) {
	2408	if (! prog->anchored_utf8) {
	2409	to_utf8_substr(prog);
	2410	}
	2411	must = prog->anchored_utf8;
	2412	}
	2413	else {
	2414	if (! prog->anchored_substr) {
	2415	if (! to_byte_substr(prog)) {
	2416	NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
	2417	}
	2418	}
	2419	must = prog->anchored_substr;
	2420	}
	2421	back_max = back_min = prog->anchored_offset;
	2422	} else {
	2423	if (utf8_target) {
	2424	if (! prog->float_utf8) {
	2425	to_utf8_substr(prog);
	2426	}
	2427	must = prog->float_utf8;
	2428	}
	2429	else {
	2430	if (! prog->float_substr) {
	2431	if (! to_byte_substr(prog)) {
	2432	NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
	2433	}
	2434	}
	2435	must = prog->float_substr;
	2436	}
	2437	back_max = prog->float_max_offset;
	2438	back_min = prog->float_min_offset;
	2439	}
	2440
	2441	if (back_min<0) {
	2442	last = strend;
	2443	} else {
	2444	last = HOP3c(strend, /* Cannot start after this */
	2445	-(I32)(CHR_SVLEN(must)
	2446	- (SvTAIL(must) != 0) + back_min), strbeg);
	2447	}
	2448	if (s > reginfo->strbeg)
	2449	last1 = HOPc(s, -1);
	2450	else
	2451	last1 = s - 1; /* bogus */
	2452
	2453	/* XXXX check_substr already used to find "s", can optimize if
	2454	check_substr==must. */
	2455	scream_pos = -1;
	2456	dontbother = end_shift;
	2457	strend = HOPc(strend, -dontbother);
	2458	while ( (s <= last) &&
	2459	(s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
	2460	(unsigned char*)strend, must,
	2461	multiline ? FBMrf_MULTILINE : 0)) ) {
	2462	DEBUG_EXECUTE_r( did_match = 1 );
	2463	if (HOPc(s, -back_max) > last1) {
	2464	last1 = HOPc(s, -back_min);
	2465	s = HOPc(s, -back_max);
	2466	}
	2467	else {
	2468	char * const t = (last1 >= reginfo->strbeg)
	2469	? HOPc(last1, 1) : last1 + 1;
	2470
	2471	last1 = HOPc(s, -back_min);
	2472	s = t;
	2473	}
	2474	if (utf8_target) {
	2475	while (s <= last1) {
	2476	if (regtry(reginfo, &s))
	2477	goto got_it;
	2478	if (s >= last1) {
	2479	s++; /* to break out of outer loop */
	2480	break;
	2481	}
	2482	s += UTF8SKIP(s);
	2483	}
	2484	}
	2485	else {
	2486	while (s <= last1) {
	2487	if (regtry(reginfo, &s))
	2488	goto got_it;
	2489	s++;
	2490	}
	2491	}
	2492	}
	2493	DEBUG_EXECUTE_r(if (!did_match) {
	2494	RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
	2495	SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
	2496	PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
	2497	((must == prog->anchored_substr \|\| must == prog->anchored_utf8)
	2498	? "anchored" : "floating"),
	2499	quoted, RE_SV_TAIL(must));
	2500	});
	2501	goto phooey;
	2502	}
	2503	else if ( (c = progi->regstclass) ) {
	2504	if (minlen) {
	2505	const OPCODE op = OP(progi->regstclass);
	2506	/* don't bother with what can't match */
	2507	if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
	2508	strend = HOPc(strend, -(minlen - 1));
	2509	}
	2510	DEBUG_EXECUTE_r({
	2511	SV * const prop = sv_newmortal();
	2512	regprop(prog, prop, c);
	2513	{
	2514	RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
	2515	s,strend-s,60);
	2516	PerlIO_printf(Perl_debug_log,
	2517	"Matching stclass %.*s against %s (%d bytes)\n",
	2518	(int)SvCUR(prop), SvPVX_const(prop),
	2519	quoted, (int)(strend - s));
	2520	}
	2521	});
	2522	if (find_byclass(prog, c, s, strend, reginfo))
	2523	goto got_it;
	2524	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
	2525	}
	2526	else {
	2527	dontbother = 0;
	2528	if (prog->float_substr != NULL \|\| prog->float_utf8 != NULL) {
	2529	/* Trim the end. */
	2530	char *last= NULL;
	2531	SV* float_real;
	2532	STRLEN len;
	2533	const char *little;
	2534
	2535	if (utf8_target) {
	2536	if (! prog->float_utf8) {
	2537	to_utf8_substr(prog);
	2538	}
	2539	float_real = prog->float_utf8;
	2540	}
	2541	else {
	2542	if (! prog->float_substr) {
	2543	if (! to_byte_substr(prog)) {
	2544	NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
	2545	}
	2546	}
	2547	float_real = prog->float_substr;
	2548	}
	2549
	2550	little = SvPV_const(float_real, len);
	2551	if (SvTAIL(float_real)) {
	2552	/* This means that float_real contains an artificial \n on
	2553	* the end due to the presence of something like this:
	2554	* /foo$/ where we can match both "foo" and "foo\n" at the
	2555	* end of the string. So we have to compare the end of the
	2556	* string first against the float_real without the \n and
	2557	* then against the full float_real with the string. We
	2558	* have to watch out for cases where the string might be
	2559	* smaller than the float_real or the float_real without
	2560	* the \n. */
	2561	char *checkpos= strend - len;
	2562	DEBUG_OPTIMISE_r(
	2563	PerlIO_printf(Perl_debug_log,
	2564	"%sChecking for float_real.%s\n",
	2565	PL_colors[4], PL_colors[5]));
	2566	if (checkpos + 1 < strbeg) {
	2567	/* can't match, even if we remove the trailing \n
	2568	* string is too short to match */
	2569	DEBUG_EXECUTE_r(
	2570	PerlIO_printf(Perl_debug_log,
	2571	"%sString shorter than required trailing substring, cannot match.%s\n",
	2572	PL_colors[4], PL_colors[5]));
	2573	goto phooey;
	2574	} else if (memEQ(checkpos + 1, little, len - 1)) {
	2575	/* can match, the end of the string matches without the
	2576	* "\n" */
	2577	last = checkpos + 1;
	2578	} else if (checkpos < strbeg) {
	2579	/* cant match, string is too short when the "\n" is
	2580	* included */
	2581	DEBUG_EXECUTE_r(
	2582	PerlIO_printf(Perl_debug_log,
	2583	"%sString does not contain required trailing substring, cannot match.%s\n",
	2584	PL_colors[4], PL_colors[5]));
	2585	goto phooey;
	2586	} else if (!multiline) {
	2587	/* non multiline match, so compare with the "\n" at the
	2588	* end of the string */
	2589	if (memEQ(checkpos, little, len)) {
	2590	last= checkpos;
	2591	} else {
	2592	DEBUG_EXECUTE_r(
	2593	PerlIO_printf(Perl_debug_log,
	2594	"%sString does not contain required trailing substring, cannot match.%s\n",
	2595	PL_colors[4], PL_colors[5]));
	2596	goto phooey;
	2597	}
	2598	} else {
	2599	/* multiline match, so we have to search for a place
	2600	* where the full string is located */
	2601	goto find_last;
	2602	}
	2603	} else {
	2604	find_last:
	2605	if (len)
	2606	last = rninstr(s, strend, little, little + len);
	2607	else
	2608	last = strend; /* matching "$" */
	2609	}
	2610	if (!last) {
	2611	/* at one point this block contained a comment which was
	2612	* probably incorrect, which said that this was a "should not
	2613	* happen" case. Even if it was true when it was written I am
	2614	* pretty sure it is not anymore, so I have removed the comment
	2615	* and replaced it with this one. Yves */
	2616	DEBUG_EXECUTE_r(
	2617	PerlIO_printf(Perl_debug_log,
	2618	"String does not contain required substring, cannot match.\n"
	2619	));
	2620	goto phooey;
	2621	}
	2622	dontbother = strend - last + prog->float_min_offset;
	2623	}
	2624	if (minlen && (dontbother < minlen))
	2625	dontbother = minlen - 1;
	2626	strend -= dontbother; /* this one's always in bytes! */
	2627	/* We don't know much -- general case. */
	2628	if (utf8_target) {
	2629	for (;;) {
	2630	if (regtry(reginfo, &s))
	2631	goto got_it;
	2632	if (s >= strend)
	2633	break;
	2634	s += UTF8SKIP(s);
	2635	};
	2636	}
	2637	else {
	2638	do {
	2639	if (regtry(reginfo, &s))
	2640	goto got_it;
	2641	} while (s++ < strend);
	2642	}
	2643	}
	2644
	2645	/* Failure. */
	2646	goto phooey;
	2647
	2648	got_it:
	2649	DEBUG_BUFFERS_r(
	2650	if (swap)
	2651	PerlIO_printf(Perl_debug_log,
	2652	"rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
	2653	PTR2UV(prog),
	2654	PTR2UV(swap)
	2655	);
	2656	);
	2657	Safefree(swap);
	2658
	2659	/* clean up; this will trigger destructors that will free all slabs
	2660	* above the current one, and cleanup the regmatch_info_aux
	2661	* and regmatch_info_aux_eval sructs */
	2662
	2663	LEAVE_SCOPE(oldsave);
	2664
	2665	if (RXp_PAREN_NAMES(prog))
	2666	(void)hv_iterinit(RXp_PAREN_NAMES(prog));
	2667
	2668	RX_MATCH_UTF8_set(rx, utf8_target);
	2669
	2670	/* make sure $`, $&, $', and $digit will work later */
	2671	if ( !(flags & REXEC_NOT_FIRST) ) {
	2672	if (flags & REXEC_COPY_STR) {
	2673	#ifdef PERL_ANY_COW
	2674	if (SvCANCOW(sv)) {
	2675	if (DEBUG_C_TEST) {
	2676	PerlIO_printf(Perl_debug_log,
	2677	"Copy on write: regexp capture, type %d\n",
	2678	(int) SvTYPE(sv));
	2679	}
	2680	RX_MATCH_COPY_FREE(rx);
	2681	prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
	2682	prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
	2683	assert (SvPOKp(prog->saved_copy));
	2684	prog->sublen = reginfo->strend - strbeg;
	2685	prog->suboffset = 0;
	2686	prog->subcoffset = 0;
	2687	} else
	2688	#endif
	2689	{
	2690	I32 min = 0;
	2691	I32 max = reginfo->strend - strbeg;
	2692	I32 sublen;
	2693
	2694	if ( (flags & REXEC_COPY_SKIP_POST)
	2695	&& !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
	2696	&& !(PL_sawampersand & SAWAMPERSAND_RIGHT)
	2697	) { /* don't copy $' part of string */
	2698	U32 n = 0;
	2699	max = -1;
	2700	/* calculate the right-most part of the string covered
	2701	* by a capture. Due to look-ahead, this may be to
	2702	* the right of $&, so we have to scan all captures */
	2703	while (n <= prog->lastparen) {
	2704	if (prog->offs[n].end > max)
	2705	max = prog->offs[n].end;
	2706	n++;
	2707	}
	2708	if (max == -1)
	2709	max = (PL_sawampersand & SAWAMPERSAND_LEFT)
	2710	? prog->offs[0].start
	2711	: 0;
	2712	assert(max >= 0 && max <= reginfo->strend - strbeg);
	2713	}
	2714
	2715	if ( (flags & REXEC_COPY_SKIP_PRE)
	2716	&& !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
	2717	&& !(PL_sawampersand & SAWAMPERSAND_LEFT)
	2718	) { /* don't copy $` part of string */
	2719	U32 n = 0;
	2720	min = max;
	2721	/* calculate the left-most part of the string covered
	2722	* by a capture. Due to look-behind, this may be to
	2723	* the left of $&, so we have to scan all captures */
	2724	while (min && n <= prog->lastparen) {
	2725	if ( prog->offs[n].start != -1
	2726	&& prog->offs[n].start < min)
	2727	{
	2728	min = prog->offs[n].start;
	2729	}
	2730	n++;
	2731	}
	2732	if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
	2733	&& min > prog->offs[0].end
	2734	)
	2735	min = prog->offs[0].end;
	2736
	2737	}
	2738
	2739	assert(min >= 0 && min <= max
	2740	&& min <= reginfo->strend - strbeg);
	2741	sublen = max - min;
	2742
	2743	if (RX_MATCH_COPIED(rx)) {
	2744	if (sublen > prog->sublen)
	2745	prog->subbeg =
	2746	(char*)saferealloc(prog->subbeg, sublen+1);
	2747	}
	2748	else
	2749	prog->subbeg = (char*)safemalloc(sublen+1);
	2750	Copy(strbeg + min, prog->subbeg, sublen, char);
	2751	prog->subbeg[sublen] = '\0';
	2752	prog->suboffset = min;
	2753	prog->sublen = sublen;
	2754	RX_MATCH_COPIED_on(rx);
	2755	}
	2756	prog->subcoffset = prog->suboffset;
	2757	if (prog->suboffset && utf8_target) {
	2758	/* Convert byte offset to chars.
	2759	* XXX ideally should only compute this if @-/@+
	2760	* has been seen, a la PL_sawampersand ??? */
	2761
	2762	/* If there's a direct correspondence between the
	2763	* string which we're matching and the original SV,
	2764	* then we can use the utf8 len cache associated with
	2765	* the SV. In particular, it means that under //g,
	2766	* sv_pos_b2u() will use the previously cached
	2767	* position to speed up working out the new length of
	2768	* subcoffset, rather than counting from the start of
	2769	* the string each time. This stops
	2770	* $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
	2771	* from going quadratic */
	2772	if (SvPOKp(sv) && SvPVX(sv) == strbeg)
	2773	sv_pos_b2u(sv, &(prog->subcoffset));
	2774	else
	2775	prog->subcoffset = utf8_length((U8*)strbeg,
	2776	(U8*)(strbeg+prog->suboffset));
	2777	}
	2778	}
	2779	else {
	2780	RX_MATCH_COPY_FREE(rx);
	2781	prog->subbeg = strbeg;
	2782	prog->suboffset = 0;
	2783	prog->subcoffset = 0;
	2784	/* use reginfo->strend, as strend may have been modified */
	2785	prog->sublen = reginfo->strend - strbeg;
	2786	}
	2787	}
	2788
	2789	return 1;
	2790
	2791	phooey:
	2792	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
	2793	PL_colors[4], PL_colors[5]));
	2794
	2795	/* clean up; this will trigger destructors that will free all slabs
	2796	* above the current one, and cleanup the regmatch_info_aux
	2797	* and regmatch_info_aux_eval sructs */
	2798
	2799	LEAVE_SCOPE(oldsave);
	2800
	2801	if (swap) {
	2802	/* we failed :-( roll it back */
	2803	DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
	2804	"rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
	2805	PTR2UV(prog),
	2806	PTR2UV(prog->offs),
	2807	PTR2UV(swap)
	2808	));
	2809	Safefree(prog->offs);
	2810	prog->offs = swap;
	2811	}
	2812	return 0;
	2813	}
	2814
	2815
	2816	/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
	2817	* Do inc before dec, in case old and new rex are the same */
	2818	#define SET_reg_curpm(Re2) \
	2819	if (reginfo->info_aux_eval) { \
	2820	(void)ReREFCNT_inc(Re2); \
	2821	ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
	2822	PM_SETRE((PL_reg_curpm), (Re2)); \
	2823	}
	2824
	2825
	2826	/*
	2827	- regtry - try match at specific point
	2828	*/
	2829	STATIC I32 /* 0 failure, 1 success */
	2830	S_regtry(pTHX_ regmatch_info reginfo, char *startposp)
	2831	{
	2832	dVAR;
	2833	CHECKPOINT lastcp;
	2834	REGEXP *const rx = reginfo->prog;
	2835	regexp *const prog = ReANY(rx);
	2836	I32 result;
	2837	RXi_GET_DECL(prog,progi);
	2838	GET_RE_DEBUG_FLAGS_DECL;
	2839
	2840	PERL_ARGS_ASSERT_REGTRY;
	2841
	2842	reginfo->cutpoint=NULL;
	2843
	2844	prog->offs[0].start = *startposp - reginfo->strbeg;
	2845	prog->lastparen = 0;
	2846	prog->lastcloseparen = 0;
	2847
	2848	/* XXXX What this code is doing here?!!! There should be no need
	2849	to do this again and again, prog->lastparen should take care of
	2850	this! --ilya*/
	2851
	2852	/* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
	2853	* Actually, the code in regcppop() (which Ilya may be meaning by
	2854	* prog->lastparen), is not needed at all by the test suite
	2855	* (op/regexp, op/pat, op/split), but that code is needed otherwise
	2856	* this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
	2857	* Meanwhile, this code is needed for the
	2858	* above-mentioned test suite tests to succeed. The common theme
	2859	* on those tests seems to be returning null fields from matches.
	2860	* --jhi updated by dapm */
	2861	#if 1
	2862	if (prog->nparens) {
	2863	regexp_paren_pair *pp = prog->offs;
	2864	I32 i;
	2865	for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
	2866	++pp;
	2867	pp->start = -1;
	2868	pp->end = -1;
	2869	}
	2870	}
	2871	#endif
	2872	REGCP_SET(lastcp);
	2873	result = regmatch(reginfo, *startposp, progi->program + 1);
	2874	if (result != -1) {
	2875	prog->offs[0].end = result;
	2876	return 1;
	2877	}
	2878	if (reginfo->cutpoint)
	2879	*startposp= reginfo->cutpoint;
	2880	REGCP_UNWIND(lastcp);
	2881	return 0;
	2882	}
	2883
	2884
	2885	#define sayYES goto yes
	2886	#define sayNO goto no
	2887	#define sayNO_SILENT goto no_silent
	2888
	2889	/* we dont use STMT_START/END here because it leads to
	2890	"unreachable code" warnings, which are bogus, but distracting. */
	2891	#define CACHEsayNO \
	2892	if (ST.cache_mask) \
	2893	reginfo->info_aux->poscache[ST.cache_offset] \|= ST.cache_mask; \
	2894	sayNO
	2895
	2896	/* this is used to determine how far from the left messages like
	2897	'failed...' are printed. It should be set such that messages
	2898	are inline with the regop output that created them.
	2899	*/
	2900	#define REPORT_CODE_OFF 32
	2901
	2902
	2903	#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
	2904	#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
	2905	#define CHRTEST_NOT_A_CP_1 -999
	2906	#define CHRTEST_NOT_A_CP_2 -998
	2907
	2908	/* grab a new slab and return the first slot in it */
	2909
	2910	STATIC regmatch_state *
	2911	S_push_slab(pTHX)
	2912	{
	2913	#if PERL_VERSION < 9 && !defined(PERL_CORE)
	2914	dMY_CXT;
	2915	#endif
	2916	regmatch_slab *s = PL_regmatch_slab->next;
	2917	if (!s) {
	2918	Newx(s, 1, regmatch_slab);
	2919	s->prev = PL_regmatch_slab;
	2920	s->next = NULL;
	2921	PL_regmatch_slab->next = s;
	2922	}
	2923	PL_regmatch_slab = s;
	2924	return SLAB_FIRST(s);
	2925	}
	2926
	2927
	2928	/* push a new state then goto it */
	2929
	2930	#define PUSH_STATE_GOTO(state, node, input) \
	2931	pushinput = input; \
	2932	scan = node; \
	2933	st->resume_state = state; \
	2934	goto push_state;
	2935
	2936	/* push a new state with success backtracking, then goto it */
	2937
	2938	#define PUSH_YES_STATE_GOTO(state, node, input) \
	2939	pushinput = input; \
	2940	scan = node; \
	2941	st->resume_state = state; \
	2942	goto push_yes_state;
	2943
	2944
	2945
	2946
	2947	/*
	2948
	2949	regmatch() - main matching routine
	2950
	2951	This is basically one big switch statement in a loop. We execute an op,
	2952	set 'next' to point the next op, and continue. If we come to a point which
	2953	we may need to backtrack to on failure such as (A\|B\|C), we push a
	2954	backtrack state onto the backtrack stack. On failure, we pop the top
	2955	state, and re-enter the loop at the state indicated. If there are no more
	2956	states to pop, we return failure.
	2957
	2958	Sometimes we also need to backtrack on success; for example /A+/, where
	2959	after successfully matching one A, we need to go back and try to
	2960	match another one; similarly for lookahead assertions: if the assertion
	2961	completes successfully, we backtrack to the state just before the assertion
	2962	and then carry on. In these cases, the pushed state is marked as
	2963	'backtrack on success too'. This marking is in fact done by a chain of
	2964	pointers, each pointing to the previous 'yes' state. On success, we pop to
	2965	the nearest yes state, discarding any intermediate failure-only states.
	2966	Sometimes a yes state is pushed just to force some cleanup code to be
	2967	called at the end of a successful match or submatch; e.g. (??{$re}) uses
	2968	it to free the inner regex.
	2969
	2970	Note that failure backtracking rewinds the cursor position, while
	2971	success backtracking leaves it alone.
	2972
	2973	A pattern is complete when the END op is executed, while a subpattern
	2974	such as (?=foo) is complete when the SUCCESS op is executed. Both of these
	2975	ops trigger the "pop to last yes state if any, otherwise return true"
	2976	behaviour.
	2977
	2978	A common convention in this function is to use A and B to refer to the two
	2979	subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
	2980	the subpattern to be matched possibly multiple times, while B is the entire
	2981	rest of the pattern. Variable and state names reflect this convention.
	2982
	2983	The states in the main switch are the union of ops and failure/success of
	2984	substates associated with with that op. For example, IFMATCH is the op
	2985	that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
	2986	'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
	2987	successfully matched A and IFMATCH_A_fail is a state saying that we have
	2988	just failed to match A. Resume states always come in pairs. The backtrack
	2989	state we push is marked as 'IFMATCH_A', but when that is popped, we resume
	2990	at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
	2991	on success or failure.
	2992
	2993	The struct that holds a backtracking state is actually a big union, with
	2994	one variant for each major type of op. The variable st points to the
	2995	top-most backtrack struct. To make the code clearer, within each
	2996	block of code we #define ST to alias the relevant union.
	2997
	2998	Here's a concrete example of a (vastly oversimplified) IFMATCH
	2999	implementation:
	3000
	3001	switch (state) {
	3002	....
	3003
	3004	#define ST st->u.ifmatch
	3005
	3006	case IFMATCH: // we are executing the IFMATCH op, (?=A)B
	3007	ST.foo = ...; // some state we wish to save
	3008	...
	3009	// push a yes backtrack state with a resume value of
	3010	// IFMATCH_A/IFMATCH_A_fail, then continue execution at the
	3011	// first node of A:
	3012	PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
	3013	// NOTREACHED
	3014
	3015	case IFMATCH_A: // we have successfully executed A; now continue with B
	3016	next = B;
	3017	bar = ST.foo; // do something with the preserved value
	3018	break;
	3019
	3020	case IFMATCH_A_fail: // A failed, so the assertion failed
	3021	...; // do some housekeeping, then ...
	3022	sayNO; // propagate the failure
	3023
	3024	#undef ST
	3025
	3026	...
	3027	}
	3028
	3029	For any old-timers reading this who are familiar with the old recursive
	3030	approach, the code above is equivalent to:
	3031
	3032	case IFMATCH: // we are executing the IFMATCH op, (?=A)B
	3033	{
	3034	int foo = ...
	3035	...
	3036	if (regmatch(A)) {
	3037	next = B;
	3038	bar = foo;
	3039	break;
	3040	}
	3041	...; // do some housekeeping, then ...
	3042	sayNO; // propagate the failure
	3043	}
	3044
	3045	The topmost backtrack state, pointed to by st, is usually free. If you
	3046	want to claim it, populate any ST.foo fields in it with values you wish to
	3047	save, then do one of
	3048
	3049	PUSH_STATE_GOTO(resume_state, node, newinput);
	3050	PUSH_YES_STATE_GOTO(resume_state, node, newinput);
	3051
	3052	which sets that backtrack state's resume value to 'resume_state', pushes a
	3053	new free entry to the top of the backtrack stack, then goes to 'node'.
	3054	On backtracking, the free slot is popped, and the saved state becomes the
	3055	new free state. An ST.foo field in this new top state can be temporarily
	3056	accessed to retrieve values, but once the main loop is re-entered, it
	3057	becomes available for reuse.
	3058
	3059	Note that the depth of the backtrack stack constantly increases during the
	3060	left-to-right execution of the pattern, rather than going up and down with
	3061	the pattern nesting. For example the stack is at its maximum at Z at the
	3062	end of the pattern, rather than at X in the following:
	3063
	3064	/(((X)+)+)+....(Y)+....Z/
	3065
	3066	The only exceptions to this are lookahead/behind assertions and the cut,
	3067	(?>A), which pop all the backtrack states associated with A before
	3068	continuing.
	3069
	3070	Backtrack state structs are allocated in slabs of about 4K in size.
	3071	PL_regmatch_state and st always point to the currently active state,
	3072	and PL_regmatch_slab points to the slab currently containing
	3073	PL_regmatch_state. The first time regmatch() is called, the first slab is
	3074	allocated, and is never freed until interpreter destruction. When the slab
	3075	is full, a new one is allocated and chained to the end. At exit from
	3076	regmatch(), slabs allocated since entry are freed.
	3077
	3078	*/
	3079
	3080
	3081	#define DEBUG_STATE_pp(pp) \
	3082	DEBUG_STATE_r({ \
	3083	DUMP_EXEC_POS(locinput, scan, utf8_target); \
	3084	PerlIO_printf(Perl_debug_log, \
	3085	" %*s"pp" %s%s%s%s%s\n", \
	3086	depth*2, "", \
	3087	PL_reg_name[st->resume_state], \
	3088	((st==yes_state\|\|st==mark_state) ? "[" : ""), \
	3089	((st==yes_state) ? "Y" : ""), \
	3090	((st==mark_state) ? "M" : ""), \
	3091	((st==yes_state\|\|st==mark_state) ? "]" : "") \
	3092	); \
	3093	});
	3094
	3095
	3096	#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
	3097
	3098	#ifdef DEBUGGING
	3099
	3100	STATIC void
	3101	S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
	3102	const char start, const char end, const char *blurb)
	3103	{
	3104	const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
	3105
	3106	PERL_ARGS_ASSERT_DEBUG_START_MATCH;
	3107
	3108	if (!PL_colorset)
	3109	reginitcolors();
	3110	{
	3111	RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
	3112	RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
	3113
	3114	RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
	3115	start, end - start, 60);
	3116
	3117	PerlIO_printf(Perl_debug_log,
	3118	"%s%s REx%s %s against %s\n",
	3119	PL_colors[4], blurb, PL_colors[5], s0, s1);
	3120
	3121	if (utf8_target\|\|utf8_pat)
	3122	PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
	3123	utf8_pat ? "pattern" : "",
	3124	utf8_pat && utf8_target ? " and " : "",
	3125	utf8_target ? "string" : ""
	3126	);
	3127	}
	3128	}
	3129
	3130	STATIC void
	3131	S_dump_exec_pos(pTHX_ const char *locinput,
	3132	const regnode *scan,
	3133	const char *loc_regeol,
	3134	const char *loc_bostr,
	3135	const char *loc_reg_starttry,
	3136	const bool utf8_target)
	3137	{
	3138	const int docolor = PL_colors[0] \|\| PL_colors[2] \|\| *PL_colors[4];
	3139	const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
	3140	int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
	3141	/* The part of the string before starttry has one color
	3142	(pref0_len chars), between starttry and current
	3143	position another one (pref_len - pref0_len chars),
	3144	after the current position the third one.
	3145	We assume that pref0_len <= pref_len, otherwise we
	3146	decrease pref0_len. */
	3147	int pref_len = (locinput - loc_bostr) > (5 + taill) - l
	3148	? (5 + taill) - l : locinput - loc_bostr;
	3149	int pref0_len;
	3150
	3151	PERL_ARGS_ASSERT_DUMP_EXEC_POS;
	3152
	3153	while (utf8_target && UTF8_IS_CONTINUATION((U8)(locinput - pref_len)))
	3154	pref_len++;
	3155	pref0_len = pref_len - (locinput - loc_reg_starttry);
	3156	if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
	3157	l = ( loc_regeol - locinput > (5 + taill) - pref_len
	3158	? (5 + taill) - pref_len : loc_regeol - locinput);
	3159	while (utf8_target && UTF8_IS_CONTINUATION((U8)(locinput + l)))
	3160	l--;
	3161	if (pref0_len < 0)
	3162	pref0_len = 0;
	3163	if (pref0_len > pref_len)
	3164	pref0_len = pref_len;
	3165	{
	3166	const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
	3167
	3168	RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
	3169	(locinput - pref_len),pref0_len, 60, 4, 5);
	3170
	3171	RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
	3172	(locinput - pref_len + pref0_len),
	3173	pref_len - pref0_len, 60, 2, 3);
	3174
	3175	RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
	3176	locinput, loc_regeol - locinput, 10, 0, 1);
	3177
	3178	const STRLEN tlen=len0+len1+len2;
	3179	PerlIO_printf(Perl_debug_log,
	3180	"%4"IVdf" <%.s%.s%s%.s>%s\|",
	3181	(IV)(locinput - loc_bostr),
	3182	len0, s0,
	3183	len1, s1,
	3184	(docolor ? "" : "> <"),
	3185	len2, s2,
	3186	(int)(tlen > 19 ? 0 : 19 - tlen),
	3187	"");
	3188	}
	3189	}
	3190
	3191	#endif
	3192
	3193	/* reg_check_named_buff_matched()
	3194	* Checks to see if a named buffer has matched. The data array of
	3195	* buffer numbers corresponding to the buffer is expected to reside
	3196	* in the regexp->data->data array in the slot stored in the ARG() of
	3197	* node involved. Note that this routine doesn't actually care about the
	3198	* name, that information is not preserved from compilation to execution.
	3199	* Returns the index of the leftmost defined buffer with the given name
	3200	* or 0 if non of the buffers matched.
	3201	*/
	3202	STATIC I32
	3203	S_reg_check_named_buff_matched(pTHX_ const regexp rex, const regnode scan)
	3204	{
	3205	I32 n;
	3206	RXi_GET_DECL(rex,rexi);
	3207	SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
	3208	I32 nums=(I32)SvPVX(sv_dat);
	3209
	3210	PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
	3211
	3212	for ( n=0; n<SvIVX(sv_dat); n++ ) {
	3213	if ((I32)rex->lastparen >= nums[n] &&
	3214	rex->offs[nums[n]].end != -1)
	3215	{
	3216	return nums[n];
	3217	}
	3218	}
	3219	return 0;
	3220	}
	3221
	3222
	3223	static bool
	3224	S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
	3225	U8* c1_utf8, int c2p, U8 c2_utf8, regmatch_info *reginfo)
	3226	{
	3227	/* This function determines if there are one or two characters that match
	3228	* the first character of the passed-in EXACTish node <text_node>, and if
	3229	* so, returns them in the passed-in pointers.
	3230	*
	3231	* If it determines that no possible character in the target string can
	3232	* match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
	3233	* the first character in <text_node> requires UTF-8 to represent, and the
	3234	* target string isn't in UTF-8.)
	3235	*
	3236	* If there are more than two characters that could match the beginning of
	3237	* <text_node>, or if more context is required to determine a match or not,
	3238	* it sets both <c1p> and <c2p> to CHRTEST_VOID.
	3239	*
	3240	* The motiviation behind this function is to allow the caller to set up
	3241	* tight loops for matching. If <text_node> is of type EXACT, there is
	3242	* only one possible character that can match its first character, and so
	3243	* the situation is quite simple. But things get much more complicated if
	3244	* folding is involved. It may be that the first character of an EXACTFish
	3245	* node doesn't participate in any possible fold, e.g., punctuation, so it
	3246	* can be matched only by itself. The vast majority of characters that are
	3247	* in folds match just two things, their lower and upper-case equivalents.
	3248	* But not all are like that; some have multiple possible matches, or match
	3249	* sequences of more than one character. This function sorts all that out.
	3250	*
	3251	* Consider the patterns AB or A?B where A and B are arbitrary. In a
	3252	* loop of trying to match A*, we know we can't exit where the thing
	3253	* following it isn't a B. And something can't be a B unless it is the
	3254	* beginning of B. By putting a quick test for that beginning in a tight
	3255	* loop, we can rule out things that can't possibly be B without having to
	3256	* break out of the loop, thus avoiding work. Similarly, if A is a single
	3257	* character, we can make a tight loop matching A*, using the outputs of
	3258	* this function.
	3259	*
	3260	* If the target string to match isn't in UTF-8, and there aren't
	3261	* complications which require CHRTEST_VOID, <c1p> and <c2p> are set to
	3262	* the one or two possible octets (which are characters in this situation)
	3263	* that can match. In all cases, if there is only one character that can
	3264	* match, <c1p> and <c2p> will be identical.
	3265	*
	3266	* If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
	3267	* and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
	3268	* can match the beginning of <text_node>. They should be declared with at
	3269	* least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
	3270	* undefined what these contain.) If one or both of the buffers are
	3271	* invariant under UTF-8, <c1p>, and <c2p> will also be set to the
	3272	* corresponding invariant. If variant, the corresponding *<c1p> and/or
	3273	* *<c2p> will be set to a negative number(s) that shouldn't match any code
	3274	* point (unless inappropriately coerced to unsigned). *<c1p> will equal
	3275	* <c2p> if and only if <c1_utf8> and <c2_utf8> are the same. /
	3276
	3277	const bool utf8_target = reginfo->is_utf8_target;
	3278
	3279	UV c1 = CHRTEST_NOT_A_CP_1;
	3280	UV c2 = CHRTEST_NOT_A_CP_2;
	3281	bool use_chrtest_void = FALSE;
	3282	const bool is_utf8_pat = reginfo->is_utf8_pat;
	3283
	3284	/* Used when we have both utf8 input and utf8 output, to avoid converting
	3285	* to/from code points */
	3286	bool utf8_has_been_setup = FALSE;
	3287
	3288	dVAR;
	3289
	3290	U8 pat = (U8)STRING(text_node);
	3291
	3292	if (OP(text_node) == EXACT) {
	3293
	3294	/* In an exact node, only one thing can be matched, that first
	3295	* character. If both the pat and the target are UTF-8, we can just
	3296	* copy the input to the output, avoiding finding the code point of
	3297	* that character */
	3298	if (!is_utf8_pat) {
	3299	c2 = c1 = *pat;
	3300	}
	3301	else if (utf8_target) {
	3302	Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
	3303	Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
	3304	utf8_has_been_setup = TRUE;
	3305	}
	3306	else {
	3307	c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
	3308	}
	3309	}
	3310	else /* an EXACTFish node */
	3311	if ((is_utf8_pat
	3312	&& is_MULTI_CHAR_FOLD_utf8_safe(pat,
	3313	pat + STR_LEN(text_node)))
	3314	\|\| (!is_utf8_pat
	3315	&& is_MULTI_CHAR_FOLD_latin1_safe(pat,
	3316	pat + STR_LEN(text_node))))
	3317	{
	3318	/* Multi-character folds require more context to sort out. Also
	3319	* PL_utf8_foldclosures used below doesn't handle them, so have to be
	3320	* handled outside this routine */
	3321	use_chrtest_void = TRUE;
	3322	}
	3323	else { /* an EXACTFish node which doesn't begin with a multi-char fold */
	3324	c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
	3325	if (c1 > 256) {
	3326	/* Load the folds hash, if not already done */
	3327	SV** listp;
	3328	if (! PL_utf8_foldclosures) {
	3329	if (! PL_utf8_tofold) {
	3330	U8 dummy[UTF8_MAXBYTES+1];
	3331
	3332	/* Force loading this by folding an above-Latin1 char */
	3333	to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
	3334	assert(PL_utf8_tofold); /* Verify that worked */
	3335	}
	3336	PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
	3337	}
	3338
	3339	/* The fold closures data structure is a hash with the keys being
	3340	* the UTF-8 of every character that is folded to, like 'k', and
	3341	* the values each an array of all code points that fold to its
	3342	* key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
	3343	* not included */
	3344	if ((! (listp = hv_fetch(PL_utf8_foldclosures,
	3345	(char *) pat,
	3346	UTF8SKIP(pat),
	3347	FALSE))))
	3348	{
	3349	/* Not found in the hash, therefore there are no folds
	3350	* containing it, so there is only a single character that
	3351	* could match */
	3352	c2 = c1;
	3353	}
	3354	else { /* Does participate in folds */
	3355	AV* list = (AV) listp;
	3356	if (av_len(list) != 1) {
	3357
	3358	/* If there aren't exactly two folds to this, it is outside
	3359	* the scope of this function */
	3360	use_chrtest_void = TRUE;
	3361	}
	3362	else { /* There are two. Get them */
	3363	SV** c_p = av_fetch(list, 0, FALSE);
	3364	if (c_p == NULL) {
	3365	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	3366	}
	3367	c1 = SvUV(*c_p);
	3368
	3369	c_p = av_fetch(list, 1, FALSE);
	3370	if (c_p == NULL) {
	3371	Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
	3372	}
	3373	c2 = SvUV(*c_p);
	3374
	3375	/* Folds that cross the 255/256 boundary are forbidden if
	3376	* EXACTFL, or EXACTFA and one is ASCIII. Since the
	3377	* pattern character is above 256, and its only other match
	3378	* is below 256, the only legal match will be to itself.
	3379	* We have thrown away the original, so have to compute
	3380	* which is the one above 255 */
	3381	if ((c1 < 256) != (c2 < 256)) {
	3382	if (OP(text_node) == EXACTFL
	3383	\|\| (OP(text_node) == EXACTFA
	3384	&& (isASCII(c1) \|\| isASCII(c2))))
	3385	{
	3386	if (c1 < 256) {
	3387	c1 = c2;
	3388	}
	3389	else {
	3390	c2 = c1;
	3391	}
	3392	}
	3393	}
	3394	}
	3395	}
	3396	}
	3397	else /* Here, c1 is < 255 */
	3398	if (utf8_target
	3399	&& HAS_NONLATIN1_FOLD_CLOSURE(c1)
	3400	&& OP(text_node) != EXACTFL
	3401	&& (OP(text_node) != EXACTFA \|\| ! isASCII(c1)))
	3402	{
	3403	/* Here, there could be something above Latin1 in the target which
	3404	* folds to this character in the pattern. All such cases except
	3405	* LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
	3406	* involved in their folds, so are outside the scope of this
	3407	* function */
	3408	if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	3409	c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	3410	}
	3411	else {
	3412	use_chrtest_void = TRUE;
	3413	}
	3414	}
	3415	else { /* Here nothing above Latin1 can fold to the pattern character */
	3416	switch (OP(text_node)) {
	3417
	3418	case EXACTFL: /* /l rules */
	3419	c2 = PL_fold_locale[c1];
	3420	break;
	3421
	3422	case EXACTF:
	3423	if (! utf8_target) { /* /d rules */
	3424	c2 = PL_fold[c1];
	3425	break;
	3426	}
	3427	/* FALLTHROUGH */
	3428	/* /u rules for all these. This happens to work for
	3429	* EXACTFA as nothing in Latin1 folds to ASCII */
	3430	case EXACTFA:
	3431	case EXACTFU_TRICKYFOLD:
	3432	case EXACTFU_SS:
	3433	case EXACTFU:
	3434	c2 = PL_fold_latin1[c1];
	3435	break;
	3436
	3437	default:
	3438	Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
	3439	assert(0); /* NOTREACHED */
	3440	}
	3441	}
	3442	}
	3443
	3444	/* Here have figured things out. Set up the returns */
	3445	if (use_chrtest_void) {
	3446	c2p = c1p = CHRTEST_VOID;
	3447	}
	3448	else if (utf8_target) {
	3449	if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
	3450	uvchr_to_utf8(c1_utf8, c1);
	3451	uvchr_to_utf8(c2_utf8, c2);
	3452	}
	3453
	3454	/* Invariants are stored in both the utf8 and byte outputs; Use
	3455	* negative numbers otherwise for the byte ones. Make sure that the
	3456	* byte ones are the same iff the utf8 ones are the same */
	3457	c1p = (UTF8_IS_INVARIANT(c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
	3458	c2p = (UTF8_IS_INVARIANT(c2_utf8))
	3459	? *c2_utf8
	3460	: (c1 == c2)
	3461	? CHRTEST_NOT_A_CP_1
	3462	: CHRTEST_NOT_A_CP_2;
	3463	}
	3464	else if (c1 > 255) {
	3465	if (c2 > 255) { /* both possibilities are above what a non-utf8 string
	3466	can represent */
	3467	return FALSE;
	3468	}
	3469
	3470	c1p = c2p = c2; /* c2 is the only representable value */
	3471	}
	3472	else { /* c1 is representable; see about c2 */
	3473	*c1p = c1;
	3474	*c2p = (c2 < 256) ? c2 : c1;
	3475	}
	3476
	3477	return TRUE;
	3478	}
	3479
	3480	/* returns -1 on failure, $+[0] on success */
	3481	STATIC I32
	3482	S_regmatch(pTHX_ regmatch_info reginfo, char startpos, regnode *prog)
	3483	{
	3484	#if PERL_VERSION < 9 && !defined(PERL_CORE)
	3485	dMY_CXT;
	3486	#endif
	3487	dVAR;
	3488	const bool utf8_target = reginfo->is_utf8_target;
	3489	const U32 uniflags = UTF8_ALLOW_DEFAULT;
	3490	REGEXP *rex_sv = reginfo->prog;
	3491	regexp *rex = ReANY(rex_sv);
	3492	RXi_GET_DECL(rex,rexi);
	3493	/* the current state. This is a cached copy of PL_regmatch_state */
	3494	regmatch_state *st;
	3495	/* cache heavy used fields of st in registers */
	3496	regnode *scan;
	3497	regnode *next;
	3498	U32 n = 0; /* general value; init to avoid compiler warning */
	3499	I32 ln = 0; /* len or last; init to avoid compiler warning */
	3500	char *locinput = startpos;
	3501	char pushinput; / where to continue after a PUSH */
	3502	I32 nextchr; /* is always set to UCHARAT(locinput) */
	3503
	3504	bool result = 0; /* return value of S_regmatch */
	3505	int depth = 0; /* depth of backtrack stack */
	3506	U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
	3507	const U32 max_nochange_depth =
	3508	(3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
	3509	3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
	3510	regmatch_state yes_state = NULL; / state to pop to on success of
	3511	subpattern */
	3512	/* mark_state piggy backs on the yes_state logic so that when we unwind
	3513	the stack on success we can update the mark_state as we go */
	3514	regmatch_state mark_state = NULL; / last mark state we have seen */
	3515	regmatch_state cur_eval = NULL; / most recent EVAL_AB state */
	3516	struct regmatch_state cur_curlyx = NULL; / most recent curlyx */
	3517	U32 state_num;
	3518	bool no_final = 0; /* prevent failure from backtracking? */
	3519	bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
	3520	char *startpoint = locinput;
	3521	SV popmark = NULL; / are we looking for a mark? */
	3522	SV sv_commit = NULL; / last mark name seen in failure */
	3523	SV sv_yes_mark = NULL; / last mark name we have seen
	3524	during a successful match */
	3525	U32 lastopen = 0; /* last open we saw */
	3526	bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
	3527	SV* const oreplsv = GvSV(PL_replgv);
	3528	/* these three flags are set by various ops to signal information to
	3529	* the very next op. They have a useful lifetime of exactly one loop
	3530	* iteration, and are not preserved or restored by state pushes/pops
	3531	*/
	3532	bool sw = 0; /* the condition value in (?(cond)a\|b) */
	3533	bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
	3534	int logical = 0; /* the following EVAL is:
	3535	0: (?{...})
	3536	1: (?(?{...})X\|Y)
	3537	2: (??{...})
	3538	or the following IFMATCH/UNLESSM is:
	3539	false: plain (?=foo)
	3540	true: used as a condition: (?(?=foo))
	3541	*/
	3542	PAD* last_pad = NULL;
	3543	dMULTICALL;
	3544	I32 gimme = G_SCALAR;
	3545	CV caller_cv = NULL; / who called us */
	3546	CV last_pushed_cv = NULL; / most recently called (?{}) CV */
	3547	CHECKPOINT runops_cp; /* savestack position before executing EVAL */
	3548	U32 maxopenparen = 0; /* max '(' index seen so far */
	3549	int to_complement; /* Invert the result? */
	3550	_char_class_number classnum;
	3551	bool is_utf8_pat = reginfo->is_utf8_pat;
	3552
	3553	#ifdef DEBUGGING
	3554	GET_RE_DEBUG_FLAGS_DECL;
	3555	#endif
	3556
	3557	/* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
	3558	multicall_oldcatch = 0;
	3559	multicall_cv = NULL;
	3560	cx = NULL;
	3561	PERL_UNUSED_VAR(multicall_cop);
	3562	PERL_UNUSED_VAR(newsp);
	3563
	3564
	3565	PERL_ARGS_ASSERT_REGMATCH;
	3566
	3567	DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
	3568	PerlIO_printf(Perl_debug_log,"regmatch start\n");
	3569	}));
	3570
	3571	st = PL_regmatch_state;
	3572
	3573	/* Note that nextchr is a byte even in UTF */
	3574	SET_nextchr;
	3575	scan = prog;
	3576	while (scan != NULL) {
	3577
	3578	DEBUG_EXECUTE_r( {
	3579	SV * const prop = sv_newmortal();
	3580	regnode *rnext=regnext(scan);
	3581	DUMP_EXEC_POS( locinput, scan, utf8_target );
	3582	regprop(rex, prop, scan);
	3583
	3584	PerlIO_printf(Perl_debug_log,
	3585	"%3"IVdf":%*s%s(%"IVdf")\n",
	3586	(IV)(scan - rexi->program), depth*2, "",
	3587	SvPVX_const(prop),
	3588	(PL_regkind[OP(scan)] == END \|\| !rnext) ?
	3589	0 : (IV)(rnext - rexi->program));
	3590	});
	3591
	3592	next = scan + NEXT_OFF(scan);
	3593	if (next == scan)
	3594	next = NULL;
	3595	state_num = OP(scan);
	3596
	3597	reenter_switch:
	3598	to_complement = 0;
	3599
	3600	SET_nextchr;
	3601	assert(nextchr < 256 && (nextchr >= 0 \|\| nextchr == NEXTCHR_EOS));
	3602
	3603	switch (state_num) {
	3604	case BOL: /* /^../ */
	3605	if (locinput == reginfo->strbeg)
	3606	break;
	3607	sayNO;
	3608
	3609	case MBOL: /* /^../m */
	3610	if (locinput == reginfo->strbeg \|\|
	3611	(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
	3612	{
	3613	break;
	3614	}
	3615	sayNO;
	3616
	3617	case SBOL: /* /^../s */
	3618	if (locinput == reginfo->strbeg)
	3619	break;
	3620	sayNO;
	3621
	3622	case GPOS: /* \G */
	3623	if (locinput == reginfo->ganch)
	3624	break;
	3625	sayNO;
	3626
	3627	case KEEPS: /* \K */
	3628	/* update the startpoint */
	3629	st->u.keeper.val = rex->offs[0].start;
	3630	rex->offs[0].start = locinput - reginfo->strbeg;
	3631	PUSH_STATE_GOTO(KEEPS_next, next, locinput);
	3632	assert(0); /NOTREACHED/
	3633	case KEEPS_next_fail:
	3634	/* rollback the start point change */
	3635	rex->offs[0].start = st->u.keeper.val;
	3636	sayNO_SILENT;
	3637	assert(0); /NOTREACHED/
	3638
	3639	case EOL: /* /..$/ */
	3640	goto seol;
	3641
	3642	case MEOL: /* /..$/m */
	3643	if (!NEXTCHR_IS_EOS && nextchr != '\n')
	3644	sayNO;
	3645	break;
	3646
	3647	case SEOL: /* /..$/s */
	3648	seol:
	3649	if (!NEXTCHR_IS_EOS && nextchr != '\n')
	3650	sayNO;
	3651	if (reginfo->strend - locinput > 1)
	3652	sayNO;
	3653	break;
	3654
	3655	case EOS: /* \z */
	3656	if (!NEXTCHR_IS_EOS)
	3657	sayNO;
	3658	break;
	3659
	3660	case SANY: /* /./s */
	3661	if (NEXTCHR_IS_EOS)
	3662	sayNO;
	3663	goto increment_locinput;
	3664
	3665	case CANY: /* \C */
	3666	if (NEXTCHR_IS_EOS)
	3667	sayNO;
	3668	locinput++;
	3669	break;
	3670
	3671	case REG_ANY: /* /./ */
	3672	if ((NEXTCHR_IS_EOS) \|\| nextchr == '\n')
	3673	sayNO;
	3674	goto increment_locinput;
	3675
	3676
	3677	#undef ST
	3678	#define ST st->u.trie
	3679	case TRIEC: /* (ab\|cd) with known charclass */
	3680	/* In this case the charclass data is available inline so
	3681	we can fail fast without a lot of extra overhead.
	3682	*/
	3683	if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
	3684	DEBUG_EXECUTE_r(
	3685	PerlIO_printf(Perl_debug_log,
	3686	"%*s %sfailed to match trie start class...%s\n",
	3687	REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
	3688	);
	3689	sayNO_SILENT;
	3690	assert(0); /* NOTREACHED */
	3691	}
	3692	/* FALL THROUGH */
	3693	case TRIE: /* (ab\|cd) */
	3694	/* the basic plan of execution of the trie is:
	3695	* At the beginning, run though all the states, and
	3696	* find the longest-matching word. Also remember the position
	3697	* of the shortest matching word. For example, this pattern:
	3698	* 1 2 3 4 5
	3699	* ab\|a\|x\|abcd\|abc
	3700	* when matched against the string "abcde", will generate
	3701	* accept states for all words except 3, with the longest
	3702	* matching word being 4, and the shortest being 2 (with
	3703	* the position being after char 1 of the string).
	3704	*
	3705	* Then for each matching word, in word order (i.e. 1,2,4,5),
	3706	* we run the remainder of the pattern; on each try setting
	3707	* the current position to the character following the word,
	3708	* returning to try the next word on failure.
	3709	*
	3710	* We avoid having to build a list of words at runtime by
	3711	* using a compile-time structure, wordinfo[].prev, which
	3712	* gives, for each word, the previous accepting word (if any).
	3713	* In the case above it would contain the mappings 1->2, 2->0,
	3714	* 3->0, 4->5, 5->1. We can use this table to generate, from
	3715	* the longest word (4 above), a list of all words, by
	3716	* following the list of prev pointers; this gives us the
	3717	* unordered list 4,5,1,2. Then given the current word we have
	3718	* just tried, we can go through the list and find the
	3719	* next-biggest word to try (so if we just failed on word 2,
	3720	* the next in the list is 4).
	3721	*
	3722	* Since at runtime we don't record the matching position in
	3723	* the string for each word, we have to work that out for
	3724	* each word we're about to process. The wordinfo table holds
	3725	* the character length of each word; given that we recorded
	3726	* at the start: the position of the shortest word and its
	3727	* length in chars, we just need to move the pointer the
	3728	* difference between the two char lengths. Depending on
	3729	* Unicode status and folding, that's cheap or expensive.
	3730	*
	3731	* This algorithm is optimised for the case where are only a
	3732	* small number of accept states, i.e. 0,1, or maybe 2.
	3733	* With lots of accepts states, and having to try all of them,
	3734	* it becomes quadratic on number of accept states to find all
	3735	* the next words.
	3736	*/
	3737
	3738	{
	3739	/* what type of TRIE am I? (utf8 makes this contextual) */
	3740	DECL_TRIE_TYPE(scan);
	3741
	3742	/* what trie are we using right now */
	3743	reg_trie_data * const trie
	3744	= (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
	3745	HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
	3746	U32 state = trie->startstate;
	3747
	3748	if ( trie->bitmap
	3749	&& (NEXTCHR_IS_EOS \|\| !TRIE_BITMAP_TEST(trie, nextchr)))
	3750	{
	3751	if (trie->states[ state ].wordnum) {
	3752	DEBUG_EXECUTE_r(
	3753	PerlIO_printf(Perl_debug_log,
	3754	"%*s %smatched empty string...%s\n",
	3755	REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
	3756	);
	3757	if (!trie->jump)
	3758	break;
	3759	} else {
	3760	DEBUG_EXECUTE_r(
	3761	PerlIO_printf(Perl_debug_log,
	3762	"%*s %sfailed to match trie start class...%s\n",
	3763	REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
	3764	);
	3765	sayNO_SILENT;
	3766	}
	3767	}
	3768
	3769	{
	3770	U8 uc = ( U8 )locinput;
	3771
	3772	STRLEN len = 0;
	3773	STRLEN foldlen = 0;
	3774	U8 uscan = (U8)NULL;
	3775	U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
	3776	U32 charcount = 0; /* how many input chars we have matched */
	3777	U32 accepted = 0; /* have we seen any accepting states? */
	3778
	3779	ST.jump = trie->jump;
	3780	ST.me = scan;
	3781	ST.firstpos = NULL;
	3782	ST.longfold = FALSE; /* char longer if folded => it's harder */
	3783	ST.nextword = 0;
	3784
	3785	/* fully traverse the TRIE; note the position of the
	3786	shortest accept state and the wordnum of the longest
	3787	accept state */
	3788
	3789	while ( state && uc <= (U8*)(reginfo->strend) ) {
	3790	U32 base = trie->states[ state ].trans.base;
	3791	UV uvc = 0;
	3792	U16 charid = 0;
	3793	U16 wordnum;
	3794	wordnum = trie->states[ state ].wordnum;
	3795
	3796	if (wordnum) { /* it's an accept state */
	3797	if (!accepted) {
	3798	accepted = 1;
	3799	/* record first match position */
	3800	if (ST.longfold) {
	3801	ST.firstpos = (U8*)locinput;
	3802	ST.firstchars = 0;
	3803	}
	3804	else {
	3805	ST.firstpos = uc;
	3806	ST.firstchars = charcount;
	3807	}
	3808	}
	3809	if (!ST.nextword \|\| wordnum < ST.nextword)
	3810	ST.nextword = wordnum;
	3811	ST.topword = wordnum;
	3812	}
	3813
	3814	DEBUG_TRIE_EXECUTE_r({
	3815	DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
	3816	PerlIO_printf( Perl_debug_log,
	3817	"%*s %sState: %4"UVxf" Accepted: %c ",
	3818	2+depth * 2, "", PL_colors[4],
	3819	(UV)state, (accepted ? 'Y' : 'N'));
	3820	});
	3821
	3822	/* read a char and goto next state */
	3823	if ( base && (foldlen \|\| uc < (U8*)(reginfo->strend))) {
	3824	I32 offset;
	3825	REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
	3826	uscan, len, uvc, charid, foldlen,
	3827	foldbuf, uniflags);
	3828	charcount++;
	3829	if (foldlen>0)
	3830	ST.longfold = TRUE;
	3831	if (charid &&
	3832	( ((offset =
	3833	base + charid - 1 - trie->uniquecharcount)) >= 0)
	3834
	3835	&& ((U32)offset < trie->lasttrans)
	3836	&& trie->trans[offset].check == state)
	3837	{
	3838	state = trie->trans[offset].next;
	3839	}
	3840	else {
	3841	state = 0;
	3842	}
	3843	uc += len;
	3844
	3845	}
	3846	else {
	3847	state = 0;
	3848	}
	3849	DEBUG_TRIE_EXECUTE_r(
	3850	PerlIO_printf( Perl_debug_log,
	3851	"Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
	3852	charid, uvc, (UV)state, PL_colors[5] );
	3853	);
	3854	}
	3855	if (!accepted)
	3856	sayNO;
	3857
	3858	/* calculate total number of accept states */
	3859	{
	3860	U16 w = ST.topword;
	3861	accepted = 0;
	3862	while (w) {
	3863	w = trie->wordinfo[w].prev;
	3864	accepted++;
	3865	}
	3866	ST.accepted = accepted;
	3867	}
	3868
	3869	DEBUG_EXECUTE_r(
	3870	PerlIO_printf( Perl_debug_log,
	3871	"%*s %sgot %"IVdf" possible matches%s\n",
	3872	REPORT_CODE_OFF + depth * 2, "",
	3873	PL_colors[4], (IV)ST.accepted, PL_colors[5] );
	3874	);
	3875	goto trie_first_try; /* jump into the fail handler */
	3876	}}
	3877	assert(0); /* NOTREACHED */
	3878
	3879	case TRIE_next_fail: /* we failed - try next alternative */
	3880	{
	3881	U8 *uc;
	3882	if ( ST.jump) {
	3883	REGCP_UNWIND(ST.cp);
	3884	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	3885	}
	3886	if (!--ST.accepted) {
	3887	DEBUG_EXECUTE_r({
	3888	PerlIO_printf( Perl_debug_log,
	3889	"%*s %sTRIE failed...%s\n",
	3890	REPORT_CODE_OFF+depth*2, "",
	3891	PL_colors[4],
	3892	PL_colors[5] );
	3893	});
	3894	sayNO_SILENT;
	3895	}
	3896	{
	3897	/* Find next-highest word to process. Note that this code
	3898	* is O(N^2) per trie run (O(N) per branch), so keep tight */
	3899	U16 min = 0;
	3900	U16 word;
	3901	U16 const nextword = ST.nextword;
	3902	reg_trie_wordinfo * const wordinfo
	3903	= ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
	3904	for (word=ST.topword; word; word=wordinfo[word].prev) {
	3905	if (word > nextword && (!min \|\| word < min))
	3906	min = word;
	3907	}
	3908	ST.nextword = min;
	3909	}
	3910
	3911	trie_first_try:
	3912	if (do_cutgroup) {
	3913	do_cutgroup = 0;
	3914	no_final = 0;
	3915	}
	3916
	3917	if ( ST.jump) {
	3918	ST.lastparen = rex->lastparen;
	3919	ST.lastcloseparen = rex->lastcloseparen;
	3920	REGCP_SET(ST.cp);
	3921	}
	3922
	3923	/* find start char of end of current word */
	3924	{
	3925	U32 chars; /* how many chars to skip */
	3926	reg_trie_data * const trie
	3927	= (reg_trie_data*)rexi->data->data[ARG(ST.me)];
	3928
	3929	assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
	3930	>= ST.firstchars);
	3931	chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
	3932	- ST.firstchars;
	3933	uc = ST.firstpos;
	3934
	3935	if (ST.longfold) {
	3936	/* the hard option - fold each char in turn and find
	3937	* its folded length (which may be different */
	3938	U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
	3939	STRLEN foldlen;
	3940	STRLEN len;
	3941	UV uvc;
	3942	U8 *uscan;
	3943
	3944	while (chars) {
	3945	if (utf8_target) {
	3946	uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
	3947	uniflags);
	3948	uc += len;
	3949	}
	3950	else {
	3951	uvc = *uc;
	3952	uc++;
	3953	}
	3954	uvc = to_uni_fold(uvc, foldbuf, &foldlen);
	3955	uscan = foldbuf;
	3956	while (foldlen) {
	3957	if (!--chars)
	3958	break;
	3959	uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
	3960	uniflags);
	3961	uscan += len;
	3962	foldlen -= len;
	3963	}
	3964	}
	3965	}
	3966	else {
	3967	if (utf8_target)
	3968	while (chars--)
	3969	uc += UTF8SKIP(uc);
	3970	else
	3971	uc += chars;
	3972	}
	3973	}
	3974
	3975	scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
	3976	? ST.jump[ST.nextword]
	3977	: NEXT_OFF(ST.me));
	3978
	3979	DEBUG_EXECUTE_r({
	3980	PerlIO_printf( Perl_debug_log,
	3981	"%*s %sTRIE matched word #%d, continuing%s\n",
	3982	REPORT_CODE_OFF+depth*2, "",
	3983	PL_colors[4],
	3984	ST.nextword,
	3985	PL_colors[5]
	3986	);
	3987	});
	3988
	3989	if (ST.accepted > 1 \|\| has_cutgroup) {
	3990	PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
	3991	assert(0); /* NOTREACHED */
	3992	}
	3993	/* only one choice left - just continue */
	3994	DEBUG_EXECUTE_r({
	3995	AV *const trie_words
	3996	= MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
	3997	SV ** const tmp = av_fetch( trie_words,
	3998	ST.nextword-1, 0 );
	3999	SV *sv= tmp ? sv_newmortal() : NULL;
	4000
	4001	PerlIO_printf( Perl_debug_log,
	4002	"%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
	4003	REPORT_CODE_OFF+depth*2, "", PL_colors[4],
	4004	ST.nextword,
	4005	tmp ? pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 0,
	4006	PL_colors[0], PL_colors[1],
	4007	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)\|PERL_PV_ESCAPE_NONASCII
	4008	)
	4009	: "not compiled under -Dr",
	4010	PL_colors[5] );
	4011	});
	4012
	4013	locinput = (char*)uc;
	4014	continue; /* execute rest of RE */
	4015	assert(0); /* NOTREACHED */
	4016	}
	4017	#undef ST
	4018
	4019	case EXACT: { /* /abc/ */
	4020	char *s = STRING(scan);
	4021	ln = STR_LEN(scan);
	4022	if (utf8_target != is_utf8_pat) {
	4023	/* The target and the pattern have differing utf8ness. */
	4024	char *l = locinput;
	4025	const char * const e = s + ln;
	4026
	4027	if (utf8_target) {
	4028	/* The target is utf8, the pattern is not utf8.
	4029	* Above-Latin1 code points can't match the pattern;
	4030	* invariants match exactly, and the other Latin1 ones need
	4031	* to be downgraded to a single byte in order to do the
	4032	* comparison. (If we could be confident that the target
	4033	* is not malformed, this could be refactored to have fewer
	4034	* tests by just assuming that if the first bytes match, it
	4035	* is an invariant, but there are tests in the test suite
	4036	* dealing with (??{...}) which violate this) */
	4037	while (s < e) {
	4038	if (l >= reginfo->strend
	4039	\|\| UTF8_IS_ABOVE_LATIN1(* (U8*) l))
	4040	{
	4041	sayNO;
	4042	}
	4043	if (UTF8_IS_INVARIANT((U8)l)) {
	4044	if (l != s) {
	4045	sayNO;
	4046	}
	4047	l++;
	4048	}
	4049	else {
	4050	if (TWO_BYTE_UTF8_TO_UNI(l, (l+1)) != * (U8*) s) {
	4051	sayNO;
	4052	}
	4053	l += 2;
	4054	}
	4055	s++;
	4056	}
	4057	}
	4058	else {
	4059	/* The target is not utf8, the pattern is utf8. */
	4060	while (s < e) {
	4061	if (l >= reginfo->strend
	4062	\|\| UTF8_IS_ABOVE_LATIN1(* (U8*) s))
	4063	{
	4064	sayNO;
	4065	}
	4066	if (UTF8_IS_INVARIANT((U8)s)) {
	4067	if (s != l) {
	4068	sayNO;
	4069	}
	4070	s++;
	4071	}
	4072	else {
	4073	if (TWO_BYTE_UTF8_TO_UNI(s, (s+1)) != * (U8*) l) {
	4074	sayNO;
	4075	}
	4076	s += 2;
	4077	}
	4078	l++;
	4079	}
	4080	}
	4081	locinput = l;
	4082	}
	4083	else {
	4084	/* The target and the pattern have the same utf8ness. */
	4085	/* Inline the first character, for speed. */
	4086	if (reginfo->strend - locinput < ln
	4087	\|\| UCHARAT(s) != nextchr
	4088	\|\| (ln > 1 && memNE(s, locinput, ln)))
	4089	{
	4090	sayNO;
	4091	}
	4092	locinput += ln;
	4093	}
	4094	break;
	4095	}
	4096
	4097	case EXACTFL: { /* /abc/il */
	4098	re_fold_t folder;
	4099	const U8 * fold_array;
	4100	const char * s;
	4101	U32 fold_utf8_flags;
	4102
	4103	RX_MATCH_TAINTED_on(reginfo->prog);
	4104	folder = foldEQ_locale;
	4105	fold_array = PL_fold_locale;
	4106	fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
	4107	goto do_exactf;
	4108
	4109	case EXACTFU_SS: /* /\x{df}/iu */
	4110	case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
	4111	case EXACTFU: /* /abc/iu */
	4112	folder = foldEQ_latin1;
	4113	fold_array = PL_fold_latin1;
	4114	fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
	4115	goto do_exactf;
	4116
	4117	case EXACTFA: /* /abc/iaa */
	4118	folder = foldEQ_latin1;
	4119	fold_array = PL_fold_latin1;
	4120	fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
	4121	goto do_exactf;
	4122
	4123	case EXACTF: /* /abc/i */
	4124	folder = foldEQ;
	4125	fold_array = PL_fold;
	4126	fold_utf8_flags = 0;
	4127
	4128	do_exactf:
	4129	s = STRING(scan);
	4130	ln = STR_LEN(scan);
	4131
	4132	if (utf8_target \|\| is_utf8_pat \|\| state_num == EXACTFU_SS) {
	4133	/* Either target or the pattern are utf8, or has the issue where
	4134	* the fold lengths may differ. */
	4135	const char * const l = locinput;
	4136	char *e = reginfo->strend;
	4137
	4138	if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
	4139	l, &e, 0, utf8_target, fold_utf8_flags))
	4140	{
	4141	sayNO;
	4142	}
	4143	locinput = e;
	4144	break;
	4145	}
	4146
	4147	/* Neither the target nor the pattern are utf8 */
	4148	if (UCHARAT(s) != nextchr
	4149	&& !NEXTCHR_IS_EOS
	4150	&& UCHARAT(s) != fold_array[nextchr])
	4151	{
	4152	sayNO;
	4153	}
	4154	if (reginfo->strend - locinput < ln)
	4155	sayNO;
	4156	if (ln > 1 && ! folder(s, locinput, ln))
	4157	sayNO;
	4158	locinput += ln;
	4159	break;
	4160	}
	4161
	4162	/* XXX Could improve efficiency by separating these all out using a
	4163	* macro or in-line function. At that point regcomp.c would no longer
	4164	* have to set the FLAGS fields of these */
	4165	case BOUNDL: /* /\b/l */
	4166	case NBOUNDL: /* /\B/l */
	4167	RX_MATCH_TAINTED_on(reginfo->prog);
	4168	/* FALL THROUGH */
	4169	case BOUND: /* /\b/ */
	4170	case BOUNDU: /* /\b/u */
	4171	case BOUNDA: /* /\b/a */
	4172	case NBOUND: /* /\B/ */
	4173	case NBOUNDU: /* /\B/u */
	4174	case NBOUNDA: /* /\B/a */
	4175	/* was last char in word? */
	4176	if (utf8_target
	4177	&& FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
	4178	&& FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
	4179	{
	4180	if (locinput == reginfo->strbeg)
	4181	ln = '\n';
	4182	else {
	4183	const U8 * const r =
	4184	reghop3((U8)locinput, -1, (U8)(reginfo->strbeg));
	4185
	4186	ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
	4187	}
	4188	if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
	4189	ln = isWORDCHAR_uni(ln);
	4190	if (NEXTCHR_IS_EOS)
	4191	n = 0;
	4192	else {
	4193	LOAD_UTF8_CHARCLASS_ALNUM();
	4194	n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
	4195	utf8_target);
	4196	}
	4197	}
	4198	else {
	4199	ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
	4200	n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
	4201	}
	4202	}
	4203	else {
	4204
	4205	/* Here the string isn't utf8, or is utf8 and only ascii
	4206	* characters are to match \w. In the latter case looking at
	4207	* the byte just prior to the current one may be just the final
	4208	* byte of a multi-byte character. This is ok. There are two
	4209	* cases:
	4210	* 1) it is a single byte character, and then the test is doing
	4211	* just what it's supposed to.
	4212	* 2) it is a multi-byte character, in which case the final
	4213	* byte is never mistakable for ASCII, and so the test
	4214	* will say it is not a word character, which is the
	4215	* correct answer. */
	4216	ln = (locinput != reginfo->strbeg) ?
	4217	UCHARAT(locinput - 1) : '\n';
	4218	switch (FLAGS(scan)) {
	4219	case REGEX_UNICODE_CHARSET:
	4220	ln = isWORDCHAR_L1(ln);
	4221	n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
	4222	break;
	4223	case REGEX_LOCALE_CHARSET:
	4224	ln = isWORDCHAR_LC(ln);
	4225	n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
	4226	break;
	4227	case REGEX_DEPENDS_CHARSET:
	4228	ln = isWORDCHAR(ln);
	4229	n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
	4230	break;
	4231	case REGEX_ASCII_RESTRICTED_CHARSET:
	4232	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	4233	ln = isWORDCHAR_A(ln);
	4234	n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
	4235	break;
	4236	default:
	4237	Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
	4238	break;
	4239	}
	4240	}
	4241	/* Note requires that all BOUNDs be lower than all NBOUNDs in
	4242	* regcomp.sym */
	4243	if (((!ln) == (!n)) == (OP(scan) < NBOUND))
	4244	sayNO;
	4245	break;
	4246
	4247	case ANYOF: /* /[abc]/ */
	4248	case ANYOF_WARN_SUPER:
	4249	if (NEXTCHR_IS_EOS)
	4250	sayNO;
	4251	if (utf8_target) {
	4252	if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
	4253	sayNO;
	4254	locinput += UTF8SKIP(locinput);
	4255	}
	4256	else {
	4257	if (!REGINCLASS(rex, scan, (U8*)locinput))
	4258	sayNO;
	4259	locinput++;
	4260	}
	4261	break;
	4262
	4263	/* The argument (FLAGS) to all the POSIX node types is the class number
	4264	* */
	4265
	4266	case NPOSIXL: /* \W or [:^punct:] etc. under /l */
	4267	to_complement = 1;
	4268	/* FALLTHROUGH */
	4269
	4270	case POSIXL: /* \w or [:punct:] etc. under /l */
	4271	if (NEXTCHR_IS_EOS)
	4272	sayNO;
	4273
	4274	/* The locale hasn't influenced the outcome before this, so defer
	4275	* tainting until now */
	4276	RX_MATCH_TAINTED_on(reginfo->prog);
	4277
	4278	/* Use isFOO_lc() for characters within Latin1. (Note that
	4279	* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
	4280	* wouldn't be invariant) */
	4281	if (UTF8_IS_INVARIANT(nextchr) \|\| ! utf8_target) {
	4282	if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
	4283	sayNO;
	4284	}
	4285	}
	4286	else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
	4287	if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
	4288	(U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
	4289	*(locinput + 1))))))
	4290	{
	4291	sayNO;
	4292	}
	4293	}
	4294	else { /* Here, must be an above Latin-1 code point */
	4295	goto utf8_posix_not_eos;
	4296	}
	4297
	4298	/* Here, must be utf8 */
	4299	locinput += UTF8SKIP(locinput);
	4300	break;
	4301
	4302	case NPOSIXD: /* \W or [:^punct:] etc. under /d */
	4303	to_complement = 1;
	4304	/* FALLTHROUGH */
	4305
	4306	case POSIXD: /* \w or [:punct:] etc. under /d */
	4307	if (utf8_target) {
	4308	goto utf8_posix;
	4309	}
	4310	goto posixa;
	4311
	4312	case NPOSIXA: /* \W or [:^punct:] etc. under /a */
	4313
	4314	if (NEXTCHR_IS_EOS) {
	4315	sayNO;
	4316	}
	4317
	4318	/* All UTF-8 variants match */
	4319	if (! UTF8_IS_INVARIANT(nextchr)) {
	4320	goto increment_locinput;
	4321	}
	4322
	4323	to_complement = 1;
	4324	/* FALLTHROUGH */
	4325
	4326	case POSIXA: /* \w or [:punct:] etc. under /a */
	4327
	4328	posixa:
	4329	/* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
	4330	* UTF-8, and also from NPOSIXA even in UTF-8 when the current
	4331	* character is a single byte */
	4332
	4333	if (NEXTCHR_IS_EOS
	4334	\|\| ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
	4335	FLAGS(scan)))))
	4336	{
	4337	sayNO;
	4338	}
	4339
	4340	/* Here we are either not in utf8, or we matched a utf8-invariant,
	4341	* so the next char is the next byte */
	4342	locinput++;
	4343	break;
	4344
	4345	case NPOSIXU: /* \W or [:^punct:] etc. under /u */
	4346	to_complement = 1;
	4347	/* FALLTHROUGH */
	4348
	4349	case POSIXU: /* \w or [:punct:] etc. under /u */
	4350	utf8_posix:
	4351	if (NEXTCHR_IS_EOS) {
	4352	sayNO;
	4353	}
	4354	utf8_posix_not_eos:
	4355
	4356	/* Use _generic_isCC() for characters within Latin1. (Note that
	4357	* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
	4358	* wouldn't be invariant) */
	4359	if (UTF8_IS_INVARIANT(nextchr) \|\| ! utf8_target) {
	4360	if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
	4361	FLAGS(scan)))))
	4362	{
	4363	sayNO;
	4364	}
	4365	locinput++;
	4366	}
	4367	else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
	4368	if (! (to_complement
	4369	^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
	4370	*(locinput + 1)),
	4371	FLAGS(scan)))))
	4372	{
	4373	sayNO;
	4374	}
	4375	locinput += 2;
	4376	}
	4377	else { /* Handle above Latin-1 code points */
	4378	classnum = (_char_class_number) FLAGS(scan);
	4379	if (classnum < _FIRST_NON_SWASH_CC) {
	4380
	4381	/* Here, uses a swash to find such code points. Load if if
	4382	* not done already */
	4383	if (! PL_utf8_swash_ptrs[classnum]) {
	4384	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	4385	PL_utf8_swash_ptrs[classnum]
	4386	= _core_swash_init("utf8",
	4387	swash_property_names[classnum],
	4388	&PL_sv_undef, 1, 0, NULL, &flags);
	4389	}
	4390	if (! (to_complement
	4391	^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
	4392	(U8 *) locinput, TRUE))))
	4393	{
	4394	sayNO;
	4395	}
	4396	}
	4397	else { /* Here, uses macros to find above Latin-1 code points */
	4398	switch (classnum) {
	4399	case _CC_ENUM_SPACE: /* XXX would require separate
	4400	code if we revert the change
	4401	of \v matching this */
	4402	case _CC_ENUM_PSXSPC:
	4403	if (! (to_complement
	4404	^ cBOOL(is_XPERLSPACE_high(locinput))))
	4405	{
	4406	sayNO;
	4407	}
	4408	break;
	4409	case _CC_ENUM_BLANK:
	4410	if (! (to_complement
	4411	^ cBOOL(is_HORIZWS_high(locinput))))
	4412	{
	4413	sayNO;
	4414	}
	4415	break;
	4416	case _CC_ENUM_XDIGIT:
	4417	if (! (to_complement
	4418	^ cBOOL(is_XDIGIT_high(locinput))))
	4419	{
	4420	sayNO;
	4421	}
	4422	break;
	4423	case _CC_ENUM_VERTSPACE:
	4424	if (! (to_complement
	4425	^ cBOOL(is_VERTWS_high(locinput))))
	4426	{
	4427	sayNO;
	4428	}
	4429	break;
	4430	default: /* The rest, e.g. [:cntrl:], can't match
	4431	above Latin1 */
	4432	if (! to_complement) {
	4433	sayNO;
	4434	}
	4435	break;
	4436	}
	4437	}
	4438	locinput += UTF8SKIP(locinput);
	4439	}
	4440	break;
	4441
	4442	case CLUMP: /* Match \X: logical Unicode character. This is defined as
	4443	a Unicode extended Grapheme Cluster */
	4444	/* From http://www.unicode.org/reports/tr29 (5.2 version). An
	4445	extended Grapheme Cluster is:
	4446
	4447	CR LF
	4448	\| Prepend* Begin Extend*
	4449	\| .
	4450
	4451	Begin is: ( Special_Begin \| ! Control )
	4452	Special_Begin is: ( Regional-Indicator+ \| Hangul-syllable )
	4453	Extend is: ( Grapheme_Extend \| Spacing_Mark )
	4454	Control is: [ GCB_Control \| CR \| LF ]
	4455	Hangul-syllable is: ( T+ \| ( L* ( L \| ( LVT \| ( V \| LV ) V* ) T* ) ))
	4456
	4457	If we create a 'Regular_Begin' = Begin - Special_Begin, then
	4458	we can rewrite
	4459
	4460	Begin is ( Regular_Begin + Special Begin )
	4461
	4462	It turns out that 98.4% of all Unicode code points match
	4463	Regular_Begin. Doing it this way eliminates a table match in
	4464	the previous implementation for almost all Unicode code points.
	4465
	4466	There is a subtlety with Prepend* which showed up in testing.
	4467	Note that the Begin, and only the Begin is required in:
	4468	\| Prepend* Begin Extend*
	4469	Also, Begin contains '! Control'. A Prepend must be a
	4470	'! Control', which means it must also be a Begin. What it
	4471	comes down to is that if we match Prepend* and then find no
	4472	suitable Begin afterwards, that if we backtrack the last
	4473	Prepend, that one will be a suitable Begin.
	4474	*/
	4475
	4476	if (NEXTCHR_IS_EOS)
	4477	sayNO;
	4478	if (! utf8_target) {
	4479
	4480	/* Match either CR LF or '.', as all the other possibilities
	4481	* require utf8 */
	4482	locinput++; /* Match the . or CR */
	4483	if (nextchr == '\r' /* And if it was CR, and the next is LF,
	4484	match the LF */
	4485	&& locinput < reginfo->strend
	4486	&& UCHARAT(locinput) == '\n')
	4487	{
	4488	locinput++;
	4489	}
	4490	}
	4491	else {
	4492
	4493	/* Utf8: See if is ( CR LF ); already know that locinput <
	4494	* reginfo->strend, so locinput+1 is in bounds */
	4495	if ( nextchr == '\r' && locinput+1 < reginfo->strend
	4496	&& UCHARAT(locinput + 1) == '\n')
	4497	{
	4498	locinput += 2;
	4499	}
	4500	else {
	4501	STRLEN len;
	4502
	4503	/* In case have to backtrack to beginning, then match '.' */
	4504	char *starting = locinput;
	4505
	4506	/* In case have to backtrack the last prepend */
	4507	char *previous_prepend = NULL;
	4508
	4509	LOAD_UTF8_CHARCLASS_GCB();
	4510
	4511	/* Match (prepend)* */
	4512	while (locinput < reginfo->strend
	4513	&& (len = is_GCB_Prepend_utf8(locinput)))
	4514	{
	4515	previous_prepend = locinput;
	4516	locinput += len;
	4517	}
	4518
	4519	/* As noted above, if we matched a prepend character, but
	4520	* the next thing won't match, back off the last prepend we
	4521	* matched, as it is guaranteed to match the begin */
	4522	if (previous_prepend
	4523	&& (locinput >= reginfo->strend
	4524	\|\| (! swash_fetch(PL_utf8_X_regular_begin,
	4525	(U8*)locinput, utf8_target)
	4526	&& ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
	4527	)
	4528	{
	4529	locinput = previous_prepend;
	4530	}
	4531
	4532	/* Note that here we know reginfo->strend > locinput, as we
	4533	* tested that upon input to this switch case, and if we
	4534	* moved locinput forward, we tested the result just above
	4535	* and it either passed, or we backed off so that it will
	4536	* now pass */
	4537	if (swash_fetch(PL_utf8_X_regular_begin,
	4538	(U8*)locinput, utf8_target)) {
	4539	locinput += UTF8SKIP(locinput);
	4540	}
	4541	else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
	4542
	4543	/* Here did not match the required 'Begin' in the
	4544	* second term. So just match the very first
	4545	* character, the '.' of the final term of the regex */
	4546	locinput = starting + UTF8SKIP(starting);
	4547	goto exit_utf8;
	4548	} else {
	4549
	4550	/* Here is a special begin. It can be composed of
	4551	* several individual characters. One possibility is
	4552	* RI+ */
	4553	if ((len = is_GCB_RI_utf8(locinput))) {
	4554	locinput += len;
	4555	while (locinput < reginfo->strend
	4556	&& (len = is_GCB_RI_utf8(locinput)))
	4557	{
	4558	locinput += len;
	4559	}
	4560	} else if ((len = is_GCB_T_utf8(locinput))) {
	4561	/* Another possibility is T+ */
	4562	locinput += len;
	4563	while (locinput < reginfo->strend
	4564	&& (len = is_GCB_T_utf8(locinput)))
	4565	{
	4566	locinput += len;
	4567	}
	4568	} else {
	4569
	4570	/* Here, neither RI+ nor T+; must be some other
	4571	* Hangul. That means it is one of the others: L,
	4572	* LV, LVT or V, and matches:
	4573	* L* (L \| LVT T* \| V * V* T* \| LV V* T) /
	4574
	4575	/* Match L* */
	4576	while (locinput < reginfo->strend
	4577	&& (len = is_GCB_L_utf8(locinput)))
	4578	{
	4579	locinput += len;
	4580	}
	4581
	4582	/* Here, have exhausted L*. If the next character
	4583	* is not an LV, LVT nor V, it means we had to have
	4584	* at least one L, so matches L+ in the original
	4585	* equation, we have a complete hangul syllable.
	4586	* Are done. */
	4587
	4588	if (locinput < reginfo->strend
	4589	&& is_GCB_LV_LVT_V_utf8(locinput))
	4590	{
	4591	/* Otherwise keep going. Must be LV, LVT or V.
	4592	* See if LVT, by first ruling out V, then LV */
	4593	if (! is_GCB_V_utf8(locinput)
	4594	/* All but every TCount one is LV */
	4595	&& (valid_utf8_to_uvchr((U8 *) locinput,
	4596	NULL)
	4597	- SBASE)
	4598	% TCount != 0)
	4599	{
	4600	locinput += UTF8SKIP(locinput);
	4601	} else {
	4602
	4603	/* Must be V or LV. Take it, then match
	4604	* V* */
	4605	locinput += UTF8SKIP(locinput);
	4606	while (locinput < reginfo->strend
	4607	&& (len = is_GCB_V_utf8(locinput)))
	4608	{
	4609	locinput += len;
	4610	}
	4611	}
	4612
	4613	/* And any of LV, LVT, or V can be followed
	4614	* by T* */
	4615	while (locinput < reginfo->strend
	4616	&& (len = is_GCB_T_utf8(locinput)))
	4617	{
	4618	locinput += len;
	4619	}
	4620	}
	4621	}
	4622	}
	4623
	4624	/* Match any extender */
	4625	while (locinput < reginfo->strend
	4626	&& swash_fetch(PL_utf8_X_extend,
	4627	(U8*)locinput, utf8_target))
	4628	{
	4629	locinput += UTF8SKIP(locinput);
	4630	}
	4631	}
	4632	exit_utf8:
	4633	if (locinput > reginfo->strend) sayNO;
	4634	}
	4635	break;
	4636
	4637	case NREFFL: /* /\g{name}/il */
	4638	{ /* The capture buffer cases. The ones beginning with N for the
	4639	named buffers just convert to the equivalent numbered and
	4640	pretend they were called as the corresponding numbered buffer
	4641	op. */
	4642	/* don't initialize these in the declaration, it makes C++
	4643	unhappy */
	4644	const char *s;
	4645	char type;
	4646	re_fold_t folder;
	4647	const U8 *fold_array;
	4648	UV utf8_fold_flags;
	4649
	4650	RX_MATCH_TAINTED_on(reginfo->prog);
	4651	folder = foldEQ_locale;
	4652	fold_array = PL_fold_locale;
	4653	type = REFFL;
	4654	utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
	4655	goto do_nref;
	4656
	4657	case NREFFA: /* /\g{name}/iaa */
	4658	folder = foldEQ_latin1;
	4659	fold_array = PL_fold_latin1;
	4660	type = REFFA;
	4661	utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
	4662	goto do_nref;
	4663
	4664	case NREFFU: /* /\g{name}/iu */
	4665	folder = foldEQ_latin1;
	4666	fold_array = PL_fold_latin1;
	4667	type = REFFU;
	4668	utf8_fold_flags = 0;
	4669	goto do_nref;
	4670
	4671	case NREFF: /* /\g{name}/i */
	4672	folder = foldEQ;
	4673	fold_array = PL_fold;
	4674	type = REFF;
	4675	utf8_fold_flags = 0;
	4676	goto do_nref;
	4677
	4678	case NREF: /* /\g{name}/ */
	4679	type = REF;
	4680	folder = NULL;
	4681	fold_array = NULL;
	4682	utf8_fold_flags = 0;
	4683	do_nref:
	4684
	4685	/* For the named back references, find the corresponding buffer
	4686	* number */
	4687	n = reg_check_named_buff_matched(rex,scan);
	4688
	4689	if ( ! n ) {
	4690	sayNO;
	4691	}
	4692	goto do_nref_ref_common;
	4693
	4694	case REFFL: /* /\1/il */
	4695	RX_MATCH_TAINTED_on(reginfo->prog);
	4696	folder = foldEQ_locale;
	4697	fold_array = PL_fold_locale;
	4698	utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
	4699	goto do_ref;
	4700
	4701	case REFFA: /* /\1/iaa */
	4702	folder = foldEQ_latin1;
	4703	fold_array = PL_fold_latin1;
	4704	utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
	4705	goto do_ref;
	4706
	4707	case REFFU: /* /\1/iu */
	4708	folder = foldEQ_latin1;
	4709	fold_array = PL_fold_latin1;
	4710	utf8_fold_flags = 0;
	4711	goto do_ref;
	4712
	4713	case REFF: /* /\1/i */
	4714	folder = foldEQ;
	4715	fold_array = PL_fold;
	4716	utf8_fold_flags = 0;
	4717	goto do_ref;
	4718
	4719	case REF: /* /\1/ */
	4720	folder = NULL;
	4721	fold_array = NULL;
	4722	utf8_fold_flags = 0;
	4723
	4724	do_ref:
	4725	type = OP(scan);
	4726	n = ARG(scan); /* which paren pair */
	4727
	4728	do_nref_ref_common:
	4729	ln = rex->offs[n].start;
	4730	reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
	4731	if (rex->lastparen < n \|\| ln == -1)
	4732	sayNO; /* Do not match unless seen CLOSEn. */
	4733	if (ln == rex->offs[n].end)
	4734	break;
	4735
	4736	s = reginfo->strbeg + ln;
	4737	if (type != REF /* REF can do byte comparison */
	4738	&& (utf8_target \|\| type == REFFU))
	4739	{ /* XXX handle REFFL better */
	4740	char * limit = reginfo->strend;
	4741
	4742	/* This call case insensitively compares the entire buffer
	4743	* at s, with the current input starting at locinput, but
	4744	* not going off the end given by reginfo->strend, and
	4745	* returns in <limit> upon success, how much of the
	4746	* current input was matched */
	4747	if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
	4748	locinput, &limit, 0, utf8_target, utf8_fold_flags))
	4749	{
	4750	sayNO;
	4751	}
	4752	locinput = limit;
	4753	break;
	4754	}
	4755
	4756	/* Not utf8: Inline the first character, for speed. */
	4757	if (!NEXTCHR_IS_EOS &&
	4758	UCHARAT(s) != nextchr &&
	4759	(type == REF \|\|
	4760	UCHARAT(s) != fold_array[nextchr]))
	4761	sayNO;
	4762	ln = rex->offs[n].end - ln;
	4763	if (locinput + ln > reginfo->strend)
	4764	sayNO;
	4765	if (ln > 1 && (type == REF
	4766	? memNE(s, locinput, ln)
	4767	: ! folder(s, locinput, ln)))
	4768	sayNO;
	4769	locinput += ln;
	4770	break;
	4771	}
	4772
	4773	case NOTHING: /* null op; e.g. the 'nothing' following
	4774	* the '' in m{(a+\|b)}' */
	4775	break;
	4776	case TAIL: /* placeholder while compiling (A\|B\|C) */
	4777	break;
	4778
	4779	case BACK: /* ??? doesn't appear to be used ??? */
	4780	break;
	4781
	4782	#undef ST
	4783	#define ST st->u.eval
	4784	{
	4785	SV *ret;
	4786	REGEXP *re_sv;
	4787	regexp *re;
	4788	regexp_internal *rei;
	4789	regnode *startpoint;
	4790
	4791	case GOSTART: /* (?R) */
	4792	case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
	4793	if (cur_eval && cur_eval->locinput==locinput) {
	4794	if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
	4795	Perl_croak(aTHX_ "Infinite recursion in regex");
	4796	if ( ++nochange_depth > max_nochange_depth )
	4797	Perl_croak(aTHX_
	4798	"Pattern subroutine nesting without pos change"
	4799	" exceeded limit in regex");
	4800	} else {
	4801	nochange_depth = 0;
	4802	}
	4803	re_sv = rex_sv;
	4804	re = rex;
	4805	rei = rexi;
	4806	if (OP(scan)==GOSUB) {
	4807	startpoint = scan + ARG2L(scan);
	4808	ST.close_paren = ARG(scan);
	4809	} else {
	4810	startpoint = rei->program+1;
	4811	ST.close_paren = 0;
	4812	}
	4813	goto eval_recurse_doit;
	4814	assert(0); /* NOTREACHED */
	4815
	4816	case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X\|Y)B/ */
	4817	if (cur_eval && cur_eval->locinput==locinput) {
	4818	if ( ++nochange_depth > max_nochange_depth )
	4819	Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
	4820	} else {
	4821	nochange_depth = 0;
	4822	}
	4823	{
	4824	/* execute the code in the {...} */
	4825
	4826	dSP;
	4827	IV before;
	4828	OP * const oop = PL_op;
	4829	COP * const ocurcop = PL_curcop;
	4830	OP *nop;
	4831	CV *newcv;
	4832
	4833	/* save all paren positions */
	4834	regcppush(rex, 0, maxopenparen);
	4835	REGCP_SET(runops_cp);
	4836
	4837	if (!caller_cv)
	4838	caller_cv = find_runcv(NULL);
	4839
	4840	n = ARG(scan);
	4841
	4842	if (rexi->data->what[n] == 'r') { /* code from an external qr */
	4843	newcv = (ReANY(
	4844	(REGEXP*)(rexi->data->data[n])
	4845	))->qr_anoncv
	4846	;
	4847	nop = (OP*)rexi->data->data[n+1];
	4848	}
	4849	else if (rexi->data->what[n] == 'l') { /* literal code */
	4850	newcv = caller_cv;
	4851	nop = (OP*)rexi->data->data[n];
	4852	assert(CvDEPTH(newcv));
	4853	}
	4854	else {
	4855	/* literal with own CV */
	4856	assert(rexi->data->what[n] == 'L');
	4857	newcv = rex->qr_anoncv;
	4858	nop = (OP*)rexi->data->data[n];
	4859	}
	4860
	4861	/* normally if we're about to execute code from the same
	4862	* CV that we used previously, we just use the existing
	4863	* CX stack entry. However, its possible that in the
	4864	* meantime we may have backtracked, popped from the save
	4865	* stack, and undone the SAVECOMPPAD(s) associated with
	4866	* PUSH_MULTICALL; in which case PL_comppad no longer
	4867	* points to newcv's pad. */
	4868	if (newcv != last_pushed_cv \|\| PL_comppad != last_pad)
	4869	{
	4870	U8 flags = (CXp_SUB_RE \|
	4871	((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
	4872	if (last_pushed_cv) {
	4873	CHANGE_MULTICALL_FLAGS(newcv, flags);
	4874	}
	4875	else {
	4876	PUSH_MULTICALL_FLAGS(newcv, flags);
	4877	}
	4878	last_pushed_cv = newcv;
	4879	}
	4880	else {
	4881	/* these assignments are just to silence compiler
	4882	* warnings */
	4883	multicall_cop = NULL;
	4884	newsp = NULL;
	4885	}
	4886	last_pad = PL_comppad;
	4887
	4888	/* the initial nextstate you would normally execute
	4889	* at the start of an eval (which would cause error
	4890	* messages to come from the eval), may be optimised
	4891	* away from the execution path in the regex code blocks;
	4892	* so manually set PL_curcop to it initially */
	4893	{
	4894	OP *o = cUNOPx(nop)->op_first;
	4895	assert(o->op_type == OP_NULL);
	4896	if (o->op_targ == OP_SCOPE) {
	4897	o = cUNOPo->op_first;
	4898	}
	4899	else {
	4900	assert(o->op_targ == OP_LEAVE);
	4901	o = cUNOPo->op_first;
	4902	assert(o->op_type == OP_ENTER);
	4903	o = o->op_sibling;
	4904	}
	4905
	4906	if (o->op_type != OP_STUB) {
	4907	assert( o->op_type == OP_NEXTSTATE
	4908	\|\| o->op_type == OP_DBSTATE
	4909	\|\| (o->op_type == OP_NULL
	4910	&& ( o->op_targ == OP_NEXTSTATE
	4911	\|\| o->op_targ == OP_DBSTATE
	4912	)
	4913	)
	4914	);
	4915	PL_curcop = (COP*)o;
	4916	}
	4917	}
	4918	nop = nop->op_next;
	4919
	4920	DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
	4921	" re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
	4922
	4923	rex->offs[0].end = locinput - reginfo->strbeg;
	4924	if (reginfo->info_aux_eval->pos_magic)
	4925	reginfo->info_aux_eval->pos_magic->mg_len
	4926	= locinput - reginfo->strbeg;
	4927
	4928	if (sv_yes_mark) {
	4929	SV *sv_mrk = get_sv("REGMARK", 1);
	4930	sv_setsv(sv_mrk, sv_yes_mark);
	4931	}
	4932
	4933	/* we don't use MULTICALL here as we want to call the
	4934	* first op of the block of interest, rather than the
	4935	* first op of the sub */
	4936	before = (IV)(SP-PL_stack_base);
	4937	PL_op = nop;
	4938	CALLRUNOPS(aTHX); /* Scalar context. */
	4939	SPAGAIN;
	4940	if ((IV)(SP-PL_stack_base) == before)
	4941	ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
	4942	else {
	4943	ret = POPs;
	4944	PUTBACK;
	4945	}
	4946
	4947	/* before restoring everything, evaluate the returned
	4948	* value, so that 'uninit' warnings don't use the wrong
	4949	* PL_op or pad. Also need to process any magic vars
	4950	* (e.g. $1) before parentheses are restored */
	4951
	4952	PL_op = NULL;
	4953
	4954	re_sv = NULL;
	4955	if (logical == 0) /* (?{})/ */
	4956	sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
	4957	else if (logical == 1) { /* /(?(?{...})X\|Y)/ */
	4958	sw = cBOOL(SvTRUE(ret));
	4959	logical = 0;
	4960	}
	4961	else { /* /(??{}) */
	4962	/* if its overloaded, let the regex compiler handle
	4963	* it; otherwise extract regex, or stringify */
	4964	if (!SvAMAGIC(ret)) {
	4965	SV *sv = ret;
	4966	if (SvROK(sv))
	4967	sv = SvRV(sv);
	4968	if (SvTYPE(sv) == SVt_REGEXP)
	4969	re_sv = (REGEXP*) sv;
	4970	else if (SvSMAGICAL(sv)) {
	4971	MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
	4972	if (mg)
	4973	re_sv = (REGEXP *) mg->mg_obj;
	4974	}
	4975
	4976	/* force any magic, undef warnings here */
	4977	if (!re_sv) {
	4978	ret = sv_mortalcopy(ret);
	4979	(void) SvPV_force_nolen(ret);
	4980	}
	4981	}
	4982
	4983	}
	4984
	4985	/* *** Note that at this point we don't restore
	4986	* PL_comppad, (or pop the CxSUB) on the assumption it may
	4987	* be used again soon. This is safe as long as nothing
	4988	* in the regexp code uses the pad ! */
	4989	PL_op = oop;
	4990	PL_curcop = ocurcop;
	4991	S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
	4992	PL_curpm = PL_reg_curpm;
	4993
	4994	if (logical != 2)
	4995	break;
	4996	}
	4997
	4998	/* only /(??{})/ from now on */
	4999	logical = 0;
	5000	{
	5001	/* extract RE object from returned value; compiling if
	5002	* necessary */
	5003
	5004	if (re_sv) {
	5005	re_sv = reg_temp_copy(NULL, re_sv);
	5006	}
	5007	else {
	5008	U32 pm_flags = 0;
	5009
	5010	if (SvUTF8(ret) && IN_BYTES) {
	5011	/* In use 'bytes': make a copy of the octet
	5012	* sequence, but without the flag on */
	5013	STRLEN len;
	5014	const char *const p = SvPV(ret, len);
	5015	ret = newSVpvn_flags(p, len, SVs_TEMP);
	5016	}
	5017	if (rex->intflags & PREGf_USE_RE_EVAL)
	5018	pm_flags \|= PMf_USE_RE_EVAL;
	5019
	5020	/* if we got here, it should be an engine which
	5021	* supports compiling code blocks and stuff */
	5022	assert(rex->engine && rex->engine->op_comp);
	5023	assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
	5024	re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
	5025	rex->engine, NULL, NULL,
	5026	/* copy /msix etc to inner pattern */
	5027	scan->flags,
	5028	pm_flags);
	5029
	5030	if (!(SvFLAGS(ret)
	5031	& (SVs_TEMP \| SVs_PADTMP \| SVf_READONLY
	5032	\| SVs_GMG))) {
	5033	/* This isn't a first class regexp. Instead, it's
	5034	caching a regexp onto an existing, Perl visible
	5035	scalar. */
	5036	sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
	5037	}
	5038	/* safe to do now that any $1 etc has been
	5039	* interpolated into the new pattern string and
	5040	* compiled */
	5041	S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
	5042	}
	5043	SAVEFREESV(re_sv);
	5044	re = ReANY(re_sv);
	5045	}
	5046	RXp_MATCH_COPIED_off(re);
	5047	re->subbeg = rex->subbeg;
	5048	re->sublen = rex->sublen;
	5049	re->suboffset = rex->suboffset;
	5050	re->subcoffset = rex->subcoffset;
	5051	rei = RXi_GET(re);
	5052	DEBUG_EXECUTE_r(
	5053	debug_start_match(re_sv, utf8_target, locinput,
	5054	reginfo->strend, "Matching embedded");
	5055	);
	5056	startpoint = rei->program + 1;
	5057	ST.close_paren = 0; /* only used for GOSUB */
	5058
	5059	eval_recurse_doit: /* Share code with GOSUB below this line */
	5060	/* run the pattern returned from (??{...}) */
	5061
	5062	/* Save all the positions. */
	5063	ST.cp = regcppush(rex, 0, maxopenparen);
	5064	REGCP_SET(ST.lastcp);
	5065
	5066	re->lastparen = 0;
	5067	re->lastcloseparen = 0;
	5068
	5069	maxopenparen = 0;
	5070
	5071	/* invalidate the S-L poscache. We're now executing a
	5072	* different set of WHILEM ops (and their associated
	5073	* indexes) against the same string, so the bits in the
	5074	* cache are meaningless. Setting maxiter to zero forces
	5075	* the cache to be invalidated and zeroed before reuse.
	5076	* XXX This is too dramatic a measure. Ideally we should
	5077	* save the old cache and restore when running the outer
	5078	* pattern again */
	5079	reginfo->poscache_maxiter = 0;
	5080
	5081	is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
	5082
	5083	ST.prev_rex = rex_sv;
	5084	ST.prev_curlyx = cur_curlyx;
	5085	rex_sv = re_sv;
	5086	SET_reg_curpm(rex_sv);
	5087	rex = re;
	5088	rexi = rei;
	5089	cur_curlyx = NULL;
	5090	ST.B = next;
	5091	ST.prev_eval = cur_eval;
	5092	cur_eval = st;
	5093	/* now continue from first node in postoned RE */
	5094	PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
	5095	assert(0); /* NOTREACHED */
	5096	}
	5097
	5098	case EVAL_AB: /* cleanup after a successful (??{A})B */
	5099	/* note: this is called twice; first after popping B, then A */
	5100	rex_sv = ST.prev_rex;
	5101	is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
	5102	SET_reg_curpm(rex_sv);
	5103	rex = ReANY(rex_sv);
	5104	rexi = RXi_GET(rex);
	5105	regcpblow(ST.cp);
	5106	cur_eval = ST.prev_eval;
	5107	cur_curlyx = ST.prev_curlyx;
	5108
	5109	/* Invalidate cache. See "invalidate" comment above. */
	5110	reginfo->poscache_maxiter = 0;
	5111	if ( nochange_depth )
	5112	nochange_depth--;
	5113	sayYES;
	5114
	5115
	5116	case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
	5117	/* note: this is called twice; first after popping B, then A */
	5118	rex_sv = ST.prev_rex;
	5119	is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
	5120	SET_reg_curpm(rex_sv);
	5121	rex = ReANY(rex_sv);
	5122	rexi = RXi_GET(rex);
	5123
	5124	REGCP_UNWIND(ST.lastcp);
	5125	regcppop(rex, &maxopenparen);
	5126	cur_eval = ST.prev_eval;
	5127	cur_curlyx = ST.prev_curlyx;
	5128	/* Invalidate cache. See "invalidate" comment above. */
	5129	reginfo->poscache_maxiter = 0;
	5130	if ( nochange_depth )
	5131	nochange_depth--;
	5132	sayNO_SILENT;
	5133	#undef ST
	5134
	5135	case OPEN: /* ( */
	5136	n = ARG(scan); /* which paren pair */
	5137	rex->offs[n].start_tmp = locinput - reginfo->strbeg;
	5138	if (n > maxopenparen)
	5139	maxopenparen = n;
	5140	DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
	5141	"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
	5142	PTR2UV(rex),
	5143	PTR2UV(rex->offs),
	5144	(UV)n,
	5145	(IV)rex->offs[n].start_tmp,
	5146	(UV)maxopenparen
	5147	));
	5148	lastopen = n;
	5149	break;
	5150
	5151	/* XXX really need to log other places start/end are set too */
	5152	#define CLOSE_CAPTURE \
	5153	rex->offs[n].start = rex->offs[n].start_tmp; \
	5154	rex->offs[n].end = locinput - reginfo->strbeg; \
	5155	DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
	5156	"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
	5157	PTR2UV(rex), \
	5158	PTR2UV(rex->offs), \
	5159	(UV)n, \
	5160	(IV)rex->offs[n].start, \
	5161	(IV)rex->offs[n].end \
	5162	))
	5163
	5164	case CLOSE: /* ) */
	5165	n = ARG(scan); /* which paren pair */
	5166	CLOSE_CAPTURE;
	5167	if (n > rex->lastparen)
	5168	rex->lastparen = n;
	5169	rex->lastcloseparen = n;
	5170	if (cur_eval && cur_eval->u.eval.close_paren == n) {
	5171	goto fake_end;
	5172	}
	5173	break;
	5174
	5175	case ACCEPT: /* (ACCEPT) /
	5176	if (ARG(scan)){
	5177	regnode *cursor;
	5178	for (cursor=scan;
	5179	cursor && OP(cursor)!=END;
	5180	cursor=regnext(cursor))
	5181	{
	5182	if ( OP(cursor)==CLOSE ){
	5183	n = ARG(cursor);
	5184	if ( n <= lastopen ) {
	5185	CLOSE_CAPTURE;
	5186	if (n > rex->lastparen)
	5187	rex->lastparen = n;
	5188	rex->lastcloseparen = n;
	5189	if ( n == ARG(scan) \|\| (cur_eval &&
	5190	cur_eval->u.eval.close_paren == n))
	5191	break;
	5192	}
	5193	}
	5194	}
	5195	}
	5196	goto fake_end;
	5197	/NOTREACHED/
	5198
	5199	case GROUPP: /* (?(1)) */
	5200	n = ARG(scan); /* which paren pair */
	5201	sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
	5202	break;
	5203
	5204	case NGROUPP: /* (?(<name>)) */
	5205	/* reg_check_named_buff_matched returns 0 for no match */
	5206	sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
	5207	break;
	5208
	5209	case INSUBP: /* (?(R)) */
	5210	n = ARG(scan);
	5211	sw = (cur_eval && (!n \|\| cur_eval->u.eval.close_paren == n));
	5212	break;
	5213
	5214	case DEFINEP: /* (?(DEFINE)) */
	5215	sw = 0;
	5216	break;
	5217
	5218	case IFTHEN: /* (?(cond)A\|B) */
	5219	reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
	5220	if (sw)
	5221	next = NEXTOPER(NEXTOPER(scan));
	5222	else {
	5223	next = scan + ARG(scan);
	5224	if (OP(next) == IFTHEN) /* Fake one. */
	5225	next = NEXTOPER(NEXTOPER(next));
	5226	}
	5227	break;
	5228
	5229	case LOGICAL: /* modifier for EVAL and IFMATCH */
	5230	logical = scan->flags;
	5231	break;
	5232
	5233	/*******************************************************************
	5234
	5235	The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
	5236	pattern, where A and B are subpatterns. (For simple A, CURLYM or
	5237	STAR/PLUS/CURLY/CURLYN are used instead.)
	5238
	5239	A*B is compiled as <CURLYX><A><WHILEM><B>
	5240
	5241	On entry to the subpattern, CURLYX is called. This pushes a CURLYX
	5242	state, which contains the current count, initialised to -1. It also sets
	5243	cur_curlyx to point to this state, with any previous value saved in the
	5244	state block.
	5245
	5246	CURLYX then jumps straight to the WHILEM op, rather than executing A,
	5247	since the pattern may possibly match zero times (i.e. it's a while {} loop
	5248	rather than a do {} while loop).
	5249
	5250	Each entry to WHILEM represents a successful match of A. The count in the
	5251	CURLYX block is incremented, another WHILEM state is pushed, and execution
	5252	passes to A or B depending on greediness and the current count.
	5253
	5254	For example, if matching against the string a1a2a3b (where the aN are
	5255	substrings that match /A/), then the match progresses as follows: (the
	5256	pushed states are interspersed with the bits of strings matched so far):
	5257
	5258	<CURLYX cnt=-1>
	5259	<CURLYX cnt=0><WHILEM>
	5260	<CURLYX cnt=1><WHILEM> a1 <WHILEM>
	5261	<CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
	5262	<CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
	5263	<CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
	5264
	5265	(Contrast this with something like CURLYM, which maintains only a single
	5266	backtrack state:
	5267
	5268	<CURLYM cnt=0> a1
	5269	a1 <CURLYM cnt=1> a2
	5270	a1 a2 <CURLYM cnt=2> a3
	5271	a1 a2 a3 <CURLYM cnt=3> b
	5272	)
	5273
	5274	Each WHILEM state block marks a point to backtrack to upon partial failure
	5275	of A or B, and also contains some minor state data related to that
	5276	iteration. The CURLYX block, pointed to by cur_curlyx, contains the
	5277	overall state, such as the count, and pointers to the A and B ops.
	5278
	5279	This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
	5280	must always point to the current CURLYX block, the rules are:
	5281
	5282	When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
	5283	and set cur_curlyx to point the new block.
	5284
	5285	When popping the CURLYX block after a successful or unsuccessful match,
	5286	restore the previous cur_curlyx.
	5287
	5288	When WHILEM is about to execute B, save the current cur_curlyx, and set it
	5289	to the outer one saved in the CURLYX block.
	5290
	5291	When popping the WHILEM block after a successful or unsuccessful B match,
	5292	restore the previous cur_curlyx.
	5293
	5294	Here's an example for the pattern (AI* BI)*BO
	5295	I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
	5296
	5297	cur_
	5298	curlyx backtrack stack
	5299	------ ---------------
	5300	NULL
	5301	CO <CO prev=NULL> <WO>
	5302	CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
	5303	CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
	5304	NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
	5305
	5306	At this point the pattern succeeds, and we work back down the stack to
	5307	clean up, restoring as we go:
	5308
	5309	CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
	5310	CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
	5311	CO <CO prev=NULL> <WO>
	5312	NULL
	5313
	5314	*******************************************************************/
	5315
	5316	#define ST st->u.curlyx
	5317
	5318	case CURLYX: /* start of /AB/ (for complex A) /
	5319	{
	5320	/* No need to save/restore up to this paren */
	5321	I32 parenfloor = scan->flags;
	5322
	5323	assert(next); /* keep Coverity happy */
	5324	if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
	5325	next += ARG(next);
	5326
	5327	/* XXXX Probably it is better to teach regpush to support
	5328	parenfloor > maxopenparen ... */
	5329	if (parenfloor > (I32)rex->lastparen)
	5330	parenfloor = rex->lastparen; /* Pessimization... */
	5331
	5332	ST.prev_curlyx= cur_curlyx;
	5333	cur_curlyx = st;
	5334	ST.cp = PL_savestack_ix;
	5335
	5336	/* these fields contain the state of the current curly.
	5337	* they are accessed by subsequent WHILEMs */
	5338	ST.parenfloor = parenfloor;
	5339	ST.me = scan;
	5340	ST.B = next;
	5341	ST.minmod = minmod;
	5342	minmod = 0;
	5343	ST.count = -1; /* this will be updated by WHILEM */
	5344	ST.lastloc = NULL; /* this will be updated by WHILEM */
	5345
	5346	PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
	5347	assert(0); /* NOTREACHED */
	5348	}
	5349
	5350	case CURLYX_end: /* just finished matching all of AB /
	5351	cur_curlyx = ST.prev_curlyx;
	5352	sayYES;
	5353	assert(0); /* NOTREACHED */
	5354
	5355	case CURLYX_end_fail: /* just failed to match all of AB /
	5356	regcpblow(ST.cp);
	5357	cur_curlyx = ST.prev_curlyx;
	5358	sayNO;
	5359	assert(0); /* NOTREACHED */
	5360
	5361
	5362	#undef ST
	5363	#define ST st->u.whilem
	5364
	5365	case WHILEM: /* just matched an A in /AB/ (for complex A) /
	5366	{
	5367	/* see the discussion above about CURLYX/WHILEM */
	5368	I32 n;
	5369	int min = ARG1(cur_curlyx->u.curlyx.me);
	5370	int max = ARG2(cur_curlyx->u.curlyx.me);
	5371	regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
	5372
	5373	assert(cur_curlyx); /* keep Coverity happy */
	5374	n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
	5375	ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
	5376	ST.cache_offset = 0;
	5377	ST.cache_mask = 0;
	5378
	5379
	5380	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	5381	"%*s whilem: matched %ld out of %d..%d\n",
	5382	REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
	5383	);
	5384
	5385	/* First just match a string of min A's. */
	5386
	5387	if (n < min) {
	5388	ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
	5389	maxopenparen);
	5390	cur_curlyx->u.curlyx.lastloc = locinput;
	5391	REGCP_SET(ST.lastcp);
	5392
	5393	PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
	5394	assert(0); /* NOTREACHED */
	5395	}
	5396
	5397	/* If degenerate A matches "", assume A done. */
	5398
	5399	if (locinput == cur_curlyx->u.curlyx.lastloc) {
	5400	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	5401	"%*s whilem: empty match detected, trying continuation...\n",
	5402	REPORT_CODE_OFF+depth*2, "")
	5403	);
	5404	goto do_whilem_B_max;
	5405	}
	5406
	5407	/* super-linear cache processing.
	5408	*
	5409	* The idea here is that for certain types of CURLYX/WHILEM -
	5410	* principally those whose upper bound is infinity (and
	5411	* excluding regexes that have things like \1 and other very
	5412	* non-regular expresssiony things), then if a pattern like
	5413	* /....A*.../ fails and we backtrack to the WHILEM, then we
	5414	* make a note that this particular WHILEM op was at string
	5415	* position 47 (say) when the rest of pattern failed. Then, if
	5416	* we ever find ourselves back at that WHILEM, and at string
	5417	* position 47 again, we can just fail immediately rather than
	5418	* running the rest of the pattern again.
	5419	*
	5420	* This is very handy when patterns start to go
	5421	* 'super-linear', like in (a+)(a+)(a+)*, where you end up
	5422	* with a combinatorial explosion of backtracking.
	5423	*
	5424	* The cache is implemented as a bit array, with one bit per
	5425	* string byte position per WHILEM op (up to 16) - so its
	5426	* between 0.25 and 2x the string size.
	5427	*
	5428	* To avoid allocating a poscache buffer every time, we do an
	5429	* initially countdown; only after we have executed a WHILEM
	5430	* op (string-length x #WHILEMs) times do we allocate the
	5431	* cache.
	5432	*
	5433	* The top 4 bits of scan->flags byte say how many different
	5434	* relevant CURLLYX/WHILEM op pairs there are, while the
	5435	* bottom 4-bits is the identifying index number of this
	5436	* WHILEM.
	5437	*/
	5438
	5439	if (scan->flags) {
	5440
	5441	if (!reginfo->poscache_maxiter) {
	5442	/* start the countdown: Postpone detection until we
	5443	* know the match is not that much linear. */
	5444	reginfo->poscache_maxiter
	5445	= (reginfo->strend - reginfo->strbeg + 1)
	5446	* (scan->flags>>4);
	5447	/* possible overflow for long strings and many CURLYX's */
	5448	if (reginfo->poscache_maxiter < 0)
	5449	reginfo->poscache_maxiter = I32_MAX;
	5450	reginfo->poscache_iter = reginfo->poscache_maxiter;
	5451	}
	5452
	5453	if (reginfo->poscache_iter-- == 0) {
	5454	/* initialise cache */
	5455	const I32 size = (reginfo->poscache_maxiter + 7)/8;
	5456	regmatch_info_aux *const aux = reginfo->info_aux;
	5457	if (aux->poscache) {
	5458	if ((I32)reginfo->poscache_size < size) {
	5459	Renew(aux->poscache, size, char);
	5460	reginfo->poscache_size = size;
	5461	}
	5462	Zero(aux->poscache, size, char);
	5463	}
	5464	else {
	5465	reginfo->poscache_size = size;
	5466	Newxz(aux->poscache, size, char);
	5467	}
	5468	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	5469	"%swhilem: Detected a super-linear match, switching on caching%s...\n",
	5470	PL_colors[4], PL_colors[5])
	5471	);
	5472	}
	5473
	5474	if (reginfo->poscache_iter < 0) {
	5475	/* have we already failed at this position? */
	5476	I32 offset, mask;
	5477
	5478	reginfo->poscache_iter = -1; /* stop eventual underflow */
	5479	offset = (scan->flags & 0xf) - 1
	5480	+ (locinput - reginfo->strbeg)
	5481	* (scan->flags>>4);
	5482	mask = 1 << (offset % 8);
	5483	offset /= 8;
	5484	if (reginfo->info_aux->poscache[offset] & mask) {
	5485	DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
	5486	"%*s whilem: (cache) already tried at this position...\n",
	5487	REPORT_CODE_OFF+depth*2, "")
	5488	);
	5489	sayNO; /* cache records failure */
	5490	}
	5491	ST.cache_offset = offset;
	5492	ST.cache_mask = mask;
	5493	}
	5494	}
	5495
	5496	/* Prefer B over A for minimal matching. */
	5497
	5498	if (cur_curlyx->u.curlyx.minmod) {
	5499	ST.save_curlyx = cur_curlyx;
	5500	cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
	5501	ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
	5502	maxopenparen);
	5503	REGCP_SET(ST.lastcp);
	5504	PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
	5505	locinput);
	5506	assert(0); /* NOTREACHED */
	5507	}
	5508
	5509	/* Prefer A over B for maximal matching. */
	5510
	5511	if (n < max) { /* More greed allowed? */
	5512	ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
	5513	maxopenparen);
	5514	cur_curlyx->u.curlyx.lastloc = locinput;
	5515	REGCP_SET(ST.lastcp);
	5516	PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
	5517	assert(0); /* NOTREACHED */
	5518	}
	5519	goto do_whilem_B_max;
	5520	}
	5521	assert(0); /* NOTREACHED */
	5522
	5523	case WHILEM_B_min: /* just matched B in a minimal match */
	5524	case WHILEM_B_max: /* just matched B in a maximal match */
	5525	cur_curlyx = ST.save_curlyx;
	5526	sayYES;
	5527	assert(0); /* NOTREACHED */
	5528
	5529	case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
	5530	cur_curlyx = ST.save_curlyx;
	5531	cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
	5532	cur_curlyx->u.curlyx.count--;
	5533	CACHEsayNO;
	5534	assert(0); /* NOTREACHED */
	5535
	5536	case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
	5537	/* FALL THROUGH */
	5538	case WHILEM_A_pre_fail: /* just failed to match even minimal A */
	5539	REGCP_UNWIND(ST.lastcp);
	5540	regcppop(rex, &maxopenparen);
	5541	cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
	5542	cur_curlyx->u.curlyx.count--;
	5543	CACHEsayNO;
	5544	assert(0); /* NOTREACHED */
	5545
	5546	case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
	5547	REGCP_UNWIND(ST.lastcp);
	5548	regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
	5549	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	5550	"%*s whilem: failed, trying continuation...\n",
	5551	REPORT_CODE_OFF+depth*2, "")
	5552	);
	5553	do_whilem_B_max:
	5554	if (cur_curlyx->u.curlyx.count >= REG_INFTY
	5555	&& ckWARN(WARN_REGEXP)
	5556	&& !reginfo->warned)
	5557	{
	5558	reginfo->warned = TRUE;
	5559	Perl_warner(aTHX_ packWARN(WARN_REGEXP),
	5560	"Complex regular subexpression recursion limit (%d) "
	5561	"exceeded",
	5562	REG_INFTY - 1);
	5563	}
	5564
	5565	/* now try B */
	5566	ST.save_curlyx = cur_curlyx;
	5567	cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
	5568	PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
	5569	locinput);
	5570	assert(0); /* NOTREACHED */
	5571
	5572	case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
	5573	cur_curlyx = ST.save_curlyx;
	5574	REGCP_UNWIND(ST.lastcp);
	5575	regcppop(rex, &maxopenparen);
	5576
	5577	if (cur_curlyx->u.curlyx.count >= /max/ARG2(cur_curlyx->u.curlyx.me)) {
	5578	/* Maximum greed exceeded */
	5579	if (cur_curlyx->u.curlyx.count >= REG_INFTY
	5580	&& ckWARN(WARN_REGEXP)
	5581	&& !reginfo->warned)
	5582	{
	5583	reginfo->warned = TRUE;
	5584	Perl_warner(aTHX_ packWARN(WARN_REGEXP),
	5585	"Complex regular subexpression recursion "
	5586	"limit (%d) exceeded",
	5587	REG_INFTY - 1);
	5588	}
	5589	cur_curlyx->u.curlyx.count--;
	5590	CACHEsayNO;
	5591	}
	5592
	5593	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	5594	"%s trying longer...\n", REPORT_CODE_OFF+depth2, "")
	5595	);
	5596	/* Try grabbing another A and see if it helps. */
	5597	cur_curlyx->u.curlyx.lastloc = locinput;
	5598	ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
	5599	maxopenparen);
	5600	REGCP_SET(ST.lastcp);
	5601	PUSH_STATE_GOTO(WHILEM_A_min,
	5602	/A/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
	5603	locinput);
	5604	assert(0); /* NOTREACHED */
	5605
	5606	#undef ST
	5607	#define ST st->u.branch
	5608
	5609	case BRANCHJ: /* /(...\|A\|...)/ with long next pointer */
	5610	next = scan + ARG(scan);
	5611	if (next == scan)
	5612	next = NULL;
	5613	scan = NEXTOPER(scan);
	5614	/* FALL THROUGH */
	5615
	5616	case BRANCH: /* /(...\|A\|...)/ */
	5617	scan = NEXTOPER(scan); /* scan now points to inner node */
	5618	ST.lastparen = rex->lastparen;
	5619	ST.lastcloseparen = rex->lastcloseparen;
	5620	ST.next_branch = next;
	5621	REGCP_SET(ST.cp);
	5622
	5623	/* Now go into the branch */
	5624	if (has_cutgroup) {
	5625	PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
	5626	} else {
	5627	PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
	5628	}
	5629	assert(0); /* NOTREACHED */
	5630
	5631	case CUTGROUP: /* /(THEN)/ /
	5632	sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
	5633	MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
	5634	PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
	5635	assert(0); /* NOTREACHED */
	5636
	5637	case CUTGROUP_next_fail:
	5638	do_cutgroup = 1;
	5639	no_final = 1;
	5640	if (st->u.mark.mark_name)
	5641	sv_commit = st->u.mark.mark_name;
	5642	sayNO;
	5643	assert(0); /* NOTREACHED */
	5644
	5645	case BRANCH_next:
	5646	sayYES;
	5647	assert(0); /* NOTREACHED */
	5648
	5649	case BRANCH_next_fail: /* that branch failed; try the next, if any */
	5650	if (do_cutgroup) {
	5651	do_cutgroup = 0;
	5652	no_final = 0;
	5653	}
	5654	REGCP_UNWIND(ST.cp);
	5655	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	5656	scan = ST.next_branch;
	5657	/* no more branches? */
	5658	if (!scan \|\| (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
	5659	DEBUG_EXECUTE_r({
	5660	PerlIO_printf( Perl_debug_log,
	5661	"%*s %sBRANCH failed...%s\n",
	5662	REPORT_CODE_OFF+depth*2, "",
	5663	PL_colors[4],
	5664	PL_colors[5] );
	5665	});
	5666	sayNO_SILENT;
	5667	}
	5668	continue; /* execute next BRANCH[J] op */
	5669	assert(0); /* NOTREACHED */
	5670
	5671	case MINMOD: /* next op will be non-greedy, e.g. A? /
	5672	minmod = 1;
	5673	break;
	5674
	5675	#undef ST
	5676	#define ST st->u.curlym
	5677
	5678	case CURLYM: /* /A{m,n}B/ where A is fixed-length */
	5679
	5680	/* This is an optimisation of CURLYX that enables us to push
	5681	* only a single backtracking state, no matter how many matches
	5682	* there are in {m,n}. It relies on the pattern being constant
	5683	* length, with no parens to influence future backrefs
	5684	*/
	5685
	5686	ST.me = scan;
	5687	scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
	5688
	5689	ST.lastparen = rex->lastparen;
	5690	ST.lastcloseparen = rex->lastcloseparen;
	5691
	5692	/* if paren positive, emulate an OPEN/CLOSE around A */
	5693	if (ST.me->flags) {
	5694	U32 paren = ST.me->flags;
	5695	if (paren > maxopenparen)
	5696	maxopenparen = paren;
	5697	scan += NEXT_OFF(scan); /* Skip former OPEN. */
	5698	}
	5699	ST.A = scan;
	5700	ST.B = next;
	5701	ST.alen = 0;
	5702	ST.count = 0;
	5703	ST.minmod = minmod;
	5704	minmod = 0;
	5705	ST.c1 = CHRTEST_UNINIT;
	5706	REGCP_SET(ST.cp);
	5707
	5708	if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
	5709	goto curlym_do_B;
	5710
	5711	curlym_do_A: /* execute the A in /A{m,n}B/ */
	5712	PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
	5713	assert(0); /* NOTREACHED */
	5714
	5715	case CURLYM_A: /* we've just matched an A */
	5716	ST.count++;
	5717	/* after first match, determine A's length: u.curlym.alen */
	5718	if (ST.count == 1) {
	5719	if (reginfo->is_utf8_target) {
	5720	char *s = st->locinput;
	5721	while (s < locinput) {
	5722	ST.alen++;
	5723	s += UTF8SKIP(s);
	5724	}
	5725	}
	5726	else {
	5727	ST.alen = locinput - st->locinput;
	5728	}
	5729	if (ST.alen == 0)
	5730	ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
	5731	}
	5732	DEBUG_EXECUTE_r(
	5733	PerlIO_printf(Perl_debug_log,
	5734	"%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
	5735	(int)(REPORT_CODE_OFF+(depth*2)), "",
	5736	(IV) ST.count, (IV)ST.alen)
	5737	);
	5738
	5739	if (cur_eval && cur_eval->u.eval.close_paren &&
	5740	cur_eval->u.eval.close_paren == (U32)ST.me->flags)
	5741	goto fake_end;
	5742
	5743	{
	5744	I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
	5745	if ( max == REG_INFTY \|\| ST.count < max )
	5746	goto curlym_do_A; /* try to match another A */
	5747	}
	5748	goto curlym_do_B; /* try to match B */
	5749
	5750	case CURLYM_A_fail: /* just failed to match an A */
	5751	REGCP_UNWIND(ST.cp);
	5752
	5753	if (ST.minmod \|\| ST.count < ARG1(ST.me) /* min*/
	5754	\|\| (cur_eval && cur_eval->u.eval.close_paren &&
	5755	cur_eval->u.eval.close_paren == (U32)ST.me->flags))
	5756	sayNO;
	5757
	5758	curlym_do_B: /* execute the B in /A{m,n}B/ */
	5759	if (ST.c1 == CHRTEST_UNINIT) {
	5760	/* calculate c1 and c2 for possible match of 1st char
	5761	* following curly */
	5762	ST.c1 = ST.c2 = CHRTEST_VOID;
	5763	if (HAS_TEXT(ST.B) \|\| JUMPABLE(ST.B)) {
	5764	regnode *text_node = ST.B;
	5765	if (! HAS_TEXT(text_node))
	5766	FIND_NEXT_IMPT(text_node);
	5767	/* this used to be
	5768
	5769	(HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
	5770
	5771	But the former is redundant in light of the latter.
	5772
	5773	if this changes back then the macro for
	5774	IS_TEXT and friends need to change.
	5775	*/
	5776	if (PL_regkind[OP(text_node)] == EXACT) {
	5777	if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
	5778	text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
	5779	reginfo))
	5780	{
	5781	sayNO;
	5782	}
	5783	}
	5784	}
	5785	}
	5786
	5787	DEBUG_EXECUTE_r(
	5788	PerlIO_printf(Perl_debug_log,
	5789	"%*s CURLYM trying tail with matches=%"IVdf"...\n",
	5790	(int)(REPORT_CODE_OFF+(depth*2)),
	5791	"", (IV)ST.count)
	5792	);
	5793	if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
	5794	if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
	5795	if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
	5796	&& memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
	5797	{
	5798	/* simulate B failing */
	5799	DEBUG_OPTIMISE_r(
	5800	PerlIO_printf(Perl_debug_log,
	5801	"%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
	5802	(int)(REPORT_CODE_OFF+(depth*2)),"",
	5803	valid_utf8_to_uvchr((U8 *) locinput, NULL),
	5804	valid_utf8_to_uvchr(ST.c1_utf8, NULL),
	5805	valid_utf8_to_uvchr(ST.c2_utf8, NULL))
	5806	);
	5807	state_num = CURLYM_B_fail;
	5808	goto reenter_switch;
	5809	}
	5810	}
	5811	else if (nextchr != ST.c1 && nextchr != ST.c2) {
	5812	/* simulate B failing */
	5813	DEBUG_OPTIMISE_r(
	5814	PerlIO_printf(Perl_debug_log,
	5815	"%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
	5816	(int)(REPORT_CODE_OFF+(depth*2)),"",
	5817	(int) nextchr, ST.c1, ST.c2)
	5818	);
	5819	state_num = CURLYM_B_fail;
	5820	goto reenter_switch;
	5821	}
	5822	}
	5823
	5824	if (ST.me->flags) {
	5825	/* emulate CLOSE: mark current A as captured */
	5826	I32 paren = ST.me->flags;
	5827	if (ST.count) {
	5828	rex->offs[paren].start
	5829	= HOPc(locinput, -ST.alen) - reginfo->strbeg;
	5830	rex->offs[paren].end = locinput - reginfo->strbeg;
	5831	if ((U32)paren > rex->lastparen)
	5832	rex->lastparen = paren;
	5833	rex->lastcloseparen = paren;
	5834	}
	5835	else
	5836	rex->offs[paren].end = -1;
	5837	if (cur_eval && cur_eval->u.eval.close_paren &&
	5838	cur_eval->u.eval.close_paren == (U32)ST.me->flags)
	5839	{
	5840	if (ST.count)
	5841	goto fake_end;
	5842	else
	5843	sayNO;
	5844	}
	5845	}
	5846
	5847	PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
	5848	assert(0); /* NOTREACHED */
	5849
	5850	case CURLYM_B_fail: /* just failed to match a B */
	5851	REGCP_UNWIND(ST.cp);
	5852	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	5853	if (ST.minmod) {
	5854	I32 max = ARG2(ST.me);
	5855	if (max != REG_INFTY && ST.count == max)
	5856	sayNO;
	5857	goto curlym_do_A; /* try to match a further A */
	5858	}
	5859	/* backtrack one A */
	5860	if (ST.count == ARG1(ST.me) /* min */)
	5861	sayNO;
	5862	ST.count--;
	5863	SET_locinput(HOPc(locinput, -ST.alen));
	5864	goto curlym_do_B; /* try to match B */
	5865
	5866	#undef ST
	5867	#define ST st->u.curly
	5868
	5869	#define CURLY_SETPAREN(paren, success) \
	5870	if (paren) { \
	5871	if (success) { \
	5872	rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
	5873	rex->offs[paren].end = locinput - reginfo->strbeg; \
	5874	if (paren > rex->lastparen) \
	5875	rex->lastparen = paren; \
	5876	rex->lastcloseparen = paren; \
	5877	} \
	5878	else { \
	5879	rex->offs[paren].end = -1; \
	5880	rex->lastparen = ST.lastparen; \
	5881	rex->lastcloseparen = ST.lastcloseparen; \
	5882	} \
	5883	}
	5884
	5885	case STAR: /* /AB/ where A is width 1 char /
	5886	ST.paren = 0;
	5887	ST.min = 0;
	5888	ST.max = REG_INFTY;
	5889	scan = NEXTOPER(scan);
	5890	goto repeat;
	5891
	5892	case PLUS: /* /A+B/ where A is width 1 char */
	5893	ST.paren = 0;
	5894	ST.min = 1;
	5895	ST.max = REG_INFTY;
	5896	scan = NEXTOPER(scan);
	5897	goto repeat;
	5898
	5899	case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
	5900	ST.paren = scan->flags; /* Which paren to set */
	5901	ST.lastparen = rex->lastparen;
	5902	ST.lastcloseparen = rex->lastcloseparen;
	5903	if (ST.paren > maxopenparen)
	5904	maxopenparen = ST.paren;
	5905	ST.min = ARG1(scan); /* min to match */
	5906	ST.max = ARG2(scan); /* max to match */
	5907	if (cur_eval && cur_eval->u.eval.close_paren &&
	5908	cur_eval->u.eval.close_paren == (U32)ST.paren) {
	5909	ST.min=1;
	5910	ST.max=1;
	5911	}
	5912	scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
	5913	goto repeat;
	5914
	5915	case CURLY: /* /A{m,n}B/ where A is width 1 char */
	5916	ST.paren = 0;
	5917	ST.min = ARG1(scan); /* min to match */
	5918	ST.max = ARG2(scan); /* max to match */
	5919	scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
	5920	repeat:
	5921	/*
	5922	* Lookahead to avoid useless match attempts
	5923	* when we know what character comes next.
	5924	*
	5925	* Used to only do .x and .?x, but now it allows
	5926	* for )'s, ('s and (?{ ... })'s to be in the way
	5927	* of the quantifier and the EXACT-like node. -- japhy
	5928	*/
	5929
	5930	assert(ST.min <= ST.max);
	5931	if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
	5932	ST.c1 = ST.c2 = CHRTEST_VOID;
	5933	}
	5934	else {
	5935	regnode *text_node = next;
	5936
	5937	if (! HAS_TEXT(text_node))
	5938	FIND_NEXT_IMPT(text_node);
	5939
	5940	if (! HAS_TEXT(text_node))
	5941	ST.c1 = ST.c2 = CHRTEST_VOID;
	5942	else {
	5943	if ( PL_regkind[OP(text_node)] != EXACT ) {
	5944	ST.c1 = ST.c2 = CHRTEST_VOID;
	5945	}
	5946	else {
	5947
	5948	/* Currently we only get here when
	5949
	5950	PL_rekind[OP(text_node)] == EXACT
	5951
	5952	if this changes back then the macro for IS_TEXT and
	5953	friends need to change. */
	5954	if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
	5955	text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
	5956	reginfo))
	5957	{
	5958	sayNO;
	5959	}
	5960	}
	5961	}
	5962	}
	5963
	5964	ST.A = scan;
	5965	ST.B = next;
	5966	if (minmod) {
	5967	char *li = locinput;
	5968	minmod = 0;
	5969	if (ST.min &&
	5970	regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
	5971	< ST.min)
	5972	sayNO;
	5973	SET_locinput(li);
	5974	ST.count = ST.min;
	5975	REGCP_SET(ST.cp);
	5976	if (ST.c1 == CHRTEST_VOID)
	5977	goto curly_try_B_min;
	5978
	5979	ST.oldloc = locinput;
	5980
	5981	/* set ST.maxpos to the furthest point along the
	5982	* string that could possibly match */
	5983	if (ST.max == REG_INFTY) {
	5984	ST.maxpos = reginfo->strend - 1;
	5985	if (utf8_target)
	5986	while (UTF8_IS_CONTINUATION((U8)ST.maxpos))
	5987	ST.maxpos--;
	5988	}
	5989	else if (utf8_target) {
	5990	int m = ST.max - ST.min;
	5991	for (ST.maxpos = locinput;
	5992	m >0 && ST.maxpos < reginfo->strend; m--)
	5993	ST.maxpos += UTF8SKIP(ST.maxpos);
	5994	}
	5995	else {
	5996	ST.maxpos = locinput + ST.max - ST.min;
	5997	if (ST.maxpos >= reginfo->strend)
	5998	ST.maxpos = reginfo->strend - 1;
	5999	}
	6000	goto curly_try_B_min_known;
	6001
	6002	}
	6003	else {
	6004	/* avoid taking address of locinput, so it can remain
	6005	* a register var */
	6006	char *li = locinput;
	6007	ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
	6008	if (ST.count < ST.min)
	6009	sayNO;
	6010	SET_locinput(li);
	6011	if ((ST.count > ST.min)
	6012	&& (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
	6013	{
	6014	/* A{m,n} must come at the end of the string, there's
	6015	* no point in backing off ... */
	6016	ST.min = ST.count;
	6017	/* ...except that $ and \Z can match before and after
	6018	newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
	6019	We may back off by one in this case. */
	6020	if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
	6021	ST.min--;
	6022	}
	6023	REGCP_SET(ST.cp);
	6024	goto curly_try_B_max;
	6025	}
	6026	assert(0); /* NOTREACHED */
	6027
	6028
	6029	case CURLY_B_min_known_fail:
	6030	/* failed to find B in a non-greedy match where c1,c2 valid */
	6031
	6032	REGCP_UNWIND(ST.cp);
	6033	if (ST.paren) {
	6034	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	6035	}
	6036	/* Couldn't or didn't -- move forward. */
	6037	ST.oldloc = locinput;
	6038	if (utf8_target)
	6039	locinput += UTF8SKIP(locinput);
	6040	else
	6041	locinput++;
	6042	ST.count++;
	6043	curly_try_B_min_known:
	6044	/* find the next place where 'B' could work, then call B */
	6045	{
	6046	int n;
	6047	if (utf8_target) {
	6048	n = (ST.oldloc == locinput) ? 0 : 1;
	6049	if (ST.c1 == ST.c2) {
	6050	/* set n to utf8_distance(oldloc, locinput) */
	6051	while (locinput <= ST.maxpos
	6052	&& memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
	6053	{
	6054	locinput += UTF8SKIP(locinput);
	6055	n++;
	6056	}
	6057	}
	6058	else {
	6059	/* set n to utf8_distance(oldloc, locinput) */
	6060	while (locinput <= ST.maxpos
	6061	&& memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
	6062	&& memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
	6063	{
	6064	locinput += UTF8SKIP(locinput);
	6065	n++;
	6066	}
	6067	}
	6068	}
	6069	else { /* Not utf8_target */
	6070	if (ST.c1 == ST.c2) {
	6071	while (locinput <= ST.maxpos &&
	6072	UCHARAT(locinput) != ST.c1)
	6073	locinput++;
	6074	}
	6075	else {
	6076	while (locinput <= ST.maxpos
	6077	&& UCHARAT(locinput) != ST.c1
	6078	&& UCHARAT(locinput) != ST.c2)
	6079	locinput++;
	6080	}
	6081	n = locinput - ST.oldloc;
	6082	}
	6083	if (locinput > ST.maxpos)
	6084	sayNO;
	6085	if (n) {
	6086	/* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
	6087	* at b; check that everything between oldloc and
	6088	* locinput matches */
	6089	char *li = ST.oldloc;
	6090	ST.count += n;
	6091	if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
	6092	sayNO;
	6093	assert(n == REG_INFTY \|\| locinput == li);
	6094	}
	6095	CURLY_SETPAREN(ST.paren, ST.count);
	6096	if (cur_eval && cur_eval->u.eval.close_paren &&
	6097	cur_eval->u.eval.close_paren == (U32)ST.paren) {
	6098	goto fake_end;
	6099	}
	6100	PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
	6101	}
	6102	assert(0); /* NOTREACHED */
	6103
	6104
	6105	case CURLY_B_min_fail:
	6106	/* failed to find B in a non-greedy match where c1,c2 invalid */
	6107
	6108	REGCP_UNWIND(ST.cp);
	6109	if (ST.paren) {
	6110	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	6111	}
	6112	/* failed -- move forward one */
	6113	{
	6114	char *li = locinput;
	6115	if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
	6116	sayNO;
	6117	}
	6118	locinput = li;
	6119	}
	6120	{
	6121	ST.count++;
	6122	if (ST.count <= ST.max \|\| (ST.max == REG_INFTY &&
	6123	ST.count > 0)) /* count overflow ? */
	6124	{
	6125	curly_try_B_min:
	6126	CURLY_SETPAREN(ST.paren, ST.count);
	6127	if (cur_eval && cur_eval->u.eval.close_paren &&
	6128	cur_eval->u.eval.close_paren == (U32)ST.paren) {
	6129	goto fake_end;
	6130	}
	6131	PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
	6132	}
	6133	}
	6134	sayNO;
	6135	assert(0); /* NOTREACHED */
	6136
	6137
	6138	curly_try_B_max:
	6139	/* a successful greedy match: now try to match B */
	6140	if (cur_eval && cur_eval->u.eval.close_paren &&
	6141	cur_eval->u.eval.close_paren == (U32)ST.paren) {
	6142	goto fake_end;
	6143	}
	6144	{
	6145	bool could_match = locinput < reginfo->strend;
	6146
	6147	/* If it could work, try it. */
	6148	if (ST.c1 != CHRTEST_VOID && could_match) {
	6149	if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
	6150	{
	6151	could_match = memEQ(locinput,
	6152	ST.c1_utf8,
	6153	UTF8SKIP(locinput))
	6154	\|\| memEQ(locinput,
	6155	ST.c2_utf8,
	6156	UTF8SKIP(locinput));
	6157	}
	6158	else {
	6159	could_match = UCHARAT(locinput) == ST.c1
	6160	\|\| UCHARAT(locinput) == ST.c2;
	6161	}
	6162	}
	6163	if (ST.c1 == CHRTEST_VOID \|\| could_match) {
	6164	CURLY_SETPAREN(ST.paren, ST.count);
	6165	PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
	6166	assert(0); /* NOTREACHED */
	6167	}
	6168	}
	6169	/* FALL THROUGH */
	6170
	6171	case CURLY_B_max_fail:
	6172	/* failed to find B in a greedy match */
	6173
	6174	REGCP_UNWIND(ST.cp);
	6175	if (ST.paren) {
	6176	UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
	6177	}
	6178	/* back up. */
	6179	if (--ST.count < ST.min)
	6180	sayNO;
	6181	locinput = HOPc(locinput, -1);
	6182	goto curly_try_B_max;
	6183
	6184	#undef ST
	6185
	6186	case END: /* last op of main pattern */
	6187	fake_end:
	6188	if (cur_eval) {
	6189	/* we've just finished A in /(??{A})B/; now continue with B */
	6190
	6191	st->u.eval.prev_rex = rex_sv; /* inner */
	6192
	6193	/* Save all the positions. */
	6194	st->u.eval.cp = regcppush(rex, 0, maxopenparen);
	6195	rex_sv = cur_eval->u.eval.prev_rex;
	6196	is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
	6197	SET_reg_curpm(rex_sv);
	6198	rex = ReANY(rex_sv);
	6199	rexi = RXi_GET(rex);
	6200	cur_curlyx = cur_eval->u.eval.prev_curlyx;
	6201
	6202	REGCP_SET(st->u.eval.lastcp);
	6203
	6204	/* Restore parens of the outer rex without popping the
	6205	* savestack */
	6206	S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
	6207	&maxopenparen);
	6208
	6209	st->u.eval.prev_eval = cur_eval;
	6210	cur_eval = cur_eval->u.eval.prev_eval;
	6211	DEBUG_EXECUTE_r(
	6212	PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
	6213	REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
	6214	if ( nochange_depth )
	6215	nochange_depth--;
	6216
	6217	PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
	6218	locinput); /* match B */
	6219	}
	6220
	6221	if (locinput < reginfo->till) {
	6222	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
	6223	"%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
	6224	PL_colors[4],
	6225	(long)(locinput - startpos),
	6226	(long)(reginfo->till - startpos),
	6227	PL_colors[5]));
	6228
	6229	sayNO_SILENT; /* Cannot match: too short. */
	6230	}
	6231	sayYES; /* Success! */
	6232
	6233	case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
	6234	DEBUG_EXECUTE_r(
	6235	PerlIO_printf(Perl_debug_log,
	6236	"%*s %ssubpattern success...%s\n",
	6237	REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
	6238	sayYES; /* Success! */
	6239
	6240	#undef ST
	6241	#define ST st->u.ifmatch
	6242
	6243	{
	6244	char *newstart;
	6245
	6246	case SUSPEND: /* (?>A) */
	6247	ST.wanted = 1;
	6248	newstart = locinput;
	6249	goto do_ifmatch;
	6250
	6251	case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
	6252	ST.wanted = 0;
	6253	goto ifmatch_trivial_fail_test;
	6254
	6255	case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
	6256	ST.wanted = 1;
	6257	ifmatch_trivial_fail_test:
	6258	if (scan->flags) {
	6259	char * const s = HOPBACKc(locinput, scan->flags);
	6260	if (!s) {
	6261	/* trivial fail */
	6262	if (logical) {
	6263	logical = 0;
	6264	sw = 1 - cBOOL(ST.wanted);
	6265	}
	6266	else if (ST.wanted)
	6267	sayNO;
	6268	next = scan + ARG(scan);
	6269	if (next == scan)
	6270	next = NULL;
	6271	break;
	6272	}
	6273	newstart = s;
	6274	}
	6275	else
	6276	newstart = locinput;
	6277
	6278	do_ifmatch:
	6279	ST.me = scan;
	6280	ST.logical = logical;
	6281	logical = 0; /* XXX: reset state of logical once it has been saved into ST */
	6282
	6283	/* execute body of (?...A) */
	6284	PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
	6285	assert(0); /* NOTREACHED */
	6286	}
	6287
	6288	case IFMATCH_A_fail: /* body of (?...A) failed */
	6289	ST.wanted = !ST.wanted;
	6290	/* FALL THROUGH */
	6291
	6292	case IFMATCH_A: /* body of (?...A) succeeded */
	6293	if (ST.logical) {
	6294	sw = cBOOL(ST.wanted);
	6295	}
	6296	else if (!ST.wanted)
	6297	sayNO;
	6298
	6299	if (OP(ST.me) != SUSPEND) {
	6300	/* restore old position except for (?>...) */
	6301	locinput = st->locinput;
	6302	}
	6303	scan = ST.me + ARG(ST.me);
	6304	if (scan == ST.me)
	6305	scan = NULL;
	6306	continue; /* execute B */
	6307
	6308	#undef ST
	6309
	6310	case LONGJMP: /* alternative with many branches compiles to
	6311	* (BRANCHJ; EXACT ...; LONGJMP ) x N */
	6312	next = scan + ARG(scan);
	6313	if (next == scan)
	6314	next = NULL;
	6315	break;
	6316
	6317	case COMMIT: /* (COMMIT) /
	6318	reginfo->cutpoint = reginfo->strend;
	6319	/* FALLTHROUGH */
	6320
	6321	case PRUNE: /* (PRUNE) /
	6322	if (!scan->flags)
	6323	sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
	6324	PUSH_STATE_GOTO(COMMIT_next, next, locinput);
	6325	assert(0); /* NOTREACHED */
	6326
	6327	case COMMIT_next_fail:
	6328	no_final = 1;
	6329	/* FALLTHROUGH */
	6330
	6331	case OPFAIL: /* (FAIL) /
	6332	sayNO;
	6333	assert(0); /* NOTREACHED */
	6334
	6335	#define ST st->u.mark
	6336	case MARKPOINT: /* (MARK:foo) /
	6337	ST.prev_mark = mark_state;
	6338	ST.mark_name = sv_commit = sv_yes_mark
	6339	= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
	6340	mark_state = st;
	6341	ST.mark_loc = locinput;
	6342	PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
	6343	assert(0); /* NOTREACHED */
	6344
	6345	case MARKPOINT_next:
	6346	mark_state = ST.prev_mark;
	6347	sayYES;
	6348	assert(0); /* NOTREACHED */
	6349
	6350	case MARKPOINT_next_fail:
	6351	if (popmark && sv_eq(ST.mark_name,popmark))
	6352	{
	6353	if (ST.mark_loc > startpoint)
	6354	reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
	6355	popmark = NULL; /* we found our mark */
	6356	sv_commit = ST.mark_name;
	6357
	6358	DEBUG_EXECUTE_r({
	6359	PerlIO_printf(Perl_debug_log,
	6360	"%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
	6361	REPORT_CODE_OFF+depth*2, "",
	6362	PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
	6363	});
	6364	}
	6365	mark_state = ST.prev_mark;
	6366	sv_yes_mark = mark_state ?
	6367	mark_state->u.mark.mark_name : NULL;
	6368	sayNO;
	6369	assert(0); /* NOTREACHED */
	6370
	6371	case SKIP: /* (SKIP) /
	6372	if (scan->flags) {
	6373	/* (SKIP) : if we fail we cut here/
	6374	ST.mark_name = NULL;
	6375	ST.mark_loc = locinput;
	6376	PUSH_STATE_GOTO(SKIP_next,next, locinput);
	6377	} else {
	6378	/* (SKIP:NAME) : if there is a (MARK:NAME) fail where it was,
	6379	otherwise do nothing. Meaning we need to scan
	6380	*/
	6381	regmatch_state *cur = mark_state;
	6382	SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
	6383
	6384	while (cur) {
	6385	if ( sv_eq( cur->u.mark.mark_name,
	6386	find ) )
	6387	{
	6388	ST.mark_name = find;
	6389	PUSH_STATE_GOTO( SKIP_next, next, locinput);
	6390	}
	6391	cur = cur->u.mark.prev_mark;
	6392	}
	6393	}
	6394	/* Didn't find our (MARK:NAME) so ignore this (SKIP:NAME) */
	6395	break;
	6396
	6397	case SKIP_next_fail:
	6398	if (ST.mark_name) {
	6399	/* (*CUT:NAME) - Set up to search for the name as we
	6400	collapse the stack*/
	6401	popmark = ST.mark_name;
	6402	} else {
	6403	/* (CUT) - No name, we cut here./
	6404	if (ST.mark_loc > startpoint)
	6405	reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
	6406	/* but we set sv_commit to latest mark_name if there
	6407	is one so they can test to see how things lead to this
	6408	cut */
	6409	if (mark_state)
	6410	sv_commit=mark_state->u.mark.mark_name;
	6411	}
	6412	no_final = 1;
	6413	sayNO;
	6414	assert(0); /* NOTREACHED */
	6415	#undef ST
	6416
	6417	case LNBREAK: /* \R */
	6418	if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
	6419	locinput += n;
	6420	} else
	6421	sayNO;
	6422	break;
	6423
	6424	default:
	6425	PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
	6426	PTR2UV(scan), OP(scan));
	6427	Perl_croak(aTHX_ "regexp memory corruption");
	6428
	6429	/* this is a point to jump to in order to increment
	6430	* locinput by one character */
	6431	increment_locinput:
	6432	assert(!NEXTCHR_IS_EOS);
	6433	if (utf8_target) {
	6434	locinput += PL_utf8skip[nextchr];
	6435	/* locinput is allowed to go 1 char off the end, but not 2+ */
	6436	if (locinput > reginfo->strend)
	6437	sayNO;
	6438	}
	6439	else
	6440	locinput++;
	6441	break;
	6442
	6443	} /* end switch */
	6444
	6445	/* switch break jumps here */
	6446	scan = next; /* prepare to execute the next op and ... */
	6447	continue; /* ... jump back to the top, reusing st */
	6448	assert(0); /* NOTREACHED */
	6449
	6450	push_yes_state:
	6451	/* push a state that backtracks on success */
	6452	st->u.yes.prev_yes_state = yes_state;
	6453	yes_state = st;
	6454	/* FALL THROUGH */
	6455	push_state:
	6456	/* push a new regex state, then continue at scan */
	6457	{
	6458	regmatch_state *newst;
	6459
	6460	DEBUG_STACK_r({
	6461	regmatch_state *cur = st;
	6462	regmatch_state *curyes = yes_state;
	6463	int curd = depth;
	6464	regmatch_slab *slab = PL_regmatch_slab;
	6465	for (;curd > -1;cur--,curd--) {
	6466	if (cur < SLAB_FIRST(slab)) {
	6467	slab = slab->prev;
	6468	cur = SLAB_LAST(slab);
	6469	}
	6470	PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
	6471	REPORT_CODE_OFF + 2 + depth * 2,"",
	6472	curd, PL_reg_name[cur->resume_state],
	6473	(curyes == cur) ? "yes" : ""
	6474	);
	6475	if (curyes == cur)
	6476	curyes = cur->u.yes.prev_yes_state;
	6477	}
	6478	} else
	6479	DEBUG_STATE_pp("push")
	6480	);
	6481	depth++;
	6482	st->locinput = locinput;
	6483	newst = st+1;
	6484	if (newst > SLAB_LAST(PL_regmatch_slab))
	6485	newst = S_push_slab(aTHX);
	6486	PL_regmatch_state = newst;
	6487
	6488	locinput = pushinput;
	6489	st = newst;
	6490	continue;
	6491	assert(0); /* NOTREACHED */
	6492	}
	6493	}
	6494
	6495	/*
	6496	* We get here only if there's trouble -- normally "case END" is
	6497	* the terminating point.
	6498	*/
	6499	Perl_croak(aTHX_ "corrupted regexp pointers");
	6500	/NOTREACHED/
	6501	sayNO;
	6502
	6503	yes:
	6504	if (yes_state) {
	6505	/* we have successfully completed a subexpression, but we must now
	6506	* pop to the state marked by yes_state and continue from there */
	6507	assert(st != yes_state);
	6508	#ifdef DEBUGGING
	6509	while (st != yes_state) {
	6510	st--;
	6511	if (st < SLAB_FIRST(PL_regmatch_slab)) {
	6512	PL_regmatch_slab = PL_regmatch_slab->prev;
	6513	st = SLAB_LAST(PL_regmatch_slab);
	6514	}
	6515	DEBUG_STATE_r({
	6516	if (no_final) {
	6517	DEBUG_STATE_pp("pop (no final)");
	6518	} else {
	6519	DEBUG_STATE_pp("pop (yes)");
	6520	}
	6521	});
	6522	depth--;
	6523	}
	6524	#else
	6525	while (yes_state < SLAB_FIRST(PL_regmatch_slab)
	6526	\|\| yes_state > SLAB_LAST(PL_regmatch_slab))
	6527	{
	6528	/* not in this slab, pop slab */
	6529	depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
	6530	PL_regmatch_slab = PL_regmatch_slab->prev;
	6531	st = SLAB_LAST(PL_regmatch_slab);
	6532	}
	6533	depth -= (st - yes_state);
	6534	#endif
	6535	st = yes_state;
	6536	yes_state = st->u.yes.prev_yes_state;
	6537	PL_regmatch_state = st;
	6538
	6539	if (no_final)
	6540	locinput= st->locinput;
	6541	state_num = st->resume_state + no_final;
	6542	goto reenter_switch;
	6543	}
	6544
	6545	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
	6546	PL_colors[4], PL_colors[5]));
	6547
	6548	if (reginfo->info_aux_eval) {
	6549	/* each successfully executed (?{...}) block does the equivalent of
	6550	* local $^R = do {...}
	6551	* When popping the save stack, all these locals would be undone;
	6552	* bypass this by setting the outermost saved $^R to the latest
	6553	* value */
	6554	if (oreplsv != GvSV(PL_replgv))
	6555	sv_setsv(oreplsv, GvSV(PL_replgv));
	6556	}
	6557	result = 1;
	6558	goto final_exit;
	6559
	6560	no:
	6561	DEBUG_EXECUTE_r(
	6562	PerlIO_printf(Perl_debug_log,
	6563	"%*s %sfailed...%s\n",
	6564	REPORT_CODE_OFF+depth*2, "",
	6565	PL_colors[4], PL_colors[5])
	6566	);
	6567
	6568	no_silent:
	6569	if (no_final) {
	6570	if (yes_state) {
	6571	goto yes;
	6572	} else {
	6573	goto final_exit;
	6574	}
	6575	}
	6576	if (depth) {
	6577	/* there's a previous state to backtrack to */
	6578	st--;
	6579	if (st < SLAB_FIRST(PL_regmatch_slab)) {
	6580	PL_regmatch_slab = PL_regmatch_slab->prev;
	6581	st = SLAB_LAST(PL_regmatch_slab);
	6582	}
	6583	PL_regmatch_state = st;
	6584	locinput= st->locinput;
	6585
	6586	DEBUG_STATE_pp("pop");
	6587	depth--;
	6588	if (yes_state == st)
	6589	yes_state = st->u.yes.prev_yes_state;
	6590
	6591	state_num = st->resume_state + 1; /* failure = success + 1 */
	6592	goto reenter_switch;
	6593	}
	6594	result = 0;
	6595
	6596	final_exit:
	6597	if (rex->intflags & PREGf_VERBARG_SEEN) {
	6598	SV *sv_err = get_sv("REGERROR", 1);
	6599	SV *sv_mrk = get_sv("REGMARK", 1);
	6600	if (result) {
	6601	sv_commit = &PL_sv_no;
	6602	if (!sv_yes_mark)
	6603	sv_yes_mark = &PL_sv_yes;
	6604	} else {
	6605	if (!sv_commit)
	6606	sv_commit = &PL_sv_yes;
	6607	sv_yes_mark = &PL_sv_no;
	6608	}
	6609	sv_setsv(sv_err, sv_commit);
	6610	sv_setsv(sv_mrk, sv_yes_mark);
	6611	}
	6612
	6613
	6614	if (last_pushed_cv) {
	6615	dSP;
	6616	POP_MULTICALL;
	6617	PERL_UNUSED_VAR(SP);
	6618	}
	6619
	6620	assert(!result \|\| locinput - reginfo->strbeg >= 0);
	6621	return result ? locinput - reginfo->strbeg : -1;
	6622	}
	6623
	6624	/*
	6625	- regrepeat - repeatedly match something simple, report how many
	6626	*
	6627	* What 'simple' means is a node which can be the operand of a quantifier like
	6628	* '+', or {1,3}
	6629	*
	6630	* startposp - pointer a pointer to the start position. This is updated
	6631	* to point to the byte following the highest successful
	6632	* match.
	6633	* p - the regnode to be repeatedly matched against.
	6634	* reginfo - struct holding match state, such as strend
	6635	* max - maximum number of things to match.
	6636	* depth - (for debugging) backtracking depth.
	6637	*/
	6638	STATIC I32
	6639	S_regrepeat(pTHX_ regexp prog, char startposp, const regnode p,
	6640	regmatch_info *const reginfo, I32 max, int depth)
	6641	{
	6642	dVAR;
	6643	char scan; / Pointer to current position in target string */
	6644	I32 c;
	6645	char loceol = reginfo->strend; / local version */
	6646	I32 hardcount = 0; /* How many matches so far */
	6647	bool utf8_target = reginfo->is_utf8_target;
	6648	int to_complement = 0; /* Invert the result? */
	6649	UV utf8_flags;
	6650	_char_class_number classnum;
	6651	#ifndef DEBUGGING
	6652	PERL_UNUSED_ARG(depth);
	6653	#endif
	6654
	6655	PERL_ARGS_ASSERT_REGREPEAT;
	6656
	6657	scan = *startposp;
	6658	if (max == REG_INFTY)
	6659	max = I32_MAX;
	6660	else if (! utf8_target && loceol - scan > max)
	6661	loceol = scan + max;
	6662
	6663	/* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
	6664	* to the maximum of how far we should go in it (leaving it set to the real
	6665	* end, if the maximum permissible would take us beyond that). This allows
	6666	* us to make the loop exit condition that we haven't gone past <loceol> to
	6667	* also mean that we haven't exceeded the max permissible count, saving a
	6668	* test each time through the loop. But it assumes that the OP matches a
	6669	* single byte, which is true for most of the OPs below when applied to a
	6670	* non-UTF-8 target. Those relatively few OPs that don't have this
	6671	* characteristic will have to compensate.
	6672	*
	6673	* There is no adjustment for UTF-8 targets, as the number of bytes per
	6674	* character varies. OPs will have to test both that the count is less
	6675	* than the max permissible (using <hardcount> to keep track), and that we
	6676	* are still within the bounds of the string (using <loceol>. A few OPs
	6677	* match a single byte no matter what the encoding. They can omit the max
	6678	* test if, for the UTF-8 case, they do the adjustment that was skipped
	6679	* above.
	6680	*
	6681	* Thus, the code above sets things up for the common case; and exceptional
	6682	* cases need extra work; the common case is to make sure <scan> doesn't
	6683	* go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
	6684	* count doesn't exceed the maximum permissible */
	6685
	6686	switch (OP(p)) {
	6687	case REG_ANY:
	6688	if (utf8_target) {
	6689	while (scan < loceol && hardcount < max && *scan != '\n') {
	6690	scan += UTF8SKIP(scan);
	6691	hardcount++;
	6692	}
	6693	} else {
	6694	while (scan < loceol && *scan != '\n')
	6695	scan++;
	6696	}
	6697	break;
	6698	case SANY:
	6699	if (utf8_target) {
	6700	while (scan < loceol && hardcount < max) {
	6701	scan += UTF8SKIP(scan);
	6702	hardcount++;
	6703	}
	6704	}
	6705	else
	6706	scan = loceol;
	6707	break;
	6708	case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
	6709	if (utf8_target && loceol - scan > max) {
	6710
	6711	/* <loceol> hadn't been adjusted in the UTF-8 case */
	6712	scan += max;
	6713	}
	6714	else {
	6715	scan = loceol;
	6716	}
	6717	break;
	6718	case EXACT:
	6719	assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
	6720
	6721	c = (U8)*STRING(p);
	6722
	6723	/* Can use a simple loop if the pattern char to match on is invariant
	6724	* under UTF-8, or both target and pattern aren't UTF-8. Note that we
	6725	* can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
	6726	* true iff it doesn't matter if the argument is in UTF-8 or not */
	6727	if (UTF8_IS_INVARIANT(c) \|\| (! utf8_target && ! reginfo->is_utf8_pat)) {
	6728	if (utf8_target && loceol - scan > max) {
	6729	/* We didn't adjust <loceol> because is UTF-8, but ok to do so,
	6730	* since here, to match at all, 1 char == 1 byte */
	6731	loceol = scan + max;
	6732	}
	6733	while (scan < loceol && UCHARAT(scan) == c) {
	6734	scan++;
	6735	}
	6736	}
	6737	else if (reginfo->is_utf8_pat) {
	6738	if (utf8_target) {
	6739	STRLEN scan_char_len;
	6740
	6741	/* When both target and pattern are UTF-8, we have to do
	6742	* string EQ */
	6743	while (hardcount < max
	6744	&& scan < loceol
	6745	&& (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
	6746	&& memEQ(scan, STRING(p), scan_char_len))
	6747	{
	6748	scan += scan_char_len;
	6749	hardcount++;
	6750	}
	6751	}
	6752	else if (! UTF8_IS_ABOVE_LATIN1(c)) {
	6753
	6754	/* Target isn't utf8; convert the character in the UTF-8
	6755	* pattern to non-UTF8, and do a simple loop */
	6756	c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
	6757	while (scan < loceol && UCHARAT(scan) == c) {
	6758	scan++;
	6759	}
	6760	} /* else pattern char is above Latin1, can't possibly match the
	6761	non-UTF-8 target */
	6762	}
	6763	else {
	6764
	6765	/* Here, the string must be utf8; pattern isn't, and <c> is
	6766	* different in utf8 than not, so can't compare them directly.
	6767	* Outside the loop, find the two utf8 bytes that represent c, and
	6768	* then look for those in sequence in the utf8 string */
	6769	U8 high = UTF8_TWO_BYTE_HI(c);
	6770	U8 low = UTF8_TWO_BYTE_LO(c);
	6771
	6772	while (hardcount < max
	6773	&& scan + 1 < loceol
	6774	&& UCHARAT(scan) == high
	6775	&& UCHARAT(scan + 1) == low)
	6776	{
	6777	scan += 2;
	6778	hardcount++;
	6779	}
	6780	}
	6781	break;
	6782
	6783	case EXACTFA:
	6784	utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
	6785	goto do_exactf;
	6786
	6787	case EXACTFL:
	6788	RXp_MATCH_TAINTED_on(prog);
	6789	utf8_flags = FOLDEQ_UTF8_LOCALE;
	6790	goto do_exactf;
	6791
	6792	case EXACTF:
	6793	utf8_flags = 0;
	6794	goto do_exactf;
	6795
	6796	case EXACTFU_SS:
	6797	case EXACTFU_TRICKYFOLD:
	6798	case EXACTFU:
	6799	utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
	6800
	6801	do_exactf: {
	6802	int c1, c2;
	6803	U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
	6804
	6805	assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
	6806
	6807	if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
	6808	reginfo))
	6809	{
	6810	if (c1 == CHRTEST_VOID) {
	6811	/* Use full Unicode fold matching */
	6812	char *tmpeol = reginfo->strend;
	6813	STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
	6814	while (hardcount < max
	6815	&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
	6816	STRING(p), NULL, pat_len,
	6817	reginfo->is_utf8_pat, utf8_flags))
	6818	{
	6819	scan = tmpeol;
	6820	tmpeol = reginfo->strend;
	6821	hardcount++;
	6822	}
	6823	}
	6824	else if (utf8_target) {
	6825	if (c1 == c2) {
	6826	while (scan < loceol
	6827	&& hardcount < max
	6828	&& memEQ(scan, c1_utf8, UTF8SKIP(scan)))
	6829	{
	6830	scan += UTF8SKIP(scan);
	6831	hardcount++;
	6832	}
	6833	}
	6834	else {
	6835	while (scan < loceol
	6836	&& hardcount < max
	6837	&& (memEQ(scan, c1_utf8, UTF8SKIP(scan))
	6838	\|\| memEQ(scan, c2_utf8, UTF8SKIP(scan))))
	6839	{
	6840	scan += UTF8SKIP(scan);
	6841	hardcount++;
	6842	}
	6843	}
	6844	}
	6845	else if (c1 == c2) {
	6846	while (scan < loceol && UCHARAT(scan) == c1) {
	6847	scan++;
	6848	}
	6849	}
	6850	else {
	6851	while (scan < loceol &&
	6852	(UCHARAT(scan) == c1 \|\| UCHARAT(scan) == c2))
	6853	{
	6854	scan++;
	6855	}
	6856	}
	6857	}
	6858	break;
	6859	}
	6860	case ANYOF:
	6861	case ANYOF_WARN_SUPER:
	6862	if (utf8_target) {
	6863	while (hardcount < max
	6864	&& scan < loceol
	6865	&& reginclass(prog, p, (U8*)scan, utf8_target))
	6866	{
	6867	scan += UTF8SKIP(scan);
	6868	hardcount++;
	6869	}
	6870	} else {
	6871	while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
	6872	scan++;
	6873	}
	6874	break;
	6875
	6876	/* The argument (FLAGS) to all the POSIX node types is the class number */
	6877
	6878	case NPOSIXL:
	6879	to_complement = 1;
	6880	/* FALLTHROUGH */
	6881
	6882	case POSIXL:
	6883	RXp_MATCH_TAINTED_on(prog);
	6884	if (! utf8_target) {
	6885	while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
	6886	*scan)))
	6887	{
	6888	scan++;
	6889	}
	6890	} else {
	6891	while (hardcount < max && scan < loceol
	6892	&& to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
	6893	(U8 *) scan)))
	6894	{
	6895	scan += UTF8SKIP(scan);
	6896	hardcount++;
	6897	}
	6898	}
	6899	break;
	6900
	6901	case POSIXD:
	6902	if (utf8_target) {
	6903	goto utf8_posix;
	6904	}
	6905	/* FALLTHROUGH */
	6906
	6907	case POSIXA:
	6908	if (utf8_target && loceol - scan > max) {
	6909
	6910	/* We didn't adjust <loceol> at the beginning of this routine
	6911	* because is UTF-8, but it is actually ok to do so, since here, to
	6912	* match, 1 char == 1 byte. */
	6913	loceol = scan + max;
	6914	}
	6915	while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
	6916	scan++;
	6917	}
	6918	break;
	6919
	6920	case NPOSIXD:
	6921	if (utf8_target) {
	6922	to_complement = 1;
	6923	goto utf8_posix;
	6924	}
	6925	/* FALL THROUGH */
	6926
	6927	case NPOSIXA:
	6928	if (! utf8_target) {
	6929	while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
	6930	scan++;
	6931	}
	6932	}
	6933	else {
	6934
	6935	/* The complement of something that matches only ASCII matches all
	6936	* UTF-8 variant code points, plus everything in ASCII that isn't
	6937	* in the class. */
	6938	while (hardcount < max && scan < loceol
	6939	&& (! UTF8_IS_INVARIANT(*scan)
	6940	\|\| ! _generic_isCC_A((U8) *scan, FLAGS(p))))
	6941	{
	6942	scan += UTF8SKIP(scan);
	6943	hardcount++;
	6944	}
	6945	}
	6946	break;
	6947
	6948	case NPOSIXU:
	6949	to_complement = 1;
	6950	/* FALLTHROUGH */
	6951
	6952	case POSIXU:
	6953	if (! utf8_target) {
	6954	while (scan < loceol && to_complement
	6955	^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
	6956	{
	6957	scan++;
	6958	}
	6959	}
	6960	else {
	6961	utf8_posix:
	6962	classnum = (_char_class_number) FLAGS(p);
	6963	if (classnum < _FIRST_NON_SWASH_CC) {
	6964
	6965	/* Here, a swash is needed for above-Latin1 code points.
	6966	* Process as many Latin1 code points using the built-in rules.
	6967	* Go to another loop to finish processing upon encountering
	6968	* the first Latin1 code point. We could do that in this loop
	6969	* as well, but the other way saves having to test if the swash
	6970	* has been loaded every time through the loop: extra space to
	6971	* save a test. */
	6972	while (hardcount < max && scan < loceol) {
	6973	if (UTF8_IS_INVARIANT(*scan)) {
	6974	if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
	6975	classnum))))
	6976	{
	6977	break;
	6978	}
	6979	scan++;
	6980	}
	6981	else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
	6982	if (! (to_complement
	6983	^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
	6984	*(scan + 1)),
	6985	classnum))))
	6986	{
	6987	break;
	6988	}
	6989	scan += 2;
	6990	}
	6991	else {
	6992	goto found_above_latin1;
	6993	}
	6994
	6995	hardcount++;
	6996	}
	6997	}
	6998	else {
	6999	/* For these character classes, the knowledge of how to handle
	7000	* every code point is compiled in to Perl via a macro. This
	7001	* code is written for making the loops as tight as possible.
	7002	* It could be refactored to save space instead */
	7003	switch (classnum) {
	7004	case _CC_ENUM_SPACE: /* XXX would require separate code
	7005	if we revert the change of \v
	7006	matching this */
	7007	/* FALL THROUGH */
	7008	case _CC_ENUM_PSXSPC:
	7009	while (hardcount < max
	7010	&& scan < loceol
	7011	&& (to_complement ^ cBOOL(isSPACE_utf8(scan))))
	7012	{
	7013	scan += UTF8SKIP(scan);
	7014	hardcount++;
	7015	}
	7016	break;
	7017	case _CC_ENUM_BLANK:
	7018	while (hardcount < max
	7019	&& scan < loceol
	7020	&& (to_complement ^ cBOOL(isBLANK_utf8(scan))))
	7021	{
	7022	scan += UTF8SKIP(scan);
	7023	hardcount++;
	7024	}
	7025	break;
	7026	case _CC_ENUM_XDIGIT:
	7027	while (hardcount < max
	7028	&& scan < loceol
	7029	&& (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
	7030	{
	7031	scan += UTF8SKIP(scan);
	7032	hardcount++;
	7033	}
	7034	break;
	7035	case _CC_ENUM_VERTSPACE:
	7036	while (hardcount < max
	7037	&& scan < loceol
	7038	&& (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
	7039	{
	7040	scan += UTF8SKIP(scan);
	7041	hardcount++;
	7042	}
	7043	break;
	7044	case _CC_ENUM_CNTRL:
	7045	while (hardcount < max
	7046	&& scan < loceol
	7047	&& (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
	7048	{
	7049	scan += UTF8SKIP(scan);
	7050	hardcount++;
	7051	}
	7052	break;
	7053	default:
	7054	Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
	7055	}
	7056	}
	7057	}
	7058	break;
	7059
	7060	found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
	7061
	7062	/* Load the swash if not already present */
	7063	if (! PL_utf8_swash_ptrs[classnum]) {
	7064	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	7065	PL_utf8_swash_ptrs[classnum] = _core_swash_init(
	7066	"utf8", swash_property_names[classnum],
	7067	&PL_sv_undef, 1, 0, NULL, &flags);
	7068	}
	7069
	7070	while (hardcount < max && scan < loceol
	7071	&& to_complement ^ cBOOL(_generic_utf8(
	7072	classnum,
	7073	scan,
	7074	swash_fetch(PL_utf8_swash_ptrs[classnum],
	7075	(U8 *) scan,
	7076	TRUE))))
	7077	{
	7078	scan += UTF8SKIP(scan);
	7079	hardcount++;
	7080	}
	7081	break;
	7082
	7083	case LNBREAK:
	7084	if (utf8_target) {
	7085	while (hardcount < max && scan < loceol &&
	7086	(c=is_LNBREAK_utf8_safe(scan, loceol))) {
	7087	scan += c;
	7088	hardcount++;
	7089	}
	7090	} else {
	7091	/* LNBREAK can match one or two latin chars, which is ok, but we
	7092	* have to use hardcount in this situation, and throw away the
	7093	* adjustment to <loceol> done before the switch statement */
	7094	loceol = reginfo->strend;
	7095	while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
	7096	scan+=c;
	7097	hardcount++;
	7098	}
	7099	}
	7100	break;
	7101
	7102	case BOUND:
	7103	case BOUNDA:
	7104	case BOUNDL:
	7105	case BOUNDU:
	7106	case EOS:
	7107	case GPOS:
	7108	case KEEPS:
	7109	case NBOUND:
	7110	case NBOUNDA:
	7111	case NBOUNDL:
	7112	case NBOUNDU:
	7113	case OPFAIL:
	7114	case SBOL:
	7115	case SEOL:
	7116	/* These are all 0 width, so match right here or not at all. */
	7117	break;
	7118
	7119	default:
	7120	Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
	7121	assert(0); /* NOTREACHED */
	7122
	7123	}
	7124
	7125	if (hardcount)
	7126	c = hardcount;
	7127	else
	7128	c = scan - *startposp;
	7129	*startposp = scan;
	7130
	7131	DEBUG_r({
	7132	GET_RE_DEBUG_FLAGS_DECL;
	7133	DEBUG_EXECUTE_r({
	7134	SV * const prop = sv_newmortal();
	7135	regprop(prog, prop, p);
	7136	PerlIO_printf(Perl_debug_log,
	7137	"%*s %s can match %"IVdf" times out of %"IVdf"...\n",
	7138	REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
	7139	});
	7140	});
	7141
	7142	return(c);
	7143	}
	7144
	7145
	7146	#if !defined(PERL_IN_XSUB_RE) \|\| defined(PLUGGABLE_RE_EXTENSION)
	7147	/*
	7148	- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
	7149	create a copy so that changes the caller makes won't change the shared one.
	7150	If <altsvp> is non-null, will return NULL in it, for back-compat.
	7151	*/
	7152	SV *
	7153	Perl_regclass_swash(pTHX_ const regexp prog, const regnode node, bool doinit, SV listsvp, SV altsvp)
	7154	{
	7155	PERL_ARGS_ASSERT_REGCLASS_SWASH;
	7156
	7157	if (altsvp) {
	7158	*altsvp = NULL;
	7159	}
	7160
	7161	return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
	7162	}
	7163	#endif
	7164
	7165	STATIC SV *
	7166	S_core_regclass_swash(pTHX_ const regexp prog, const regnode node, bool doinit, SV** listsvp)
	7167	{
	7168	/* Returns the swash for the input 'node' in the regex 'prog'.
	7169	* If <doinit> is true, will attempt to create the swash if not already
	7170	* done.
	7171	* If <listsvp> is non-null, will return the swash initialization string in
	7172	* it.
	7173	* Tied intimately to how regcomp.c sets up the data structure */
	7174
	7175	dVAR;
	7176	SV *sw = NULL;
	7177	SV *si = NULL;
	7178	SV* invlist = NULL;
	7179
	7180	RXi_GET_DECL(prog,progi);
	7181	const struct reg_data * const data = prog ? progi->data : NULL;
	7182
	7183	PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
	7184
	7185	assert(ANYOF_NONBITMAP(node));
	7186
	7187	if (data && data->count) {
	7188	const U32 n = ARG(node);
	7189
	7190	if (data->what[n] == 's') {
	7191	SV * const rv = MUTABLE_SV(data->data[n]);
	7192	AV * const av = MUTABLE_AV(SvRV(rv));
	7193	SV **const ary = AvARRAY(av);
	7194	U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	7195
	7196	si = ary; / ary[0] = the string to initialize the swash with */
	7197
	7198	/* Elements 2 and 3 are either both present or both absent. [2] is
	7199	* any inversion list generated at compile time; [3] indicates if
	7200	* that inversion list has any user-defined properties in it. */
	7201	if (av_len(av) >= 2) {
	7202	invlist = ary[2];
	7203	if (SvUV(ary[3])) {
	7204	swash_init_flags \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	7205	}
	7206	}
	7207	else {
	7208	invlist = NULL;
	7209	}
	7210
	7211	/* Element [1] is reserved for the set-up swash. If already there,
	7212	* return it; if not, create it and store it there */
	7213	if (SvROK(ary[1])) {
	7214	sw = ary[1];
	7215	}
	7216	else if (si && doinit) {
	7217
	7218	sw = _core_swash_init("utf8", /* the utf8 package */
	7219	"", /* nameless */
	7220	si,
	7221	1, /* binary */
	7222	0, /* not from tr/// */
	7223	invlist,
	7224	&swash_init_flags);
	7225	(void)av_store(av, 1, sw);
	7226	}
	7227	}
	7228	}
	7229
	7230	if (listsvp) {
	7231	SV* matches_string = newSVpvn("", 0);
	7232
	7233	/* Use the swash, if any, which has to have incorporated into it all
	7234	* possibilities */
	7235	if ((! sw \|\| (invlist = _get_swash_invlist(sw)) == NULL)
	7236	&& (si && si != &PL_sv_undef))
	7237	{
	7238
	7239	/* If no swash, use the input initialization string, if available */
	7240	sv_catsv(matches_string, si);
	7241	}
	7242
	7243	/* Add the inversion list to whatever we have. This may have come from
	7244	* the swash, or from an input parameter */
	7245	if (invlist) {
	7246	sv_catsv(matches_string, _invlist_contents(invlist));
	7247	}
	7248	*listsvp = matches_string;
	7249	}
	7250
	7251	return sw;
	7252	}
	7253
	7254	/*
	7255	- reginclass - determine if a character falls into a character class
	7256
	7257	n is the ANYOF regnode
	7258	p is the target string
	7259	utf8_target tells whether p is in UTF-8.
	7260
	7261	Returns true if matched; false otherwise.
	7262
	7263	Note that this can be a synthetic start class, a combination of various
	7264	nodes, so things you think might be mutually exclusive, such as locale,
	7265	aren't. It can match both locale and non-locale
	7266
	7267	*/
	7268
	7269	STATIC bool
	7270	S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
	7271	{
	7272	dVAR;
	7273	const char flags = ANYOF_FLAGS(n);
	7274	bool match = FALSE;
	7275	UV c = *p;
	7276
	7277	PERL_ARGS_ASSERT_REGINCLASS;
	7278
	7279	/* If c is not already the code point, get it. Note that
	7280	* UTF8_IS_INVARIANT() works even if not in UTF-8 */
	7281	if (! UTF8_IS_INVARIANT(c) && utf8_target) {
	7282	STRLEN c_len = 0;
	7283	c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
	7284	(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
	7285	\| UTF8_ALLOW_FFFF \| UTF8_CHECK_ONLY);
	7286	/* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
	7287	* UTF8_ALLOW_FFFF */
	7288	if (c_len == (STRLEN)-1)
	7289	Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
	7290	}
	7291
	7292	/* If this character is potentially in the bitmap, check it */
	7293	if (c < 256) {
	7294	if (ANYOF_BITMAP_TEST(n, c))
	7295	match = TRUE;
	7296	else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
	7297	&& ! utf8_target
	7298	&& ! isASCII(c))
	7299	{
	7300	match = TRUE;
	7301	}
	7302	else if (flags & ANYOF_LOCALE) {
	7303	RXp_MATCH_TAINTED_on(prog);
	7304
	7305	if ((flags & ANYOF_LOC_FOLD)
	7306	&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
	7307	{
	7308	match = TRUE;
	7309	}
	7310	else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
	7311
	7312	/* The data structure is arranged so bits 0, 2, 4, ... are set
	7313	* if the class includes the Posix character class given by
	7314	* bit/2; and 1, 3, 5, ... are set if the class includes the
	7315	* complemented Posix class given by int(bit/2). So we loop
	7316	* through the bits, each time changing whether we complement
	7317	* the result or not. Suppose for the sake of illustration
	7318	* that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
	7319	* is set, it means there is a match for this ANYOF node if the
	7320	* character is in the class given by the expression (0 / 2 = 0
	7321	* = \w). If it is in that class, isFOO_lc() will return 1,
	7322	* and since 'to_complement' is 0, the result will stay TRUE,
	7323	* and we exit the loop. Suppose instead that bit 0 is 0, but
	7324	* bit 1 is 1. That means there is a match if the character
	7325	* matches \W. We won't bother to call isFOO_lc() on bit 0,
	7326	* but will on bit 1. On the second iteration 'to_complement'
	7327	* will be 1, so the exclusive or will reverse things, so we
	7328	* are testing for \W. On the third iteration, 'to_complement'
	7329	* will be 0, and we would be testing for \s; the fourth
	7330	* iteration would test for \S, etc.
	7331	*
	7332	* Note that this code assumes that all the classes are closed
	7333	* under folding. For example, if a character matches \w, then
	7334	* its fold does too; and vice versa. This should be true for
	7335	* any well-behaved locale for all the currently defined Posix
	7336	* classes, except for :lower: and :upper:, which are handled
	7337	* by the pseudo-class :cased: which matches if either of the
	7338	* other two does. To get rid of this assumption, an outer
	7339	* loop could be used below to iterate over both the source
	7340	* character, and its fold (if different) */
	7341
	7342	int count = 0;
	7343	int to_complement = 0;
	7344	while (count < ANYOF_MAX) {
	7345	if (ANYOF_CLASS_TEST(n, count)
	7346	&& to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
	7347	{
	7348	match = TRUE;
	7349	break;
	7350	}
	7351	count++;
	7352	to_complement ^= 1;
	7353	}
	7354	}
	7355	}
	7356	}
	7357
	7358	/* If the bitmap didn't (or couldn't) match, and something outside the
	7359	* bitmap could match, try that. Locale nodes specify completely the
	7360	* behavior of code points in the bit map (otherwise, a utf8 target would
	7361	* cause them to be treated as Unicode and not locale), except in
	7362	* the very unlikely event when this node is a synthetic start class, which
	7363	* could be a combination of locale and non-locale nodes. So allow locale
	7364	* to match for the synthetic start class, which will give a false
	7365	* positive that will be resolved when the match is done again as not part
	7366	* of the synthetic start class */
	7367	if (!match) {
	7368	if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
	7369	match = TRUE; /* Everything above 255 matches */
	7370	}
	7371	else if (ANYOF_NONBITMAP(n)
	7372	&& ((flags & ANYOF_NONBITMAP_NON_UTF8)
	7373	\|\| (utf8_target
	7374	&& (c >=256
	7375	\|\| (! (flags & ANYOF_LOCALE))
	7376	\|\| OP(n) == ANYOF_SYNTHETIC))))
	7377	{
	7378	SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
	7379	if (sw) {
	7380	U8 * utf8_p;
	7381	if (utf8_target) {
	7382	utf8_p = (U8 *) p;
	7383	} else { /* Convert to utf8 */
	7384	STRLEN len = 1;
	7385	utf8_p = bytes_to_utf8(p, &len);
	7386	}
	7387
	7388	if (swash_fetch(sw, utf8_p, TRUE)) {
	7389	match = TRUE;
	7390	}
	7391
	7392	/* If we allocated a string above, free it */
	7393	if (! utf8_target) Safefree(utf8_p);
	7394	}
	7395	}
	7396
	7397	if (UNICODE_IS_SUPER(c)
	7398	&& OP(n) == ANYOF_WARN_SUPER
	7399	&& ckWARN_d(WARN_NON_UNICODE))
	7400	{
	7401	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	7402	"Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
	7403	}
	7404	}
	7405
	7406	/* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
	7407	return cBOOL(flags & ANYOF_INVERT) ^ match;
	7408	}
	7409
	7410	STATIC U8 *
	7411	S_reghop3(U8 s, I32 off, const U8 lim)
	7412	{
	7413	/* return the position 'off' UTF-8 characters away from 's', forward if
	7414	* 'off' >= 0, backwards if negative. But don't go outside of position
	7415	* 'lim', which better be < s if off < 0 */
	7416
	7417	dVAR;
	7418
	7419	PERL_ARGS_ASSERT_REGHOP3;
	7420
	7421	if (off >= 0) {
	7422	while (off-- && s < lim) {
	7423	/* XXX could check well-formedness here */
	7424	s += UTF8SKIP(s);
	7425	}
	7426	}
	7427	else {
	7428	while (off++ && s > lim) {
	7429	s--;
	7430	if (UTF8_IS_CONTINUED(*s)) {
	7431	while (s > lim && UTF8_IS_CONTINUATION(*s))
	7432	s--;
	7433	}
	7434	/* XXX could check well-formedness here */
	7435	}
	7436	}
	7437	return s;
	7438	}
	7439
	7440	#ifdef XXX_dmq
	7441	/* there are a bunch of places where we use two reghop3's that should
	7442	be replaced with this routine. but since thats not done yet
	7443	we ifdef it out - dmq
	7444	*/
	7445	STATIC U8 *
	7446	S_reghop4(U8 s, I32 off, const U8 llim, const U8* rlim)
	7447	{
	7448	dVAR;
	7449
	7450	PERL_ARGS_ASSERT_REGHOP4;
	7451
	7452	if (off >= 0) {
	7453	while (off-- && s < rlim) {
	7454	/* XXX could check well-formedness here */
	7455	s += UTF8SKIP(s);
	7456	}
	7457	}
	7458	else {
	7459	while (off++ && s > llim) {
	7460	s--;
	7461	if (UTF8_IS_CONTINUED(*s)) {
	7462	while (s > llim && UTF8_IS_CONTINUATION(*s))
	7463	s--;
	7464	}
	7465	/* XXX could check well-formedness here */
	7466	}
	7467	}
	7468	return s;
	7469	}
	7470	#endif
	7471
	7472	STATIC U8 *
	7473	S_reghopmaybe3(U8* s, I32 off, const U8* lim)
	7474	{
	7475	dVAR;
	7476
	7477	PERL_ARGS_ASSERT_REGHOPMAYBE3;
	7478
	7479	if (off >= 0) {
	7480	while (off-- && s < lim) {
	7481	/* XXX could check well-formedness here */
	7482	s += UTF8SKIP(s);
	7483	}
	7484	if (off >= 0)
	7485	return NULL;
	7486	}
	7487	else {
	7488	while (off++ && s > lim) {
	7489	s--;
	7490	if (UTF8_IS_CONTINUED(*s)) {
	7491	while (s > lim && UTF8_IS_CONTINUATION(*s))
	7492	s--;
	7493	}
	7494	/* XXX could check well-formedness here */
	7495	}
	7496	if (off <= 0)
	7497	return NULL;
	7498	}
	7499	return s;
	7500	}
	7501
	7502
	7503	/* when executing a regex that may have (?{}), extra stuff needs setting
	7504	up that will be visible to the called code, even before the current
	7505	match has finished. In particular:
	7506
	7507	* $_ is localised to the SV currently being matched;
	7508	* pos($_) is created if necessary, ready to be updated on each call-out
	7509	to code;
	7510	* a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
	7511	isn't set until the current pattern is successfully finished), so that
	7512	$1 etc of the match-so-far can be seen;
	7513	* save the old values of subbeg etc of the current regex, and set then
	7514	to the current string (again, this is normally only done at the end
	7515	of execution)
	7516	*/
	7517
	7518	static void
	7519	S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
	7520	{
	7521	MAGIC *mg;
	7522	regexp *const rex = ReANY(reginfo->prog);
	7523	regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
	7524
	7525	eval_state->rex = rex;
	7526
	7527	if (reginfo->sv) {
	7528	/* Make $_ available to executed code. */
	7529	if (reginfo->sv != DEFSV) {
	7530	SAVE_DEFSV;
	7531	DEFSV_set(reginfo->sv);
	7532	}
	7533
	7534	if (!(mg = mg_find_mglob(reginfo->sv))) {
	7535	/* prepare for quick setting of pos */
	7536	mg = sv_magicext_mglob(reginfo->sv);
	7537	mg->mg_len = -1;
	7538	}
	7539	eval_state->pos_magic = mg;
	7540	eval_state->pos = mg->mg_len;
	7541	}
	7542	else
	7543	eval_state->pos_magic = NULL;
	7544
	7545	if (!PL_reg_curpm) {
	7546	/* PL_reg_curpm is a fake PMOP that we can attach the current
	7547	* regex to and point PL_curpm at, so that $1 et al are visible
	7548	* within a /(?{})/. It's just allocated once per interpreter the
	7549	* first time its needed */
	7550	Newxz(PL_reg_curpm, 1, PMOP);
	7551	#ifdef USE_ITHREADS
	7552	{
	7553	SV* const repointer = &PL_sv_undef;
	7554	/* this regexp is also owned by the new PL_reg_curpm, which
	7555	will try to free it. */
	7556	av_push(PL_regex_padav, repointer);
	7557	PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
	7558	PL_regex_pad = AvARRAY(PL_regex_padav);
	7559	}
	7560	#endif
	7561	}
	7562	SET_reg_curpm(reginfo->prog);
	7563	eval_state->curpm = PL_curpm;
	7564	PL_curpm = PL_reg_curpm;
	7565	if (RXp_MATCH_COPIED(rex)) {
	7566	/* Here is a serious problem: we cannot rewrite subbeg,
	7567	since it may be needed if this match fails. Thus
	7568	$` inside (?{}) could fail... */
	7569	eval_state->subbeg = rex->subbeg;
	7570	eval_state->sublen = rex->sublen;
	7571	eval_state->suboffset = rex->suboffset;
	7572	eval_state->subcoffset = rex->subcoffset;
	7573	#ifdef PERL_ANY_COW
	7574	eval_state->saved_copy = rex->saved_copy;
	7575	#endif
	7576	RXp_MATCH_COPIED_off(rex);
	7577	}
	7578	else
	7579	eval_state->subbeg = NULL;
	7580	rex->subbeg = (char *)reginfo->strbeg;
	7581	rex->suboffset = 0;
	7582	rex->subcoffset = 0;
	7583	rex->sublen = reginfo->strend - reginfo->strbeg;
	7584	}
	7585
	7586
	7587	/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
	7588
	7589	static void
	7590	S_cleanup_regmatch_info_aux(pTHX_ void *arg)
	7591	{
	7592	dVAR;
	7593	regmatch_info_aux aux = (regmatch_info_aux ) arg;
	7594	regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
	7595	regmatch_slab *s;
	7596
	7597	Safefree(aux->poscache);
	7598
	7599	if (eval_state) {
	7600
	7601	/* undo the effects of S_setup_eval_state() */
	7602
	7603	if (eval_state->subbeg) {
	7604	regexp * const rex = eval_state->rex;
	7605	rex->subbeg = eval_state->subbeg;
	7606	rex->sublen = eval_state->sublen;
	7607	rex->suboffset = eval_state->suboffset;
	7608	rex->subcoffset = eval_state->subcoffset;
	7609	#ifdef PERL_ANY_COW
	7610	rex->saved_copy = eval_state->saved_copy;
	7611	#endif
	7612	RXp_MATCH_COPIED_on(rex);
	7613	}
	7614	if (eval_state->pos_magic)
	7615	eval_state->pos_magic->mg_len = eval_state->pos;
	7616
	7617	PL_curpm = eval_state->curpm;
	7618	}
	7619
	7620	PL_regmatch_state = aux->old_regmatch_state;
	7621	PL_regmatch_slab = aux->old_regmatch_slab;
	7622
	7623	/* free all slabs above current one - this must be the last action
	7624	* of this function, as aux and eval_state are allocated within
	7625	* slabs and may be freed here */
	7626
	7627	s = PL_regmatch_slab->next;
	7628	if (s) {
	7629	PL_regmatch_slab->next = NULL;
	7630	while (s) {
	7631	regmatch_slab * const osl = s;
	7632	s = s->next;
	7633	Safefree(osl);
	7634	}
	7635	}
	7636	}
	7637
	7638
	7639	STATIC void
	7640	S_to_utf8_substr(pTHX_ regexp *prog)
	7641	{
	7642	/* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
	7643	* on the converted value */
	7644
	7645	int i = 1;
	7646
	7647	PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
	7648
	7649	do {
	7650	if (prog->substrs->data[i].substr
	7651	&& !prog->substrs->data[i].utf8_substr) {
	7652	SV* const sv = newSVsv(prog->substrs->data[i].substr);
	7653	prog->substrs->data[i].utf8_substr = sv;
	7654	sv_utf8_upgrade(sv);
	7655	if (SvVALID(prog->substrs->data[i].substr)) {
	7656	if (SvTAIL(prog->substrs->data[i].substr)) {
	7657	/* Trim the trailing \n that fbm_compile added last
	7658	time. */
	7659	SvCUR_set(sv, SvCUR(sv) - 1);
	7660	/* Whilst this makes the SV technically "invalid" (as its
	7661	buffer is no longer followed by "\0") when fbm_compile()
	7662	adds the "\n" back, a "\0" is restored. */
	7663	fbm_compile(sv, FBMcf_TAIL);
	7664	} else
	7665	fbm_compile(sv, 0);
	7666	}
	7667	if (prog->substrs->data[i].substr == prog->check_substr)
	7668	prog->check_utf8 = sv;
	7669	}
	7670	} while (i--);
	7671	}
	7672
	7673	STATIC bool
	7674	S_to_byte_substr(pTHX_ regexp *prog)
	7675	{
	7676	/* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
	7677	* on the converted value; returns FALSE if can't be converted. */
	7678
	7679	dVAR;
	7680	int i = 1;
	7681
	7682	PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
	7683
	7684	do {
	7685	if (prog->substrs->data[i].utf8_substr
	7686	&& !prog->substrs->data[i].substr) {
	7687	SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
	7688	if (! sv_utf8_downgrade(sv, TRUE)) {
	7689	return FALSE;
	7690	}
	7691	if (SvVALID(prog->substrs->data[i].utf8_substr)) {
	7692	if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
	7693	/* Trim the trailing \n that fbm_compile added last
	7694	time. */
	7695	SvCUR_set(sv, SvCUR(sv) - 1);
	7696	fbm_compile(sv, FBMcf_TAIL);
	7697	} else
	7698	fbm_compile(sv, 0);
	7699	}
	7700	prog->substrs->data[i].substr = sv;
	7701	if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
	7702	prog->check_substr = sv;
	7703	}
	7704	} while (i--);
	7705
	7706	return TRUE;
	7707	}
	7708
	7709	/*
	7710	* Local variables:
	7711	* c-indentation-style: bsd
	7712	* c-basic-offset: 4
	7713	* indent-tabs-mode: nil
	7714	* End:
	7715	*
	7716	* ex: set ts=8 sts=4 sw=4 et:
	7717	*/