5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define RF_tainted 1 /* tainted information used? e.g. locale */
97 #define RF_warned 2 /* warned about big count? */
99 #define RF_utf8 8 /* Pattern contains multibyte chars? */
101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
106 #define STATIC static
109 /* Valid for non-utf8 strings: avoids the reginclass
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
112 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
113 : ANYOF_BITMAP_TEST(p,*(c)))
119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
122 #define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
126 #define HOPBACKc(pos, off) \
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138 #define NEXTCHR_IS_EOS (nextchr < 0)
140 #define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
143 #define SET_locinput(p) \
148 /* these are unrolled below in the CCC_TRY_XXX defined */
149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
150 if (!CAT2(PL_utf8_,class)) { \
152 ENTER; save_re_context(); \
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
156 /* Doesn't do an assert to verify that is correct */
157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
158 if (!CAT2(PL_utf8_,class)) { \
160 PERL_UNUSED_VAR(throw_away); \
161 ENTER; save_re_context(); \
162 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
163 PERL_UNUSED_VAR(throw_away); \
166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
167 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
169 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 /* No asserts are done for some of these, in case called on a */ \
171 /* Unicode version in which they map to nothing */ \
172 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
173 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
175 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
178 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
180 /* for use after a quantifier and before an EXACT-like node -- japhy */
181 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
183 * NOTE that *nothing* that affects backtracking should be in here, specifically
184 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
185 * node that is in between two EXACT like nodes when ascertaining what the required
186 * "follow" character is. This should probably be moved to regex compile time
187 * although it may be done at run time beause of the REF possibility - more
188 * investigation required. -- demerphq
190 #define JUMPABLE(rn) ( \
192 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
194 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
195 OP(rn) == PLUS || OP(rn) == MINMOD || \
197 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
199 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
201 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
204 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
205 we don't need this definition. */
206 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
207 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
208 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
211 /* ... so we use this as its faster. */
212 #define IS_TEXT(rn) ( OP(rn)==EXACT )
213 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
214 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
215 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
220 Search for mandatory following text node; for lookahead, the text must
221 follow but for lookbehind (rn->flags != 0) we skip to the next step.
223 #define FIND_NEXT_IMPT(rn) STMT_START { \
224 while (JUMPABLE(rn)) { \
225 const OPCODE type = OP(rn); \
226 if (type == SUSPEND || PL_regkind[type] == CURLY) \
227 rn = NEXTOPER(NEXTOPER(rn)); \
228 else if (type == PLUS) \
230 else if (type == IFMATCH) \
231 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
232 else rn += NEXT_OFF(rn); \
236 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
237 * These are for the pre-composed Hangul syllables, which are all in a
238 * contiguous block and arranged there in such a way so as to facilitate
239 * alorithmic determination of their characteristics. As such, they don't need
240 * a swash, but can be determined by simple arithmetic. Almost all are
241 * GCB=LVT, but every 28th one is a GCB=LV */
242 #define SBASE 0xAC00 /* Start of block */
243 #define SCount 11172 /* Length of block */
246 static void restore_pos(pTHX_ void *arg);
248 #define REGCP_PAREN_ELEMS 3
249 #define REGCP_OTHER_ELEMS 3
250 #define REGCP_FRAME_ELEMS 1
251 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
252 * are needed for the regexp context stack bookkeeping. */
255 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
258 const int retval = PL_savestack_ix;
259 const int paren_elems_to_push =
260 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
261 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
262 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
264 GET_RE_DEBUG_FLAGS_DECL;
266 PERL_ARGS_ASSERT_REGCPPUSH;
268 if (paren_elems_to_push < 0)
269 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
270 paren_elems_to_push);
272 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
273 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
274 " out of range (%lu-%ld)",
276 (unsigned long)maxopenparen,
279 SSGROW(total_elems + REGCP_FRAME_ELEMS);
282 if ((int)maxopenparen > (int)parenfloor)
283 PerlIO_printf(Perl_debug_log,
284 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
289 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
290 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
291 SSPUSHINT(rex->offs[p].end);
292 SSPUSHINT(rex->offs[p].start);
293 SSPUSHINT(rex->offs[p].start_tmp);
294 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
295 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
297 (IV)rex->offs[p].start,
298 (IV)rex->offs[p].start_tmp,
302 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
303 SSPUSHINT(maxopenparen);
304 SSPUSHINT(rex->lastparen);
305 SSPUSHINT(rex->lastcloseparen);
306 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
311 /* These are needed since we do not localize EVAL nodes: */
312 #define REGCP_SET(cp) \
314 PerlIO_printf(Perl_debug_log, \
315 " Setting an EVAL scope, savestack=%"IVdf"\n", \
316 (IV)PL_savestack_ix)); \
319 #define REGCP_UNWIND(cp) \
321 if (cp != PL_savestack_ix) \
322 PerlIO_printf(Perl_debug_log, \
323 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
324 (IV)(cp), (IV)PL_savestack_ix)); \
327 #define UNWIND_PAREN(lp, lcp) \
328 for (n = rex->lastparen; n > lp; n--) \
329 rex->offs[n].end = -1; \
330 rex->lastparen = n; \
331 rex->lastcloseparen = lcp;
335 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
340 GET_RE_DEBUG_FLAGS_DECL;
342 PERL_ARGS_ASSERT_REGCPPOP;
344 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
346 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
347 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
348 rex->lastcloseparen = SSPOPINT;
349 rex->lastparen = SSPOPINT;
350 *maxopenparen_p = SSPOPINT;
352 i -= REGCP_OTHER_ELEMS;
353 /* Now restore the parentheses context. */
355 if (i || rex->lastparen + 1 <= rex->nparens)
356 PerlIO_printf(Perl_debug_log,
357 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
362 paren = *maxopenparen_p;
363 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
365 rex->offs[paren].start_tmp = SSPOPINT;
366 rex->offs[paren].start = SSPOPINT;
368 if (paren <= rex->lastparen)
369 rex->offs[paren].end = tmps;
370 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
371 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
373 (IV)rex->offs[paren].start,
374 (IV)rex->offs[paren].start_tmp,
375 (IV)rex->offs[paren].end,
376 (paren > rex->lastparen ? "(skipped)" : ""));
381 /* It would seem that the similar code in regtry()
382 * already takes care of this, and in fact it is in
383 * a better location to since this code can #if 0-ed out
384 * but the code in regtry() is needed or otherwise tests
385 * requiring null fields (pat.t#187 and split.t#{13,14}
386 * (as of patchlevel 7877) will fail. Then again,
387 * this code seems to be necessary or otherwise
388 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
389 * --jhi updated by dapm */
390 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
391 if (i > *maxopenparen_p)
392 rex->offs[i].start = -1;
393 rex->offs[i].end = -1;
394 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
395 " \\%"UVuf": %s ..-1 undeffing\n",
397 (i > *maxopenparen_p) ? "-1" : " "
403 /* restore the parens and associated vars at savestack position ix,
404 * but without popping the stack */
407 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
409 I32 tmpix = PL_savestack_ix;
410 PL_savestack_ix = ix;
411 regcppop(rex, maxopenparen_p);
412 PL_savestack_ix = tmpix;
415 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
418 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
420 /* Returns a boolean as to whether or not 'character' is a member of the
421 * Posix character class given by 'classnum' that should be equivalent to a
422 * value in the typedef '_char_class_number'.
424 * Ideally this could be replaced by a just an array of function pointers
425 * to the C library functions that implement the macros this calls.
426 * However, to compile, the precise function signatures are required, and
427 * these may vary from platform to to platform. To avoid having to figure
428 * out what those all are on each platform, I (khw) am using this method,
429 * which adds an extra layer of function call overhead (unless the C
430 * optimizer strips it away). But we don't particularly care about
431 * performance with locales anyway. */
433 switch ((_char_class_number) classnum) {
434 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
435 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
436 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
437 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
438 case _CC_ENUM_LOWER: return isLOWER_LC(character);
439 case _CC_ENUM_PRINT: return isPRINT_LC(character);
440 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
441 case _CC_ENUM_UPPER: return isUPPER_LC(character);
442 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
443 case _CC_ENUM_SPACE: return isSPACE_LC(character);
444 case _CC_ENUM_BLANK: return isBLANK_LC(character);
445 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
446 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
447 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
448 case _CC_ENUM_ASCII: return isASCII_LC(character);
449 default: /* VERTSPACE should never occur in locales */
450 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
453 assert(0); /* NOTREACHED */
458 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
460 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
461 * 'character' is a member of the Posix character class given by 'classnum'
462 * that should be equivalent to a value in the typedef
463 * '_char_class_number'.
465 * This just calls isFOO_lc on the code point for the character if it is in
466 * the range 0-255. Outside that range, all characters avoid Unicode
467 * rules, ignoring any locale. So use the Unicode function if this class
468 * requires a swash, and use the Unicode macro otherwise. */
470 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
472 if (UTF8_IS_INVARIANT(*character)) {
473 return isFOO_lc(classnum, *character);
475 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
476 return isFOO_lc(classnum,
477 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
480 if (classnum < _FIRST_NON_SWASH_CC) {
482 /* Initialize the swash unless done already */
483 if (! PL_utf8_swash_ptrs[classnum]) {
484 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
485 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
486 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
489 return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
492 switch ((_char_class_number) classnum) {
494 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
496 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
497 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
498 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
499 default: return 0; /* Things like CNTRL are always
503 assert(0); /* NOTREACHED */
508 * pregexec and friends
511 #ifndef PERL_IN_XSUB_RE
513 - pregexec - match a regexp against a string
516 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
517 char *strbeg, I32 minend, SV *screamer, U32 nosave)
518 /* stringarg: the point in the string at which to begin matching */
519 /* strend: pointer to null at end of string */
520 /* strbeg: real beginning of string */
521 /* minend: end of match must be >= minend bytes after stringarg. */
522 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
523 * itself is accessed via the pointers above */
524 /* nosave: For optimizations. */
526 PERL_ARGS_ASSERT_PREGEXEC;
529 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
530 nosave ? 0 : REXEC_COPY_STR);
535 * Need to implement the following flags for reg_anch:
537 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
539 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
540 * INTUIT_AUTORITATIVE_ML
541 * INTUIT_ONCE_NOML - Intuit can match in one location only.
544 * Another flag for this function: SECOND_TIME (so that float substrs
545 * with giant delta may be not rechecked).
548 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
550 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
551 Otherwise, only SvCUR(sv) is used to get strbeg. */
553 /* XXXX We assume that strpos is strbeg unless sv. */
555 /* XXXX Some places assume that there is a fixed substring.
556 An update may be needed if optimizer marks as "INTUITable"
557 RExen without fixed substrings. Similarly, it is assumed that
558 lengths of all the strings are no more than minlen, thus they
559 cannot come from lookahead.
560 (Or minlen should take into account lookahead.)
561 NOTE: Some of this comment is not correct. minlen does now take account
562 of lookahead/behind. Further research is required. -- demerphq
566 /* A failure to find a constant substring means that there is no need to make
567 an expensive call to REx engine, thus we celebrate a failure. Similarly,
568 finding a substring too deep into the string means that less calls to
569 regtry() should be needed.
571 REx compiler's optimizer found 4 possible hints:
572 a) Anchored substring;
574 c) Whether we are anchored (beginning-of-line or \G);
575 d) First node (of those at offset 0) which may distinguish positions;
576 We use a)b)d) and multiline-part of c), and try to find a position in the
577 string which does not contradict any of them.
580 /* Most of decisions we do here should have been done at compile time.
581 The nodes of the REx which we used for the search should have been
582 deleted from the finite automaton. */
585 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
586 char *strend, const U32 flags, re_scream_pos_data *data)
589 struct regexp *const prog = ReANY(rx);
591 /* Should be nonnegative! */
597 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
599 char *other_last = NULL; /* other substr checked before this */
600 char *check_at = NULL; /* check substr found at this pos */
601 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
602 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
603 RXi_GET_DECL(prog,progi);
605 const char * const i_strpos = strpos;
607 GET_RE_DEBUG_FLAGS_DECL;
609 PERL_ARGS_ASSERT_RE_INTUIT_START;
610 PERL_UNUSED_ARG(flags);
611 PERL_UNUSED_ARG(data);
613 RX_MATCH_UTF8_set(rx,utf8_target);
616 PL_reg_flags |= RF_utf8;
619 debug_start_match(rx, utf8_target, strpos, strend,
620 sv ? "Guessing start of match in sv for"
621 : "Guessing start of match in string for");
624 /* CHR_DIST() would be more correct here but it makes things slow. */
625 if (prog->minlen > strend - strpos) {
626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
627 "String too short... [re_intuit_start]\n"));
631 /* XXX we need to pass strbeg as a separate arg: the following is
632 * guesswork and can be wrong... */
633 if (sv && SvPOK(sv)) {
634 char * p = SvPVX(sv);
635 STRLEN cur = SvCUR(sv);
636 if (p <= strpos && strpos < p + cur) {
638 assert(p <= strend && strend <= p + cur);
641 strbeg = strend - cur;
648 if (!prog->check_utf8 && prog->check_substr)
649 to_utf8_substr(prog);
650 check = prog->check_utf8;
652 if (!prog->check_substr && prog->check_utf8) {
653 if (! to_byte_substr(prog)) {
654 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
657 check = prog->check_substr;
659 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
660 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
661 || ( (prog->extflags & RXf_ANCH_BOL)
662 && !multiline ) ); /* Check after \n? */
665 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
666 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
667 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
669 && (strpos != strbeg)) {
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
673 if (prog->check_offset_min == prog->check_offset_max
674 && !(prog->extflags & RXf_CANY_SEEN)
675 && ! multiline) /* /m can cause \n's to match that aren't
676 accounted for in the string max length.
677 See [perl #115242] */
679 /* Substring at constant offset from beg-of-str... */
682 s = HOP3c(strpos, prog->check_offset_min, strend);
685 slen = SvCUR(check); /* >= 1 */
687 if ( strend - s > slen || strend - s < slen - 1
688 || (strend - s == slen && strend[-1] != '\n')) {
689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
692 /* Now should match s[0..slen-2] */
694 if (slen && (*SvPVX_const(check) != *s
696 && memNE(SvPVX_const(check), s, slen)))) {
698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
702 else if (*SvPVX_const(check) != *s
703 || ((slen = SvCUR(check)) > 1
704 && memNE(SvPVX_const(check), s, slen)))
707 goto success_at_start;
710 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
712 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
713 end_shift = prog->check_end_shift;
716 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
717 - (SvTAIL(check) != 0);
718 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
720 if (end_shift < eshift)
724 else { /* Can match at random position */
727 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
728 end_shift = prog->check_end_shift;
730 /* end shift should be non negative here */
733 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
735 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
736 (IV)end_shift, RX_PRECOMP(prog));
740 /* Find a possible match in the region s..strend by looking for
741 the "check" substring in the region corrected by start/end_shift. */
744 I32 srch_start_shift = start_shift;
745 I32 srch_end_shift = end_shift;
748 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
749 srch_end_shift -= ((strbeg - s) - srch_start_shift);
750 srch_start_shift = strbeg - s;
752 DEBUG_OPTIMISE_MORE_r({
753 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
754 (IV)prog->check_offset_min,
755 (IV)srch_start_shift,
757 (IV)prog->check_end_shift);
760 if (prog->extflags & RXf_CANY_SEEN) {
761 start_point= (U8*)(s + srch_start_shift);
762 end_point= (U8*)(strend - srch_end_shift);
764 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
765 end_point= HOP3(strend, -srch_end_shift, strbeg);
767 DEBUG_OPTIMISE_MORE_r({
768 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
769 (int)(end_point - start_point),
770 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
774 s = fbm_instr( start_point, end_point,
775 check, multiline ? FBMrf_MULTILINE : 0);
777 /* Update the count-of-usability, remove useless subpatterns,
781 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
782 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
783 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
784 (s ? "Found" : "Did not find"),
785 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
786 ? "anchored" : "floating"),
789 (s ? " at offset " : "...\n") );
794 /* Finish the diagnostic message */
795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
797 /* XXX dmq: first branch is for positive lookbehind...
798 Our check string is offset from the beginning of the pattern.
799 So we need to do any stclass tests offset forward from that
808 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
809 Start with the other substr.
810 XXXX no SCREAM optimization yet - and a very coarse implementation
811 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
812 *always* match. Probably should be marked during compile...
813 Probably it is right to do no SCREAM here...
816 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
817 : (prog->float_substr && prog->anchored_substr))
819 /* Take into account the "other" substring. */
820 /* XXXX May be hopelessly wrong for UTF... */
823 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
826 char * const last = HOP3c(s, -start_shift, strbeg);
828 char * const saved_s = s;
831 t = s - prog->check_offset_max;
832 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
834 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
839 t = HOP3c(t, prog->anchored_offset, strend);
840 if (t < other_last) /* These positions already checked */
842 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
845 /* XXXX It is not documented what units *_offsets are in.
846 We assume bytes, but this is clearly wrong.
847 Meaning this code needs to be carefully reviewed for errors.
851 /* On end-of-str: see comment below. */
852 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
853 if (must == &PL_sv_undef) {
855 DEBUG_r(must = prog->anchored_utf8); /* for debug */
860 HOP3(HOP3(last1, prog->anchored_offset, strend)
861 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
863 multiline ? FBMrf_MULTILINE : 0
866 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
867 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
868 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
869 (s ? "Found" : "Contradicts"),
870 quoted, RE_SV_TAIL(must));
875 if (last1 >= last2) {
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
877 ", giving up...\n"));
880 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
881 ", trying floating at offset %ld...\n",
882 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
883 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
884 s = HOP3c(last, 1, strend);
888 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
889 (long)(s - i_strpos)));
890 t = HOP3c(s, -prog->anchored_offset, strbeg);
891 other_last = HOP3c(s, 1, strend);
899 else { /* Take into account the floating substring. */
901 char * const saved_s = s;
904 t = HOP3c(s, -start_shift, strbeg);
906 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
907 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
908 last = HOP3c(t, prog->float_max_offset, strend);
909 s = HOP3c(t, prog->float_min_offset, strend);
912 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
913 must = utf8_target ? prog->float_utf8 : prog->float_substr;
914 /* fbm_instr() takes into account exact value of end-of-str
915 if the check is SvTAIL(ed). Since false positives are OK,
916 and end-of-str is not later than strend we are OK. */
917 if (must == &PL_sv_undef) {
919 DEBUG_r(must = prog->float_utf8); /* for debug message */
922 s = fbm_instr((unsigned char*)s,
923 (unsigned char*)last + SvCUR(must)
925 must, multiline ? FBMrf_MULTILINE : 0);
927 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
928 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
929 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
930 (s ? "Found" : "Contradicts"),
931 quoted, RE_SV_TAIL(must));
935 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
936 ", giving up...\n"));
939 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
940 ", trying anchored starting at offset %ld...\n",
941 (long)(saved_s + 1 - i_strpos)));
943 s = HOP3c(t, 1, strend);
947 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
948 (long)(s - i_strpos)));
949 other_last = s; /* Fix this later. --Hugo */
959 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
961 DEBUG_OPTIMISE_MORE_r(
962 PerlIO_printf(Perl_debug_log,
963 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
964 (IV)prog->check_offset_min,
965 (IV)prog->check_offset_max,
973 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
975 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
978 /* Fixed substring is found far enough so that the match
979 cannot start at strpos. */
981 if (ml_anch && t[-1] != '\n') {
982 /* Eventually fbm_*() should handle this, but often
983 anchored_offset is not 0, so this check will not be wasted. */
984 /* XXXX In the code below we prefer to look for "^" even in
985 presence of anchored substrings. And we search even
986 beyond the found float position. These pessimizations
987 are historical artefacts only. */
989 while (t < strend - prog->minlen) {
991 if (t < check_at - prog->check_offset_min) {
992 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
993 /* Since we moved from the found position,
994 we definitely contradict the found anchored
995 substr. Due to the above check we do not
996 contradict "check" substr.
997 Thus we can arrive here only if check substr
998 is float. Redo checking for "other"=="fixed".
1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1002 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1003 goto do_other_anchored;
1005 /* We don't contradict the found floating substring. */
1006 /* XXXX Why not check for STCLASS? */
1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1009 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1012 /* Position contradicts check-string */
1013 /* XXXX probably better to look for check-string
1014 than for "\n", so one should lower the limit for t? */
1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1016 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1017 other_last = strpos = s = t + 1;
1022 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1023 PL_colors[0], PL_colors[1]));
1027 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1028 PL_colors[0], PL_colors[1]));
1032 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1035 /* The found string does not prohibit matching at strpos,
1036 - no optimization of calling REx engine can be performed,
1037 unless it was an MBOL and we are not after MBOL,
1038 or a future STCLASS check will fail this. */
1040 /* Even in this situation we may use MBOL flag if strpos is offset
1041 wrt the start of the string. */
1042 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1043 && (strpos != strbeg) && strpos[-1] != '\n'
1044 /* May be due to an implicit anchor of m{.*foo} */
1045 && !(prog->intflags & PREGf_IMPLICIT))
1050 DEBUG_EXECUTE_r( if (ml_anch)
1051 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1052 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1055 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1057 prog->check_utf8 /* Could be deleted already */
1058 && --BmUSEFUL(prog->check_utf8) < 0
1059 && (prog->check_utf8 == prog->float_utf8)
1061 prog->check_substr /* Could be deleted already */
1062 && --BmUSEFUL(prog->check_substr) < 0
1063 && (prog->check_substr == prog->float_substr)
1066 /* If flags & SOMETHING - do not do it many times on the same match */
1067 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1068 /* XXX Does the destruction order has to change with utf8_target? */
1069 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1070 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1071 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1072 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1073 check = NULL; /* abort */
1075 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1076 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1077 if (prog->intflags & PREGf_IMPLICIT)
1078 prog->extflags &= ~RXf_ANCH_MBOL;
1079 /* XXXX This is a remnant of the old implementation. It
1080 looks wasteful, since now INTUIT can use many
1081 other heuristics. */
1082 prog->extflags &= ~RXf_USE_INTUIT;
1083 /* XXXX What other flags might need to be cleared in this branch? */
1089 /* Last resort... */
1090 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1091 /* trie stclasses are too expensive to use here, we are better off to
1092 leave it to regmatch itself */
1093 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1094 /* minlen == 0 is possible if regstclass is \b or \B,
1095 and the fixed substr is ''$.
1096 Since minlen is already taken into account, s+1 is before strend;
1097 accidentally, minlen >= 1 guaranties no false positives at s + 1
1098 even for \b or \B. But (minlen? 1 : 0) below assumes that
1099 regstclass does not come from lookahead... */
1100 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1101 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1102 const U8* const str = (U8*)STRING(progi->regstclass);
1103 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1104 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1107 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1108 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1109 else if (prog->float_substr || prog->float_utf8)
1110 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1114 if (checked_upto < s)
1116 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1117 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1120 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1125 const char *what = NULL;
1127 if (endpos == strend) {
1128 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1129 "Could not match STCLASS...\n") );
1132 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1133 "This position contradicts STCLASS...\n") );
1134 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1136 checked_upto = HOPBACKc(endpos, start_shift);
1137 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1138 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1139 /* Contradict one of substrings */
1140 if (prog->anchored_substr || prog->anchored_utf8) {
1141 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1142 DEBUG_EXECUTE_r( what = "anchored" );
1144 s = HOP3c(t, 1, strend);
1145 if (s + start_shift + end_shift > strend) {
1146 /* XXXX Should be taken into account earlier? */
1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148 "Could not match STCLASS...\n") );
1153 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1154 "Looking for %s substr starting at offset %ld...\n",
1155 what, (long)(s + start_shift - i_strpos)) );
1158 /* Have both, check_string is floating */
1159 if (t + start_shift >= check_at) /* Contradicts floating=check */
1160 goto retry_floating_check;
1161 /* Recheck anchored substring, but not floating... */
1165 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1166 "Looking for anchored substr starting at offset %ld...\n",
1167 (long)(other_last - i_strpos)) );
1168 goto do_other_anchored;
1170 /* Another way we could have checked stclass at the
1171 current position only: */
1176 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1177 "Looking for /%s^%s/m starting at offset %ld...\n",
1178 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1181 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1183 /* Check is floating substring. */
1184 retry_floating_check:
1185 t = check_at - start_shift;
1186 DEBUG_EXECUTE_r( what = "floating" );
1187 goto hop_and_restart;
1190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1191 "By STCLASS: moving %ld --> %ld\n",
1192 (long)(t - i_strpos), (long)(s - i_strpos))
1196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1197 "Does not contradict STCLASS...\n");
1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1203 PL_colors[4], (check ? "Guessed" : "Giving up"),
1204 PL_colors[5], (long)(s - i_strpos)) );
1207 fail_finish: /* Substring not found */
1208 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1209 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1211 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1212 PL_colors[4], PL_colors[5]));
1216 #define DECL_TRIE_TYPE(scan) \
1217 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1218 trie_type = ((scan->flags == EXACT) \
1219 ? (utf8_target ? trie_utf8 : trie_plain) \
1220 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1222 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1223 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1225 switch (trie_type) { \
1226 case trie_utf8_fold: \
1227 if ( foldlen>0 ) { \
1228 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1233 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1234 len = UTF8SKIP(uc); \
1235 skiplen = UNISKIP( uvc ); \
1236 foldlen -= skiplen; \
1237 uscan = foldbuf + skiplen; \
1240 case trie_latin_utf8_fold: \
1241 if ( foldlen>0 ) { \
1242 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1248 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1249 skiplen = UNISKIP( uvc ); \
1250 foldlen -= skiplen; \
1251 uscan = foldbuf + skiplen; \
1255 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1262 charid = trie->charmap[ uvc ]; \
1266 if (widecharmap) { \
1267 SV** const svpp = hv_fetch(widecharmap, \
1268 (char*)&uvc, sizeof(UV), 0); \
1270 charid = (U16)SvIV(*svpp); \
1275 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1279 && (ln == 1 || folder(s, pat_string, ln)) \
1280 && (!reginfo || regtry(reginfo, &s)) ) \
1286 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1288 while (s < strend) { \
1294 #define REXEC_FBC_SCAN(CoDe) \
1296 while (s < strend) { \
1302 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1303 REXEC_FBC_UTF8_SCAN( \
1305 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1314 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1317 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1326 #define REXEC_FBC_TRYIT \
1327 if ((!reginfo || regtry(reginfo, &s))) \
1330 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1331 if (utf8_target) { \
1332 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1335 REXEC_FBC_CLASS_SCAN(CoNd); \
1338 #define DUMP_EXEC_POS(li,s,doutf8) \
1339 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1342 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1343 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1344 tmp = TEST_NON_UTF8(tmp); \
1345 REXEC_FBC_UTF8_SCAN( \
1346 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1355 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1356 if (s == PL_bostr) { \
1360 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1361 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1364 LOAD_UTF8_CHARCLASS_ALNUM(); \
1365 REXEC_FBC_UTF8_SCAN( \
1366 if (tmp == ! (TeSt2_UtF8)) { \
1375 /* The only difference between the BOUND and NBOUND cases is that
1376 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1377 * NBOUND. This is accomplished by passing it in either the if or else clause,
1378 * with the other one being empty */
1379 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1382 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1383 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1385 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1386 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1388 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1389 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1392 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1393 * be passed in completely with the variable name being tested, which isn't
1394 * such a clean interface, but this is easier to read than it was before. We
1395 * are looking for the boundary (or non-boundary between a word and non-word
1396 * character. The utf8 and non-utf8 cases have the same logic, but the details
1397 * must be different. Find the "wordness" of the character just prior to this
1398 * one, and compare it with the wordness of this one. If they differ, we have
1399 * a boundary. At the beginning of the string, pretend that the previous
1400 * character was a new-line */
1401 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1402 if (utf8_target) { \
1405 else { /* Not utf8 */ \
1406 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1407 tmp = TEST_NON_UTF8(tmp); \
1409 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1418 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1421 /* We know what class REx starts with. Try to find this position... */
1422 /* if reginfo is NULL, its a dryrun */
1423 /* annoyingly all the vars in this routine have different names from their counterparts
1424 in regmatch. /grrr */
1427 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1428 const char *strend, regmatch_info *reginfo)
1431 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1432 char *pat_string; /* The pattern's exactish string */
1433 char *pat_end; /* ptr to end char of pat_string */
1434 re_fold_t folder; /* Function for computing non-utf8 folds */
1435 const U8 *fold_array; /* array for folding ords < 256 */
1441 I32 tmp = 1; /* Scratch variable? */
1442 const bool utf8_target = PL_reg_match_utf8;
1443 UV utf8_fold_flags = 0;
1444 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1445 with a result inverts that result, as 0^1 =
1447 _char_class_number classnum;
1449 RXi_GET_DECL(prog,progi);
1451 PERL_ARGS_ASSERT_FIND_BYCLASS;
1453 /* We know what class it must start with. */
1457 REXEC_FBC_UTF8_CLASS_SCAN(
1458 reginclass(prog, c, (U8*)s, utf8_target));
1461 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1466 if (tmp && (!reginfo || regtry(reginfo, &s)))
1474 if (UTF_PATTERN || utf8_target) {
1475 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1476 goto do_exactf_utf8;
1478 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1479 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1480 goto do_exactf_non_utf8; /* isn't dealt with by these */
1485 /* regcomp.c already folded this if pattern is in UTF-8 */
1486 utf8_fold_flags = 0;
1487 goto do_exactf_utf8;
1489 fold_array = PL_fold;
1491 goto do_exactf_non_utf8;
1494 if (UTF_PATTERN || utf8_target) {
1495 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1496 goto do_exactf_utf8;
1498 fold_array = PL_fold_locale;
1499 folder = foldEQ_locale;
1500 goto do_exactf_non_utf8;
1504 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1506 goto do_exactf_utf8;
1508 case EXACTFU_TRICKYFOLD:
1510 if (UTF_PATTERN || utf8_target) {
1511 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1512 goto do_exactf_utf8;
1515 /* Any 'ss' in the pattern should have been replaced by regcomp,
1516 * so we don't have to worry here about this single special case
1517 * in the Latin1 range */
1518 fold_array = PL_fold_latin1;
1519 folder = foldEQ_latin1;
1523 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1524 are no glitches with fold-length differences
1525 between the target string and pattern */
1527 /* The idea in the non-utf8 EXACTF* cases is to first find the
1528 * first character of the EXACTF* node and then, if necessary,
1529 * case-insensitively compare the full text of the node. c1 is the
1530 * first character. c2 is its fold. This logic will not work for
1531 * Unicode semantics and the german sharp ss, which hence should
1532 * not be compiled into a node that gets here. */
1533 pat_string = STRING(c);
1534 ln = STR_LEN(c); /* length to match in octets/bytes */
1536 /* We know that we have to match at least 'ln' bytes (which is the
1537 * same as characters, since not utf8). If we have to match 3
1538 * characters, and there are only 2 availabe, we know without
1539 * trying that it will fail; so don't start a match past the
1540 * required minimum number from the far end */
1541 e = HOP3c(strend, -((I32)ln), s);
1543 if (!reginfo && e < s) {
1544 e = s; /* Due to minlen logic of intuit() */
1548 c2 = fold_array[c1];
1549 if (c1 == c2) { /* If char and fold are the same */
1550 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1553 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561 /* If one of the operands is in utf8, we can't use the simpler folding
1562 * above, due to the fact that many different characters can have the
1563 * same fold, or portion of a fold, or different- length fold */
1564 pat_string = STRING(c);
1565 ln = STR_LEN(c); /* length to match in octets/bytes */
1566 pat_end = pat_string + ln;
1567 lnc = (UTF_PATTERN) /* length to match in characters */
1568 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1571 /* We have 'lnc' characters to match in the pattern, but because of
1572 * multi-character folding, each character in the target can match
1573 * up to 3 characters (Unicode guarantees it will never exceed
1574 * this) if it is utf8-encoded; and up to 2 if not (based on the
1575 * fact that the Latin 1 folds are already determined, and the
1576 * only multi-char fold in that range is the sharp-s folding to
1577 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1578 * string character. Adjust lnc accordingly, rounding up, so that
1579 * if we need to match at least 4+1/3 chars, that really is 5. */
1580 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1581 lnc = (lnc + expansion - 1) / expansion;
1583 /* As in the non-UTF8 case, if we have to match 3 characters, and
1584 * only 2 are left, it's guaranteed to fail, so don't start a
1585 * match that would require us to go beyond the end of the string
1587 e = HOP3c(strend, -((I32)lnc), s);
1589 if (!reginfo && e < s) {
1590 e = s; /* Due to minlen logic of intuit() */
1593 /* XXX Note that we could recalculate e to stop the loop earlier,
1594 * as the worst case expansion above will rarely be met, and as we
1595 * go along we would usually find that e moves further to the left.
1596 * This would happen only after we reached the point in the loop
1597 * where if there were no expansion we should fail. Unclear if
1598 * worth the expense */
1601 char *my_strend= (char *)strend;
1602 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1603 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1604 && (!reginfo || regtry(reginfo, &s)) )
1608 s += (utf8_target) ? UTF8SKIP(s) : 1;
1613 PL_reg_flags |= RF_tainted;
1614 FBC_BOUND(isALNUM_LC,
1615 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1616 isALNUM_LC_utf8((U8*)s));
1619 PL_reg_flags |= RF_tainted;
1620 FBC_NBOUND(isALNUM_LC,
1621 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1622 isALNUM_LC_utf8((U8*)s));
1625 FBC_BOUND(isWORDCHAR,
1627 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1630 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1632 isWORDCHAR_A((U8*)s));
1635 FBC_NBOUND(isWORDCHAR,
1637 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1640 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1642 isWORDCHAR_A((U8*)s));
1645 FBC_BOUND(isWORDCHAR_L1,
1647 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1650 FBC_NBOUND(isWORDCHAR_L1,
1652 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1655 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1656 is_LNBREAK_latin1_safe(s, strend)
1660 /* The argument to all the POSIX node types is the class number to pass to
1661 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1668 PL_reg_flags |= RF_tainted;
1669 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1670 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1685 /* The complement of something that matches only ASCII matches all
1686 * UTF-8 variant code points, plus everything in ASCII that isn't
1688 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1689 || ! _generic_isCC_A(*s, FLAGS(c)));
1698 /* Don't need to worry about utf8, as it can match only a single
1699 * byte invariant character. */
1700 REXEC_FBC_CLASS_SCAN(
1701 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1709 if (! utf8_target) {
1710 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1716 classnum = (_char_class_number) FLAGS(c);
1717 if (classnum < _FIRST_NON_SWASH_CC) {
1718 while (s < strend) {
1720 /* We avoid loading in the swash as long as possible, but
1721 * should we have to, we jump to a separate loop. This
1722 * extra 'if' statement is what keeps this code from being
1723 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1724 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1725 goto found_above_latin1;
1727 if ((UTF8_IS_INVARIANT(*s)
1728 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1730 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1731 && to_complement ^ cBOOL(
1732 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1735 if (tmp && (!reginfo || regtry(reginfo, &s)))
1747 else switch (classnum) { /* These classes are implemented as
1749 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1750 revert the change of \v matching this */
1753 case _CC_ENUM_PSXSPC:
1754 REXEC_FBC_UTF8_CLASS_SCAN(
1755 to_complement ^ cBOOL(isSPACE_utf8(s)));
1758 case _CC_ENUM_BLANK:
1759 REXEC_FBC_UTF8_CLASS_SCAN(
1760 to_complement ^ cBOOL(isBLANK_utf8(s)));
1763 case _CC_ENUM_XDIGIT:
1764 REXEC_FBC_UTF8_CLASS_SCAN(
1765 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1768 case _CC_ENUM_VERTSPACE:
1769 REXEC_FBC_UTF8_CLASS_SCAN(
1770 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1773 case _CC_ENUM_CNTRL:
1774 REXEC_FBC_UTF8_CLASS_SCAN(
1775 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1779 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1780 assert(0); /* NOTREACHED */
1785 found_above_latin1: /* Here we have to load a swash to get the result
1786 for the current code point */
1787 if (! PL_utf8_swash_ptrs[classnum]) {
1788 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1789 PL_utf8_swash_ptrs[classnum] =
1790 _core_swash_init("utf8", swash_property_names[classnum],
1791 &PL_sv_undef, 1, 0, NULL, &flags);
1794 /* This is a copy of the loop above for swash classes, though using the
1795 * FBC macro instead of being expanded out. Since we've loaded the
1796 * swash, we don't have to check for that each time through the loop */
1797 REXEC_FBC_UTF8_CLASS_SCAN(
1798 to_complement ^ cBOOL(_generic_utf8(
1801 swash_fetch(PL_utf8_swash_ptrs[classnum],
1809 /* what trie are we using right now */
1810 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1811 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1812 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1814 const char *last_start = strend - trie->minlen;
1816 const char *real_start = s;
1818 STRLEN maxlen = trie->maxlen;
1820 U8 **points; /* map of where we were in the input string
1821 when reading a given char. For ASCII this
1822 is unnecessary overhead as the relationship
1823 is always 1:1, but for Unicode, especially
1824 case folded Unicode this is not true. */
1825 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1829 GET_RE_DEBUG_FLAGS_DECL;
1831 /* We can't just allocate points here. We need to wrap it in
1832 * an SV so it gets freed properly if there is a croak while
1833 * running the match */
1836 sv_points=newSV(maxlen * sizeof(U8 *));
1837 SvCUR_set(sv_points,
1838 maxlen * sizeof(U8 *));
1839 SvPOK_on(sv_points);
1840 sv_2mortal(sv_points);
1841 points=(U8**)SvPV_nolen(sv_points );
1842 if ( trie_type != trie_utf8_fold
1843 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1846 bitmap=(U8*)trie->bitmap;
1848 bitmap=(U8*)ANYOF_BITMAP(c);
1850 /* this is the Aho-Corasick algorithm modified a touch
1851 to include special handling for long "unknown char" sequences.
1852 The basic idea being that we use AC as long as we are dealing
1853 with a possible matching char, when we encounter an unknown char
1854 (and we have not encountered an accepting state) we scan forward
1855 until we find a legal starting char.
1856 AC matching is basically that of trie matching, except that when
1857 we encounter a failing transition, we fall back to the current
1858 states "fail state", and try the current char again, a process
1859 we repeat until we reach the root state, state 1, or a legal
1860 transition. If we fail on the root state then we can either
1861 terminate if we have reached an accepting state previously, or
1862 restart the entire process from the beginning if we have not.
1865 while (s <= last_start) {
1866 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1874 U8 *uscan = (U8*)NULL;
1875 U8 *leftmost = NULL;
1877 U32 accepted_word= 0;
1881 while ( state && uc <= (U8*)strend ) {
1883 U32 word = aho->states[ state ].wordnum;
1887 DEBUG_TRIE_EXECUTE_r(
1888 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1889 dump_exec_pos( (char *)uc, c, strend, real_start,
1890 (char *)uc, utf8_target );
1891 PerlIO_printf( Perl_debug_log,
1892 " Scanning for legal start char...\n");
1896 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1900 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 if (uc >(U8*)last_start) break;
1910 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1911 if (!leftmost || lpos < leftmost) {
1912 DEBUG_r(accepted_word=word);
1918 points[pointpos++ % maxlen]= uc;
1919 if (foldlen || uc < (U8*)strend) {
1920 REXEC_TRIE_READ_CHAR(trie_type, trie,
1922 uscan, len, uvc, charid, foldlen,
1924 DEBUG_TRIE_EXECUTE_r({
1925 dump_exec_pos( (char *)uc, c, strend,
1926 real_start, s, utf8_target);
1927 PerlIO_printf(Perl_debug_log,
1928 " Charid:%3u CP:%4"UVxf" ",
1940 word = aho->states[ state ].wordnum;
1942 base = aho->states[ state ].trans.base;
1944 DEBUG_TRIE_EXECUTE_r({
1946 dump_exec_pos( (char *)uc, c, strend, real_start,
1948 PerlIO_printf( Perl_debug_log,
1949 "%sState: %4"UVxf", word=%"UVxf,
1950 failed ? " Fail transition to " : "",
1951 (UV)state, (UV)word);
1957 ( ((offset = base + charid
1958 - 1 - trie->uniquecharcount)) >= 0)
1959 && ((U32)offset < trie->lasttrans)
1960 && trie->trans[offset].check == state
1961 && (tmp=trie->trans[offset].next))
1963 DEBUG_TRIE_EXECUTE_r(
1964 PerlIO_printf( Perl_debug_log," - legal\n"));
1969 DEBUG_TRIE_EXECUTE_r(
1970 PerlIO_printf( Perl_debug_log," - fail\n"));
1972 state = aho->fail[state];
1976 /* we must be accepting here */
1977 DEBUG_TRIE_EXECUTE_r(
1978 PerlIO_printf( Perl_debug_log," - accepting\n"));
1987 if (!state) state = 1;
1990 if ( aho->states[ state ].wordnum ) {
1991 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1992 if (!leftmost || lpos < leftmost) {
1993 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1998 s = (char*)leftmost;
1999 DEBUG_TRIE_EXECUTE_r({
2001 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2002 (UV)accepted_word, (IV)(s - real_start)
2005 if (!reginfo || regtry(reginfo, &s)) {
2011 DEBUG_TRIE_EXECUTE_r({
2012 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2015 DEBUG_TRIE_EXECUTE_r(
2016 PerlIO_printf( Perl_debug_log,"No match.\n"));
2025 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2035 - regexec_flags - match a regexp against a string
2038 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2039 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2040 /* stringarg: the point in the string at which to begin matching */
2041 /* strend: pointer to null at end of string */
2042 /* strbeg: real beginning of string */
2043 /* minend: end of match must be >= minend bytes after stringarg. */
2044 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2045 * itself is accessed via the pointers above */
2046 /* data: May be used for some additional optimizations.
2047 Currently its only used, with a U32 cast, for transmitting
2048 the ganch offset when doing a /g match. This will change */
2049 /* nosave: For optimizations. */
2053 struct regexp *const prog = ReANY(rx);
2056 char *startpos = stringarg;
2057 I32 minlen; /* must match at least this many chars */
2058 I32 dontbother = 0; /* how many characters not to try at end */
2059 I32 end_shift = 0; /* Same for the end. */ /* CC */
2060 I32 scream_pos = -1; /* Internal iterator of scream. */
2061 char *scream_olds = NULL;
2062 const bool utf8_target = cBOOL(DO_UTF8(sv));
2064 RXi_GET_DECL(prog,progi);
2065 regmatch_info reginfo; /* create some info to pass to regtry etc */
2066 regexp_paren_pair *swap = NULL;
2067 GET_RE_DEBUG_FLAGS_DECL;
2069 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2070 PERL_UNUSED_ARG(data);
2072 /* Be paranoid... */
2073 if (prog == NULL || startpos == NULL) {
2074 Perl_croak(aTHX_ "NULL regexp parameter");
2078 multiline = prog->extflags & RXf_PMf_MULTILINE;
2079 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2081 RX_MATCH_UTF8_set(rx, utf8_target);
2083 debug_start_match(rx, utf8_target, startpos, strend,
2087 minlen = prog->minlen;
2089 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2091 "String too short [regexec_flags]...\n"));
2096 /* Check validity of program. */
2097 if (UCHARAT(progi->program) != REG_MAGIC) {
2098 Perl_croak(aTHX_ "corrupted regexp program");
2102 PL_reg_state.re_state_eval_setup_done = FALSE;
2106 PL_reg_flags |= RF_utf8;
2108 /* Mark beginning of line for ^ and lookbehind. */
2109 reginfo.bol = startpos; /* XXX not used ??? */
2113 /* Mark end of line for $ (and such) */
2116 /* see how far we have to get to not match where we matched before */
2117 reginfo.till = startpos+minend;
2119 /* If there is a "must appear" string, look for it. */
2122 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2124 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2125 reginfo.ganch = startpos + prog->gofs;
2126 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2127 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2128 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2130 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2131 && mg->mg_len >= 0) {
2132 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2133 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2134 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2136 if (prog->extflags & RXf_ANCH_GPOS) {
2137 if (s > reginfo.ganch)
2139 s = reginfo.ganch - prog->gofs;
2140 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2141 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2147 reginfo.ganch = strbeg + PTR2UV(data);
2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2149 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2151 } else { /* pos() not defined */
2152 reginfo.ganch = strbeg;
2153 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154 "GPOS: reginfo.ganch = strbeg\n"));
2157 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2158 /* We have to be careful. If the previous successful match
2159 was from this regex we don't want a subsequent partially
2160 successful match to clobber the old results.
2161 So when we detect this possibility we add a swap buffer
2162 to the re, and switch the buffer each match. If we fail
2163 we switch it back, otherwise we leave it swapped.
2166 /* do we need a save destructor here for eval dies? */
2167 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2168 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2169 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2175 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2176 re_scream_pos_data d;
2178 d.scream_olds = &scream_olds;
2179 d.scream_pos = &scream_pos;
2180 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2183 goto phooey; /* not present */
2189 /* Simplest case: anchored match need be tried only once. */
2190 /* [unless only anchor is BOL and multiline is set] */
2191 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2192 if (s == startpos && regtry(®info, &startpos))
2194 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2195 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2200 dontbother = minlen - 1;
2201 end = HOP3c(strend, -dontbother, strbeg) - 1;
2202 /* for multiline we only have to try after newlines */
2203 if (prog->check_substr || prog->check_utf8) {
2204 /* because of the goto we can not easily reuse the macros for bifurcating the
2205 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2208 goto after_try_utf8;
2210 if (regtry(®info, &s)) {
2217 if (prog->extflags & RXf_USE_INTUIT) {
2218 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2227 } /* end search for check string in unicode */
2229 if (s == startpos) {
2230 goto after_try_latin;
2233 if (regtry(®info, &s)) {
2240 if (prog->extflags & RXf_USE_INTUIT) {
2241 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2250 } /* end search for check string in latin*/
2251 } /* end search for check string */
2252 else { /* search for newline */
2254 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2257 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2258 while (s <= end) { /* note it could be possible to match at the end of the string */
2259 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2260 if (regtry(®info, &s))
2264 } /* end search for newline */
2265 } /* end anchored/multiline check string search */
2267 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2269 /* the warning about reginfo.ganch being used without initialization
2270 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2271 and we only enter this block when the same bit is set. */
2272 char *tmp_s = reginfo.ganch - prog->gofs;
2274 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2279 /* Messy cases: unanchored match. */
2280 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2281 /* we have /x+whatever/ */
2282 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2288 if (! prog->anchored_utf8) {
2289 to_utf8_substr(prog);
2291 ch = SvPVX_const(prog->anchored_utf8)[0];
2294 DEBUG_EXECUTE_r( did_match = 1 );
2295 if (regtry(®info, &s)) goto got_it;
2297 while (s < strend && *s == ch)
2304 if (! prog->anchored_substr) {
2305 if (! to_byte_substr(prog)) {
2306 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2309 ch = SvPVX_const(prog->anchored_substr)[0];
2312 DEBUG_EXECUTE_r( did_match = 1 );
2313 if (regtry(®info, &s)) goto got_it;
2315 while (s < strend && *s == ch)
2320 DEBUG_EXECUTE_r(if (!did_match)
2321 PerlIO_printf(Perl_debug_log,
2322 "Did not find anchored character...\n")
2325 else if (prog->anchored_substr != NULL
2326 || prog->anchored_utf8 != NULL
2327 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2328 && prog->float_max_offset < strend - s)) {
2333 char *last1; /* Last position checked before */
2337 if (prog->anchored_substr || prog->anchored_utf8) {
2339 if (! prog->anchored_utf8) {
2340 to_utf8_substr(prog);
2342 must = prog->anchored_utf8;
2345 if (! prog->anchored_substr) {
2346 if (! to_byte_substr(prog)) {
2347 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2350 must = prog->anchored_substr;
2352 back_max = back_min = prog->anchored_offset;
2355 if (! prog->float_utf8) {
2356 to_utf8_substr(prog);
2358 must = prog->float_utf8;
2361 if (! prog->float_substr) {
2362 if (! to_byte_substr(prog)) {
2363 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2366 must = prog->float_substr;
2368 back_max = prog->float_max_offset;
2369 back_min = prog->float_min_offset;
2375 last = HOP3c(strend, /* Cannot start after this */
2376 -(I32)(CHR_SVLEN(must)
2377 - (SvTAIL(must) != 0) + back_min), strbeg);
2380 last1 = HOPc(s, -1);
2382 last1 = s - 1; /* bogus */
2384 /* XXXX check_substr already used to find "s", can optimize if
2385 check_substr==must. */
2387 dontbother = end_shift;
2388 strend = HOPc(strend, -dontbother);
2389 while ( (s <= last) &&
2390 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2391 (unsigned char*)strend, must,
2392 multiline ? FBMrf_MULTILINE : 0)) ) {
2393 DEBUG_EXECUTE_r( did_match = 1 );
2394 if (HOPc(s, -back_max) > last1) {
2395 last1 = HOPc(s, -back_min);
2396 s = HOPc(s, -back_max);
2399 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2401 last1 = HOPc(s, -back_min);
2405 while (s <= last1) {
2406 if (regtry(®info, &s))
2409 s++; /* to break out of outer loop */
2416 while (s <= last1) {
2417 if (regtry(®info, &s))
2423 DEBUG_EXECUTE_r(if (!did_match) {
2424 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2425 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2426 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2427 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2428 ? "anchored" : "floating"),
2429 quoted, RE_SV_TAIL(must));
2433 else if ( (c = progi->regstclass) ) {
2435 const OPCODE op = OP(progi->regstclass);
2436 /* don't bother with what can't match */
2437 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2438 strend = HOPc(strend, -(minlen - 1));
2441 SV * const prop = sv_newmortal();
2442 regprop(prog, prop, c);
2444 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2446 PerlIO_printf(Perl_debug_log,
2447 "Matching stclass %.*s against %s (%d bytes)\n",
2448 (int)SvCUR(prop), SvPVX_const(prop),
2449 quoted, (int)(strend - s));
2452 if (find_byclass(prog, c, s, strend, ®info))
2454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2458 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2466 if (! prog->float_utf8) {
2467 to_utf8_substr(prog);
2469 float_real = prog->float_utf8;
2472 if (! prog->float_substr) {
2473 if (! to_byte_substr(prog)) {
2474 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2477 float_real = prog->float_substr;
2480 little = SvPV_const(float_real, len);
2481 if (SvTAIL(float_real)) {
2482 /* This means that float_real contains an artificial \n on
2483 * the end due to the presence of something like this:
2484 * /foo$/ where we can match both "foo" and "foo\n" at the
2485 * end of the string. So we have to compare the end of the
2486 * string first against the float_real without the \n and
2487 * then against the full float_real with the string. We
2488 * have to watch out for cases where the string might be
2489 * smaller than the float_real or the float_real without
2491 char *checkpos= strend - len;
2493 PerlIO_printf(Perl_debug_log,
2494 "%sChecking for float_real.%s\n",
2495 PL_colors[4], PL_colors[5]));
2496 if (checkpos + 1 < strbeg) {
2497 /* can't match, even if we remove the trailing \n
2498 * string is too short to match */
2500 PerlIO_printf(Perl_debug_log,
2501 "%sString shorter than required trailing substring, cannot match.%s\n",
2502 PL_colors[4], PL_colors[5]));
2504 } else if (memEQ(checkpos + 1, little, len - 1)) {
2505 /* can match, the end of the string matches without the
2507 last = checkpos + 1;
2508 } else if (checkpos < strbeg) {
2509 /* cant match, string is too short when the "\n" is
2512 PerlIO_printf(Perl_debug_log,
2513 "%sString does not contain required trailing substring, cannot match.%s\n",
2514 PL_colors[4], PL_colors[5]));
2516 } else if (!multiline) {
2517 /* non multiline match, so compare with the "\n" at the
2518 * end of the string */
2519 if (memEQ(checkpos, little, len)) {
2523 PerlIO_printf(Perl_debug_log,
2524 "%sString does not contain required trailing substring, cannot match.%s\n",
2525 PL_colors[4], PL_colors[5]));
2529 /* multiline match, so we have to search for a place
2530 * where the full string is located */
2536 last = rninstr(s, strend, little, little + len);
2538 last = strend; /* matching "$" */
2541 /* at one point this block contained a comment which was
2542 * probably incorrect, which said that this was a "should not
2543 * happen" case. Even if it was true when it was written I am
2544 * pretty sure it is not anymore, so I have removed the comment
2545 * and replaced it with this one. Yves */
2547 PerlIO_printf(Perl_debug_log,
2548 "String does not contain required substring, cannot match.\n"
2552 dontbother = strend - last + prog->float_min_offset;
2554 if (minlen && (dontbother < minlen))
2555 dontbother = minlen - 1;
2556 strend -= dontbother; /* this one's always in bytes! */
2557 /* We don't know much -- general case. */
2560 if (regtry(®info, &s))
2569 if (regtry(®info, &s))
2571 } while (s++ < strend);
2581 PerlIO_printf(Perl_debug_log,
2582 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2588 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2590 if (PL_reg_state.re_state_eval_setup_done)
2591 restore_pos(aTHX_ prog);
2592 if (RXp_PAREN_NAMES(prog))
2593 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2595 /* make sure $`, $&, $', and $digit will work later */
2596 if ( !(flags & REXEC_NOT_FIRST) ) {
2597 if (flags & REXEC_COPY_STR) {
2601 PerlIO_printf(Perl_debug_log,
2602 "Copy on write: regexp capture, type %d\n",
2605 RX_MATCH_COPY_FREE(rx);
2606 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2607 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2608 assert (SvPOKp(prog->saved_copy));
2609 prog->sublen = PL_regeol - strbeg;
2610 prog->suboffset = 0;
2611 prog->subcoffset = 0;
2616 I32 max = PL_regeol - strbeg;
2619 if ( (flags & REXEC_COPY_SKIP_POST)
2620 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2621 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2622 ) { /* don't copy $' part of string */
2625 /* calculate the right-most part of the string covered
2626 * by a capture. Due to look-ahead, this may be to
2627 * the right of $&, so we have to scan all captures */
2628 while (n <= prog->lastparen) {
2629 if (prog->offs[n].end > max)
2630 max = prog->offs[n].end;
2634 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2635 ? prog->offs[0].start
2637 assert(max >= 0 && max <= PL_regeol - strbeg);
2640 if ( (flags & REXEC_COPY_SKIP_PRE)
2641 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2642 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2643 ) { /* don't copy $` part of string */
2646 /* calculate the left-most part of the string covered
2647 * by a capture. Due to look-behind, this may be to
2648 * the left of $&, so we have to scan all captures */
2649 while (min && n <= prog->lastparen) {
2650 if ( prog->offs[n].start != -1
2651 && prog->offs[n].start < min)
2653 min = prog->offs[n].start;
2657 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2658 && min > prog->offs[0].end
2660 min = prog->offs[0].end;
2664 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2667 if (RX_MATCH_COPIED(rx)) {
2668 if (sublen > prog->sublen)
2670 (char*)saferealloc(prog->subbeg, sublen+1);
2673 prog->subbeg = (char*)safemalloc(sublen+1);
2674 Copy(strbeg + min, prog->subbeg, sublen, char);
2675 prog->subbeg[sublen] = '\0';
2676 prog->suboffset = min;
2677 prog->sublen = sublen;
2678 RX_MATCH_COPIED_on(rx);
2680 prog->subcoffset = prog->suboffset;
2681 if (prog->suboffset && utf8_target) {
2682 /* Convert byte offset to chars.
2683 * XXX ideally should only compute this if @-/@+
2684 * has been seen, a la PL_sawampersand ??? */
2686 /* If there's a direct correspondence between the
2687 * string which we're matching and the original SV,
2688 * then we can use the utf8 len cache associated with
2689 * the SV. In particular, it means that under //g,
2690 * sv_pos_b2u() will use the previously cached
2691 * position to speed up working out the new length of
2692 * subcoffset, rather than counting from the start of
2693 * the string each time. This stops
2694 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2695 * from going quadratic */
2696 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2697 sv_pos_b2u(sv, &(prog->subcoffset));
2699 prog->subcoffset = utf8_length((U8*)strbeg,
2700 (U8*)(strbeg+prog->suboffset));
2704 RX_MATCH_COPY_FREE(rx);
2705 prog->subbeg = strbeg;
2706 prog->suboffset = 0;
2707 prog->subcoffset = 0;
2708 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2716 PL_colors[4], PL_colors[5]));
2717 if (PL_reg_state.re_state_eval_setup_done)
2718 restore_pos(aTHX_ prog);
2720 /* we failed :-( roll it back */
2721 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2722 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2727 Safefree(prog->offs);
2734 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2735 * Do inc before dec, in case old and new rex are the same */
2736 #define SET_reg_curpm(Re2) \
2737 if (PL_reg_state.re_state_eval_setup_done) { \
2738 (void)ReREFCNT_inc(Re2); \
2739 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2740 PM_SETRE((PL_reg_curpm), (Re2)); \
2745 - regtry - try match at specific point
2747 STATIC I32 /* 0 failure, 1 success */
2748 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2752 REGEXP *const rx = reginfo->prog;
2753 regexp *const prog = ReANY(rx);
2755 RXi_GET_DECL(prog,progi);
2756 GET_RE_DEBUG_FLAGS_DECL;
2758 PERL_ARGS_ASSERT_REGTRY;
2760 reginfo->cutpoint=NULL;
2762 if ((prog->extflags & RXf_EVAL_SEEN)
2763 && !PL_reg_state.re_state_eval_setup_done)
2767 PL_reg_state.re_state_eval_setup_done = TRUE;
2769 /* Make $_ available to executed code. */
2770 if (reginfo->sv != DEFSV) {
2772 DEFSV_set(reginfo->sv);
2775 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2776 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2777 /* prepare for quick setting of pos */
2778 #ifdef PERL_OLD_COPY_ON_WRITE
2779 if (SvIsCOW(reginfo->sv))
2780 sv_force_normal_flags(reginfo->sv, 0);
2782 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2783 &PL_vtbl_mglob, NULL, 0);
2787 PL_reg_oldpos = mg->mg_len;
2788 SAVEDESTRUCTOR_X(restore_pos, prog);
2790 if (!PL_reg_curpm) {
2791 Newxz(PL_reg_curpm, 1, PMOP);
2794 SV* const repointer = &PL_sv_undef;
2795 /* this regexp is also owned by the new PL_reg_curpm, which
2796 will try to free it. */
2797 av_push(PL_regex_padav, repointer);
2798 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2799 PL_regex_pad = AvARRAY(PL_regex_padav);
2804 PL_reg_oldcurpm = PL_curpm;
2805 PL_curpm = PL_reg_curpm;
2806 if (RXp_MATCH_COPIED(prog)) {
2807 /* Here is a serious problem: we cannot rewrite subbeg,
2808 since it may be needed if this match fails. Thus
2809 $` inside (?{}) could fail... */
2810 PL_reg_oldsaved = prog->subbeg;
2811 PL_reg_oldsavedlen = prog->sublen;
2812 PL_reg_oldsavedoffset = prog->suboffset;
2813 PL_reg_oldsavedcoffset = prog->suboffset;
2815 PL_nrs = prog->saved_copy;
2817 RXp_MATCH_COPIED_off(prog);
2820 PL_reg_oldsaved = NULL;
2821 prog->subbeg = PL_bostr;
2822 prog->suboffset = 0;
2823 prog->subcoffset = 0;
2824 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2827 PL_reg_starttry = *startposp;
2829 prog->offs[0].start = *startposp - PL_bostr;
2830 prog->lastparen = 0;
2831 prog->lastcloseparen = 0;
2833 /* XXXX What this code is doing here?!!! There should be no need
2834 to do this again and again, prog->lastparen should take care of
2837 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2838 * Actually, the code in regcppop() (which Ilya may be meaning by
2839 * prog->lastparen), is not needed at all by the test suite
2840 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2841 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2842 * Meanwhile, this code *is* needed for the
2843 * above-mentioned test suite tests to succeed. The common theme
2844 * on those tests seems to be returning null fields from matches.
2845 * --jhi updated by dapm */
2847 if (prog->nparens) {
2848 regexp_paren_pair *pp = prog->offs;
2850 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2858 result = regmatch(reginfo, *startposp, progi->program + 1);
2860 prog->offs[0].end = result;
2863 if (reginfo->cutpoint)
2864 *startposp= reginfo->cutpoint;
2865 REGCP_UNWIND(lastcp);
2870 #define sayYES goto yes
2871 #define sayNO goto no
2872 #define sayNO_SILENT goto no_silent
2874 /* we dont use STMT_START/END here because it leads to
2875 "unreachable code" warnings, which are bogus, but distracting. */
2876 #define CACHEsayNO \
2877 if (ST.cache_mask) \
2878 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2881 /* this is used to determine how far from the left messages like
2882 'failed...' are printed. It should be set such that messages
2883 are inline with the regop output that created them.
2885 #define REPORT_CODE_OFF 32
2888 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2889 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2890 #define CHRTEST_NOT_A_CP_1 -999
2891 #define CHRTEST_NOT_A_CP_2 -998
2893 #define SLAB_FIRST(s) (&(s)->states[0])
2894 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2896 /* grab a new slab and return the first slot in it */
2898 STATIC regmatch_state *
2901 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2904 regmatch_slab *s = PL_regmatch_slab->next;
2906 Newx(s, 1, regmatch_slab);
2907 s->prev = PL_regmatch_slab;
2909 PL_regmatch_slab->next = s;
2911 PL_regmatch_slab = s;
2912 return SLAB_FIRST(s);
2916 /* push a new state then goto it */
2918 #define PUSH_STATE_GOTO(state, node, input) \
2919 pushinput = input; \
2921 st->resume_state = state; \
2924 /* push a new state with success backtracking, then goto it */
2926 #define PUSH_YES_STATE_GOTO(state, node, input) \
2927 pushinput = input; \
2929 st->resume_state = state; \
2930 goto push_yes_state;
2937 regmatch() - main matching routine
2939 This is basically one big switch statement in a loop. We execute an op,
2940 set 'next' to point the next op, and continue. If we come to a point which
2941 we may need to backtrack to on failure such as (A|B|C), we push a
2942 backtrack state onto the backtrack stack. On failure, we pop the top
2943 state, and re-enter the loop at the state indicated. If there are no more
2944 states to pop, we return failure.
2946 Sometimes we also need to backtrack on success; for example /A+/, where
2947 after successfully matching one A, we need to go back and try to
2948 match another one; similarly for lookahead assertions: if the assertion
2949 completes successfully, we backtrack to the state just before the assertion
2950 and then carry on. In these cases, the pushed state is marked as
2951 'backtrack on success too'. This marking is in fact done by a chain of
2952 pointers, each pointing to the previous 'yes' state. On success, we pop to
2953 the nearest yes state, discarding any intermediate failure-only states.
2954 Sometimes a yes state is pushed just to force some cleanup code to be
2955 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2956 it to free the inner regex.
2958 Note that failure backtracking rewinds the cursor position, while
2959 success backtracking leaves it alone.
2961 A pattern is complete when the END op is executed, while a subpattern
2962 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2963 ops trigger the "pop to last yes state if any, otherwise return true"
2966 A common convention in this function is to use A and B to refer to the two
2967 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2968 the subpattern to be matched possibly multiple times, while B is the entire
2969 rest of the pattern. Variable and state names reflect this convention.
2971 The states in the main switch are the union of ops and failure/success of
2972 substates associated with with that op. For example, IFMATCH is the op
2973 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2974 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2975 successfully matched A and IFMATCH_A_fail is a state saying that we have
2976 just failed to match A. Resume states always come in pairs. The backtrack
2977 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2978 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2979 on success or failure.
2981 The struct that holds a backtracking state is actually a big union, with
2982 one variant for each major type of op. The variable st points to the
2983 top-most backtrack struct. To make the code clearer, within each
2984 block of code we #define ST to alias the relevant union.
2986 Here's a concrete example of a (vastly oversimplified) IFMATCH
2992 #define ST st->u.ifmatch
2994 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2995 ST.foo = ...; // some state we wish to save
2997 // push a yes backtrack state with a resume value of
2998 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3000 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3003 case IFMATCH_A: // we have successfully executed A; now continue with B
3005 bar = ST.foo; // do something with the preserved value
3008 case IFMATCH_A_fail: // A failed, so the assertion failed
3009 ...; // do some housekeeping, then ...
3010 sayNO; // propagate the failure
3017 For any old-timers reading this who are familiar with the old recursive
3018 approach, the code above is equivalent to:
3020 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3029 ...; // do some housekeeping, then ...
3030 sayNO; // propagate the failure
3033 The topmost backtrack state, pointed to by st, is usually free. If you
3034 want to claim it, populate any ST.foo fields in it with values you wish to
3035 save, then do one of
3037 PUSH_STATE_GOTO(resume_state, node, newinput);
3038 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3040 which sets that backtrack state's resume value to 'resume_state', pushes a
3041 new free entry to the top of the backtrack stack, then goes to 'node'.
3042 On backtracking, the free slot is popped, and the saved state becomes the
3043 new free state. An ST.foo field in this new top state can be temporarily
3044 accessed to retrieve values, but once the main loop is re-entered, it
3045 becomes available for reuse.
3047 Note that the depth of the backtrack stack constantly increases during the
3048 left-to-right execution of the pattern, rather than going up and down with
3049 the pattern nesting. For example the stack is at its maximum at Z at the
3050 end of the pattern, rather than at X in the following:
3052 /(((X)+)+)+....(Y)+....Z/
3054 The only exceptions to this are lookahead/behind assertions and the cut,
3055 (?>A), which pop all the backtrack states associated with A before
3058 Backtrack state structs are allocated in slabs of about 4K in size.
3059 PL_regmatch_state and st always point to the currently active state,
3060 and PL_regmatch_slab points to the slab currently containing
3061 PL_regmatch_state. The first time regmatch() is called, the first slab is
3062 allocated, and is never freed until interpreter destruction. When the slab
3063 is full, a new one is allocated and chained to the end. At exit from
3064 regmatch(), slabs allocated since entry are freed.
3069 #define DEBUG_STATE_pp(pp) \
3071 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3072 PerlIO_printf(Perl_debug_log, \
3073 " %*s"pp" %s%s%s%s%s\n", \
3075 PL_reg_name[st->resume_state], \
3076 ((st==yes_state||st==mark_state) ? "[" : ""), \
3077 ((st==yes_state) ? "Y" : ""), \
3078 ((st==mark_state) ? "M" : ""), \
3079 ((st==yes_state||st==mark_state) ? "]" : "") \
3084 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3089 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3090 const char *start, const char *end, const char *blurb)
3092 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3094 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3099 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3100 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3102 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3103 start, end - start, 60);
3105 PerlIO_printf(Perl_debug_log,
3106 "%s%s REx%s %s against %s\n",
3107 PL_colors[4], blurb, PL_colors[5], s0, s1);
3109 if (utf8_target||utf8_pat)
3110 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3111 utf8_pat ? "pattern" : "",
3112 utf8_pat && utf8_target ? " and " : "",
3113 utf8_target ? "string" : ""
3119 S_dump_exec_pos(pTHX_ const char *locinput,
3120 const regnode *scan,
3121 const char *loc_regeol,
3122 const char *loc_bostr,
3123 const char *loc_reg_starttry,
3124 const bool utf8_target)
3126 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3127 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3128 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3129 /* The part of the string before starttry has one color
3130 (pref0_len chars), between starttry and current
3131 position another one (pref_len - pref0_len chars),
3132 after the current position the third one.
3133 We assume that pref0_len <= pref_len, otherwise we
3134 decrease pref0_len. */
3135 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3136 ? (5 + taill) - l : locinput - loc_bostr;
3139 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3141 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3143 pref0_len = pref_len - (locinput - loc_reg_starttry);
3144 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3145 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3146 ? (5 + taill) - pref_len : loc_regeol - locinput);
3147 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3151 if (pref0_len > pref_len)
3152 pref0_len = pref_len;
3154 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3156 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3157 (locinput - pref_len),pref0_len, 60, 4, 5);
3159 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3160 (locinput - pref_len + pref0_len),
3161 pref_len - pref0_len, 60, 2, 3);
3163 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3164 locinput, loc_regeol - locinput, 10, 0, 1);
3166 const STRLEN tlen=len0+len1+len2;
3167 PerlIO_printf(Perl_debug_log,
3168 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3169 (IV)(locinput - loc_bostr),
3172 (docolor ? "" : "> <"),
3174 (int)(tlen > 19 ? 0 : 19 - tlen),
3181 /* reg_check_named_buff_matched()
3182 * Checks to see if a named buffer has matched. The data array of
3183 * buffer numbers corresponding to the buffer is expected to reside
3184 * in the regexp->data->data array in the slot stored in the ARG() of
3185 * node involved. Note that this routine doesn't actually care about the
3186 * name, that information is not preserved from compilation to execution.
3187 * Returns the index of the leftmost defined buffer with the given name
3188 * or 0 if non of the buffers matched.
3191 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3194 RXi_GET_DECL(rex,rexi);
3195 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3196 I32 *nums=(I32*)SvPVX(sv_dat);
3198 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3200 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3201 if ((I32)rex->lastparen >= nums[n] &&
3202 rex->offs[nums[n]].end != -1)
3211 /* free all slabs above current one - called during LEAVE_SCOPE */
3214 S_clear_backtrack_stack(pTHX_ void *p)
3216 regmatch_slab *s = PL_regmatch_slab->next;
3221 PL_regmatch_slab->next = NULL;
3223 regmatch_slab * const osl = s;
3229 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3231 /* This function determines if there are one or two characters that match
3232 * the first character of the passed-in EXACTish node <text_node>, and if
3233 * so, returns them in the passed-in pointers.
3235 * If it determines that no possible character in the target string can
3236 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3237 * the first character in <text_node> requires UTF-8 to represent, and the
3238 * target string isn't in UTF-8.)
3240 * If there are more than two characters that could match the beginning of
3241 * <text_node>, or if more context is required to determine a match or not,
3242 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3244 * The motiviation behind this function is to allow the caller to set up
3245 * tight loops for matching. If <text_node> is of type EXACT, there is
3246 * only one possible character that can match its first character, and so
3247 * the situation is quite simple. But things get much more complicated if
3248 * folding is involved. It may be that the first character of an EXACTFish
3249 * node doesn't participate in any possible fold, e.g., punctuation, so it
3250 * can be matched only by itself. The vast majority of characters that are
3251 * in folds match just two things, their lower and upper-case equivalents.
3252 * But not all are like that; some have multiple possible matches, or match
3253 * sequences of more than one character. This function sorts all that out.
3255 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3256 * loop of trying to match A*, we know we can't exit where the thing
3257 * following it isn't a B. And something can't be a B unless it is the
3258 * beginning of B. By putting a quick test for that beginning in a tight
3259 * loop, we can rule out things that can't possibly be B without having to
3260 * break out of the loop, thus avoiding work. Similarly, if A is a single
3261 * character, we can make a tight loop matching A*, using the outputs of
3264 * If the target string to match isn't in UTF-8, and there aren't
3265 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3266 * the one or two possible octets (which are characters in this situation)
3267 * that can match. In all cases, if there is only one character that can
3268 * match, *<c1p> and *<c2p> will be identical.
3270 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3271 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3272 * can match the beginning of <text_node>. They should be declared with at
3273 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3274 * undefined what these contain.) If one or both of the buffers are
3275 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3276 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3277 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3278 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3279 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3281 const bool utf8_target = PL_reg_match_utf8;
3283 UV c1 = CHRTEST_NOT_A_CP_1;
3284 UV c2 = CHRTEST_NOT_A_CP_2;
3285 bool use_chrtest_void = FALSE;
3287 /* Used when we have both utf8 input and utf8 output, to avoid converting
3288 * to/from code points */
3289 bool utf8_has_been_setup = FALSE;
3293 U8 *pat = (U8*)STRING(text_node);
3295 if (OP(text_node) == EXACT) {
3297 /* In an exact node, only one thing can be matched, that first
3298 * character. If both the pat and the target are UTF-8, we can just
3299 * copy the input to the output, avoiding finding the code point of
3301 if (! UTF_PATTERN) {
3304 else if (utf8_target) {
3305 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3306 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3307 utf8_has_been_setup = TRUE;
3310 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3313 else /* an EXACTFish node */
3315 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3316 pat + STR_LEN(text_node)))
3318 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3319 pat + STR_LEN(text_node))))
3321 /* Multi-character folds require more context to sort out. Also
3322 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3323 * handled outside this routine */
3324 use_chrtest_void = TRUE;
3326 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3327 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3329 /* Load the folds hash, if not already done */
3331 if (! PL_utf8_foldclosures) {
3332 if (! PL_utf8_tofold) {
3333 U8 dummy[UTF8_MAXBYTES+1];
3335 /* Force loading this by folding an above-Latin1 char */
3336 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3337 assert(PL_utf8_tofold); /* Verify that worked */
3339 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3342 /* The fold closures data structure is a hash with the keys being
3343 * the UTF-8 of every character that is folded to, like 'k', and
3344 * the values each an array of all code points that fold to its
3345 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3347 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3352 /* Not found in the hash, therefore there are no folds
3353 * containing it, so there is only a single character that
3357 else { /* Does participate in folds */
3358 AV* list = (AV*) *listp;
3359 if (av_len(list) != 1) {
3361 /* If there aren't exactly two folds to this, it is outside
3362 * the scope of this function */
3363 use_chrtest_void = TRUE;
3365 else { /* There are two. Get them */
3366 SV** c_p = av_fetch(list, 0, FALSE);
3368 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3372 c_p = av_fetch(list, 1, FALSE);
3374 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3378 /* Folds that cross the 255/256 boundary are forbidden if
3379 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3380 * pattern character is above 256, and its only other match
3381 * is below 256, the only legal match will be to itself.
3382 * We have thrown away the original, so have to compute
3383 * which is the one above 255 */
3384 if ((c1 < 256) != (c2 < 256)) {
3385 if (OP(text_node) == EXACTFL
3386 || (OP(text_node) == EXACTFA
3387 && (isASCII(c1) || isASCII(c2))))
3400 else /* Here, c1 is < 255 */
3402 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3403 && OP(text_node) != EXACTFL
3404 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3406 /* Here, there could be something above Latin1 in the target which
3407 * folds to this character in the pattern. All such cases except
3408 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3409 * involved in their folds, so are outside the scope of this
3411 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3412 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3415 use_chrtest_void = TRUE;
3418 else { /* Here nothing above Latin1 can fold to the pattern character */
3419 switch (OP(text_node)) {
3421 case EXACTFL: /* /l rules */
3422 c2 = PL_fold_locale[c1];
3426 if (! utf8_target) { /* /d rules */
3431 /* /u rules for all these. This happens to work for
3432 * EXACTFA as nothing in Latin1 folds to ASCII */
3434 case EXACTFU_TRICKYFOLD:
3437 c2 = PL_fold_latin1[c1];
3441 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3442 assert(0); /* NOTREACHED */
3447 /* Here have figured things out. Set up the returns */
3448 if (use_chrtest_void) {
3449 *c2p = *c1p = CHRTEST_VOID;
3451 else if (utf8_target) {
3452 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3453 uvchr_to_utf8(c1_utf8, c1);
3454 uvchr_to_utf8(c2_utf8, c2);
3457 /* Invariants are stored in both the utf8 and byte outputs; Use
3458 * negative numbers otherwise for the byte ones. Make sure that the
3459 * byte ones are the same iff the utf8 ones are the same */
3460 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3461 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3464 ? CHRTEST_NOT_A_CP_1
3465 : CHRTEST_NOT_A_CP_2;
3467 else if (c1 > 255) {
3468 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3473 *c1p = *c2p = c2; /* c2 is the only representable value */
3475 else { /* c1 is representable; see about c2 */
3477 *c2p = (c2 < 256) ? c2 : c1;
3483 /* returns -1 on failure, $+[0] on success */
3485 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3487 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3491 const bool utf8_target = PL_reg_match_utf8;
3492 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3493 REGEXP *rex_sv = reginfo->prog;
3494 regexp *rex = ReANY(rex_sv);
3495 RXi_GET_DECL(rex,rexi);
3497 /* the current state. This is a cached copy of PL_regmatch_state */
3499 /* cache heavy used fields of st in registers */
3502 U32 n = 0; /* general value; init to avoid compiler warning */
3503 I32 ln = 0; /* len or last; init to avoid compiler warning */
3504 char *locinput = startpos;
3505 char *pushinput; /* where to continue after a PUSH */
3506 I32 nextchr; /* is always set to UCHARAT(locinput) */
3508 bool result = 0; /* return value of S_regmatch */
3509 int depth = 0; /* depth of backtrack stack */
3510 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3511 const U32 max_nochange_depth =
3512 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3513 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3514 regmatch_state *yes_state = NULL; /* state to pop to on success of
3516 /* mark_state piggy backs on the yes_state logic so that when we unwind
3517 the stack on success we can update the mark_state as we go */
3518 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3519 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3520 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3522 bool no_final = 0; /* prevent failure from backtracking? */
3523 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3524 char *startpoint = locinput;
3525 SV *popmark = NULL; /* are we looking for a mark? */
3526 SV *sv_commit = NULL; /* last mark name seen in failure */
3527 SV *sv_yes_mark = NULL; /* last mark name we have seen
3528 during a successful match */
3529 U32 lastopen = 0; /* last open we saw */
3530 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3531 SV* const oreplsv = GvSV(PL_replgv);
3532 /* these three flags are set by various ops to signal information to
3533 * the very next op. They have a useful lifetime of exactly one loop
3534 * iteration, and are not preserved or restored by state pushes/pops
3536 bool sw = 0; /* the condition value in (?(cond)a|b) */
3537 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3538 int logical = 0; /* the following EVAL is:
3542 or the following IFMATCH/UNLESSM is:
3543 false: plain (?=foo)
3544 true: used as a condition: (?(?=foo))
3546 PAD* last_pad = NULL;
3548 I32 gimme = G_SCALAR;
3549 CV *caller_cv = NULL; /* who called us */
3550 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3551 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3552 U32 maxopenparen = 0; /* max '(' index seen so far */
3553 int to_complement; /* Invert the result? */
3554 _char_class_number classnum;
3557 GET_RE_DEBUG_FLAGS_DECL;
3560 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3561 multicall_oldcatch = 0;
3562 multicall_cv = NULL;
3564 PERL_UNUSED_VAR(multicall_cop);
3565 PERL_UNUSED_VAR(newsp);
3568 PERL_ARGS_ASSERT_REGMATCH;
3570 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3571 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3573 /* on first ever call to regmatch, allocate first slab */
3574 if (!PL_regmatch_slab) {
3575 Newx(PL_regmatch_slab, 1, regmatch_slab);
3576 PL_regmatch_slab->prev = NULL;
3577 PL_regmatch_slab->next = NULL;
3578 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3581 oldsave = PL_savestack_ix;
3582 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3583 SAVEVPTR(PL_regmatch_slab);
3584 SAVEVPTR(PL_regmatch_state);
3586 /* grab next free state slot */
3587 st = ++PL_regmatch_state;
3588 if (st > SLAB_LAST(PL_regmatch_slab))
3589 st = PL_regmatch_state = S_push_slab(aTHX);
3591 /* Note that nextchr is a byte even in UTF */
3594 while (scan != NULL) {
3597 SV * const prop = sv_newmortal();
3598 regnode *rnext=regnext(scan);
3599 DUMP_EXEC_POS( locinput, scan, utf8_target );
3600 regprop(rex, prop, scan);
3602 PerlIO_printf(Perl_debug_log,
3603 "%3"IVdf":%*s%s(%"IVdf")\n",
3604 (IV)(scan - rexi->program), depth*2, "",
3606 (PL_regkind[OP(scan)] == END || !rnext) ?
3607 0 : (IV)(rnext - rexi->program));
3610 next = scan + NEXT_OFF(scan);
3613 state_num = OP(scan);
3619 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3621 switch (state_num) {
3622 case BOL: /* /^../ */
3623 if (locinput == PL_bostr)
3625 /* reginfo->till = reginfo->bol; */
3630 case MBOL: /* /^../m */
3631 if (locinput == PL_bostr ||
3632 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3638 case SBOL: /* /^../s */
3639 if (locinput == PL_bostr)
3644 if (locinput == reginfo->ganch)
3648 case KEEPS: /* \K */
3649 /* update the startpoint */
3650 st->u.keeper.val = rex->offs[0].start;
3651 rex->offs[0].start = locinput - PL_bostr;
3652 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3653 assert(0); /*NOTREACHED*/
3654 case KEEPS_next_fail:
3655 /* rollback the start point change */
3656 rex->offs[0].start = st->u.keeper.val;
3658 assert(0); /*NOTREACHED*/
3660 case EOL: /* /..$/ */
3663 case MEOL: /* /..$/m */
3664 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3668 case SEOL: /* /..$/s */
3670 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3672 if (PL_regeol - locinput > 1)
3677 if (!NEXTCHR_IS_EOS)
3681 case SANY: /* /./s */
3684 goto increment_locinput;
3692 case REG_ANY: /* /./ */
3693 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3695 goto increment_locinput;
3699 #define ST st->u.trie
3700 case TRIEC: /* (ab|cd) with known charclass */
3701 /* In this case the charclass data is available inline so
3702 we can fail fast without a lot of extra overhead.
3704 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3706 PerlIO_printf(Perl_debug_log,
3707 "%*s %sfailed to match trie start class...%s\n",
3708 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3711 assert(0); /* NOTREACHED */
3714 case TRIE: /* (ab|cd) */
3715 /* the basic plan of execution of the trie is:
3716 * At the beginning, run though all the states, and
3717 * find the longest-matching word. Also remember the position
3718 * of the shortest matching word. For example, this pattern:
3721 * when matched against the string "abcde", will generate
3722 * accept states for all words except 3, with the longest
3723 * matching word being 4, and the shortest being 2 (with
3724 * the position being after char 1 of the string).
3726 * Then for each matching word, in word order (i.e. 1,2,4,5),
3727 * we run the remainder of the pattern; on each try setting
3728 * the current position to the character following the word,
3729 * returning to try the next word on failure.
3731 * We avoid having to build a list of words at runtime by
3732 * using a compile-time structure, wordinfo[].prev, which
3733 * gives, for each word, the previous accepting word (if any).
3734 * In the case above it would contain the mappings 1->2, 2->0,
3735 * 3->0, 4->5, 5->1. We can use this table to generate, from
3736 * the longest word (4 above), a list of all words, by
3737 * following the list of prev pointers; this gives us the
3738 * unordered list 4,5,1,2. Then given the current word we have
3739 * just tried, we can go through the list and find the
3740 * next-biggest word to try (so if we just failed on word 2,
3741 * the next in the list is 4).
3743 * Since at runtime we don't record the matching position in
3744 * the string for each word, we have to work that out for
3745 * each word we're about to process. The wordinfo table holds
3746 * the character length of each word; given that we recorded
3747 * at the start: the position of the shortest word and its
3748 * length in chars, we just need to move the pointer the
3749 * difference between the two char lengths. Depending on
3750 * Unicode status and folding, that's cheap or expensive.
3752 * This algorithm is optimised for the case where are only a
3753 * small number of accept states, i.e. 0,1, or maybe 2.
3754 * With lots of accepts states, and having to try all of them,
3755 * it becomes quadratic on number of accept states to find all
3760 /* what type of TRIE am I? (utf8 makes this contextual) */
3761 DECL_TRIE_TYPE(scan);
3763 /* what trie are we using right now */
3764 reg_trie_data * const trie
3765 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3766 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3767 U32 state = trie->startstate;
3770 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3772 if (trie->states[ state ].wordnum) {
3774 PerlIO_printf(Perl_debug_log,
3775 "%*s %smatched empty string...%s\n",
3776 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3782 PerlIO_printf(Perl_debug_log,
3783 "%*s %sfailed to match trie start class...%s\n",
3784 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3791 U8 *uc = ( U8* )locinput;
3795 U8 *uscan = (U8*)NULL;
3796 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3797 U32 charcount = 0; /* how many input chars we have matched */
3798 U32 accepted = 0; /* have we seen any accepting states? */
3800 ST.jump = trie->jump;
3803 ST.longfold = FALSE; /* char longer if folded => it's harder */
3806 /* fully traverse the TRIE; note the position of the
3807 shortest accept state and the wordnum of the longest
3810 while ( state && uc <= (U8*)PL_regeol ) {
3811 U32 base = trie->states[ state ].trans.base;
3815 wordnum = trie->states[ state ].wordnum;
3817 if (wordnum) { /* it's an accept state */
3820 /* record first match position */
3822 ST.firstpos = (U8*)locinput;
3827 ST.firstchars = charcount;
3830 if (!ST.nextword || wordnum < ST.nextword)
3831 ST.nextword = wordnum;
3832 ST.topword = wordnum;
3835 DEBUG_TRIE_EXECUTE_r({
3836 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3837 PerlIO_printf( Perl_debug_log,
3838 "%*s %sState: %4"UVxf" Accepted: %c ",
3839 2+depth * 2, "", PL_colors[4],
3840 (UV)state, (accepted ? 'Y' : 'N'));
3843 /* read a char and goto next state */
3844 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3846 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3847 uscan, len, uvc, charid, foldlen,
3854 base + charid - 1 - trie->uniquecharcount)) >= 0)
3856 && ((U32)offset < trie->lasttrans)
3857 && trie->trans[offset].check == state)
3859 state = trie->trans[offset].next;
3870 DEBUG_TRIE_EXECUTE_r(
3871 PerlIO_printf( Perl_debug_log,
3872 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3873 charid, uvc, (UV)state, PL_colors[5] );
3879 /* calculate total number of accept states */
3884 w = trie->wordinfo[w].prev;
3887 ST.accepted = accepted;
3891 PerlIO_printf( Perl_debug_log,
3892 "%*s %sgot %"IVdf" possible matches%s\n",
3893 REPORT_CODE_OFF + depth * 2, "",
3894 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3896 goto trie_first_try; /* jump into the fail handler */
3898 assert(0); /* NOTREACHED */
3900 case TRIE_next_fail: /* we failed - try next alternative */
3904 REGCP_UNWIND(ST.cp);
3905 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3907 if (!--ST.accepted) {
3909 PerlIO_printf( Perl_debug_log,
3910 "%*s %sTRIE failed...%s\n",
3911 REPORT_CODE_OFF+depth*2, "",
3918 /* Find next-highest word to process. Note that this code
3919 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3922 U16 const nextword = ST.nextword;
3923 reg_trie_wordinfo * const wordinfo
3924 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3925 for (word=ST.topword; word; word=wordinfo[word].prev) {
3926 if (word > nextword && (!min || word < min))
3939 ST.lastparen = rex->lastparen;
3940 ST.lastcloseparen = rex->lastcloseparen;