5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
102 /* Valid for non-utf8 strings: avoids the reginclass
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
105 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106 : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
115 #define HOPc(pos,off) \
116 (char *)(reginfo->is_utf8_target \
117 ? reghop3((U8*)pos, off, \
118 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
120 #define HOPBACKc(pos, off) \
121 (char*)(reginfo->is_utf8_target \
122 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123 : (pos - off >= reginfo->strbeg) \
127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132 #define NEXTCHR_IS_EOS (nextchr < 0)
134 #define SET_nextchr \
135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
137 #define SET_locinput(p) \
142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
151 /* If in debug mode, we test that a known character properly matches */
153 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
159 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
170 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
177 COMBINING_GRAVE_ACCENT_UTF8); \
180 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
183 /* for use after a quantifier and before an EXACT-like node -- japhy */
184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
193 #define JUMPABLE(rn) ( \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
214 /* ... so we use this as its faster. */
215 #define IS_TEXT(rn) ( OP(rn)==EXACT )
216 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
217 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
226 #define FIND_NEXT_IMPT(rn) STMT_START { \
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
230 rn = NEXTOPER(NEXTOPER(rn)); \
231 else if (type == PLUS) \
233 else if (type == IFMATCH) \
234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235 else rn += NEXT_OFF(rn); \
239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245 #define SBASE 0xAC00 /* Start of block */
246 #define SCount 11172 /* Length of block */
249 #define SLAB_FIRST(s) (&(s)->states[0])
250 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
254 static regmatch_state * S_push_slab(pTHX);
256 #define REGCP_PAREN_ELEMS 3
257 #define REGCP_OTHER_ELEMS 3
258 #define REGCP_FRAME_ELEMS 1
259 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
260 * are needed for the regexp context stack bookkeeping. */
263 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
266 const int retval = PL_savestack_ix;
267 const int paren_elems_to_push =
268 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
269 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
270 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
272 GET_RE_DEBUG_FLAGS_DECL;
274 PERL_ARGS_ASSERT_REGCPPUSH;
276 if (paren_elems_to_push < 0)
277 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
278 paren_elems_to_push);
280 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
281 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
282 " out of range (%lu-%ld)",
284 (unsigned long)maxopenparen,
287 SSGROW(total_elems + REGCP_FRAME_ELEMS);
290 if ((int)maxopenparen > (int)parenfloor)
291 PerlIO_printf(Perl_debug_log,
292 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
297 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
298 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
299 SSPUSHINT(rex->offs[p].end);
300 SSPUSHINT(rex->offs[p].start);
301 SSPUSHINT(rex->offs[p].start_tmp);
302 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
303 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
305 (IV)rex->offs[p].start,
306 (IV)rex->offs[p].start_tmp,
310 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
311 SSPUSHINT(maxopenparen);
312 SSPUSHINT(rex->lastparen);
313 SSPUSHINT(rex->lastcloseparen);
314 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
319 /* These are needed since we do not localize EVAL nodes: */
320 #define REGCP_SET(cp) \
322 PerlIO_printf(Perl_debug_log, \
323 " Setting an EVAL scope, savestack=%"IVdf"\n", \
324 (IV)PL_savestack_ix)); \
327 #define REGCP_UNWIND(cp) \
329 if (cp != PL_savestack_ix) \
330 PerlIO_printf(Perl_debug_log, \
331 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
332 (IV)(cp), (IV)PL_savestack_ix)); \
335 #define UNWIND_PAREN(lp, lcp) \
336 for (n = rex->lastparen; n > lp; n--) \
337 rex->offs[n].end = -1; \
338 rex->lastparen = n; \
339 rex->lastcloseparen = lcp;
343 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
348 GET_RE_DEBUG_FLAGS_DECL;
350 PERL_ARGS_ASSERT_REGCPPOP;
352 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
354 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
355 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
356 rex->lastcloseparen = SSPOPINT;
357 rex->lastparen = SSPOPINT;
358 *maxopenparen_p = SSPOPINT;
360 i -= REGCP_OTHER_ELEMS;
361 /* Now restore the parentheses context. */
363 if (i || rex->lastparen + 1 <= rex->nparens)
364 PerlIO_printf(Perl_debug_log,
365 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
370 paren = *maxopenparen_p;
371 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
373 rex->offs[paren].start_tmp = SSPOPINT;
374 rex->offs[paren].start = SSPOPINT;
376 if (paren <= rex->lastparen)
377 rex->offs[paren].end = tmps;
378 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
379 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
403 " \\%"UVuf": %s ..-1 undeffing\n",
405 (i > *maxopenparen_p) ? "-1" : " "
411 /* restore the parens and associated vars at savestack position ix,
412 * but without popping the stack */
415 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
417 I32 tmpix = PL_savestack_ix;
418 PL_savestack_ix = ix;
419 regcppop(rex, maxopenparen_p);
420 PL_savestack_ix = tmpix;
423 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
426 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
428 /* Returns a boolean as to whether or not 'character' is a member of the
429 * Posix character class given by 'classnum' that should be equivalent to a
430 * value in the typedef '_char_class_number'.
432 * Ideally this could be replaced by a just an array of function pointers
433 * to the C library functions that implement the macros this calls.
434 * However, to compile, the precise function signatures are required, and
435 * these may vary from platform to to platform. To avoid having to figure
436 * out what those all are on each platform, I (khw) am using this method,
437 * which adds an extra layer of function call overhead (unless the C
438 * optimizer strips it away). But we don't particularly care about
439 * performance with locales anyway. */
441 switch ((_char_class_number) classnum) {
442 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
443 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
444 case _CC_ENUM_ASCII: return isASCII_LC(character);
445 case _CC_ENUM_BLANK: return isBLANK_LC(character);
446 case _CC_ENUM_CASED: return isLOWER_LC(character)
447 || isUPPER_LC(character);
448 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
449 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
450 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
451 case _CC_ENUM_LOWER: return isLOWER_LC(character);
452 case _CC_ENUM_PRINT: return isPRINT_LC(character);
453 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
454 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
455 case _CC_ENUM_SPACE: return isSPACE_LC(character);
456 case _CC_ENUM_UPPER: return isUPPER_LC(character);
457 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
458 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
459 default: /* VERTSPACE should never occur in locales */
460 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
463 assert(0); /* NOTREACHED */
468 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
470 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
471 * 'character' is a member of the Posix character class given by 'classnum'
472 * that should be equivalent to a value in the typedef
473 * '_char_class_number'.
475 * This just calls isFOO_lc on the code point for the character if it is in
476 * the range 0-255. Outside that range, all characters avoid Unicode
477 * rules, ignoring any locale. So use the Unicode function if this class
478 * requires a swash, and use the Unicode macro otherwise. */
480 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
482 if (UTF8_IS_INVARIANT(*character)) {
483 return isFOO_lc(classnum, *character);
485 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
486 return isFOO_lc(classnum,
487 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
490 if (classnum < _FIRST_NON_SWASH_CC) {
492 /* Initialize the swash unless done already */
493 if (! PL_utf8_swash_ptrs[classnum]) {
494 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
495 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
496 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
499 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
501 TRUE /* is UTF */ ));
504 switch ((_char_class_number) classnum) {
506 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
508 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
509 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
510 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
511 default: return 0; /* Things like CNTRL are always
515 assert(0); /* NOTREACHED */
520 * pregexec and friends
523 #ifndef PERL_IN_XSUB_RE
525 - pregexec - match a regexp against a string
528 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
529 char *strbeg, I32 minend, SV *screamer, U32 nosave)
530 /* stringarg: the point in the string at which to begin matching */
531 /* strend: pointer to null at end of string */
532 /* strbeg: real beginning of string */
533 /* minend: end of match must be >= minend bytes after stringarg. */
534 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
535 * itself is accessed via the pointers above */
536 /* nosave: For optimizations. */
538 PERL_ARGS_ASSERT_PREGEXEC;
541 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
542 nosave ? 0 : REXEC_COPY_STR);
547 * Need to implement the following flags for reg_anch:
549 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
551 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
552 * INTUIT_AUTORITATIVE_ML
553 * INTUIT_ONCE_NOML - Intuit can match in one location only.
556 * Another flag for this function: SECOND_TIME (so that float substrs
557 * with giant delta may be not rechecked).
560 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
562 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
563 Otherwise, only SvCUR(sv) is used to get strbeg. */
565 /* XXXX We assume that strpos is strbeg unless sv. */
567 /* XXXX Some places assume that there is a fixed substring.
568 An update may be needed if optimizer marks as "INTUITable"
569 RExen without fixed substrings. Similarly, it is assumed that
570 lengths of all the strings are no more than minlen, thus they
571 cannot come from lookahead.
572 (Or minlen should take into account lookahead.)
573 NOTE: Some of this comment is not correct. minlen does now take account
574 of lookahead/behind. Further research is required. -- demerphq
578 /* A failure to find a constant substring means that there is no need to make
579 an expensive call to REx engine, thus we celebrate a failure. Similarly,
580 finding a substring too deep into the string means that fewer calls to
581 regtry() should be needed.
583 REx compiler's optimizer found 4 possible hints:
584 a) Anchored substring;
586 c) Whether we are anchored (beginning-of-line or \G);
587 d) First node (of those at offset 0) which may distinguish positions;
588 We use a)b)d) and multiline-part of c), and try to find a position in the
589 string which does not contradict any of them.
592 /* Most of decisions we do here should have been done at compile time.
593 The nodes of the REx which we used for the search should have been
594 deleted from the finite automaton. */
597 * rx: the regex to match against
598 * sv: the SV being matched: only used for utf8 flag; the string
599 * itself is accessed via the pointers below. Note that on
600 * something like an overloaded SV, SvPOK(sv) may be false
601 * and the string pointers may point to something unrelated to
603 * strbeg: real beginning of string
604 * strpos: the point in the string at which to begin matching
605 * strend: pointer to the byte following the last char of the string
606 * flags currently unused; set to 0
607 * data: currently unused; set to NULL
611 Perl_re_intuit_start(pTHX_
614 const char * const strbeg,
618 re_scream_pos_data *data)
621 struct regexp *const prog = ReANY(rx);
623 /* Should be nonnegative! */
628 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
630 char *other_last = NULL; /* other substr checked before this */
631 char *check_at = NULL; /* check substr found at this pos */
632 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
633 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
634 RXi_GET_DECL(prog,progi);
635 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
636 regmatch_info *const reginfo = ®info_buf;
638 const char * const i_strpos = strpos;
640 GET_RE_DEBUG_FLAGS_DECL;
642 PERL_ARGS_ASSERT_RE_INTUIT_START;
643 PERL_UNUSED_ARG(flags);
644 PERL_UNUSED_ARG(data);
646 reginfo->is_utf8_target = cBOOL(utf8_target);
648 /* CHR_DIST() would be more correct here but it makes things slow. */
649 if (prog->minlen > strend - strpos) {
650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
651 "String too short... [re_intuit_start]\n"));
655 reginfo->info_aux = NULL;
656 reginfo->strbeg = strbeg;
657 reginfo->strend = strend;
658 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
660 /* not actually used within intuit, but zero for safety anyway */
661 reginfo->poscache_maxiter = 0;
664 if (!prog->check_utf8 && prog->check_substr)
665 to_utf8_substr(prog);
666 check = prog->check_utf8;
668 if (!prog->check_substr && prog->check_utf8) {
669 if (! to_byte_substr(prog)) {
670 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
673 check = prog->check_substr;
675 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
676 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
677 || ( (prog->extflags & RXf_ANCH_BOL)
678 && !multiline ) ); /* Check after \n? */
681 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
682 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
683 && (strpos != strbeg)) {
684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
687 if (prog->check_offset_min == prog->check_offset_max
688 && !(prog->extflags & RXf_CANY_SEEN)
689 && ! multiline) /* /m can cause \n's to match that aren't
690 accounted for in the string max length.
691 See [perl #115242] */
693 /* Substring at constant offset from beg-of-str... */
696 s = HOP3c(strpos, prog->check_offset_min, strend);
699 slen = SvCUR(check); /* >= 1 */
701 if ( strend - s > slen || strend - s < slen - 1
702 || (strend - s == slen && strend[-1] != '\n')) {
703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
706 /* Now should match s[0..slen-2] */
708 if (slen && (*SvPVX_const(check) != *s
710 && memNE(SvPVX_const(check), s, slen)))) {
712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
716 else if (*SvPVX_const(check) != *s
717 || ((slen = SvCUR(check)) > 1
718 && memNE(SvPVX_const(check), s, slen)))
721 goto success_at_start;
724 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
726 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
727 end_shift = prog->check_end_shift;
730 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
731 - (SvTAIL(check) != 0);
732 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
734 if (end_shift < eshift)
738 else { /* Can match at random position */
741 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
742 end_shift = prog->check_end_shift;
744 /* end shift should be non negative here */
747 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
749 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
750 (IV)end_shift, RX_PRECOMP(prog));
754 /* Find a possible match in the region s..strend by looking for
755 the "check" substring in the region corrected by start/end_shift. */
758 I32 srch_start_shift = start_shift;
759 I32 srch_end_shift = end_shift;
762 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
763 srch_end_shift -= ((strbeg - s) - srch_start_shift);
764 srch_start_shift = strbeg - s;
766 DEBUG_OPTIMISE_MORE_r({
767 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
768 (IV)prog->check_offset_min,
769 (IV)srch_start_shift,
771 (IV)prog->check_end_shift);
774 if (prog->extflags & RXf_CANY_SEEN) {
775 start_point= (U8*)(s + srch_start_shift);
776 end_point= (U8*)(strend - srch_end_shift);
778 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
779 end_point= HOP3(strend, -srch_end_shift, strbeg);
781 DEBUG_OPTIMISE_MORE_r({
782 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
783 (int)(end_point - start_point),
784 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
788 s = fbm_instr( start_point, end_point,
789 check, multiline ? FBMrf_MULTILINE : 0);
791 /* Update the count-of-usability, remove useless subpatterns,
795 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
796 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
797 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
798 (s ? "Found" : "Did not find"),
799 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
800 ? "anchored" : "floating"),
803 (s ? " at offset " : "...\n") );
808 /* Finish the diagnostic message */
809 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
811 /* XXX dmq: first branch is for positive lookbehind...
812 Our check string is offset from the beginning of the pattern.
813 So we need to do any stclass tests offset forward from that
822 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
823 Start with the other substr.
824 XXXX no SCREAM optimization yet - and a very coarse implementation
825 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
826 *always* match. Probably should be marked during compile...
827 Probably it is right to do no SCREAM here...
830 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
831 : (prog->float_substr && prog->anchored_substr))
833 /* Take into account the "other" substring. */
834 /* XXXX May be hopelessly wrong for UTF... */
837 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
840 char * const last = HOP3c(s, -start_shift, strbeg);
842 char * const saved_s = s;
845 t = s - prog->check_offset_max;
846 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
848 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
853 t = HOP3c(t, prog->anchored_offset, strend);
854 if (t < other_last) /* These positions already checked */
856 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
859 /* XXXX It is not documented what units *_offsets are in.
860 We assume bytes, but this is clearly wrong.
861 Meaning this code needs to be carefully reviewed for errors.
865 /* On end-of-str: see comment below. */
866 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
867 if (must == &PL_sv_undef) {
869 DEBUG_r(must = prog->anchored_utf8); /* for debug */
874 HOP3(HOP3(last1, prog->anchored_offset, strend)
875 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
877 multiline ? FBMrf_MULTILINE : 0
880 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
881 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
882 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
883 (s ? "Found" : "Contradicts"),
884 quoted, RE_SV_TAIL(must));
889 if (last1 >= last2) {
890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
891 ", giving up...\n"));
894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
895 ", trying floating at offset %ld...\n",
896 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
897 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
898 s = HOP3c(last, 1, strend);
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
903 (long)(s - i_strpos)));
904 t = HOP3c(s, -prog->anchored_offset, strbeg);
905 other_last = HOP3c(s, 1, strend);
913 else { /* Take into account the floating substring. */
915 char * const saved_s = s;
918 t = HOP3c(s, -start_shift, strbeg);
920 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
921 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
922 last = HOP3c(t, prog->float_max_offset, strend);
923 s = HOP3c(t, prog->float_min_offset, strend);
926 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
927 must = utf8_target ? prog->float_utf8 : prog->float_substr;
928 /* fbm_instr() takes into account exact value of end-of-str
929 if the check is SvTAIL(ed). Since false positives are OK,
930 and end-of-str is not later than strend we are OK. */
931 if (must == &PL_sv_undef) {
933 DEBUG_r(must = prog->float_utf8); /* for debug message */
936 s = fbm_instr((unsigned char*)s,
937 (unsigned char*)last + SvCUR(must)
939 must, multiline ? FBMrf_MULTILINE : 0);
941 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
942 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
943 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
944 (s ? "Found" : "Contradicts"),
945 quoted, RE_SV_TAIL(must));
949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950 ", giving up...\n"));
953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
954 ", trying anchored starting at offset %ld...\n",
955 (long)(saved_s + 1 - i_strpos)));
957 s = HOP3c(t, 1, strend);
961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
962 (long)(s - i_strpos)));
963 other_last = s; /* Fix this later. --Hugo */
973 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
975 DEBUG_OPTIMISE_MORE_r(
976 PerlIO_printf(Perl_debug_log,
977 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
978 (IV)prog->check_offset_min,
979 (IV)prog->check_offset_max,
987 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
989 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
992 /* Fixed substring is found far enough so that the match
993 cannot start at strpos. */
995 if (ml_anch && t[-1] != '\n') {
996 /* Eventually fbm_*() should handle this, but often
997 anchored_offset is not 0, so this check will not be wasted. */
998 /* XXXX In the code below we prefer to look for "^" even in
999 presence of anchored substrings. And we search even
1000 beyond the found float position. These pessimizations
1001 are historical artefacts only. */
1003 while (t < strend - prog->minlen) {
1005 if (t < check_at - prog->check_offset_min) {
1006 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1007 /* Since we moved from the found position,
1008 we definitely contradict the found anchored
1009 substr. Due to the above check we do not
1010 contradict "check" substr.
1011 Thus we can arrive here only if check substr
1012 is float. Redo checking for "other"=="fixed".
1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1016 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1017 goto do_other_anchored;
1019 /* We don't contradict the found floating substring. */
1020 /* XXXX Why not check for STCLASS? */
1022 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1023 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1026 /* Position contradicts check-string */
1027 /* XXXX probably better to look for check-string
1028 than for "\n", so one should lower the limit for t? */
1029 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1030 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1031 other_last = strpos = s = t + 1;
1036 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1037 PL_colors[0], PL_colors[1]));
1041 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1042 PL_colors[0], PL_colors[1]));
1046 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1049 /* The found string does not prohibit matching at strpos,
1050 - no optimization of calling REx engine can be performed,
1051 unless it was an MBOL and we are not after MBOL,
1052 or a future STCLASS check will fail this. */
1054 /* Even in this situation we may use MBOL flag if strpos is offset
1055 wrt the start of the string. */
1056 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1057 /* May be due to an implicit anchor of m{.*foo} */
1058 && !(prog->intflags & PREGf_IMPLICIT))
1063 DEBUG_EXECUTE_r( if (ml_anch)
1064 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1065 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1068 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1070 prog->check_utf8 /* Could be deleted already */
1071 && --BmUSEFUL(prog->check_utf8) < 0
1072 && (prog->check_utf8 == prog->float_utf8)
1074 prog->check_substr /* Could be deleted already */
1075 && --BmUSEFUL(prog->check_substr) < 0
1076 && (prog->check_substr == prog->float_substr)
1079 /* If flags & SOMETHING - do not do it many times on the same match */
1080 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1081 /* XXX Does the destruction order has to change with utf8_target? */
1082 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1083 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1084 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1085 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1086 check = NULL; /* abort */
1088 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1089 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1090 if (prog->intflags & PREGf_IMPLICIT)
1091 prog->extflags &= ~RXf_ANCH_MBOL;
1092 /* XXXX This is a remnant of the old implementation. It
1093 looks wasteful, since now INTUIT can use many
1094 other heuristics. */
1095 prog->extflags &= ~RXf_USE_INTUIT;
1096 /* XXXX What other flags might need to be cleared in this branch? */
1102 /* Last resort... */
1103 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1104 /* trie stclasses are too expensive to use here, we are better off to
1105 leave it to regmatch itself */
1106 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1107 /* minlen == 0 is possible if regstclass is \b or \B,
1108 and the fixed substr is ''$.
1109 Since minlen is already taken into account, s+1 is before strend;
1110 accidentally, minlen >= 1 guaranties no false positives at s + 1
1111 even for \b or \B. But (minlen? 1 : 0) below assumes that
1112 regstclass does not come from lookahead... */
1113 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1114 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1115 const U8* const str = (U8*)STRING(progi->regstclass);
1116 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1117 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1120 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1121 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1122 else if (prog->float_substr || prog->float_utf8)
1123 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1127 if (checked_upto < s)
1129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1133 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1139 const char *what = NULL;
1141 if (endpos == strend) {
1142 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143 "Could not match STCLASS...\n") );
1146 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147 "This position contradicts STCLASS...\n") );
1148 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1150 checked_upto = HOPBACKc(endpos, start_shift);
1151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1152 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1153 /* Contradict one of substrings */
1154 if (prog->anchored_substr || prog->anchored_utf8) {
1155 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1156 DEBUG_EXECUTE_r( what = "anchored" );
1158 s = HOP3c(t, 1, strend);
1159 if (s + start_shift + end_shift > strend) {
1160 /* XXXX Should be taken into account earlier? */
1161 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1162 "Could not match STCLASS...\n") );
1167 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1168 "Looking for %s substr starting at offset %ld...\n",
1169 what, (long)(s + start_shift - i_strpos)) );
1172 /* Have both, check_string is floating */
1173 if (t + start_shift >= check_at) /* Contradicts floating=check */
1174 goto retry_floating_check;
1175 /* Recheck anchored substring, but not floating... */
1179 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1180 "Looking for anchored substr starting at offset %ld...\n",
1181 (long)(other_last - i_strpos)) );
1182 goto do_other_anchored;
1184 /* Another way we could have checked stclass at the
1185 current position only: */
1190 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1191 "Looking for /%s^%s/m starting at offset %ld...\n",
1192 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1195 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1197 /* Check is floating substring. */
1198 retry_floating_check:
1199 t = check_at - start_shift;
1200 DEBUG_EXECUTE_r( what = "floating" );
1201 goto hop_and_restart;
1204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205 "By STCLASS: moving %ld --> %ld\n",
1206 (long)(t - i_strpos), (long)(s - i_strpos))
1210 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1211 "Does not contradict STCLASS...\n");
1216 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1217 PL_colors[4], (check ? "Guessed" : "Giving up"),
1218 PL_colors[5], (long)(s - i_strpos)) );
1221 fail_finish: /* Substring not found */
1222 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1223 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1225 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1226 PL_colors[4], PL_colors[5]));
1230 #define DECL_TRIE_TYPE(scan) \
1231 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1232 trie_type = ((scan->flags == EXACT) \
1233 ? (utf8_target ? trie_utf8 : trie_plain) \
1234 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1236 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1239 switch (trie_type) { \
1240 case trie_utf8_fold: \
1241 if ( foldlen>0 ) { \
1242 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1247 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1248 len = UTF8SKIP(uc); \
1249 skiplen = UNISKIP( uvc ); \
1250 foldlen -= skiplen; \
1251 uscan = foldbuf + skiplen; \
1254 case trie_latin_utf8_fold: \
1255 if ( foldlen>0 ) { \
1256 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1262 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
1263 skiplen = UNISKIP( uvc ); \
1264 foldlen -= skiplen; \
1265 uscan = foldbuf + skiplen; \
1269 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1276 charid = trie->charmap[ uvc ]; \
1280 if (widecharmap) { \
1281 SV** const svpp = hv_fetch(widecharmap, \
1282 (char*)&uvc, sizeof(UV), 0); \
1284 charid = (U16)SvIV(*svpp); \
1289 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1293 && (ln == 1 || folder(s, pat_string, ln)) \
1294 && (reginfo->intuit || regtry(reginfo, &s)) )\
1300 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1302 while (s < strend) { \
1308 #define REXEC_FBC_SCAN(CoDe) \
1310 while (s < strend) { \
1316 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1317 REXEC_FBC_UTF8_SCAN( \
1319 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1328 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1331 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1340 #define REXEC_FBC_TRYIT \
1341 if ((reginfo->intuit || regtry(reginfo, &s))) \
1344 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1345 if (utf8_target) { \
1346 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1349 REXEC_FBC_CLASS_SCAN(CoNd); \
1352 #define DUMP_EXEC_POS(li,s,doutf8) \
1353 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1354 (PL_reg_starttry),doutf8)
1357 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1358 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1359 tmp = TEST_NON_UTF8(tmp); \
1360 REXEC_FBC_UTF8_SCAN( \
1361 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1370 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1371 if (s == reginfo->strbeg) { \
1375 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1376 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1379 LOAD_UTF8_CHARCLASS_ALNUM(); \
1380 REXEC_FBC_UTF8_SCAN( \
1381 if (tmp == ! (TeSt2_UtF8)) { \
1390 /* The only difference between the BOUND and NBOUND cases is that
1391 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1392 * NBOUND. This is accomplished by passing it in either the if or else clause,
1393 * with the other one being empty */
1394 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1395 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1397 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1398 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1400 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1403 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1407 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1408 * be passed in completely with the variable name being tested, which isn't
1409 * such a clean interface, but this is easier to read than it was before. We
1410 * are looking for the boundary (or non-boundary between a word and non-word
1411 * character. The utf8 and non-utf8 cases have the same logic, but the details
1412 * must be different. Find the "wordness" of the character just prior to this
1413 * one, and compare it with the wordness of this one. If they differ, we have
1414 * a boundary. At the beginning of the string, pretend that the previous
1415 * character was a new-line */
1416 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1417 if (utf8_target) { \
1420 else { /* Not utf8 */ \
1421 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1422 tmp = TEST_NON_UTF8(tmp); \
1424 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1433 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1436 /* We know what class REx starts with. Try to find this position... */
1437 /* if reginfo->intuit, its a dryrun */
1438 /* annoyingly all the vars in this routine have different names from their counterparts
1439 in regmatch. /grrr */
1442 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1443 const char *strend, regmatch_info *reginfo)
1446 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1447 char *pat_string; /* The pattern's exactish string */
1448 char *pat_end; /* ptr to end char of pat_string */
1449 re_fold_t folder; /* Function for computing non-utf8 folds */
1450 const U8 *fold_array; /* array for folding ords < 256 */
1456 I32 tmp = 1; /* Scratch variable? */
1457 const bool utf8_target = reginfo->is_utf8_target;
1458 UV utf8_fold_flags = 0;
1459 const bool is_utf8_pat = reginfo->is_utf8_pat;
1460 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1461 with a result inverts that result, as 0^1 =
1463 _char_class_number classnum;
1465 RXi_GET_DECL(prog,progi);
1467 PERL_ARGS_ASSERT_FIND_BYCLASS;
1469 /* We know what class it must start with. */
1472 case ANYOF_SYNTHETIC:
1473 case ANYOF_WARN_SUPER:
1475 REXEC_FBC_UTF8_CLASS_SCAN(
1476 reginclass(prog, c, (U8*)s, utf8_target));
1479 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1484 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1492 if (is_utf8_pat || utf8_target) {
1493 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1494 goto do_exactf_utf8;
1496 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1497 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1498 goto do_exactf_non_utf8; /* isn't dealt with by these */
1503 /* regcomp.c already folded this if pattern is in UTF-8 */
1504 utf8_fold_flags = 0;
1505 goto do_exactf_utf8;
1507 fold_array = PL_fold;
1509 goto do_exactf_non_utf8;
1512 if (is_utf8_pat || utf8_target) {
1513 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1514 goto do_exactf_utf8;
1516 fold_array = PL_fold_locale;
1517 folder = foldEQ_locale;
1518 goto do_exactf_non_utf8;
1522 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1524 goto do_exactf_utf8;
1526 case EXACTFU_TRICKYFOLD:
1528 if (is_utf8_pat || utf8_target) {
1529 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1530 goto do_exactf_utf8;
1533 /* Any 'ss' in the pattern should have been replaced by regcomp,
1534 * so we don't have to worry here about this single special case
1535 * in the Latin1 range */
1536 fold_array = PL_fold_latin1;
1537 folder = foldEQ_latin1;
1541 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1542 are no glitches with fold-length differences
1543 between the target string and pattern */
1545 /* The idea in the non-utf8 EXACTF* cases is to first find the
1546 * first character of the EXACTF* node and then, if necessary,
1547 * case-insensitively compare the full text of the node. c1 is the
1548 * first character. c2 is its fold. This logic will not work for
1549 * Unicode semantics and the german sharp ss, which hence should
1550 * not be compiled into a node that gets here. */
1551 pat_string = STRING(c);
1552 ln = STR_LEN(c); /* length to match in octets/bytes */
1554 /* We know that we have to match at least 'ln' bytes (which is the
1555 * same as characters, since not utf8). If we have to match 3
1556 * characters, and there are only 2 availabe, we know without
1557 * trying that it will fail; so don't start a match past the
1558 * required minimum number from the far end */
1559 e = HOP3c(strend, -((I32)ln), s);
1561 if (reginfo->intuit && e < s) {
1562 e = s; /* Due to minlen logic of intuit() */
1566 c2 = fold_array[c1];
1567 if (c1 == c2) { /* If char and fold are the same */
1568 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1571 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1579 /* If one of the operands is in utf8, we can't use the simpler folding
1580 * above, due to the fact that many different characters can have the
1581 * same fold, or portion of a fold, or different- length fold */
1582 pat_string = STRING(c);
1583 ln = STR_LEN(c); /* length to match in octets/bytes */
1584 pat_end = pat_string + ln;
1585 lnc = is_utf8_pat /* length to match in characters */
1586 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1589 /* We have 'lnc' characters to match in the pattern, but because of
1590 * multi-character folding, each character in the target can match
1591 * up to 3 characters (Unicode guarantees it will never exceed
1592 * this) if it is utf8-encoded; and up to 2 if not (based on the
1593 * fact that the Latin 1 folds are already determined, and the
1594 * only multi-char fold in that range is the sharp-s folding to
1595 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1596 * string character. Adjust lnc accordingly, rounding up, so that
1597 * if we need to match at least 4+1/3 chars, that really is 5. */
1598 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1599 lnc = (lnc + expansion - 1) / expansion;
1601 /* As in the non-UTF8 case, if we have to match 3 characters, and
1602 * only 2 are left, it's guaranteed to fail, so don't start a
1603 * match that would require us to go beyond the end of the string
1605 e = HOP3c(strend, -((I32)lnc), s);
1607 if (reginfo->intuit && e < s) {
1608 e = s; /* Due to minlen logic of intuit() */
1611 /* XXX Note that we could recalculate e to stop the loop earlier,
1612 * as the worst case expansion above will rarely be met, and as we
1613 * go along we would usually find that e moves further to the left.
1614 * This would happen only after we reached the point in the loop
1615 * where if there were no expansion we should fail. Unclear if
1616 * worth the expense */
1619 char *my_strend= (char *)strend;
1620 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1621 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1622 && (reginfo->intuit || regtry(reginfo, &s)) )
1626 s += (utf8_target) ? UTF8SKIP(s) : 1;
1631 RXp_MATCH_TAINTED_on(prog);
1632 FBC_BOUND(isWORDCHAR_LC,
1633 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634 isWORDCHAR_LC_utf8((U8*)s));
1637 RXp_MATCH_TAINTED_on(prog);
1638 FBC_NBOUND(isWORDCHAR_LC,
1639 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1640 isWORDCHAR_LC_utf8((U8*)s));
1643 FBC_BOUND(isWORDCHAR,
1644 isWORDCHAR_uni(tmp),
1645 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1648 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A((U8*)s));
1653 FBC_NBOUND(isWORDCHAR,
1654 isWORDCHAR_uni(tmp),
1655 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1658 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1660 isWORDCHAR_A((U8*)s));
1663 FBC_BOUND(isWORDCHAR_L1,
1664 isWORDCHAR_uni(tmp),
1665 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1668 FBC_NBOUND(isWORDCHAR_L1,
1669 isWORDCHAR_uni(tmp),
1670 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1673 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1674 is_LNBREAK_latin1_safe(s, strend)
1678 /* The argument to all the POSIX node types is the class number to pass to
1679 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1686 RXp_MATCH_TAINTED_on(prog);
1687 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1688 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1703 /* The complement of something that matches only ASCII matches all
1704 * UTF-8 variant code points, plus everything in ASCII that isn't
1706 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1707 || ! _generic_isCC_A(*s, FLAGS(c)));
1716 /* Don't need to worry about utf8, as it can match only a single
1717 * byte invariant character. */
1718 REXEC_FBC_CLASS_SCAN(
1719 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1727 if (! utf8_target) {
1728 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1734 classnum = (_char_class_number) FLAGS(c);
1735 if (classnum < _FIRST_NON_SWASH_CC) {
1736 while (s < strend) {
1738 /* We avoid loading in the swash as long as possible, but
1739 * should we have to, we jump to a separate loop. This
1740 * extra 'if' statement is what keeps this code from being
1741 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1742 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1743 goto found_above_latin1;
1745 if ((UTF8_IS_INVARIANT(*s)
1746 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1748 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1749 && to_complement ^ cBOOL(
1750 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1753 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1765 else switch (classnum) { /* These classes are implemented as
1767 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1768 revert the change of \v matching this */
1771 case _CC_ENUM_PSXSPC:
1772 REXEC_FBC_UTF8_CLASS_SCAN(
1773 to_complement ^ cBOOL(isSPACE_utf8(s)));
1776 case _CC_ENUM_BLANK:
1777 REXEC_FBC_UTF8_CLASS_SCAN(
1778 to_complement ^ cBOOL(isBLANK_utf8(s)));
1781 case _CC_ENUM_XDIGIT:
1782 REXEC_FBC_UTF8_CLASS_SCAN(
1783 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1786 case _CC_ENUM_VERTSPACE:
1787 REXEC_FBC_UTF8_CLASS_SCAN(
1788 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1791 case _CC_ENUM_CNTRL:
1792 REXEC_FBC_UTF8_CLASS_SCAN(
1793 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1797 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1798 assert(0); /* NOTREACHED */
1803 found_above_latin1: /* Here we have to load a swash to get the result
1804 for the current code point */
1805 if (! PL_utf8_swash_ptrs[classnum]) {
1806 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1807 PL_utf8_swash_ptrs[classnum] =
1808 _core_swash_init("utf8", swash_property_names[classnum],
1809 &PL_sv_undef, 1, 0, NULL, &flags);
1812 /* This is a copy of the loop above for swash classes, though using the
1813 * FBC macro instead of being expanded out. Since we've loaded the
1814 * swash, we don't have to check for that each time through the loop */
1815 REXEC_FBC_UTF8_CLASS_SCAN(
1816 to_complement ^ cBOOL(_generic_utf8(
1819 swash_fetch(PL_utf8_swash_ptrs[classnum],
1827 /* what trie are we using right now */
1828 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1829 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1830 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1832 const char *last_start = strend - trie->minlen;
1834 const char *real_start = s;
1836 STRLEN maxlen = trie->maxlen;
1838 U8 **points; /* map of where we were in the input string
1839 when reading a given char. For ASCII this
1840 is unnecessary overhead as the relationship
1841 is always 1:1, but for Unicode, especially
1842 case folded Unicode this is not true. */
1843 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1847 GET_RE_DEBUG_FLAGS_DECL;
1849 /* We can't just allocate points here. We need to wrap it in
1850 * an SV so it gets freed properly if there is a croak while
1851 * running the match */
1854 sv_points=newSV(maxlen * sizeof(U8 *));
1855 SvCUR_set(sv_points,
1856 maxlen * sizeof(U8 *));
1857 SvPOK_on(sv_points);
1858 sv_2mortal(sv_points);
1859 points=(U8**)SvPV_nolen(sv_points );
1860 if ( trie_type != trie_utf8_fold
1861 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1864 bitmap=(U8*)trie->bitmap;
1866 bitmap=(U8*)ANYOF_BITMAP(c);
1868 /* this is the Aho-Corasick algorithm modified a touch
1869 to include special handling for long "unknown char" sequences.
1870 The basic idea being that we use AC as long as we are dealing
1871 with a possible matching char, when we encounter an unknown char
1872 (and we have not encountered an accepting state) we scan forward
1873 until we find a legal starting char.
1874 AC matching is basically that of trie matching, except that when
1875 we encounter a failing transition, we fall back to the current
1876 states "fail state", and try the current char again, a process
1877 we repeat until we reach the root state, state 1, or a legal
1878 transition. If we fail on the root state then we can either
1879 terminate if we have reached an accepting state previously, or
1880 restart the entire process from the beginning if we have not.
1883 while (s <= last_start) {
1884 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1892 U8 *uscan = (U8*)NULL;
1893 U8 *leftmost = NULL;
1895 U32 accepted_word= 0;
1899 while ( state && uc <= (U8*)strend ) {
1901 U32 word = aho->states[ state ].wordnum;
1905 DEBUG_TRIE_EXECUTE_r(
1906 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907 dump_exec_pos( (char *)uc, c, strend, real_start,
1908 (char *)uc, utf8_target );
1909 PerlIO_printf( Perl_debug_log,
1910 " Scanning for legal start char...\n");
1914 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1924 if (uc >(U8*)last_start) break;
1928 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929 if (!leftmost || lpos < leftmost) {
1930 DEBUG_r(accepted_word=word);
1936 points[pointpos++ % maxlen]= uc;
1937 if (foldlen || uc < (U8*)strend) {
1938 REXEC_TRIE_READ_CHAR(trie_type, trie,
1940 uscan, len, uvc, charid, foldlen,
1942 DEBUG_TRIE_EXECUTE_r({
1943 dump_exec_pos( (char *)uc, c, strend,
1944 real_start, s, utf8_target);
1945 PerlIO_printf(Perl_debug_log,
1946 " Charid:%3u CP:%4"UVxf" ",
1958 word = aho->states[ state ].wordnum;
1960 base = aho->states[ state ].trans.base;
1962 DEBUG_TRIE_EXECUTE_r({
1964 dump_exec_pos( (char *)uc, c, strend, real_start,
1966 PerlIO_printf( Perl_debug_log,
1967 "%sState: %4"UVxf", word=%"UVxf,
1968 failed ? " Fail transition to " : "",
1969 (UV)state, (UV)word);
1975 ( ((offset = base + charid
1976 - 1 - trie->uniquecharcount)) >= 0)
1977 && ((U32)offset < trie->lasttrans)
1978 && trie->trans[offset].check == state
1979 && (tmp=trie->trans[offset].next))
1981 DEBUG_TRIE_EXECUTE_r(
1982 PerlIO_printf( Perl_debug_log," - legal\n"));
1987 DEBUG_TRIE_EXECUTE_r(
1988 PerlIO_printf( Perl_debug_log," - fail\n"));
1990 state = aho->fail[state];
1994 /* we must be accepting here */
1995 DEBUG_TRIE_EXECUTE_r(
1996 PerlIO_printf( Perl_debug_log," - accepting\n"));
2005 if (!state) state = 1;
2008 if ( aho->states[ state ].wordnum ) {
2009 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2010 if (!leftmost || lpos < leftmost) {
2011 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2016 s = (char*)leftmost;
2017 DEBUG_TRIE_EXECUTE_r({
2019 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2020 (UV)accepted_word, (IV)(s - real_start)
2023 if (reginfo->intuit || regtry(reginfo, &s)) {
2029 DEBUG_TRIE_EXECUTE_r({
2030 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2033 DEBUG_TRIE_EXECUTE_r(
2034 PerlIO_printf( Perl_debug_log,"No match.\n"));
2043 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2053 - regexec_flags - match a regexp against a string
2056 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2057 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2058 /* stringarg: the point in the string at which to begin matching */
2059 /* strend: pointer to null at end of string */
2060 /* strbeg: real beginning of string */
2061 /* minend: end of match must be >= minend bytes after stringarg. */
2062 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2063 * itself is accessed via the pointers above */
2064 /* data: May be used for some additional optimizations.
2065 Currently its only used, with a U32 cast, for transmitting
2066 the ganch offset when doing a /g match. This will change */
2067 /* nosave: For optimizations. */
2071 struct regexp *const prog = ReANY(rx);
2074 char *startpos = stringarg;
2075 I32 minlen; /* must match at least this many chars */
2076 I32 dontbother = 0; /* how many characters not to try at end */
2077 I32 end_shift = 0; /* Same for the end. */ /* CC */
2078 I32 scream_pos = -1; /* Internal iterator of scream. */
2079 char *scream_olds = NULL;
2080 const bool utf8_target = cBOOL(DO_UTF8(sv));
2082 RXi_GET_DECL(prog,progi);
2083 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2084 regmatch_info *const reginfo = ®info_buf;
2085 regexp_paren_pair *swap = NULL;
2087 GET_RE_DEBUG_FLAGS_DECL;
2089 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2090 PERL_UNUSED_ARG(data);
2092 /* Be paranoid... */
2093 if (prog == NULL || startpos == NULL) {
2094 Perl_croak(aTHX_ "NULL regexp parameter");
2098 multiline = prog->extflags & RXf_PMf_MULTILINE;
2100 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2101 reginfo->intuit = 0;
2102 reginfo->is_utf8_target = cBOOL(utf8_target);
2104 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2105 * which will call destuctors to reset PL_regmatch_state, free higher
2106 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2107 * regmatch_info_aux_eval */
2109 oldsave = PL_savestack_ix;
2112 debug_start_match(rx, utf8_target, startpos, strend,
2116 minlen = prog->minlen;
2118 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2119 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2120 "String too short [regexec_flags]...\n"));
2125 /* Check validity of program. */
2126 if (UCHARAT(progi->program) != REG_MAGIC) {
2127 Perl_croak(aTHX_ "corrupted regexp program");
2130 RX_MATCH_TAINTED_off(rx);
2132 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2133 reginfo->warned = FALSE;
2134 reginfo->strbeg = strbeg;
2136 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2138 /* Mark end of string for $ (and such) */
2139 reginfo->strend = strend;
2141 /* see how far we have to get to not match where we matched before */
2142 reginfo->till = startpos+minend;
2144 /* reserve next 2 or 3 slots in PL_regmatch_state:
2145 * slot N+0: may currently be in use: skip it
2146 * slot N+1: use for regmatch_info_aux struct
2147 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2148 * slot N+3: ready for use by regmatch()
2152 regmatch_state *old_regmatch_state;
2153 regmatch_slab *old_regmatch_slab;
2154 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2156 /* on first ever match, allocate first slab */
2157 if (!PL_regmatch_slab) {
2158 Newx(PL_regmatch_slab, 1, regmatch_slab);
2159 PL_regmatch_slab->prev = NULL;
2160 PL_regmatch_slab->next = NULL;
2161 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2164 old_regmatch_state = PL_regmatch_state;
2165 old_regmatch_slab = PL_regmatch_slab;
2167 for (i=0; i <= max; i++) {
2169 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2171 reginfo->info_aux_eval =
2172 reginfo->info_aux->info_aux_eval =
2173 &(PL_regmatch_state->u.info_aux_eval);
2175 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2176 PL_regmatch_state = S_push_slab(aTHX);
2179 /* note initial PL_regmatch_state position; at end of match we'll
2180 * pop back to there and free any higher slabs */
2182 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2183 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2184 reginfo->info_aux->poscache = NULL;
2186 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2188 if ((prog->extflags & RXf_EVAL_SEEN))
2189 S_setup_eval_state(aTHX_ reginfo);
2191 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2194 /* If there is a "must appear" string, look for it. */
2197 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2199 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2200 reginfo->ganch = startpos + prog->gofs;
2201 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2202 "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2203 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2205 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2206 && mg->mg_len >= 0) {
2207 reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2208 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2209 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2211 if (prog->extflags & RXf_ANCH_GPOS) {
2212 if (s > reginfo->ganch)
2214 s = reginfo->ganch - prog->gofs;
2215 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2216 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2222 reginfo->ganch = strbeg + PTR2UV(data);
2223 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2224 "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2226 } else { /* pos() not defined */
2227 reginfo->ganch = strbeg;
2228 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2229 "GPOS: reginfo->ganch = strbeg\n"));
2232 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2233 /* We have to be careful. If the previous successful match
2234 was from this regex we don't want a subsequent partially
2235 successful match to clobber the old results.
2236 So when we detect this possibility we add a swap buffer
2237 to the re, and switch the buffer each match. If we fail,
2238 we switch it back; otherwise we leave it swapped.
2241 /* do we need a save destructor here for eval dies? */
2242 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2243 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2244 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2250 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2251 re_scream_pos_data d;
2253 d.scream_olds = &scream_olds;
2254 d.scream_pos = &scream_pos;
2255 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2257 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2258 goto phooey; /* not present */
2264 /* Simplest case: anchored match need be tried only once. */
2265 /* [unless only anchor is BOL and multiline is set] */
2266 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2267 if (s == startpos && regtry(reginfo, &startpos))
2269 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2270 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2275 dontbother = minlen - 1;
2276 end = HOP3c(strend, -dontbother, strbeg) - 1;
2277 /* for multiline we only have to try after newlines */
2278 if (prog->check_substr || prog->check_utf8) {
2279 /* because of the goto we can not easily reuse the macros for bifurcating the
2280 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2283 goto after_try_utf8;
2285 if (regtry(reginfo, &s)) {
2292 if (prog->extflags & RXf_USE_INTUIT) {
2293 s = re_intuit_start(rx, sv, strbeg,
2294 s + UTF8SKIP(s), strend, flags, NULL);
2303 } /* end search for check string in unicode */
2305 if (s == startpos) {
2306 goto after_try_latin;
2309 if (regtry(reginfo, &s)) {
2316 if (prog->extflags & RXf_USE_INTUIT) {
2317 s = re_intuit_start(rx, sv, strbeg,
2318 s + 1, strend, flags, NULL);
2327 } /* end search for check string in latin*/
2328 } /* end search for check string */
2329 else { /* search for newline */
2331 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2334 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2335 while (s <= end) { /* note it could be possible to match at the end of the string */
2336 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2337 if (regtry(reginfo, &s))
2341 } /* end search for newline */
2342 } /* end anchored/multiline check string search */
2344 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2346 /* the warning about reginfo->ganch being used without initialization
2347 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2348 and we only enter this block when the same bit is set. */
2349 char *tmp_s = reginfo->ganch - prog->gofs;
2351 if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2356 /* Messy cases: unanchored match. */
2357 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2358 /* we have /x+whatever/ */
2359 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2365 if (! prog->anchored_utf8) {
2366 to_utf8_substr(prog);
2368 ch = SvPVX_const(prog->anchored_utf8)[0];
2371 DEBUG_EXECUTE_r( did_match = 1 );
2372 if (regtry(reginfo, &s)) goto got_it;
2374 while (s < strend && *s == ch)
2381 if (! prog->anchored_substr) {
2382 if (! to_byte_substr(prog)) {
2383 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2386 ch = SvPVX_const(prog->anchored_substr)[0];
2389 DEBUG_EXECUTE_r( did_match = 1 );
2390 if (regtry(reginfo, &s)) goto got_it;
2392 while (s < strend && *s == ch)
2397 DEBUG_EXECUTE_r(if (!did_match)
2398 PerlIO_printf(Perl_debug_log,
2399 "Did not find anchored character...\n")
2402 else if (prog->anchored_substr != NULL
2403 || prog->anchored_utf8 != NULL
2404 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2405 && prog->float_max_offset < strend - s)) {
2410 char *last1; /* Last position checked before */
2414 if (prog->anchored_substr || prog->anchored_utf8) {
2416 if (! prog->anchored_utf8) {
2417 to_utf8_substr(prog);
2419 must = prog->anchored_utf8;
2422 if (! prog->anchored_substr) {
2423 if (! to_byte_substr(prog)) {
2424 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2427 must = prog->anchored_substr;
2429 back_max = back_min = prog->anchored_offset;
2432 if (! prog->float_utf8) {
2433 to_utf8_substr(prog);
2435 must = prog->float_utf8;
2438 if (! prog->float_substr) {
2439 if (! to_byte_substr(prog)) {
2440 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2443 must = prog->float_substr;
2445 back_max = prog->float_max_offset;
2446 back_min = prog->float_min_offset;
2452 last = HOP3c(strend, /* Cannot start after this */
2453 -(I32)(CHR_SVLEN(must)
2454 - (SvTAIL(must) != 0) + back_min), strbeg);
2456 if (s > reginfo->strbeg)
2457 last1 = HOPc(s, -1);
2459 last1 = s - 1; /* bogus */
2461 /* XXXX check_substr already used to find "s", can optimize if
2462 check_substr==must. */
2464 dontbother = end_shift;
2465 strend = HOPc(strend, -dontbother);
2466 while ( (s <= last) &&
2467 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2468 (unsigned char*)strend, must,
2469 multiline ? FBMrf_MULTILINE : 0)) ) {
2470 DEBUG_EXECUTE_r( did_match = 1 );
2471 if (HOPc(s, -back_max) > last1) {
2472 last1 = HOPc(s, -back_min);
2473 s = HOPc(s, -back_max);
2476 char * const t = (last1 >= reginfo->strbeg)
2477 ? HOPc(last1, 1) : last1 + 1;
2479 last1 = HOPc(s, -back_min);
2483 while (s <= last1) {
2484 if (regtry(reginfo, &s))
2487 s++; /* to break out of outer loop */
2494 while (s <= last1) {
2495 if (regtry(reginfo, &s))
2501 DEBUG_EXECUTE_r(if (!did_match) {
2502 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2503 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2504 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2505 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2506 ? "anchored" : "floating"),
2507 quoted, RE_SV_TAIL(must));
2511 else if ( (c = progi->regstclass) ) {
2513 const OPCODE op = OP(progi->regstclass);
2514 /* don't bother with what can't match */
2515 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2516 strend = HOPc(strend, -(minlen - 1));
2519 SV * const prop = sv_newmortal();
2520 regprop(prog, prop, c);
2522 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2524 PerlIO_printf(Perl_debug_log,
2525 "Matching stclass %.*s against %s (%d bytes)\n",
2526 (int)SvCUR(prop), SvPVX_const(prop),
2527 quoted, (int)(strend - s));
2530 if (find_byclass(prog, c, s, strend, reginfo))
2532 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2536 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2544 if (! prog->float_utf8) {
2545 to_utf8_substr(prog);
2547 float_real = prog->float_utf8;
2550 if (! prog->float_substr) {
2551 if (! to_byte_substr(prog)) {
2552 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2555 float_real = prog->float_substr;
2558 little = SvPV_const(float_real, len);
2559 if (SvTAIL(float_real)) {
2560 /* This means that float_real contains an artificial \n on
2561 * the end due to the presence of something like this:
2562 * /foo$/ where we can match both "foo" and "foo\n" at the
2563 * end of the string. So we have to compare the end of the
2564 * string first against the float_real without the \n and
2565 * then against the full float_real with the string. We
2566 * have to watch out for cases where the string might be
2567 * smaller than the float_real or the float_real without
2569 char *checkpos= strend - len;
2571 PerlIO_printf(Perl_debug_log,
2572 "%sChecking for float_real.%s\n",
2573 PL_colors[4], PL_colors[5]));
2574 if (checkpos + 1 < strbeg) {
2575 /* can't match, even if we remove the trailing \n
2576 * string is too short to match */
2578 PerlIO_printf(Perl_debug_log,
2579 "%sString shorter than required trailing substring, cannot match.%s\n",
2580 PL_colors[4], PL_colors[5]));
2582 } else if (memEQ(checkpos + 1, little, len - 1)) {
2583 /* can match, the end of the string matches without the
2585 last = checkpos + 1;
2586 } else if (checkpos < strbeg) {
2587 /* cant match, string is too short when the "\n" is
2590 PerlIO_printf(Perl_debug_log,
2591 "%sString does not contain required trailing substring, cannot match.%s\n",
2592 PL_colors[4], PL_colors[5]));
2594 } else if (!multiline) {
2595 /* non multiline match, so compare with the "\n" at the
2596 * end of the string */
2597 if (memEQ(checkpos, little, len)) {
2601 PerlIO_printf(Perl_debug_log,
2602 "%sString does not contain required trailing substring, cannot match.%s\n",
2603 PL_colors[4], PL_colors[5]));
2607 /* multiline match, so we have to search for a place
2608 * where the full string is located */
2614 last = rninstr(s, strend, little, little + len);
2616 last = strend; /* matching "$" */
2619 /* at one point this block contained a comment which was
2620 * probably incorrect, which said that this was a "should not
2621 * happen" case. Even if it was true when it was written I am
2622 * pretty sure it is not anymore, so I have removed the comment
2623 * and replaced it with this one. Yves */
2625 PerlIO_printf(Perl_debug_log,
2626 "String does not contain required substring, cannot match.\n"
2630 dontbother = strend - last + prog->float_min_offset;
2632 if (minlen && (dontbother < minlen))
2633 dontbother = minlen - 1;
2634 strend -= dontbother; /* this one's always in bytes! */
2635 /* We don't know much -- general case. */
2638 if (regtry(reginfo, &s))
2647 if (regtry(reginfo, &s))
2649 } while (s++ < strend);
2659 PerlIO_printf(Perl_debug_log,
2660 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2667 /* clean up; this will trigger destructors that will free all slabs
2668 * above the current one, and cleanup the regmatch_info_aux
2669 * and regmatch_info_aux_eval sructs */
2671 LEAVE_SCOPE(oldsave);
2673 if (RXp_PAREN_NAMES(prog))
2674 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2676 RX_MATCH_UTF8_set(rx, utf8_target);
2678 /* make sure $`, $&, $', and $digit will work later */
2679 if ( !(flags & REXEC_NOT_FIRST) ) {
2680 if (flags & REXEC_COPY_STR) {
2684 PerlIO_printf(Perl_debug_log,
2685 "Copy on write: regexp capture, type %d\n",
2688 RX_MATCH_COPY_FREE(rx);
2689 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2690 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2691 assert (SvPOKp(prog->saved_copy));
2692 prog->sublen = reginfo->strend - strbeg;
2693 prog->suboffset = 0;
2694 prog->subcoffset = 0;
2699 I32 max = reginfo->strend - strbeg;
2702 if ( (flags & REXEC_COPY_SKIP_POST)
2703 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2704 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2705 ) { /* don't copy $' part of string */
2708 /* calculate the right-most part of the string covered
2709 * by a capture. Due to look-ahead, this may be to
2710 * the right of $&, so we have to scan all captures */
2711 while (n <= prog->lastparen) {
2712 if (prog->offs[n].end > max)
2713 max = prog->offs[n].end;
2717 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2718 ? prog->offs[0].start
2720 assert(max >= 0 && max <= reginfo->strend - strbeg);
2723 if ( (flags & REXEC_COPY_SKIP_PRE)
2724 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2725 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2726 ) { /* don't copy $` part of string */
2729 /* calculate the left-most part of the string covered
2730 * by a capture. Due to look-behind, this may be to
2731 * the left of $&, so we have to scan all captures */
2732 while (min && n <= prog->lastparen) {
2733 if ( prog->offs[n].start != -1
2734 && prog->offs[n].start < min)
2736 min = prog->offs[n].start;
2740 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2741 && min > prog->offs[0].end
2743 min = prog->offs[0].end;
2747 assert(min >= 0 && min <= max
2748 && min <= reginfo->strend - strbeg);
2751 if (RX_MATCH_COPIED(rx)) {
2752 if (sublen > prog->sublen)
2754 (char*)saferealloc(prog->subbeg, sublen+1);
2757 prog->subbeg = (char*)safemalloc(sublen+1);
2758 Copy(strbeg + min, prog->subbeg, sublen, char);
2759 prog->subbeg[sublen] = '\0';
2760 prog->suboffset = min;
2761 prog->sublen = sublen;
2762 RX_MATCH_COPIED_on(rx);
2764 prog->subcoffset = prog->suboffset;
2765 if (prog->suboffset && utf8_target) {
2766 /* Convert byte offset to chars.
2767 * XXX ideally should only compute this if @-/@+
2768 * has been seen, a la PL_sawampersand ??? */
2770 /* If there's a direct correspondence between the
2771 * string which we're matching and the original SV,
2772 * then we can use the utf8 len cache associated with
2773 * the SV. In particular, it means that under //g,
2774 * sv_pos_b2u() will use the previously cached
2775 * position to speed up working out the new length of
2776 * subcoffset, rather than counting from the start of
2777 * the string each time. This stops
2778 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2779 * from going quadratic */
2780 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2781 sv_pos_b2u(sv, &(prog->subcoffset));
2783 prog->subcoffset = utf8_length((U8*)strbeg,
2784 (U8*)(strbeg+prog->suboffset));
2788 RX_MATCH_COPY_FREE(rx);
2789 prog->subbeg = strbeg;
2790 prog->suboffset = 0;
2791 prog->subcoffset = 0;
2792 /* use reginfo->strend, as strend may have been modified */
2793 prog->sublen = reginfo->strend - strbeg;
2800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2801 PL_colors[4], PL_colors[5]));
2803 /* clean up; this will trigger destructors that will free all slabs
2804 * above the current one, and cleanup the regmatch_info_aux
2805 * and regmatch_info_aux_eval sructs */
2807 LEAVE_SCOPE(oldsave);
2810 /* we failed :-( roll it back */
2811 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2812 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2817 Safefree(prog->offs);
2824 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2825 * Do inc before dec, in case old and new rex are the same */
2826 #define SET_reg_curpm(Re2) \
2827 if (reginfo->info_aux_eval) { \
2828 (void)ReREFCNT_inc(Re2); \
2829 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2830 PM_SETRE((PL_reg_curpm), (Re2)); \
2835 - regtry - try match at specific point
2837 STATIC I32 /* 0 failure, 1 success */
2838 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2842 REGEXP *const rx = reginfo->prog;
2843 regexp *const prog = ReANY(rx);
2845 RXi_GET_DECL(prog,progi);
2846 GET_RE_DEBUG_FLAGS_DECL;
2848 PERL_ARGS_ASSERT_REGTRY;
2850 reginfo->cutpoint=NULL;
2853 PL_reg_starttry = *startposp;
2855 prog->offs[0].start = *startposp - reginfo->strbeg;
2856 prog->lastparen = 0;
2857 prog->lastcloseparen = 0;
2859 /* XXXX What this code is doing here?!!! There should be no need
2860 to do this again and again, prog->lastparen should take care of
2863 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2864 * Actually, the code in regcppop() (which Ilya may be meaning by
2865 * prog->lastparen), is not needed at all by the test suite
2866 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2867 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2868 * Meanwhile, this code *is* needed for the
2869 * above-mentioned test suite tests to succeed. The common theme
2870 * on those tests seems to be returning null fields from matches.
2871 * --jhi updated by dapm */
2873 if (prog->nparens) {
2874 regexp_paren_pair *pp = prog->offs;
2876 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2884 result = regmatch(reginfo, *startposp, progi->program + 1);
2886 prog->offs[0].end = result;
2889 if (reginfo->cutpoint)
2890 *startposp= reginfo->cutpoint;
2891 REGCP_UNWIND(lastcp);
2896 #define sayYES goto yes
2897 #define sayNO goto no
2898 #define sayNO_SILENT goto no_silent
2900 /* we dont use STMT_START/END here because it leads to
2901 "unreachable code" warnings, which are bogus, but distracting. */
2902 #define CACHEsayNO \
2903 if (ST.cache_mask) \
2904 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2907 /* this is used to determine how far from the left messages like
2908 'failed...' are printed. It should be set such that messages
2909 are inline with the regop output that created them.
2911 #define REPORT_CODE_OFF 32
2914 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2915 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2916 #define CHRTEST_NOT_A_CP_1 -999
2917 #define CHRTEST_NOT_A_CP_2 -998
2919 /* grab a new slab and return the first slot in it */
2921 STATIC regmatch_state *
2924 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2927 regmatch_slab *s = PL_regmatch_slab->next;
2929 Newx(s, 1, regmatch_slab);
2930 s->prev = PL_regmatch_slab;
2932 PL_regmatch_slab->next = s;
2934 PL_regmatch_slab = s;
2935 return SLAB_FIRST(s);
2939 /* push a new state then goto it */
2941 #define PUSH_STATE_GOTO(state, node, input) \
2942 pushinput = input; \
2944 st->resume_state = state; \
2947 /* push a new state with success backtracking, then goto it */
2949 #define PUSH_YES_STATE_GOTO(state, node, input) \
2950 pushinput = input; \
2952 st->resume_state = state; \
2953 goto push_yes_state;
2960 regmatch() - main matching routine
2962 This is basically one big switch statement in a loop. We execute an op,
2963 set 'next' to point the next op, and continue. If we come to a point which
2964 we may need to backtrack to on failure such as (A|B|C), we push a
2965 backtrack state onto the backtrack stack. On failure, we pop the top
2966 state, and re-enter the loop at the state indicated. If there are no more
2967 states to pop, we return failure.
2969 Sometimes we also need to backtrack on success; for example /A+/, where
2970 after successfully matching one A, we need to go back and try to
2971 match another one; similarly for lookahead assertions: if the assertion
2972 completes successfully, we backtrack to the state just before the assertion
2973 and then carry on. In these cases, the pushed state is marked as
2974 'backtrack on success too'. This marking is in fact done by a chain of
2975 pointers, each pointing to the previous 'yes' state. On success, we pop to
2976 the nearest yes state, discarding any intermediate failure-only states.
2977 Sometimes a yes state is pushed just to force some cleanup code to be
2978 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2979 it to free the inner regex.
2981 Note that failure backtracking rewinds the cursor position, while
2982 success backtracking leaves it alone.
2984 A pattern is complete when the END op is executed, while a subpattern
2985 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2986 ops trigger the "pop to last yes state if any, otherwise return true"
2989 A common convention in this function is to use A and B to refer to the two
2990 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2991 the subpattern to be matched possibly multiple times, while B is the entire
2992 rest of the pattern. Variable and state names reflect this convention.
2994 The states in the main switch are the union of ops and failure/success of
2995 substates associated with with that op. For example, IFMATCH is the op
2996 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2997 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2998 successfully matched A and IFMATCH_A_fail is a state saying that we have
2999 just failed to match A. Resume states always come in pairs. The backtrack
3000 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3001 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3002 on success or failure.
3004 The struct that holds a backtracking state is actually a big union, with
3005 one variant for each major type of op. The variable st points to the
3006 top-most backtrack struct. To make the code clearer, within each
3007 block of code we #define ST to alias the relevant union.
3009 Here's a concrete example of a (vastly oversimplified) IFMATCH
3015 #define ST st->u.ifmatch
3017 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3018 ST.foo = ...; // some state we wish to save
3020 // push a yes backtrack state with a resume value of
3021 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3023 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3026 case IFMATCH_A: // we have successfully executed A; now continue with B
3028 bar = ST.foo; // do something with the preserved value
3031 case IFMATCH_A_fail: // A failed, so the assertion failed
3032 ...; // do some housekeeping, then ...
3033 sayNO; // propagate the failure
3040 For any old-timers reading this who are familiar with the old recursive
3041 approach, the code above is equivalent to:
3043 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3052 ...; // do some housekeeping, then ...
3053 sayNO; // propagate the failure
3056 The topmost backtrack state, pointed to by st, is usually free. If you
3057 want to claim it, populate any ST.foo fields in it with values you wish to
3058 save, then do one of
3060 PUSH_STATE_GOTO(resume_state, node, newinput);
3061 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3063 which sets that backtrack state's resume value to 'resume_state', pushes a
3064 new free entry to the top of the backtrack stack, then goes to 'node'.
3065 On backtracking, the free slot is popped, and the saved state becomes the
3066 new free state. An ST.foo field in this new top state can be temporarily
3067 accessed to retrieve values, but once the main loop is re-entered, it
3068 becomes available for reuse.
3070 Note that the depth of the backtrack stack constantly increases during the
3071 left-to-right execution of the pattern, rather than going up and down with
3072 the pattern nesting. For example the stack is at its maximum at Z at the
3073 end of the pattern, rather than at X in the following:
3075 /(((X)+)+)+....(Y)+....Z/
3077 The only exceptions to this are lookahead/behind assertions and the cut,
3078 (?>A), which pop all the backtrack states associated with A before
3081 Backtrack state structs are allocated in slabs of about 4K in size.
3082 PL_regmatch_state and st always point to the currently active state,
3083 and PL_regmatch_slab points to the slab currently containing
3084 PL_regmatch_state. The first time regmatch() is called, the first slab is
3085 allocated, and is never freed until interpreter destruction. When the slab
3086 is full, a new one is allocated and chained to the end. At exit from
3087 regmatch(), slabs allocated since entry are freed.
3092 #define DEBUG_STATE_pp(pp) \
3094 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3095 PerlIO_printf(Perl_debug_log, \
3096 " %*s"pp" %s%s%s%s%s\n", \
3098 PL_reg_name[st->resume_state], \
3099 ((st==yes_state||st==mark_state) ? "[" : ""), \
3100 ((st==yes_state) ? "Y" : ""), \
3101 ((st==mark_state) ? "M" : ""), \
3102 ((st==yes_state||st==mark_state) ? "]" : "") \
3107 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3112 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3113 const char *start, const char *end, const char *blurb)
3115 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3117 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3122 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3123 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3125 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3126 start, end - start, 60);
3128 PerlIO_printf(Perl_debug_log,
3129 "%s%s REx%s %s against %s\n",
3130 PL_colors[4], blurb, PL_colors[5], s0, s1);
3132 if (utf8_target||utf8_pat)
3133 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3134 utf8_pat ? "pattern" : "",
3135 utf8_pat && utf8_target ? " and " : "",
3136 utf8_target ? "string" : ""
3142 S_dump_exec_pos(pTHX_ const char *locinput,
3143 const regnode *scan,
3144 const char *loc_regeol,
3145 const char *loc_bostr,
3146 const char *loc_reg_starttry,
3147 const bool utf8_target)
3149 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3150 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3151 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3152 /* The part of the string before starttry has one color
3153 (pref0_len chars), between starttry and current
3154 position another one (pref_len - pref0_len chars),
3155 after the current position the third one.
3156 We assume that pref0_len <= pref_len, otherwise we
3157 decrease pref0_len. */
3158 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3159 ? (5 + taill) - l : locinput - loc_bostr;
3162 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3164 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3166 pref0_len = pref_len - (locinput - loc_reg_starttry);
3167 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3168 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3169 ? (5 + taill) - pref_len : loc_regeol - locinput);
3170 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3174 if (pref0_len > pref_len)
3175 pref0_len = pref_len;
3177 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3179 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3180 (locinput - pref_len),pref0_len, 60, 4, 5);
3182 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3183 (locinput - pref_len + pref0_len),
3184 pref_len - pref0_len, 60, 2, 3);
3186 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3187 locinput, loc_regeol - locinput, 10, 0, 1);
3189 const STRLEN tlen=len0+len1+len2;
3190 PerlIO_printf(Perl_debug_log,
3191 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3192 (IV)(locinput - loc_bostr),
3195 (docolor ? "" : "> <"),
3197 (int)(tlen > 19 ? 0 : 19 - tlen),
3204 /* reg_check_named_buff_matched()
3205 * Checks to see if a named buffer has matched. The data array of
3206 * buffer numbers corresponding to the buffer is expected to reside
3207 * in the regexp->data->data array in the slot stored in the ARG() of
3208 * node involved. Note that this routine doesn't actually care about the
3209 * name, that information is not preserved from compilation to execution.
3210 * Returns the index of the leftmost defined buffer with the given name
3211 * or 0 if non of the buffers matched.
3214 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3217 RXi_GET_DECL(rex,rexi);
3218 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3219 I32 *nums=(I32*)SvPVX(sv_dat);
3221 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3223 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3224 if ((I32)rex->lastparen >= nums[n] &&
3225 rex->offs[nums[n]].end != -1)
3235 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3236 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3238 /* This function determines if there are one or two characters that match
3239 * the first character of the passed-in EXACTish node <text_node>, and if
3240 * so, returns them in the passed-in pointers.
3242 * If it determines that no possible character in the target string can
3243 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3244 * the first character in <text_node> requires UTF-8 to represent, and the
3245 * target string isn't in UTF-8.)
3247 * If there are more than two characters that could match the beginning of
3248 * <text_node>, or if more context is required to determine a match or not,
3249 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3251 * The motiviation behind this function is to allow the caller to set up
3252 * tight loops for matching. If <text_node> is of type EXACT, there is
3253 * only one possible character that can match its first character, and so
3254 * the situation is quite simple. But things get much more complicated if
3255 * folding is involved. It may be that the first character of an EXACTFish
3256 * node doesn't participate in any possible fold, e.g., punctuation, so it
3257 * can be matched only by itself. The vast majority of characters that are
3258 * in folds match just two things, their lower and upper-case equivalents.
3259 * But not all are like that; some have multiple possible matches, or match
3260 * sequences of more than one character. This function sorts all that out.
3262 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3263 * loop of trying to match A*, we know we can't exit where the thing
3264 * following it isn't a B. And something can't be a B unless it is the
3265 * beginning of B. By putting a quick test for that beginning in a tight
3266 * loop, we can rule out things that can't possibly be B without having to
3267 * break out of the loop, thus avoiding work. Similarly, if A is a single
3268 * character, we can make a tight loop matching A*, using the outputs of
3271 * If the target string to match isn't in UTF-8, and there aren't
3272 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3273 * the one or two possible octets (which are characters in this situation)
3274 * that can match. In all cases, if there is only one character that can
3275 * match, *<c1p> and *<c2p> will be identical.
3277 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3278 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3279 * can match the beginning of <text_node>. They should be declared with at
3280 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3281 * undefined what these contain.) If one or both of the buffers are
3282 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3283 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3284 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3285 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3286 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3288 const bool utf8_target = reginfo->is_utf8_target;
3290 UV c1 = CHRTEST_NOT_A_CP_1;
3291 UV c2 = CHRTEST_NOT_A_CP_2;
3292 bool use_chrtest_void = FALSE;
3293 const bool is_utf8_pat = reginfo->is_utf8_pat;
3295 /* Used when we have both utf8 input and utf8 output, to avoid converting
3296 * to/from code points */
3297 bool utf8_has_been_setup = FALSE;
3301 U8 *pat = (U8*)STRING(text_node);
3303 if (OP(text_node) == EXACT) {
3305 /* In an exact node, only one thing can be matched, that first
3306 * character. If both the pat and the target are UTF-8, we can just
3307 * copy the input to the output, avoiding finding the code point of
3312 else if (utf8_target) {
3313 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3314 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3315 utf8_has_been_setup = TRUE;
3318 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3321 else /* an EXACTFish node */
3323 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3324 pat + STR_LEN(text_node)))
3326 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3327 pat + STR_LEN(text_node))))
3329 /* Multi-character folds require more context to sort out. Also
3330 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3331 * handled outside this routine */
3332 use_chrtest_void = TRUE;
3334 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3335 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3337 /* Load the folds hash, if not already done */
3339 if (! PL_utf8_foldclosures) {
3340 if (! PL_utf8_tofold) {
3341 U8 dummy[UTF8_MAXBYTES+1];
3343 /* Force loading this by folding an above-Latin1 char */
3344 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3345 assert(PL_utf8_tofold); /* Verify that worked */
3347 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3350 /* The fold closures data structure is a hash with the keys being
3351 * the UTF-8 of every character that is folded to, like 'k', and
3352 * the values each an array of all code points that fold to its
3353 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3355 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3360 /* Not found in the hash, therefore there are no folds
3361 * containing it, so there is only a single character that
3365 else { /* Does participate in folds */
3366 AV* list = (AV*) *listp;
3367 if (av_len(list) != 1) {
3369 /* If there aren't exactly two folds to this, it is outside
3370 * the scope of this function */
3371 use_chrtest_void = TRUE;
3373 else { /* There are two. Get them */
3374 SV** c_p = av_fetch(list, 0, FALSE);
3376 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3380 c_p = av_fetch(list, 1, FALSE);
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3386 /* Folds that cross the 255/256 boundary are forbidden if
3387 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3388 * pattern character is above 256, and its only other match
3389 * is below 256, the only legal match will be to itself.
3390 * We have thrown away the original, so have to compute
3391 * which is the one above 255 */
3392 if ((c1 < 256) != (c2 < 256)) {
3393 if (OP(text_node) == EXACTFL
3394 || (OP(text_node) == EXACTFA
3395 && (isASCII(c1) || isASCII(c2))))
3408 else /* Here, c1 is < 255 */
3410 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3411 && OP(text_node) != EXACTFL
3412 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3414 /* Here, there could be something above Latin1 in the target which
3415 * folds to this character in the pattern. All such cases except
3416 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3417 * involved in their folds, so are outside the scope of this
3419 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3420 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3423 use_chrtest_void = TRUE;
3426 else { /* Here nothing above Latin1 can fold to the pattern character */
3427 switch (OP(text_node)) {
3429 case EXACTFL: /* /l rules */
3430 c2 = PL_fold_locale[c1];
3434 if (! utf8_target) { /* /d rules */
3439 /* /u rules for all these. This happens to work for
3440 * EXACTFA as nothing in Latin1 folds to ASCII */
3442 case EXACTFU_TRICKYFOLD:
3445 c2 = PL_fold_latin1[c1];
3449 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3450 assert(0); /* NOTREACHED */
3455 /* Here have figured things out. Set up the returns */
3456 if (use_chrtest_void) {
3457 *c2p = *c1p = CHRTEST_VOID;
3459 else if (utf8_target) {
3460 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3461 uvchr_to_utf8(c1_utf8, c1);
3462 uvchr_to_utf8(c2_utf8, c2);
3465 /* Invariants are stored in both the utf8 and byte outputs; Use
3466 * negative numbers otherwise for the byte ones. Make sure that the
3467 * byte ones are the same iff the utf8 ones are the same */
3468 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3469 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3472 ? CHRTEST_NOT_A_CP_1
3473 : CHRTEST_NOT_A_CP_2;
3475 else if (c1 > 255) {
3476 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3481 *c1p = *c2p = c2; /* c2 is the only representable value */
3483 else { /* c1 is representable; see about c2 */
3485 *c2p = (c2 < 256) ? c2 : c1;
3491 /* returns -1 on failure, $+[0] on success */
3493 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3495 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3499 const bool utf8_target = reginfo->is_utf8_target;
3500 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3501 REGEXP *rex_sv = reginfo->prog;
3502 regexp *rex = ReANY(rex_sv);
3503 RXi_GET_DECL(rex,rexi);
3504 /* the current state. This is a cached copy of PL_regmatch_state */
3506 /* cache heavy used fields of st in registers */
3509 U32 n = 0; /* general value; init to avoid compiler warning */
3510 I32 ln = 0; /* len or last; init to avoid compiler warning */
3511 char *locinput = startpos;
3512 char *pushinput; /* where to continue after a PUSH */
3513 I32 nextchr; /* is always set to UCHARAT(locinput) */
3515 bool result = 0; /* return value of S_regmatch */
3516 int depth = 0; /* depth of backtrack stack */
3517 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3518 const U32 max_nochange_depth =
3519 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3520 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3521 regmatch_state *yes_state = NULL; /* state to pop to on success of
3523 /* mark_state piggy backs on the yes_state logic so that when we unwind
3524 the stack on success we can update the mark_state as we go */
3525 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3526 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3527 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3529 bool no_final = 0; /* prevent failure from backtracking? */
3530 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3531 char *startpoint = locinput;
3532 SV *popmark = NULL; /* are we looking for a mark? */
3533 SV *sv_commit = NULL; /* last mark name seen in failure */
3534 SV *sv_yes_mark = NULL; /* last mark name we have seen
3535 during a successful match */
3536 U32 lastopen = 0; /* last open we saw */
3537 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3538 SV* const oreplsv = GvSV(PL_replgv);
3539 /* these three flags are set by various ops to signal information to
3540 * the very next op. They have a useful lifetime of exactly one loop
3541 * iteration, and are not preserved or restored by state pushes/pops
3543 bool sw = 0; /* the condition value in (?(cond)a|b) */
3544 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3545 int logical = 0; /* the following EVAL is:
3549 or the following IFMATCH/UNLESSM is:
3550 false: plain (?=foo)
3551 true: used as a condition: (?(?=foo))
3553 PAD* last_pad = NULL;
3555 I32 gimme = G_SCALAR;
3556 CV *caller_cv = NULL; /* who called us */
3557 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3558 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3559 U32 maxopenparen = 0; /* max '(' index seen so far */
3560 int to_complement; /* Invert the result? */
3561 _char_class_number classnum;
3562 bool is_utf8_pat = reginfo->is_utf8_pat;
3565 GET_RE_DEBUG_FLAGS_DECL;
3568 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3569 multicall_oldcatch = 0;
3570 multicall_cv = NULL;
3572 PERL_UNUSED_VAR(multicall_cop);
3573 PERL_UNUSED_VAR(newsp);
3576 PERL_ARGS_ASSERT_REGMATCH;
3578 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3579 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3582 st = PL_regmatch_state;
3584 /* Note that nextchr is a byte even in UTF */
3587 while (scan != NULL) {
3590 SV * const prop = sv_newmortal();
3591 regnode *rnext=regnext(scan);
3592 DUMP_EXEC_POS( locinput, scan, utf8_target );
3593 regprop(rex, prop, scan);
3595 PerlIO_printf(Perl_debug_log,
3596 "%3"IVdf":%*s%s(%"IVdf")\n",
3597 (IV)(scan - rexi->program), depth*2, "",
3599 (PL_regkind[OP(scan)] == END || !rnext) ?
3600 0 : (IV)(rnext - rexi->program));
3603 next = scan + NEXT_OFF(scan);
3606 state_num = OP(scan);
3612 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3614 switch (state_num) {
3615 case BOL: /* /^../ */
3616 if (locinput == reginfo->strbeg)
3620 case MBOL: /* /^../m */
3621 if (locinput == reginfo->strbeg ||
3622 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3628 case SBOL: /* /^../s */
3629 if (locinput == reginfo->strbeg)
3634 if (locinput == reginfo->ganch)
3638 case KEEPS: /* \K */
3639 /* update the startpoint */
3640 st->u.keeper.val = rex->offs[0].start;
3641 rex->offs[0].start = locinput - reginfo->strbeg;
3642 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3643 assert(0); /*NOTREACHED*/
3644 case KEEPS_next_fail:
3645 /* rollback the start point change */
3646 rex->offs[0].start = st->u.keeper.val;
3648 assert(0); /*NOTREACHED*/
3650 case EOL: /* /..$/ */
3653 case MEOL: /* /..$/m */
3654 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3658 case SEOL: /* /..$/s */
3660 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3662 if (reginfo->strend - locinput > 1)
3667 if (!NEXTCHR_IS_EOS)
3671 case SANY: /* /./s */
3674 goto increment_locinput;
3682 case REG_ANY: /* /./ */
3683 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3685 goto increment_locinput;
3689 #define ST st->u.trie
3690 case TRIEC: /* (ab|cd) with known charclass */
3691 /* In this case the charclass data is available inline so
3692 we can fail fast without a lot of extra overhead.
3694 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3696 PerlIO_printf(Perl_debug_log,
3697 "%*s %sfailed to match trie start class...%s\n",
3698 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3701 assert(0); /* NOTREACHED */
3704 case TRIE: /* (ab|cd) */
3705 /* the basic plan of execution of the trie is:
3706 * At the beginning, run though all the states, and
3707 * find the longest-matching word. Also remember the position
3708 * of the shortest matching word. For example, this pattern:
3711 * when matched against the string "abcde", will generate
3712 * accept states for all words except 3, with the longest
3713 * matching word being 4, and the shortest being 2 (with
3714 * the position being after char 1 of the string).
3716 * Then for each matching word, in word order (i.e. 1,2,4,5),
3717 * we run the remainder of the pattern; on each try setting
3718 * the current position to the character following the word,
3719 * returning to try the next word on failure.
3721 * We avoid having to build a list of words at runtime by
3722 * using a compile-time structure, wordinfo[].prev, which
3723 * gives, for each word, the previous accepting word (if any).
3724 * In the case above it would contain the mappings 1->2, 2->0,
3725 * 3->0, 4->5, 5->1. We can use this table to generate, from
3726 * the longest word (4 above), a list of all words, by
3727 * following the list of prev pointers; this gives us the
3728 * unordered list 4,5,1,2. Then given the current word we have
3729 * just tried, we can go through the list and find the
3730 * next-biggest word to try (so if we just failed on word 2,
3731 * the next in the list is 4).
3733 * Since at runtime we don't record the matching position in
3734 * the string for each word, we have to work that out for
3735 * each word we're about to process. The wordinfo table holds
3736 * the character length of each word; given that we recorded
3737 * at the start: the position of the shortest word and its
3738 * length in chars, we just need to move the pointer the
3739 * difference between the two char lengths. Depending on
3740 * Unicode status and folding, that's cheap or expensive.
3742 * This algorithm is optimised for the case where are only a
3743 * small number of accept states, i.e. 0,1, or maybe 2.
3744 * With lots of accepts states, and having to try all of them,
3745 * it becomes quadratic on number of accept states to find all
3750 /* what type of TRIE am I? (utf8 makes this contextual) */
3751 DECL_TRIE_TYPE(scan);
3753 /* what trie are we using right now */
3754 reg_trie_data * const trie
3755 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3756 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3757 U32 state = trie->startstate;
3760 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3762 if (trie->states[ state ].wordnum) {
3764 PerlIO_printf(Perl_debug_log,
3765 "%*s %smatched empty string...%s\n",
3766 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3772 PerlIO_printf(Perl_debug_log,
3773 "%*s %sfailed to match trie start class...%s\n",
3774 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3781 U8 *uc = ( U8* )locinput;
3785 U8 *uscan = (U8*)NULL;
3786 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3787 U32 charcount = 0; /* how many input chars we have matched */
3788 U32 accepted = 0; /* have we seen any accepting states? */
3790 ST.jump = trie->jump;
3793 ST.longfold = FALSE; /* char longer if folded => it's harder */
3796 /* fully traverse the TRIE; note the position of the
3797 shortest accept state and the wordnum of the longest
3800 while ( state && uc <= (U8*)(reginfo->strend) ) {
3801 U32 base = trie->states[ state ].trans.base;
3805 wordnum = trie->states[ state ].wordnum;
3807 if (wordnum) { /* it's an accept state */
3810 /* record first match position */
3812 ST.firstpos = (U8*)locinput;
3817 ST.firstchars = charcount;
3820 if (!ST.nextword || wordnum < ST.nextword)
3821 ST.nextword = wordnum;
3822 ST.topword = wordnum;
3825 DEBUG_TRIE_EXECUTE_r({
3826 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3827 PerlIO_printf( Perl_debug_log,
3828 "%*s %sState: %4"UVxf" Accepted: %c ",
3829 2+depth * 2, "", PL_colors[4],
3830 (UV)state, (accepted ? 'Y' : 'N'));
3833 /* read a char and goto next state */
3834 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3836 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3837 uscan, len, uvc, charid, foldlen,
3844 base + charid - 1 - trie->uniquecharcount)) >= 0)
3846 && ((U32)offset < trie->lasttrans)
3847 && trie->trans[offset].check == state)
3849 state = trie->trans[offset].next;
3860 DEBUG_TRIE_EXECUTE_r(
3861 PerlIO_printf( Perl_debug_log,
3862 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3863 charid, uvc, (UV)state, PL_colors[5] );
3869 /* calculate total number of accept states */
3874 w = trie->wordinfo[w].prev;
3877 ST.accepted = accepted;
3881 PerlIO_printf( Perl_debug_log,
3882 "%*s %sgot %"IVdf" possible matches%s\n",
3883 REPORT_CODE_OFF + depth * 2, "",
3884 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3886 goto trie_first_try; /* jump into the fail handler */
3888 assert(0); /* NOTREACHED */
3890 case TRIE_next_fail: /* we failed - try next alternative */
3894 REGCP_UNWIND(ST.cp);
3895 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3897 if (!--ST.accepted) {
3899 PerlIO_printf( Perl_debug_log,
3900 "%*s %sTRIE failed...%s\n",
3901 REPORT_CODE_OFF+depth*2, "",
3908 /* Find next-highest word to process. Note that this code
3909 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3912 U16 const nextword = ST.nextword;
3913 reg_trie_wordinfo * const wordinfo
3914 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3915 for (word=ST.topword; word; word=wordinfo[word].prev) {
3916 if (word > nextword && (!min || word < min))
3929 ST.lastparen = rex->lastparen;