5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define RF_tainted 1 /* tainted information used? e.g. locale */
98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define STATIC static
104 /* Valid for non-utf8 strings: avoids the reginclass
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
107 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
117 #define HOPc(pos,off) \
118 (char *)(PL_reg_match_utf8 \
119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 #define HOPBACKc(pos, off) \
122 (char*)(PL_reg_match_utf8\
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
128 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
133 #define NEXTCHR_IS_EOS (nextchr < 0)
135 #define SET_nextchr \
136 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
138 #define SET_locinput(p) \
143 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
145 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
146 ENTER; save_re_context(); \
147 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
148 1, 0, NULL, &flags); \
153 /* If in debug mode, we test that a known character properly matches */
155 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
157 utf8_char_in_property) \
158 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
159 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
161 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
163 utf8_char_in_property) \
164 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
167 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
168 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
169 swash_property_names[_CC_WORDCHAR], \
170 GREEK_SMALL_LETTER_IOTA_UTF8)
172 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
174 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
175 "_X_regular_begin", \
176 GREEK_SMALL_LETTER_IOTA_UTF8); \
177 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
179 COMBINING_GRAVE_ACCENT_UTF8); \
182 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
183 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
185 /* for use after a quantifier and before an EXACT-like node -- japhy */
186 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
188 * NOTE that *nothing* that affects backtracking should be in here, specifically
189 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
190 * node that is in between two EXACT like nodes when ascertaining what the required
191 * "follow" character is. This should probably be moved to regex compile time
192 * although it may be done at run time beause of the REF possibility - more
193 * investigation required. -- demerphq
195 #define JUMPABLE(rn) ( \
197 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
199 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
200 OP(rn) == PLUS || OP(rn) == MINMOD || \
202 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
204 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
206 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
209 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
210 we don't need this definition. */
211 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
212 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
213 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
216 /* ... so we use this as its faster. */
217 #define IS_TEXT(rn) ( OP(rn)==EXACT )
218 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
219 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
220 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
225 Search for mandatory following text node; for lookahead, the text must
226 follow but for lookbehind (rn->flags != 0) we skip to the next step.
228 #define FIND_NEXT_IMPT(rn) STMT_START { \
229 while (JUMPABLE(rn)) { \
230 const OPCODE type = OP(rn); \
231 if (type == SUSPEND || PL_regkind[type] == CURLY) \
232 rn = NEXTOPER(NEXTOPER(rn)); \
233 else if (type == PLUS) \
235 else if (type == IFMATCH) \
236 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
237 else rn += NEXT_OFF(rn); \
241 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
242 * These are for the pre-composed Hangul syllables, which are all in a
243 * contiguous block and arranged there in such a way so as to facilitate
244 * alorithmic determination of their characteristics. As such, they don't need
245 * a swash, but can be determined by simple arithmetic. Almost all are
246 * GCB=LVT, but every 28th one is a GCB=LV */
247 #define SBASE 0xAC00 /* Start of block */
248 #define SCount 11172 /* Length of block */
251 static void restore_pos(pTHX_ void *arg);
253 #define REGCP_PAREN_ELEMS 3
254 #define REGCP_OTHER_ELEMS 3
255 #define REGCP_FRAME_ELEMS 1
256 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
257 * are needed for the regexp context stack bookkeeping. */
260 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
263 const int retval = PL_savestack_ix;
264 const int paren_elems_to_push =
265 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
266 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
267 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
269 GET_RE_DEBUG_FLAGS_DECL;
271 PERL_ARGS_ASSERT_REGCPPUSH;
273 if (paren_elems_to_push < 0)
274 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
275 paren_elems_to_push);
277 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
278 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
279 " out of range (%lu-%ld)",
281 (unsigned long)maxopenparen,
284 SSGROW(total_elems + REGCP_FRAME_ELEMS);
287 if ((int)maxopenparen > (int)parenfloor)
288 PerlIO_printf(Perl_debug_log,
289 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
294 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
295 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
296 SSPUSHINT(rex->offs[p].end);
297 SSPUSHINT(rex->offs[p].start);
298 SSPUSHINT(rex->offs[p].start_tmp);
299 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
300 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
302 (IV)rex->offs[p].start,
303 (IV)rex->offs[p].start_tmp,
307 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
308 SSPUSHINT(maxopenparen);
309 SSPUSHINT(rex->lastparen);
310 SSPUSHINT(rex->lastcloseparen);
311 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
316 /* These are needed since we do not localize EVAL nodes: */
317 #define REGCP_SET(cp) \
319 PerlIO_printf(Perl_debug_log, \
320 " Setting an EVAL scope, savestack=%"IVdf"\n", \
321 (IV)PL_savestack_ix)); \
324 #define REGCP_UNWIND(cp) \
326 if (cp != PL_savestack_ix) \
327 PerlIO_printf(Perl_debug_log, \
328 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
329 (IV)(cp), (IV)PL_savestack_ix)); \
332 #define UNWIND_PAREN(lp, lcp) \
333 for (n = rex->lastparen; n > lp; n--) \
334 rex->offs[n].end = -1; \
335 rex->lastparen = n; \
336 rex->lastcloseparen = lcp;
340 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
345 GET_RE_DEBUG_FLAGS_DECL;
347 PERL_ARGS_ASSERT_REGCPPOP;
349 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
351 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
352 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
353 rex->lastcloseparen = SSPOPINT;
354 rex->lastparen = SSPOPINT;
355 *maxopenparen_p = SSPOPINT;
357 i -= REGCP_OTHER_ELEMS;
358 /* Now restore the parentheses context. */
360 if (i || rex->lastparen + 1 <= rex->nparens)
361 PerlIO_printf(Perl_debug_log,
362 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
367 paren = *maxopenparen_p;
368 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
370 rex->offs[paren].start_tmp = SSPOPINT;
371 rex->offs[paren].start = SSPOPINT;
373 if (paren <= rex->lastparen)
374 rex->offs[paren].end = tmps;
375 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
376 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
378 (IV)rex->offs[paren].start,
379 (IV)rex->offs[paren].start_tmp,
380 (IV)rex->offs[paren].end,
381 (paren > rex->lastparen ? "(skipped)" : ""));
386 /* It would seem that the similar code in regtry()
387 * already takes care of this, and in fact it is in
388 * a better location to since this code can #if 0-ed out
389 * but the code in regtry() is needed or otherwise tests
390 * requiring null fields (pat.t#187 and split.t#{13,14}
391 * (as of patchlevel 7877) will fail. Then again,
392 * this code seems to be necessary or otherwise
393 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
394 * --jhi updated by dapm */
395 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
396 if (i > *maxopenparen_p)
397 rex->offs[i].start = -1;
398 rex->offs[i].end = -1;
399 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
400 " \\%"UVuf": %s ..-1 undeffing\n",
402 (i > *maxopenparen_p) ? "-1" : " "
408 /* restore the parens and associated vars at savestack position ix,
409 * but without popping the stack */
412 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
414 I32 tmpix = PL_savestack_ix;
415 PL_savestack_ix = ix;
416 regcppop(rex, maxopenparen_p);
417 PL_savestack_ix = tmpix;
420 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
423 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
425 /* Returns a boolean as to whether or not 'character' is a member of the
426 * Posix character class given by 'classnum' that should be equivalent to a
427 * value in the typedef '_char_class_number'.
429 * Ideally this could be replaced by a just an array of function pointers
430 * to the C library functions that implement the macros this calls.
431 * However, to compile, the precise function signatures are required, and
432 * these may vary from platform to to platform. To avoid having to figure
433 * out what those all are on each platform, I (khw) am using this method,
434 * which adds an extra layer of function call overhead (unless the C
435 * optimizer strips it away). But we don't particularly care about
436 * performance with locales anyway. */
438 switch ((_char_class_number) classnum) {
439 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
440 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
441 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
442 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
443 case _CC_ENUM_LOWER: return isLOWER_LC(character);
444 case _CC_ENUM_PRINT: return isPRINT_LC(character);
445 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
446 case _CC_ENUM_UPPER: return isUPPER_LC(character);
447 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
448 case _CC_ENUM_SPACE: return isSPACE_LC(character);
449 case _CC_ENUM_BLANK: return isBLANK_LC(character);
450 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
453 case _CC_ENUM_ASCII: return isASCII_LC(character);
454 default: /* VERTSPACE should never occur in locales */
455 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
458 assert(0); /* NOTREACHED */
463 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
465 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
466 * 'character' is a member of the Posix character class given by 'classnum'
467 * that should be equivalent to a value in the typedef
468 * '_char_class_number'.
470 * This just calls isFOO_lc on the code point for the character if it is in
471 * the range 0-255. Outside that range, all characters avoid Unicode
472 * rules, ignoring any locale. So use the Unicode function if this class
473 * requires a swash, and use the Unicode macro otherwise. */
475 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
477 if (UTF8_IS_INVARIANT(*character)) {
478 return isFOO_lc(classnum, *character);
480 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
481 return isFOO_lc(classnum,
482 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
485 if (classnum < _FIRST_NON_SWASH_CC) {
487 /* Initialize the swash unless done already */
488 if (! PL_utf8_swash_ptrs[classnum]) {
489 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
490 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
491 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
494 return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
497 switch ((_char_class_number) classnum) {
499 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
501 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
502 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
503 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
504 default: return 0; /* Things like CNTRL are always
508 assert(0); /* NOTREACHED */
513 * pregexec and friends
516 #ifndef PERL_IN_XSUB_RE
518 - pregexec - match a regexp against a string
521 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
522 char *strbeg, I32 minend, SV *screamer, U32 nosave)
523 /* stringarg: the point in the string at which to begin matching */
524 /* strend: pointer to null at end of string */
525 /* strbeg: real beginning of string */
526 /* minend: end of match must be >= minend bytes after stringarg. */
527 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
528 * itself is accessed via the pointers above */
529 /* nosave: For optimizations. */
531 PERL_ARGS_ASSERT_PREGEXEC;
534 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
535 nosave ? 0 : REXEC_COPY_STR);
540 * Need to implement the following flags for reg_anch:
542 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
544 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
545 * INTUIT_AUTORITATIVE_ML
546 * INTUIT_ONCE_NOML - Intuit can match in one location only.
549 * Another flag for this function: SECOND_TIME (so that float substrs
550 * with giant delta may be not rechecked).
553 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
555 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
556 Otherwise, only SvCUR(sv) is used to get strbeg. */
558 /* XXXX We assume that strpos is strbeg unless sv. */
560 /* XXXX Some places assume that there is a fixed substring.
561 An update may be needed if optimizer marks as "INTUITable"
562 RExen without fixed substrings. Similarly, it is assumed that
563 lengths of all the strings are no more than minlen, thus they
564 cannot come from lookahead.
565 (Or minlen should take into account lookahead.)
566 NOTE: Some of this comment is not correct. minlen does now take account
567 of lookahead/behind. Further research is required. -- demerphq
571 /* A failure to find a constant substring means that there is no need to make
572 an expensive call to REx engine, thus we celebrate a failure. Similarly,
573 finding a substring too deep into the string means that fewer calls to
574 regtry() should be needed.
576 REx compiler's optimizer found 4 possible hints:
577 a) Anchored substring;
579 c) Whether we are anchored (beginning-of-line or \G);
580 d) First node (of those at offset 0) which may distinguish positions;
581 We use a)b)d) and multiline-part of c), and try to find a position in the
582 string which does not contradict any of them.
585 /* Most of decisions we do here should have been done at compile time.
586 The nodes of the REx which we used for the search should have been
587 deleted from the finite automaton. */
590 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
591 char *strend, const U32 flags, re_scream_pos_data *data)
594 struct regexp *const prog = ReANY(rx);
596 /* Should be nonnegative! */
602 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
604 char *other_last = NULL; /* other substr checked before this */
605 char *check_at = NULL; /* check substr found at this pos */
606 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
607 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
608 RXi_GET_DECL(prog,progi);
611 const char * const i_strpos = strpos;
613 GET_RE_DEBUG_FLAGS_DECL;
615 PERL_ARGS_ASSERT_RE_INTUIT_START;
616 PERL_UNUSED_ARG(flags);
617 PERL_UNUSED_ARG(data);
619 RX_MATCH_UTF8_set(rx,utf8_target);
621 is_utf8_pat = cBOOL(RX_UTF8(rx));
624 debug_start_match(rx, utf8_target, strpos, strend,
625 sv ? "Guessing start of match in sv for"
626 : "Guessing start of match in string for");
629 /* CHR_DIST() would be more correct here but it makes things slow. */
630 if (prog->minlen > strend - strpos) {
631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
632 "String too short... [re_intuit_start]\n"));
636 /* XXX we need to pass strbeg as a separate arg: the following is
637 * guesswork and can be wrong... */
638 if (sv && SvPOK(sv)) {
639 char * p = SvPVX(sv);
640 STRLEN cur = SvCUR(sv);
641 if (p <= strpos && strpos < p + cur) {
643 assert(p <= strend && strend <= p + cur);
646 strbeg = strend - cur;
653 if (!prog->check_utf8 && prog->check_substr)
654 to_utf8_substr(prog);
655 check = prog->check_utf8;
657 if (!prog->check_substr && prog->check_utf8) {
658 if (! to_byte_substr(prog)) {
659 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
662 check = prog->check_substr;
664 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
665 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
666 || ( (prog->extflags & RXf_ANCH_BOL)
667 && !multiline ) ); /* Check after \n? */
670 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
671 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
672 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
674 && (strpos != strbeg)) {
675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
678 if (prog->check_offset_min == prog->check_offset_max
679 && !(prog->extflags & RXf_CANY_SEEN)
680 && ! multiline) /* /m can cause \n's to match that aren't
681 accounted for in the string max length.
682 See [perl #115242] */
684 /* Substring at constant offset from beg-of-str... */
687 s = HOP3c(strpos, prog->check_offset_min, strend);
690 slen = SvCUR(check); /* >= 1 */
692 if ( strend - s > slen || strend - s < slen - 1
693 || (strend - s == slen && strend[-1] != '\n')) {
694 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
697 /* Now should match s[0..slen-2] */
699 if (slen && (*SvPVX_const(check) != *s
701 && memNE(SvPVX_const(check), s, slen)))) {
703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
707 else if (*SvPVX_const(check) != *s
708 || ((slen = SvCUR(check)) > 1
709 && memNE(SvPVX_const(check), s, slen)))
712 goto success_at_start;
715 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
717 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
718 end_shift = prog->check_end_shift;
721 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
722 - (SvTAIL(check) != 0);
723 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
725 if (end_shift < eshift)
729 else { /* Can match at random position */
732 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
733 end_shift = prog->check_end_shift;
735 /* end shift should be non negative here */
738 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
740 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
741 (IV)end_shift, RX_PRECOMP(prog));
745 /* Find a possible match in the region s..strend by looking for
746 the "check" substring in the region corrected by start/end_shift. */
749 I32 srch_start_shift = start_shift;
750 I32 srch_end_shift = end_shift;
753 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
754 srch_end_shift -= ((strbeg - s) - srch_start_shift);
755 srch_start_shift = strbeg - s;
757 DEBUG_OPTIMISE_MORE_r({
758 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
759 (IV)prog->check_offset_min,
760 (IV)srch_start_shift,
762 (IV)prog->check_end_shift);
765 if (prog->extflags & RXf_CANY_SEEN) {
766 start_point= (U8*)(s + srch_start_shift);
767 end_point= (U8*)(strend - srch_end_shift);
769 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
770 end_point= HOP3(strend, -srch_end_shift, strbeg);
772 DEBUG_OPTIMISE_MORE_r({
773 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
774 (int)(end_point - start_point),
775 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
779 s = fbm_instr( start_point, end_point,
780 check, multiline ? FBMrf_MULTILINE : 0);
782 /* Update the count-of-usability, remove useless subpatterns,
786 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
787 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
788 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
789 (s ? "Found" : "Did not find"),
790 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
791 ? "anchored" : "floating"),
794 (s ? " at offset " : "...\n") );
799 /* Finish the diagnostic message */
800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
802 /* XXX dmq: first branch is for positive lookbehind...
803 Our check string is offset from the beginning of the pattern.
804 So we need to do any stclass tests offset forward from that
813 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
814 Start with the other substr.
815 XXXX no SCREAM optimization yet - and a very coarse implementation
816 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
817 *always* match. Probably should be marked during compile...
818 Probably it is right to do no SCREAM here...
821 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
822 : (prog->float_substr && prog->anchored_substr))
824 /* Take into account the "other" substring. */
825 /* XXXX May be hopelessly wrong for UTF... */
828 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
831 char * const last = HOP3c(s, -start_shift, strbeg);
833 char * const saved_s = s;
836 t = s - prog->check_offset_max;
837 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
839 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
844 t = HOP3c(t, prog->anchored_offset, strend);
845 if (t < other_last) /* These positions already checked */
847 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
850 /* XXXX It is not documented what units *_offsets are in.
851 We assume bytes, but this is clearly wrong.
852 Meaning this code needs to be carefully reviewed for errors.
856 /* On end-of-str: see comment below. */
857 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
858 if (must == &PL_sv_undef) {
860 DEBUG_r(must = prog->anchored_utf8); /* for debug */
865 HOP3(HOP3(last1, prog->anchored_offset, strend)
866 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
868 multiline ? FBMrf_MULTILINE : 0
871 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
872 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
873 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
874 (s ? "Found" : "Contradicts"),
875 quoted, RE_SV_TAIL(must));
880 if (last1 >= last2) {
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
882 ", giving up...\n"));
885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
886 ", trying floating at offset %ld...\n",
887 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
888 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
889 s = HOP3c(last, 1, strend);
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
894 (long)(s - i_strpos)));
895 t = HOP3c(s, -prog->anchored_offset, strbeg);
896 other_last = HOP3c(s, 1, strend);
904 else { /* Take into account the floating substring. */
906 char * const saved_s = s;
909 t = HOP3c(s, -start_shift, strbeg);
911 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
912 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
913 last = HOP3c(t, prog->float_max_offset, strend);
914 s = HOP3c(t, prog->float_min_offset, strend);
917 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
918 must = utf8_target ? prog->float_utf8 : prog->float_substr;
919 /* fbm_instr() takes into account exact value of end-of-str
920 if the check is SvTAIL(ed). Since false positives are OK,
921 and end-of-str is not later than strend we are OK. */
922 if (must == &PL_sv_undef) {
924 DEBUG_r(must = prog->float_utf8); /* for debug message */
927 s = fbm_instr((unsigned char*)s,
928 (unsigned char*)last + SvCUR(must)
930 must, multiline ? FBMrf_MULTILINE : 0);
932 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
933 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
934 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
935 (s ? "Found" : "Contradicts"),
936 quoted, RE_SV_TAIL(must));
940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
941 ", giving up...\n"));
944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
945 ", trying anchored starting at offset %ld...\n",
946 (long)(saved_s + 1 - i_strpos)));
948 s = HOP3c(t, 1, strend);
952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
953 (long)(s - i_strpos)));
954 other_last = s; /* Fix this later. --Hugo */
964 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
966 DEBUG_OPTIMISE_MORE_r(
967 PerlIO_printf(Perl_debug_log,
968 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
969 (IV)prog->check_offset_min,
970 (IV)prog->check_offset_max,
978 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
980 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
983 /* Fixed substring is found far enough so that the match
984 cannot start at strpos. */
986 if (ml_anch && t[-1] != '\n') {
987 /* Eventually fbm_*() should handle this, but often
988 anchored_offset is not 0, so this check will not be wasted. */
989 /* XXXX In the code below we prefer to look for "^" even in
990 presence of anchored substrings. And we search even
991 beyond the found float position. These pessimizations
992 are historical artefacts only. */
994 while (t < strend - prog->minlen) {
996 if (t < check_at - prog->check_offset_min) {
997 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
998 /* Since we moved from the found position,
999 we definitely contradict the found anchored
1000 substr. Due to the above check we do not
1001 contradict "check" substr.
1002 Thus we can arrive here only if check substr
1003 is float. Redo checking for "other"=="fixed".
1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1007 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1008 goto do_other_anchored;
1010 /* We don't contradict the found floating substring. */
1011 /* XXXX Why not check for STCLASS? */
1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1014 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1017 /* Position contradicts check-string */
1018 /* XXXX probably better to look for check-string
1019 than for "\n", so one should lower the limit for t? */
1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1021 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1022 other_last = strpos = s = t + 1;
1027 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1028 PL_colors[0], PL_colors[1]));
1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1033 PL_colors[0], PL_colors[1]));
1037 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1040 /* The found string does not prohibit matching at strpos,
1041 - no optimization of calling REx engine can be performed,
1042 unless it was an MBOL and we are not after MBOL,
1043 or a future STCLASS check will fail this. */
1045 /* Even in this situation we may use MBOL flag if strpos is offset
1046 wrt the start of the string. */
1047 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1048 && (strpos != strbeg) && strpos[-1] != '\n'
1049 /* May be due to an implicit anchor of m{.*foo} */
1050 && !(prog->intflags & PREGf_IMPLICIT))
1055 DEBUG_EXECUTE_r( if (ml_anch)
1056 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1057 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1060 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1062 prog->check_utf8 /* Could be deleted already */
1063 && --BmUSEFUL(prog->check_utf8) < 0
1064 && (prog->check_utf8 == prog->float_utf8)
1066 prog->check_substr /* Could be deleted already */
1067 && --BmUSEFUL(prog->check_substr) < 0
1068 && (prog->check_substr == prog->float_substr)
1071 /* If flags & SOMETHING - do not do it many times on the same match */
1072 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1073 /* XXX Does the destruction order has to change with utf8_target? */
1074 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1075 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1076 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1077 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1078 check = NULL; /* abort */
1080 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1081 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1082 if (prog->intflags & PREGf_IMPLICIT)
1083 prog->extflags &= ~RXf_ANCH_MBOL;
1084 /* XXXX This is a remnant of the old implementation. It
1085 looks wasteful, since now INTUIT can use many
1086 other heuristics. */
1087 prog->extflags &= ~RXf_USE_INTUIT;
1088 /* XXXX What other flags might need to be cleared in this branch? */
1094 /* Last resort... */
1095 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1096 /* trie stclasses are too expensive to use here, we are better off to
1097 leave it to regmatch itself */
1098 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1099 /* minlen == 0 is possible if regstclass is \b or \B,
1100 and the fixed substr is ''$.
1101 Since minlen is already taken into account, s+1 is before strend;
1102 accidentally, minlen >= 1 guaranties no false positives at s + 1
1103 even for \b or \B. But (minlen? 1 : 0) below assumes that
1104 regstclass does not come from lookahead... */
1105 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1106 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1107 const U8* const str = (U8*)STRING(progi->regstclass);
1108 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1109 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1112 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1113 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1114 else if (prog->float_substr || prog->float_utf8)
1115 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1119 if (checked_upto < s)
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1122 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1125 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1131 const char *what = NULL;
1133 if (endpos == strend) {
1134 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1135 "Could not match STCLASS...\n") );
1138 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1139 "This position contradicts STCLASS...\n") );
1140 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1142 checked_upto = HOPBACKc(endpos, start_shift);
1143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1144 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1145 /* Contradict one of substrings */
1146 if (prog->anchored_substr || prog->anchored_utf8) {
1147 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1148 DEBUG_EXECUTE_r( what = "anchored" );
1150 s = HOP3c(t, 1, strend);
1151 if (s + start_shift + end_shift > strend) {
1152 /* XXXX Should be taken into account earlier? */
1153 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1154 "Could not match STCLASS...\n") );
1159 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1160 "Looking for %s substr starting at offset %ld...\n",
1161 what, (long)(s + start_shift - i_strpos)) );
1164 /* Have both, check_string is floating */
1165 if (t + start_shift >= check_at) /* Contradicts floating=check */
1166 goto retry_floating_check;
1167 /* Recheck anchored substring, but not floating... */
1171 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1172 "Looking for anchored substr starting at offset %ld...\n",
1173 (long)(other_last - i_strpos)) );
1174 goto do_other_anchored;
1176 /* Another way we could have checked stclass at the
1177 current position only: */
1182 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1183 "Looking for /%s^%s/m starting at offset %ld...\n",
1184 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1187 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1189 /* Check is floating substring. */
1190 retry_floating_check:
1191 t = check_at - start_shift;
1192 DEBUG_EXECUTE_r( what = "floating" );
1193 goto hop_and_restart;
1196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1197 "By STCLASS: moving %ld --> %ld\n",
1198 (long)(t - i_strpos), (long)(s - i_strpos))
1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1203 "Does not contradict STCLASS...\n");
1208 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1209 PL_colors[4], (check ? "Guessed" : "Giving up"),
1210 PL_colors[5], (long)(s - i_strpos)) );
1213 fail_finish: /* Substring not found */
1214 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1215 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1218 PL_colors[4], PL_colors[5]));
1222 #define DECL_TRIE_TYPE(scan) \
1223 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1224 trie_type = ((scan->flags == EXACT) \
1225 ? (utf8_target ? trie_utf8 : trie_plain) \
1226 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1228 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1229 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1231 switch (trie_type) { \
1232 case trie_utf8_fold: \
1233 if ( foldlen>0 ) { \
1234 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1239 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1240 len = UTF8SKIP(uc); \
1241 skiplen = UNISKIP( uvc ); \
1242 foldlen -= skiplen; \
1243 uscan = foldbuf + skiplen; \
1246 case trie_latin_utf8_fold: \
1247 if ( foldlen>0 ) { \
1248 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1254 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1255 skiplen = UNISKIP( uvc ); \
1256 foldlen -= skiplen; \
1257 uscan = foldbuf + skiplen; \
1261 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1268 charid = trie->charmap[ uvc ]; \
1272 if (widecharmap) { \
1273 SV** const svpp = hv_fetch(widecharmap, \
1274 (char*)&uvc, sizeof(UV), 0); \
1276 charid = (U16)SvIV(*svpp); \
1281 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1285 && (ln == 1 || folder(s, pat_string, ln)) \
1286 && (!reginfo || regtry(reginfo, &s)) ) \
1292 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1294 while (s < strend) { \
1300 #define REXEC_FBC_SCAN(CoDe) \
1302 while (s < strend) { \
1308 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1309 REXEC_FBC_UTF8_SCAN( \
1311 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1320 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1323 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1332 #define REXEC_FBC_TRYIT \
1333 if ((!reginfo || regtry(reginfo, &s))) \
1336 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1337 if (utf8_target) { \
1338 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1341 REXEC_FBC_CLASS_SCAN(CoNd); \
1344 #define DUMP_EXEC_POS(li,s,doutf8) \
1345 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1348 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1349 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1350 tmp = TEST_NON_UTF8(tmp); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1361 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1362 if (s == PL_bostr) { \
1366 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1367 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1370 LOAD_UTF8_CHARCLASS_ALNUM(); \
1371 REXEC_FBC_UTF8_SCAN( \
1372 if (tmp == ! (TeSt2_UtF8)) { \
1381 /* The only difference between the BOUND and NBOUND cases is that
1382 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1383 * NBOUND. This is accomplished by passing it in either the if or else clause,
1384 * with the other one being empty */
1385 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1386 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1388 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1389 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1391 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1392 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1394 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1395 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1398 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1399 * be passed in completely with the variable name being tested, which isn't
1400 * such a clean interface, but this is easier to read than it was before. We
1401 * are looking for the boundary (or non-boundary between a word and non-word
1402 * character. The utf8 and non-utf8 cases have the same logic, but the details
1403 * must be different. Find the "wordness" of the character just prior to this
1404 * one, and compare it with the wordness of this one. If they differ, we have
1405 * a boundary. At the beginning of the string, pretend that the previous
1406 * character was a new-line */
1407 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1408 if (utf8_target) { \
1411 else { /* Not utf8 */ \
1412 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1413 tmp = TEST_NON_UTF8(tmp); \
1415 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1424 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1427 /* We know what class REx starts with. Try to find this position... */
1428 /* if reginfo is NULL, its a dryrun */
1429 /* annoyingly all the vars in this routine have different names from their counterparts
1430 in regmatch. /grrr */
1433 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1434 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
1437 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1438 char *pat_string; /* The pattern's exactish string */
1439 char *pat_end; /* ptr to end char of pat_string */
1440 re_fold_t folder; /* Function for computing non-utf8 folds */
1441 const U8 *fold_array; /* array for folding ords < 256 */
1447 I32 tmp = 1; /* Scratch variable? */
1448 const bool utf8_target = PL_reg_match_utf8;
1449 UV utf8_fold_flags = 0;
1450 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1451 with a result inverts that result, as 0^1 =
1453 _char_class_number classnum;
1455 RXi_GET_DECL(prog,progi);
1457 PERL_ARGS_ASSERT_FIND_BYCLASS;
1459 /* We know what class it must start with. */
1463 REXEC_FBC_UTF8_CLASS_SCAN(
1464 reginclass(prog, c, (U8*)s, utf8_target));
1467 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1472 if (tmp && (!reginfo || regtry(reginfo, &s)))
1480 if (is_utf8_pat || utf8_target) {
1481 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1482 goto do_exactf_utf8;
1484 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1485 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1486 goto do_exactf_non_utf8; /* isn't dealt with by these */
1491 /* regcomp.c already folded this if pattern is in UTF-8 */
1492 utf8_fold_flags = 0;
1493 goto do_exactf_utf8;
1495 fold_array = PL_fold;
1497 goto do_exactf_non_utf8;
1500 if (is_utf8_pat || utf8_target) {
1501 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1502 goto do_exactf_utf8;
1504 fold_array = PL_fold_locale;
1505 folder = foldEQ_locale;
1506 goto do_exactf_non_utf8;
1510 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512 goto do_exactf_utf8;
1514 case EXACTFU_TRICKYFOLD:
1516 if (is_utf8_pat || utf8_target) {
1517 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1518 goto do_exactf_utf8;
1521 /* Any 'ss' in the pattern should have been replaced by regcomp,
1522 * so we don't have to worry here about this single special case
1523 * in the Latin1 range */
1524 fold_array = PL_fold_latin1;
1525 folder = foldEQ_latin1;
1529 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1530 are no glitches with fold-length differences
1531 between the target string and pattern */
1533 /* The idea in the non-utf8 EXACTF* cases is to first find the
1534 * first character of the EXACTF* node and then, if necessary,
1535 * case-insensitively compare the full text of the node. c1 is the
1536 * first character. c2 is its fold. This logic will not work for
1537 * Unicode semantics and the german sharp ss, which hence should
1538 * not be compiled into a node that gets here. */
1539 pat_string = STRING(c);
1540 ln = STR_LEN(c); /* length to match in octets/bytes */
1542 /* We know that we have to match at least 'ln' bytes (which is the
1543 * same as characters, since not utf8). If we have to match 3
1544 * characters, and there are only 2 availabe, we know without
1545 * trying that it will fail; so don't start a match past the
1546 * required minimum number from the far end */
1547 e = HOP3c(strend, -((I32)ln), s);
1549 if (!reginfo && e < s) {
1550 e = s; /* Due to minlen logic of intuit() */
1554 c2 = fold_array[c1];
1555 if (c1 == c2) { /* If char and fold are the same */
1556 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1567 /* If one of the operands is in utf8, we can't use the simpler folding
1568 * above, due to the fact that many different characters can have the
1569 * same fold, or portion of a fold, or different- length fold */
1570 pat_string = STRING(c);
1571 ln = STR_LEN(c); /* length to match in octets/bytes */
1572 pat_end = pat_string + ln;
1573 lnc = is_utf8_pat /* length to match in characters */
1574 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1577 /* We have 'lnc' characters to match in the pattern, but because of
1578 * multi-character folding, each character in the target can match
1579 * up to 3 characters (Unicode guarantees it will never exceed
1580 * this) if it is utf8-encoded; and up to 2 if not (based on the
1581 * fact that the Latin 1 folds are already determined, and the
1582 * only multi-char fold in that range is the sharp-s folding to
1583 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1584 * string character. Adjust lnc accordingly, rounding up, so that
1585 * if we need to match at least 4+1/3 chars, that really is 5. */
1586 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1587 lnc = (lnc + expansion - 1) / expansion;
1589 /* As in the non-UTF8 case, if we have to match 3 characters, and
1590 * only 2 are left, it's guaranteed to fail, so don't start a
1591 * match that would require us to go beyond the end of the string
1593 e = HOP3c(strend, -((I32)lnc), s);
1595 if (!reginfo && e < s) {
1596 e = s; /* Due to minlen logic of intuit() */
1599 /* XXX Note that we could recalculate e to stop the loop earlier,
1600 * as the worst case expansion above will rarely be met, and as we
1601 * go along we would usually find that e moves further to the left.
1602 * This would happen only after we reached the point in the loop
1603 * where if there were no expansion we should fail. Unclear if
1604 * worth the expense */
1607 char *my_strend= (char *)strend;
1608 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1609 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1610 && (!reginfo || regtry(reginfo, &s)) )
1614 s += (utf8_target) ? UTF8SKIP(s) : 1;
1619 PL_reg_flags |= RF_tainted;
1620 FBC_BOUND(isALNUM_LC,
1621 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1622 isALNUM_LC_utf8((U8*)s));
1625 PL_reg_flags |= RF_tainted;
1626 FBC_NBOUND(isALNUM_LC,
1627 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1628 isALNUM_LC_utf8((U8*)s));
1631 FBC_BOUND(isWORDCHAR,
1633 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1636 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1638 isWORDCHAR_A((U8*)s));
1641 FBC_NBOUND(isWORDCHAR,
1643 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1646 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1648 isWORDCHAR_A((U8*)s));
1651 FBC_BOUND(isWORDCHAR_L1,
1653 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656 FBC_NBOUND(isWORDCHAR_L1,
1658 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1661 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1662 is_LNBREAK_latin1_safe(s, strend)
1666 /* The argument to all the POSIX node types is the class number to pass to
1667 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1674 PL_reg_flags |= RF_tainted;
1675 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1676 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1691 /* The complement of something that matches only ASCII matches all
1692 * UTF-8 variant code points, plus everything in ASCII that isn't
1694 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1695 || ! _generic_isCC_A(*s, FLAGS(c)));
1704 /* Don't need to worry about utf8, as it can match only a single
1705 * byte invariant character. */
1706 REXEC_FBC_CLASS_SCAN(
1707 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1715 if (! utf8_target) {
1716 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1722 classnum = (_char_class_number) FLAGS(c);
1723 if (classnum < _FIRST_NON_SWASH_CC) {
1724 while (s < strend) {
1726 /* We avoid loading in the swash as long as possible, but
1727 * should we have to, we jump to a separate loop. This
1728 * extra 'if' statement is what keeps this code from being
1729 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1730 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1731 goto found_above_latin1;
1733 if ((UTF8_IS_INVARIANT(*s)
1734 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1736 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1737 && to_complement ^ cBOOL(
1738 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1741 if (tmp && (!reginfo || regtry(reginfo, &s)))
1753 else switch (classnum) { /* These classes are implemented as
1755 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1756 revert the change of \v matching this */
1759 case _CC_ENUM_PSXSPC:
1760 REXEC_FBC_UTF8_CLASS_SCAN(
1761 to_complement ^ cBOOL(isSPACE_utf8(s)));
1764 case _CC_ENUM_BLANK:
1765 REXEC_FBC_UTF8_CLASS_SCAN(
1766 to_complement ^ cBOOL(isBLANK_utf8(s)));
1769 case _CC_ENUM_XDIGIT:
1770 REXEC_FBC_UTF8_CLASS_SCAN(
1771 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1774 case _CC_ENUM_VERTSPACE:
1775 REXEC_FBC_UTF8_CLASS_SCAN(
1776 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1779 case _CC_ENUM_CNTRL:
1780 REXEC_FBC_UTF8_CLASS_SCAN(
1781 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1785 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1786 assert(0); /* NOTREACHED */
1791 found_above_latin1: /* Here we have to load a swash to get the result
1792 for the current code point */
1793 if (! PL_utf8_swash_ptrs[classnum]) {
1794 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1795 PL_utf8_swash_ptrs[classnum] =
1796 _core_swash_init("utf8", swash_property_names[classnum],
1797 &PL_sv_undef, 1, 0, NULL, &flags);
1800 /* This is a copy of the loop above for swash classes, though using the
1801 * FBC macro instead of being expanded out. Since we've loaded the
1802 * swash, we don't have to check for that each time through the loop */
1803 REXEC_FBC_UTF8_CLASS_SCAN(
1804 to_complement ^ cBOOL(_generic_utf8(
1807 swash_fetch(PL_utf8_swash_ptrs[classnum],
1815 /* what trie are we using right now */
1816 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1817 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1818 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1820 const char *last_start = strend - trie->minlen;
1822 const char *real_start = s;
1824 STRLEN maxlen = trie->maxlen;
1826 U8 **points; /* map of where we were in the input string
1827 when reading a given char. For ASCII this
1828 is unnecessary overhead as the relationship
1829 is always 1:1, but for Unicode, especially
1830 case folded Unicode this is not true. */
1831 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1835 GET_RE_DEBUG_FLAGS_DECL;
1837 /* We can't just allocate points here. We need to wrap it in
1838 * an SV so it gets freed properly if there is a croak while
1839 * running the match */
1842 sv_points=newSV(maxlen * sizeof(U8 *));
1843 SvCUR_set(sv_points,
1844 maxlen * sizeof(U8 *));
1845 SvPOK_on(sv_points);
1846 sv_2mortal(sv_points);
1847 points=(U8**)SvPV_nolen(sv_points );
1848 if ( trie_type != trie_utf8_fold
1849 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1852 bitmap=(U8*)trie->bitmap;
1854 bitmap=(U8*)ANYOF_BITMAP(c);
1856 /* this is the Aho-Corasick algorithm modified a touch
1857 to include special handling for long "unknown char" sequences.
1858 The basic idea being that we use AC as long as we are dealing
1859 with a possible matching char, when we encounter an unknown char
1860 (and we have not encountered an accepting state) we scan forward
1861 until we find a legal starting char.
1862 AC matching is basically that of trie matching, except that when
1863 we encounter a failing transition, we fall back to the current
1864 states "fail state", and try the current char again, a process
1865 we repeat until we reach the root state, state 1, or a legal
1866 transition. If we fail on the root state then we can either
1867 terminate if we have reached an accepting state previously, or
1868 restart the entire process from the beginning if we have not.
1871 while (s <= last_start) {
1872 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1880 U8 *uscan = (U8*)NULL;
1881 U8 *leftmost = NULL;
1883 U32 accepted_word= 0;
1887 while ( state && uc <= (U8*)strend ) {
1889 U32 word = aho->states[ state ].wordnum;
1893 DEBUG_TRIE_EXECUTE_r(
1894 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1895 dump_exec_pos( (char *)uc, c, strend, real_start,
1896 (char *)uc, utf8_target );
1897 PerlIO_printf( Perl_debug_log,
1898 " Scanning for legal start char...\n");
1902 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1912 if (uc >(U8*)last_start) break;
1916 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1917 if (!leftmost || lpos < leftmost) {
1918 DEBUG_r(accepted_word=word);
1924 points[pointpos++ % maxlen]= uc;
1925 if (foldlen || uc < (U8*)strend) {
1926 REXEC_TRIE_READ_CHAR(trie_type, trie,
1928 uscan, len, uvc, charid, foldlen,
1930 DEBUG_TRIE_EXECUTE_r({
1931 dump_exec_pos( (char *)uc, c, strend,
1932 real_start, s, utf8_target);
1933 PerlIO_printf(Perl_debug_log,
1934 " Charid:%3u CP:%4"UVxf" ",
1946 word = aho->states[ state ].wordnum;
1948 base = aho->states[ state ].trans.base;
1950 DEBUG_TRIE_EXECUTE_r({
1952 dump_exec_pos( (char *)uc, c, strend, real_start,
1954 PerlIO_printf( Perl_debug_log,
1955 "%sState: %4"UVxf", word=%"UVxf,
1956 failed ? " Fail transition to " : "",
1957 (UV)state, (UV)word);
1963 ( ((offset = base + charid
1964 - 1 - trie->uniquecharcount)) >= 0)
1965 && ((U32)offset < trie->lasttrans)
1966 && trie->trans[offset].check == state
1967 && (tmp=trie->trans[offset].next))
1969 DEBUG_TRIE_EXECUTE_r(
1970 PerlIO_printf( Perl_debug_log," - legal\n"));
1975 DEBUG_TRIE_EXECUTE_r(
1976 PerlIO_printf( Perl_debug_log," - fail\n"));
1978 state = aho->fail[state];
1982 /* we must be accepting here */
1983 DEBUG_TRIE_EXECUTE_r(
1984 PerlIO_printf( Perl_debug_log," - accepting\n"));
1993 if (!state) state = 1;
1996 if ( aho->states[ state ].wordnum ) {
1997 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1998 if (!leftmost || lpos < leftmost) {
1999 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2004 s = (char*)leftmost;
2005 DEBUG_TRIE_EXECUTE_r({
2007 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2008 (UV)accepted_word, (IV)(s - real_start)
2011 if (!reginfo || regtry(reginfo, &s)) {
2017 DEBUG_TRIE_EXECUTE_r({
2018 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2021 DEBUG_TRIE_EXECUTE_r(
2022 PerlIO_printf( Perl_debug_log,"No match.\n"));
2031 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2041 - regexec_flags - match a regexp against a string
2044 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2045 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2046 /* stringarg: the point in the string at which to begin matching */
2047 /* strend: pointer to null at end of string */
2048 /* strbeg: real beginning of string */
2049 /* minend: end of match must be >= minend bytes after stringarg. */
2050 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2051 * itself is accessed via the pointers above */
2052 /* data: May be used for some additional optimizations.
2053 Currently its only used, with a U32 cast, for transmitting
2054 the ganch offset when doing a /g match. This will change */
2055 /* nosave: For optimizations. */
2059 struct regexp *const prog = ReANY(rx);
2062 char *startpos = stringarg;
2063 I32 minlen; /* must match at least this many chars */
2064 I32 dontbother = 0; /* how many characters not to try at end */
2065 I32 end_shift = 0; /* Same for the end. */ /* CC */
2066 I32 scream_pos = -1; /* Internal iterator of scream. */
2067 char *scream_olds = NULL;
2068 const bool utf8_target = cBOOL(DO_UTF8(sv));
2070 RXi_GET_DECL(prog,progi);
2071 regmatch_info reginfo; /* create some info to pass to regtry etc */
2072 regexp_paren_pair *swap = NULL;
2073 GET_RE_DEBUG_FLAGS_DECL;
2075 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2076 PERL_UNUSED_ARG(data);
2078 /* Be paranoid... */
2079 if (prog == NULL || startpos == NULL) {
2080 Perl_croak(aTHX_ "NULL regexp parameter");
2084 multiline = prog->extflags & RXf_PMf_MULTILINE;
2085 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2087 RX_MATCH_UTF8_set(rx, utf8_target);
2089 debug_start_match(rx, utf8_target, startpos, strend,
2093 minlen = prog->minlen;
2095 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2097 "String too short [regexec_flags]...\n"));
2102 /* Check validity of program. */
2103 if (UCHARAT(progi->program) != REG_MAGIC) {
2104 Perl_croak(aTHX_ "corrupted regexp program");
2108 PL_reg_state.re_state_eval_setup_done = FALSE;
2111 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
2112 reginfo.warned = FALSE;
2113 /* Mark beginning of line for ^ and lookbehind. */
2114 reginfo.bol = startpos; /* XXX not used ??? */
2118 /* Mark end of line for $ (and such) */
2121 /* see how far we have to get to not match where we matched before */
2122 reginfo.till = startpos+minend;
2124 /* If there is a "must appear" string, look for it. */
2127 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2129 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2130 reginfo.ganch = startpos + prog->gofs;
2131 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2132 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2133 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2135 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2136 && mg->mg_len >= 0) {
2137 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2138 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2139 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2141 if (prog->extflags & RXf_ANCH_GPOS) {
2142 if (s > reginfo.ganch)
2144 s = reginfo.ganch - prog->gofs;
2145 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2146 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2152 reginfo.ganch = strbeg + PTR2UV(data);
2153 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2156 } else { /* pos() not defined */
2157 reginfo.ganch = strbeg;
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS: reginfo.ganch = strbeg\n"));
2162 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2163 /* We have to be careful. If the previous successful match
2164 was from this regex we don't want a subsequent partially
2165 successful match to clobber the old results.
2166 So when we detect this possibility we add a swap buffer
2167 to the re, and switch the buffer each match. If we fail,
2168 we switch it back; otherwise we leave it swapped.
2171 /* do we need a save destructor here for eval dies? */
2172 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2173 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2174 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2181 re_scream_pos_data d;
2183 d.scream_olds = &scream_olds;
2184 d.scream_pos = &scream_pos;
2185 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2187 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2188 goto phooey; /* not present */
2194 /* Simplest case: anchored match need be tried only once. */
2195 /* [unless only anchor is BOL and multiline is set] */
2196 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2197 if (s == startpos && regtry(®info, &startpos))
2199 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2200 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2205 dontbother = minlen - 1;
2206 end = HOP3c(strend, -dontbother, strbeg) - 1;
2207 /* for multiline we only have to try after newlines */
2208 if (prog->check_substr || prog->check_utf8) {
2209 /* because of the goto we can not easily reuse the macros for bifurcating the
2210 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2213 goto after_try_utf8;
2215 if (regtry(®info, &s)) {
2222 if (prog->extflags & RXf_USE_INTUIT) {
2223 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2232 } /* end search for check string in unicode */
2234 if (s == startpos) {
2235 goto after_try_latin;
2238 if (regtry(®info, &s)) {
2245 if (prog->extflags & RXf_USE_INTUIT) {
2246 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2255 } /* end search for check string in latin*/
2256 } /* end search for check string */
2257 else { /* search for newline */
2259 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2262 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2263 while (s <= end) { /* note it could be possible to match at the end of the string */
2264 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2265 if (regtry(®info, &s))
2269 } /* end search for newline */
2270 } /* end anchored/multiline check string search */
2272 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2274 /* the warning about reginfo.ganch being used without initialization
2275 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2276 and we only enter this block when the same bit is set. */
2277 char *tmp_s = reginfo.ganch - prog->gofs;
2279 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2284 /* Messy cases: unanchored match. */
2285 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2286 /* we have /x+whatever/ */
2287 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2293 if (! prog->anchored_utf8) {
2294 to_utf8_substr(prog);
2296 ch = SvPVX_const(prog->anchored_utf8)[0];
2299 DEBUG_EXECUTE_r( did_match = 1 );
2300 if (regtry(®info, &s)) goto got_it;
2302 while (s < strend && *s == ch)
2309 if (! prog->anchored_substr) {
2310 if (! to_byte_substr(prog)) {
2311 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2314 ch = SvPVX_const(prog->anchored_substr)[0];
2317 DEBUG_EXECUTE_r( did_match = 1 );
2318 if (regtry(®info, &s)) goto got_it;
2320 while (s < strend && *s == ch)
2325 DEBUG_EXECUTE_r(if (!did_match)
2326 PerlIO_printf(Perl_debug_log,
2327 "Did not find anchored character...\n")
2330 else if (prog->anchored_substr != NULL
2331 || prog->anchored_utf8 != NULL
2332 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2333 && prog->float_max_offset < strend - s)) {
2338 char *last1; /* Last position checked before */
2342 if (prog->anchored_substr || prog->anchored_utf8) {
2344 if (! prog->anchored_utf8) {
2345 to_utf8_substr(prog);
2347 must = prog->anchored_utf8;
2350 if (! prog->anchored_substr) {
2351 if (! to_byte_substr(prog)) {
2352 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2355 must = prog->anchored_substr;
2357 back_max = back_min = prog->anchored_offset;
2360 if (! prog->float_utf8) {
2361 to_utf8_substr(prog);
2363 must = prog->float_utf8;
2366 if (! prog->float_substr) {
2367 if (! to_byte_substr(prog)) {
2368 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2371 must = prog->float_substr;
2373 back_max = prog->float_max_offset;
2374 back_min = prog->float_min_offset;
2380 last = HOP3c(strend, /* Cannot start after this */
2381 -(I32)(CHR_SVLEN(must)
2382 - (SvTAIL(must) != 0) + back_min), strbeg);
2385 last1 = HOPc(s, -1);
2387 last1 = s - 1; /* bogus */
2389 /* XXXX check_substr already used to find "s", can optimize if
2390 check_substr==must. */
2392 dontbother = end_shift;
2393 strend = HOPc(strend, -dontbother);
2394 while ( (s <= last) &&
2395 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2396 (unsigned char*)strend, must,
2397 multiline ? FBMrf_MULTILINE : 0)) ) {
2398 DEBUG_EXECUTE_r( did_match = 1 );
2399 if (HOPc(s, -back_max) > last1) {
2400 last1 = HOPc(s, -back_min);
2401 s = HOPc(s, -back_max);
2404 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2406 last1 = HOPc(s, -back_min);
2410 while (s <= last1) {
2411 if (regtry(®info, &s))
2414 s++; /* to break out of outer loop */
2421 while (s <= last1) {
2422 if (regtry(®info, &s))
2428 DEBUG_EXECUTE_r(if (!did_match) {
2429 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2430 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2431 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2432 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2433 ? "anchored" : "floating"),
2434 quoted, RE_SV_TAIL(must));
2438 else if ( (c = progi->regstclass) ) {
2440 const OPCODE op = OP(progi->regstclass);
2441 /* don't bother with what can't match */
2442 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2443 strend = HOPc(strend, -(minlen - 1));
2446 SV * const prop = sv_newmortal();
2447 regprop(prog, prop, c);
2449 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2451 PerlIO_printf(Perl_debug_log,
2452 "Matching stclass %.*s against %s (%d bytes)\n",
2453 (int)SvCUR(prop), SvPVX_const(prop),
2454 quoted, (int)(strend - s));
2457 if (find_byclass(prog, c, s, strend, ®info, reginfo.is_utf8_pat))
2459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2463 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2471 if (! prog->float_utf8) {
2472 to_utf8_substr(prog);
2474 float_real = prog->float_utf8;
2477 if (! prog->float_substr) {
2478 if (! to_byte_substr(prog)) {
2479 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2482 float_real = prog->float_substr;
2485 little = SvPV_const(float_real, len);
2486 if (SvTAIL(float_real)) {
2487 /* This means that float_real contains an artificial \n on
2488 * the end due to the presence of something like this:
2489 * /foo$/ where we can match both "foo" and "foo\n" at the
2490 * end of the string. So we have to compare the end of the
2491 * string first against the float_real without the \n and
2492 * then against the full float_real with the string. We
2493 * have to watch out for cases where the string might be
2494 * smaller than the float_real or the float_real without
2496 char *checkpos= strend - len;
2498 PerlIO_printf(Perl_debug_log,
2499 "%sChecking for float_real.%s\n",
2500 PL_colors[4], PL_colors[5]));
2501 if (checkpos + 1 < strbeg) {
2502 /* can't match, even if we remove the trailing \n
2503 * string is too short to match */
2505 PerlIO_printf(Perl_debug_log,
2506 "%sString shorter than required trailing substring, cannot match.%s\n",
2507 PL_colors[4], PL_colors[5]));
2509 } else if (memEQ(checkpos + 1, little, len - 1)) {
2510 /* can match, the end of the string matches without the
2512 last = checkpos + 1;
2513 } else if (checkpos < strbeg) {
2514 /* cant match, string is too short when the "\n" is
2517 PerlIO_printf(Perl_debug_log,
2518 "%sString does not contain required trailing substring, cannot match.%s\n",
2519 PL_colors[4], PL_colors[5]));
2521 } else if (!multiline) {
2522 /* non multiline match, so compare with the "\n" at the
2523 * end of the string */
2524 if (memEQ(checkpos, little, len)) {
2528 PerlIO_printf(Perl_debug_log,
2529 "%sString does not contain required trailing substring, cannot match.%s\n",
2530 PL_colors[4], PL_colors[5]));
2534 /* multiline match, so we have to search for a place
2535 * where the full string is located */
2541 last = rninstr(s, strend, little, little + len);
2543 last = strend; /* matching "$" */
2546 /* at one point this block contained a comment which was
2547 * probably incorrect, which said that this was a "should not
2548 * happen" case. Even if it was true when it was written I am
2549 * pretty sure it is not anymore, so I have removed the comment
2550 * and replaced it with this one. Yves */
2552 PerlIO_printf(Perl_debug_log,
2553 "String does not contain required substring, cannot match.\n"
2557 dontbother = strend - last + prog->float_min_offset;
2559 if (minlen && (dontbother < minlen))
2560 dontbother = minlen - 1;
2561 strend -= dontbother; /* this one's always in bytes! */
2562 /* We don't know much -- general case. */
2565 if (regtry(®info, &s))
2574 if (regtry(®info, &s))
2576 } while (s++ < strend);
2586 PerlIO_printf(Perl_debug_log,
2587 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2593 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2595 if (PL_reg_state.re_state_eval_setup_done)
2596 restore_pos(aTHX_ prog);
2597 if (RXp_PAREN_NAMES(prog))
2598 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2600 /* make sure $`, $&, $', and $digit will work later */
2601 if ( !(flags & REXEC_NOT_FIRST) ) {
2602 if (flags & REXEC_COPY_STR) {
2606 PerlIO_printf(Perl_debug_log,
2607 "Copy on write: regexp capture, type %d\n",
2610 RX_MATCH_COPY_FREE(rx);
2611 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2612 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2613 assert (SvPOKp(prog->saved_copy));
2614 prog->sublen = PL_regeol - strbeg;
2615 prog->suboffset = 0;
2616 prog->subcoffset = 0;
2621 I32 max = PL_regeol - strbeg;
2624 if ( (flags & REXEC_COPY_SKIP_POST)
2625 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2626 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2627 ) { /* don't copy $' part of string */
2630 /* calculate the right-most part of the string covered
2631 * by a capture. Due to look-ahead, this may be to
2632 * the right of $&, so we have to scan all captures */
2633 while (n <= prog->lastparen) {
2634 if (prog->offs[n].end > max)
2635 max = prog->offs[n].end;
2639 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2640 ? prog->offs[0].start
2642 assert(max >= 0 && max <= PL_regeol - strbeg);
2645 if ( (flags & REXEC_COPY_SKIP_PRE)
2646 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2647 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2648 ) { /* don't copy $` part of string */
2651 /* calculate the left-most part of the string covered
2652 * by a capture. Due to look-behind, this may be to
2653 * the left of $&, so we have to scan all captures */
2654 while (min && n <= prog->lastparen) {
2655 if ( prog->offs[n].start != -1
2656 && prog->offs[n].start < min)
2658 min = prog->offs[n].start;
2662 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2663 && min > prog->offs[0].end
2665 min = prog->offs[0].end;
2669 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2672 if (RX_MATCH_COPIED(rx)) {
2673 if (sublen > prog->sublen)
2675 (char*)saferealloc(prog->subbeg, sublen+1);
2678 prog->subbeg = (char*)safemalloc(sublen+1);
2679 Copy(strbeg + min, prog->subbeg, sublen, char);
2680 prog->subbeg[sublen] = '\0';
2681 prog->suboffset = min;
2682 prog->sublen = sublen;
2683 RX_MATCH_COPIED_on(rx);
2685 prog->subcoffset = prog->suboffset;
2686 if (prog->suboffset && utf8_target) {
2687 /* Convert byte offset to chars.
2688 * XXX ideally should only compute this if @-/@+
2689 * has been seen, a la PL_sawampersand ??? */
2691 /* If there's a direct correspondence between the
2692 * string which we're matching and the original SV,
2693 * then we can use the utf8 len cache associated with
2694 * the SV. In particular, it means that under //g,
2695 * sv_pos_b2u() will use the previously cached
2696 * position to speed up working out the new length of
2697 * subcoffset, rather than counting from the start of
2698 * the string each time. This stops
2699 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2700 * from going quadratic */
2701 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2702 sv_pos_b2u(sv, &(prog->subcoffset));
2704 prog->subcoffset = utf8_length((U8*)strbeg,
2705 (U8*)(strbeg+prog->suboffset));
2709 RX_MATCH_COPY_FREE(rx);
2710 prog->subbeg = strbeg;
2711 prog->suboffset = 0;
2712 prog->subcoffset = 0;
2713 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2721 PL_colors[4], PL_colors[5]));
2722 if (PL_reg_state.re_state_eval_setup_done)
2723 restore_pos(aTHX_ prog);
2725 /* we failed :-( roll it back */
2726 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2727 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2732 Safefree(prog->offs);
2739 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2740 * Do inc before dec, in case old and new rex are the same */
2741 #define SET_reg_curpm(Re2) \
2742 if (PL_reg_state.re_state_eval_setup_done) { \
2743 (void)ReREFCNT_inc(Re2); \
2744 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2745 PM_SETRE((PL_reg_curpm), (Re2)); \
2750 - regtry - try match at specific point
2752 STATIC I32 /* 0 failure, 1 success */
2753 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2757 REGEXP *const rx = reginfo->prog;
2758 regexp *const prog = ReANY(rx);
2760 RXi_GET_DECL(prog,progi);
2761 GET_RE_DEBUG_FLAGS_DECL;
2763 PERL_ARGS_ASSERT_REGTRY;
2765 reginfo->cutpoint=NULL;
2767 if ((prog->extflags & RXf_EVAL_SEEN)
2768 && !PL_reg_state.re_state_eval_setup_done)
2772 PL_reg_state.re_state_eval_setup_done = TRUE;
2774 /* Make $_ available to executed code. */
2775 if (reginfo->sv != DEFSV) {
2777 DEFSV_set(reginfo->sv);
2780 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2781 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2782 /* prepare for quick setting of pos */
2783 #ifdef PERL_OLD_COPY_ON_WRITE
2784 if (SvIsCOW(reginfo->sv))
2785 sv_force_normal_flags(reginfo->sv, 0);
2787 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2788 &PL_vtbl_mglob, NULL, 0);
2792 PL_reg_oldpos = mg->mg_len;
2793 SAVEDESTRUCTOR_X(restore_pos, prog);
2795 if (!PL_reg_curpm) {
2796 Newxz(PL_reg_curpm, 1, PMOP);
2799 SV* const repointer = &PL_sv_undef;
2800 /* this regexp is also owned by the new PL_reg_curpm, which
2801 will try to free it. */
2802 av_push(PL_regex_padav, repointer);
2803 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2804 PL_regex_pad = AvARRAY(PL_regex_padav);
2809 PL_reg_oldcurpm = PL_curpm;
2810 PL_curpm = PL_reg_curpm;
2811 if (RXp_MATCH_COPIED(prog)) {
2812 /* Here is a serious problem: we cannot rewrite subbeg,
2813 since it may be needed if this match fails. Thus
2814 $` inside (?{}) could fail... */
2815 PL_reg_oldsaved = prog->subbeg;
2816 PL_reg_oldsavedlen = prog->sublen;
2817 PL_reg_oldsavedoffset = prog->suboffset;
2818 PL_reg_oldsavedcoffset = prog->suboffset;
2820 PL_nrs = prog->saved_copy;
2822 RXp_MATCH_COPIED_off(prog);
2825 PL_reg_oldsaved = NULL;
2826 prog->subbeg = PL_bostr;
2827 prog->suboffset = 0;
2828 prog->subcoffset = 0;
2829 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2832 PL_reg_starttry = *startposp;
2834 prog->offs[0].start = *startposp - PL_bostr;
2835 prog->lastparen = 0;
2836 prog->lastcloseparen = 0;
2838 /* XXXX What this code is doing here?!!! There should be no need
2839 to do this again and again, prog->lastparen should take care of
2842 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2843 * Actually, the code in regcppop() (which Ilya may be meaning by
2844 * prog->lastparen), is not needed at all by the test suite
2845 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2846 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2847 * Meanwhile, this code *is* needed for the
2848 * above-mentioned test suite tests to succeed. The common theme
2849 * on those tests seems to be returning null fields from matches.
2850 * --jhi updated by dapm */
2852 if (prog->nparens) {
2853 regexp_paren_pair *pp = prog->offs;
2855 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2863 result = regmatch(reginfo, *startposp, progi->program + 1);
2865 prog->offs[0].end = result;
2868 if (reginfo->cutpoint)
2869 *startposp= reginfo->cutpoint;
2870 REGCP_UNWIND(lastcp);
2875 #define sayYES goto yes
2876 #define sayNO goto no
2877 #define sayNO_SILENT goto no_silent
2879 /* we dont use STMT_START/END here because it leads to
2880 "unreachable code" warnings, which are bogus, but distracting. */
2881 #define CACHEsayNO \
2882 if (ST.cache_mask) \
2883 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2886 /* this is used to determine how far from the left messages like
2887 'failed...' are printed. It should be set such that messages
2888 are inline with the regop output that created them.
2890 #define REPORT_CODE_OFF 32
2893 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2894 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2895 #define CHRTEST_NOT_A_CP_1 -999
2896 #define CHRTEST_NOT_A_CP_2 -998
2898 #define SLAB_FIRST(s) (&(s)->states[0])
2899 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2901 /* grab a new slab and return the first slot in it */
2903 STATIC regmatch_state *
2906 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2909 regmatch_slab *s = PL_regmatch_slab->next;
2911 Newx(s, 1, regmatch_slab);
2912 s->prev = PL_regmatch_slab;
2914 PL_regmatch_slab->next = s;
2916 PL_regmatch_slab = s;
2917 return SLAB_FIRST(s);
2921 /* push a new state then goto it */
2923 #define PUSH_STATE_GOTO(state, node, input) \
2924 pushinput = input; \
2926 st->resume_state = state; \
2929 /* push a new state with success backtracking, then goto it */
2931 #define PUSH_YES_STATE_GOTO(state, node, input) \
2932 pushinput = input; \
2934 st->resume_state = state; \
2935 goto push_yes_state;
2942 regmatch() - main matching routine
2944 This is basically one big switch statement in a loop. We execute an op,
2945 set 'next' to point the next op, and continue. If we come to a point which
2946 we may need to backtrack to on failure such as (A|B|C), we push a
2947 backtrack state onto the backtrack stack. On failure, we pop the top
2948 state, and re-enter the loop at the state indicated. If there are no more
2949 states to pop, we return failure.
2951 Sometimes we also need to backtrack on success; for example /A+/, where
2952 after successfully matching one A, we need to go back and try to
2953 match another one; similarly for lookahead assertions: if the assertion
2954 completes successfully, we backtrack to the state just before the assertion
2955 and then carry on. In these cases, the pushed state is marked as
2956 'backtrack on success too'. This marking is in fact done by a chain of
2957 pointers, each pointing to the previous 'yes' state. On success, we pop to
2958 the nearest yes state, discarding any intermediate failure-only states.
2959 Sometimes a yes state is pushed just to force some cleanup code to be
2960 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2961 it to free the inner regex.
2963 Note that failure backtracking rewinds the cursor position, while
2964 success backtracking leaves it alone.
2966 A pattern is complete when the END op is executed, while a subpattern
2967 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2968 ops trigger the "pop to last yes state if any, otherwise return true"
2971 A common convention in this function is to use A and B to refer to the two
2972 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2973 the subpattern to be matched possibly multiple times, while B is the entire
2974 rest of the pattern. Variable and state names reflect this convention.
2976 The states in the main switch are the union of ops and failure/success of
2977 substates associated with with that op. For example, IFMATCH is the op
2978 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2979 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2980 successfully matched A and IFMATCH_A_fail is a state saying that we have
2981 just failed to match A. Resume states always come in pairs. The backtrack
2982 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2983 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2984 on success or failure.
2986 The struct that holds a backtracking state is actually a big union, with
2987 one variant for each major type of op. The variable st points to the
2988 top-most backtrack struct. To make the code clearer, within each
2989 block of code we #define ST to alias the relevant union.
2991 Here's a concrete example of a (vastly oversimplified) IFMATCH
2997 #define ST st->u.ifmatch
2999 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3000 ST.foo = ...; // some state we wish to save
3002 // push a yes backtrack state with a resume value of
3003 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3005 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3008 case IFMATCH_A: // we have successfully executed A; now continue with B
3010 bar = ST.foo; // do something with the preserved value
3013 case IFMATCH_A_fail: // A failed, so the assertion failed
3014 ...; // do some housekeeping, then ...
3015 sayNO; // propagate the failure
3022 For any old-timers reading this who are familiar with the old recursive
3023 approach, the code above is equivalent to:
3025 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3034 ...; // do some housekeeping, then ...
3035 sayNO; // propagate the failure
3038 The topmost backtrack state, pointed to by st, is usually free. If you
3039 want to claim it, populate any ST.foo fields in it with values you wish to
3040 save, then do one of
3042 PUSH_STATE_GOTO(resume_state, node, newinput);
3043 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3045 which sets that backtrack state's resume value to 'resume_state', pushes a
3046 new free entry to the top of the backtrack stack, then goes to 'node'.
3047 On backtracking, the free slot is popped, and the saved state becomes the
3048 new free state. An ST.foo field in this new top state can be temporarily
3049 accessed to retrieve values, but once the main loop is re-entered, it
3050 becomes available for reuse.
3052 Note that the depth of the backtrack stack constantly increases during the
3053 left-to-right execution of the pattern, rather than going up and down with
3054 the pattern nesting. For example the stack is at its maximum at Z at the
3055 end of the pattern, rather than at X in the following:
3057 /(((X)+)+)+....(Y)+....Z/
3059 The only exceptions to this are lookahead/behind assertions and the cut,
3060 (?>A), which pop all the backtrack states associated with A before
3063 Backtrack state structs are allocated in slabs of about 4K in size.
3064 PL_regmatch_state and st always point to the currently active state,
3065 and PL_regmatch_slab points to the slab currently containing
3066 PL_regmatch_state. The first time regmatch() is called, the first slab is
3067 allocated, and is never freed until interpreter destruction. When the slab
3068 is full, a new one is allocated and chained to the end. At exit from
3069 regmatch(), slabs allocated since entry are freed.
3074 #define DEBUG_STATE_pp(pp) \
3076 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3077 PerlIO_printf(Perl_debug_log, \
3078 " %*s"pp" %s%s%s%s%s\n", \
3080 PL_reg_name[st->resume_state], \
3081 ((st==yes_state||st==mark_state) ? "[" : ""), \
3082 ((st==yes_state) ? "Y" : ""), \
3083 ((st==mark_state) ? "M" : ""), \
3084 ((st==yes_state||st==mark_state) ? "]" : "") \
3089 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3094 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3095 const char *start, const char *end, const char *blurb)
3097 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3099 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3104 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3105 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3107 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3108 start, end - start, 60);
3110 PerlIO_printf(Perl_debug_log,
3111 "%s%s REx%s %s against %s\n",
3112 PL_colors[4], blurb, PL_colors[5], s0, s1);
3114 if (utf8_target||utf8_pat)
3115 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3116 utf8_pat ? "pattern" : "",
3117 utf8_pat && utf8_target ? " and " : "",
3118 utf8_target ? "string" : ""
3124 S_dump_exec_pos(pTHX_ const char *locinput,
3125 const regnode *scan,
3126 const char *loc_regeol,
3127 const char *loc_bostr,
3128 const char *loc_reg_starttry,
3129 const bool utf8_target)
3131 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3132 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3133 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3134 /* The part of the string before starttry has one color
3135 (pref0_len chars), between starttry and current
3136 position another one (pref_len - pref0_len chars),
3137 after the current position the third one.
3138 We assume that pref0_len <= pref_len, otherwise we
3139 decrease pref0_len. */
3140 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3141 ? (5 + taill) - l : locinput - loc_bostr;
3144 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3146 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3148 pref0_len = pref_len - (locinput - loc_reg_starttry);
3149 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3150 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3151 ? (5 + taill) - pref_len : loc_regeol - locinput);
3152 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3156 if (pref0_len > pref_len)
3157 pref0_len = pref_len;
3159 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3161 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3162 (locinput - pref_len),pref0_len, 60, 4, 5);
3164 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3165 (locinput - pref_len + pref0_len),
3166 pref_len - pref0_len, 60, 2, 3);
3168 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3169 locinput, loc_regeol - locinput, 10, 0, 1);
3171 const STRLEN tlen=len0+len1+len2;
3172 PerlIO_printf(Perl_debug_log,
3173 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3174 (IV)(locinput - loc_bostr),
3177 (docolor ? "" : "> <"),
3179 (int)(tlen > 19 ? 0 : 19 - tlen),
3186 /* reg_check_named_buff_matched()
3187 * Checks to see if a named buffer has matched. The data array of
3188 * buffer numbers corresponding to the buffer is expected to reside
3189 * in the regexp->data->data array in the slot stored in the ARG() of
3190 * node involved. Note that this routine doesn't actually care about the
3191 * name, that information is not preserved from compilation to execution.
3192 * Returns the index of the leftmost defined buffer with the given name
3193 * or 0 if non of the buffers matched.
3196 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3199 RXi_GET_DECL(rex,rexi);
3200 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3201 I32 *nums=(I32*)SvPVX(sv_dat);
3203 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3205 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3206 if ((I32)rex->lastparen >= nums[n] &&
3207 rex->offs[nums[n]].end != -1)
3216 /* free all slabs above current one - called during LEAVE_SCOPE */
3219 S_clear_backtrack_stack(pTHX_ void *p)
3221 regmatch_slab *s = PL_regmatch_slab->next;
3226 PL_regmatch_slab->next = NULL;
3228 regmatch_slab * const osl = s;
3234 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3235 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
3237 /* This function determines if there are one or two characters that match
3238 * the first character of the passed-in EXACTish node <text_node>, and if
3239 * so, returns them in the passed-in pointers.
3241 * If it determines that no possible character in the target string can
3242 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3243 * the first character in <text_node> requires UTF-8 to represent, and the
3244 * target string isn't in UTF-8.)
3246 * If there are more than two characters that could match the beginning of
3247 * <text_node>, or if more context is required to determine a match or not,
3248 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3250 * The motiviation behind this function is to allow the caller to set up
3251 * tight loops for matching. If <text_node> is of type EXACT, there is
3252 * only one possible character that can match its first character, and so
3253 * the situation is quite simple. But things get much more complicated if
3254 * folding is involved. It may be that the first character of an EXACTFish
3255 * node doesn't participate in any possible fold, e.g., punctuation, so it
3256 * can be matched only by itself. The vast majority of characters that are
3257 * in folds match just two things, their lower and upper-case equivalents.
3258 * But not all are like that; some have multiple possible matches, or match
3259 * sequences of more than one character. This function sorts all that out.
3261 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3262 * loop of trying to match A*, we know we can't exit where the thing
3263 * following it isn't a B. And something can't be a B unless it is the
3264 * beginning of B. By putting a quick test for that beginning in a tight
3265 * loop, we can rule out things that can't possibly be B without having to
3266 * break out of the loop, thus avoiding work. Similarly, if A is a single
3267 * character, we can make a tight loop matching A*, using the outputs of
3270 * If the target string to match isn't in UTF-8, and there aren't
3271 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3272 * the one or two possible octets (which are characters in this situation)
3273 * that can match. In all cases, if there is only one character that can
3274 * match, *<c1p> and *<c2p> will be identical.
3276 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3277 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3278 * can match the beginning of <text_node>. They should be declared with at
3279 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3280 * undefined what these contain.) If one or both of the buffers are
3281 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3282 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3283 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3284 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3285 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3287 const bool utf8_target = PL_reg_match_utf8;
3289 UV c1 = CHRTEST_NOT_A_CP_1;
3290 UV c2 = CHRTEST_NOT_A_CP_2;
3291 bool use_chrtest_void = FALSE;
3293 /* Used when we have both utf8 input and utf8 output, to avoid converting
3294 * to/from code points */
3295 bool utf8_has_been_setup = FALSE;
3299 U8 *pat = (U8*)STRING(text_node);
3301 if (OP(text_node) == EXACT) {
3303 /* In an exact node, only one thing can be matched, that first
3304 * character. If both the pat and the target are UTF-8, we can just
3305 * copy the input to the output, avoiding finding the code point of
3310 else if (utf8_target) {
3311 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3312 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3313 utf8_has_been_setup = TRUE;
3316 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3319 else /* an EXACTFish node */
3321 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3322 pat + STR_LEN(text_node)))
3324 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3325 pat + STR_LEN(text_node))))
3327 /* Multi-character folds require more context to sort out. Also
3328 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3329 * handled outside this routine */
3330 use_chrtest_void = TRUE;
3332 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3333 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3335 /* Load the folds hash, if not already done */
3337 if (! PL_utf8_foldclosures) {
3338 if (! PL_utf8_tofold) {
3339 U8 dummy[UTF8_MAXBYTES+1];
3341 /* Force loading this by folding an above-Latin1 char */
3342 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3343 assert(PL_utf8_tofold); /* Verify that worked */
3345 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3348 /* The fold closures data structure is a hash with the keys being
3349 * the UTF-8 of every character that is folded to, like 'k', and
3350 * the values each an array of all code points that fold to its
3351 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3353 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3358 /* Not found in the hash, therefore there are no folds
3359 * containing it, so there is only a single character that
3363 else { /* Does participate in folds */
3364 AV* list = (AV*) *listp;
3365 if (av_len(list) != 1) {
3367 /* If there aren't exactly two folds to this, it is outside
3368 * the scope of this function */
3369 use_chrtest_void = TRUE;
3371 else { /* There are two. Get them */
3372 SV** c_p = av_fetch(list, 0, FALSE);
3374 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3378 c_p = av_fetch(list, 1, FALSE);
3380 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3384 /* Folds that cross the 255/256 boundary are forbidden if
3385 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3386 * pattern character is above 256, and its only other match
3387 * is below 256, the only legal match will be to itself.
3388 * We have thrown away the original, so have to compute
3389 * which is the one above 255 */
3390 if ((c1 < 256) != (c2 < 256)) {
3391 if (OP(text_node) == EXACTFL
3392 || (OP(text_node) == EXACTFA
3393 && (isASCII(c1) || isASCII(c2))))
3406 else /* Here, c1 is < 255 */
3408 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3409 && OP(text_node) != EXACTFL
3410 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3412 /* Here, there could be something above Latin1 in the target which
3413 * folds to this character in the pattern. All such cases except
3414 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3415 * involved in their folds, so are outside the scope of this
3417 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3418 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3421 use_chrtest_void = TRUE;
3424 else { /* Here nothing above Latin1 can fold to the pattern character */
3425 switch (OP(text_node)) {
3427 case EXACTFL: /* /l rules */
3428 c2 = PL_fold_locale[c1];
3432 if (! utf8_target) { /* /d rules */
3437 /* /u rules for all these. This happens to work for
3438 * EXACTFA as nothing in Latin1 folds to ASCII */
3440 case EXACTFU_TRICKYFOLD:
3443 c2 = PL_fold_latin1[c1];
3447 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3448 assert(0); /* NOTREACHED */
3453 /* Here have figured things out. Set up the returns */
3454 if (use_chrtest_void) {
3455 *c2p = *c1p = CHRTEST_VOID;
3457 else if (utf8_target) {
3458 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3459 uvchr_to_utf8(c1_utf8, c1);
3460 uvchr_to_utf8(c2_utf8, c2);
3463 /* Invariants are stored in both the utf8 and byte outputs; Use
3464 * negative numbers otherwise for the byte ones. Make sure that the
3465 * byte ones are the same iff the utf8 ones are the same */
3466 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3467 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3470 ? CHRTEST_NOT_A_CP_1
3471 : CHRTEST_NOT_A_CP_2;
3473 else if (c1 > 255) {
3474 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3479 *c1p = *c2p = c2; /* c2 is the only representable value */
3481 else { /* c1 is representable; see about c2 */
3483 *c2p = (c2 < 256) ? c2 : c1;
3489 /* returns -1 on failure, $+[0] on success */
3491 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3493 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3497 const bool utf8_target = PL_reg_match_utf8;
3498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3499 REGEXP *rex_sv = reginfo->prog;
3500 regexp *rex = ReANY(rex_sv);
3501 RXi_GET_DECL(rex,rexi);
3503 /* the current state. This is a cached copy of PL_regmatch_state */
3505 /* cache heavy used fields of st in registers */
3508 U32 n = 0; /* general value; init to avoid compiler warning */
3509 I32 ln = 0; /* len or last; init to avoid compiler warning */
3510 char *locinput = startpos;
3511 char *pushinput; /* where to continue after a PUSH */
3512 I32 nextchr; /* is always set to UCHARAT(locinput) */
3514 bool result = 0; /* return value of S_regmatch */
3515 int depth = 0; /* depth of backtrack stack */
3516 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3517 const U32 max_nochange_depth =
3518 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3519 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3520 regmatch_state *yes_state = NULL; /* state to pop to on success of
3522 /* mark_state piggy backs on the yes_state logic so that when we unwind
3523 the stack on success we can update the mark_state as we go */
3524 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3525 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3526 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3528 bool no_final = 0; /* prevent failure from backtracking? */
3529 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3530 char *startpoint = locinput;
3531 SV *popmark = NULL; /* are we looking for a mark? */
3532 SV *sv_commit = NULL; /* last mark name seen in failure */
3533 SV *sv_yes_mark = NULL; /* last mark name we have seen
3534 during a successful match */
3535 U32 lastopen = 0; /* last open we saw */
3536 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3537 SV* const oreplsv = GvSV(PL_replgv);
3538 /* these three flags are set by various ops to signal information to
3539 * the very next op. They have a useful lifetime of exactly one loop
3540 * iteration, and are not preserved or restored by state pushes/pops
3542 bool sw = 0; /* the condition value in (?(cond)a|b) */
3543 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3544 int logical = 0; /* the following EVAL is:
3548 or the following IFMATCH/UNLESSM is:
3549 false: plain (?=foo)
3550 true: used as a condition: (?(?=foo))
3552 PAD* last_pad = NULL;
3554 I32 gimme = G_SCALAR;
3555 CV *caller_cv = NULL; /* who called us */
3556 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3557 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3558 U32 maxopenparen = 0; /* max '(' index seen so far */
3559 int to_complement; /* Invert the result? */
3560 _char_class_number classnum;
3561 bool is_utf8_pat = reginfo->is_utf8_pat;
3564 GET_RE_DEBUG_FLAGS_DECL;
3567 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3568 multicall_oldcatch = 0;
3569 multicall_cv = NULL;
3571 PERL_UNUSED_VAR(multicall_cop);
3572 PERL_UNUSED_VAR(newsp);
3575 PERL_ARGS_ASSERT_REGMATCH;
3577 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3578 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3580 /* on first ever call to regmatch, allocate first slab */
3581 if (!PL_regmatch_slab) {
3582 Newx(PL_regmatch_slab, 1, regmatch_slab);
3583 PL_regmatch_slab->prev = NULL;
3584 PL_regmatch_slab->next = NULL;
3585 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3588 oldsave = PL_savestack_ix;
3589 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3590 SAVEVPTR(PL_regmatch_slab);
3591 SAVEVPTR(PL_regmatch_state);
3593 /* grab next free state slot */
3594 st = ++PL_regmatch_state;
3595 if (st > SLAB_LAST(PL_regmatch_slab))
3596 st = PL_regmatch_state = S_push_slab(aTHX);
3598 /* Note that nextchr is a byte even in UTF */
3601 while (scan != NULL) {
3604 SV * const prop = sv_newmortal();
3605 regnode *rnext=regnext(scan);
3606 DUMP_EXEC_POS( locinput, scan, utf8_target );
3607 regprop(rex, prop, scan);
3609 PerlIO_printf(Perl_debug_log,
3610 "%3"IVdf":%*s%s(%"IVdf")\n",
3611 (IV)(scan - rexi->program), depth*2, "",
3613 (PL_regkind[OP(scan)] == END || !rnext) ?
3614 0 : (IV)(rnext - rexi->program));
3617 next = scan + NEXT_OFF(scan);
3620 state_num = OP(scan);
3626 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3628 switch (state_num) {
3629 case BOL: /* /^../ */
3630 if (locinput == PL_bostr)
3632 /* reginfo->till = reginfo->bol; */
3637 case MBOL: /* /^../m */
3638 if (locinput == PL_bostr ||
3639 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3645 case SBOL: /* /^../s */
3646 if (locinput == PL_bostr)
3651 if (locinput == reginfo->ganch)
3655 case KEEPS: /* \K */
3656 /* update the startpoint */
3657 st->u.keeper.val = rex->offs[0].start;
3658 rex->offs[0].start = locinput - PL_bostr;
3659 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3660 assert(0); /*NOTREACHED*/
3661 case KEEPS_next_fail:
3662 /* rollback the start point change */
3663 rex->offs[0].start = st->u.keeper.val;
3665 assert(0); /*NOTREACHED*/
3667 case EOL: /* /..$/ */
3670 case MEOL: /* /..$/m */
3671 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3675 case SEOL: /* /..$/s */
3677 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3679 if (PL_regeol - locinput > 1)
3684 if (!NEXTCHR_IS_EOS)
3688 case SANY: /* /./s */
3691 goto increment_locinput;
3699 case REG_ANY: /* /./ */
3700 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3702 goto increment_locinput;
3706 #define ST st->u.trie
3707 case TRIEC: /* (ab|cd) with known charclass */
3708 /* In this case the charclass data is available inline so
3709 we can fail fast without a lot of extra overhead.
3711 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3713 PerlIO_printf(Perl_debug_log,
3714 "%*s %sfailed to match trie start class...%s\n",
3715 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3718 assert(0); /* NOTREACHED */
3721 case TRIE: /* (ab|cd) */
3722 /* the basic plan of execution of the trie is:
3723 * At the beginning, run though all the states, and
3724 * find the longest-matching word. Also remember the position
3725 * of the shortest matching word. For example, this pattern:
3728 * when matched against the string "abcde", will generate
3729 * accept states for all words except 3, with the longest
3730 * matching word being 4, and the shortest being 2 (with
3731 * the position being after char 1 of the string).
3733 * Then for each matching word, in word order (i.e. 1,2,4,5),
3734 * we run the remainder of the pattern; on each try setting
3735 * the current position to the character following the word,
3736 * returning to try the next word on failure.
3738 * We avoid having to build a list of words at runtime by
3739 * using a compile-time structure, wordinfo[].prev, which
3740 * gives, for each word, the previous accepting word (if any).
3741 * In the case above it would contain the mappings 1->2, 2->0,
3742 * 3->0, 4->5, 5->1. We can use this table to generate, from
3743 * the longest word (4 above), a list of all words, by
3744 * following the list of prev pointers; this gives us the
3745 * unordered list 4,5,1,2. Then given the current word we have
3746 * just tried, we can go through the list and find the
3747 * next-biggest word to try (so if we just failed on word 2,
3748 * the next in the list is 4).
3750 * Since at runtime we don't record the matching position in
3751 * the string for each word, we have to work that out for
3752 * each word we're about to process. The wordinfo table holds
3753 * the character length of each word; given that we recorded
3754 * at the start: the position of the shortest word and its
3755 * length in chars, we just need to move the pointer the
3756 * difference between the two char lengths. Depending on
3757 * Unicode status and folding, that's cheap or expensive.
3759 * This algorithm is optimised for the case where are only a
3760 * small number of accept states, i.e. 0,1, or maybe 2.
3761 * With lots of accepts states, and having to try all of them,
3762 * it becomes quadratic on number of accept states to find all
3767 /* what type of TRIE am I? (utf8 makes this contextual) */
3768 DECL_TRIE_TYPE(scan);
3770 /* what trie are we using right now */
3771 reg_trie_data * const trie
3772 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3773 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3774 U32 state = trie->startstate;
3777 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3779 if (trie->states[ state ].wordnum) {
3781 PerlIO_printf(Perl_debug_log,
3782 "%*s %smatched empty string...%s\n",
3783 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3789 PerlIO_printf(Perl_debug_log,
3790 "%*s %sfailed to match trie start class...%s\n",
3791 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3798 U8 *uc = ( U8* )locinput;
3802 U8 *uscan = (U8*)NULL;
3803 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3804 U32 charcount = 0; /* how many input chars we have matched */
3805 U32 accepted = 0; /* have we seen any accepting states? */
3807 ST.jump = trie->jump;
3810 ST.longfold = FALSE; /* char longer if folded => it's harder */
3813 /* fully traverse the TRIE; note the position of the
3814 shortest accept state and the wordnum of the longest
3817 while ( state && uc <= (U8*)PL_regeol ) {
3818 U32 base = trie->states[ state ].trans.base;
3822 wordnum = trie->states[ state ].wordnum;
3824 if (wordnum) { /* it's an accept state */
3827 /* record first match position */
3829 ST.firstpos = (U8*)locinput;
3834 ST.firstchars = charcount;
3837 if (!ST.nextword || wordnum < ST.nextword)
3838 ST.nextword = wordnum;
3839 ST.topword = wordnum;
3842 DEBUG_TRIE_EXECUTE_r({
3843 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3844 PerlIO_printf( Perl_debug_log,
3845 "%*s %sState: %4"UVxf" Accepted: %c ",
3846 2+depth * 2, "", PL_colors[4],
3847 (UV)state, (accepted ? 'Y' : 'N'));
3850 /* read a char and goto next state */
3851 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3853 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3854 uscan, len, uvc, charid, foldlen,
3861 base + charid - 1 - trie->uniquecharcount)) >= 0)
3863 && ((U32)offset < trie->lasttrans)
3864 && trie->trans[offset].check == state)
3866 state = trie->trans[offset].next;
3877 DEBUG_TRIE_EXECUTE_r(
3878 PerlIO_printf( Perl_debug_log,
3879 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3880 charid, uvc, (UV)state, PL_colors[5] );
3886 /* calculate total number of accept states */
3891 w = trie->wordinfo[w].prev;
3894 ST.accepted = accepted;
3898 PerlIO_printf( Perl_debug_log,
3899 "%*s %sgot %"IVdf" possible matches%s\n",
3900 REPORT_CODE_OFF + depth * 2, "",
3901 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3903 goto trie_first_try; /* jump into the fail handler */
3905 assert(0); /* NOTREACHED */
3907 case TRIE_next_fail: /* we failed - try next alternative */
3911 REGCP_UNWIND(ST.cp);
3912 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3914 if (!--ST.accepted) {
3916 PerlIO_printf( Perl_debug_log,
3917 "%*s %sTRIE failed...%s\n",
3918 REPORT_CODE_OFF+depth*2, "",
3925 /* Find next-highest word to process. Note that this code
3926 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3929 U16 const nextword = ST.nextword;
3930 reg_trie_wordinfo * const wordinfo
3931 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3932 for (word=ST.topword;