5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
102 /* Valid for non-utf8 strings: avoids the reginclass
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
105 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106 : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
115 #define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
117 ? reghop3((U8*)pos, off, \
118 (U8*)(off >= 0 ? reginfo->strend : PL_bostr)) \
120 #define HOPBACKc(pos, off) \
121 (char*)(PL_reg_match_utf8\
122 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
123 : (pos - off >= PL_bostr) \
127 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132 #define NEXTCHR_IS_EOS (nextchr < 0)
134 #define SET_nextchr \
135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
137 #define SET_locinput(p) \
142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
151 /* If in debug mode, we test that a known character properly matches */
153 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
159 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
170 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
177 COMBINING_GRAVE_ACCENT_UTF8); \
180 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
183 /* for use after a quantifier and before an EXACT-like node -- japhy */
184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
193 #define JUMPABLE(rn) ( \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
214 /* ... so we use this as its faster. */
215 #define IS_TEXT(rn) ( OP(rn)==EXACT )
216 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
217 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
226 #define FIND_NEXT_IMPT(rn) STMT_START { \
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
230 rn = NEXTOPER(NEXTOPER(rn)); \
231 else if (type == PLUS) \
233 else if (type == IFMATCH) \
234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235 else rn += NEXT_OFF(rn); \
239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245 #define SBASE 0xAC00 /* Start of block */
246 #define SCount 11172 /* Length of block */
249 static void restore_pos(pTHX_ void *arg);
251 #define REGCP_PAREN_ELEMS 3
252 #define REGCP_OTHER_ELEMS 3
253 #define REGCP_FRAME_ELEMS 1
254 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
255 * are needed for the regexp context stack bookkeeping. */
258 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
261 const int retval = PL_savestack_ix;
262 const int paren_elems_to_push =
263 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
264 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
265 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
267 GET_RE_DEBUG_FLAGS_DECL;
269 PERL_ARGS_ASSERT_REGCPPUSH;
271 if (paren_elems_to_push < 0)
272 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
273 paren_elems_to_push);
275 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
276 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
277 " out of range (%lu-%ld)",
279 (unsigned long)maxopenparen,
282 SSGROW(total_elems + REGCP_FRAME_ELEMS);
285 if ((int)maxopenparen > (int)parenfloor)
286 PerlIO_printf(Perl_debug_log,
287 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
292 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
293 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
294 SSPUSHINT(rex->offs[p].end);
295 SSPUSHINT(rex->offs[p].start);
296 SSPUSHINT(rex->offs[p].start_tmp);
297 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
298 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
300 (IV)rex->offs[p].start,
301 (IV)rex->offs[p].start_tmp,
305 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
306 SSPUSHINT(maxopenparen);
307 SSPUSHINT(rex->lastparen);
308 SSPUSHINT(rex->lastcloseparen);
309 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
314 /* These are needed since we do not localize EVAL nodes: */
315 #define REGCP_SET(cp) \
317 PerlIO_printf(Perl_debug_log, \
318 " Setting an EVAL scope, savestack=%"IVdf"\n", \
319 (IV)PL_savestack_ix)); \
322 #define REGCP_UNWIND(cp) \
324 if (cp != PL_savestack_ix) \
325 PerlIO_printf(Perl_debug_log, \
326 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
327 (IV)(cp), (IV)PL_savestack_ix)); \
330 #define UNWIND_PAREN(lp, lcp) \
331 for (n = rex->lastparen; n > lp; n--) \
332 rex->offs[n].end = -1; \
333 rex->lastparen = n; \
334 rex->lastcloseparen = lcp;
338 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
343 GET_RE_DEBUG_FLAGS_DECL;
345 PERL_ARGS_ASSERT_REGCPPOP;
347 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
349 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
350 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
351 rex->lastcloseparen = SSPOPINT;
352 rex->lastparen = SSPOPINT;
353 *maxopenparen_p = SSPOPINT;
355 i -= REGCP_OTHER_ELEMS;
356 /* Now restore the parentheses context. */
358 if (i || rex->lastparen + 1 <= rex->nparens)
359 PerlIO_printf(Perl_debug_log,
360 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
365 paren = *maxopenparen_p;
366 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
368 rex->offs[paren].start_tmp = SSPOPINT;
369 rex->offs[paren].start = SSPOPINT;
371 if (paren <= rex->lastparen)
372 rex->offs[paren].end = tmps;
373 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
374 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
376 (IV)rex->offs[paren].start,
377 (IV)rex->offs[paren].start_tmp,
378 (IV)rex->offs[paren].end,
379 (paren > rex->lastparen ? "(skipped)" : ""));
384 /* It would seem that the similar code in regtry()
385 * already takes care of this, and in fact it is in
386 * a better location to since this code can #if 0-ed out
387 * but the code in regtry() is needed or otherwise tests
388 * requiring null fields (pat.t#187 and split.t#{13,14}
389 * (as of patchlevel 7877) will fail. Then again,
390 * this code seems to be necessary or otherwise
391 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
392 * --jhi updated by dapm */
393 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
394 if (i > *maxopenparen_p)
395 rex->offs[i].start = -1;
396 rex->offs[i].end = -1;
397 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
398 " \\%"UVuf": %s ..-1 undeffing\n",
400 (i > *maxopenparen_p) ? "-1" : " "
406 /* restore the parens and associated vars at savestack position ix,
407 * but without popping the stack */
410 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
412 I32 tmpix = PL_savestack_ix;
413 PL_savestack_ix = ix;
414 regcppop(rex, maxopenparen_p);
415 PL_savestack_ix = tmpix;
418 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
421 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
423 /* Returns a boolean as to whether or not 'character' is a member of the
424 * Posix character class given by 'classnum' that should be equivalent to a
425 * value in the typedef '_char_class_number'.
427 * Ideally this could be replaced by a just an array of function pointers
428 * to the C library functions that implement the macros this calls.
429 * However, to compile, the precise function signatures are required, and
430 * these may vary from platform to to platform. To avoid having to figure
431 * out what those all are on each platform, I (khw) am using this method,
432 * which adds an extra layer of function call overhead (unless the C
433 * optimizer strips it away). But we don't particularly care about
434 * performance with locales anyway. */
436 switch ((_char_class_number) classnum) {
437 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
438 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
439 case _CC_ENUM_ASCII: return isASCII_LC(character);
440 case _CC_ENUM_BLANK: return isBLANK_LC(character);
441 case _CC_ENUM_CASED: return isLOWER_LC(character)
442 || isUPPER_LC(character);
443 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
444 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
445 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
446 case _CC_ENUM_LOWER: return isLOWER_LC(character);
447 case _CC_ENUM_PRINT: return isPRINT_LC(character);
448 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
449 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
450 case _CC_ENUM_SPACE: return isSPACE_LC(character);
451 case _CC_ENUM_UPPER: return isUPPER_LC(character);
452 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
453 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
454 default: /* VERTSPACE should never occur in locales */
455 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
458 assert(0); /* NOTREACHED */
463 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
465 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
466 * 'character' is a member of the Posix character class given by 'classnum'
467 * that should be equivalent to a value in the typedef
468 * '_char_class_number'.
470 * This just calls isFOO_lc on the code point for the character if it is in
471 * the range 0-255. Outside that range, all characters avoid Unicode
472 * rules, ignoring any locale. So use the Unicode function if this class
473 * requires a swash, and use the Unicode macro otherwise. */
475 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
477 if (UTF8_IS_INVARIANT(*character)) {
478 return isFOO_lc(classnum, *character);
480 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
481 return isFOO_lc(classnum,
482 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
485 if (classnum < _FIRST_NON_SWASH_CC) {
487 /* Initialize the swash unless done already */
488 if (! PL_utf8_swash_ptrs[classnum]) {
489 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
490 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
491 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
494 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
496 TRUE /* is UTF */ ));
499 switch ((_char_class_number) classnum) {
501 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
503 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
504 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
505 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
506 default: return 0; /* Things like CNTRL are always
510 assert(0); /* NOTREACHED */
515 * pregexec and friends
518 #ifndef PERL_IN_XSUB_RE
520 - pregexec - match a regexp against a string
523 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
524 char *strbeg, I32 minend, SV *screamer, U32 nosave)
525 /* stringarg: the point in the string at which to begin matching */
526 /* strend: pointer to null at end of string */
527 /* strbeg: real beginning of string */
528 /* minend: end of match must be >= minend bytes after stringarg. */
529 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
530 * itself is accessed via the pointers above */
531 /* nosave: For optimizations. */
533 PERL_ARGS_ASSERT_PREGEXEC;
536 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
537 nosave ? 0 : REXEC_COPY_STR);
542 * Need to implement the following flags for reg_anch:
544 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
546 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
547 * INTUIT_AUTORITATIVE_ML
548 * INTUIT_ONCE_NOML - Intuit can match in one location only.
551 * Another flag for this function: SECOND_TIME (so that float substrs
552 * with giant delta may be not rechecked).
555 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
557 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
558 Otherwise, only SvCUR(sv) is used to get strbeg. */
560 /* XXXX We assume that strpos is strbeg unless sv. */
562 /* XXXX Some places assume that there is a fixed substring.
563 An update may be needed if optimizer marks as "INTUITable"
564 RExen without fixed substrings. Similarly, it is assumed that
565 lengths of all the strings are no more than minlen, thus they
566 cannot come from lookahead.
567 (Or minlen should take into account lookahead.)
568 NOTE: Some of this comment is not correct. minlen does now take account
569 of lookahead/behind. Further research is required. -- demerphq
573 /* A failure to find a constant substring means that there is no need to make
574 an expensive call to REx engine, thus we celebrate a failure. Similarly,
575 finding a substring too deep into the string means that fewer calls to
576 regtry() should be needed.
578 REx compiler's optimizer found 4 possible hints:
579 a) Anchored substring;
581 c) Whether we are anchored (beginning-of-line or \G);
582 d) First node (of those at offset 0) which may distinguish positions;
583 We use a)b)d) and multiline-part of c), and try to find a position in the
584 string which does not contradict any of them.
587 /* Most of decisions we do here should have been done at compile time.
588 The nodes of the REx which we used for the search should have been
589 deleted from the finite automaton. */
592 * rx: the regex to match against
593 * sv: the SV being matched: only used for utf8 flag; the string
594 * itself is accessed via the pointers below. Note that on
595 * something like an overloaded SV, SvPOK(sv) may be false
596 * and the string pointers may point to something unrelated to
598 * strbeg: real beginning of string
599 * strpos: the point in the string at which to begin matching
600 * strend: pointer to the byte following the last char of the string
601 * flags currently unused; set to 0
602 * data: currently unused; set to NULL
606 Perl_re_intuit_start(pTHX_
609 const char * const strbeg,
613 re_scream_pos_data *data)
616 struct regexp *const prog = ReANY(rx);
618 /* Should be nonnegative! */
623 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
625 char *other_last = NULL; /* other substr checked before this */
626 char *check_at = NULL; /* check substr found at this pos */
627 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
628 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
629 RXi_GET_DECL(prog,progi);
631 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
632 regmatch_info *const reginfo = ®info_buf;
634 const char * const i_strpos = strpos;
636 GET_RE_DEBUG_FLAGS_DECL;
638 PERL_ARGS_ASSERT_RE_INTUIT_START;
639 PERL_UNUSED_ARG(flags);
640 PERL_UNUSED_ARG(data);
642 RX_MATCH_UTF8_set(rx,utf8_target);
644 is_utf8_pat = cBOOL(RX_UTF8(rx));
646 /* CHR_DIST() would be more correct here but it makes things slow. */
647 if (prog->minlen > strend - strpos) {
648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
649 "String too short... [re_intuit_start]\n"));
653 PL_bostr = (char *)strbeg;
655 reginfo->strend = strend;
656 reginfo->is_utf8_pat = is_utf8_pat;
660 if (!prog->check_utf8 && prog->check_substr)
661 to_utf8_substr(prog);
662 check = prog->check_utf8;
664 if (!prog->check_substr && prog->check_utf8) {
665 if (! to_byte_substr(prog)) {
666 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
669 check = prog->check_substr;
671 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
672 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
673 || ( (prog->extflags & RXf_ANCH_BOL)
674 && !multiline ) ); /* Check after \n? */
677 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
678 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
679 && (strpos != strbeg)) {
680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
683 if (prog->check_offset_min == prog->check_offset_max
684 && !(prog->extflags & RXf_CANY_SEEN)
685 && ! multiline) /* /m can cause \n's to match that aren't
686 accounted for in the string max length.
687 See [perl #115242] */
689 /* Substring at constant offset from beg-of-str... */
692 s = HOP3c(strpos, prog->check_offset_min, strend);
695 slen = SvCUR(check); /* >= 1 */
697 if ( strend - s > slen || strend - s < slen - 1
698 || (strend - s == slen && strend[-1] != '\n')) {
699 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
702 /* Now should match s[0..slen-2] */
704 if (slen && (*SvPVX_const(check) != *s
706 && memNE(SvPVX_const(check), s, slen)))) {
708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
712 else if (*SvPVX_const(check) != *s
713 || ((slen = SvCUR(check)) > 1
714 && memNE(SvPVX_const(check), s, slen)))
717 goto success_at_start;
720 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
722 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
723 end_shift = prog->check_end_shift;
726 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
727 - (SvTAIL(check) != 0);
728 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
730 if (end_shift < eshift)
734 else { /* Can match at random position */
737 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
738 end_shift = prog->check_end_shift;
740 /* end shift should be non negative here */
743 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
745 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
746 (IV)end_shift, RX_PRECOMP(prog));
750 /* Find a possible match in the region s..strend by looking for
751 the "check" substring in the region corrected by start/end_shift. */
754 I32 srch_start_shift = start_shift;
755 I32 srch_end_shift = end_shift;
758 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
759 srch_end_shift -= ((strbeg - s) - srch_start_shift);
760 srch_start_shift = strbeg - s;
762 DEBUG_OPTIMISE_MORE_r({
763 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
764 (IV)prog->check_offset_min,
765 (IV)srch_start_shift,
767 (IV)prog->check_end_shift);
770 if (prog->extflags & RXf_CANY_SEEN) {
771 start_point= (U8*)(s + srch_start_shift);
772 end_point= (U8*)(strend - srch_end_shift);
774 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
775 end_point= HOP3(strend, -srch_end_shift, strbeg);
777 DEBUG_OPTIMISE_MORE_r({
778 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
779 (int)(end_point - start_point),
780 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
784 s = fbm_instr( start_point, end_point,
785 check, multiline ? FBMrf_MULTILINE : 0);
787 /* Update the count-of-usability, remove useless subpatterns,
791 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
792 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
793 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
794 (s ? "Found" : "Did not find"),
795 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
796 ? "anchored" : "floating"),
799 (s ? " at offset " : "...\n") );
804 /* Finish the diagnostic message */
805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
807 /* XXX dmq: first branch is for positive lookbehind...
808 Our check string is offset from the beginning of the pattern.
809 So we need to do any stclass tests offset forward from that
818 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
819 Start with the other substr.
820 XXXX no SCREAM optimization yet - and a very coarse implementation
821 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
822 *always* match. Probably should be marked during compile...
823 Probably it is right to do no SCREAM here...
826 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
827 : (prog->float_substr && prog->anchored_substr))
829 /* Take into account the "other" substring. */
830 /* XXXX May be hopelessly wrong for UTF... */
833 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
836 char * const last = HOP3c(s, -start_shift, strbeg);
838 char * const saved_s = s;
841 t = s - prog->check_offset_max;
842 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
844 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
849 t = HOP3c(t, prog->anchored_offset, strend);
850 if (t < other_last) /* These positions already checked */
852 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
855 /* XXXX It is not documented what units *_offsets are in.
856 We assume bytes, but this is clearly wrong.
857 Meaning this code needs to be carefully reviewed for errors.
861 /* On end-of-str: see comment below. */
862 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
863 if (must == &PL_sv_undef) {
865 DEBUG_r(must = prog->anchored_utf8); /* for debug */
870 HOP3(HOP3(last1, prog->anchored_offset, strend)
871 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
873 multiline ? FBMrf_MULTILINE : 0
876 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
877 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
878 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
879 (s ? "Found" : "Contradicts"),
880 quoted, RE_SV_TAIL(must));
885 if (last1 >= last2) {
886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
887 ", giving up...\n"));
890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
891 ", trying floating at offset %ld...\n",
892 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
893 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
894 s = HOP3c(last, 1, strend);
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
899 (long)(s - i_strpos)));
900 t = HOP3c(s, -prog->anchored_offset, strbeg);
901 other_last = HOP3c(s, 1, strend);
909 else { /* Take into account the floating substring. */
911 char * const saved_s = s;
914 t = HOP3c(s, -start_shift, strbeg);
916 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
917 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
918 last = HOP3c(t, prog->float_max_offset, strend);
919 s = HOP3c(t, prog->float_min_offset, strend);
922 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
923 must = utf8_target ? prog->float_utf8 : prog->float_substr;
924 /* fbm_instr() takes into account exact value of end-of-str
925 if the check is SvTAIL(ed). Since false positives are OK,
926 and end-of-str is not later than strend we are OK. */
927 if (must == &PL_sv_undef) {
929 DEBUG_r(must = prog->float_utf8); /* for debug message */
932 s = fbm_instr((unsigned char*)s,
933 (unsigned char*)last + SvCUR(must)
935 must, multiline ? FBMrf_MULTILINE : 0);
937 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
938 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
939 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
940 (s ? "Found" : "Contradicts"),
941 quoted, RE_SV_TAIL(must));
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 ", giving up...\n"));
949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950 ", trying anchored starting at offset %ld...\n",
951 (long)(saved_s + 1 - i_strpos)));
953 s = HOP3c(t, 1, strend);
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
958 (long)(s - i_strpos)));
959 other_last = s; /* Fix this later. --Hugo */
969 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
971 DEBUG_OPTIMISE_MORE_r(
972 PerlIO_printf(Perl_debug_log,
973 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
974 (IV)prog->check_offset_min,
975 (IV)prog->check_offset_max,
983 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
985 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
988 /* Fixed substring is found far enough so that the match
989 cannot start at strpos. */
991 if (ml_anch && t[-1] != '\n') {
992 /* Eventually fbm_*() should handle this, but often
993 anchored_offset is not 0, so this check will not be wasted. */
994 /* XXXX In the code below we prefer to look for "^" even in
995 presence of anchored substrings. And we search even
996 beyond the found float position. These pessimizations
997 are historical artefacts only. */
999 while (t < strend - prog->minlen) {
1001 if (t < check_at - prog->check_offset_min) {
1002 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003 /* Since we moved from the found position,
1004 we definitely contradict the found anchored
1005 substr. Due to the above check we do not
1006 contradict "check" substr.
1007 Thus we can arrive here only if check substr
1008 is float. Redo checking for "other"=="fixed".
1011 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013 goto do_other_anchored;
1015 /* We don't contradict the found floating substring. */
1016 /* XXXX Why not check for STCLASS? */
1018 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1022 /* Position contradicts check-string */
1023 /* XXXX probably better to look for check-string
1024 than for "\n", so one should lower the limit for t? */
1025 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027 other_last = strpos = s = t + 1;
1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033 PL_colors[0], PL_colors[1]));
1037 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038 PL_colors[0], PL_colors[1]));
1042 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1045 /* The found string does not prohibit matching at strpos,
1046 - no optimization of calling REx engine can be performed,
1047 unless it was an MBOL and we are not after MBOL,
1048 or a future STCLASS check will fail this. */
1050 /* Even in this situation we may use MBOL flag if strpos is offset
1051 wrt the start of the string. */
1052 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053 /* May be due to an implicit anchor of m{.*foo} */
1054 && !(prog->intflags & PREGf_IMPLICIT))
1059 DEBUG_EXECUTE_r( if (ml_anch)
1060 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1064 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1066 prog->check_utf8 /* Could be deleted already */
1067 && --BmUSEFUL(prog->check_utf8) < 0
1068 && (prog->check_utf8 == prog->float_utf8)
1070 prog->check_substr /* Could be deleted already */
1071 && --BmUSEFUL(prog->check_substr) < 0
1072 && (prog->check_substr == prog->float_substr)
1075 /* If flags & SOMETHING - do not do it many times on the same match */
1076 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077 /* XXX Does the destruction order has to change with utf8_target? */
1078 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1081 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1082 check = NULL; /* abort */
1084 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086 if (prog->intflags & PREGf_IMPLICIT)
1087 prog->extflags &= ~RXf_ANCH_MBOL;
1088 /* XXXX This is a remnant of the old implementation. It
1089 looks wasteful, since now INTUIT can use many
1090 other heuristics. */
1091 prog->extflags &= ~RXf_USE_INTUIT;
1092 /* XXXX What other flags might need to be cleared in this branch? */
1098 /* Last resort... */
1099 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100 /* trie stclasses are too expensive to use here, we are better off to
1101 leave it to regmatch itself */
1102 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103 /* minlen == 0 is possible if regstclass is \b or \B,
1104 and the fixed substr is ''$.
1105 Since minlen is already taken into account, s+1 is before strend;
1106 accidentally, minlen >= 1 guaranties no false positives at s + 1
1107 even for \b or \B. But (minlen? 1 : 0) below assumes that
1108 regstclass does not come from lookahead... */
1109 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1111 const U8* const str = (U8*)STRING(progi->regstclass);
1112 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1113 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1116 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1117 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1118 else if (prog->float_substr || prog->float_utf8)
1119 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123 if (checked_upto < s)
1125 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1126 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1129 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1135 const char *what = NULL;
1137 if (endpos == strend) {
1138 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1139 "Could not match STCLASS...\n") );
1142 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143 "This position contradicts STCLASS...\n") );
1144 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1146 checked_upto = HOPBACKc(endpos, start_shift);
1147 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1148 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1149 /* Contradict one of substrings */
1150 if (prog->anchored_substr || prog->anchored_utf8) {
1151 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1152 DEBUG_EXECUTE_r( what = "anchored" );
1154 s = HOP3c(t, 1, strend);
1155 if (s + start_shift + end_shift > strend) {
1156 /* XXXX Should be taken into account earlier? */
1157 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1158 "Could not match STCLASS...\n") );
1163 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1164 "Looking for %s substr starting at offset %ld...\n",
1165 what, (long)(s + start_shift - i_strpos)) );
1168 /* Have both, check_string is floating */
1169 if (t + start_shift >= check_at) /* Contradicts floating=check */
1170 goto retry_floating_check;
1171 /* Recheck anchored substring, but not floating... */
1175 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1176 "Looking for anchored substr starting at offset %ld...\n",
1177 (long)(other_last - i_strpos)) );
1178 goto do_other_anchored;
1180 /* Another way we could have checked stclass at the
1181 current position only: */
1186 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1187 "Looking for /%s^%s/m starting at offset %ld...\n",
1188 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1191 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1193 /* Check is floating substring. */
1194 retry_floating_check:
1195 t = check_at - start_shift;
1196 DEBUG_EXECUTE_r( what = "floating" );
1197 goto hop_and_restart;
1200 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1201 "By STCLASS: moving %ld --> %ld\n",
1202 (long)(t - i_strpos), (long)(s - i_strpos))
1206 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1207 "Does not contradict STCLASS...\n");
1212 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1213 PL_colors[4], (check ? "Guessed" : "Giving up"),
1214 PL_colors[5], (long)(s - i_strpos)) );
1217 fail_finish: /* Substring not found */
1218 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1219 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1222 PL_colors[4], PL_colors[5]));
1226 #define DECL_TRIE_TYPE(scan) \
1227 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1228 trie_type = ((scan->flags == EXACT) \
1229 ? (utf8_target ? trie_utf8 : trie_plain) \
1230 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1232 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1235 switch (trie_type) { \
1236 case trie_utf8_fold: \
1237 if ( foldlen>0 ) { \
1238 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1243 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1244 len = UTF8SKIP(uc); \
1245 skiplen = UNISKIP( uvc ); \
1246 foldlen -= skiplen; \
1247 uscan = foldbuf + skiplen; \
1250 case trie_latin_utf8_fold: \
1251 if ( foldlen>0 ) { \
1252 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1258 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
1259 skiplen = UNISKIP( uvc ); \
1260 foldlen -= skiplen; \
1261 uscan = foldbuf + skiplen; \
1265 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1272 charid = trie->charmap[ uvc ]; \
1276 if (widecharmap) { \
1277 SV** const svpp = hv_fetch(widecharmap, \
1278 (char*)&uvc, sizeof(UV), 0); \
1280 charid = (U16)SvIV(*svpp); \
1285 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1289 && (ln == 1 || folder(s, pat_string, ln)) \
1290 && (reginfo->intuit || regtry(reginfo, &s)) )\
1296 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1298 while (s < strend) { \
1304 #define REXEC_FBC_SCAN(CoDe) \
1306 while (s < strend) { \
1312 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1313 REXEC_FBC_UTF8_SCAN( \
1315 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1324 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1327 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1336 #define REXEC_FBC_TRYIT \
1337 if ((reginfo->intuit || regtry(reginfo, &s))) \
1340 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1341 if (utf8_target) { \
1342 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1345 REXEC_FBC_CLASS_SCAN(CoNd); \
1348 #define DUMP_EXEC_POS(li,s,doutf8) \
1349 dump_exec_pos(li,s,(reginfo->strend),(PL_bostr),(PL_reg_starttry),doutf8)
1352 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1353 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1354 tmp = TEST_NON_UTF8(tmp); \
1355 REXEC_FBC_UTF8_SCAN( \
1356 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1365 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1366 if (s == PL_bostr) { \
1370 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1371 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1374 LOAD_UTF8_CHARCLASS_ALNUM(); \
1375 REXEC_FBC_UTF8_SCAN( \
1376 if (tmp == ! (TeSt2_UtF8)) { \
1385 /* The only difference between the BOUND and NBOUND cases is that
1386 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1387 * NBOUND. This is accomplished by passing it in either the if or else clause,
1388 * with the other one being empty */
1389 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1392 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1395 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1398 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1402 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1403 * be passed in completely with the variable name being tested, which isn't
1404 * such a clean interface, but this is easier to read than it was before. We
1405 * are looking for the boundary (or non-boundary between a word and non-word
1406 * character. The utf8 and non-utf8 cases have the same logic, but the details
1407 * must be different. Find the "wordness" of the character just prior to this
1408 * one, and compare it with the wordness of this one. If they differ, we have
1409 * a boundary. At the beginning of the string, pretend that the previous
1410 * character was a new-line */
1411 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1412 if (utf8_target) { \
1415 else { /* Not utf8 */ \
1416 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1417 tmp = TEST_NON_UTF8(tmp); \
1419 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1428 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1431 /* We know what class REx starts with. Try to find this position... */
1432 /* if reginfo->intuit, its a dryrun */
1433 /* annoyingly all the vars in this routine have different names from their counterparts
1434 in regmatch. /grrr */
1437 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1438 const char *strend, regmatch_info *reginfo)
1441 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1442 char *pat_string; /* The pattern's exactish string */
1443 char *pat_end; /* ptr to end char of pat_string */
1444 re_fold_t folder; /* Function for computing non-utf8 folds */
1445 const U8 *fold_array; /* array for folding ords < 256 */
1451 I32 tmp = 1; /* Scratch variable? */
1452 const bool utf8_target = PL_reg_match_utf8;
1453 UV utf8_fold_flags = 0;
1454 const bool is_utf8_pat = reginfo->is_utf8_pat;
1455 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1456 with a result inverts that result, as 0^1 =
1458 _char_class_number classnum;
1460 RXi_GET_DECL(prog,progi);
1462 PERL_ARGS_ASSERT_FIND_BYCLASS;
1464 /* We know what class it must start with. */
1467 case ANYOF_SYNTHETIC:
1468 case ANYOF_WARN_SUPER:
1470 REXEC_FBC_UTF8_CLASS_SCAN(
1471 reginclass(prog, c, (U8*)s, utf8_target));
1474 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1479 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1487 if (is_utf8_pat || utf8_target) {
1488 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1489 goto do_exactf_utf8;
1491 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1492 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1493 goto do_exactf_non_utf8; /* isn't dealt with by these */
1498 /* regcomp.c already folded this if pattern is in UTF-8 */
1499 utf8_fold_flags = 0;
1500 goto do_exactf_utf8;
1502 fold_array = PL_fold;
1504 goto do_exactf_non_utf8;
1507 if (is_utf8_pat || utf8_target) {
1508 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1509 goto do_exactf_utf8;
1511 fold_array = PL_fold_locale;
1512 folder = foldEQ_locale;
1513 goto do_exactf_non_utf8;
1517 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1519 goto do_exactf_utf8;
1521 case EXACTFU_TRICKYFOLD:
1523 if (is_utf8_pat || utf8_target) {
1524 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1525 goto do_exactf_utf8;
1528 /* Any 'ss' in the pattern should have been replaced by regcomp,
1529 * so we don't have to worry here about this single special case
1530 * in the Latin1 range */
1531 fold_array = PL_fold_latin1;
1532 folder = foldEQ_latin1;
1536 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1537 are no glitches with fold-length differences
1538 between the target string and pattern */
1540 /* The idea in the non-utf8 EXACTF* cases is to first find the
1541 * first character of the EXACTF* node and then, if necessary,
1542 * case-insensitively compare the full text of the node. c1 is the
1543 * first character. c2 is its fold. This logic will not work for
1544 * Unicode semantics and the german sharp ss, which hence should
1545 * not be compiled into a node that gets here. */
1546 pat_string = STRING(c);
1547 ln = STR_LEN(c); /* length to match in octets/bytes */
1549 /* We know that we have to match at least 'ln' bytes (which is the
1550 * same as characters, since not utf8). If we have to match 3
1551 * characters, and there are only 2 availabe, we know without
1552 * trying that it will fail; so don't start a match past the
1553 * required minimum number from the far end */
1554 e = HOP3c(strend, -((I32)ln), s);
1556 if (reginfo->intuit && e < s) {
1557 e = s; /* Due to minlen logic of intuit() */
1561 c2 = fold_array[c1];
1562 if (c1 == c2) { /* If char and fold are the same */
1563 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1566 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1574 /* If one of the operands is in utf8, we can't use the simpler folding
1575 * above, due to the fact that many different characters can have the
1576 * same fold, or portion of a fold, or different- length fold */
1577 pat_string = STRING(c);
1578 ln = STR_LEN(c); /* length to match in octets/bytes */
1579 pat_end = pat_string + ln;
1580 lnc = is_utf8_pat /* length to match in characters */
1581 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1584 /* We have 'lnc' characters to match in the pattern, but because of
1585 * multi-character folding, each character in the target can match
1586 * up to 3 characters (Unicode guarantees it will never exceed
1587 * this) if it is utf8-encoded; and up to 2 if not (based on the
1588 * fact that the Latin 1 folds are already determined, and the
1589 * only multi-char fold in that range is the sharp-s folding to
1590 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1591 * string character. Adjust lnc accordingly, rounding up, so that
1592 * if we need to match at least 4+1/3 chars, that really is 5. */
1593 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1594 lnc = (lnc + expansion - 1) / expansion;
1596 /* As in the non-UTF8 case, if we have to match 3 characters, and
1597 * only 2 are left, it's guaranteed to fail, so don't start a
1598 * match that would require us to go beyond the end of the string
1600 e = HOP3c(strend, -((I32)lnc), s);
1602 if (reginfo->intuit && e < s) {
1603 e = s; /* Due to minlen logic of intuit() */
1606 /* XXX Note that we could recalculate e to stop the loop earlier,
1607 * as the worst case expansion above will rarely be met, and as we
1608 * go along we would usually find that e moves further to the left.
1609 * This would happen only after we reached the point in the loop
1610 * where if there were no expansion we should fail. Unclear if
1611 * worth the expense */
1614 char *my_strend= (char *)strend;
1615 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1616 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1617 && (reginfo->intuit || regtry(reginfo, &s)) )
1621 s += (utf8_target) ? UTF8SKIP(s) : 1;
1626 RXp_MATCH_TAINTED_on(prog);
1627 FBC_BOUND(isWORDCHAR_LC,
1628 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1629 isWORDCHAR_LC_utf8((U8*)s));
1632 RXp_MATCH_TAINTED_on(prog);
1633 FBC_NBOUND(isWORDCHAR_LC,
1634 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1635 isWORDCHAR_LC_utf8((U8*)s));
1638 FBC_BOUND(isWORDCHAR,
1639 isWORDCHAR_uni(tmp),
1640 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1643 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1645 isWORDCHAR_A((U8*)s));
1648 FBC_NBOUND(isWORDCHAR,
1649 isWORDCHAR_uni(tmp),
1650 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1653 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1655 isWORDCHAR_A((U8*)s));
1658 FBC_BOUND(isWORDCHAR_L1,
1659 isWORDCHAR_uni(tmp),
1660 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1663 FBC_NBOUND(isWORDCHAR_L1,
1664 isWORDCHAR_uni(tmp),
1665 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1668 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1669 is_LNBREAK_latin1_safe(s, strend)
1673 /* The argument to all the POSIX node types is the class number to pass to
1674 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1681 RXp_MATCH_TAINTED_on(prog);
1682 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1683 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1698 /* The complement of something that matches only ASCII matches all
1699 * UTF-8 variant code points, plus everything in ASCII that isn't
1701 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1702 || ! _generic_isCC_A(*s, FLAGS(c)));
1711 /* Don't need to worry about utf8, as it can match only a single
1712 * byte invariant character. */
1713 REXEC_FBC_CLASS_SCAN(
1714 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1722 if (! utf8_target) {
1723 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1729 classnum = (_char_class_number) FLAGS(c);
1730 if (classnum < _FIRST_NON_SWASH_CC) {
1731 while (s < strend) {
1733 /* We avoid loading in the swash as long as possible, but
1734 * should we have to, we jump to a separate loop. This
1735 * extra 'if' statement is what keeps this code from being
1736 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1737 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1738 goto found_above_latin1;
1740 if ((UTF8_IS_INVARIANT(*s)
1741 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1743 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1744 && to_complement ^ cBOOL(
1745 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1748 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1760 else switch (classnum) { /* These classes are implemented as
1762 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1763 revert the change of \v matching this */
1766 case _CC_ENUM_PSXSPC:
1767 REXEC_FBC_UTF8_CLASS_SCAN(
1768 to_complement ^ cBOOL(isSPACE_utf8(s)));
1771 case _CC_ENUM_BLANK:
1772 REXEC_FBC_UTF8_CLASS_SCAN(
1773 to_complement ^ cBOOL(isBLANK_utf8(s)));
1776 case _CC_ENUM_XDIGIT:
1777 REXEC_FBC_UTF8_CLASS_SCAN(
1778 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1781 case _CC_ENUM_VERTSPACE:
1782 REXEC_FBC_UTF8_CLASS_SCAN(
1783 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1786 case _CC_ENUM_CNTRL:
1787 REXEC_FBC_UTF8_CLASS_SCAN(
1788 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1792 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1793 assert(0); /* NOTREACHED */
1798 found_above_latin1: /* Here we have to load a swash to get the result
1799 for the current code point */
1800 if (! PL_utf8_swash_ptrs[classnum]) {
1801 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1802 PL_utf8_swash_ptrs[classnum] =
1803 _core_swash_init("utf8", swash_property_names[classnum],
1804 &PL_sv_undef, 1, 0, NULL, &flags);
1807 /* This is a copy of the loop above for swash classes, though using the
1808 * FBC macro instead of being expanded out. Since we've loaded the
1809 * swash, we don't have to check for that each time through the loop */
1810 REXEC_FBC_UTF8_CLASS_SCAN(
1811 to_complement ^ cBOOL(_generic_utf8(
1814 swash_fetch(PL_utf8_swash_ptrs[classnum],
1822 /* what trie are we using right now */
1823 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1824 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1825 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1827 const char *last_start = strend - trie->minlen;
1829 const char *real_start = s;
1831 STRLEN maxlen = trie->maxlen;
1833 U8 **points; /* map of where we were in the input string
1834 when reading a given char. For ASCII this
1835 is unnecessary overhead as the relationship
1836 is always 1:1, but for Unicode, especially
1837 case folded Unicode this is not true. */
1838 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1842 GET_RE_DEBUG_FLAGS_DECL;
1844 /* We can't just allocate points here. We need to wrap it in
1845 * an SV so it gets freed properly if there is a croak while
1846 * running the match */
1849 sv_points=newSV(maxlen * sizeof(U8 *));
1850 SvCUR_set(sv_points,
1851 maxlen * sizeof(U8 *));
1852 SvPOK_on(sv_points);
1853 sv_2mortal(sv_points);
1854 points=(U8**)SvPV_nolen(sv_points );
1855 if ( trie_type != trie_utf8_fold
1856 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1859 bitmap=(U8*)trie->bitmap;
1861 bitmap=(U8*)ANYOF_BITMAP(c);
1863 /* this is the Aho-Corasick algorithm modified a touch
1864 to include special handling for long "unknown char" sequences.
1865 The basic idea being that we use AC as long as we are dealing
1866 with a possible matching char, when we encounter an unknown char
1867 (and we have not encountered an accepting state) we scan forward
1868 until we find a legal starting char.
1869 AC matching is basically that of trie matching, except that when
1870 we encounter a failing transition, we fall back to the current
1871 states "fail state", and try the current char again, a process
1872 we repeat until we reach the root state, state 1, or a legal
1873 transition. If we fail on the root state then we can either
1874 terminate if we have reached an accepting state previously, or
1875 restart the entire process from the beginning if we have not.
1878 while (s <= last_start) {
1879 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1887 U8 *uscan = (U8*)NULL;
1888 U8 *leftmost = NULL;
1890 U32 accepted_word= 0;
1894 while ( state && uc <= (U8*)strend ) {
1896 U32 word = aho->states[ state ].wordnum;
1900 DEBUG_TRIE_EXECUTE_r(
1901 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1902 dump_exec_pos( (char *)uc, c, strend, real_start,
1903 (char *)uc, utf8_target );
1904 PerlIO_printf( Perl_debug_log,
1905 " Scanning for legal start char...\n");
1909 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1913 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 if (uc >(U8*)last_start) break;
1923 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1924 if (!leftmost || lpos < leftmost) {
1925 DEBUG_r(accepted_word=word);
1931 points[pointpos++ % maxlen]= uc;
1932 if (foldlen || uc < (U8*)strend) {
1933 REXEC_TRIE_READ_CHAR(trie_type, trie,
1935 uscan, len, uvc, charid, foldlen,
1937 DEBUG_TRIE_EXECUTE_r({
1938 dump_exec_pos( (char *)uc, c, strend,
1939 real_start, s, utf8_target);
1940 PerlIO_printf(Perl_debug_log,
1941 " Charid:%3u CP:%4"UVxf" ",
1953 word = aho->states[ state ].wordnum;
1955 base = aho->states[ state ].trans.base;
1957 DEBUG_TRIE_EXECUTE_r({
1959 dump_exec_pos( (char *)uc, c, strend, real_start,
1961 PerlIO_printf( Perl_debug_log,
1962 "%sState: %4"UVxf", word=%"UVxf,
1963 failed ? " Fail transition to " : "",
1964 (UV)state, (UV)word);
1970 ( ((offset = base + charid
1971 - 1 - trie->uniquecharcount)) >= 0)
1972 && ((U32)offset < trie->lasttrans)
1973 && trie->trans[offset].check == state
1974 && (tmp=trie->trans[offset].next))
1976 DEBUG_TRIE_EXECUTE_r(
1977 PerlIO_printf( Perl_debug_log," - legal\n"));
1982 DEBUG_TRIE_EXECUTE_r(
1983 PerlIO_printf( Perl_debug_log," - fail\n"));
1985 state = aho->fail[state];
1989 /* we must be accepting here */
1990 DEBUG_TRIE_EXECUTE_r(
1991 PerlIO_printf( Perl_debug_log," - accepting\n"));
2000 if (!state) state = 1;
2003 if ( aho->states[ state ].wordnum ) {
2004 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2005 if (!leftmost || lpos < leftmost) {
2006 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2011 s = (char*)leftmost;
2012 DEBUG_TRIE_EXECUTE_r({
2014 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2015 (UV)accepted_word, (IV)(s - real_start)
2018 if (reginfo->intuit || regtry(reginfo, &s)) {
2024 DEBUG_TRIE_EXECUTE_r({
2025 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2028 DEBUG_TRIE_EXECUTE_r(
2029 PerlIO_printf( Perl_debug_log,"No match.\n"));
2038 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2048 - regexec_flags - match a regexp against a string
2051 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2052 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2053 /* stringarg: the point in the string at which to begin matching */
2054 /* strend: pointer to null at end of string */
2055 /* strbeg: real beginning of string */
2056 /* minend: end of match must be >= minend bytes after stringarg. */
2057 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2058 * itself is accessed via the pointers above */
2059 /* data: May be used for some additional optimizations.
2060 Currently its only used, with a U32 cast, for transmitting
2061 the ganch offset when doing a /g match. This will change */
2062 /* nosave: For optimizations. */
2066 struct regexp *const prog = ReANY(rx);
2069 char *startpos = stringarg;
2070 I32 minlen; /* must match at least this many chars */
2071 I32 dontbother = 0; /* how many characters not to try at end */
2072 I32 end_shift = 0; /* Same for the end. */ /* CC */
2073 I32 scream_pos = -1; /* Internal iterator of scream. */
2074 char *scream_olds = NULL;
2075 const bool utf8_target = cBOOL(DO_UTF8(sv));
2077 RXi_GET_DECL(prog,progi);
2078 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2079 regmatch_info *const reginfo = ®info_buf;
2080 regexp_paren_pair *swap = NULL;
2081 GET_RE_DEBUG_FLAGS_DECL;
2083 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2084 PERL_UNUSED_ARG(data);
2086 /* Be paranoid... */
2087 if (prog == NULL || startpos == NULL) {
2088 Perl_croak(aTHX_ "NULL regexp parameter");
2092 multiline = prog->extflags & RXf_PMf_MULTILINE;
2093 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2094 reginfo->intuit = 0;
2096 RX_MATCH_UTF8_set(rx, utf8_target);
2098 debug_start_match(rx, utf8_target, startpos, strend,
2102 minlen = prog->minlen;
2104 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2105 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2106 "String too short [regexec_flags]...\n"));
2111 /* Check validity of program. */
2112 if (UCHARAT(progi->program) != REG_MAGIC) {
2113 Perl_croak(aTHX_ "corrupted regexp program");
2116 RX_MATCH_TAINTED_off(rx);
2117 PL_reg_state.re_state_eval_setup_done = FALSE;
2120 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2121 reginfo->warned = FALSE;
2122 /* Mark beginning of line for ^ and lookbehind. */
2123 reginfo->bol = startpos; /* XXX not used ??? */
2127 /* Mark end of string for $ (and such) */
2128 reginfo->strend = strend;
2130 /* see how far we have to get to not match where we matched before */
2131 reginfo->till = startpos+minend;
2133 /* If there is a "must appear" string, look for it. */
2136 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2138 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2139 reginfo->ganch = startpos + prog->gofs;
2140 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2141 "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2142 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2144 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2145 && mg->mg_len >= 0) {
2146 reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2147 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2148 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2150 if (prog->extflags & RXf_ANCH_GPOS) {
2151 if (s > reginfo->ganch)
2153 s = reginfo->ganch - prog->gofs;
2154 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2155 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2161 reginfo->ganch = strbeg + PTR2UV(data);
2162 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2163 "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2165 } else { /* pos() not defined */
2166 reginfo->ganch = strbeg;
2167 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2168 "GPOS: reginfo->ganch = strbeg\n"));
2171 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2172 /* We have to be careful. If the previous successful match
2173 was from this regex we don't want a subsequent partially
2174 successful match to clobber the old results.
2175 So when we detect this possibility we add a swap buffer
2176 to the re, and switch the buffer each match. If we fail,
2177 we switch it back; otherwise we leave it swapped.
2180 /* do we need a save destructor here for eval dies? */
2181 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2182 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2183 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2189 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2190 re_scream_pos_data d;
2192 d.scream_olds = &scream_olds;
2193 d.scream_pos = &scream_pos;
2194 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2197 goto phooey; /* not present */
2203 /* Simplest case: anchored match need be tried only once. */
2204 /* [unless only anchor is BOL and multiline is set] */
2205 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2206 if (s == startpos && regtry(reginfo, &startpos))
2208 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2209 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2214 dontbother = minlen - 1;
2215 end = HOP3c(strend, -dontbother, strbeg) - 1;
2216 /* for multiline we only have to try after newlines */
2217 if (prog->check_substr || prog->check_utf8) {
2218 /* because of the goto we can not easily reuse the macros for bifurcating the
2219 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2222 goto after_try_utf8;
2224 if (regtry(reginfo, &s)) {
2231 if (prog->extflags & RXf_USE_INTUIT) {
2232 s = re_intuit_start(rx, sv, strbeg,
2233 s + UTF8SKIP(s), strend, flags, NULL);
2242 } /* end search for check string in unicode */
2244 if (s == startpos) {
2245 goto after_try_latin;
2248 if (regtry(reginfo, &s)) {
2255 if (prog->extflags & RXf_USE_INTUIT) {
2256 s = re_intuit_start(rx, sv, strbeg,
2257 s + 1, strend, flags, NULL);
2266 } /* end search for check string in latin*/
2267 } /* end search for check string */
2268 else { /* search for newline */
2270 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2273 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2274 while (s <= end) { /* note it could be possible to match at the end of the string */
2275 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2276 if (regtry(reginfo, &s))
2280 } /* end search for newline */
2281 } /* end anchored/multiline check string search */
2283 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2285 /* the warning about reginfo->ganch being used without initialization
2286 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2287 and we only enter this block when the same bit is set. */
2288 char *tmp_s = reginfo->ganch - prog->gofs;
2290 if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2295 /* Messy cases: unanchored match. */
2296 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2297 /* we have /x+whatever/ */
2298 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2304 if (! prog->anchored_utf8) {
2305 to_utf8_substr(prog);
2307 ch = SvPVX_const(prog->anchored_utf8)[0];
2310 DEBUG_EXECUTE_r( did_match = 1 );
2311 if (regtry(reginfo, &s)) goto got_it;
2313 while (s < strend && *s == ch)
2320 if (! prog->anchored_substr) {
2321 if (! to_byte_substr(prog)) {
2322 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2325 ch = SvPVX_const(prog->anchored_substr)[0];
2328 DEBUG_EXECUTE_r( did_match = 1 );
2329 if (regtry(reginfo, &s)) goto got_it;
2331 while (s < strend && *s == ch)
2336 DEBUG_EXECUTE_r(if (!did_match)
2337 PerlIO_printf(Perl_debug_log,
2338 "Did not find anchored character...\n")
2341 else if (prog->anchored_substr != NULL
2342 || prog->anchored_utf8 != NULL
2343 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2344 && prog->float_max_offset < strend - s)) {
2349 char *last1; /* Last position checked before */
2353 if (prog->anchored_substr || prog->anchored_utf8) {
2355 if (! prog->anchored_utf8) {
2356 to_utf8_substr(prog);
2358 must = prog->anchored_utf8;
2361 if (! prog->anchored_substr) {
2362 if (! to_byte_substr(prog)) {
2363 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2366 must = prog->anchored_substr;
2368 back_max = back_min = prog->anchored_offset;
2371 if (! prog->float_utf8) {
2372 to_utf8_substr(prog);
2374 must = prog->float_utf8;
2377 if (! prog->float_substr) {
2378 if (! to_byte_substr(prog)) {
2379 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2382 must = prog->float_substr;
2384 back_max = prog->float_max_offset;
2385 back_min = prog->float_min_offset;
2391 last = HOP3c(strend, /* Cannot start after this */
2392 -(I32)(CHR_SVLEN(must)
2393 - (SvTAIL(must) != 0) + back_min), strbeg);
2396 last1 = HOPc(s, -1);
2398 last1 = s - 1; /* bogus */
2400 /* XXXX check_substr already used to find "s", can optimize if
2401 check_substr==must. */
2403 dontbother = end_shift;
2404 strend = HOPc(strend, -dontbother);
2405 while ( (s <= last) &&
2406 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2407 (unsigned char*)strend, must,
2408 multiline ? FBMrf_MULTILINE : 0)) ) {
2409 DEBUG_EXECUTE_r( did_match = 1 );
2410 if (HOPc(s, -back_max) > last1) {
2411 last1 = HOPc(s, -back_min);
2412 s = HOPc(s, -back_max);
2415 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2417 last1 = HOPc(s, -back_min);
2421 while (s <= last1) {
2422 if (regtry(reginfo, &s))
2425 s++; /* to break out of outer loop */
2432 while (s <= last1) {
2433 if (regtry(reginfo, &s))
2439 DEBUG_EXECUTE_r(if (!did_match) {
2440 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2441 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2442 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2443 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2444 ? "anchored" : "floating"),
2445 quoted, RE_SV_TAIL(must));
2449 else if ( (c = progi->regstclass) ) {
2451 const OPCODE op = OP(progi->regstclass);
2452 /* don't bother with what can't match */
2453 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2454 strend = HOPc(strend, -(minlen - 1));
2457 SV * const prop = sv_newmortal();
2458 regprop(prog, prop, c);
2460 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2462 PerlIO_printf(Perl_debug_log,
2463 "Matching stclass %.*s against %s (%d bytes)\n",
2464 (int)SvCUR(prop), SvPVX_const(prop),
2465 quoted, (int)(strend - s));
2468 if (find_byclass(prog, c, s, strend, reginfo))
2470 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2474 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2482 if (! prog->float_utf8) {
2483 to_utf8_substr(prog);
2485 float_real = prog->float_utf8;
2488 if (! prog->float_substr) {
2489 if (! to_byte_substr(prog)) {
2490 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2493 float_real = prog->float_substr;
2496 little = SvPV_const(float_real, len);
2497 if (SvTAIL(float_real)) {
2498 /* This means that float_real contains an artificial \n on
2499 * the end due to the presence of something like this:
2500 * /foo$/ where we can match both "foo" and "foo\n" at the
2501 * end of the string. So we have to compare the end of the
2502 * string first against the float_real without the \n and
2503 * then against the full float_real with the string. We
2504 * have to watch out for cases where the string might be
2505 * smaller than the float_real or the float_real without
2507 char *checkpos= strend - len;
2509 PerlIO_printf(Perl_debug_log,
2510 "%sChecking for float_real.%s\n",
2511 PL_colors[4], PL_colors[5]));
2512 if (checkpos + 1 < strbeg) {
2513 /* can't match, even if we remove the trailing \n
2514 * string is too short to match */
2516 PerlIO_printf(Perl_debug_log,
2517 "%sString shorter than required trailing substring, cannot match.%s\n",
2518 PL_colors[4], PL_colors[5]));
2520 } else if (memEQ(checkpos + 1, little, len - 1)) {
2521 /* can match, the end of the string matches without the
2523 last = checkpos + 1;
2524 } else if (checkpos < strbeg) {
2525 /* cant match, string is too short when the "\n" is
2528 PerlIO_printf(Perl_debug_log,
2529 "%sString does not contain required trailing substring, cannot match.%s\n",
2530 PL_colors[4], PL_colors[5]));
2532 } else if (!multiline) {
2533 /* non multiline match, so compare with the "\n" at the
2534 * end of the string */
2535 if (memEQ(checkpos, little, len)) {
2539 PerlIO_printf(Perl_debug_log,
2540 "%sString does not contain required trailing substring, cannot match.%s\n",
2541 PL_colors[4], PL_colors[5]));
2545 /* multiline match, so we have to search for a place
2546 * where the full string is located */
2552 last = rninstr(s, strend, little, little + len);
2554 last = strend; /* matching "$" */
2557 /* at one point this block contained a comment which was
2558 * probably incorrect, which said that this was a "should not
2559 * happen" case. Even if it was true when it was written I am
2560 * pretty sure it is not anymore, so I have removed the comment
2561 * and replaced it with this one. Yves */
2563 PerlIO_printf(Perl_debug_log,
2564 "String does not contain required substring, cannot match.\n"
2568 dontbother = strend - last + prog->float_min_offset;
2570 if (minlen && (dontbother < minlen))
2571 dontbother = minlen - 1;
2572 strend -= dontbother; /* this one's always in bytes! */
2573 /* We don't know much -- general case. */
2576 if (regtry(reginfo, &s))
2585 if (regtry(reginfo, &s))
2587 } while (s++ < strend);
2597 PerlIO_printf(Perl_debug_log,
2598 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2605 if (PL_reg_state.re_state_eval_setup_done)
2606 restore_pos(aTHX_ prog);
2607 if (RXp_PAREN_NAMES(prog))
2608 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2610 /* make sure $`, $&, $', and $digit will work later */
2611 if ( !(flags & REXEC_NOT_FIRST) ) {
2612 if (flags & REXEC_COPY_STR) {
2616 PerlIO_printf(Perl_debug_log,
2617 "Copy on write: regexp capture, type %d\n",
2620 RX_MATCH_COPY_FREE(rx);
2621 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2622 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2623 assert (SvPOKp(prog->saved_copy));
2624 prog->sublen = reginfo->strend - strbeg;
2625 prog->suboffset = 0;
2626 prog->subcoffset = 0;
2631 I32 max = reginfo->strend - strbeg;
2634 if ( (flags & REXEC_COPY_SKIP_POST)
2635 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2636 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2637 ) { /* don't copy $' part of string */
2640 /* calculate the right-most part of the string covered
2641 * by a capture. Due to look-ahead, this may be to
2642 * the right of $&, so we have to scan all captures */
2643 while (n <= prog->lastparen) {
2644 if (prog->offs[n].end > max)
2645 max = prog->offs[n].end;
2649 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2650 ? prog->offs[0].start
2652 assert(max >= 0 && max <= reginfo->strend - strbeg);
2655 if ( (flags & REXEC_COPY_SKIP_PRE)
2656 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2657 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2658 ) { /* don't copy $` part of string */
2661 /* calculate the left-most part of the string covered
2662 * by a capture. Due to look-behind, this may be to
2663 * the left of $&, so we have to scan all captures */
2664 while (min && n <= prog->lastparen) {
2665 if ( prog->offs[n].start != -1
2666 && prog->offs[n].start < min)
2668 min = prog->offs[n].start;
2672 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2673 && min > prog->offs[0].end
2675 min = prog->offs[0].end;
2679 assert(min >= 0 && min <= max
2680 && min <= reginfo->strend - strbeg);
2683 if (RX_MATCH_COPIED(rx)) {
2684 if (sublen > prog->sublen)
2686 (char*)saferealloc(prog->subbeg, sublen+1);
2689 prog->subbeg = (char*)safemalloc(sublen+1);
2690 Copy(strbeg + min, prog->subbeg, sublen, char);
2691 prog->subbeg[sublen] = '\0';
2692 prog->suboffset = min;
2693 prog->sublen = sublen;
2694 RX_MATCH_COPIED_on(rx);
2696 prog->subcoffset = prog->suboffset;
2697 if (prog->suboffset && utf8_target) {
2698 /* Convert byte offset to chars.
2699 * XXX ideally should only compute this if @-/@+
2700 * has been seen, a la PL_sawampersand ??? */
2702 /* If there's a direct correspondence between the
2703 * string which we're matching and the original SV,
2704 * then we can use the utf8 len cache associated with
2705 * the SV. In particular, it means that under //g,
2706 * sv_pos_b2u() will use the previously cached
2707 * position to speed up working out the new length of
2708 * subcoffset, rather than counting from the start of
2709 * the string each time. This stops
2710 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2711 * from going quadratic */
2712 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2713 sv_pos_b2u(sv, &(prog->subcoffset));
2715 prog->subcoffset = utf8_length((U8*)strbeg,
2716 (U8*)(strbeg+prog->suboffset));
2720 RX_MATCH_COPY_FREE(rx);
2721 prog->subbeg = strbeg;
2722 prog->suboffset = 0;
2723 prog->subcoffset = 0;
2724 /* use reginfo->strend, as strend may have been modified */
2725 prog->sublen = reginfo->strend - strbeg;
2732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2733 PL_colors[4], PL_colors[5]));
2734 if (PL_reg_state.re_state_eval_setup_done)
2735 restore_pos(aTHX_ prog);
2737 /* we failed :-( roll it back */
2738 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2739 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2744 Safefree(prog->offs);
2751 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2752 * Do inc before dec, in case old and new rex are the same */
2753 #define SET_reg_curpm(Re2) \
2754 if (PL_reg_state.re_state_eval_setup_done) { \
2755 (void)ReREFCNT_inc(Re2); \
2756 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2757 PM_SETRE((PL_reg_curpm), (Re2)); \
2762 - regtry - try match at specific point
2764 STATIC I32 /* 0 failure, 1 success */
2765 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2769 REGEXP *const rx = reginfo->prog;
2770 regexp *const prog = ReANY(rx);
2772 RXi_GET_DECL(prog,progi);
2773 GET_RE_DEBUG_FLAGS_DECL;
2775 PERL_ARGS_ASSERT_REGTRY;
2777 reginfo->cutpoint=NULL;
2779 if ((prog->extflags & RXf_EVAL_SEEN)
2780 && !PL_reg_state.re_state_eval_setup_done)
2784 PL_reg_state.re_state_eval_setup_done = TRUE;
2786 /* Make $_ available to executed code. */
2787 if (reginfo->sv != DEFSV) {
2789 DEFSV_set(reginfo->sv);
2792 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2793 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2794 /* prepare for quick setting of pos */
2795 #ifdef PERL_OLD_COPY_ON_WRITE
2796 if (SvIsCOW(reginfo->sv))
2797 sv_force_normal_flags(reginfo->sv, 0);
2799 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2800 &PL_vtbl_mglob, NULL, 0);
2804 PL_reg_oldpos = mg->mg_len;
2805 SAVEDESTRUCTOR_X(restore_pos, prog);
2807 if (!PL_reg_curpm) {
2808 Newxz(PL_reg_curpm, 1, PMOP);
2811 SV* const repointer = &PL_sv_undef;
2812 /* this regexp is also owned by the new PL_reg_curpm, which
2813 will try to free it. */
2814 av_push(PL_regex_padav, repointer);
2815 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2816 PL_regex_pad = AvARRAY(PL_regex_padav);
2821 PL_reg_oldcurpm = PL_curpm;
2822 PL_curpm = PL_reg_curpm;
2823 if (RXp_MATCH_COPIED(prog)) {
2824 /* Here is a serious problem: we cannot rewrite subbeg,
2825 since it may be needed if this match fails. Thus
2826 $` inside (?{}) could fail... */
2827 PL_reg_oldsaved = prog->subbeg;
2828 PL_reg_oldsavedlen = prog->sublen;
2829 PL_reg_oldsavedoffset = prog->suboffset;
2830 PL_reg_oldsavedcoffset = prog->suboffset;
2832 PL_nrs = prog->saved_copy;
2834 RXp_MATCH_COPIED_off(prog);
2837 PL_reg_oldsaved = NULL;
2838 prog->subbeg = PL_bostr;
2839 prog->suboffset = 0;
2840 prog->subcoffset = 0;
2841 /* use reginfo->strend, as strend may have been modified */
2842 prog->sublen = reginfo->strend - PL_bostr;
2845 PL_reg_starttry = *startposp;
2847 prog->offs[0].start = *startposp - PL_bostr;
2848 prog->lastparen = 0;
2849 prog->lastcloseparen = 0;
2851 /* XXXX What this code is doing here?!!! There should be no need
2852 to do this again and again, prog->lastparen should take care of
2855 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2856 * Actually, the code in regcppop() (which Ilya may be meaning by
2857 * prog->lastparen), is not needed at all by the test suite
2858 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2859 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2860 * Meanwhile, this code *is* needed for the
2861 * above-mentioned test suite tests to succeed. The common theme
2862 * on those tests seems to be returning null fields from matches.
2863 * --jhi updated by dapm */
2865 if (prog->nparens) {
2866 regexp_paren_pair *pp = prog->offs;
2868 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2876 result = regmatch(reginfo, *startposp, progi->program + 1);
2878 prog->offs[0].end = result;
2881 if (reginfo->cutpoint)
2882 *startposp= reginfo->cutpoint;
2883 REGCP_UNWIND(lastcp);
2888 #define sayYES goto yes
2889 #define sayNO goto no
2890 #define sayNO_SILENT goto no_silent
2892 /* we dont use STMT_START/END here because it leads to
2893 "unreachable code" warnings, which are bogus, but distracting. */
2894 #define CACHEsayNO \
2895 if (ST.cache_mask) \
2896 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2899 /* this is used to determine how far from the left messages like
2900 'failed...' are printed. It should be set such that messages
2901 are inline with the regop output that created them.
2903 #define REPORT_CODE_OFF 32
2906 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2907 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2908 #define CHRTEST_NOT_A_CP_1 -999
2909 #define CHRTEST_NOT_A_CP_2 -998
2911 #define SLAB_FIRST(s) (&(s)->states[0])
2912 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2914 /* grab a new slab and return the first slot in it */
2916 STATIC regmatch_state *
2919 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2922 regmatch_slab *s = PL_regmatch_slab->next;
2924 Newx(s, 1, regmatch_slab);
2925 s->prev = PL_regmatch_slab;
2927 PL_regmatch_slab->next = s;
2929 PL_regmatch_slab = s;
2930 return SLAB_FIRST(s);
2934 /* push a new state then goto it */
2936 #define PUSH_STATE_GOTO(state, node, input) \
2937 pushinput = input; \
2939 st->resume_state = state; \
2942 /* push a new state with success backtracking, then goto it */
2944 #define PUSH_YES_STATE_GOTO(state, node, input) \
2945 pushinput = input; \
2947 st->resume_state = state; \
2948 goto push_yes_state;
2955 regmatch() - main matching routine
2957 This is basically one big switch statement in a loop. We execute an op,
2958 set 'next' to point the next op, and continue. If we come to a point which
2959 we may need to backtrack to on failure such as (A|B|C), we push a
2960 backtrack state onto the backtrack stack. On failure, we pop the top
2961 state, and re-enter the loop at the state indicated. If there are no more
2962 states to pop, we return failure.
2964 Sometimes we also need to backtrack on success; for example /A+/, where
2965 after successfully matching one A, we need to go back and try to
2966 match another one; similarly for lookahead assertions: if the assertion
2967 completes successfully, we backtrack to the state just before the assertion
2968 and then carry on. In these cases, the pushed state is marked as
2969 'backtrack on success too'. This marking is in fact done by a chain of
2970 pointers, each pointing to the previous 'yes' state. On success, we pop to
2971 the nearest yes state, discarding any intermediate failure-only states.
2972 Sometimes a yes state is pushed just to force some cleanup code to be
2973 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2974 it to free the inner regex.
2976 Note that failure backtracking rewinds the cursor position, while
2977 success backtracking leaves it alone.
2979 A pattern is complete when the END op is executed, while a subpattern
2980 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2981 ops trigger the "pop to last yes state if any, otherwise return true"
2984 A common convention in this function is to use A and B to refer to the two
2985 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2986 the subpattern to be matched possibly multiple times, while B is the entire
2987 rest of the pattern. Variable and state names reflect this convention.
2989 The states in the main switch are the union of ops and failure/success of
2990 substates associated with with that op. For example, IFMATCH is the op
2991 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2992 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2993 successfully matched A and IFMATCH_A_fail is a state saying that we have
2994 just failed to match A. Resume states always come in pairs. The backtrack
2995 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2996 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2997 on success or failure.
2999 The struct that holds a backtracking state is actually a big union, with
3000 one variant for each major type of op. The variable st points to the
3001 top-most backtrack struct. To make the code clearer, within each
3002 block of code we #define ST to alias the relevant union.
3004 Here's a concrete example of a (vastly oversimplified) IFMATCH
3010 #define ST st->u.ifmatch
3012 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3013 ST.foo = ...; // some state we wish to save
3015 // push a yes backtrack state with a resume value of
3016 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3018 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3021 case IFMATCH_A: // we have successfully executed A; now continue with B
3023 bar = ST.foo; // do something with the preserved value
3026 case IFMATCH_A_fail: // A failed, so the assertion failed
3027 ...; // do some housekeeping, then ...
3028 sayNO; // propagate the failure
3035 For any old-timers reading this who are familiar with the old recursive
3036 approach, the code above is equivalent to:
3038 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3047 ...; // do some housekeeping, then ...
3048 sayNO; // propagate the failure
3051 The topmost backtrack state, pointed to by st, is usually free. If you
3052 want to claim it, populate any ST.foo fields in it with values you wish to
3053 save, then do one of
3055 PUSH_STATE_GOTO(resume_state, node, newinput);
3056 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3058 which sets that backtrack state's resume value to 'resume_state', pushes a
3059 new free entry to the top of the backtrack stack, then goes to 'node'.
3060 On backtracking, the free slot is popped, and the saved state becomes the
3061 new free state. An ST.foo field in this new top state can be temporarily
3062 accessed to retrieve values, but once the main loop is re-entered, it
3063 becomes available for reuse.
3065 Note that the depth of the backtrack stack constantly increases during the
3066 left-to-right execution of the pattern, rather than going up and down with
3067 the pattern nesting. For example the stack is at its maximum at Z at the
3068 end of the pattern, rather than at X in the following:
3070 /(((X)+)+)+....(Y)+....Z/
3072 The only exceptions to this are lookahead/behind assertions and the cut,
3073 (?>A), which pop all the backtrack states associated with A before
3076 Backtrack state structs are allocated in slabs of about 4K in size.
3077 PL_regmatch_state and st always point to the currently active state,
3078 and PL_regmatch_slab points to the slab currently containing
3079 PL_regmatch_state. The first time regmatch() is called, the first slab is
3080 allocated, and is never freed until interpreter destruction. When the slab
3081 is full, a new one is allocated and chained to the end. At exit from
3082 regmatch(), slabs allocated since entry are freed.
3087 #define DEBUG_STATE_pp(pp) \
3089 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3090 PerlIO_printf(Perl_debug_log, \
3091 " %*s"pp" %s%s%s%s%s\n", \
3093 PL_reg_name[st->resume_state], \
3094 ((st==yes_state||st==mark_state) ? "[" : ""), \
3095 ((st==yes_state) ? "Y" : ""), \
3096 ((st==mark_state) ? "M" : ""), \
3097 ((st==yes_state||st==mark_state) ? "]" : "") \
3102 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3107 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3108 const char *start, const char *end, const char *blurb)
3110 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3112 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3117 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3118 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3120 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3121 start, end - start, 60);
3123 PerlIO_printf(Perl_debug_log,
3124 "%s%s REx%s %s against %s\n",
3125 PL_colors[4], blurb, PL_colors[5], s0, s1);
3127 if (utf8_target||utf8_pat)
3128 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3129 utf8_pat ? "pattern" : "",
3130 utf8_pat && utf8_target ? " and " : "",
3131 utf8_target ? "string" : ""
3137 S_dump_exec_pos(pTHX_ const char *locinput,
3138 const regnode *scan,
3139 const char *loc_regeol,
3140 const char *loc_bostr,
3141 const char *loc_reg_starttry,
3142 const bool utf8_target)
3144 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3145 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3146 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3147 /* The part of the string before starttry has one color
3148 (pref0_len chars), between starttry and current
3149 position another one (pref_len - pref0_len chars),
3150 after the current position the third one.
3151 We assume that pref0_len <= pref_len, otherwise we
3152 decrease pref0_len. */
3153 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3154 ? (5 + taill) - l : locinput - loc_bostr;
3157 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3159 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3161 pref0_len = pref_len - (locinput - loc_reg_starttry);
3162 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3163 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3164 ? (5 + taill) - pref_len : loc_regeol - locinput);
3165 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3169 if (pref0_len > pref_len)
3170 pref0_len = pref_len;
3172 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3174 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3175 (locinput - pref_len),pref0_len, 60, 4, 5);
3177 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3178 (locinput - pref_len + pref0_len),
3179 pref_len - pref0_len, 60, 2, 3);
3181 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3182 locinput, loc_regeol - locinput, 10, 0, 1);
3184 const STRLEN tlen=len0+len1+len2;
3185 PerlIO_printf(Perl_debug_log,
3186 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3187 (IV)(locinput - loc_bostr),
3190 (docolor ? "" : "> <"),
3192 (int)(tlen > 19 ? 0 : 19 - tlen),
3199 /* reg_check_named_buff_matched()
3200 * Checks to see if a named buffer has matched. The data array of
3201 * buffer numbers corresponding to the buffer is expected to reside
3202 * in the regexp->data->data array in the slot stored in the ARG() of
3203 * node involved. Note that this routine doesn't actually care about the
3204 * name, that information is not preserved from compilation to execution.
3205 * Returns the index of the leftmost defined buffer with the given name
3206 * or 0 if non of the buffers matched.
3209 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3212 RXi_GET_DECL(rex,rexi);
3213 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3214 I32 *nums=(I32*)SvPVX(sv_dat);
3216 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3218 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3219 if ((I32)rex->lastparen >= nums[n] &&
3220 rex->offs[nums[n]].end != -1)
3229 /* free all slabs above current one - called during LEAVE_SCOPE */
3232 S_clear_backtrack_stack(pTHX_ void *p)
3234 regmatch_slab *s = PL_regmatch_slab->next;
3239 PL_regmatch_slab->next = NULL;
3241 regmatch_slab * const osl = s;
3247 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3248 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
3250 /* This function determines if there are one or two characters that match
3251 * the first character of the passed-in EXACTish node <text_node>, and if
3252 * so, returns them in the passed-in pointers.
3254 * If it determines that no possible character in the target string can
3255 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3256 * the first character in <text_node> requires UTF-8 to represent, and the
3257 * target string isn't in UTF-8.)
3259 * If there are more than two characters that could match the beginning of
3260 * <text_node>, or if more context is required to determine a match or not,
3261 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3263 * The motiviation behind this function is to allow the caller to set up
3264 * tight loops for matching. If <text_node> is of type EXACT, there is
3265 * only one possible character that can match its first character, and so
3266 * the situation is quite simple. But things get much more complicated if
3267 * folding is involved. It may be that the first character of an EXACTFish
3268 * node doesn't participate in any possible fold, e.g., punctuation, so it
3269 * can be matched only by itself. The vast majority of characters that are
3270 * in folds match just two things, their lower and upper-case equivalents.
3271 * But not all are like that; some have multiple possible matches, or match
3272 * sequences of more than one character. This function sorts all that out.
3274 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3275 * loop of trying to match A*, we know we can't exit where the thing
3276 * following it isn't a B. And something can't be a B unless it is the
3277 * beginning of B. By putting a quick test for that beginning in a tight
3278 * loop, we can rule out things that can't possibly be B without having to
3279 * break out of the loop, thus avoiding work. Similarly, if A is a single
3280 * character, we can make a tight loop matching A*, using the outputs of
3283 * If the target string to match isn't in UTF-8, and there aren't
3284 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3285 * the one or two possible octets (which are characters in this situation)
3286 * that can match. In all cases, if there is only one character that can
3287 * match, *<c1p> and *<c2p> will be identical.
3289 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3290 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3291 * can match the beginning of <text_node>. They should be declared with at
3292 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3293 * undefined what these contain.) If one or both of the buffers are
3294 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3295 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3296 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3297 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3298 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3300 const bool utf8_target = PL_reg_match_utf8;
3302 UV c1 = CHRTEST_NOT_A_CP_1;
3303 UV c2 = CHRTEST_NOT_A_CP_2;
3304 bool use_chrtest_void = FALSE;
3306 /* Used when we have both utf8 input and utf8 output, to avoid converting
3307 * to/from code points */
3308 bool utf8_has_been_setup = FALSE;
3312 U8 *pat = (U8*)STRING(text_node);
3314 if (OP(text_node) == EXACT) {
3316 /* In an exact node, only one thing can be matched, that first
3317 * character. If both the pat and the target are UTF-8, we can just
3318 * copy the input to the output, avoiding finding the code point of
3323 else if (utf8_target) {
3324 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3325 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3326 utf8_has_been_setup = TRUE;
3329 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3332 else /* an EXACTFish node */
3334 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3335 pat + STR_LEN(text_node)))
3337 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3338 pat + STR_LEN(text_node))))
3340 /* Multi-character folds require more context to sort out. Also
3341 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3342 * handled outside this routine */
3343 use_chrtest_void = TRUE;
3345 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3346 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3348 /* Load the folds hash, if not already done */
3350 if (! PL_utf8_foldclosures) {
3351 if (! PL_utf8_tofold) {
3352 U8 dummy[UTF8_MAXBYTES+1];
3354 /* Force loading this by folding an above-Latin1 char */
3355 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3356 assert(PL_utf8_tofold); /* Verify that worked */
3358 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3361 /* The fold closures data structure is a hash with the keys being
3362 * the UTF-8 of every character that is folded to, like 'k', and
3363 * the values each an array of all code points that fold to its
3364 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3366 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3371 /* Not found in the hash, therefore there are no folds
3372 * containing it, so there is only a single character that
3376 else { /* Does participate in folds */
3377 AV* list = (AV*) *listp;
3378 if (av_len(list) != 1) {
3380 /* If there aren't exactly two folds to this, it is outside
3381 * the scope of this function */
3382 use_chrtest_void = TRUE;
3384 else { /* There are two. Get them */
3385 SV** c_p = av_fetch(list, 0, FALSE);
3387 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3391 c_p = av_fetch(list, 1, FALSE);
3393 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3397 /* Folds that cross the 255/256 boundary are forbidden if
3398 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3399 * pattern character is above 256, and its only other match
3400 * is below 256, the only legal match will be to itself.
3401 * We have thrown away the original, so have to compute
3402 * which is the one above 255 */
3403 if ((c1 < 256) != (c2 < 256)) {
3404 if (OP(text_node) == EXACTFL
3405 || (OP(text_node) == EXACTFA
3406 && (isASCII(c1) || isASCII(c2))))
3419 else /* Here, c1 is < 255 */
3421 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3422 && OP(text_node) != EXACTFL
3423 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3425 /* Here, there could be something above Latin1 in the target which
3426 * folds to this character in the pattern. All such cases except
3427 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3428 * involved in their folds, so are outside the scope of this
3430 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3431 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3434 use_chrtest_void = TRUE;
3437 else { /* Here nothing above Latin1 can fold to the pattern character */
3438 switch (OP(text_node)) {
3440 case EXACTFL: /* /l rules */
3441 c2 = PL_fold_locale[c1];
3445 if (! utf8_target) { /* /d rules */
3450 /* /u rules for all these. This happens to work for
3451 * EXACTFA as nothing in Latin1 folds to ASCII */
3453 case EXACTFU_TRICKYFOLD:
3456 c2 = PL_fold_latin1[c1];
3460 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3461 assert(0); /* NOTREACHED */
3466 /* Here have figured things out. Set up the returns */
3467 if (use_chrtest_void) {
3468 *c2p = *c1p = CHRTEST_VOID;
3470 else if (utf8_target) {
3471 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3472 uvchr_to_utf8(c1_utf8, c1);
3473 uvchr_to_utf8(c2_utf8, c2);
3476 /* Invariants are stored in both the utf8 and byte outputs; Use
3477 * negative numbers otherwise for the byte ones. Make sure that the
3478 * byte ones are the same iff the utf8 ones are the same */
3479 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3480 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3483 ? CHRTEST_NOT_A_CP_1
3484 : CHRTEST_NOT_A_CP_2;
3486 else if (c1 > 255) {
3487 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3492 *c1p = *c2p = c2; /* c2 is the only representable value */
3494 else { /* c1 is representable; see about c2 */
3496 *c2p = (c2 < 256) ? c2 : c1;
3502 /* returns -1 on failure, $+[0] on success */
3504 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3506 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3510 const bool utf8_target = PL_reg_match_utf8;
3511 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3512 REGEXP *rex_sv = reginfo->prog;
3513 regexp *rex = ReANY(rex_sv);
3514 RXi_GET_DECL(rex,rexi);
3516 /* the current state. This is a cached copy of PL_regmatch_state */
3518 /* cache heavy used fields of st in registers */
3521 U32 n = 0; /* general value; init to avoid compiler warning */
3522 I32 ln = 0; /* len or last; init to avoid compiler warning */
3523 char *locinput = startpos;
3524 char *pushinput; /* where to continue after a PUSH */
3525 I32 nextchr; /* is always set to UCHARAT(locinput) */
3527 bool result = 0; /* return value of S_regmatch */
3528 int depth = 0; /* depth of backtrack stack */
3529 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3530 const U32 max_nochange_depth =
3531 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3532 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3533 regmatch_state *yes_state = NULL; /* state to pop to on success of
3535 /* mark_state piggy backs on the yes_state logic so that when we unwind
3536 the stack on success we can update the mark_state as we go */
3537 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3538 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3539 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3541 bool no_final = 0; /* prevent failure from backtracking? */
3542 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3543 char *startpoint = locinput;
3544 SV *popmark = NULL; /* are we looking for a mark? */
3545 SV *sv_commit = NULL; /* last mark name seen in failure */
3546 SV *sv_yes_mark = NULL; /* last mark name we have seen
3547 during a successful match */
3548 U32 lastopen = 0; /* last open we saw */
3549 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3550 SV* const oreplsv = GvSV(PL_replgv);
3551 /* these three flags are set by various ops to signal information to
3552 * the very next op. They have a useful lifetime of exactly one loop
3553 * iteration, and are not preserved or restored by state pushes/pops
3555 bool sw = 0; /* the condition value in (?(cond)a|b) */
3556 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3557 int logical = 0; /* the following EVAL is:
3561 or the following IFMATCH/UNLESSM is:
3562 false: plain (?=foo)
3563 true: used as a condition: (?(?=foo))
3565 PAD* last_pad = NULL;
3567 I32 gimme = G_SCALAR;
3568 CV *caller_cv = NULL; /* who called us */
3569 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3570 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3571 U32 maxopenparen = 0; /* max '(' index seen so far */
3572 int to_complement; /* Invert the result? */
3573 _char_class_number classnum;
3574 bool is_utf8_pat = reginfo->is_utf8_pat;
3577 GET_RE_DEBUG_FLAGS_DECL;
3580 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3581 multicall_oldcatch = 0;
3582 multicall_cv = NULL;
3584 PERL_UNUSED_VAR(multicall_cop);
3585 PERL_UNUSED_VAR(newsp);
3588 PERL_ARGS_ASSERT_REGMATCH;
3590 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3591 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3593 /* on first ever call to regmatch, allocate first slab */
3594 if (!PL_regmatch_slab) {
3595 Newx(PL_regmatch_slab, 1, regmatch_slab);
3596 PL_regmatch_slab->prev = NULL;
3597 PL_regmatch_slab->next = NULL;
3598 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3601 oldsave = PL_savestack_ix;
3602 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3603 SAVEVPTR(PL_regmatch_slab);
3604 SAVEVPTR(PL_regmatch_state);
3606 /* grab next free state slot */
3607 st = ++PL_regmatch_state;
3608 if (st > SLAB_LAST(PL_regmatch_slab))
3609 st = PL_regmatch_state = S_push_slab(aTHX);
3611 /* Note that nextchr is a byte even in UTF */
3614 while (scan != NULL) {
3617 SV * const prop = sv_newmortal();
3618 regnode *rnext=regnext(scan);
3619 DUMP_EXEC_POS( locinput, scan, utf8_target );
3620 regprop(rex, prop, scan);
3622 PerlIO_printf(Perl_debug_log,
3623 "%3"IVdf":%*s%s(%"IVdf")\n",
3624 (IV)(scan - rexi->program), depth*2, "",
3626 (PL_regkind[OP(scan)] == END || !rnext) ?
3627 0 : (IV)(rnext - rexi->program));
3630 next = scan + NEXT_OFF(scan);
3633 state_num = OP(scan);
3639 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3641 switch (state_num) {
3642 case BOL: /* /^../ */
3643 if (locinput == PL_bostr)
3645 /* reginfo->till = reginfo->bol; */
3650 case MBOL: /* /^../m */
3651 if (locinput == PL_bostr ||
3652 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3658 case SBOL: /* /^../s */
3659 if (locinput == PL_bostr)
3664 if (locinput == reginfo->ganch)
3668 case KEEPS: /* \K */
3669 /* update the startpoint */
3670 st->u.keeper.val = rex->offs[0].start;
3671 rex->offs[0].start = locinput - PL_bostr;
3672 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3673 assert(0); /*NOTREACHED*/
3674 case KEEPS_next_fail:
3675 /* rollback the start point change */
3676 rex->offs[0].start = st->u.keeper.val;
3678 assert(0); /*NOTREACHED*/
3680 case EOL: /* /..$/ */
3683 case MEOL: /* /..$/m */
3684 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3688 case SEOL: /* /..$/s */
3690 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3692 if (reginfo->strend - locinput > 1)
3697 if (!NEXTCHR_IS_EOS)
3701 case SANY: /* /./s */
3704 goto increment_locinput;
3712 case REG_ANY: /* /./ */
3713 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3715 goto increment_locinput;
3719 #define ST st->u.trie
3720 case TRIEC: /* (ab|cd) with known charclass */
3721 /* In this case the charclass data is available inline so
3722 we can fail fast without a lot of extra overhead.
3724 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3726 PerlIO_printf(Perl_debug_log,
3727 "%*s %sfailed to match trie start class...%s\n",
3728 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3731 assert(0); /* NOTREACHED */
3734 case TRIE: /* (ab|cd) */
3735 /* the basic plan of execution of the trie is:
3736 * At the beginning, run though all the states, and
3737 * find the longest-matching word. Also remember the position
3738 * of the shortest matching word. For example, this pattern:
3741 * when matched against the string "abcde", will generate
3742 * accept states for all words except 3, with the longest
3743 * matching word being 4, and the shortest being 2 (with
3744 * the position being after char 1 of the string).
3746 * Then for each matching word, in word order (i.e. 1,2,4,5),
3747 * we run the remainder of the pattern; on each try setting
3748 * the current position to the character following the word,
3749 * returning to try the next word on failure.
3751 * We avoid having to build a list of words at runtime by
3752 * using a compile-time structure, wordinfo[].prev, which
3753 * gives, for each word, the previous accepting word (if any).
3754 * In the case above it would contain the mappings 1->2, 2->0,
3755 * 3->0, 4->5, 5->1. We can use this table to generate, from
3756 * the longest word (4 above), a list of all words, by
3757 * following the list of prev pointers; this gives us the
3758 * unordered list 4,5,1,2. Then given the current word we have
3759 * just tried, we can go through the list and find the
3760 * next-biggest word to try (so if we just failed on word 2,
3761 * the next in the list is 4).
3763 * Since at runtime we don't record the matching position in
3764 * the string for each word, we have to work that out for
3765 * each word we're about to process. The wordinfo table holds
3766 * the character length of each word; given that we recorded
3767 * at the start: the position of the shortest word and its
3768 * length in chars, we just need to move the pointer the
3769 * difference between the two char lengths. Depending on
3770 * Unicode status and folding, that's cheap or expensive.
3772 * This algorithm is optimised for the case where are only a
3773 * small number of accept states, i.e. 0,1, or maybe 2.
3774 * With lots of accepts states, and having to try all of them,
3775 * it becomes quadratic on number of accept states to find all
3780 /* what type of TRIE am I? (utf8 makes this contextual) */
3781 DECL_TRIE_TYPE(scan);
3783 /* what trie are we using right now */
3784 reg_trie_data * const trie
3785 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3786 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3787 U32 state = trie->startstate;
3790 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3792 if (trie->states[ state ].wordnum) {
3794 PerlIO_printf(Perl_debug_log,
3795 "%*s %smatched empty string...%s\n",
3796 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3802 PerlIO_printf(Perl_debug_log,
3803 "%*s %sfailed to match trie start class...%s\n",
3804 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3811 U8 *uc = ( U8* )locinput;
3815 U8 *uscan = (U8*)NULL;
3816 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3817 U32 charcount = 0; /* how many input chars we have matched */
3818 U32 accepted = 0; /* have we seen any accepting states? */
3820 ST.jump = trie->jump;
3823 ST.longfold = FALSE; /* char longer if folded => it's harder */
3826 /* fully traverse the TRIE; note the position of the
3827 shortest accept state and the wordnum of the longest
3830 while ( state && uc <= (U8*)(reginfo->strend) ) {
3831 U32 base = trie->states[ state ].trans.base;
3835 wordnum = trie->states[ state ].wordnum;
3837 if (wordnum) { /* it's an accept state */
3840 /* record first match position */
3842 ST.firstpos = (U8*)locinput;
3847 ST.firstchars = charcount;
3850 if (!ST.nextword || wordnum < ST.nextword)
3851 ST.nextword = wordnum;
3852 ST.topword = wordnum;
3855 DEBUG_TRIE_EXECUTE_r({
3856 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3857 PerlIO_printf( Perl_debug_log,
3858 "%*s %sState: %4"UVxf" Accepted: %c ",
3859 2+depth * 2, "", PL_colors[4],
3860 (UV)state, (accepted ? 'Y' : 'N'));
3863 /* read a char and goto next state */
3864 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3866 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3867 uscan, len, uvc, charid, foldlen,
3874 base + charid - 1 - trie->uniquecharcount)) >= 0)
3876 && ((U32)offset < trie->lasttrans)
3877 && trie->trans[offset].check == state)
3879 state = trie->trans[offset].next;
3890 DEBUG_TRIE_EXECUTE_r(
3891 PerlIO_printf( Perl_debug_log,
3892 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3893 charid, uvc, (UV)state, PL_colors[5] );
3899 /* calculate total number of accept states */
3904 w = trie->wordinfo[w].prev;
3907 ST.accepted = accepted;
3911 PerlIO_printf( Perl_debug_log,
3912 "%*s %sgot %"IVdf" possible matches%s\n",
3913 REPORT_CODE_OFF + depth * 2, "",
3914 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3916 goto trie_first_try; /* jump into the fail handler */
3918 assert(0); /* NOTREACHED */
3920 case TRIE_next_fail: /* we failed - try next alternative */
<