5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
46 * pregcomp and pregexec -- regsub and regerror are not used in perl
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
65 **** Alterations to Henry's code are...
67 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
68 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
69 **** by Larry Wall and others
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
88 #include "inline_invlist.c"
89 #include "unicode_constants.h"
91 #define RF_tainted 1 /* tainted information used? e.g. locale */
92 #define RF_warned 2 /* warned about big count? */
94 #define RF_utf8 8 /* Pattern contains multibyte chars? */
96 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
102 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
105 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
106 : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
115 #define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
117 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
119 #define HOPBACKc(pos, off) \
120 (char*)(PL_reg_match_utf8\
121 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
122 : (pos - off >= PL_bostr) \
126 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
127 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
130 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
131 #define NEXTCHR_IS_EOS (nextchr < 0)
133 #define SET_nextchr \
134 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
136 #define SET_locinput(p) \
141 /* these are unrolled below in the CCC_TRY_XXX defined */
142 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { \
145 ENTER; save_re_context(); \
146 ok=CAT2(is_utf8_,class)((const U8*)str); \
147 PERL_UNUSED_VAR(ok); \
148 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
149 /* Doesn't do an assert to verify that is correct */
150 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
151 if (!CAT2(PL_utf8_,class)) { \
152 bool throw_away PERL_UNUSED_DECL; \
153 ENTER; save_re_context(); \
154 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
157 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
158 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
159 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
161 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
162 /* No asserts are done for some of these, in case called on a */ \
163 /* Unicode version in which they map to nothing */ \
164 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
165 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
167 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
169 /* The actual code for CCC_TRY, which uses several variables from the routine
170 * it's callable from. It is designed to be the bulk of a case statement.
171 * FUNC is the macro or function to call on non-utf8 targets that indicate if
172 * nextchr matches the class.
173 * UTF8_TEST is the whole test string to use for utf8 targets
174 * LOAD is what to use to test, and if not present to load in the swash for the
176 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
178 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
179 * utf8 and a variant, load the swash if necessary and test using the utf8
180 * test. Advance to the next character if test is ok, otherwise fail; If not
181 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
182 * fails, or advance to the next character */
184 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
185 if (NEXTCHR_IS_EOS) { \
188 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
189 LOAD_UTF8_CHARCLASS(CLASS, STR); \
190 if (POS_OR_NEG (UTF8_TEST)) { \
194 else if (POS_OR_NEG (FUNC(nextchr))) { \
197 goto increment_locinput;
199 /* Handle the non-locale cases for a character class and its complement. It
200 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
201 * This is because that code fails when the test succeeds, so we want to have
202 * the test fail so that the code succeeds. The swash is stored in a
203 * predictable PL_ place */
204 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
207 _CCC_TRY_CODE( !, FUNC, \
208 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
209 (U8*)locinput, TRUE)), \
212 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
213 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
214 (U8*)locinput, TRUE)), \
217 /* Generate the case statements for both locale and non-locale character
218 * classes in regmatch for classes that don't have special unicode semantics.
219 * Locales don't use an immediate swash, but an intermediary special locale
220 * function that is called on the pointer to the current place in the input
221 * string. That function will resolve to needing the same swash. One might
222 * think that because we don't know what the locale will match, we shouldn't
223 * check with the swash loading function that it loaded properly; ie, that we
224 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
225 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 #define CCC_TRY(NAME, NNAME, FUNC, \
228 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
229 NAMEA, NNAMEA, FUNCA, \
232 PL_reg_flags |= RF_tainted; \
233 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 PL_reg_flags |= RF_tainted; \
236 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
239 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
242 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
246 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
249 goto increment_locinput; \
250 /* Generate the non-locale cases */ \
251 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
253 /* This is like CCC_TRY, but has an extra set of parameters for generating case
254 * statements to handle separate Unicode semantics nodes */
255 #define CCC_TRY_U(NAME, NNAME, FUNC, \
256 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
257 NAMEU, NNAMEU, FUNCU, \
258 NAMEA, NNAMEA, FUNCA, \
260 CCC_TRY(NAME, NNAME, FUNC, \
261 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
262 NAMEA, NNAMEA, FUNCA, \
264 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
266 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268 /* for use after a quantifier and before an EXACT-like node -- japhy */
269 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 * NOTE that *nothing* that affects backtracking should be in here, specifically
272 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
273 * node that is in between two EXACT like nodes when ascertaining what the required
274 * "follow" character is. This should probably be moved to regex compile time
275 * although it may be done at run time beause of the REF possibility - more
276 * investigation required. -- demerphq
278 #define JUMPABLE(rn) ( \
280 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
283 OP(rn) == PLUS || OP(rn) == MINMOD || \
285 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
287 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
289 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
292 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
293 we don't need this definition. */
294 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
295 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
296 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
299 /* ... so we use this as its faster. */
300 #define IS_TEXT(rn) ( OP(rn)==EXACT )
301 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
302 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
308 Search for mandatory following text node; for lookahead, the text must
309 follow but for lookbehind (rn->flags != 0) we skip to the next step.
311 #define FIND_NEXT_IMPT(rn) STMT_START { \
312 while (JUMPABLE(rn)) { \
313 const OPCODE type = OP(rn); \
314 if (type == SUSPEND || PL_regkind[type] == CURLY) \
315 rn = NEXTOPER(NEXTOPER(rn)); \
316 else if (type == PLUS) \
318 else if (type == IFMATCH) \
319 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
320 else rn += NEXT_OFF(rn); \
325 static void restore_pos(pTHX_ void *arg);
327 #define REGCP_PAREN_ELEMS 3
328 #define REGCP_OTHER_ELEMS 3
329 #define REGCP_FRAME_ELEMS 1
330 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
331 * are needed for the regexp context stack bookkeeping. */
334 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
337 const int retval = PL_savestack_ix;
338 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
339 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
340 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
342 GET_RE_DEBUG_FLAGS_DECL;
344 PERL_ARGS_ASSERT_REGCPPUSH;
346 if (paren_elems_to_push < 0)
347 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
348 paren_elems_to_push);
350 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
351 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
352 " out of range (%lu-%ld)",
353 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
355 SSGROW(total_elems + REGCP_FRAME_ELEMS);
358 if ((int)PL_regsize > (int)parenfloor)
359 PerlIO_printf(Perl_debug_log,
360 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
365 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
366 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
367 SSPUSHINT(rex->offs[p].end);
368 SSPUSHINT(rex->offs[p].start);
369 SSPUSHINT(rex->offs[p].start_tmp);
370 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
371 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
373 (IV)rex->offs[p].start,
374 (IV)rex->offs[p].start_tmp,
378 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
379 SSPUSHINT(PL_regsize);
380 SSPUSHINT(rex->lastparen);
381 SSPUSHINT(rex->lastcloseparen);
382 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
387 /* These are needed since we do not localize EVAL nodes: */
388 #define REGCP_SET(cp) \
390 PerlIO_printf(Perl_debug_log, \
391 " Setting an EVAL scope, savestack=%"IVdf"\n", \
392 (IV)PL_savestack_ix)); \
395 #define REGCP_UNWIND(cp) \
397 if (cp != PL_savestack_ix) \
398 PerlIO_printf(Perl_debug_log, \
399 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
400 (IV)(cp), (IV)PL_savestack_ix)); \
403 #define UNWIND_PAREN(lp, lcp) \
404 for (n = rex->lastparen; n > lp; n--) \
405 rex->offs[n].end = -1; \
406 rex->lastparen = n; \
407 rex->lastcloseparen = lcp;
411 S_regcppop(pTHX_ regexp *rex)
416 GET_RE_DEBUG_FLAGS_DECL;
418 PERL_ARGS_ASSERT_REGCPPOP;
420 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
422 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
423 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
424 rex->lastcloseparen = SSPOPINT;
425 rex->lastparen = SSPOPINT;
426 PL_regsize = SSPOPINT;
428 i -= REGCP_OTHER_ELEMS;
429 /* Now restore the parentheses context. */
431 if (i || rex->lastparen + 1 <= rex->nparens)
432 PerlIO_printf(Perl_debug_log,
433 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
439 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
441 rex->offs[paren].start_tmp = SSPOPINT;
442 rex->offs[paren].start = SSPOPINT;
444 if (paren <= rex->lastparen)
445 rex->offs[paren].end = tmps;
446 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
447 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (IV)rex->offs[paren].start,
450 (IV)rex->offs[paren].start_tmp,
451 (IV)rex->offs[paren].end,
452 (paren > rex->lastparen ? "(skipped)" : ""));
457 /* It would seem that the similar code in regtry()
458 * already takes care of this, and in fact it is in
459 * a better location to since this code can #if 0-ed out
460 * but the code in regtry() is needed or otherwise tests
461 * requiring null fields (pat.t#187 and split.t#{13,14}
462 * (as of patchlevel 7877) will fail. Then again,
463 * this code seems to be necessary or otherwise
464 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
465 * --jhi updated by dapm */
466 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
468 rex->offs[i].start = -1;
469 rex->offs[i].end = -1;
470 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
471 " \\%"UVuf": %s ..-1 undeffing\n",
473 (i > PL_regsize) ? "-1" : " "
479 /* restore the parens and associated vars at savestack position ix,
480 * but without popping the stack */
483 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485 I32 tmpix = PL_savestack_ix;
486 PL_savestack_ix = ix;
488 PL_savestack_ix = tmpix;
491 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
494 * pregexec and friends
497 #ifndef PERL_IN_XSUB_RE
499 - pregexec - match a regexp against a string
502 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
503 char *strbeg, I32 minend, SV *screamer, U32 nosave)
504 /* stringarg: the point in the string at which to begin matching */
505 /* strend: pointer to null at end of string */
506 /* strbeg: real beginning of string */
507 /* minend: end of match must be >= minend bytes after stringarg. */
508 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
509 * itself is accessed via the pointers above */
510 /* nosave: For optimizations. */
512 PERL_ARGS_ASSERT_PREGEXEC;
515 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
516 nosave ? 0 : REXEC_COPY_STR);
521 * Need to implement the following flags for reg_anch:
523 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
525 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
526 * INTUIT_AUTORITATIVE_ML
527 * INTUIT_ONCE_NOML - Intuit can match in one location only.
530 * Another flag for this function: SECOND_TIME (so that float substrs
531 * with giant delta may be not rechecked).
534 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
536 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
537 Otherwise, only SvCUR(sv) is used to get strbeg. */
539 /* XXXX We assume that strpos is strbeg unless sv. */
541 /* XXXX Some places assume that there is a fixed substring.
542 An update may be needed if optimizer marks as "INTUITable"
543 RExen without fixed substrings. Similarly, it is assumed that
544 lengths of all the strings are no more than minlen, thus they
545 cannot come from lookahead.
546 (Or minlen should take into account lookahead.)
547 NOTE: Some of this comment is not correct. minlen does now take account
548 of lookahead/behind. Further research is required. -- demerphq
552 /* A failure to find a constant substring means that there is no need to make
553 an expensive call to REx engine, thus we celebrate a failure. Similarly,
554 finding a substring too deep into the string means that less calls to
555 regtry() should be needed.
557 REx compiler's optimizer found 4 possible hints:
558 a) Anchored substring;
560 c) Whether we are anchored (beginning-of-line or \G);
561 d) First node (of those at offset 0) which may distinguish positions;
562 We use a)b)d) and multiline-part of c), and try to find a position in the
563 string which does not contradict any of them.
566 /* Most of decisions we do here should have been done at compile time.
567 The nodes of the REx which we used for the search should have been
568 deleted from the finite automaton. */
571 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
572 char *strend, const U32 flags, re_scream_pos_data *data)
575 struct regexp *const prog = (struct regexp *)SvANY(rx);
577 /* Should be nonnegative! */
583 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
585 char *other_last = NULL; /* other substr checked before this */
586 char *check_at = NULL; /* check substr found at this pos */
587 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
588 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
589 RXi_GET_DECL(prog,progi);
591 const char * const i_strpos = strpos;
593 GET_RE_DEBUG_FLAGS_DECL;
595 PERL_ARGS_ASSERT_RE_INTUIT_START;
596 PERL_UNUSED_ARG(flags);
597 PERL_UNUSED_ARG(data);
599 RX_MATCH_UTF8_set(rx,utf8_target);
602 PL_reg_flags |= RF_utf8;
605 debug_start_match(rx, utf8_target, strpos, strend,
606 sv ? "Guessing start of match in sv for"
607 : "Guessing start of match in string for");
610 /* CHR_DIST() would be more correct here but it makes things slow. */
611 if (prog->minlen > strend - strpos) {
612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
613 "String too short... [re_intuit_start]\n"));
617 /* XXX we need to pass strbeg as a separate arg: the following is
618 * guesswork and can be wrong... */
619 if (sv && SvPOK(sv)) {
620 char * p = SvPVX(sv);
621 STRLEN cur = SvCUR(sv);
622 if (p <= strpos && strpos < p + cur) {
624 assert(p <= strend && strend <= p + cur);
627 strbeg = strend - cur;
634 if (!prog->check_utf8 && prog->check_substr)
635 to_utf8_substr(prog);
636 check = prog->check_utf8;
638 if (!prog->check_substr && prog->check_utf8) {
639 if (! to_byte_substr(prog)) {
640 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
641 non_utf8_target_but_utf8_required));
645 check = prog->check_substr;
647 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
648 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
649 || ( (prog->extflags & RXf_ANCH_BOL)
650 && !multiline ) ); /* Check after \n? */
653 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
654 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
655 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
657 && (strpos != strbeg)) {
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
661 if (prog->check_offset_min == prog->check_offset_max &&
662 !(prog->extflags & RXf_CANY_SEEN)) {
663 /* Substring at constant offset from beg-of-str... */
666 s = HOP3c(strpos, prog->check_offset_min, strend);
669 slen = SvCUR(check); /* >= 1 */
671 if ( strend - s > slen || strend - s < slen - 1
672 || (strend - s == slen && strend[-1] != '\n')) {
673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
676 /* Now should match s[0..slen-2] */
678 if (slen && (*SvPVX_const(check) != *s
680 && memNE(SvPVX_const(check), s, slen)))) {
682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
686 else if (*SvPVX_const(check) != *s
687 || ((slen = SvCUR(check)) > 1
688 && memNE(SvPVX_const(check), s, slen)))
691 goto success_at_start;
694 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
696 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
697 end_shift = prog->check_end_shift;
700 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
701 - (SvTAIL(check) != 0);
702 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
704 if (end_shift < eshift)
708 else { /* Can match at random position */
711 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
712 end_shift = prog->check_end_shift;
714 /* end shift should be non negative here */
717 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
719 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
720 (IV)end_shift, RX_PRECOMP(prog));
724 /* Find a possible match in the region s..strend by looking for
725 the "check" substring in the region corrected by start/end_shift. */
728 I32 srch_start_shift = start_shift;
729 I32 srch_end_shift = end_shift;
732 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
733 srch_end_shift -= ((strbeg - s) - srch_start_shift);
734 srch_start_shift = strbeg - s;
736 DEBUG_OPTIMISE_MORE_r({
737 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
738 (IV)prog->check_offset_min,
739 (IV)srch_start_shift,
741 (IV)prog->check_end_shift);
744 if (prog->extflags & RXf_CANY_SEEN) {
745 start_point= (U8*)(s + srch_start_shift);
746 end_point= (U8*)(strend - srch_end_shift);
748 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
749 end_point= HOP3(strend, -srch_end_shift, strbeg);
751 DEBUG_OPTIMISE_MORE_r({
752 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
753 (int)(end_point - start_point),
754 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
758 s = fbm_instr( start_point, end_point,
759 check, multiline ? FBMrf_MULTILINE : 0);
761 /* Update the count-of-usability, remove useless subpatterns,
765 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
766 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
767 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
768 (s ? "Found" : "Did not find"),
769 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
770 ? "anchored" : "floating"),
773 (s ? " at offset " : "...\n") );
778 /* Finish the diagnostic message */
779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
781 /* XXX dmq: first branch is for positive lookbehind...
782 Our check string is offset from the beginning of the pattern.
783 So we need to do any stclass tests offset forward from that
792 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
793 Start with the other substr.
794 XXXX no SCREAM optimization yet - and a very coarse implementation
795 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
796 *always* match. Probably should be marked during compile...
797 Probably it is right to do no SCREAM here...
800 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
801 : (prog->float_substr && prog->anchored_substr))
803 /* Take into account the "other" substring. */
804 /* XXXX May be hopelessly wrong for UTF... */
807 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
810 char * const last = HOP3c(s, -start_shift, strbeg);
812 char * const saved_s = s;
815 t = s - prog->check_offset_max;
816 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
818 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
823 t = HOP3c(t, prog->anchored_offset, strend);
824 if (t < other_last) /* These positions already checked */
826 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
829 /* XXXX It is not documented what units *_offsets are in.
830 We assume bytes, but this is clearly wrong.
831 Meaning this code needs to be carefully reviewed for errors.
835 /* On end-of-str: see comment below. */
836 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
837 if (must == &PL_sv_undef) {
839 DEBUG_r(must = prog->anchored_utf8); /* for debug */
844 HOP3(HOP3(last1, prog->anchored_offset, strend)
845 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
847 multiline ? FBMrf_MULTILINE : 0
850 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
851 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
852 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
853 (s ? "Found" : "Contradicts"),
854 quoted, RE_SV_TAIL(must));
859 if (last1 >= last2) {
860 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
861 ", giving up...\n"));
864 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
865 ", trying floating at offset %ld...\n",
866 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
867 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
868 s = HOP3c(last, 1, strend);
872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
873 (long)(s - i_strpos)));
874 t = HOP3c(s, -prog->anchored_offset, strbeg);
875 other_last = HOP3c(s, 1, strend);
883 else { /* Take into account the floating substring. */
885 char * const saved_s = s;
888 t = HOP3c(s, -start_shift, strbeg);
890 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
891 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
892 last = HOP3c(t, prog->float_max_offset, strend);
893 s = HOP3c(t, prog->float_min_offset, strend);
896 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
897 must = utf8_target ? prog->float_utf8 : prog->float_substr;
898 /* fbm_instr() takes into account exact value of end-of-str
899 if the check is SvTAIL(ed). Since false positives are OK,
900 and end-of-str is not later than strend we are OK. */
901 if (must == &PL_sv_undef) {
903 DEBUG_r(must = prog->float_utf8); /* for debug message */
906 s = fbm_instr((unsigned char*)s,
907 (unsigned char*)last + SvCUR(must)
909 must, multiline ? FBMrf_MULTILINE : 0);
911 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
912 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
913 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
914 (s ? "Found" : "Contradicts"),
915 quoted, RE_SV_TAIL(must));
919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
920 ", giving up...\n"));
923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
924 ", trying anchored starting at offset %ld...\n",
925 (long)(saved_s + 1 - i_strpos)));
927 s = HOP3c(t, 1, strend);
931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
932 (long)(s - i_strpos)));
933 other_last = s; /* Fix this later. --Hugo */
943 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
945 DEBUG_OPTIMISE_MORE_r(
946 PerlIO_printf(Perl_debug_log,
947 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
948 (IV)prog->check_offset_min,
949 (IV)prog->check_offset_max,
957 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
959 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
962 /* Fixed substring is found far enough so that the match
963 cannot start at strpos. */
965 if (ml_anch && t[-1] != '\n') {
966 /* Eventually fbm_*() should handle this, but often
967 anchored_offset is not 0, so this check will not be wasted. */
968 /* XXXX In the code below we prefer to look for "^" even in
969 presence of anchored substrings. And we search even
970 beyond the found float position. These pessimizations
971 are historical artefacts only. */
973 while (t < strend - prog->minlen) {
975 if (t < check_at - prog->check_offset_min) {
976 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
977 /* Since we moved from the found position,
978 we definitely contradict the found anchored
979 substr. Due to the above check we do not
980 contradict "check" substr.
981 Thus we can arrive here only if check substr
982 is float. Redo checking for "other"=="fixed".
985 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
986 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
987 goto do_other_anchored;
989 /* We don't contradict the found floating substring. */
990 /* XXXX Why not check for STCLASS? */
992 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
993 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
996 /* Position contradicts check-string */
997 /* XXXX probably better to look for check-string
998 than for "\n", so one should lower the limit for t? */
999 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1000 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1001 other_last = strpos = s = t + 1;
1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1007 PL_colors[0], PL_colors[1]));
1011 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1012 PL_colors[0], PL_colors[1]));
1016 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1019 /* The found string does not prohibit matching at strpos,
1020 - no optimization of calling REx engine can be performed,
1021 unless it was an MBOL and we are not after MBOL,
1022 or a future STCLASS check will fail this. */
1024 /* Even in this situation we may use MBOL flag if strpos is offset
1025 wrt the start of the string. */
1026 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1027 && (strpos != strbeg) && strpos[-1] != '\n'
1028 /* May be due to an implicit anchor of m{.*foo} */
1029 && !(prog->intflags & PREGf_IMPLICIT))
1034 DEBUG_EXECUTE_r( if (ml_anch)
1035 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1036 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1039 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1041 prog->check_utf8 /* Could be deleted already */
1042 && --BmUSEFUL(prog->check_utf8) < 0
1043 && (prog->check_utf8 == prog->float_utf8)
1045 prog->check_substr /* Could be deleted already */
1046 && --BmUSEFUL(prog->check_substr) < 0
1047 && (prog->check_substr == prog->float_substr)
1050 /* If flags & SOMETHING - do not do it many times on the same match */
1051 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1052 /* XXX Does the destruction order has to change with utf8_target? */
1053 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1054 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1055 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1056 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1057 check = NULL; /* abort */
1059 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1060 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1061 if (prog->intflags & PREGf_IMPLICIT)
1062 prog->extflags &= ~RXf_ANCH_MBOL;
1063 /* XXXX This is a remnant of the old implementation. It
1064 looks wasteful, since now INTUIT can use many
1065 other heuristics. */
1066 prog->extflags &= ~RXf_USE_INTUIT;
1067 /* XXXX What other flags might need to be cleared in this branch? */
1073 /* Last resort... */
1074 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1075 /* trie stclasses are too expensive to use here, we are better off to
1076 leave it to regmatch itself */
1077 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1078 /* minlen == 0 is possible if regstclass is \b or \B,
1079 and the fixed substr is ''$.
1080 Since minlen is already taken into account, s+1 is before strend;
1081 accidentally, minlen >= 1 guaranties no false positives at s + 1
1082 even for \b or \B. But (minlen? 1 : 0) below assumes that
1083 regstclass does not come from lookahead... */
1084 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1085 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1086 const U8* const str = (U8*)STRING(progi->regstclass);
1087 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1088 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1091 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1092 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1093 else if (prog->float_substr || prog->float_utf8)
1094 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1098 if (checked_upto < s)
1100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1101 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1109 const char *what = NULL;
1111 if (endpos == strend) {
1112 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1113 "Could not match STCLASS...\n") );
1116 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1117 "This position contradicts STCLASS...\n") );
1118 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1120 checked_upto = HOPBACKc(endpos, start_shift);
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1122 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1123 /* Contradict one of substrings */
1124 if (prog->anchored_substr || prog->anchored_utf8) {
1125 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1126 DEBUG_EXECUTE_r( what = "anchored" );
1128 s = HOP3c(t, 1, strend);
1129 if (s + start_shift + end_shift > strend) {
1130 /* XXXX Should be taken into account earlier? */
1131 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1132 "Could not match STCLASS...\n") );
1137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1138 "Looking for %s substr starting at offset %ld...\n",
1139 what, (long)(s + start_shift - i_strpos)) );
1142 /* Have both, check_string is floating */
1143 if (t + start_shift >= check_at) /* Contradicts floating=check */
1144 goto retry_floating_check;
1145 /* Recheck anchored substring, but not floating... */
1149 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1150 "Looking for anchored substr starting at offset %ld...\n",
1151 (long)(other_last - i_strpos)) );
1152 goto do_other_anchored;
1154 /* Another way we could have checked stclass at the
1155 current position only: */
1160 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161 "Looking for /%s^%s/m starting at offset %ld...\n",
1162 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1165 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1167 /* Check is floating substring. */
1168 retry_floating_check:
1169 t = check_at - start_shift;
1170 DEBUG_EXECUTE_r( what = "floating" );
1171 goto hop_and_restart;
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 "By STCLASS: moving %ld --> %ld\n",
1176 (long)(t - i_strpos), (long)(s - i_strpos))
1180 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1181 "Does not contradict STCLASS...\n");
1186 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1187 PL_colors[4], (check ? "Guessed" : "Giving up"),
1188 PL_colors[5], (long)(s - i_strpos)) );
1191 fail_finish: /* Substring not found */
1192 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1193 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1196 PL_colors[4], PL_colors[5]));
1200 #define DECL_TRIE_TYPE(scan) \
1201 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1202 trie_type = ((scan->flags == EXACT) \
1203 ? (utf8_target ? trie_utf8 : trie_plain) \
1204 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1206 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1207 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1209 switch (trie_type) { \
1210 case trie_utf8_fold: \
1211 if ( foldlen>0 ) { \
1212 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1217 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1218 len = UTF8SKIP(uc); \
1219 skiplen = UNISKIP( uvc ); \
1220 foldlen -= skiplen; \
1221 uscan = foldbuf + skiplen; \
1224 case trie_latin_utf8_fold: \
1225 if ( foldlen>0 ) { \
1226 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1232 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1233 skiplen = UNISKIP( uvc ); \
1234 foldlen -= skiplen; \
1235 uscan = foldbuf + skiplen; \
1239 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1246 charid = trie->charmap[ uvc ]; \
1250 if (widecharmap) { \
1251 SV** const svpp = hv_fetch(widecharmap, \
1252 (char*)&uvc, sizeof(UV), 0); \
1254 charid = (U16)SvIV(*svpp); \
1259 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1263 && (ln == 1 || folder(s, pat_string, ln)) \
1264 && (!reginfo || regtry(reginfo, &s)) ) \
1270 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1272 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1278 #define REXEC_FBC_SCAN(CoDe) \
1280 while (s < strend) { \
1286 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1287 REXEC_FBC_UTF8_SCAN( \
1289 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1298 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1301 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1310 #define REXEC_FBC_TRYIT \
1311 if ((!reginfo || regtry(reginfo, &s))) \
1314 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1315 if (utf8_target) { \
1316 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 REXEC_FBC_CLASS_SCAN(CoNd); \
1322 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1323 if (utf8_target) { \
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
1331 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1332 PL_reg_flags |= RF_tainted; \
1333 if (utf8_target) { \
1334 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1337 REXEC_FBC_CLASS_SCAN(CoNd); \
1340 #define DUMP_EXEC_POS(li,s,doutf8) \
1341 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1344 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1345 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1346 tmp = TEST_NON_UTF8(tmp); \
1347 REXEC_FBC_UTF8_SCAN( \
1348 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1357 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1358 if (s == PL_bostr) { \
1362 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1363 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1366 LOAD_UTF8_CHARCLASS_ALNUM(); \
1367 REXEC_FBC_UTF8_SCAN( \
1368 if (tmp == ! (TeSt2_UtF8)) { \
1377 /* The only difference between the BOUND and NBOUND cases is that
1378 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1379 * NBOUND. This is accomplished by passing it in either the if or else clause,
1380 * with the other one being empty */
1381 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1382 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1384 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1385 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1387 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1388 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1390 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1394 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1395 * be passed in completely with the variable name being tested, which isn't
1396 * such a clean interface, but this is easier to read than it was before. We
1397 * are looking for the boundary (or non-boundary between a word and non-word
1398 * character. The utf8 and non-utf8 cases have the same logic, but the details
1399 * must be different. Find the "wordness" of the character just prior to this
1400 * one, and compare it with the wordness of this one. If they differ, we have
1401 * a boundary. At the beginning of the string, pretend that the previous
1402 * character was a new-line */
1403 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1404 if (utf8_target) { \
1407 else { /* Not utf8 */ \
1408 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1409 tmp = TEST_NON_UTF8(tmp); \
1411 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1420 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1423 /* We know what class REx starts with. Try to find this position... */
1424 /* if reginfo is NULL, its a dryrun */
1425 /* annoyingly all the vars in this routine have different names from their counterparts
1426 in regmatch. /grrr */
1429 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1430 const char *strend, regmatch_info *reginfo)
1433 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1434 char *pat_string; /* The pattern's exactish string */
1435 char *pat_end; /* ptr to end char of pat_string */
1436 re_fold_t folder; /* Function for computing non-utf8 folds */
1437 const U8 *fold_array; /* array for folding ords < 256 */
1444 I32 tmp = 1; /* Scratch variable? */
1445 const bool utf8_target = PL_reg_match_utf8;
1446 UV utf8_fold_flags = 0;
1447 RXi_GET_DECL(prog,progi);
1449 PERL_ARGS_ASSERT_FIND_BYCLASS;
1451 /* We know what class it must start with. */
1455 if (utf8_target || OP(c) == ANYOFV) {
1456 STRLEN inclasslen = strend - s;
1457 REXEC_FBC_UTF8_CLASS_SCAN(
1458 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1461 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1466 if (tmp && (!reginfo || regtry(reginfo, &s)))
1474 if (UTF_PATTERN || utf8_target) {
1475 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1476 goto do_exactf_utf8;
1478 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1479 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1480 goto do_exactf_non_utf8; /* isn't dealt with by these */
1485 /* regcomp.c already folded this if pattern is in UTF-8 */
1486 utf8_fold_flags = 0;
1487 goto do_exactf_utf8;
1489 fold_array = PL_fold;
1491 goto do_exactf_non_utf8;
1494 if (UTF_PATTERN || utf8_target) {
1495 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1496 goto do_exactf_utf8;
1498 fold_array = PL_fold_locale;
1499 folder = foldEQ_locale;
1500 goto do_exactf_non_utf8;
1504 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1506 goto do_exactf_utf8;
1508 case EXACTFU_TRICKYFOLD:
1510 if (UTF_PATTERN || utf8_target) {
1511 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1512 goto do_exactf_utf8;
1515 /* Any 'ss' in the pattern should have been replaced by regcomp,
1516 * so we don't have to worry here about this single special case
1517 * in the Latin1 range */
1518 fold_array = PL_fold_latin1;
1519 folder = foldEQ_latin1;
1523 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1524 are no glitches with fold-length differences
1525 between the target string and pattern */
1527 /* The idea in the non-utf8 EXACTF* cases is to first find the
1528 * first character of the EXACTF* node and then, if necessary,
1529 * case-insensitively compare the full text of the node. c1 is the
1530 * first character. c2 is its fold. This logic will not work for
1531 * Unicode semantics and the german sharp ss, which hence should
1532 * not be compiled into a node that gets here. */
1533 pat_string = STRING(c);
1534 ln = STR_LEN(c); /* length to match in octets/bytes */
1536 /* We know that we have to match at least 'ln' bytes (which is the
1537 * same as characters, since not utf8). If we have to match 3
1538 * characters, and there are only 2 availabe, we know without
1539 * trying that it will fail; so don't start a match past the
1540 * required minimum number from the far end */
1541 e = HOP3c(strend, -((I32)ln), s);
1543 if (!reginfo && e < s) {
1544 e = s; /* Due to minlen logic of intuit() */
1548 c2 = fold_array[c1];
1549 if (c1 == c2) { /* If char and fold are the same */
1550 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1553 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1562 /* If one of the operands is in utf8, we can't use the simpler
1563 * folding above, due to the fact that many different characters
1564 * can have the same fold, or portion of a fold, or different-
1566 pat_string = STRING(c);
1567 ln = STR_LEN(c); /* length to match in octets/bytes */
1568 pat_end = pat_string + ln;
1569 lnc = (UTF_PATTERN) /* length to match in characters */
1570 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1573 /* We have 'lnc' characters to match in the pattern, but because of
1574 * multi-character folding, each character in the target can match
1575 * up to 3 characters (Unicode guarantees it will never exceed
1576 * this) if it is utf8-encoded; and up to 2 if not (based on the
1577 * fact that the Latin 1 folds are already determined, and the
1578 * only multi-char fold in that range is the sharp-s folding to
1579 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1580 * string character. Adjust lnc accordingly, rounding up, so that
1581 * if we need to match at least 4+1/3 chars, that really is 5. */
1582 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1583 lnc = (lnc + expansion - 1) / expansion;
1585 /* As in the non-UTF8 case, if we have to match 3 characters, and
1586 * only 2 are left, it's guaranteed to fail, so don't start a
1587 * match that would require us to go beyond the end of the string
1589 e = HOP3c(strend, -((I32)lnc), s);
1591 if (!reginfo && e < s) {
1592 e = s; /* Due to minlen logic of intuit() */
1595 /* XXX Note that we could recalculate e to stop the loop earlier,
1596 * as the worst case expansion above will rarely be met, and as we
1597 * go along we would usually find that e moves further to the left.
1598 * This would happen only after we reached the point in the loop
1599 * where if there were no expansion we should fail. Unclear if
1600 * worth the expense */
1603 char *my_strend= (char *)strend;
1604 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1605 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1606 && (!reginfo || regtry(reginfo, &s)) )
1610 s += (utf8_target) ? UTF8SKIP(s) : 1;
1615 PL_reg_flags |= RF_tainted;
1616 FBC_BOUND(isALNUM_LC,
1617 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1618 isALNUM_LC_utf8((U8*)s));
1621 PL_reg_flags |= RF_tainted;
1622 FBC_NBOUND(isALNUM_LC,
1623 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1624 isALNUM_LC_utf8((U8*)s));
1627 FBC_BOUND(isWORDCHAR,
1629 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1632 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1634 isWORDCHAR_A((U8*)s));
1637 FBC_NBOUND(isWORDCHAR,
1639 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1642 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1644 isWORDCHAR_A((U8*)s));
1647 FBC_BOUND(isWORDCHAR_L1,
1649 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1652 FBC_NBOUND(isWORDCHAR_L1,
1654 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657 REXEC_FBC_CSCAN_TAINT(
1658 isALNUM_LC_utf8((U8*)s),
1663 REXEC_FBC_CSCAN_PRELOAD(
1664 LOAD_UTF8_CHARCLASS_ALNUM(),
1665 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1666 isWORDCHAR_L1((U8) *s)
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1677 /* Don't need to worry about utf8, as it can match only a single
1678 * byte invariant character */
1679 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1682 REXEC_FBC_CSCAN_PRELOAD(
1683 LOAD_UTF8_CHARCLASS_ALNUM(),
1684 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1685 ! isWORDCHAR_L1((U8) *s)
1689 REXEC_FBC_CSCAN_PRELOAD(
1690 LOAD_UTF8_CHARCLASS_ALNUM(),
1691 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1702 REXEC_FBC_CSCAN_TAINT(
1703 !isALNUM_LC_utf8((U8*)s),
1708 REXEC_FBC_CSCAN_PRELOAD(
1709 LOAD_UTF8_CHARCLASS_SPACE(),
1710 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1715 REXEC_FBC_CSCAN_PRELOAD(
1716 LOAD_UTF8_CHARCLASS_SPACE(),
1717 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1722 /* Don't need to worry about utf8, as it can match only a single
1723 * byte invariant character */
1724 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1727 REXEC_FBC_CSCAN_TAINT(
1728 isSPACE_LC_utf8((U8*)s),
1733 REXEC_FBC_CSCAN_PRELOAD(
1734 LOAD_UTF8_CHARCLASS_SPACE(),
1735 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1736 ! isSPACE_L1((U8) *s)
1740 REXEC_FBC_CSCAN_PRELOAD(
1741 LOAD_UTF8_CHARCLASS_SPACE(),
1742 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1753 REXEC_FBC_CSCAN_TAINT(
1754 !isSPACE_LC_utf8((U8*)s),
1759 REXEC_FBC_CSCAN_PRELOAD(
1760 LOAD_UTF8_CHARCLASS_DIGIT(),
1761 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1766 /* Don't need to worry about utf8, as it can match only a single
1767 * byte invariant character */
1768 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1771 REXEC_FBC_CSCAN_TAINT(
1772 isDIGIT_LC_utf8((U8*)s),
1777 REXEC_FBC_CSCAN_PRELOAD(
1778 LOAD_UTF8_CHARCLASS_DIGIT(),
1779 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1790 REXEC_FBC_CSCAN_TAINT(
1791 !isDIGIT_LC_utf8((U8*)s),
1797 is_LNBREAK_utf8_safe(s, strend),
1798 is_LNBREAK_latin1_safe(s, strend)
1803 is_VERTWS_utf8_safe(s, strend),
1804 is_VERTWS_latin1_safe(s, strend)
1809 !is_VERTWS_utf8_safe(s, strend),
1810 !is_VERTWS_latin1_safe(s, strend)
1815 is_HORIZWS_utf8_safe(s, strend),
1816 is_HORIZWS_latin1_safe(s, strend)
1821 !is_HORIZWS_utf8_safe(s, strend),
1822 !is_HORIZWS_latin1_safe(s, strend)
1826 /* Don't need to worry about utf8, as it can match only a single
1827 * byte invariant character. The flag in this node type is the
1828 * class number to pass to _generic_isCC() to build a mask for
1829 * searching in PL_charclass[] */
1830 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1834 !_generic_isCC_A(*s, FLAGS(c)),
1835 !_generic_isCC_A(*s, FLAGS(c))
1843 /* what trie are we using right now */
1845 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1847 = (reg_trie_data*)progi->data->data[ aho->trie ];
1848 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1850 const char *last_start = strend - trie->minlen;
1852 const char *real_start = s;
1854 STRLEN maxlen = trie->maxlen;
1856 U8 **points; /* map of where we were in the input string
1857 when reading a given char. For ASCII this
1858 is unnecessary overhead as the relationship
1859 is always 1:1, but for Unicode, especially
1860 case folded Unicode this is not true. */
1861 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1865 GET_RE_DEBUG_FLAGS_DECL;
1867 /* We can't just allocate points here. We need to wrap it in
1868 * an SV so it gets freed properly if there is a croak while
1869 * running the match */
1872 sv_points=newSV(maxlen * sizeof(U8 *));
1873 SvCUR_set(sv_points,
1874 maxlen * sizeof(U8 *));
1875 SvPOK_on(sv_points);
1876 sv_2mortal(sv_points);
1877 points=(U8**)SvPV_nolen(sv_points );
1878 if ( trie_type != trie_utf8_fold
1879 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1882 bitmap=(U8*)trie->bitmap;
1884 bitmap=(U8*)ANYOF_BITMAP(c);
1886 /* this is the Aho-Corasick algorithm modified a touch
1887 to include special handling for long "unknown char"
1888 sequences. The basic idea being that we use AC as long
1889 as we are dealing with a possible matching char, when
1890 we encounter an unknown char (and we have not encountered
1891 an accepting state) we scan forward until we find a legal
1893 AC matching is basically that of trie matching, except
1894 that when we encounter a failing transition, we fall back
1895 to the current states "fail state", and try the current char
1896 again, a process we repeat until we reach the root state,
1897 state 1, or a legal transition. If we fail on the root state
1898 then we can either terminate if we have reached an accepting
1899 state previously, or restart the entire process from the beginning
1903 while (s <= last_start) {
1904 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1912 U8 *uscan = (U8*)NULL;
1913 U8 *leftmost = NULL;
1915 U32 accepted_word= 0;
1919 while ( state && uc <= (U8*)strend ) {
1921 U32 word = aho->states[ state ].wordnum;
1925 DEBUG_TRIE_EXECUTE_r(
1926 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1927 dump_exec_pos( (char *)uc, c, strend, real_start,
1928 (char *)uc, utf8_target );
1929 PerlIO_printf( Perl_debug_log,
1930 " Scanning for legal start char...\n");
1934 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1938 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1944 if (uc >(U8*)last_start) break;
1948 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1949 if (!leftmost || lpos < leftmost) {
1950 DEBUG_r(accepted_word=word);
1956 points[pointpos++ % maxlen]= uc;
1957 if (foldlen || uc < (U8*)strend) {
1958 REXEC_TRIE_READ_CHAR(trie_type, trie,
1960 uscan, len, uvc, charid, foldlen,
1962 DEBUG_TRIE_EXECUTE_r({
1963 dump_exec_pos( (char *)uc, c, strend,
1964 real_start, s, utf8_target);
1965 PerlIO_printf(Perl_debug_log,
1966 " Charid:%3u CP:%4"UVxf" ",
1978 word = aho->states[ state ].wordnum;
1980 base = aho->states[ state ].trans.base;
1982 DEBUG_TRIE_EXECUTE_r({
1984 dump_exec_pos( (char *)uc, c, strend, real_start,
1986 PerlIO_printf( Perl_debug_log,
1987 "%sState: %4"UVxf", word=%"UVxf,
1988 failed ? " Fail transition to " : "",
1989 (UV)state, (UV)word);
1995 ( ((offset = base + charid
1996 - 1 - trie->uniquecharcount)) >= 0)
1997 && ((U32)offset < trie->lasttrans)
1998 && trie->trans[offset].check == state
1999 && (tmp=trie->trans[offset].next))
2001 DEBUG_TRIE_EXECUTE_r(
2002 PerlIO_printf( Perl_debug_log," - legal\n"));
2007 DEBUG_TRIE_EXECUTE_r(
2008 PerlIO_printf( Perl_debug_log," - fail\n"));
2010 state = aho->fail[state];
2014 /* we must be accepting here */
2015 DEBUG_TRIE_EXECUTE_r(
2016 PerlIO_printf( Perl_debug_log," - accepting\n"));
2025 if (!state) state = 1;
2028 if ( aho->states[ state ].wordnum ) {
2029 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2030 if (!leftmost || lpos < leftmost) {
2031 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2036 s = (char*)leftmost;
2037 DEBUG_TRIE_EXECUTE_r({
2039 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2040 (UV)accepted_word, (IV)(s - real_start)
2043 if (!reginfo || regtry(reginfo, &s)) {
2049 DEBUG_TRIE_EXECUTE_r({
2050 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2053 DEBUG_TRIE_EXECUTE_r(
2054 PerlIO_printf( Perl_debug_log,"No match.\n"));
2063 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2073 - regexec_flags - match a regexp against a string
2076 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2077 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2078 /* stringarg: the point in the string at which to begin matching */
2079 /* strend: pointer to null at end of string */
2080 /* strbeg: real beginning of string */
2081 /* minend: end of match must be >= minend bytes after stringarg. */
2082 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2083 * itself is accessed via the pointers above */
2084 /* data: May be used for some additional optimizations.
2085 Currently its only used, with a U32 cast, for transmitting
2086 the ganch offset when doing a /g match. This will change */
2087 /* nosave: For optimizations. */
2091 struct regexp *const prog = (struct regexp *)SvANY(rx);
2092 /*register*/ char *s;
2094 /*register*/ char *startpos = stringarg;
2095 I32 minlen; /* must match at least this many chars */
2096 I32 dontbother = 0; /* how many characters not to try at end */
2097 I32 end_shift = 0; /* Same for the end. */ /* CC */
2098 I32 scream_pos = -1; /* Internal iterator of scream. */
2099 char *scream_olds = NULL;
2100 const bool utf8_target = cBOOL(DO_UTF8(sv));
2102 RXi_GET_DECL(prog,progi);
2103 regmatch_info reginfo; /* create some info to pass to regtry etc */
2104 regexp_paren_pair *swap = NULL;
2105 GET_RE_DEBUG_FLAGS_DECL;
2107 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2108 PERL_UNUSED_ARG(data);
2110 /* Be paranoid... */
2111 if (prog == NULL || startpos == NULL) {
2112 Perl_croak(aTHX_ "NULL regexp parameter");
2116 multiline = prog->extflags & RXf_PMf_MULTILINE;
2117 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2119 RX_MATCH_UTF8_set(rx, utf8_target);
2121 debug_start_match(rx, utf8_target, startpos, strend,
2125 minlen = prog->minlen;
2127 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2128 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2129 "String too short [regexec_flags]...\n"));
2134 /* Check validity of program. */
2135 if (UCHARAT(progi->program) != REG_MAGIC) {
2136 Perl_croak(aTHX_ "corrupted regexp program");
2140 PL_reg_state.re_state_eval_setup_done = FALSE;
2144 PL_reg_flags |= RF_utf8;
2146 /* Mark beginning of line for ^ and lookbehind. */
2147 reginfo.bol = startpos; /* XXX not used ??? */
2151 /* Mark end of line for $ (and such) */
2154 /* see how far we have to get to not match where we matched before */
2155 reginfo.till = startpos+minend;
2157 /* If there is a "must appear" string, look for it. */
2160 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2162 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2163 reginfo.ganch = startpos + prog->gofs;
2164 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2165 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2166 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2168 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2169 && mg->mg_len >= 0) {
2170 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2171 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2172 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2174 if (prog->extflags & RXf_ANCH_GPOS) {
2175 if (s > reginfo.ganch)
2177 s = reginfo.ganch - prog->gofs;
2178 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2179 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2185 reginfo.ganch = strbeg + PTR2UV(data);
2186 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2187 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2189 } else { /* pos() not defined */
2190 reginfo.ganch = strbeg;
2191 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2192 "GPOS: reginfo.ganch = strbeg\n"));
2195 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2196 /* We have to be careful. If the previous successful match
2197 was from this regex we don't want a subsequent partially
2198 successful match to clobber the old results.
2199 So when we detect this possibility we add a swap buffer
2200 to the re, and switch the buffer each match. If we fail
2201 we switch it back, otherwise we leave it swapped.
2204 /* do we need a save destructor here for eval dies? */
2205 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2206 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2207 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2213 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2214 re_scream_pos_data d;
2216 d.scream_olds = &scream_olds;
2217 d.scream_pos = &scream_pos;
2218 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2221 goto phooey; /* not present */
2227 /* Simplest case: anchored match need be tried only once. */
2228 /* [unless only anchor is BOL and multiline is set] */
2229 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2230 if (s == startpos && regtry(®info, &startpos))
2232 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2233 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2238 dontbother = minlen - 1;
2239 end = HOP3c(strend, -dontbother, strbeg) - 1;
2240 /* for multiline we only have to try after newlines */
2241 if (prog->check_substr || prog->check_utf8) {
2242 /* because of the goto we can not easily reuse the macros for bifurcating the
2243 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2246 goto after_try_utf8;
2248 if (regtry(®info, &s)) {
2255 if (prog->extflags & RXf_USE_INTUIT) {
2256 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2265 } /* end search for check string in unicode */
2267 if (s == startpos) {
2268 goto after_try_latin;
2271 if (regtry(®info, &s)) {
2278 if (prog->extflags & RXf_USE_INTUIT) {
2279 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2288 } /* end search for check string in latin*/
2289 } /* end search for check string */
2290 else { /* search for newline */
2292 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2295 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2296 while (s <= end) { /* note it could be possible to match at the end of the string */
2297 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2298 if (regtry(®info, &s))
2302 } /* end search for newline */
2303 } /* end anchored/multiline check string search */
2305 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2307 /* the warning about reginfo.ganch being used without initialization
2308 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2309 and we only enter this block when the same bit is set. */
2310 char *tmp_s = reginfo.ganch - prog->gofs;
2312 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2317 /* Messy cases: unanchored match. */
2318 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2319 /* we have /x+whatever/ */
2320 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2326 if (! prog->anchored_utf8) {
2327 to_utf8_substr(prog);
2329 ch = SvPVX_const(prog->anchored_utf8)[0];
2332 DEBUG_EXECUTE_r( did_match = 1 );
2333 if (regtry(®info, &s)) goto got_it;
2335 while (s < strend && *s == ch)
2342 if (! prog->anchored_substr) {
2343 if (! to_byte_substr(prog)) {
2344 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2345 non_utf8_target_but_utf8_required));
2349 ch = SvPVX_const(prog->anchored_substr)[0];
2352 DEBUG_EXECUTE_r( did_match = 1 );
2353 if (regtry(®info, &s)) goto got_it;
2355 while (s < strend && *s == ch)
2360 DEBUG_EXECUTE_r(if (!did_match)
2361 PerlIO_printf(Perl_debug_log,
2362 "Did not find anchored character...\n")
2365 else if (prog->anchored_substr != NULL
2366 || prog->anchored_utf8 != NULL
2367 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2368 && prog->float_max_offset < strend - s)) {
2373 char *last1; /* Last position checked before */
2377 if (prog->anchored_substr || prog->anchored_utf8) {
2379 if (! prog->anchored_utf8) {
2380 to_utf8_substr(prog);
2382 must = prog->anchored_utf8;
2385 if (! prog->anchored_substr) {
2386 if (! to_byte_substr(prog)) {
2387 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2388 non_utf8_target_but_utf8_required));
2392 must = prog->anchored_substr;
2394 back_max = back_min = prog->anchored_offset;
2397 if (! prog->float_utf8) {
2398 to_utf8_substr(prog);
2400 must = prog->float_utf8;
2403 if (! prog->float_substr) {
2404 if (! to_byte_substr(prog)) {
2405 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2406 non_utf8_target_but_utf8_required));
2410 must = prog->float_substr;
2412 back_max = prog->float_max_offset;
2413 back_min = prog->float_min_offset;
2419 last = HOP3c(strend, /* Cannot start after this */
2420 -(I32)(CHR_SVLEN(must)
2421 - (SvTAIL(must) != 0) + back_min), strbeg);
2424 last1 = HOPc(s, -1);
2426 last1 = s - 1; /* bogus */
2428 /* XXXX check_substr already used to find "s", can optimize if
2429 check_substr==must. */
2431 dontbother = end_shift;
2432 strend = HOPc(strend, -dontbother);
2433 while ( (s <= last) &&
2434 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2435 (unsigned char*)strend, must,
2436 multiline ? FBMrf_MULTILINE : 0)) ) {
2437 DEBUG_EXECUTE_r( did_match = 1 );
2438 if (HOPc(s, -back_max) > last1) {
2439 last1 = HOPc(s, -back_min);
2440 s = HOPc(s, -back_max);
2443 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2445 last1 = HOPc(s, -back_min);
2449 while (s <= last1) {
2450 if (regtry(®info, &s))
2453 s++; /* to break out of outer loop */
2460 while (s <= last1) {
2461 if (regtry(®info, &s))
2467 DEBUG_EXECUTE_r(if (!did_match) {
2468 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2469 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2470 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2471 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2472 ? "anchored" : "floating"),
2473 quoted, RE_SV_TAIL(must));
2477 else if ( (c = progi->regstclass) ) {
2479 const OPCODE op = OP(progi->regstclass);
2480 /* don't bother with what can't match */
2481 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2482 strend = HOPc(strend, -(minlen - 1));
2485 SV * const prop = sv_newmortal();
2486 regprop(prog, prop, c);
2488 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2490 PerlIO_printf(Perl_debug_log,
2491 "Matching stclass %.*s against %s (%d bytes)\n",
2492 (int)SvCUR(prop), SvPVX_const(prop),
2493 quoted, (int)(strend - s));
2496 if (find_byclass(prog, c, s, strend, ®info))
2498 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2502 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2510 if (! prog->float_utf8) {
2511 to_utf8_substr(prog);
2513 float_real = prog->float_utf8;
2516 if (! prog->float_substr) {
2517 if (! to_byte_substr(prog)) {
2518 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2519 non_utf8_target_but_utf8_required));
2523 float_real = prog->float_substr;
2526 little = SvPV_const(float_real, len);
2527 if (SvTAIL(float_real)) {
2528 /* This means that float_real contains an artificial \n on
2529 * the end due to the presence of something like this:
2530 * /foo$/ where we can match both "foo" and "foo\n" at the
2531 * end of the string. So we have to compare the end of the
2532 * string first against the float_real without the \n and
2533 * then against the full float_real with the string. We
2534 * have to watch out for cases where the string might be
2535 * smaller than the float_real or the float_real without
2537 char *checkpos= strend - len;
2539 PerlIO_printf(Perl_debug_log,
2540 "%sChecking for float_real.%s\n",
2541 PL_colors[4], PL_colors[5]));
2542 if (checkpos + 1 < strbeg) {
2543 /* can't match, even if we remove the trailing \n
2544 * string is too short to match */
2546 PerlIO_printf(Perl_debug_log,
2547 "%sString shorter than required trailing substring, cannot match.%s\n",
2548 PL_colors[4], PL_colors[5]));
2550 } else if (memEQ(checkpos + 1, little, len - 1)) {
2551 /* can match, the end of the string matches without the
2553 last = checkpos + 1;
2554 } else if (checkpos < strbeg) {
2555 /* cant match, string is too short when the "\n" is
2558 PerlIO_printf(Perl_debug_log,
2559 "%sString does not contain required trailing substring, cannot match.%s\n",
2560 PL_colors[4], PL_colors[5]));
2562 } else if (!multiline) {
2563 /* non multiline match, so compare with the "\n" at the
2564 * end of the string */
2565 if (memEQ(checkpos, little, len)) {
2569 PerlIO_printf(Perl_debug_log,
2570 "%sString does not contain required trailing substring, cannot match.%s\n",
2571 PL_colors[4], PL_colors[5]));
2575 /* multiline match, so we have to search for a place
2576 * where the full string is located */
2582 last = rninstr(s, strend, little, little + len);
2584 last = strend; /* matching "$" */
2587 /* at one point this block contained a comment which was
2588 * probably incorrect, which said that this was a "should not
2589 * happen" case. Even if it was true when it was written I am
2590 * pretty sure it is not anymore, so I have removed the comment
2591 * and replaced it with this one. Yves */
2593 PerlIO_printf(Perl_debug_log,
2594 "String does not contain required substring, cannot match.\n"
2598 dontbother = strend - last + prog->float_min_offset;
2600 if (minlen && (dontbother < minlen))
2601 dontbother = minlen - 1;
2602 strend -= dontbother; /* this one's always in bytes! */
2603 /* We don't know much -- general case. */
2606 if (regtry(®info, &s))
2615 if (regtry(®info, &s))
2617 } while (s++ < strend);
2627 PerlIO_printf(Perl_debug_log,
2628 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2634 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2636 if (PL_reg_state.re_state_eval_setup_done)
2637 restore_pos(aTHX_ prog);
2638 if (RXp_PAREN_NAMES(prog))
2639 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2641 /* make sure $`, $&, $', and $digit will work later */
2642 if ( !(flags & REXEC_NOT_FIRST) ) {
2643 if (flags & REXEC_COPY_STR) {
2644 #ifdef PERL_OLD_COPY_ON_WRITE
2646 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2648 PerlIO_printf(Perl_debug_log,
2649 "Copy on write: regexp capture, type %d\n",
2652 RX_MATCH_COPY_FREE(rx);
2653 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2654 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2655 assert (SvPOKp(prog->saved_copy));
2656 prog->sublen = PL_regeol - strbeg;
2657 prog->suboffset = 0;
2658 prog->subcoffset = 0;
2663 I32 max = PL_regeol - strbeg;
2666 if ( (flags & REXEC_COPY_SKIP_POST)
2667 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2668 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2669 ) { /* don't copy $' part of string */
2672 /* calculate the right-most part of the string covered
2673 * by a capture. Due to look-ahead, this may be to
2674 * the right of $&, so we have to scan all captures */
2675 while (n <= prog->lastparen) {
2676 if (prog->offs[n].end > max)
2677 max = prog->offs[n].end;
2681 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2682 ? prog->offs[0].start
2684 assert(max >= 0 && max <= PL_regeol - strbeg);
2687 if ( (flags & REXEC_COPY_SKIP_PRE)
2688 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2689 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2690 ) { /* don't copy $` part of string */
2693 /* calculate the left-most part of the string covered
2694 * by a capture. Due to look-behind, this may be to
2695 * the left of $&, so we have to scan all captures */
2696 while (min && n <= prog->lastparen) {
2697 if ( prog->offs[n].start != -1
2698 && prog->offs[n].start < min)
2700 min = prog->offs[n].start;
2704 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2705 && min > prog->offs[0].end
2707 min = prog->offs[0].end;
2711 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2714 if (RX_MATCH_COPIED(rx)) {
2715 if (sublen > prog->sublen)
2717 (char*)saferealloc(prog->subbeg, sublen+1);
2720 prog->subbeg = (char*)safemalloc(sublen+1);
2721 Copy(strbeg + min, prog->subbeg, sublen, char);
2722 prog->subbeg[sublen] = '\0';
2723 prog->suboffset = min;
2724 prog->sublen = sublen;
2725 RX_MATCH_COPIED_on(rx);
2727 prog->subcoffset = prog->suboffset;
2728 if (prog->suboffset && utf8_target) {
2729 /* Convert byte offset to chars.
2730 * XXX ideally should only compute this if @-/@+
2731 * has been seen, a la PL_sawampersand ??? */
2733 /* If there's a direct correspondence between the
2734 * string which we're matching and the original SV,
2735 * then we can use the utf8 len cache associated with
2736 * the SV. In particular, it means that under //g,
2737 * sv_pos_b2u() will use the previously cached
2738 * position to speed up working out the new length of
2739 * subcoffset, rather than counting from the start of
2740 * the string each time. This stops
2741 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2742 * from going quadratic */
2743 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2744 sv_pos_b2u(sv, &(prog->subcoffset));
2746 prog->subcoffset = utf8_length((U8*)strbeg,
2747 (U8*)(strbeg+prog->suboffset));
2751 RX_MATCH_COPY_FREE(rx);
2752 prog->subbeg = strbeg;
2753 prog->suboffset = 0;
2754 prog->subcoffset = 0;
2755 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2763 PL_colors[4], PL_colors[5]));
2764 if (PL_reg_state.re_state_eval_setup_done)
2765 restore_pos(aTHX_ prog);
2767 /* we failed :-( roll it back */
2768 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2769 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2774 Safefree(prog->offs);
2781 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2782 * Do inc before dec, in case old and new rex are the same */
2783 #define SET_reg_curpm(Re2) \
2784 if (PL_reg_state.re_state_eval_setup_done) { \
2785 (void)ReREFCNT_inc(Re2); \
2786 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2787 PM_SETRE((PL_reg_curpm), (Re2)); \
2792 - regtry - try match at specific point
2794 STATIC I32 /* 0 failure, 1 success */
2795 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2799 REGEXP *const rx = reginfo->prog;
2800 regexp *const prog = (struct regexp *)SvANY(rx);
2802 RXi_GET_DECL(prog,progi);
2803 GET_RE_DEBUG_FLAGS_DECL;
2805 PERL_ARGS_ASSERT_REGTRY;
2807 reginfo->cutpoint=NULL;
2809 if ((prog->extflags & RXf_EVAL_SEEN)
2810 && !PL_reg_state.re_state_eval_setup_done)
2814 PL_reg_state.re_state_eval_setup_done = TRUE;
2816 /* Make $_ available to executed code. */
2817 if (reginfo->sv != DEFSV) {
2819 DEFSV_set(reginfo->sv);
2822 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2823 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2824 /* prepare for quick setting of pos */
2825 #ifdef PERL_OLD_COPY_ON_WRITE
2826 if (SvIsCOW(reginfo->sv))
2827 sv_force_normal_flags(reginfo->sv, 0);
2829 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2830 &PL_vtbl_mglob, NULL, 0);
2834 PL_reg_oldpos = mg->mg_len;
2835 SAVEDESTRUCTOR_X(restore_pos, prog);
2837 if (!PL_reg_curpm) {
2838 Newxz(PL_reg_curpm, 1, PMOP);
2841 SV* const repointer = &PL_sv_undef;
2842 /* this regexp is also owned by the new PL_reg_curpm, which
2843 will try to free it. */
2844 av_push(PL_regex_padav, repointer);
2845 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2846 PL_regex_pad = AvARRAY(PL_regex_padav);
2851 PL_reg_oldcurpm = PL_curpm;
2852 PL_curpm = PL_reg_curpm;
2853 if (RXp_MATCH_COPIED(prog)) {
2854 /* Here is a serious problem: we cannot rewrite subbeg,
2855 since it may be needed if this match fails. Thus
2856 $` inside (?{}) could fail... */
2857 PL_reg_oldsaved = prog->subbeg;
2858 PL_reg_oldsavedlen = prog->sublen;
2859 PL_reg_oldsavedoffset = prog->suboffset;
2860 PL_reg_oldsavedcoffset = prog->suboffset;
2861 #ifdef PERL_OLD_COPY_ON_WRITE
2862 PL_nrs = prog->saved_copy;
2864 RXp_MATCH_COPIED_off(prog);
2867 PL_reg_oldsaved = NULL;
2868 prog->subbeg = PL_bostr;
2869 prog->suboffset = 0;
2870 prog->subcoffset = 0;
2871 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2874 PL_reg_starttry = *startposp;
2876 prog->offs[0].start = *startposp - PL_bostr;
2877 prog->lastparen = 0;
2878 prog->lastcloseparen = 0;
2881 /* XXXX What this code is doing here?!!! There should be no need
2882 to do this again and again, prog->lastparen should take care of
2885 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2886 * Actually, the code in regcppop() (which Ilya may be meaning by
2887 * prog->lastparen), is not needed at all by the test suite
2888 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2889 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2890 * Meanwhile, this code *is* needed for the
2891 * above-mentioned test suite tests to succeed. The common theme
2892 * on those tests seems to be returning null fields from matches.
2893 * --jhi updated by dapm */
2895 if (prog->nparens) {
2896 regexp_paren_pair *pp = prog->offs;
2898 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2906 result = regmatch(reginfo, *startposp, progi->program + 1);
2908 prog->offs[0].end = result;
2911 if (reginfo->cutpoint)
2912 *startposp= reginfo->cutpoint;
2913 REGCP_UNWIND(lastcp);
2918 #define sayYES goto yes
2919 #define sayNO goto no
2920 #define sayNO_SILENT goto no_silent
2922 /* we dont use STMT_START/END here because it leads to
2923 "unreachable code" warnings, which are bogus, but distracting. */
2924 #define CACHEsayNO \
2925 if (ST.cache_mask) \
2926 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2929 /* this is used to determine how far from the left messages like
2930 'failed...' are printed. It should be set such that messages
2931 are inline with the regop output that created them.
2933 #define REPORT_CODE_OFF 32
2936 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2937 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2939 #define SLAB_FIRST(s) (&(s)->states[0])
2940 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2942 /* grab a new slab and return the first slot in it */
2944 STATIC regmatch_state *
2947 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2950 regmatch_slab *s = PL_regmatch_slab->next;
2952 Newx(s, 1, regmatch_slab);
2953 s->prev = PL_regmatch_slab;
2955 PL_regmatch_slab->next = s;
2957 PL_regmatch_slab = s;
2958 return SLAB_FIRST(s);
2962 /* push a new state then goto it */
2964 #define PUSH_STATE_GOTO(state, node, input) \
2965 pushinput = input; \
2967 st->resume_state = state; \
2970 /* push a new state with success backtracking, then goto it */
2972 #define PUSH_YES_STATE_GOTO(state, node, input) \
2973 pushinput = input; \
2975 st->resume_state = state; \
2976 goto push_yes_state;
2983 regmatch() - main matching routine
2985 This is basically one big switch statement in a loop. We execute an op,
2986 set 'next' to point the next op, and continue. If we come to a point which
2987 we may need to backtrack to on failure such as (A|B|C), we push a
2988 backtrack state onto the backtrack stack. On failure, we pop the top
2989 state, and re-enter the loop at the state indicated. If there are no more
2990 states to pop, we return failure.
2992 Sometimes we also need to backtrack on success; for example /A+/, where
2993 after successfully matching one A, we need to go back and try to
2994 match another one; similarly for lookahead assertions: if the assertion
2995 completes successfully, we backtrack to the state just before the assertion
2996 and then carry on. In these cases, the pushed state is marked as
2997 'backtrack on success too'. This marking is in fact done by a chain of
2998 pointers, each pointing to the previous 'yes' state. On success, we pop to
2999 the nearest yes state, discarding any intermediate failure-only states.
3000 Sometimes a yes state is pushed just to force some cleanup code to be
3001 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3002 it to free the inner regex.
3004 Note that failure backtracking rewinds the cursor position, while
3005 success backtracking leaves it alone.
3007 A pattern is complete when the END op is executed, while a subpattern
3008 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3009 ops trigger the "pop to last yes state if any, otherwise return true"
3012 A common convention in this function is to use A and B to refer to the two
3013 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3014 the subpattern to be matched possibly multiple times, while B is the entire
3015 rest of the pattern. Variable and state names reflect this convention.
3017 The states in the main switch are the union of ops and failure/success of
3018 substates associated with with that op. For example, IFMATCH is the op
3019 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3020 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3021 successfully matched A and IFMATCH_A_fail is a state saying that we have
3022 just failed to match A. Resume states always come in pairs. The backtrack
3023 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3024 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3025 on success or failure.
3027 The struct that holds a backtracking state is actually a big union, with
3028 one variant for each major type of op. The variable st points to the
3029 top-most backtrack struct. To make the code clearer, within each
3030 block of code we #define ST to alias the relevant union.
3032 Here's a concrete example of a (vastly oversimplified) IFMATCH
3038 #define ST st->u.ifmatch
3040 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3041 ST.foo = ...; // some state we wish to save
3043 // push a yes backtrack state with a resume value of
3044 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3046 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3049 case IFMATCH_A: // we have successfully executed A; now continue with B
3051 bar = ST.foo; // do something with the preserved value
3054 case IFMATCH_A_fail: // A failed, so the assertion failed
3055 ...; // do some housekeeping, then ...
3056 sayNO; // propagate the failure
3063 For any old-timers reading this who are familiar with the old recursive
3064 approach, the code above is equivalent to:
3066 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3075 ...; // do some housekeeping, then ...
3076 sayNO; // propagate the failure
3079 The topmost backtrack state, pointed to by st, is usually free. If you
3080 want to claim it, populate any ST.foo fields in it with values you wish to
3081 save, then do one of
3083 PUSH_STATE_GOTO(resume_state, node, newinput);
3084 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3086 which sets that backtrack state's resume value to 'resume_state', pushes a
3087 new free entry to the top of the backtrack stack, then goes to 'node'.
3088 On backtracking, the free slot is popped, and the saved state becomes the
3089 new free state. An ST.foo field in this new top state can be temporarily
3090 accessed to retrieve values, but once the main loop is re-entered, it
3091 becomes available for reuse.
3093 Note that the depth of the backtrack stack constantly increases during the
3094 left-to-right execution of the pattern, rather than going up and down with
3095 the pattern nesting. For example the stack is at its maximum at Z at the
3096 end of the pattern, rather than at X in the following:
3098 /(((X)+)+)+....(Y)+....Z/
3100 The only exceptions to this are lookahead/behind assertions and the cut,
3101 (?>A), which pop all the backtrack states associated with A before
3104 Backtrack state structs are allocated in slabs of about 4K in size.
3105 PL_regmatch_state and st always point to the currently active state,
3106 and PL_regmatch_slab points to the slab currently containing
3107 PL_regmatch_state. The first time regmatch() is called, the first slab is
3108 allocated, and is never freed until interpreter destruction. When the slab
3109 is full, a new one is allocated and chained to the end. At exit from
3110 regmatch(), slabs allocated since entry are freed.
3115 #define DEBUG_STATE_pp(pp) \
3117 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3118 PerlIO_printf(Perl_debug_log, \
3119 " %*s"pp" %s%s%s%s%s\n", \
3121 PL_reg_name[st->resume_state], \
3122 ((st==yes_state||st==mark_state) ? "[" : ""), \
3123 ((st==yes_state) ? "Y" : ""), \
3124 ((st==mark_state) ? "M" : ""), \
3125 ((st==yes_state||st==mark_state) ? "]" : "") \
3130 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3135 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3136 const char *start, const char *end, const char *blurb)
3138 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3140 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3145 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3146 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3148 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3149 start, end - start, 60);
3151 PerlIO_printf(Perl_debug_log,
3152 "%s%s REx%s %s against %s\n",
3153 PL_colors[4], blurb, PL_colors[5], s0, s1);
3155 if (utf8_target||utf8_pat)
3156 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3157 utf8_pat ? "pattern" : "",
3158 utf8_pat && utf8_target ? " and " : "",
3159 utf8_target ? "string" : ""
3165 S_dump_exec_pos(pTHX_ const char *locinput,
3166 const regnode *scan,
3167 const char *loc_regeol,
3168 const char *loc_bostr,
3169 const char *loc_reg_starttry,
3170 const bool utf8_target)
3172 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3173 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3174 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3175 /* The part of the string before starttry has one color
3176 (pref0_len chars), between starttry and current
3177 position another one (pref_len - pref0_len chars),
3178 after the current position the third one.
3179 We assume that pref0_len <= pref_len, otherwise we
3180 decrease pref0_len. */
3181 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3182 ? (5 + taill) - l : locinput - loc_bostr;
3185 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3187 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3189 pref0_len = pref_len - (locinput - loc_reg_starttry);
3190 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3191 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3192 ? (5 + taill) - pref_len : loc_regeol - locinput);
3193 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3197 if (pref0_len > pref_len)
3198 pref0_len = pref_len;
3200 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3202 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3203 (locinput - pref_len),pref0_len, 60, 4, 5);
3205 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3206 (locinput - pref_len + pref0_len),
3207 pref_len - pref0_len, 60, 2, 3);
3209 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3210 locinput, loc_regeol - locinput, 10, 0, 1);
3212 const STRLEN tlen=len0+len1+len2;
3213 PerlIO_printf(Perl_debug_log,
3214 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3215 (IV)(locinput - loc_bostr),
3218 (docolor ? "" : "> <"),
3220 (int)(tlen > 19 ? 0 : 19 - tlen),
3227 /* reg_check_named_buff_matched()
3228 * Checks to see if a named buffer has matched. The data array of
3229 * buffer numbers corresponding to the buffer is expected to reside
3230 * in the regexp->data->data array in the slot stored in the ARG() of
3231 * node involved. Note that this routine doesn't actually care about the
3232 * name, that information is not preserved from compilation to execution.
3233 * Returns the index of the leftmost defined buffer with the given name
3234 * or 0 if non of the buffers matched.
3237 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3240 RXi_GET_DECL(rex,rexi);
3241 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3242 I32 *nums=(I32*)SvPVX(sv_dat);
3244 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3246 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3247 if ((I32)rex->lastparen >= nums[n] &&
3248 rex->offs[nums[n]].end != -1)
3257 /* free all slabs above current one - called during LEAVE_SCOPE */
3260 S_clear_backtrack_stack(pTHX_ void *p)
3262 regmatch_slab *s = PL_regmatch_slab->next;
3267 PL_regmatch_slab->next = NULL;
3269 regmatch_slab * const osl = s;
3276 /* returns -1 on failure, $+[0] on success */
3278 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3280 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3284 const bool utf8_target = PL_reg_match_utf8;
3285 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3286 REGEXP *rex_sv = reginfo->prog;
3287 regexp *rex = (struct regexp *)SvANY(rex_sv);
3288 RXi_GET_DECL(rex,rexi);
3290 /* the current state. This is a cached copy of PL_regmatch_state */
3292 /* cache heavy used fields of st in registers */
3295 U32 n = 0; /* general value; init to avoid compiler warning */
3296 I32 ln = 0; /* len or last; init to avoid compiler warning */
3297 char *locinput = startpos;
3298 char *pushinput; /* where to continue after a PUSH */
3299 I32 nextchr; /* is always set to UCHARAT(locinput) */
3301 bool result = 0; /* return value of S_regmatch */
3302 int depth = 0; /* depth of backtrack stack */
3303 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3304 const U32 max_nochange_depth =
3305 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3306 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3307 regmatch_state *yes_state = NULL; /* state to pop to on success of
3309 /* mark_state piggy backs on the yes_state logic so that when we unwind
3310 the stack on success we can update the mark_state as we go */
3311 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3312 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3313 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3315 bool no_final = 0; /* prevent failure from backtracking? */
3316 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3317 char *startpoint = locinput;
3318 SV *popmark = NULL; /* are we looking for a mark? */
3319 SV *sv_commit = NULL; /* last mark name seen in failure */
3320 SV *sv_yes_mark = NULL; /* last mark name we have seen
3321 during a successful match */
3322 U32 lastopen = 0; /* last open we saw */
3323 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3324 SV* const oreplsv = GvSV(PL_replgv);
3325 /* these three flags are set by various ops to signal information to
3326 * the very next op. They have a useful lifetime of exactly one loop
3327 * iteration, and are not preserved or restored by state pushes/pops
3329 bool sw = 0; /* the condition value in (?(cond)a|b) */
3330 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3331 int logical = 0; /* the following EVAL is:
3335 or the following IFMATCH/UNLESSM is:
3336 false: plain (?=foo)
3337 true: used as a condition: (?(?=foo))
3339 PAD* last_pad = NULL;
3341 I32 gimme = G_SCALAR;
3342 CV *caller_cv = NULL; /* who called us */
3343 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3344 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3347 GET_RE_DEBUG_FLAGS_DECL;
3350 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3351 multicall_oldcatch = 0;
3352 multicall_cv = NULL;
3354 PERL_UNUSED_VAR(multicall_cop);
3355 PERL_UNUSED_VAR(newsp);
3358 PERL_ARGS_ASSERT_REGMATCH;
3360 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3361 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3363 /* on first ever call to regmatch, allocate first slab */
3364 if (!PL_regmatch_slab) {
3365 Newx(PL_regmatch_slab, 1, regmatch_slab);
3366 PL_regmatch_slab->prev = NULL;
3367 PL_regmatch_slab->next = NULL;
3368 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3371 oldsave = PL_savestack_ix;
3372 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3373 SAVEVPTR(PL_regmatch_slab);
3374 SAVEVPTR(PL_regmatch_state);
3376 /* grab next free state slot */
3377 st = ++PL_regmatch_state;
3378 if (st > SLAB_LAST(PL_regmatch_slab))
3379 st = PL_regmatch_state = S_push_slab(aTHX);
3381 /* Note that nextchr is a byte even in UTF */
3384 while (scan != NULL) {
3387 SV * const prop = sv_newmortal();
3388 regnode *rnext=regnext(scan);
3389 DUMP_EXEC_POS( locinput, scan, utf8_target );
3390 regprop(rex, prop, scan);
3392 PerlIO_printf(Perl_debug_log,
3393 "%3"IVdf":%*s%s(%"IVdf")\n",
3394 (IV)(scan - rexi->program), depth*2, "",
3396 (PL_regkind[OP(scan)] == END || !rnext) ?
3397 0 : (IV)(rnext - rexi->program));
3400 next = scan + NEXT_OFF(scan);
3403 state_num = OP(scan);
3409 switch (state_num) {
3410 case BOL: /* /^../ */
3411 if (locinput == PL_bostr)
3413 /* reginfo->till = reginfo->bol; */
3418 case MBOL: /* /^../m */
3419 if (locinput == PL_bostr ||
3420 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3426 case SBOL: /* /^../s */
3427 if (locinput == PL_bostr)
3432 if (locinput == reginfo->ganch)
3436 case KEEPS: /* \K */
3437 /* update the startpoint */
3438 st->u.keeper.val = rex->offs[0].start;
3439 rex->offs[0].start = locinput - PL_bostr;
3440 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3442 case KEEPS_next_fail:
3443 /* rollback the start point change */
3444 rex->offs[0].start = st->u.keeper.val;
3448 case EOL: /* /..$/ */
3451 case MEOL: /* /..$/m */
3452 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3456 case SEOL: /* /..$/s */
3458 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3460 if (PL_regeol - locinput > 1)
3465 if (!NEXTCHR_IS_EOS)
3469 case SANY: /* /./s */
3472 goto increment_locinput;
3480 case REG_ANY: /* /./ */
3481 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3483 goto increment_locinput;
3487 #define ST st->u.trie
3488 case TRIEC: /* (ab|cd) with known charclass */
3489 /* In this case the charclass data is available inline so
3490 we can fail fast without a lot of extra overhead.
3492 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3494 PerlIO_printf(Perl_debug_log,
3495 "%*s %sfailed to match trie start class...%s\n",
3496 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3499 assert(0); /* NOTREACHED */
3502 case TRIE: /* (ab|cd) */
3503 /* the basic plan of execution of the trie is:
3504 * At the beginning, run though all the states, and
3505 * find the longest-matching word. Also remember the position
3506 * of the shortest matching word. For example, this pattern:
3509 * when matched against the string "abcde", will generate
3510 * accept states for all words except 3, with the longest
3511 * matching word being 4, and the shortest being 2 (with
3512 * the position being after char 1 of the string).
3514 * Then for each matching word, in word order (i.e. 1,2,4,5),
3515 * we run the remainder of the pattern; on each try setting
3516 * the current position to the character following the word,
3517 * returning to try the next word on failure.
3519 * We avoid having to build a list of words at runtime by
3520 * using a compile-time structure, wordinfo[].prev, which
3521 * gives, for each word, the previous accepting word (if any).
3522 * In the case above it would contain the mappings 1->2, 2->0,
3523 * 3->0, 4->5, 5->1. We can use this table to generate, from
3524 * the longest word (4 above), a list of all words, by
3525 * following the list of prev pointers; this gives us the
3526 * unordered list 4,5,1,2. Then given the current word we have
3527 * just tried, we can go through the list and find the
3528 * next-biggest word to try (so if we just failed on word 2,
3529 * the next in the list is 4).
3531 * Since at runtime we don't record the matching position in
3532 * the string for each word, we have to work that out for
3533 * each word we're about to process. The wordinfo table holds
3534 * the character length of each word; given that we recorded
3535 * at the start: the position of the shortest word and its
3536 * length in chars, we just need to move the pointer the
3537 * difference between the two char lengths. Depending on
3538 * Unicode status and folding, that's cheap or expensive.
3540 * This algorithm is optimised for the case where are only a
3541 * small number of accept states, i.e. 0,1, or maybe 2.
3542 * With lots of accepts states, and having to try all of them,
3543 * it becomes quadratic on number of accept states to find all
3548 /* what type of TRIE am I? (utf8 makes this contextual) */
3549 DECL_TRIE_TYPE(scan);
3551 /* what trie are we using right now */
3552 reg_trie_data * const trie
3553 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3554 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3555 U32 state = trie->startstate;
3558 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3560 if (trie->states[ state ].wordnum) {
3562 PerlIO_printf(Perl_debug_log,
3563 "%*s %smatched empty string...%s\n",
3564 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3570 PerlIO_printf(Perl_debug_log,
3571 "%*s %sfailed to match trie start class...%s\n",
3572 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3579 U8 *uc = ( U8* )locinput;
3583 U8 *uscan = (U8*)NULL;
3584 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3585 U32 charcount = 0; /* how many input chars we have matched */
3586 U32 accepted = 0; /* have we seen any accepting states? */
3588 ST.jump = trie->jump;
3591 ST.longfold = FALSE; /* char longer if folded => it's harder */
3594 /* fully traverse the TRIE; note the position of the
3595 shortest accept state and the wordnum of the longest
3598 while ( state && uc <= (U8*)PL_regeol ) {
3599 U32 base = trie->states[ state ].trans.base;
3603 wordnum = trie->states[ state ].wordnum;
3605 if (wordnum) { /* it's an accept state */
3608 /* record first match position */
3610 ST.firstpos = (U8*)locinput;
3615 ST.firstchars = charcount;
3618 if (!ST.nextword || wordnum < ST.nextword)
3619 ST.nextword = wordnum;
3620 ST.topword = wordnum;
3623 DEBUG_TRIE_EXECUTE_r({
3624 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3625 PerlIO_printf( Perl_debug_log,
3626 "%*s %sState: %4"UVxf" Accepted: %c ",
3627 2+depth * 2, "", PL_colors[4],
3628 (UV)state, (accepted ? 'Y' : 'N'));
3631 /* read a char and goto next state */
3632 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3634 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3635 uscan, len, uvc, charid, foldlen,
3642 base + charid - 1 - trie->uniquecharcount)) >= 0)
3644 && ((U32)offset < trie->lasttrans)
3645 && trie->trans[offset].check == state)
3647 state = trie->trans[offset].next;
3658 DEBUG_TRIE_EXECUTE_r(
3659 PerlIO_printf( Perl_debug_log,
3660 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3661 charid, uvc, (UV)state, PL_colors[5] );
3667 /* calculate total number of accept states */
3672 w = trie->wordinfo[w].prev;
3675 ST.accepted = accepted;
3679 PerlIO_printf( Perl_debug_log,
3680 "%*s %sgot %"IVdf" possible matches%s\n",
3681 REPORT_CODE_OFF + depth * 2, "",
3682 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3684 goto trie_first_try; /* jump into the fail handler */
3686 assert(0); /* NOTREACHED */
3688 case TRIE_next_fail: /* we failed - try next alternative */
3692 REGCP_UNWIND(ST.cp);
3693 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3695 if (!--ST.accepted) {
3697 PerlIO_printf( Perl_debug_log,
3698 "%*s %sTRIE failed...%s\n",
3699 REPORT_CODE_OFF+depth*2, "",
3706 /* Find next-highest word to process. Note that this code
3707 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3710 U16 const nextword = ST.nextword;
3711 reg_trie_wordinfo * const wordinfo
3712 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3713 for (word=ST.topword; word; word=wordinfo[word].prev) {
3714 if (word > nextword && (!min || word < min))
3727 ST.lastparen = rex->lastparen;
3728 ST.lastcloseparen = rex->lastcloseparen;
3732 /* find start char of end of current word */
3734 U32 chars; /* how many chars to skip */
3735 reg_trie_data * const trie
3736 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3738 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3740 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3745 /* the hard option - fold each char in turn and find
3746 * its folded length (which may be different */
3747 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3755 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3763 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3768 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3784 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3785 ? ST.jump[ST.nextword]
3789 PerlIO_printf( Perl_debug_log,
3790 "%*s %sTRIE matched word #%d, continuing%s\n",
3791 REPORT_CODE_OFF+depth*2, "",
3798 if (ST.accepted > 1 || has_cutgroup) {
3799 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3800 assert(0); /* NOTREACHED */
3802 /* only one choice left - just continue */
3804 AV *const trie_words
3805 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3806 SV ** const tmp = av_fetch( trie_words,
3808 SV *sv= tmp ? sv_newmortal() : NULL;
3810 PerlIO_printf( Perl_debug_log,
3811 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3812 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3814 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3815 PL_colors[0], PL_colors[1],
3816 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3818 : "not compiled under -Dr",
3822 locinput = (char*)uc;
3823 continue; /* execute rest of RE */
3824 assert(0); /* NOTREACHED */
3828 case EXACT: { /* /abc/ */
3829 char *s = STRING(scan);
3831 if (utf8_target != UTF_PATTERN) {
3832 /* The target and the pattern have differing utf8ness. */
3834 const char * const e = s + ln;
3837 /* The target is utf8, the pattern is not utf8.
3838 * Above-Latin1 code points can't match the pattern;
3839 * invariants match exactly, and the other Latin1 ones need
3840 * to be downgraded to a single byte in order to do the
3841 * comparison. (If we could be confident that the target
3842 * is not malformed, this could be refactored to have fewer
3843 * tests by just assuming that if the first bytes match, it
3844 * is an invariant, but there are tests in the test suite
3845 * dealing with (??{...}) which violate this) */
3849 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
3852 if (UTF8_IS_INVARIANT(*(U8*)l)) {