5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #define RF_tainted 1 /* tainted information used? */
84 #define RF_warned 2 /* warned about big count? */
86 #define RF_utf8 8 /* Pattern contains multibyte chars? */
88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
90 #define RS_init 1 /* eval environment created */
91 #define RS_set 2 /* replsv value is set */
97 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
98 * complications: i.e., if everything matchable is straight forward in the
100 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
110 #define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
114 #define HOPBACKc(pos, off) \
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 /* these are unrolled below in the CCC_TRY_XXX defined */
125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
128 /* Doesn't do an assert to verify that is correct */
129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
130 if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
136 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
137 LOAD_UTF8_CHARCLASS(X_begin, " "); \
138 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
139 /* These are utf8 constants, and not utf-ebcdic constants, so the \
140 * assert should likely and hopefully fail on an EBCDIC machine */ \
141 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
143 /* No asserts are done for these, in case called on an early \
144 * Unicode version in which they map to nothing */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
149 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
151 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
154 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
155 so that it is possible to override the option here without having to
156 rebuild the entire core. as we are required to do if we change regcomp.h
157 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
164 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
167 #define RE_utf8_perl_word PL_utf8_alnum
168 #define RE_utf8_perl_space PL_utf8_space
169 #define RE_utf8_posix_digit PL_utf8_digit
170 #define perl_word alnum
171 #define perl_space space
172 #define posix_digit digit
174 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
177 #define RE_utf8_perl_word PL_utf8_perl_word
178 #define RE_utf8_perl_space PL_utf8_perl_space
179 #define RE_utf8_posix_digit PL_utf8_posix_digit
182 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
184 /* The actual code for CCC_TRY, which uses several variables from the routine
185 * it's callable from. It is designed to be the bulk of a case statement.
186 * FUNC is the macro or function to call on non-utf8 targets that indicate if
187 * nextchr matches the class.
188 * UTF8_TEST is the whole test string to use for utf8 targets
189 * LOAD is what to use to test, and if not present to load in the swash for the
191 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
193 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
194 * utf8 and a variant, load the swash if necessary and test using the utf8
195 * test. Advance to the next character if test is ok, otherwise fail; If not
196 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
197 * fails, or advance to the next character */
199 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
200 if (locinput >= PL_regeol) { \
203 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
204 LOAD_UTF8_CHARCLASS(CLASS, STR); \
205 if (POS_OR_NEG (UTF8_TEST)) { \
208 locinput += PL_utf8skip[nextchr]; \
209 nextchr = UCHARAT(locinput); \
212 if (POS_OR_NEG (FUNC(nextchr))) { \
215 nextchr = UCHARAT(++locinput); \
218 /* Handle the non-locale cases for a character class and its complement. It
219 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
220 * This is because that code fails when the test succeeds, so we want to have
221 * the test fail so that the code succeeds. The swash is stored in a
222 * predictable PL_ place */
223 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
226 _CCC_TRY_CODE( !, FUNC, \
227 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
228 (U8*)locinput, TRUE)), \
231 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
232 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
233 (U8*)locinput, TRUE)), \
236 /* Generate the case statements for both locale and non-locale character
237 * classes in regmatch for classes that don't have special unicode semantics.
238 * Locales don't use an immediate swash, but an intermediary special locale
239 * function that is called on the pointer to the current place in the input
240 * string. That function will resolve to needing the same swash. One might
241 * think that because we don't know what the locale will match, we shouldn't
242 * check with the swash loading function that it loaded properly; ie, that we
243 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
244 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
246 #define CCC_TRY(NAME, NNAME, FUNC, \
247 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
248 NAMEA, NNAMEA, FUNCA, \
251 PL_reg_flags |= RF_tainted; \
252 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
254 PL_reg_flags |= RF_tainted; \
255 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
258 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
261 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
262 nextchr = UCHARAT(++locinput); \
265 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
269 locinput += PL_utf8skip[nextchr]; \
270 nextchr = UCHARAT(locinput); \
273 nextchr = UCHARAT(++locinput); \
276 /* Generate the non-locale cases */ \
277 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
279 /* This is like CCC_TRY, but has an extra set of parameters for generating case
280 * statements to handle separate Unicode semantics nodes */
281 #define CCC_TRY_U(NAME, NNAME, FUNC, \
282 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
283 NAMEU, NNAMEU, FUNCU, \
284 NAMEA, NNAMEA, FUNCA, \
286 CCC_TRY(NAME, NNAME, FUNC, \
287 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
288 NAMEA, NNAMEA, FUNCA, \
290 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
292 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
294 /* for use after a quantifier and before an EXACT-like node -- japhy */
295 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
297 * NOTE that *nothing* that affects backtracking should be in here, specifically
298 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
299 * node that is in between two EXACT like nodes when ascertaining what the required
300 * "follow" character is. This should probably be moved to regex compile time
301 * although it may be done at run time beause of the REF possibility - more
302 * investigation required. -- demerphq
304 #define JUMPABLE(rn) ( \
306 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
308 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
309 OP(rn) == PLUS || OP(rn) == MINMOD || \
311 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
313 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
315 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
318 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
319 we don't need this definition. */
320 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
321 #define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF )
322 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
325 /* ... so we use this as its faster. */
326 #define IS_TEXT(rn) ( OP(rn)==EXACT )
327 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU )
328 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
329 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
334 Search for mandatory following text node; for lookahead, the text must
335 follow but for lookbehind (rn->flags != 0) we skip to the next step.
337 #define FIND_NEXT_IMPT(rn) STMT_START { \
338 while (JUMPABLE(rn)) { \
339 const OPCODE type = OP(rn); \
340 if (type == SUSPEND || PL_regkind[type] == CURLY) \
341 rn = NEXTOPER(NEXTOPER(rn)); \
342 else if (type == PLUS) \
344 else if (type == IFMATCH) \
345 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
346 else rn += NEXT_OFF(rn); \
351 static void restore_pos(pTHX_ void *arg);
353 #define REGCP_PAREN_ELEMS 4
354 #define REGCP_OTHER_ELEMS 5
355 #define REGCP_FRAME_ELEMS 1
356 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
357 * are needed for the regexp context stack bookkeeping. */
360 S_regcppush(pTHX_ I32 parenfloor)
363 const int retval = PL_savestack_ix;
364 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
365 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
366 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
368 GET_RE_DEBUG_FLAGS_DECL;
370 if (paren_elems_to_push < 0)
371 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
373 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
374 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
375 " out of range (%lu-%ld)",
376 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
378 SSGROW(total_elems + REGCP_FRAME_ELEMS);
380 for (p = PL_regsize; p > parenfloor; p--) {
381 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
382 SSPUSHINT(PL_regoffs[p].end);
383 SSPUSHINT(PL_regoffs[p].start);
384 SSPUSHPTR(PL_reg_start_tmp[p]);
386 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
387 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
388 (UV)p, (IV)PL_regoffs[p].start,
389 (IV)(PL_reg_start_tmp[p] - PL_bostr),
390 (IV)PL_regoffs[p].end
393 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
394 SSPUSHPTR(PL_regoffs);
395 SSPUSHINT(PL_regsize);
396 SSPUSHINT(*PL_reglastparen);
397 SSPUSHINT(*PL_reglastcloseparen);
398 SSPUSHPTR(PL_reginput);
399 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
404 /* These are needed since we do not localize EVAL nodes: */
405 #define REGCP_SET(cp) \
407 PerlIO_printf(Perl_debug_log, \
408 " Setting an EVAL scope, savestack=%"IVdf"\n", \
409 (IV)PL_savestack_ix)); \
412 #define REGCP_UNWIND(cp) \
414 if (cp != PL_savestack_ix) \
415 PerlIO_printf(Perl_debug_log, \
416 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
417 (IV)(cp), (IV)PL_savestack_ix)); \
421 S_regcppop(pTHX_ const regexp *rex)
426 GET_RE_DEBUG_FLAGS_DECL;
428 PERL_ARGS_ASSERT_REGCPPOP;
430 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
432 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
433 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
434 input = (char *) SSPOPPTR;
435 *PL_reglastcloseparen = SSPOPINT;
436 *PL_reglastparen = SSPOPINT;
437 PL_regsize = SSPOPINT;
438 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
440 i -= REGCP_OTHER_ELEMS;
441 /* Now restore the parentheses context. */
442 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
444 U32 paren = (U32)SSPOPINT;
445 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
446 PL_regoffs[paren].start = SSPOPINT;
448 if (paren <= *PL_reglastparen)
449 PL_regoffs[paren].end = tmps;
451 PerlIO_printf(Perl_debug_log,
452 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
453 (UV)paren, (IV)PL_regoffs[paren].start,
454 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
455 (IV)PL_regoffs[paren].end,
456 (paren > *PL_reglastparen ? "(no)" : ""));
460 if (*PL_reglastparen + 1 <= rex->nparens) {
461 PerlIO_printf(Perl_debug_log,
462 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
463 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
467 /* It would seem that the similar code in regtry()
468 * already takes care of this, and in fact it is in
469 * a better location to since this code can #if 0-ed out
470 * but the code in regtry() is needed or otherwise tests
471 * requiring null fields (pat.t#187 and split.t#{13,14}
472 * (as of patchlevel 7877) will fail. Then again,
473 * this code seems to be necessary or otherwise
474 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
475 * --jhi updated by dapm */
476 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
478 PL_regoffs[i].start = -1;
479 PL_regoffs[i].end = -1;
485 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
488 * pregexec and friends
491 #ifndef PERL_IN_XSUB_RE
493 - pregexec - match a regexp against a string
496 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
497 char *strbeg, I32 minend, SV *screamer, U32 nosave)
498 /* strend: pointer to null at end of string */
499 /* strbeg: real beginning of string */
500 /* minend: end of match must be >=minend after stringarg. */
501 /* nosave: For optimizations. */
503 PERL_ARGS_ASSERT_PREGEXEC;
506 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
507 nosave ? 0 : REXEC_COPY_STR);
512 * Need to implement the following flags for reg_anch:
514 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
516 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
517 * INTUIT_AUTORITATIVE_ML
518 * INTUIT_ONCE_NOML - Intuit can match in one location only.
521 * Another flag for this function: SECOND_TIME (so that float substrs
522 * with giant delta may be not rechecked).
525 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
527 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
528 Otherwise, only SvCUR(sv) is used to get strbeg. */
530 /* XXXX We assume that strpos is strbeg unless sv. */
532 /* XXXX Some places assume that there is a fixed substring.
533 An update may be needed if optimizer marks as "INTUITable"
534 RExen without fixed substrings. Similarly, it is assumed that
535 lengths of all the strings are no more than minlen, thus they
536 cannot come from lookahead.
537 (Or minlen should take into account lookahead.)
538 NOTE: Some of this comment is not correct. minlen does now take account
539 of lookahead/behind. Further research is required. -- demerphq
543 /* A failure to find a constant substring means that there is no need to make
544 an expensive call to REx engine, thus we celebrate a failure. Similarly,
545 finding a substring too deep into the string means that less calls to
546 regtry() should be needed.
548 REx compiler's optimizer found 4 possible hints:
549 a) Anchored substring;
551 c) Whether we are anchored (beginning-of-line or \G);
552 d) First node (of those at offset 0) which may distinguish positions;
553 We use a)b)d) and multiline-part of c), and try to find a position in the
554 string which does not contradict any of them.
557 /* Most of decisions we do here should have been done at compile time.
558 The nodes of the REx which we used for the search should have been
559 deleted from the finite automaton. */
562 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
563 char *strend, const U32 flags, re_scream_pos_data *data)
566 struct regexp *const prog = (struct regexp *)SvANY(rx);
567 register I32 start_shift = 0;
568 /* Should be nonnegative! */
569 register I32 end_shift = 0;
574 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
576 register char *other_last = NULL; /* other substr checked before this */
577 char *check_at = NULL; /* check substr found at this pos */
578 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
579 RXi_GET_DECL(prog,progi);
581 const char * const i_strpos = strpos;
583 GET_RE_DEBUG_FLAGS_DECL;
585 PERL_ARGS_ASSERT_RE_INTUIT_START;
587 RX_MATCH_UTF8_set(rx,utf8_target);
590 PL_reg_flags |= RF_utf8;
593 debug_start_match(rx, utf8_target, strpos, strend,
594 sv ? "Guessing start of match in sv for"
595 : "Guessing start of match in string for");
598 /* CHR_DIST() would be more correct here but it makes things slow. */
599 if (prog->minlen > strend - strpos) {
600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
601 "String too short... [re_intuit_start]\n"));
605 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
608 if (!prog->check_utf8 && prog->check_substr)
609 to_utf8_substr(prog);
610 check = prog->check_utf8;
612 if (!prog->check_substr && prog->check_utf8)
613 to_byte_substr(prog);
614 check = prog->check_substr;
616 if (check == &PL_sv_undef) {
617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
618 "Non-utf8 string cannot match utf8 check string\n"));
621 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
622 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
623 || ( (prog->extflags & RXf_ANCH_BOL)
624 && !multiline ) ); /* Check after \n? */
627 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
628 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
629 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
631 && (strpos != strbeg)) {
632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
635 if (prog->check_offset_min == prog->check_offset_max &&
636 !(prog->extflags & RXf_CANY_SEEN)) {
637 /* Substring at constant offset from beg-of-str... */
640 s = HOP3c(strpos, prog->check_offset_min, strend);
643 slen = SvCUR(check); /* >= 1 */
645 if ( strend - s > slen || strend - s < slen - 1
646 || (strend - s == slen && strend[-1] != '\n')) {
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
650 /* Now should match s[0..slen-2] */
652 if (slen && (*SvPVX_const(check) != *s
654 && memNE(SvPVX_const(check), s, slen)))) {
656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
660 else if (*SvPVX_const(check) != *s
661 || ((slen = SvCUR(check)) > 1
662 && memNE(SvPVX_const(check), s, slen)))
665 goto success_at_start;
668 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
670 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
671 end_shift = prog->check_end_shift;
674 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
675 - (SvTAIL(check) != 0);
676 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
678 if (end_shift < eshift)
682 else { /* Can match at random position */
685 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
686 end_shift = prog->check_end_shift;
688 /* end shift should be non negative here */
691 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
693 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
694 (IV)end_shift, RX_PRECOMP(prog));
698 /* Find a possible match in the region s..strend by looking for
699 the "check" substring in the region corrected by start/end_shift. */
702 I32 srch_start_shift = start_shift;
703 I32 srch_end_shift = end_shift;
704 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
705 srch_end_shift -= ((strbeg - s) - srch_start_shift);
706 srch_start_shift = strbeg - s;
708 DEBUG_OPTIMISE_MORE_r({
709 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
710 (IV)prog->check_offset_min,
711 (IV)srch_start_shift,
713 (IV)prog->check_end_shift);
716 if (flags & REXEC_SCREAM) {
717 I32 p = -1; /* Internal iterator of scream. */
718 I32 * const pp = data ? data->scream_pos : &p;
720 if (PL_screamfirst[BmRARE(check)] >= 0
721 || ( BmRARE(check) == '\n'
722 && (BmPREVIOUS(check) == SvCUR(check) - 1)
724 s = screaminstr(sv, check,
725 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
728 /* we may be pointing at the wrong string */
729 if (s && RXp_MATCH_COPIED(prog))
730 s = strbeg + (s - SvPVX_const(sv));
732 *data->scream_olds = s;
737 if (prog->extflags & RXf_CANY_SEEN) {
738 start_point= (U8*)(s + srch_start_shift);
739 end_point= (U8*)(strend - srch_end_shift);
741 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
742 end_point= HOP3(strend, -srch_end_shift, strbeg);
744 DEBUG_OPTIMISE_MORE_r({
745 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
746 (int)(end_point - start_point),
747 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
751 s = fbm_instr( start_point, end_point,
752 check, multiline ? FBMrf_MULTILINE : 0);
755 /* Update the count-of-usability, remove useless subpatterns,
759 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
760 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
761 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
762 (s ? "Found" : "Did not find"),
763 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
764 ? "anchored" : "floating"),
767 (s ? " at offset " : "...\n") );
772 /* Finish the diagnostic message */
773 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
775 /* XXX dmq: first branch is for positive lookbehind...
776 Our check string is offset from the beginning of the pattern.
777 So we need to do any stclass tests offset forward from that
786 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
787 Start with the other substr.
788 XXXX no SCREAM optimization yet - and a very coarse implementation
789 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
790 *always* match. Probably should be marked during compile...
791 Probably it is right to do no SCREAM here...
794 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
795 : (prog->float_substr && prog->anchored_substr))
797 /* Take into account the "other" substring. */
798 /* XXXX May be hopelessly wrong for UTF... */
801 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
804 char * const last = HOP3c(s, -start_shift, strbeg);
806 char * const saved_s = s;
809 t = s - prog->check_offset_max;
810 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
812 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
817 t = HOP3c(t, prog->anchored_offset, strend);
818 if (t < other_last) /* These positions already checked */
820 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
823 /* XXXX It is not documented what units *_offsets are in.
824 We assume bytes, but this is clearly wrong.
825 Meaning this code needs to be carefully reviewed for errors.
829 /* On end-of-str: see comment below. */
830 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
831 if (must == &PL_sv_undef) {
833 DEBUG_r(must = prog->anchored_utf8); /* for debug */
838 HOP3(HOP3(last1, prog->anchored_offset, strend)
839 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
841 multiline ? FBMrf_MULTILINE : 0
844 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
845 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
846 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
847 (s ? "Found" : "Contradicts"),
848 quoted, RE_SV_TAIL(must));
853 if (last1 >= last2) {
854 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
855 ", giving up...\n"));
858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
859 ", trying floating at offset %ld...\n",
860 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
861 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
862 s = HOP3c(last, 1, strend);
866 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
867 (long)(s - i_strpos)));
868 t = HOP3c(s, -prog->anchored_offset, strbeg);
869 other_last = HOP3c(s, 1, strend);
877 else { /* Take into account the floating substring. */
879 char * const saved_s = s;
882 t = HOP3c(s, -start_shift, strbeg);
884 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
885 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
886 last = HOP3c(t, prog->float_max_offset, strend);
887 s = HOP3c(t, prog->float_min_offset, strend);
890 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
891 must = utf8_target ? prog->float_utf8 : prog->float_substr;
892 /* fbm_instr() takes into account exact value of end-of-str
893 if the check is SvTAIL(ed). Since false positives are OK,
894 and end-of-str is not later than strend we are OK. */
895 if (must == &PL_sv_undef) {
897 DEBUG_r(must = prog->float_utf8); /* for debug message */
900 s = fbm_instr((unsigned char*)s,
901 (unsigned char*)last + SvCUR(must)
903 must, multiline ? FBMrf_MULTILINE : 0);
905 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
906 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
907 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
908 (s ? "Found" : "Contradicts"),
909 quoted, RE_SV_TAIL(must));
913 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
914 ", giving up...\n"));
917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
918 ", trying anchored starting at offset %ld...\n",
919 (long)(saved_s + 1 - i_strpos)));
921 s = HOP3c(t, 1, strend);
925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
926 (long)(s - i_strpos)));
927 other_last = s; /* Fix this later. --Hugo */
937 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
939 DEBUG_OPTIMISE_MORE_r(
940 PerlIO_printf(Perl_debug_log,
941 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
942 (IV)prog->check_offset_min,
943 (IV)prog->check_offset_max,
951 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
953 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
956 /* Fixed substring is found far enough so that the match
957 cannot start at strpos. */
959 if (ml_anch && t[-1] != '\n') {
960 /* Eventually fbm_*() should handle this, but often
961 anchored_offset is not 0, so this check will not be wasted. */
962 /* XXXX In the code below we prefer to look for "^" even in
963 presence of anchored substrings. And we search even
964 beyond the found float position. These pessimizations
965 are historical artefacts only. */
967 while (t < strend - prog->minlen) {
969 if (t < check_at - prog->check_offset_min) {
970 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
971 /* Since we moved from the found position,
972 we definitely contradict the found anchored
973 substr. Due to the above check we do not
974 contradict "check" substr.
975 Thus we can arrive here only if check substr
976 is float. Redo checking for "other"=="fixed".
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
980 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
981 goto do_other_anchored;
983 /* We don't contradict the found floating substring. */
984 /* XXXX Why not check for STCLASS? */
986 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
987 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
990 /* Position contradicts check-string */
991 /* XXXX probably better to look for check-string
992 than for "\n", so one should lower the limit for t? */
993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
994 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
995 other_last = strpos = s = t + 1;
1000 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1001 PL_colors[0], PL_colors[1]));
1005 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1006 PL_colors[0], PL_colors[1]));
1010 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1013 /* The found string does not prohibit matching at strpos,
1014 - no optimization of calling REx engine can be performed,
1015 unless it was an MBOL and we are not after MBOL,
1016 or a future STCLASS check will fail this. */
1018 /* Even in this situation we may use MBOL flag if strpos is offset
1019 wrt the start of the string. */
1020 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1021 && (strpos != strbeg) && strpos[-1] != '\n'
1022 /* May be due to an implicit anchor of m{.*foo} */
1023 && !(prog->intflags & PREGf_IMPLICIT))
1028 DEBUG_EXECUTE_r( if (ml_anch)
1029 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1030 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1033 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1035 prog->check_utf8 /* Could be deleted already */
1036 && --BmUSEFUL(prog->check_utf8) < 0
1037 && (prog->check_utf8 == prog->float_utf8)
1039 prog->check_substr /* Could be deleted already */
1040 && --BmUSEFUL(prog->check_substr) < 0
1041 && (prog->check_substr == prog->float_substr)
1044 /* If flags & SOMETHING - do not do it many times on the same match */
1045 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1046 /* XXX Does the destruction order has to change with utf8_target? */
1047 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1048 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1049 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1050 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1051 check = NULL; /* abort */
1053 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1054 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1055 if (prog->intflags & PREGf_IMPLICIT)
1056 prog->extflags &= ~RXf_ANCH_MBOL;
1057 /* XXXX This is a remnant of the old implementation. It
1058 looks wasteful, since now INTUIT can use many
1059 other heuristics. */
1060 prog->extflags &= ~RXf_USE_INTUIT;
1061 /* XXXX What other flags might need to be cleared in this branch? */
1067 /* Last resort... */
1068 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1069 /* trie stclasses are too expensive to use here, we are better off to
1070 leave it to regmatch itself */
1071 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1072 /* minlen == 0 is possible if regstclass is \b or \B,
1073 and the fixed substr is ''$.
1074 Since minlen is already taken into account, s+1 is before strend;
1075 accidentally, minlen >= 1 guaranties no false positives at s + 1
1076 even for \b or \B. But (minlen? 1 : 0) below assumes that
1077 regstclass does not come from lookahead... */
1078 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1079 This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass(). */
1080 const U8* const str = (U8*)STRING(progi->regstclass);
1081 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1082 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1085 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1086 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1087 else if (prog->float_substr || prog->float_utf8)
1088 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1093 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1096 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1099 const char *what = NULL;
1101 if (endpos == strend) {
1102 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103 "Could not match STCLASS...\n") );
1106 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1107 "This position contradicts STCLASS...\n") );
1108 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1110 /* Contradict one of substrings */
1111 if (prog->anchored_substr || prog->anchored_utf8) {
1112 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1113 DEBUG_EXECUTE_r( what = "anchored" );
1115 s = HOP3c(t, 1, strend);
1116 if (s + start_shift + end_shift > strend) {
1117 /* XXXX Should be taken into account earlier? */
1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119 "Could not match STCLASS...\n") );
1124 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1125 "Looking for %s substr starting at offset %ld...\n",
1126 what, (long)(s + start_shift - i_strpos)) );
1129 /* Have both, check_string is floating */
1130 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131 goto retry_floating_check;
1132 /* Recheck anchored substring, but not floating... */
1136 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1137 "Looking for anchored substr starting at offset %ld...\n",
1138 (long)(other_last - i_strpos)) );
1139 goto do_other_anchored;
1141 /* Another way we could have checked stclass at the
1142 current position only: */
1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148 "Looking for /%s^%s/m starting at offset %ld...\n",
1149 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1152 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1154 /* Check is floating substring. */
1155 retry_floating_check:
1156 t = check_at - start_shift;
1157 DEBUG_EXECUTE_r( what = "floating" );
1158 goto hop_and_restart;
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162 "By STCLASS: moving %ld --> %ld\n",
1163 (long)(t - i_strpos), (long)(s - i_strpos))
1167 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1168 "Does not contradict STCLASS...\n");
1173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1174 PL_colors[4], (check ? "Guessed" : "Giving up"),
1175 PL_colors[5], (long)(s - i_strpos)) );
1178 fail_finish: /* Substring not found */
1179 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1180 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1183 PL_colors[4], PL_colors[5]));
1187 #define DECL_TRIE_TYPE(scan) \
1188 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1189 trie_type = (scan->flags != EXACT) \
1190 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1191 : (utf8_target ? trie_utf8 : trie_plain)
1193 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1194 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1195 switch (trie_type) { \
1196 case trie_utf8_fold: \
1197 if ( foldlen>0 ) { \
1198 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1203 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1204 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1205 foldlen -= UNISKIP( uvc ); \
1206 uscan = foldbuf + UNISKIP( uvc ); \
1209 case trie_latin_utf8_fold: \
1210 if ( foldlen>0 ) { \
1211 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1217 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1218 foldlen -= UNISKIP( uvc ); \
1219 uscan = foldbuf + UNISKIP( uvc ); \
1223 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1230 charid = trie->charmap[ uvc ]; \
1234 if (widecharmap) { \
1235 SV** const svpp = hv_fetch(widecharmap, \
1236 (char*)&uvc, sizeof(UV), 0); \
1238 charid = (U16)SvIV(*svpp); \
1243 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1245 char *my_strend= (char *)strend; \
1248 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1249 m, NULL, ln, cBOOL(UTF_PATTERN))) \
1250 && (!reginfo || regtry(reginfo, &s)) ) \
1253 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1254 uvchr_to_utf8(tmpbuf, c); \
1255 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1257 && (f == c1 || f == c2) \
1259 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1260 m, NULL, ln, cBOOL(UTF_PATTERN)))\
1261 && (!reginfo || regtry(reginfo, &s)) ) \
1267 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1271 case EXACTFU: folder = foldEQ_latin1; break; \
1272 case EXACTFL: folder = foldEQ_locale; break; \
1273 case EXACTF: folder = foldEQ; break; \
1275 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1279 && (ln == 1 || folder(s, m, ln)) \
1280 && (!reginfo || regtry(reginfo, &s)) ) \
1286 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1288 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1294 #define REXEC_FBC_SCAN(CoDe) \
1296 while (s < strend) { \
1302 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1303 REXEC_FBC_UTF8_SCAN( \
1305 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1314 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1317 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1326 #define REXEC_FBC_TRYIT \
1327 if ((!reginfo || regtry(reginfo, &s))) \
1330 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1331 if (utf8_target) { \
1332 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1335 REXEC_FBC_CLASS_SCAN(CoNd); \
1339 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1340 if (utf8_target) { \
1342 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1345 REXEC_FBC_CLASS_SCAN(CoNd); \
1349 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1350 PL_reg_flags |= RF_tainted; \
1351 if (utf8_target) { \
1352 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1355 REXEC_FBC_CLASS_SCAN(CoNd); \
1359 #define DUMP_EXEC_POS(li,s,doutf8) \
1360 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1363 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1364 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1365 tmp = TEST_NON_UTF8(tmp); \
1366 REXEC_FBC_UTF8_SCAN( \
1367 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1376 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1377 if (s == PL_bostr) { \
1381 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1382 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1385 LOAD_UTF8_CHARCLASS_ALNUM(); \
1386 REXEC_FBC_UTF8_SCAN( \
1387 if (tmp == ! (TeSt2_UtF8)) { \
1396 /* The only difference between the BOUND and NBOUND cases is that
1397 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1398 * NBOUND. This is accomplished by passing it in either the if or else clause,
1399 * with the other one being empty */
1400 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1403 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1406 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1409 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1410 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1413 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1414 * be passed in completely with the variable name being tested, which isn't
1415 * such a clean interface, but this is easier to read than it was before. We
1416 * are looking for the boundary (or non-boundary between a word and non-word
1417 * character. The utf8 and non-utf8 cases have the same logic, but the details
1418 * must be different. Find the "wordness" of the character just prior to this
1419 * one, and compare it with the wordness of this one. If they differ, we have
1420 * a boundary. At the beginning of the string, pretend that the previous
1421 * character was a new-line */
1422 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1423 if (utf8_target) { \
1426 else { /* Not utf8 */ \
1427 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1428 tmp = TEST_NON_UTF8(tmp); \
1430 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1439 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1442 /* We know what class REx starts with. Try to find this position... */
1443 /* if reginfo is NULL, its a dryrun */
1444 /* annoyingly all the vars in this routine have different names from their counterparts
1445 in regmatch. /grrr */
1448 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1449 const char *strend, regmatch_info *reginfo)
1452 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1456 register STRLEN uskip;
1460 register I32 tmp = 1; /* Scratch variable? */
1461 register const bool utf8_target = PL_reg_match_utf8;
1462 RXi_GET_DECL(prog,progi);
1464 PERL_ARGS_ASSERT_FIND_BYCLASS;
1466 /* We know what class it must start with. */
1470 if (utf8_target || OP(c) == ANYOFV) {
1471 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1472 !UTF8_IS_INVARIANT((U8)s[0]) ?
1473 reginclass(prog, c, (U8*)s, 0, utf8_target) :
1474 REGINCLASS(prog, c, (U8*)s));
1477 while (s < strend) {
1480 if (REGINCLASS(prog, c, (U8*)s) ||
1481 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1482 /* The assignment of 2 is intentional:
1483 * for the folded sharp s, the skip is 2. */
1484 (skip = SHARP_S_SKIP))) {
1485 if (tmp && (!reginfo || regtry(reginfo, &s)))
1498 if (tmp && (!reginfo || regtry(reginfo, &s)))
1507 ln = STR_LEN(c); /* length to match in octets/bytes */
1508 lnc = (I32) ln; /* length to match in characters */
1510 STRLEN ulen1, ulen2;
1512 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1513 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1514 /* used by commented-out code below */
1515 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1517 /* XXX: Since the node will be case folded at compile
1518 time this logic is a little odd, although im not
1519 sure that its actually wrong. --dmq */
1521 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1522 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1524 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1525 codepoint of the first character in the converted
1526 form, yet originally we did the extra step.
1527 No tests fail by commenting this code out however
1528 so Ive left it out. -- dmq.
1530 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1532 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1537 while (sm < ((U8 *) m + ln)) {
1544 if (utf8_target || OP(c) == EXACTFU) {
1546 /* Micro sign folds to GREEK SMALL LETTER MU;
1547 LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1548 c2 to the first 's' of the pair, and the code below will
1550 c2 = (c1 == MICRO_SIGN)
1551 ? GREEK_SMALL_LETTER_MU
1552 : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1554 : PL_fold_latin1[c1];
1555 } else c2 = PL_fold[c1];
1563 c2 = PL_fold_locale[c1];
1565 e = HOP3c(strend, -((I32)lnc), s);
1567 if (!reginfo && e < s)
1568 e = s; /* Due to minlen logic of intuit() */
1570 /* The idea in the EXACTF* cases is to first find the
1571 * first character of the EXACTF* node and then, if
1572 * necessary, case-insensitively compare the full
1573 * text of the node. The c1 and c2 are the first
1574 * characters (though in Unicode it gets a bit
1575 * more complicated because there are more cases
1576 * than just upper and lower: one needs to use
1577 * the so-called folding case for case-insensitive
1578 * matching (called "loose matching" in Unicode).
1579 * foldEQ_utf8() will do just that. */
1581 if (utf8_target || UTF_PATTERN) {
1583 U8 tmpbuf [UTF8_MAXBYTES+1];
1586 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1588 /* Upper and lower of 1st char are equal -
1589 * probably not a "letter". */
1592 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1597 REXEC_FBC_EXACTISH_CHECK(c == c1);
1603 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1609 /* Handle some of the three Greek sigmas cases.
1610 * Note that not all the possible combinations
1611 * are handled here: some of them are handled
1612 * by the standard folding rules, and some of
1613 * them (the character class or ANYOF cases)
1614 * are handled during compiletime in
1615 * regexec.c:S_regclass(). */
1616 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1617 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1618 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1620 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1625 /* Neither pattern nor string are UTF8 */
1627 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1629 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1633 PL_reg_flags |= RF_tainted;
1634 FBC_BOUND(isALNUM_LC,
1635 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1636 isALNUM_LC_utf8((U8*)s));
1639 PL_reg_flags |= RF_tainted;
1640 FBC_NBOUND(isALNUM_LC,
1641 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1642 isALNUM_LC_utf8((U8*)s));
1645 FBC_BOUND(isWORDCHAR,
1647 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1650 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1652 isWORDCHAR_A((U8*)s));
1655 FBC_NBOUND(isWORDCHAR,
1657 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1660 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1662 isWORDCHAR_A((U8*)s));
1665 FBC_BOUND(isWORDCHAR_L1,
1667 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1670 FBC_NBOUND(isWORDCHAR_L1,
1672 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1675 REXEC_FBC_CSCAN_TAINT(
1676 isALNUM_LC_utf8((U8*)s),
1680 REXEC_FBC_CSCAN_PRELOAD(
1681 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1682 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1683 isWORDCHAR_L1((U8) *s)
1686 REXEC_FBC_CSCAN_PRELOAD(
1687 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1688 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1692 /* Don't need to worry about utf8, as it can match only a single
1693 * byte invariant character */
1694 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1696 REXEC_FBC_CSCAN_PRELOAD(
1697 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1698 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1699 ! isWORDCHAR_L1((U8) *s)
1702 REXEC_FBC_CSCAN_PRELOAD(
1703 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1704 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1714 REXEC_FBC_CSCAN_TAINT(
1715 !isALNUM_LC_utf8((U8*)s),
1719 REXEC_FBC_CSCAN_PRELOAD(
1720 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1721 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1725 REXEC_FBC_CSCAN_PRELOAD(
1726 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1727 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1731 /* Don't need to worry about utf8, as it can match only a single
1732 * byte invariant character */
1733 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1735 REXEC_FBC_CSCAN_TAINT(
1736 isSPACE_LC_utf8((U8*)s),
1740 REXEC_FBC_CSCAN_PRELOAD(
1741 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1742 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1743 ! isSPACE_L1((U8) *s)
1746 REXEC_FBC_CSCAN_PRELOAD(
1747 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1748 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1758 REXEC_FBC_CSCAN_TAINT(
1759 !isSPACE_LC_utf8((U8*)s),
1763 REXEC_FBC_CSCAN_PRELOAD(
1764 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1765 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1769 /* Don't need to worry about utf8, as it can match only a single
1770 * byte invariant character */
1771 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1773 REXEC_FBC_CSCAN_TAINT(
1774 isDIGIT_LC_utf8((U8*)s),
1778 REXEC_FBC_CSCAN_PRELOAD(
1779 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1780 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1790 REXEC_FBC_CSCAN_TAINT(
1791 !isDIGIT_LC_utf8((U8*)s),
1797 is_LNBREAK_latin1(s)
1807 !is_VERTWS_latin1(s)
1812 is_HORIZWS_latin1(s)
1816 !is_HORIZWS_utf8(s),
1817 !is_HORIZWS_latin1(s)
1823 /* what trie are we using right now */
1825 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1827 = (reg_trie_data*)progi->data->data[ aho->trie ];
1828 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1830 const char *last_start = strend - trie->minlen;
1832 const char *real_start = s;
1834 STRLEN maxlen = trie->maxlen;
1836 U8 **points; /* map of where we were in the input string
1837 when reading a given char. For ASCII this
1838 is unnecessary overhead as the relationship
1839 is always 1:1, but for Unicode, especially
1840 case folded Unicode this is not true. */
1841 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1845 GET_RE_DEBUG_FLAGS_DECL;
1847 /* We can't just allocate points here. We need to wrap it in
1848 * an SV so it gets freed properly if there is a croak while
1849 * running the match */
1852 sv_points=newSV(maxlen * sizeof(U8 *));
1853 SvCUR_set(sv_points,
1854 maxlen * sizeof(U8 *));
1855 SvPOK_on(sv_points);
1856 sv_2mortal(sv_points);
1857 points=(U8**)SvPV_nolen(sv_points );
1858 if ( trie_type != trie_utf8_fold
1859 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1862 bitmap=(U8*)trie->bitmap;
1864 bitmap=(U8*)ANYOF_BITMAP(c);
1866 /* this is the Aho-Corasick algorithm modified a touch
1867 to include special handling for long "unknown char"
1868 sequences. The basic idea being that we use AC as long
1869 as we are dealing with a possible matching char, when
1870 we encounter an unknown char (and we have not encountered
1871 an accepting state) we scan forward until we find a legal
1873 AC matching is basically that of trie matching, except
1874 that when we encounter a failing transition, we fall back
1875 to the current states "fail state", and try the current char
1876 again, a process we repeat until we reach the root state,
1877 state 1, or a legal transition. If we fail on the root state
1878 then we can either terminate if we have reached an accepting
1879 state previously, or restart the entire process from the beginning
1883 while (s <= last_start) {
1884 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1892 U8 *uscan = (U8*)NULL;
1893 U8 *leftmost = NULL;
1895 U32 accepted_word= 0;
1899 while ( state && uc <= (U8*)strend ) {
1901 U32 word = aho->states[ state ].wordnum;
1905 DEBUG_TRIE_EXECUTE_r(
1906 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907 dump_exec_pos( (char *)uc, c, strend, real_start,
1908 (char *)uc, utf8_target );
1909 PerlIO_printf( Perl_debug_log,
1910 " Scanning for legal start char...\n");
1914 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1924 if (uc >(U8*)last_start) break;
1928 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929 if (!leftmost || lpos < leftmost) {
1930 DEBUG_r(accepted_word=word);
1936 points[pointpos++ % maxlen]= uc;
1937 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1938 uscan, len, uvc, charid, foldlen,
1940 DEBUG_TRIE_EXECUTE_r({
1941 dump_exec_pos( (char *)uc, c, strend, real_start,
1943 PerlIO_printf(Perl_debug_log,
1944 " Charid:%3u CP:%4"UVxf" ",
1950 word = aho->states[ state ].wordnum;
1952 base = aho->states[ state ].trans.base;
1954 DEBUG_TRIE_EXECUTE_r({
1956 dump_exec_pos( (char *)uc, c, strend, real_start,
1958 PerlIO_printf( Perl_debug_log,
1959 "%sState: %4"UVxf", word=%"UVxf,
1960 failed ? " Fail transition to " : "",
1961 (UV)state, (UV)word);
1967 ( ((offset = base + charid
1968 - 1 - trie->uniquecharcount)) >= 0)
1969 && ((U32)offset < trie->lasttrans)
1970 && trie->trans[offset].check == state
1971 && (tmp=trie->trans[offset].next))
1973 DEBUG_TRIE_EXECUTE_r(
1974 PerlIO_printf( Perl_debug_log," - legal\n"));
1979 DEBUG_TRIE_EXECUTE_r(
1980 PerlIO_printf( Perl_debug_log," - fail\n"));
1982 state = aho->fail[state];
1986 /* we must be accepting here */
1987 DEBUG_TRIE_EXECUTE_r(
1988 PerlIO_printf( Perl_debug_log," - accepting\n"));
1997 if (!state) state = 1;
2000 if ( aho->states[ state ].wordnum ) {
2001 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2002 if (!leftmost || lpos < leftmost) {
2003 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2008 s = (char*)leftmost;
2009 DEBUG_TRIE_EXECUTE_r({
2011 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2012 (UV)accepted_word, (IV)(s - real_start)
2015 if (!reginfo || regtry(reginfo, &s)) {
2021 DEBUG_TRIE_EXECUTE_r({
2022 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2025 DEBUG_TRIE_EXECUTE_r(
2026 PerlIO_printf( Perl_debug_log,"No match.\n"));
2035 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2045 - regexec_flags - match a regexp against a string
2048 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2049 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2050 /* strend: pointer to null at end of string */
2051 /* strbeg: real beginning of string */
2052 /* minend: end of match must be >=minend after stringarg. */
2053 /* data: May be used for some additional optimizations.
2054 Currently its only used, with a U32 cast, for transmitting
2055 the ganch offset when doing a /g match. This will change */
2056 /* nosave: For optimizations. */
2059 struct regexp *const prog = (struct regexp *)SvANY(rx);
2060 /*register*/ char *s;
2061 register regnode *c;
2062 /*register*/ char *startpos = stringarg;
2063 I32 minlen; /* must match at least this many chars */
2064 I32 dontbother = 0; /* how many characters not to try at end */
2065 I32 end_shift = 0; /* Same for the end. */ /* CC */
2066 I32 scream_pos = -1; /* Internal iterator of scream. */
2067 char *scream_olds = NULL;
2068 const bool utf8_target = cBOOL(DO_UTF8(sv));
2070 RXi_GET_DECL(prog,progi);
2071 regmatch_info reginfo; /* create some info to pass to regtry etc */
2072 regexp_paren_pair *swap = NULL;
2073 GET_RE_DEBUG_FLAGS_DECL;
2075 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2076 PERL_UNUSED_ARG(data);
2078 /* Be paranoid... */
2079 if (prog == NULL || startpos == NULL) {
2080 Perl_croak(aTHX_ "NULL regexp parameter");
2084 multiline = prog->extflags & RXf_PMf_MULTILINE;
2085 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2087 RX_MATCH_UTF8_set(rx, utf8_target);
2089 debug_start_match(rx, utf8_target, startpos, strend,
2093 minlen = prog->minlen;
2095 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2097 "String too short [regexec_flags]...\n"));
2102 /* Check validity of program. */
2103 if (UCHARAT(progi->program) != REG_MAGIC) {
2104 Perl_croak(aTHX_ "corrupted regexp program");
2108 PL_reg_eval_set = 0;
2112 PL_reg_flags |= RF_utf8;
2114 /* Mark beginning of line for ^ and lookbehind. */
2115 reginfo.bol = startpos; /* XXX not used ??? */
2119 /* Mark end of line for $ (and such) */
2122 /* see how far we have to get to not match where we matched before */
2123 reginfo.till = startpos+minend;
2125 /* If there is a "must appear" string, look for it. */
2128 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2130 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2131 reginfo.ganch = startpos + prog->gofs;
2132 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2133 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2134 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2136 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2137 && mg->mg_len >= 0) {
2138 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2139 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2140 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2142 if (prog->extflags & RXf_ANCH_GPOS) {
2143 if (s > reginfo.ganch)
2145 s = reginfo.ganch - prog->gofs;
2146 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2147 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2153 reginfo.ganch = strbeg + PTR2UV(data);
2154 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2155 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2157 } else { /* pos() not defined */
2158 reginfo.ganch = strbeg;
2159 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2160 "GPOS: reginfo.ganch = strbeg\n"));
2163 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2164 /* We have to be careful. If the previous successful match
2165 was from this regex we don't want a subsequent partially
2166 successful match to clobber the old results.
2167 So when we detect this possibility we add a swap buffer
2168 to the re, and switch the buffer each match. If we fail
2169 we switch it back, otherwise we leave it swapped.
2172 /* do we need a save destructor here for eval dies? */
2173 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2175 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2176 re_scream_pos_data d;
2178 d.scream_olds = &scream_olds;
2179 d.scream_pos = &scream_pos;
2180 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2183 goto phooey; /* not present */
2189 /* Simplest case: anchored match need be tried only once. */
2190 /* [unless only anchor is BOL and multiline is set] */
2191 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2192 if (s == startpos && regtry(®info, &startpos))
2194 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2195 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2200 dontbother = minlen - 1;
2201 end = HOP3c(strend, -dontbother, strbeg) - 1;
2202 /* for multiline we only have to try after newlines */
2203 if (prog->check_substr || prog->check_utf8) {
2204 /* because of the goto we can not easily reuse the macros for bifurcating the
2205 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2208 goto after_try_utf8;
2210 if (regtry(®info, &s)) {
2217 if (prog->extflags & RXf_USE_INTUIT) {
2218 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2227 } /* end search for check string in unicode */
2229 if (s == startpos) {
2230 goto after_try_latin;
2233 if (regtry(®info, &s)) {
2240 if (prog->extflags & RXf_USE_INTUIT) {
2241 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2250 } /* end search for check string in latin*/
2251 } /* end search for check string */
2252 else { /* search for newline */
2254 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2257 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2259 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2260 if (regtry(®info, &s))
2264 } /* end search for newline */
2265 } /* end anchored/multiline check string search */
2267 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2269 /* the warning about reginfo.ganch being used without initialization
2270 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2271 and we only enter this block when the same bit is set. */
2272 char *tmp_s = reginfo.ganch - prog->gofs;
2274 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2279 /* Messy cases: unanchored match. */
2280 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2281 /* we have /x+whatever/ */
2282 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2287 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2288 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2289 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2294 DEBUG_EXECUTE_r( did_match = 1 );
2295 if (regtry(®info, &s)) goto got_it;
2297 while (s < strend && *s == ch)
2305 DEBUG_EXECUTE_r( did_match = 1 );
2306 if (regtry(®info, &s)) goto got_it;
2308 while (s < strend && *s == ch)
2313 DEBUG_EXECUTE_r(if (!did_match)
2314 PerlIO_printf(Perl_debug_log,
2315 "Did not find anchored character...\n")
2318 else if (prog->anchored_substr != NULL
2319 || prog->anchored_utf8 != NULL
2320 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2321 && prog->float_max_offset < strend - s)) {
2326 char *last1; /* Last position checked before */
2330 if (prog->anchored_substr || prog->anchored_utf8) {
2331 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2332 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2333 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2334 back_max = back_min = prog->anchored_offset;
2336 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2337 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2338 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2339 back_max = prog->float_max_offset;
2340 back_min = prog->float_min_offset;
2344 if (must == &PL_sv_undef)
2345 /* could not downgrade utf8 check substring, so must fail */
2351 last = HOP3c(strend, /* Cannot start after this */
2352 -(I32)(CHR_SVLEN(must)
2353 - (SvTAIL(must) != 0) + back_min), strbeg);
2356 last1 = HOPc(s, -1);
2358 last1 = s - 1; /* bogus */
2360 /* XXXX check_substr already used to find "s", can optimize if
2361 check_substr==must. */
2363 dontbother = end_shift;
2364 strend = HOPc(strend, -dontbother);
2365 while ( (s <= last) &&
2366 ((flags & REXEC_SCREAM)
2367 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2368 end_shift, &scream_pos, 0))
2369 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2370 (unsigned char*)strend, must,
2371 multiline ? FBMrf_MULTILINE : 0))) ) {
2372 /* we may be pointing at the wrong string */
2373 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2374 s = strbeg + (s - SvPVX_const(sv));
2375 DEBUG_EXECUTE_r( did_match = 1 );
2376 if (HOPc(s, -back_max) > last1) {
2377 last1 = HOPc(s, -back_min);
2378 s = HOPc(s, -back_max);
2381 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2383 last1 = HOPc(s, -back_min);
2387 while (s <= last1) {
2388 if (regtry(®info, &s))
2394 while (s <= last1) {
2395 if (regtry(®info, &s))
2401 DEBUG_EXECUTE_r(if (!did_match) {
2402 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2403 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2404 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2405 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2406 ? "anchored" : "floating"),
2407 quoted, RE_SV_TAIL(must));
2411 else if ( (c = progi->regstclass) ) {
2413 const OPCODE op = OP(progi->regstclass);
2414 /* don't bother with what can't match */
2415 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2416 strend = HOPc(strend, -(minlen - 1));
2419 SV * const prop = sv_newmortal();
2420 regprop(prog, prop, c);
2422 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2424 PerlIO_printf(Perl_debug_log,
2425 "Matching stclass %.*s against %s (%d bytes)\n",
2426 (int)SvCUR(prop), SvPVX_const(prop),
2427 quoted, (int)(strend - s));
2430 if (find_byclass(prog, c, s, strend, ®info))
2432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2436 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2441 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2442 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2443 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2445 if (flags & REXEC_SCREAM) {
2446 last = screaminstr(sv, float_real, s - strbeg,
2447 end_shift, &scream_pos, 1); /* last one */
2449 last = scream_olds; /* Only one occurrence. */
2450 /* we may be pointing at the wrong string */
2451 else if (RXp_MATCH_COPIED(prog))
2452 s = strbeg + (s - SvPVX_const(sv));
2456 const char * const little = SvPV_const(float_real, len);
2458 if (SvTAIL(float_real)) {
2459 if (memEQ(strend - len + 1, little, len - 1))
2460 last = strend - len + 1;
2461 else if (!multiline)
2462 last = memEQ(strend - len, little, len)
2463 ? strend - len : NULL;
2469 last = rninstr(s, strend, little, little + len);
2471 last = strend; /* matching "$" */
2476 PerlIO_printf(Perl_debug_log,
2477 "%sCan't trim the tail, match fails (should not happen)%s\n",
2478 PL_colors[4], PL_colors[5]));
2479 goto phooey; /* Should not happen! */
2481 dontbother = strend - last + prog->float_min_offset;
2483 if (minlen && (dontbother < minlen))
2484 dontbother = minlen - 1;
2485 strend -= dontbother; /* this one's always in bytes! */
2486 /* We don't know much -- general case. */
2489 if (regtry(®info, &s))
2498 if (regtry(®info, &s))
2500 } while (s++ < strend);
2509 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2511 if (PL_reg_eval_set)
2512 restore_pos(aTHX_ prog);
2513 if (RXp_PAREN_NAMES(prog))
2514 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2516 /* make sure $`, $&, $', and $digit will work later */
2517 if ( !(flags & REXEC_NOT_FIRST) ) {
2518 RX_MATCH_COPY_FREE(rx);
2519 if (flags & REXEC_COPY_STR) {
2520 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2521 #ifdef PERL_OLD_COPY_ON_WRITE
2523 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2525 PerlIO_printf(Perl_debug_log,
2526 "Copy on write: regexp capture, type %d\n",
2529 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2530 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2531 assert (SvPOKp(prog->saved_copy));
2535 RX_MATCH_COPIED_on(rx);
2536 s = savepvn(strbeg, i);
2542 prog->subbeg = strbeg;
2543 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2551 PL_colors[4], PL_colors[5]));
2552 if (PL_reg_eval_set)
2553 restore_pos(aTHX_ prog);
2555 /* we failed :-( roll it back */
2556 Safefree(prog->offs);
2565 - regtry - try match at specific point
2567 STATIC I32 /* 0 failure, 1 success */
2568 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2572 REGEXP *const rx = reginfo->prog;
2573 regexp *const prog = (struct regexp *)SvANY(rx);
2574 RXi_GET_DECL(prog,progi);
2575 GET_RE_DEBUG_FLAGS_DECL;
2577 PERL_ARGS_ASSERT_REGTRY;
2579 reginfo->cutpoint=NULL;
2581 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2584 PL_reg_eval_set = RS_init;
2585 DEBUG_EXECUTE_r(DEBUG_s(
2586 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2587 (IV)(PL_stack_sp - PL_stack_base));
2590 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2591 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2593 /* Apparently this is not needed, judging by wantarray. */
2594 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2595 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2598 /* Make $_ available to executed code. */
2599 if (reginfo->sv != DEFSV) {
2601 DEFSV_set(reginfo->sv);
2604 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2605 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2606 /* prepare for quick setting of pos */
2607 #ifdef PERL_OLD_COPY_ON_WRITE
2608 if (SvIsCOW(reginfo->sv))
2609 sv_force_normal_flags(reginfo->sv, 0);
2611 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2612 &PL_vtbl_mglob, NULL, 0);
2616 PL_reg_oldpos = mg->mg_len;
2617 SAVEDESTRUCTOR_X(restore_pos, prog);
2619 if (!PL_reg_curpm) {
2620 Newxz(PL_reg_curpm, 1, PMOP);
2623 SV* const repointer = &PL_sv_undef;
2624 /* this regexp is also owned by the new PL_reg_curpm, which
2625 will try to free it. */
2626 av_push(PL_regex_padav, repointer);
2627 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2628 PL_regex_pad = AvARRAY(PL_regex_padav);
2633 /* It seems that non-ithreads works both with and without this code.
2634 So for efficiency reasons it seems best not to have the code
2635 compiled when it is not needed. */
2636 /* This is safe against NULLs: */
2637 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2638 /* PM_reg_curpm owns a reference to this regexp. */
2641 PM_SETRE(PL_reg_curpm, rx);
2642 PL_reg_oldcurpm = PL_curpm;
2643 PL_curpm = PL_reg_curpm;
2644 if (RXp_MATCH_COPIED(prog)) {
2645 /* Here is a serious problem: we cannot rewrite subbeg,
2646 since it may be needed if this match fails. Thus
2647 $` inside (?{}) could fail... */
2648 PL_reg_oldsaved = prog->subbeg;
2649 PL_reg_oldsavedlen = prog->sublen;
2650 #ifdef PERL_OLD_COPY_ON_WRITE
2651 PL_nrs = prog->saved_copy;
2653 RXp_MATCH_COPIED_off(prog);
2656 PL_reg_oldsaved = NULL;
2657 prog->subbeg = PL_bostr;
2658 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2660 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2661 prog->offs[0].start = *startpos - PL_bostr;
2662 PL_reginput = *startpos;
2663 PL_reglastparen = &prog->lastparen;
2664 PL_reglastcloseparen = &prog->lastcloseparen;
2665 prog->lastparen = 0;
2666 prog->lastcloseparen = 0;
2668 PL_regoffs = prog->offs;
2669 if (PL_reg_start_tmpl <= prog->nparens) {
2670 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2671 if(PL_reg_start_tmp)
2672 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2674 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2677 /* XXXX What this code is doing here?!!! There should be no need
2678 to do this again and again, PL_reglastparen should take care of
2681 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2682 * Actually, the code in regcppop() (which Ilya may be meaning by
2683 * PL_reglastparen), is not needed at all by the test suite
2684 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2685 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2686 * Meanwhile, this code *is* needed for the
2687 * above-mentioned test suite tests to succeed. The common theme
2688 * on those tests seems to be returning null fields from matches.
2689 * --jhi updated by dapm */
2691 if (prog->nparens) {
2692 regexp_paren_pair *pp = PL_regoffs;
2694 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2702 if (regmatch(reginfo, progi->program + 1)) {
2703 PL_regoffs[0].end = PL_reginput - PL_bostr;
2706 if (reginfo->cutpoint)
2707 *startpos= reginfo->cutpoint;
2708 REGCP_UNWIND(lastcp);
2713 #define sayYES goto yes
2714 #define sayNO goto no
2715 #define sayNO_SILENT goto no_silent
2717 /* we dont use STMT_START/END here because it leads to
2718 "unreachable code" warnings, which are bogus, but distracting. */
2719 #define CACHEsayNO \
2720 if (ST.cache_mask) \
2721 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2724 /* this is used to determine how far from the left messages like
2725 'failed...' are printed. It should be set such that messages
2726 are inline with the regop output that created them.
2728 #define REPORT_CODE_OFF 32
2731 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2732 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2734 #define SLAB_FIRST(s) (&(s)->states[0])
2735 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2737 /* grab a new slab and return the first slot in it */
2739 STATIC regmatch_state *
2742 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2745 regmatch_slab *s = PL_regmatch_slab->next;
2747 Newx(s, 1, regmatch_slab);
2748 s->prev = PL_regmatch_slab;
2750 PL_regmatch_slab->next = s;
2752 PL_regmatch_slab = s;
2753 return SLAB_FIRST(s);
2757 /* push a new state then goto it */
2759 #define PUSH_STATE_GOTO(state, node) \
2761 st->resume_state = state; \
2764 /* push a new state with success backtracking, then goto it */
2766 #define PUSH_YES_STATE_GOTO(state, node) \
2768 st->resume_state = state; \
2769 goto push_yes_state;
2775 regmatch() - main matching routine
2777 This is basically one big switch statement in a loop. We execute an op,
2778 set 'next' to point the next op, and continue. If we come to a point which
2779 we may need to backtrack to on failure such as (A|B|C), we push a
2780 backtrack state onto the backtrack stack. On failure, we pop the top
2781 state, and re-enter the loop at the state indicated. If there are no more
2782 states to pop, we return failure.
2784 Sometimes we also need to backtrack on success; for example /A+/, where
2785 after successfully matching one A, we need to go back and try to
2786 match another one; similarly for lookahead assertions: if the assertion
2787 completes successfully, we backtrack to the state just before the assertion
2788 and then carry on. In these cases, the pushed state is marked as
2789 'backtrack on success too'. This marking is in fact done by a chain of
2790 pointers, each pointing to the previous 'yes' state. On success, we pop to
2791 the nearest yes state, discarding any intermediate failure-only states.
2792 Sometimes a yes state is pushed just to force some cleanup code to be
2793 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2794 it to free the inner regex.
2796 Note that failure backtracking rewinds the cursor position, while
2797 success backtracking leaves it alone.
2799 A pattern is complete when the END op is executed, while a subpattern
2800 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2801 ops trigger the "pop to last yes state if any, otherwise return true"
2804 A common convention in this function is to use A and B to refer to the two
2805 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2806 the subpattern to be matched possibly multiple times, while B is the entire
2807 rest of the pattern. Variable and state names reflect this convention.
2809 The states in the main switch are the union of ops and failure/success of
2810 substates associated with with that op. For example, IFMATCH is the op
2811 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2812 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2813 successfully matched A and IFMATCH_A_fail is a state saying that we have
2814 just failed to match A. Resume states always come in pairs. The backtrack
2815 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2816 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2817 on success or failure.
2819 The struct that holds a backtracking state is actually a big union, with
2820 one variant for each major type of op. The variable st points to the
2821 top-most backtrack struct. To make the code clearer, within each
2822 block of code we #define ST to alias the relevant union.
2824 Here's a concrete example of a (vastly oversimplified) IFMATCH
2830 #define ST st->u.ifmatch
2832 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2833 ST.foo = ...; // some state we wish to save
2835 // push a yes backtrack state with a resume value of
2836 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2838 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2841 case IFMATCH_A: // we have successfully executed A; now continue with B
2843 bar = ST.foo; // do something with the preserved value
2846 case IFMATCH_A_fail: // A failed, so the assertion failed
2847 ...; // do some housekeeping, then ...
2848 sayNO; // propagate the failure
2855 For any old-timers reading this who are familiar with the old recursive
2856 approach, the code above is equivalent to:
2858 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2867 ...; // do some housekeeping, then ...
2868 sayNO; // propagate the failure
2871 The topmost backtrack state, pointed to by st, is usually free. If you
2872 want to claim it, populate any ST.foo fields in it with values you wish to
2873 save, then do one of
2875 PUSH_STATE_GOTO(resume_state, node);
2876 PUSH_YES_STATE_GOTO(resume_state, node);
2878 which sets that backtrack state's resume value to 'resume_state', pushes a
2879 new free entry to the top of the backtrack stack, then goes to 'node'.
2880 On backtracking, the free slot is popped, and the saved state becomes the
2881 new free state. An ST.foo field in this new top state can be temporarily
2882 accessed to retrieve values, but once the main loop is re-entered, it
2883 becomes available for reuse.
2885 Note that the depth of the backtrack stack constantly increases during the
2886 left-to-right execution of the pattern, rather than going up and down with
2887 the pattern nesting. For example the stack is at its maximum at Z at the
2888 end of the pattern, rather than at X in the following:
2890 /(((X)+)+)+....(Y)+....Z/
2892 The only exceptions to this are lookahead/behind assertions and the cut,
2893 (?>A), which pop all the backtrack states associated with A before
2896 Backtrack state structs are allocated in slabs of about 4K in size.
2897 PL_regmatch_state and st always point to the currently active state,
2898 and PL_regmatch_slab points to the slab currently containing
2899 PL_regmatch_state. The first time regmatch() is called, the first slab is
2900 allocated, and is never freed until interpreter destruction. When the slab
2901 is full, a new one is allocated and chained to the end. At exit from
2902 regmatch(), slabs allocated since entry are freed.
2907 #define DEBUG_STATE_pp(pp) \
2909 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2910 PerlIO_printf(Perl_debug_log, \
2911 " %*s"pp" %s%s%s%s%s\n", \
2913 PL_reg_name[st->resume_state], \
2914 ((st==yes_state||st==mark_state) ? "[" : ""), \
2915 ((st==yes_state) ? "Y" : ""), \
2916 ((st==mark_state) ? "M" : ""), \
2917 ((st==yes_state||st==mark_state) ? "]" : "") \
2922 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2927 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2928 const char *start, const char *end, const char *blurb)
2930 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2932 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2937 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2938 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2940 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2941 start, end - start, 60);
2943 PerlIO_printf(Perl_debug_log,
2944 "%s%s REx%s %s against %s\n",
2945 PL_colors[4], blurb, PL_colors[5], s0, s1);
2947 if (utf8_target||utf8_pat)
2948 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2949 utf8_pat ? "pattern" : "",
2950 utf8_pat && utf8_target ? " and " : "",
2951 utf8_target ? "string" : ""
2957 S_dump_exec_pos(pTHX_ const char *locinput,
2958 const regnode *scan,
2959 const char *loc_regeol,
2960 const char *loc_bostr,
2961 const char *loc_reg_starttry,
2962 const bool utf8_target)
2964 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2965 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2966 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2967 /* The part of the string before starttry has one color
2968 (pref0_len chars), between starttry and current
2969 position another one (pref_len - pref0_len chars),
2970 after the current position the third one.
2971 We assume that pref0_len <= pref_len, otherwise we
2972 decrease pref0_len. */
2973 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2974 ? (5 + taill) - l : locinput - loc_bostr;
2977 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2979 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2981 pref0_len = pref_len - (locinput - loc_reg_starttry);
2982 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2983 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2984 ? (5 + taill) - pref_len : loc_regeol - locinput);
2985 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2989 if (pref0_len > pref_len)
2990 pref0_len = pref_len;
2992 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2994 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2995 (locinput - pref_len),pref0_len, 60, 4, 5);
2997 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2998 (locinput - pref_len + pref0_len),
2999 pref_len - pref0_len, 60, 2, 3);
3001 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3002 locinput, loc_regeol - locinput, 10, 0, 1);
3004 const STRLEN tlen=len0+len1+len2;
3005 PerlIO_printf(Perl_debug_log,
3006 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3007 (IV)(locinput - loc_bostr),
3010 (docolor ? "" : "> <"),
3012 (int)(tlen > 19 ? 0 : 19 - tlen),
3019 /* reg_check_named_buff_matched()
3020 * Checks to see if a named buffer has matched. The data array of
3021 * buffer numbers corresponding to the buffer is expected to reside
3022 * in the regexp->data->data array in the slot stored in the ARG() of
3023 * node involved. Note that this routine doesn't actually care about the
3024 * name, that information is not preserved from compilation to execution.
3025 * Returns the index of the leftmost defined buffer with the given name
3026 * or 0 if non of the buffers matched.
3029 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3032 RXi_GET_DECL(rex,rexi);
3033 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3034 I32 *nums=(I32*)SvPVX(sv_dat);
3036 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3038 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3039 if ((I32)*PL_reglastparen >= nums[n] &&
3040 PL_regoffs[nums[n]].end != -1)
3049 /* free all slabs above current one - called during LEAVE_SCOPE */
3052 S_clear_backtrack_stack(pTHX_ void *p)
3054 regmatch_slab *s = PL_regmatch_slab->next;
3059 PL_regmatch_slab->next = NULL;
3061 regmatch_slab * const osl = s;
3068 #define SETREX(Re1,Re2) \
3069 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3072 STATIC I32 /* 0 failure, 1 success */
3073 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3075 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3079 register const bool utf8_target = PL_reg_match_utf8;
3080 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3081 REGEXP *rex_sv = reginfo->prog;
3082 regexp *rex = (struct regexp *)SvANY(rex_sv);
3083 RXi_GET_DECL(rex,rexi);
3085 /* the current state. This is a cached copy of PL_regmatch_state */
3086 register regmatch_state *st;
3087 /* cache heavy used fields of st in registers */
3088 register regnode *scan;
3089 register regnode *next;
3090 register U32 n = 0; /* general value; init to avoid compiler warning */
3091 register I32 ln = 0; /* len or last; init to avoid compiler warning */
3092 register char *locinput = PL_reginput;
3093 register I32 nextchr; /* is always set to UCHARAT(locinput) */
3095 bool result = 0; /* return value of S_regmatch */
3096 int depth = 0; /* depth of backtrack stack */
3097 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3098 const U32 max_nochange_depth =
3099 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3100 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3101 regmatch_state *yes_state = NULL; /* state to pop to on success of
3103 /* mark_state piggy backs on the yes_state logic so that when we unwind
3104 the stack on success we can update the mark_state as we go */
3105 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3106 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3107 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3109 bool no_final = 0; /* prevent failure from backtracking? */
3110 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3111 char *startpoint = PL_reginput;
3112 SV *popmark = NULL; /* are we looking for a mark? */
3113 SV *sv_commit = NULL; /* last mark name seen in failure */
3114 SV *sv_yes_mark = NULL; /* last mark name we have seen
3115 during a successful match */
3116 U32 lastopen = 0; /* last open we saw */
3117 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3118 SV* const oreplsv = GvSV(PL_replgv);
3119 /* these three flags are set by various ops to signal information to
3120 * the very next op. They have a useful lifetime of exactly one loop
3121 * iteration, and are not preserved or restored by state pushes/pops
3123 bool sw = 0; /* the condition value in (?(cond)a|b) */
3124 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3125 int logical = 0; /* the following EVAL is:
3129 or the following IFMATCH/UNLESSM is:
3130 false: plain (?=foo)
3131 true: used as a condition: (?(?=foo))
3134 GET_RE_DEBUG_FLAGS_DECL;
3137 PERL_ARGS_ASSERT_REGMATCH;
3139 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3140 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3142 /* on first ever call to regmatch, allocate first slab */
3143 if (!PL_regmatch_slab) {
3144 Newx(PL_regmatch_slab, 1, regmatch_slab);
3145 PL_regmatch_slab->prev = NULL;
3146 PL_regmatch_slab->next = NULL;
3147 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3150 oldsave = PL_savestack_ix;
3151 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3152 SAVEVPTR(PL_regmatch_slab);
3153 SAVEVPTR(PL_regmatch_state);
3155 /* grab next free state slot */
3156 st = ++PL_regmatch_state;
3157 if (st > SLAB_LAST(PL_regmatch_slab))
3158 st = PL_regmatch_state = S_push_slab(aTHX);
3160 /* Note that nextchr is a byte even in UTF */
3161 nextchr = UCHARAT(locinput);
3163 while (scan != NULL) {
3166 SV * const prop = sv_newmortal();
3167 regnode *rnext=regnext(scan);
3168 DUMP_EXEC_POS( locinput, scan, utf8_target );
3169 regprop(rex, prop, scan);
3171 PerlIO_printf(Perl_debug_log,
3172 "%3"IVdf":%*s%s(%"IVdf")\n",
3173 (IV)(scan - rexi->program), depth*2, "",
3175 (PL_regkind[OP(scan)] == END || !rnext) ?
3176 0 : (IV)(rnext - rexi->program));
3179 next = scan + NEXT_OFF(scan);
3182 state_num = OP(scan);
3186 assert(PL_reglastparen == &rex->lastparen);
3187 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3188 assert(PL_regoffs == rex->offs);
3190 switch (state_num) {
3192 if (locinput == PL_bostr)
3194 /* reginfo->till = reginfo->bol; */
3199 if (locinput == PL_bostr ||
3200 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3206 if (locinput == PL_bostr)
3210 if (locinput == reginfo->ganch)
3215 /* update the startpoint */
3216 st->u.keeper.val = PL_regoffs[0].start;
3217 PL_reginput = locinput;
3218 PL_regoffs[0].start = locinput - PL_bostr;
3219 PUSH_STATE_GOTO(KEEPS_next, next);
3221 case KEEPS_next_fail:
3222 /* rollback the start point change */
3223 PL_regoffs[0].start = st->u.keeper.val;
3229 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3234 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3236 if (PL_regeol - locinput > 1)
3240 if (PL_regeol != locinput)
3244 if (!nextchr && locinput >= PL_regeol)
3247 locinput += PL_utf8skip[nextchr];
3248 if (locinput > PL_regeol)
3250 nextchr = UCHARAT(locinput);
3253 nextchr = UCHARAT(++locinput);
3256 if (!nextchr && locinput >= PL_regeol)
3258 nextchr = UCHARAT(++locinput);
3261 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3264 locinput += PL_utf8skip[nextchr];
3265 if (locinput > PL_regeol)
3267 nextchr = UCHARAT(locinput);
3270 nextchr = UCHARAT(++locinput);
3274 #define ST st->u.trie
3276 /* In this case the charclass data is available inline so
3277 we can fail fast without a lot of extra overhead.
3279 if (scan->flags == EXACT || !utf8_target) {
3280 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3282 PerlIO_printf(Perl_debug_log,
3283 "%*s %sfailed to match trie start class...%s\n",
3284 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3292 /* the basic plan of execution of the trie is:
3293 * At the beginning, run though all the states, and
3294 * find the longest-matching word. Also remember the position
3295 * of the shortest matching word. For example, this pattern:
3298 * when matched against the string "abcde", will generate
3299 * accept states for all words except 3, with the longest
3300 * matching word being 4, and the shortest being 1 (with
3301 * the position being after char 1 of the string).
3303 * Then for each matching word, in word order (i.e. 1,2,4,5),
3304 * we run the remainder of the pattern; on each try setting
3305 * the current position to the character following the word,
3306 * returning to try the next word on failure.
3308 * We avoid having to build a list of words at runtime by
3309 * using a compile-time structure, wordinfo[].prev, which
3310 * gives, for each word, the previous accepting word (if any).
3311 * In the case above it would contain the mappings 1->2, 2->0,
3312 * 3->0, 4->5, 5->1. We can use this table to generate, from
3313 * the longest word (4 above), a list of all words, by
3314 * following the list of prev pointers; this gives us the
3315 * unordered list 4,5,1,2. Then given the current word we have
3316 * just tried, we can go through the list and find the
3317 * next-biggest word to try (so if we just failed on word 2,
3318 * the next in the list is 4).
3320 * Since at runtime we don't record the matching position in
3321 * the string for each word, we have to work that out for
3322 * each word we're about to process. The wordinfo table holds
3323 * the character length of each word; given that we recorded
3324 * at the start: the position of the shortest word and its
3325 * length in chars, we just need to move the pointer the
3326 * difference between the two char lengths. Depending on
3327 * Unicode status and folding, that's cheap or expensive.
3329 * This algorithm is optimised for the case where are only a
3330 * small number of accept states, i.e. 0,1, or maybe 2.
3331 * With lots of accepts states, and having to try all of them,
3332 * it becomes quadratic on number of accept states to find all
3337 /* what type of TRIE am I? (utf8 makes this contextual) */
3338 DECL_TRIE_TYPE(scan);
3340 /* what trie are we using right now */
3341 reg_trie_data * const trie
3342 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3343 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3344 U32 state = trie->startstate;
3346 if (trie->bitmap && trie_type != trie_utf8_fold &&
3347 !TRIE_BITMAP_TEST(trie,*locinput)
3349 if (trie->states[ state ].wordnum) {
3351 PerlIO_printf(Perl_debug_log,
3352 "%*s %smatched empty string...%s\n",
3353 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3359 PerlIO_printf(Perl_debug_log,
3360 "%*s %sfailed to match trie start class...%s\n",
3361 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3368 U8 *uc = ( U8* )locinput;
3372 U8 *uscan = (U8*)NULL;
3373 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3374 U32 charcount = 0; /* how many input chars we have matched */
3375 U32 accepted = 0; /* have we seen any accepting states? */
3378 ST.jump = trie->jump;
3381 ST.longfold = FALSE; /* char longer if folded => it's harder */
3384 /* fully traverse the TRIE; note the position of the
3385 shortest accept state and the wordnum of the longest
3388 while ( state && uc <= (U8*)PL_regeol ) {
3389 U32 base = trie->states[ state ].trans.base;
3393 wordnum = trie->states[ state ].wordnum;
3395 if (wordnum) { /* it's an accept state */
3398 /* record first match position */
3400 ST.firstpos = (U8*)locinput;
3405 ST.firstchars = charcount;
3408 if (!ST.nextword || wordnum < ST.nextword)
3409 ST.nextword = wordnum;
3410 ST.topword = wordnum;
3413 DEBUG_TRIE_EXECUTE_r({
3414 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3415 PerlIO_printf( Perl_debug_log,
3416 "%*s %sState: %4"UVxf" Accepted: %c ",
3417 2+depth * 2, "", PL_colors[4],
3418 (UV)state, (accepted ? 'Y' : 'N'));
3421 /* read a char and goto next state */
3424 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3425 uscan, len, uvc, charid, foldlen,
3432 base + charid - 1 - trie->uniquecharcount)) >= 0)
3434 && ((U32)offset < trie->lasttrans)
3435 && trie->trans[offset].check == state)
3437 state = trie->trans[offset].next;
3448 DEBUG_TRIE_EXECUTE_r(
3449 PerlIO_printf( Perl_debug_log,
3450 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3451 charid, uvc, (UV)state, PL_colors[5] );
3457 /* calculate total number of accept states */
3462 w = trie->wordinfo[w].prev;
3465 ST.accepted = accepted;
3469 PerlIO_printf( Perl_debug_log,
3470 "%*s %sgot %"IVdf" possible matches%s\n",
3471 REPORT_CODE_OFF + depth * 2, "",
3472 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3474 goto trie_first_try; /* jump into the fail handler */
3478 case TRIE_next_fail: /* we failed - try next alternative */
3480 REGCP_UNWIND(ST.cp);
3481 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3482 PL_regoffs[n].end = -1;
3483 *PL_reglastparen = n;
3485 if (!--ST.accepted) {
3487 PerlIO_printf( Perl_debug_log,
3488 "%*s %sTRIE failed...%s\n",
3489 REPORT_CODE_OFF+depth*2, "",
3496 /* Find next-highest word to process. Note that this code
3497 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3498 register U16 min = 0;
3500 register U16 const nextword = ST.nextword;
3501 register reg_trie_wordinfo * const wordinfo
3502 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3503 for (word=ST.topword; word; word=wordinfo[word].prev) {
3504 if (word > nextword && (!min || word < min))
3517 ST.lastparen = *PL_reglastparen;
3521 /* find start char of end of current word */
3523 U32 chars; /* how many chars to skip */
3524 U8 *uc = ST.firstpos;
3525 reg_trie_data * const trie
3526 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3528 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3530 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3534 /* the hard option - fold each char in turn and find
3535 * its folded length (which may be different */
3536 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3544 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3552 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3557 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3571 PL_reginput = (char *)uc;
3574 scan = (ST.jump && ST.jump[ST.nextword])
3575 ? ST.me + ST.jump[ST.nextword]
3579 PerlIO_printf( Perl_debug_log,
3580 "%*s %sTRIE matched word #%d, continuing%s\n",
3581 REPORT_CODE_OFF+depth*2, "",
3588 if (ST.accepted > 1 || has_cutgroup) {
3589 PUSH_STATE_GOTO(TRIE_next, scan);
3592 /* only one choice left - just continue */
3594 AV *const trie_words
3595 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3596 SV ** const tmp = av_fetch( trie_words,
3598 SV *sv= tmp ? sv_newmortal() : NULL;
3600 PerlIO_printf( Perl_debug_log,
3601 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3602 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3604 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3605 PL_colors[0], PL_colors[1],
3606 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3608 : "not compiled under -Dr",
3612 locinput = PL_reginput;
3613 nextchr = UCHARAT(locinput);
3614 continue; /* execute rest of RE */
3619 char *s = STRING(scan);
3621 if (utf8_target != UTF_PATTERN) {
3622 /* The target and the pattern have differing utf8ness. */
3624 const char * const e = s + ln;
3627 /* The target is utf8, the pattern is not utf8. */
3632 if (NATIVE_TO_UNI(*(U8*)s) !=
3633 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3641 /* The target is not utf8, the pattern is utf8. */
3646 if (NATIVE_TO_UNI(*((U8*)l)) !=
3647 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3655 nextchr = UCHARAT(locinput);
3658 /* The target and the pattern have the same utf8ness. */
3659 /* Inline the first character, for speed. */
3660 if (UCHARAT(s) != nextchr)
3662 if (PL_regeol - locinput < ln)
3664 if (ln > 1 && memNE(s, locinput, ln))
3667 nextchr = UCHARAT(locinput);
3672 const U8 * fold_array;
3675 PL_reg_flags |= RF_tainted;
3676 folder = foldEQ_locale;
3677 fold_array = PL_fold_locale;
3681 folder = foldEQ_latin1;
3682 fold_array = PL_fold_latin1;
3687 fold_array = PL_fold;
3693 if (utf8_target || UTF_PATTERN) {
3694 /* Either target or the pattern are utf8. */
3695 const char * const l = locinput;
3696 char *e = PL_regeol;
3698 if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF_PATTERN),
3699 l, &e, 0, utf8_target)) {
3700 /* One more case for the sharp s:
3701 * pack("U0U*", 0xDF) =~ /ss/i,
3702 * the 0xC3 0x9F are the UTF-8
3703 * byte sequence for the U+00DF. */
3705 if (!(utf8_target &&
3706 toLOWER(s[0]) == 's' &&
3708 toLOWER(s[1]) == 's' &&
3715 nextchr = UCHARAT(locinput);
3719 /* Neither the target and the pattern are utf8. */
3721 /* Inline the first character, for speed. */
3722 if (UCHARAT(s) != nextchr &&
3723 UCHARAT(s) != fold_array[nextchr])
3727 if (PL_regeol - locinput < ln)
3729 if (ln > 1 && ! folder(s, locinput, ln))
3732 nextchr = UCHARAT(locinput);
3736 /* XXX Could improve efficiency by separating these all out using a
3737 * macro or in-line function. At that point regcomp.c would no longer
3738 * have to set the FLAGS fields of these */
3741 PL_reg_flags |= RF_tainted;
3749 /* was last char in word? */
3750 if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3751 if (locinput == PL_bostr)
3754 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3756 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3758 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3759 ln = isALNUM_uni(ln);
3760 LOAD_UTF8_CHARCLASS_ALNUM();
3761 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3764 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3765 n = isALNUM_LC_utf8((U8*)locinput);
3770 /* Here the string isn't utf8, or is utf8 and only ascii
3771 * characters are to match \w. In the latter case looking at
3772 * the byte just prior to the current one may be just the final
3773 * byte of a multi-byte character. This is ok. There are two
3775 * 1) it is a single byte character, and then the test is doing
3776 * just what it's supposed to.
3777 * 2) it is a multi-byte character, in which case the final
3778 * byte is never mistakable for ASCII, and so the test
3779 * will say it is not a word character, which is the
3780 * correct answer. */
3781 ln = (locinput != PL_bostr) ?
3782 UCHARAT(locinput - 1) : '\n';
3783 switch (FLAGS(scan)) {
3784 case REGEX_UNICODE_CHARSET:
3785 ln = isWORDCHAR_L1(ln);
3786 n = isWORDCHAR_L1(nextchr);
3788 case REGEX_LOCALE_CHARSET:
3789 ln = isALNUM_LC(ln);
3790 n = isALNUM_LC(nextchr);
3792 case REGEX_DEPENDS_CHARSET:
3794 n = isALNUM(nextchr);
3796 case REGEX_ASCII_RESTRICTED_CHARSET:
3797 ln = isWORDCHAR_A(ln);
3798 n = isWORDCHAR_A(nextchr);
3801 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3805 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3807 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3812 if (utf8_target || state_num == ANYOFV) {
3813 STRLEN inclasslen = PL_regeol - locinput;
3814 if (locinput >= PL_regeol)
3817 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3819 locinput += inclasslen;
3820 nextchr = UCHARAT(locinput);
3825 nextchr = UCHARAT(locinput);
3826 if (!nextchr && locinput >= PL_regeol)