5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define RF_tainted 1 /* tainted information used? e.g. locale */
97 #define RF_warned 2 /* warned about big count? */
99 #define RF_utf8 8 /* Pattern contains multibyte chars? */
101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
106 #define STATIC static
109 /* Valid for non-utf8 strings: avoids the reginclass
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
112 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
113 : ANYOF_BITMAP_TEST(p,*(c)))
119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
122 #define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
126 #define HOPBACKc(pos, off) \
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138 #define NEXTCHR_IS_EOS (nextchr < 0)
140 #define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
143 #define SET_locinput(p) \
148 /* these are unrolled below in the CCC_TRY_XXX defined */
149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
150 if (!CAT2(PL_utf8_,class)) { \
152 ENTER; save_re_context(); \
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
156 /* Doesn't do an assert to verify that is correct */
157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
158 if (!CAT2(PL_utf8_,class)) { \
159 bool throw_away PERL_UNUSED_DECL; \
160 ENTER; save_re_context(); \
161 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
164 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
165 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
166 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
168 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
169 /* No asserts are done for some of these, in case called on a */ \
170 /* Unicode version in which they map to nothing */ \
171 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
172 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
174 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
176 /* The actual code for CCC_TRY, which uses several variables from the routine
177 * it's callable from. It is designed to be the bulk of a case statement.
178 * FUNC is the macro or function to call on non-utf8 targets that indicate if
179 * nextchr matches the class.
180 * UTF8_TEST is the whole test string to use for utf8 targets
181 * LOAD is what to use to test, and if not present to load in the swash for the
183 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
185 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
186 * utf8 and a variant, load the swash if necessary and test using the utf8
187 * test. Advance to the next character if test is ok, otherwise fail; If not
188 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
189 * fails, or advance to the next character */
191 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
192 if (NEXTCHR_IS_EOS) { \
195 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
196 LOAD_UTF8_CHARCLASS(CLASS, STR); \
197 if (POS_OR_NEG (UTF8_TEST)) { \
201 else if (POS_OR_NEG (FUNC(nextchr))) { \
204 goto increment_locinput;
206 /* Handle the non-locale cases for a character class and its complement. It
207 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
208 * This is because that code fails when the test succeeds, so we want to have
209 * the test fail so that the code succeeds. The swash is stored in a
210 * predictable PL_ place */
211 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
214 _CCC_TRY_CODE( !, FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
219 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
220 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
221 (U8*)locinput, TRUE)), \
224 /* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
234 #define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
236 NAMEA, NNAMEA, FUNCA, \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
242 PL_reg_flags |= RF_tainted; \
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
256 goto increment_locinput; \
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260 /* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262 #define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
265 NAMEA, NNAMEA, FUNCA, \
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
273 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275 /* for use after a quantifier and before an EXACT-like node -- japhy */
276 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
285 #define JUMPABLE(rn) ( \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
294 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
296 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
299 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
302 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
306 /* ... so we use this as its faster. */
307 #define IS_TEXT(rn) ( OP(rn)==EXACT )
308 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
309 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318 #define FIND_NEXT_IMPT(rn) STMT_START { \
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
322 rn = NEXTOPER(NEXTOPER(rn)); \
323 else if (type == PLUS) \
325 else if (type == IFMATCH) \
326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
327 else rn += NEXT_OFF(rn); \
332 static void restore_pos(pTHX_ void *arg);
334 #define REGCP_PAREN_ELEMS 3
335 #define REGCP_OTHER_ELEMS 3
336 #define REGCP_FRAME_ELEMS 1
337 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
338 * are needed for the regexp context stack bookkeeping. */
341 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
344 const int retval = PL_savestack_ix;
345 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
346 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
347 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
349 GET_RE_DEBUG_FLAGS_DECL;
351 PERL_ARGS_ASSERT_REGCPPUSH;
353 if (paren_elems_to_push < 0)
354 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
355 paren_elems_to_push);
357 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
358 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
359 " out of range (%lu-%ld)",
360 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
362 SSGROW(total_elems + REGCP_FRAME_ELEMS);
365 if ((int)PL_regsize > (int)parenfloor)
366 PerlIO_printf(Perl_debug_log,
367 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
372 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
373 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
374 SSPUSHINT(rex->offs[p].end);
375 SSPUSHINT(rex->offs[p].start);
376 SSPUSHINT(rex->offs[p].start_tmp);
377 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
378 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
380 (IV)rex->offs[p].start,
381 (IV)rex->offs[p].start_tmp,
385 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
386 SSPUSHINT(PL_regsize);
387 SSPUSHINT(rex->lastparen);
388 SSPUSHINT(rex->lastcloseparen);
389 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
394 /* These are needed since we do not localize EVAL nodes: */
395 #define REGCP_SET(cp) \
397 PerlIO_printf(Perl_debug_log, \
398 " Setting an EVAL scope, savestack=%"IVdf"\n", \
399 (IV)PL_savestack_ix)); \
402 #define REGCP_UNWIND(cp) \
404 if (cp != PL_savestack_ix) \
405 PerlIO_printf(Perl_debug_log, \
406 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
407 (IV)(cp), (IV)PL_savestack_ix)); \
410 #define UNWIND_PAREN(lp, lcp) \
411 for (n = rex->lastparen; n > lp; n--) \
412 rex->offs[n].end = -1; \
413 rex->lastparen = n; \
414 rex->lastcloseparen = lcp;
418 S_regcppop(pTHX_ regexp *rex)
423 GET_RE_DEBUG_FLAGS_DECL;
425 PERL_ARGS_ASSERT_REGCPPOP;
427 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
429 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
430 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
431 rex->lastcloseparen = SSPOPINT;
432 rex->lastparen = SSPOPINT;
433 PL_regsize = SSPOPINT;
435 i -= REGCP_OTHER_ELEMS;
436 /* Now restore the parentheses context. */
438 if (i || rex->lastparen + 1 <= rex->nparens)
439 PerlIO_printf(Perl_debug_log,
440 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
446 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
448 rex->offs[paren].start_tmp = SSPOPINT;
449 rex->offs[paren].start = SSPOPINT;
451 if (paren <= rex->lastparen)
452 rex->offs[paren].end = tmps;
453 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
454 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
456 (IV)rex->offs[paren].start,
457 (IV)rex->offs[paren].start_tmp,
458 (IV)rex->offs[paren].end,
459 (paren > rex->lastparen ? "(skipped)" : ""));
464 /* It would seem that the similar code in regtry()
465 * already takes care of this, and in fact it is in
466 * a better location to since this code can #if 0-ed out
467 * but the code in regtry() is needed or otherwise tests
468 * requiring null fields (pat.t#187 and split.t#{13,14}
469 * (as of patchlevel 7877) will fail. Then again,
470 * this code seems to be necessary or otherwise
471 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
472 * --jhi updated by dapm */
473 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
475 rex->offs[i].start = -1;
476 rex->offs[i].end = -1;
477 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
478 " \\%"UVuf": %s ..-1 undeffing\n",
480 (i > PL_regsize) ? "-1" : " "
486 /* restore the parens and associated vars at savestack position ix,
487 * but without popping the stack */
490 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
492 I32 tmpix = PL_savestack_ix;
493 PL_savestack_ix = ix;
495 PL_savestack_ix = tmpix;
498 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
501 * pregexec and friends
504 #ifndef PERL_IN_XSUB_RE
506 - pregexec - match a regexp against a string
509 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
510 char *strbeg, I32 minend, SV *screamer, U32 nosave)
511 /* stringarg: the point in the string at which to begin matching */
512 /* strend: pointer to null at end of string */
513 /* strbeg: real beginning of string */
514 /* minend: end of match must be >= minend bytes after stringarg. */
515 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
516 * itself is accessed via the pointers above */
517 /* nosave: For optimizations. */
519 PERL_ARGS_ASSERT_PREGEXEC;
522 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
523 nosave ? 0 : REXEC_COPY_STR);
528 * Need to implement the following flags for reg_anch:
530 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
532 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
533 * INTUIT_AUTORITATIVE_ML
534 * INTUIT_ONCE_NOML - Intuit can match in one location only.
537 * Another flag for this function: SECOND_TIME (so that float substrs
538 * with giant delta may be not rechecked).
541 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
543 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
544 Otherwise, only SvCUR(sv) is used to get strbeg. */
546 /* XXXX We assume that strpos is strbeg unless sv. */
548 /* XXXX Some places assume that there is a fixed substring.
549 An update may be needed if optimizer marks as "INTUITable"
550 RExen without fixed substrings. Similarly, it is assumed that
551 lengths of all the strings are no more than minlen, thus they
552 cannot come from lookahead.
553 (Or minlen should take into account lookahead.)
554 NOTE: Some of this comment is not correct. minlen does now take account
555 of lookahead/behind. Further research is required. -- demerphq
559 /* A failure to find a constant substring means that there is no need to make
560 an expensive call to REx engine, thus we celebrate a failure. Similarly,
561 finding a substring too deep into the string means that less calls to
562 regtry() should be needed.
564 REx compiler's optimizer found 4 possible hints:
565 a) Anchored substring;
567 c) Whether we are anchored (beginning-of-line or \G);
568 d) First node (of those at offset 0) which may distinguish positions;
569 We use a)b)d) and multiline-part of c), and try to find a position in the
570 string which does not contradict any of them.
573 /* Most of decisions we do here should have been done at compile time.
574 The nodes of the REx which we used for the search should have been
575 deleted from the finite automaton. */
578 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
579 char *strend, const U32 flags, re_scream_pos_data *data)
582 struct regexp *const prog = ReANY(rx);
584 /* Should be nonnegative! */
590 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
592 char *other_last = NULL; /* other substr checked before this */
593 char *check_at = NULL; /* check substr found at this pos */
594 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
595 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
596 RXi_GET_DECL(prog,progi);
598 const char * const i_strpos = strpos;
600 GET_RE_DEBUG_FLAGS_DECL;
602 PERL_ARGS_ASSERT_RE_INTUIT_START;
603 PERL_UNUSED_ARG(flags);
604 PERL_UNUSED_ARG(data);
606 RX_MATCH_UTF8_set(rx,utf8_target);
609 PL_reg_flags |= RF_utf8;
612 debug_start_match(rx, utf8_target, strpos, strend,
613 sv ? "Guessing start of match in sv for"
614 : "Guessing start of match in string for");
617 /* CHR_DIST() would be more correct here but it makes things slow. */
618 if (prog->minlen > strend - strpos) {
619 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
620 "String too short... [re_intuit_start]\n"));
624 /* XXX we need to pass strbeg as a separate arg: the following is
625 * guesswork and can be wrong... */
626 if (sv && SvPOK(sv)) {
627 char * p = SvPVX(sv);
628 STRLEN cur = SvCUR(sv);
629 if (p <= strpos && strpos < p + cur) {
631 assert(p <= strend && strend <= p + cur);
634 strbeg = strend - cur;
641 if (!prog->check_utf8 && prog->check_substr)
642 to_utf8_substr(prog);
643 check = prog->check_utf8;
645 if (!prog->check_substr && prog->check_utf8) {
646 if (! to_byte_substr(prog)) {
647 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
650 check = prog->check_substr;
652 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
653 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
654 || ( (prog->extflags & RXf_ANCH_BOL)
655 && !multiline ) ); /* Check after \n? */
658 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
659 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
660 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
662 && (strpos != strbeg)) {
663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
666 if (prog->check_offset_min == prog->check_offset_max
667 && !(prog->extflags & RXf_CANY_SEEN)
668 && ! multiline) /* /m can cause \n's to match that aren't
669 accounted for in the string max length.
670 See [perl #115242] */
672 /* Substring at constant offset from beg-of-str... */
675 s = HOP3c(strpos, prog->check_offset_min, strend);
678 slen = SvCUR(check); /* >= 1 */
680 if ( strend - s > slen || strend - s < slen - 1
681 || (strend - s == slen && strend[-1] != '\n')) {
682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
685 /* Now should match s[0..slen-2] */
687 if (slen && (*SvPVX_const(check) != *s
689 && memNE(SvPVX_const(check), s, slen)))) {
691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
695 else if (*SvPVX_const(check) != *s
696 || ((slen = SvCUR(check)) > 1
697 && memNE(SvPVX_const(check), s, slen)))
700 goto success_at_start;
703 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
705 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
706 end_shift = prog->check_end_shift;
709 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
710 - (SvTAIL(check) != 0);
711 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
713 if (end_shift < eshift)
717 else { /* Can match at random position */
720 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
721 end_shift = prog->check_end_shift;
723 /* end shift should be non negative here */
726 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
728 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
729 (IV)end_shift, RX_PRECOMP(prog));
733 /* Find a possible match in the region s..strend by looking for
734 the "check" substring in the region corrected by start/end_shift. */
737 I32 srch_start_shift = start_shift;
738 I32 srch_end_shift = end_shift;
741 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
742 srch_end_shift -= ((strbeg - s) - srch_start_shift);
743 srch_start_shift = strbeg - s;
745 DEBUG_OPTIMISE_MORE_r({
746 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
747 (IV)prog->check_offset_min,
748 (IV)srch_start_shift,
750 (IV)prog->check_end_shift);
753 if (prog->extflags & RXf_CANY_SEEN) {
754 start_point= (U8*)(s + srch_start_shift);
755 end_point= (U8*)(strend - srch_end_shift);
757 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
758 end_point= HOP3(strend, -srch_end_shift, strbeg);
760 DEBUG_OPTIMISE_MORE_r({
761 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
762 (int)(end_point - start_point),
763 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
767 s = fbm_instr( start_point, end_point,
768 check, multiline ? FBMrf_MULTILINE : 0);
770 /* Update the count-of-usability, remove useless subpatterns,
774 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
775 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
776 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
777 (s ? "Found" : "Did not find"),
778 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
779 ? "anchored" : "floating"),
782 (s ? " at offset " : "...\n") );
787 /* Finish the diagnostic message */
788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
790 /* XXX dmq: first branch is for positive lookbehind...
791 Our check string is offset from the beginning of the pattern.
792 So we need to do any stclass tests offset forward from that
801 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
802 Start with the other substr.
803 XXXX no SCREAM optimization yet - and a very coarse implementation
804 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
805 *always* match. Probably should be marked during compile...
806 Probably it is right to do no SCREAM here...
809 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
810 : (prog->float_substr && prog->anchored_substr))
812 /* Take into account the "other" substring. */
813 /* XXXX May be hopelessly wrong for UTF... */
816 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
819 char * const last = HOP3c(s, -start_shift, strbeg);
821 char * const saved_s = s;
824 t = s - prog->check_offset_max;
825 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
827 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
832 t = HOP3c(t, prog->anchored_offset, strend);
833 if (t < other_last) /* These positions already checked */
835 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
838 /* XXXX It is not documented what units *_offsets are in.
839 We assume bytes, but this is clearly wrong.
840 Meaning this code needs to be carefully reviewed for errors.
844 /* On end-of-str: see comment below. */
845 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
846 if (must == &PL_sv_undef) {
848 DEBUG_r(must = prog->anchored_utf8); /* for debug */
853 HOP3(HOP3(last1, prog->anchored_offset, strend)
854 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
856 multiline ? FBMrf_MULTILINE : 0
859 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
860 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
861 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
862 (s ? "Found" : "Contradicts"),
863 quoted, RE_SV_TAIL(must));
868 if (last1 >= last2) {
869 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
870 ", giving up...\n"));
873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
874 ", trying floating at offset %ld...\n",
875 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
876 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
877 s = HOP3c(last, 1, strend);
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
882 (long)(s - i_strpos)));
883 t = HOP3c(s, -prog->anchored_offset, strbeg);
884 other_last = HOP3c(s, 1, strend);
892 else { /* Take into account the floating substring. */
894 char * const saved_s = s;
897 t = HOP3c(s, -start_shift, strbeg);
899 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
900 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
901 last = HOP3c(t, prog->float_max_offset, strend);
902 s = HOP3c(t, prog->float_min_offset, strend);
905 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
906 must = utf8_target ? prog->float_utf8 : prog->float_substr;
907 /* fbm_instr() takes into account exact value of end-of-str
908 if the check is SvTAIL(ed). Since false positives are OK,
909 and end-of-str is not later than strend we are OK. */
910 if (must == &PL_sv_undef) {
912 DEBUG_r(must = prog->float_utf8); /* for debug message */
915 s = fbm_instr((unsigned char*)s,
916 (unsigned char*)last + SvCUR(must)
918 must, multiline ? FBMrf_MULTILINE : 0);
920 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
921 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
922 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
923 (s ? "Found" : "Contradicts"),
924 quoted, RE_SV_TAIL(must));
928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
929 ", giving up...\n"));
932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
933 ", trying anchored starting at offset %ld...\n",
934 (long)(saved_s + 1 - i_strpos)));
936 s = HOP3c(t, 1, strend);
940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
941 (long)(s - i_strpos)));
942 other_last = s; /* Fix this later. --Hugo */
952 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
954 DEBUG_OPTIMISE_MORE_r(
955 PerlIO_printf(Perl_debug_log,
956 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
957 (IV)prog->check_offset_min,
958 (IV)prog->check_offset_max,
966 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
968 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
971 /* Fixed substring is found far enough so that the match
972 cannot start at strpos. */
974 if (ml_anch && t[-1] != '\n') {
975 /* Eventually fbm_*() should handle this, but often
976 anchored_offset is not 0, so this check will not be wasted. */
977 /* XXXX In the code below we prefer to look for "^" even in
978 presence of anchored substrings. And we search even
979 beyond the found float position. These pessimizations
980 are historical artefacts only. */
982 while (t < strend - prog->minlen) {
984 if (t < check_at - prog->check_offset_min) {
985 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
986 /* Since we moved from the found position,
987 we definitely contradict the found anchored
988 substr. Due to the above check we do not
989 contradict "check" substr.
990 Thus we can arrive here only if check substr
991 is float. Redo checking for "other"=="fixed".
994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
995 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
996 goto do_other_anchored;
998 /* We don't contradict the found floating substring. */
999 /* XXXX Why not check for STCLASS? */
1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1002 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1005 /* Position contradicts check-string */
1006 /* XXXX probably better to look for check-string
1007 than for "\n", so one should lower the limit for t? */
1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1009 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1010 other_last = strpos = s = t + 1;
1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1016 PL_colors[0], PL_colors[1]));
1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1021 PL_colors[0], PL_colors[1]));
1025 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1028 /* The found string does not prohibit matching at strpos,
1029 - no optimization of calling REx engine can be performed,
1030 unless it was an MBOL and we are not after MBOL,
1031 or a future STCLASS check will fail this. */
1033 /* Even in this situation we may use MBOL flag if strpos is offset
1034 wrt the start of the string. */
1035 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1036 && (strpos != strbeg) && strpos[-1] != '\n'
1037 /* May be due to an implicit anchor of m{.*foo} */
1038 && !(prog->intflags & PREGf_IMPLICIT))
1043 DEBUG_EXECUTE_r( if (ml_anch)
1044 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1045 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1048 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1050 prog->check_utf8 /* Could be deleted already */
1051 && --BmUSEFUL(prog->check_utf8) < 0
1052 && (prog->check_utf8 == prog->float_utf8)
1054 prog->check_substr /* Could be deleted already */
1055 && --BmUSEFUL(prog->check_substr) < 0
1056 && (prog->check_substr == prog->float_substr)
1059 /* If flags & SOMETHING - do not do it many times on the same match */
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1061 /* XXX Does the destruction order has to change with utf8_target? */
1062 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1064 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1065 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1066 check = NULL; /* abort */
1068 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1069 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070 if (prog->intflags & PREGf_IMPLICIT)
1071 prog->extflags &= ~RXf_ANCH_MBOL;
1072 /* XXXX This is a remnant of the old implementation. It
1073 looks wasteful, since now INTUIT can use many
1074 other heuristics. */
1075 prog->extflags &= ~RXf_USE_INTUIT;
1076 /* XXXX What other flags might need to be cleared in this branch? */
1082 /* Last resort... */
1083 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1084 /* trie stclasses are too expensive to use here, we are better off to
1085 leave it to regmatch itself */
1086 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1087 /* minlen == 0 is possible if regstclass is \b or \B,
1088 and the fixed substr is ''$.
1089 Since minlen is already taken into account, s+1 is before strend;
1090 accidentally, minlen >= 1 guaranties no false positives at s + 1
1091 even for \b or \B. But (minlen? 1 : 0) below assumes that
1092 regstclass does not come from lookahead... */
1093 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1094 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1095 const U8* const str = (U8*)STRING(progi->regstclass);
1096 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1100 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102 else if (prog->float_substr || prog->float_utf8)
1103 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1107 if (checked_upto < s)
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1113 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1118 const char *what = NULL;
1120 if (endpos == strend) {
1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122 "Could not match STCLASS...\n") );
1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126 "This position contradicts STCLASS...\n") );
1127 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1129 checked_upto = HOPBACKc(endpos, start_shift);
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132 /* Contradict one of substrings */
1133 if (prog->anchored_substr || prog->anchored_utf8) {
1134 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1135 DEBUG_EXECUTE_r( what = "anchored" );
1137 s = HOP3c(t, 1, strend);
1138 if (s + start_shift + end_shift > strend) {
1139 /* XXXX Should be taken into account earlier? */
1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141 "Could not match STCLASS...\n") );
1146 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147 "Looking for %s substr starting at offset %ld...\n",
1148 what, (long)(s + start_shift - i_strpos)) );
1151 /* Have both, check_string is floating */
1152 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153 goto retry_floating_check;
1154 /* Recheck anchored substring, but not floating... */
1158 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1159 "Looking for anchored substr starting at offset %ld...\n",
1160 (long)(other_last - i_strpos)) );
1161 goto do_other_anchored;
1163 /* Another way we could have checked stclass at the
1164 current position only: */
1169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1170 "Looking for /%s^%s/m starting at offset %ld...\n",
1171 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1174 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1176 /* Check is floating substring. */
1177 retry_floating_check:
1178 t = check_at - start_shift;
1179 DEBUG_EXECUTE_r( what = "floating" );
1180 goto hop_and_restart;
1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184 "By STCLASS: moving %ld --> %ld\n",
1185 (long)(t - i_strpos), (long)(s - i_strpos))
1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190 "Does not contradict STCLASS...\n");
1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1196 PL_colors[4], (check ? "Guessed" : "Giving up"),
1197 PL_colors[5], (long)(s - i_strpos)) );
1200 fail_finish: /* Substring not found */
1201 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1202 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1205 PL_colors[4], PL_colors[5]));
1209 #define DECL_TRIE_TYPE(scan) \
1210 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1211 trie_type = ((scan->flags == EXACT) \
1212 ? (utf8_target ? trie_utf8 : trie_plain) \
1213 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1215 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1216 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1218 switch (trie_type) { \
1219 case trie_utf8_fold: \
1220 if ( foldlen>0 ) { \
1221 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1226 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1227 len = UTF8SKIP(uc); \
1228 skiplen = UNISKIP( uvc ); \
1229 foldlen -= skiplen; \
1230 uscan = foldbuf + skiplen; \
1233 case trie_latin_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1241 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1248 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1255 charid = trie->charmap[ uvc ]; \
1259 if (widecharmap) { \
1260 SV** const svpp = hv_fetch(widecharmap, \
1261 (char*)&uvc, sizeof(UV), 0); \
1263 charid = (U16)SvIV(*svpp); \
1268 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1272 && (ln == 1 || folder(s, pat_string, ln)) \
1273 && (!reginfo || regtry(reginfo, &s)) ) \
1279 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1281 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1287 #define REXEC_FBC_SCAN(CoDe) \
1289 while (s < strend) { \
1295 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1296 REXEC_FBC_UTF8_SCAN( \
1298 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1307 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1310 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1319 #define REXEC_FBC_TRYIT \
1320 if ((!reginfo || regtry(reginfo, &s))) \
1323 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1324 if (utf8_target) { \
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
1331 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1332 if (utf8_target) { \
1334 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1337 REXEC_FBC_CLASS_SCAN(CoNd); \
1340 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1341 PL_reg_flags |= RF_tainted; \
1342 if (utf8_target) { \
1343 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1346 REXEC_FBC_CLASS_SCAN(CoNd); \
1349 #define DUMP_EXEC_POS(li,s,doutf8) \
1350 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1353 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1355 tmp = TEST_NON_UTF8(tmp); \
1356 REXEC_FBC_UTF8_SCAN( \
1357 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1366 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367 if (s == PL_bostr) { \
1371 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1372 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1375 LOAD_UTF8_CHARCLASS_ALNUM(); \
1376 REXEC_FBC_UTF8_SCAN( \
1377 if (tmp == ! (TeSt2_UtF8)) { \
1386 /* The only difference between the BOUND and NBOUND cases is that
1387 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388 * NBOUND. This is accomplished by passing it in either the if or else clause,
1389 * with the other one being empty */
1390 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1393 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1396 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1399 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1403 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1404 * be passed in completely with the variable name being tested, which isn't
1405 * such a clean interface, but this is easier to read than it was before. We
1406 * are looking for the boundary (or non-boundary between a word and non-word
1407 * character. The utf8 and non-utf8 cases have the same logic, but the details
1408 * must be different. Find the "wordness" of the character just prior to this
1409 * one, and compare it with the wordness of this one. If they differ, we have
1410 * a boundary. At the beginning of the string, pretend that the previous
1411 * character was a new-line */
1412 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1413 if (utf8_target) { \
1416 else { /* Not utf8 */ \
1417 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1418 tmp = TEST_NON_UTF8(tmp); \
1420 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1429 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1432 /* We know what class REx starts with. Try to find this position... */
1433 /* if reginfo is NULL, its a dryrun */
1434 /* annoyingly all the vars in this routine have different names from their counterparts
1435 in regmatch. /grrr */
1438 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1439 const char *strend, regmatch_info *reginfo)
1442 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1443 char *pat_string; /* The pattern's exactish string */
1444 char *pat_end; /* ptr to end char of pat_string */
1445 re_fold_t folder; /* Function for computing non-utf8 folds */
1446 const U8 *fold_array; /* array for folding ords < 256 */
1453 I32 tmp = 1; /* Scratch variable? */
1454 const bool utf8_target = PL_reg_match_utf8;
1455 UV utf8_fold_flags = 0;
1456 RXi_GET_DECL(prog,progi);
1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
1460 /* We know what class it must start with. */
1464 REXEC_FBC_UTF8_CLASS_SCAN(
1465 reginclass(prog, c, (U8*)s, utf8_target));
1468 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1473 if (tmp && (!reginfo || regtry(reginfo, &s)))
1481 if (UTF_PATTERN || utf8_target) {
1482 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483 goto do_exactf_utf8;
1485 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1486 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1487 goto do_exactf_non_utf8; /* isn't dealt with by these */
1492 /* regcomp.c already folded this if pattern is in UTF-8 */
1493 utf8_fold_flags = 0;
1494 goto do_exactf_utf8;
1496 fold_array = PL_fold;
1498 goto do_exactf_non_utf8;
1501 if (UTF_PATTERN || utf8_target) {
1502 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1503 goto do_exactf_utf8;
1505 fold_array = PL_fold_locale;
1506 folder = foldEQ_locale;
1507 goto do_exactf_non_utf8;
1511 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1513 goto do_exactf_utf8;
1515 case EXACTFU_TRICKYFOLD:
1517 if (UTF_PATTERN || utf8_target) {
1518 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1519 goto do_exactf_utf8;
1522 /* Any 'ss' in the pattern should have been replaced by regcomp,
1523 * so we don't have to worry here about this single special case
1524 * in the Latin1 range */
1525 fold_array = PL_fold_latin1;
1526 folder = foldEQ_latin1;
1530 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531 are no glitches with fold-length differences
1532 between the target string and pattern */
1534 /* The idea in the non-utf8 EXACTF* cases is to first find the
1535 * first character of the EXACTF* node and then, if necessary,
1536 * case-insensitively compare the full text of the node. c1 is the
1537 * first character. c2 is its fold. This logic will not work for
1538 * Unicode semantics and the german sharp ss, which hence should
1539 * not be compiled into a node that gets here. */
1540 pat_string = STRING(c);
1541 ln = STR_LEN(c); /* length to match in octets/bytes */
1543 /* We know that we have to match at least 'ln' bytes (which is the
1544 * same as characters, since not utf8). If we have to match 3
1545 * characters, and there are only 2 availabe, we know without
1546 * trying that it will fail; so don't start a match past the
1547 * required minimum number from the far end */
1548 e = HOP3c(strend, -((I32)ln), s);
1550 if (!reginfo && e < s) {
1551 e = s; /* Due to minlen logic of intuit() */
1555 c2 = fold_array[c1];
1556 if (c1 == c2) { /* If char and fold are the same */
1557 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1560 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1569 /* If one of the operands is in utf8, we can't use the simpler
1570 * folding above, due to the fact that many different characters
1571 * can have the same fold, or portion of a fold, or different-
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
1576 lnc = (UTF_PATTERN) /* length to match in characters */
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590 lnc = (lnc + expansion - 1) / expansion;
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1596 e = HOP3c(strend, -((I32)lnc), s);
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1612 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613 && (!reginfo || regtry(reginfo, &s)) )
1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
1622 PL_reg_flags |= RF_tainted;
1623 FBC_BOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
1628 PL_reg_flags |= RF_tainted;
1629 FBC_NBOUND(isALNUM_LC,
1630 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isALNUM_LC_utf8((U8*)s));
1634 FBC_BOUND(isWORDCHAR,
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1641 isWORDCHAR_A((U8*)s));
1644 FBC_NBOUND(isWORDCHAR,
1646 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1651 isWORDCHAR_A((U8*)s));
1654 FBC_BOUND(isWORDCHAR_L1,
1656 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1659 FBC_NBOUND(isWORDCHAR_L1,
1661 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1664 REXEC_FBC_CSCAN_TAINT(
1665 isALNUM_LC_utf8((U8*)s),
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1673 isWORDCHAR_L1((U8) *s)
1677 REXEC_FBC_CSCAN_PRELOAD(
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1684 /* Don't need to worry about utf8, as it can match only a single
1685 * byte invariant character */
1686 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1689 REXEC_FBC_CSCAN_PRELOAD(
1690 LOAD_UTF8_CHARCLASS_ALNUM(),
1691 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1692 ! isWORDCHAR_L1((U8) *s)
1696 REXEC_FBC_CSCAN_PRELOAD(
1697 LOAD_UTF8_CHARCLASS_ALNUM(),
1698 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1709 REXEC_FBC_CSCAN_TAINT(
1710 !isALNUM_LC_utf8((U8*)s),
1715 REXEC_FBC_CSCAN_PRELOAD(
1716 LOAD_UTF8_CHARCLASS_SPACE(),
1717 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1722 REXEC_FBC_CSCAN_PRELOAD(
1723 LOAD_UTF8_CHARCLASS_SPACE(),
1724 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1729 /* Don't need to worry about utf8, as it can match only a single
1730 * byte invariant character */
1731 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1734 REXEC_FBC_CSCAN_TAINT(
1735 isSPACE_LC_utf8((U8*)s),
1740 REXEC_FBC_CSCAN_PRELOAD(
1741 LOAD_UTF8_CHARCLASS_SPACE(),
1742 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1743 ! isSPACE_L1((U8) *s)
1747 REXEC_FBC_CSCAN_PRELOAD(
1748 LOAD_UTF8_CHARCLASS_SPACE(),
1749 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1760 REXEC_FBC_CSCAN_TAINT(
1761 !isSPACE_LC_utf8((U8*)s),
1766 REXEC_FBC_CSCAN_PRELOAD(
1767 LOAD_UTF8_CHARCLASS_DIGIT(),
1768 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1773 /* Don't need to worry about utf8, as it can match only a single
1774 * byte invariant character */
1775 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1778 REXEC_FBC_CSCAN_TAINT(
1779 isDIGIT_LC_utf8((U8*)s),
1784 REXEC_FBC_CSCAN_PRELOAD(
1785 LOAD_UTF8_CHARCLASS_DIGIT(),
1786 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1797 REXEC_FBC_CSCAN_TAINT(
1798 !isDIGIT_LC_utf8((U8*)s),
1804 is_LNBREAK_utf8_safe(s, strend),
1805 is_LNBREAK_latin1_safe(s, strend)
1810 is_VERTWS_utf8_safe(s, strend),
1811 is_VERTWS_latin1_safe(s, strend)
1816 !is_VERTWS_utf8_safe(s, strend),
1817 !is_VERTWS_latin1_safe(s, strend)
1822 is_HORIZWS_utf8_safe(s, strend),
1823 is_HORIZWS_latin1_safe(s, strend)
1828 !is_HORIZWS_utf8_safe(s, strend),
1829 !is_HORIZWS_latin1_safe(s, strend)
1833 /* Don't need to worry about utf8, as it can match only a single
1834 * byte invariant character. The flag in this node type is the
1835 * class number to pass to _generic_isCC() to build a mask for
1836 * searching in PL_charclass[] */
1837 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1841 !_generic_isCC_A(*s, FLAGS(c)),
1842 !_generic_isCC_A(*s, FLAGS(c))
1850 /* what trie are we using right now */
1852 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1854 = (reg_trie_data*)progi->data->data[ aho->trie ];
1855 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1857 const char *last_start = strend - trie->minlen;
1859 const char *real_start = s;
1861 STRLEN maxlen = trie->maxlen;
1863 U8 **points; /* map of where we were in the input string
1864 when reading a given char. For ASCII this
1865 is unnecessary overhead as the relationship
1866 is always 1:1, but for Unicode, especially
1867 case folded Unicode this is not true. */
1868 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1872 GET_RE_DEBUG_FLAGS_DECL;
1874 /* We can't just allocate points here. We need to wrap it in
1875 * an SV so it gets freed properly if there is a croak while
1876 * running the match */
1879 sv_points=newSV(maxlen * sizeof(U8 *));
1880 SvCUR_set(sv_points,
1881 maxlen * sizeof(U8 *));
1882 SvPOK_on(sv_points);
1883 sv_2mortal(sv_points);
1884 points=(U8**)SvPV_nolen(sv_points );
1885 if ( trie_type != trie_utf8_fold
1886 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1889 bitmap=(U8*)trie->bitmap;
1891 bitmap=(U8*)ANYOF_BITMAP(c);
1893 /* this is the Aho-Corasick algorithm modified a touch
1894 to include special handling for long "unknown char"
1895 sequences. The basic idea being that we use AC as long
1896 as we are dealing with a possible matching char, when
1897 we encounter an unknown char (and we have not encountered
1898 an accepting state) we scan forward until we find a legal
1900 AC matching is basically that of trie matching, except
1901 that when we encounter a failing transition, we fall back
1902 to the current states "fail state", and try the current char
1903 again, a process we repeat until we reach the root state,
1904 state 1, or a legal transition. If we fail on the root state
1905 then we can either terminate if we have reached an accepting
1906 state previously, or restart the entire process from the beginning
1910 while (s <= last_start) {
1911 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1919 U8 *uscan = (U8*)NULL;
1920 U8 *leftmost = NULL;
1922 U32 accepted_word= 0;
1926 while ( state && uc <= (U8*)strend ) {
1928 U32 word = aho->states[ state ].wordnum;
1932 DEBUG_TRIE_EXECUTE_r(
1933 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1934 dump_exec_pos( (char *)uc, c, strend, real_start,
1935 (char *)uc, utf8_target );
1936 PerlIO_printf( Perl_debug_log,
1937 " Scanning for legal start char...\n");
1941 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1945 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1951 if (uc >(U8*)last_start) break;
1955 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1956 if (!leftmost || lpos < leftmost) {
1957 DEBUG_r(accepted_word=word);
1963 points[pointpos++ % maxlen]= uc;
1964 if (foldlen || uc < (U8*)strend) {
1965 REXEC_TRIE_READ_CHAR(trie_type, trie,
1967 uscan, len, uvc, charid, foldlen,
1969 DEBUG_TRIE_EXECUTE_r({
1970 dump_exec_pos( (char *)uc, c, strend,
1971 real_start, s, utf8_target);
1972 PerlIO_printf(Perl_debug_log,
1973 " Charid:%3u CP:%4"UVxf" ",
1985 word = aho->states[ state ].wordnum;
1987 base = aho->states[ state ].trans.base;
1989 DEBUG_TRIE_EXECUTE_r({
1991 dump_exec_pos( (char *)uc, c, strend, real_start,
1993 PerlIO_printf( Perl_debug_log,
1994 "%sState: %4"UVxf", word=%"UVxf,
1995 failed ? " Fail transition to " : "",
1996 (UV)state, (UV)word);
2002 ( ((offset = base + charid
2003 - 1 - trie->uniquecharcount)) >= 0)
2004 && ((U32)offset < trie->lasttrans)
2005 && trie->trans[offset].check == state
2006 && (tmp=trie->trans[offset].next))
2008 DEBUG_TRIE_EXECUTE_r(
2009 PerlIO_printf( Perl_debug_log," - legal\n"));
2014 DEBUG_TRIE_EXECUTE_r(
2015 PerlIO_printf( Perl_debug_log," - fail\n"));
2017 state = aho->fail[state];
2021 /* we must be accepting here */
2022 DEBUG_TRIE_EXECUTE_r(
2023 PerlIO_printf( Perl_debug_log," - accepting\n"));
2032 if (!state) state = 1;
2035 if ( aho->states[ state ].wordnum ) {
2036 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2037 if (!leftmost || lpos < leftmost) {
2038 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2043 s = (char*)leftmost;
2044 DEBUG_TRIE_EXECUTE_r({
2046 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2047 (UV)accepted_word, (IV)(s - real_start)
2050 if (!reginfo || regtry(reginfo, &s)) {
2056 DEBUG_TRIE_EXECUTE_r({
2057 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2060 DEBUG_TRIE_EXECUTE_r(
2061 PerlIO_printf( Perl_debug_log,"No match.\n"));
2070 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2080 - regexec_flags - match a regexp against a string
2083 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2084 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2085 /* stringarg: the point in the string at which to begin matching */
2086 /* strend: pointer to null at end of string */
2087 /* strbeg: real beginning of string */
2088 /* minend: end of match must be >= minend bytes after stringarg. */
2089 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2090 * itself is accessed via the pointers above */
2091 /* data: May be used for some additional optimizations.
2092 Currently its only used, with a U32 cast, for transmitting
2093 the ganch offset when doing a /g match. This will change */
2094 /* nosave: For optimizations. */
2098 struct regexp *const prog = ReANY(rx);
2099 /*register*/ char *s;
2101 /*register*/ char *startpos = stringarg;
2102 I32 minlen; /* must match at least this many chars */
2103 I32 dontbother = 0; /* how many characters not to try at end */
2104 I32 end_shift = 0; /* Same for the end. */ /* CC */
2105 I32 scream_pos = -1; /* Internal iterator of scream. */
2106 char *scream_olds = NULL;
2107 const bool utf8_target = cBOOL(DO_UTF8(sv));
2109 RXi_GET_DECL(prog,progi);
2110 regmatch_info reginfo; /* create some info to pass to regtry etc */
2111 regexp_paren_pair *swap = NULL;
2112 GET_RE_DEBUG_FLAGS_DECL;
2114 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2115 PERL_UNUSED_ARG(data);
2117 /* Be paranoid... */
2118 if (prog == NULL || startpos == NULL) {
2119 Perl_croak(aTHX_ "NULL regexp parameter");
2123 multiline = prog->extflags & RXf_PMf_MULTILINE;
2124 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2126 RX_MATCH_UTF8_set(rx, utf8_target);
2128 debug_start_match(rx, utf8_target, startpos, strend,
2132 minlen = prog->minlen;
2134 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2135 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2136 "String too short [regexec_flags]...\n"));
2141 /* Check validity of program. */
2142 if (UCHARAT(progi->program) != REG_MAGIC) {
2143 Perl_croak(aTHX_ "corrupted regexp program");
2147 PL_reg_state.re_state_eval_setup_done = FALSE;
2151 PL_reg_flags |= RF_utf8;
2153 /* Mark beginning of line for ^ and lookbehind. */
2154 reginfo.bol = startpos; /* XXX not used ??? */
2158 /* Mark end of line for $ (and such) */
2161 /* see how far we have to get to not match where we matched before */
2162 reginfo.till = startpos+minend;
2164 /* If there is a "must appear" string, look for it. */
2167 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2169 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2170 reginfo.ganch = startpos + prog->gofs;
2171 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2172 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2173 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2175 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2176 && mg->mg_len >= 0) {
2177 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2178 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2179 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2181 if (prog->extflags & RXf_ANCH_GPOS) {
2182 if (s > reginfo.ganch)
2184 s = reginfo.ganch - prog->gofs;
2185 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2186 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2192 reginfo.ganch = strbeg + PTR2UV(data);
2193 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2196 } else { /* pos() not defined */
2197 reginfo.ganch = strbeg;
2198 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2199 "GPOS: reginfo.ganch = strbeg\n"));
2202 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2203 /* We have to be careful. If the previous successful match
2204 was from this regex we don't want a subsequent partially
2205 successful match to clobber the old results.
2206 So when we detect this possibility we add a swap buffer
2207 to the re, and switch the buffer each match. If we fail
2208 we switch it back, otherwise we leave it swapped.
2211 /* do we need a save destructor here for eval dies? */
2212 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2213 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2214 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2220 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2221 re_scream_pos_data d;
2223 d.scream_olds = &scream_olds;
2224 d.scream_pos = &scream_pos;
2225 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2227 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2228 goto phooey; /* not present */
2234 /* Simplest case: anchored match need be tried only once. */
2235 /* [unless only anchor is BOL and multiline is set] */
2236 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2237 if (s == startpos && regtry(®info, &startpos))
2239 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2240 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2245 dontbother = minlen - 1;
2246 end = HOP3c(strend, -dontbother, strbeg) - 1;
2247 /* for multiline we only have to try after newlines */
2248 if (prog->check_substr || prog->check_utf8) {
2249 /* because of the goto we can not easily reuse the macros for bifurcating the
2250 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2253 goto after_try_utf8;
2255 if (regtry(®info, &s)) {
2262 if (prog->extflags & RXf_USE_INTUIT) {
2263 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2272 } /* end search for check string in unicode */
2274 if (s == startpos) {
2275 goto after_try_latin;
2278 if (regtry(®info, &s)) {
2285 if (prog->extflags & RXf_USE_INTUIT) {
2286 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2295 } /* end search for check string in latin*/
2296 } /* end search for check string */
2297 else { /* search for newline */
2299 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2302 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2303 while (s <= end) { /* note it could be possible to match at the end of the string */
2304 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2305 if (regtry(®info, &s))
2309 } /* end search for newline */
2310 } /* end anchored/multiline check string search */
2312 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2314 /* the warning about reginfo.ganch being used without initialization
2315 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2316 and we only enter this block when the same bit is set. */
2317 char *tmp_s = reginfo.ganch - prog->gofs;
2319 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2324 /* Messy cases: unanchored match. */
2325 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2326 /* we have /x+whatever/ */
2327 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2333 if (! prog->anchored_utf8) {
2334 to_utf8_substr(prog);
2336 ch = SvPVX_const(prog->anchored_utf8)[0];
2339 DEBUG_EXECUTE_r( did_match = 1 );
2340 if (regtry(®info, &s)) goto got_it;
2342 while (s < strend && *s == ch)
2349 if (! prog->anchored_substr) {
2350 if (! to_byte_substr(prog)) {
2351 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2354 ch = SvPVX_const(prog->anchored_substr)[0];
2357 DEBUG_EXECUTE_r( did_match = 1 );
2358 if (regtry(®info, &s)) goto got_it;
2360 while (s < strend && *s == ch)
2365 DEBUG_EXECUTE_r(if (!did_match)
2366 PerlIO_printf(Perl_debug_log,
2367 "Did not find anchored character...\n")
2370 else if (prog->anchored_substr != NULL
2371 || prog->anchored_utf8 != NULL
2372 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2373 && prog->float_max_offset < strend - s)) {
2378 char *last1; /* Last position checked before */
2382 if (prog->anchored_substr || prog->anchored_utf8) {
2384 if (! prog->anchored_utf8) {
2385 to_utf8_substr(prog);
2387 must = prog->anchored_utf8;
2390 if (! prog->anchored_substr) {
2391 if (! to_byte_substr(prog)) {
2392 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2395 must = prog->anchored_substr;
2397 back_max = back_min = prog->anchored_offset;
2400 if (! prog->float_utf8) {
2401 to_utf8_substr(prog);
2403 must = prog->float_utf8;
2406 if (! prog->float_substr) {
2407 if (! to_byte_substr(prog)) {
2408 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2411 must = prog->float_substr;
2413 back_max = prog->float_max_offset;
2414 back_min = prog->float_min_offset;
2420 last = HOP3c(strend, /* Cannot start after this */
2421 -(I32)(CHR_SVLEN(must)
2422 - (SvTAIL(must) != 0) + back_min), strbeg);
2425 last1 = HOPc(s, -1);
2427 last1 = s - 1; /* bogus */
2429 /* XXXX check_substr already used to find "s", can optimize if
2430 check_substr==must. */
2432 dontbother = end_shift;
2433 strend = HOPc(strend, -dontbother);
2434 while ( (s <= last) &&
2435 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2436 (unsigned char*)strend, must,
2437 multiline ? FBMrf_MULTILINE : 0)) ) {
2438 DEBUG_EXECUTE_r( did_match = 1 );
2439 if (HOPc(s, -back_max) > last1) {
2440 last1 = HOPc(s, -back_min);
2441 s = HOPc(s, -back_max);
2444 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2446 last1 = HOPc(s, -back_min);
2450 while (s <= last1) {
2451 if (regtry(®info, &s))
2454 s++; /* to break out of outer loop */
2461 while (s <= last1) {
2462 if (regtry(®info, &s))
2468 DEBUG_EXECUTE_r(if (!did_match) {
2469 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2470 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2472 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2473 ? "anchored" : "floating"),
2474 quoted, RE_SV_TAIL(must));
2478 else if ( (c = progi->regstclass) ) {
2480 const OPCODE op = OP(progi->regstclass);
2481 /* don't bother with what can't match */
2482 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2483 strend = HOPc(strend, -(minlen - 1));
2486 SV * const prop = sv_newmortal();
2487 regprop(prog, prop, c);
2489 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2491 PerlIO_printf(Perl_debug_log,
2492 "Matching stclass %.*s against %s (%d bytes)\n",
2493 (int)SvCUR(prop), SvPVX_const(prop),
2494 quoted, (int)(strend - s));
2497 if (find_byclass(prog, c, s, strend, ®info))
2499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2503 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2511 if (! prog->float_utf8) {
2512 to_utf8_substr(prog);
2514 float_real = prog->float_utf8;
2517 if (! prog->float_substr) {
2518 if (! to_byte_substr(prog)) {
2519 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2522 float_real = prog->float_substr;
2525 little = SvPV_const(float_real, len);
2526 if (SvTAIL(float_real)) {
2527 /* This means that float_real contains an artificial \n on
2528 * the end due to the presence of something like this:
2529 * /foo$/ where we can match both "foo" and "foo\n" at the
2530 * end of the string. So we have to compare the end of the
2531 * string first against the float_real without the \n and
2532 * then against the full float_real with the string. We
2533 * have to watch out for cases where the string might be
2534 * smaller than the float_real or the float_real without
2536 char *checkpos= strend - len;
2538 PerlIO_printf(Perl_debug_log,
2539 "%sChecking for float_real.%s\n",
2540 PL_colors[4], PL_colors[5]));
2541 if (checkpos + 1 < strbeg) {
2542 /* can't match, even if we remove the trailing \n
2543 * string is too short to match */
2545 PerlIO_printf(Perl_debug_log,
2546 "%sString shorter than required trailing substring, cannot match.%s\n",
2547 PL_colors[4], PL_colors[5]));
2549 } else if (memEQ(checkpos + 1, little, len - 1)) {
2550 /* can match, the end of the string matches without the
2552 last = checkpos + 1;
2553 } else if (checkpos < strbeg) {
2554 /* cant match, string is too short when the "\n" is
2557 PerlIO_printf(Perl_debug_log,
2558 "%sString does not contain required trailing substring, cannot match.%s\n",
2559 PL_colors[4], PL_colors[5]));
2561 } else if (!multiline) {
2562 /* non multiline match, so compare with the "\n" at the
2563 * end of the string */
2564 if (memEQ(checkpos, little, len)) {
2568 PerlIO_printf(Perl_debug_log,
2569 "%sString does not contain required trailing substring, cannot match.%s\n",
2570 PL_colors[4], PL_colors[5]));
2574 /* multiline match, so we have to search for a place
2575 * where the full string is located */
2581 last = rninstr(s, strend, little, little + len);
2583 last = strend; /* matching "$" */
2586 /* at one point this block contained a comment which was
2587 * probably incorrect, which said that this was a "should not
2588 * happen" case. Even if it was true when it was written I am
2589 * pretty sure it is not anymore, so I have removed the comment
2590 * and replaced it with this one. Yves */
2592 PerlIO_printf(Perl_debug_log,
2593 "String does not contain required substring, cannot match.\n"
2597 dontbother = strend - last + prog->float_min_offset;
2599 if (minlen && (dontbother < minlen))
2600 dontbother = minlen - 1;
2601 strend -= dontbother; /* this one's always in bytes! */
2602 /* We don't know much -- general case. */
2605 if (regtry(®info, &s))
2614 if (regtry(®info, &s))
2616 } while (s++ < strend);
2626 PerlIO_printf(Perl_debug_log,
2627 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2633 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2635 if (PL_reg_state.re_state_eval_setup_done)
2636 restore_pos(aTHX_ prog);
2637 if (RXp_PAREN_NAMES(prog))
2638 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2640 /* make sure $`, $&, $', and $digit will work later */
2641 if ( !(flags & REXEC_NOT_FIRST) ) {
2642 if (flags & REXEC_COPY_STR) {
2643 #ifdef PERL_OLD_COPY_ON_WRITE
2645 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2647 PerlIO_printf(Perl_debug_log,
2648 "Copy on write: regexp capture, type %d\n",
2651 RX_MATCH_COPY_FREE(rx);
2652 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2653 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2654 assert (SvPOKp(prog->saved_copy));
2655 prog->sublen = PL_regeol - strbeg;
2656 prog->suboffset = 0;
2657 prog->subcoffset = 0;
2662 I32 max = PL_regeol - strbeg;
2665 if ( (flags & REXEC_COPY_SKIP_POST)
2666 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2667 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2668 ) { /* don't copy $' part of string */
2671 /* calculate the right-most part of the string covered
2672 * by a capture. Due to look-ahead, this may be to
2673 * the right of $&, so we have to scan all captures */
2674 while (n <= prog->lastparen) {
2675 if (prog->offs[n].end > max)
2676 max = prog->offs[n].end;
2680 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2681 ? prog->offs[0].start
2683 assert(max >= 0 && max <= PL_regeol - strbeg);
2686 if ( (flags & REXEC_COPY_SKIP_PRE)
2687 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2688 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2689 ) { /* don't copy $` part of string */
2692 /* calculate the left-most part of the string covered
2693 * by a capture. Due to look-behind, this may be to
2694 * the left of $&, so we have to scan all captures */
2695 while (min && n <= prog->lastparen) {
2696 if ( prog->offs[n].start != -1
2697 && prog->offs[n].start < min)
2699 min = prog->offs[n].start;
2703 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2704 && min > prog->offs[0].end
2706 min = prog->offs[0].end;
2710 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2713 if (RX_MATCH_COPIED(rx)) {
2714 if (sublen > prog->sublen)
2716 (char*)saferealloc(prog->subbeg, sublen+1);
2719 prog->subbeg = (char*)safemalloc(sublen+1);
2720 Copy(strbeg + min, prog->subbeg, sublen, char);
2721 prog->subbeg[sublen] = '\0';
2722 prog->suboffset = min;
2723 prog->sublen = sublen;
2724 RX_MATCH_COPIED_on(rx);
2726 prog->subcoffset = prog->suboffset;
2727 if (prog->suboffset && utf8_target) {
2728 /* Convert byte offset to chars.
2729 * XXX ideally should only compute this if @-/@+
2730 * has been seen, a la PL_sawampersand ??? */
2732 /* If there's a direct correspondence between the
2733 * string which we're matching and the original SV,
2734 * then we can use the utf8 len cache associated with
2735 * the SV. In particular, it means that under //g,
2736 * sv_pos_b2u() will use the previously cached
2737 * position to speed up working out the new length of
2738 * subcoffset, rather than counting from the start of
2739 * the string each time. This stops
2740 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2741 * from going quadratic */
2742 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2743 sv_pos_b2u(sv, &(prog->subcoffset));
2745 prog->subcoffset = utf8_length((U8*)strbeg,
2746 (U8*)(strbeg+prog->suboffset));
2750 RX_MATCH_COPY_FREE(rx);
2751 prog->subbeg = strbeg;
2752 prog->suboffset = 0;
2753 prog->subcoffset = 0;
2754 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2761 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2762 PL_colors[4], PL_colors[5]));
2763 if (PL_reg_state.re_state_eval_setup_done)
2764 restore_pos(aTHX_ prog);
2766 /* we failed :-( roll it back */
2767 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2768 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2773 Safefree(prog->offs);
2780 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2781 * Do inc before dec, in case old and new rex are the same */
2782 #define SET_reg_curpm(Re2) \
2783 if (PL_reg_state.re_state_eval_setup_done) { \
2784 (void)ReREFCNT_inc(Re2); \
2785 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2786 PM_SETRE((PL_reg_curpm), (Re2)); \
2791 - regtry - try match at specific point
2793 STATIC I32 /* 0 failure, 1 success */
2794 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2798 REGEXP *const rx = reginfo->prog;
2799 regexp *const prog = ReANY(rx);
2801 RXi_GET_DECL(prog,progi);
2802 GET_RE_DEBUG_FLAGS_DECL;
2804 PERL_ARGS_ASSERT_REGTRY;
2806 reginfo->cutpoint=NULL;
2808 if ((prog->extflags & RXf_EVAL_SEEN)
2809 && !PL_reg_state.re_state_eval_setup_done)
2813 PL_reg_state.re_state_eval_setup_done = TRUE;
2815 /* Make $_ available to executed code. */
2816 if (reginfo->sv != DEFSV) {
2818 DEFSV_set(reginfo->sv);
2821 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2822 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2823 /* prepare for quick setting of pos */
2824 #ifdef PERL_OLD_COPY_ON_WRITE
2825 if (SvIsCOW(reginfo->sv))
2826 sv_force_normal_flags(reginfo->sv, 0);
2828 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2829 &PL_vtbl_mglob, NULL, 0);
2833 PL_reg_oldpos = mg->mg_len;
2834 SAVEDESTRUCTOR_X(restore_pos, prog);
2836 if (!PL_reg_curpm) {
2837 Newxz(PL_reg_curpm, 1, PMOP);
2840 SV* const repointer = &PL_sv_undef;
2841 /* this regexp is also owned by the new PL_reg_curpm, which
2842 will try to free it. */
2843 av_push(PL_regex_padav, repointer);
2844 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2845 PL_regex_pad = AvARRAY(PL_regex_padav);
2850 PL_reg_oldcurpm = PL_curpm;
2851 PL_curpm = PL_reg_curpm;
2852 if (RXp_MATCH_COPIED(prog)) {
2853 /* Here is a serious problem: we cannot rewrite subbeg,
2854 since it may be needed if this match fails. Thus
2855 $` inside (?{}) could fail... */
2856 PL_reg_oldsaved = prog->subbeg;
2857 PL_reg_oldsavedlen = prog->sublen;
2858 PL_reg_oldsavedoffset = prog->suboffset;
2859 PL_reg_oldsavedcoffset = prog->suboffset;
2860 #ifdef PERL_OLD_COPY_ON_WRITE
2861 PL_nrs = prog->saved_copy;
2863 RXp_MATCH_COPIED_off(prog);
2866 PL_reg_oldsaved = NULL;
2867 prog->subbeg = PL_bostr;
2868 prog->suboffset = 0;
2869 prog->subcoffset = 0;
2870 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2873 PL_reg_starttry = *startposp;
2875 prog->offs[0].start = *startposp - PL_bostr;
2876 prog->lastparen = 0;
2877 prog->lastcloseparen = 0;
2880 /* XXXX What this code is doing here?!!! There should be no need
2881 to do this again and again, prog->lastparen should take care of
2884 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2885 * Actually, the code in regcppop() (which Ilya may be meaning by
2886 * prog->lastparen), is not needed at all by the test suite
2887 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2888 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2889 * Meanwhile, this code *is* needed for the
2890 * above-mentioned test suite tests to succeed. The common theme
2891 * on those tests seems to be returning null fields from matches.
2892 * --jhi updated by dapm */
2894 if (prog->nparens) {
2895 regexp_paren_pair *pp = prog->offs;
2897 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2905 result = regmatch(reginfo, *startposp, progi->program + 1);
2907 prog->offs[0].end = result;
2910 if (reginfo->cutpoint)
2911 *startposp= reginfo->cutpoint;
2912 REGCP_UNWIND(lastcp);
2917 #define sayYES goto yes
2918 #define sayNO goto no
2919 #define sayNO_SILENT goto no_silent
2921 /* we dont use STMT_START/END here because it leads to
2922 "unreachable code" warnings, which are bogus, but distracting. */
2923 #define CACHEsayNO \
2924 if (ST.cache_mask) \
2925 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2928 /* this is used to determine how far from the left messages like
2929 'failed...' are printed. It should be set such that messages
2930 are inline with the regop output that created them.
2932 #define REPORT_CODE_OFF 32
2935 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2936 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2937 #define CHRTEST_NOT_A_CP_1 -999
2938 #define CHRTEST_NOT_A_CP_2 -998
2940 #define SLAB_FIRST(s) (&(s)->states[0])
2941 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2943 /* grab a new slab and return the first slot in it */
2945 STATIC regmatch_state *
2948 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2951 regmatch_slab *s = PL_regmatch_slab->next;
2953 Newx(s, 1, regmatch_slab);
2954 s->prev = PL_regmatch_slab;
2956 PL_regmatch_slab->next = s;
2958 PL_regmatch_slab = s;
2959 return SLAB_FIRST(s);
2963 /* push a new state then goto it */
2965 #define PUSH_STATE_GOTO(state, node, input) \
2966 pushinput = input; \
2968 st->resume_state = state; \
2971 /* push a new state with success backtracking, then goto it */
2973 #define PUSH_YES_STATE_GOTO(state, node, input) \
2974 pushinput = input; \
2976 st->resume_state = state; \
2977 goto push_yes_state;
2984 regmatch() - main matching routine
2986 This is basically one big switch statement in a loop. We execute an op,
2987 set 'next' to point the next op, and continue. If we come to a point which
2988 we may need to backtrack to on failure such as (A|B|C), we push a
2989 backtrack state onto the backtrack stack. On failure, we pop the top
2990 state, and re-enter the loop at the state indicated. If there are no more
2991 states to pop, we return failure.
2993 Sometimes we also need to backtrack on success; for example /A+/, where
2994 after successfully matching one A, we need to go back and try to
2995 match another one; similarly for lookahead assertions: if the assertion
2996 completes successfully, we backtrack to the state just before the assertion
2997 and then carry on. In these cases, the pushed state is marked as
2998 'backtrack on success too'. This marking is in fact done by a chain of
2999 pointers, each pointing to the previous 'yes' state. On success, we pop to
3000 the nearest yes state, discarding any intermediate failure-only states.
3001 Sometimes a yes state is pushed just to force some cleanup code to be
3002 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3003 it to free the inner regex.
3005 Note that failure backtracking rewinds the cursor position, while
3006 success backtracking leaves it alone.
3008 A pattern is complete when the END op is executed, while a subpattern
3009 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3010 ops trigger the "pop to last yes state if any, otherwise return true"
3013 A common convention in this function is to use A and B to refer to the two
3014 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3015 the subpattern to be matched possibly multiple times, while B is the entire
3016 rest of the pattern. Variable and state names reflect this convention.
3018 The states in the main switch are the union of ops and failure/success of
3019 substates associated with with that op. For example, IFMATCH is the op
3020 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3021 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3022 successfully matched A and IFMATCH_A_fail is a state saying that we have
3023 just failed to match A. Resume states always come in pairs. The backtrack
3024 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3025 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3026 on success or failure.
3028 The struct that holds a backtracking state is actually a big union, with
3029 one variant for each major type of op. The variable st points to the
3030 top-most backtrack struct. To make the code clearer, within each
3031 block of code we #define ST to alias the relevant union.
3033 Here's a concrete example of a (vastly oversimplified) IFMATCH
3039 #define ST st->u.ifmatch
3041 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3042 ST.foo = ...; // some state we wish to save
3044 // push a yes backtrack state with a resume value of
3045 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3047 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3050 case IFMATCH_A: // we have successfully executed A; now continue with B
3052 bar = ST.foo; // do something with the preserved value
3055 case IFMATCH_A_fail: // A failed, so the assertion failed
3056 ...; // do some housekeeping, then ...
3057 sayNO; // propagate the failure
3064 For any old-timers reading this who are familiar with the old recursive
3065 approach, the code above is equivalent to:
3067 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3076 ...; // do some housekeeping, then ...
3077 sayNO; // propagate the failure
3080 The topmost backtrack state, pointed to by st, is usually free. If you
3081 want to claim it, populate any ST.foo fields in it with values you wish to
3082 save, then do one of
3084 PUSH_STATE_GOTO(resume_state, node, newinput);
3085 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3087 which sets that backtrack state's resume value to 'resume_state', pushes a
3088 new free entry to the top of the backtrack stack, then goes to 'node'.
3089 On backtracking, the free slot is popped, and the saved state becomes the
3090 new free state. An ST.foo field in this new top state can be temporarily
3091 accessed to retrieve values, but once the main loop is re-entered, it
3092 becomes available for reuse.
3094 Note that the depth of the backtrack stack constantly increases during the
3095 left-to-right execution of the pattern, rather than going up and down with
3096 the pattern nesting. For example the stack is at its maximum at Z at the
3097 end of the pattern, rather than at X in the following:
3099 /(((X)+)+)+....(Y)+....Z/
3101 The only exceptions to this are lookahead/behind assertions and the cut,
3102 (?>A), which pop all the backtrack states associated with A before
3105 Backtrack state structs are allocated in slabs of about 4K in size.
3106 PL_regmatch_state and st always point to the currently active state,
3107 and PL_regmatch_slab points to the slab currently containing
3108 PL_regmatch_state. The first time regmatch() is called, the first slab is
3109 allocated, and is never freed until interpreter destruction. When the slab
3110 is full, a new one is allocated and chained to the end. At exit from
3111 regmatch(), slabs allocated since entry are freed.
3116 #define DEBUG_STATE_pp(pp) \
3118 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3119 PerlIO_printf(Perl_debug_log, \
3120 " %*s"pp" %s%s%s%s%s\n", \
3122 PL_reg_name[st->resume_state], \
3123 ((st==yes_state||st==mark_state) ? "[" : ""), \
3124 ((st==yes_state) ? "Y" : ""), \
3125 ((st==mark_state) ? "M" : ""), \
3126 ((st==yes_state||st==mark_state) ? "]" : "") \
3131 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3136 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3137 const char *start, const char *end, const char *blurb)
3139 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3141 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3146 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3147 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3149 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3150 start, end - start, 60);
3152 PerlIO_printf(Perl_debug_log,
3153 "%s%s REx%s %s against %s\n",
3154 PL_colors[4], blurb, PL_colors[5], s0, s1);
3156 if (utf8_target||utf8_pat)
3157 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3158 utf8_pat ? "pattern" : "",
3159 utf8_pat && utf8_target ? " and " : "",
3160 utf8_target ? "string" : ""
3166 S_dump_exec_pos(pTHX_ const char *locinput,
3167 const regnode *scan,
3168 const char *loc_regeol,
3169 const char *loc_bostr,
3170 const char *loc_reg_starttry,
3171 const bool utf8_target)
3173 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3174 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3175 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3176 /* The part of the string before starttry has one color
3177 (pref0_len chars), between starttry and current
3178 position another one (pref_len - pref0_len chars),
3179 after the current position the third one.
3180 We assume that pref0_len <= pref_len, otherwise we
3181 decrease pref0_len. */
3182 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3183 ? (5 + taill) - l : locinput - loc_bostr;
3186 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3188 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3190 pref0_len = pref_len - (locinput - loc_reg_starttry);
3191 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3192 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3193 ? (5 + taill) - pref_len : loc_regeol - locinput);
3194 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3198 if (pref0_len > pref_len)
3199 pref0_len = pref_len;
3201 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3203 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3204 (locinput - pref_len),pref0_len, 60, 4, 5);
3206 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3207 (locinput - pref_len + pref0_len),
3208 pref_len - pref0_len, 60, 2, 3);
3210 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3211 locinput, loc_regeol - locinput, 10, 0, 1);
3213 const STRLEN tlen=len0+len1+len2;
3214 PerlIO_printf(Perl_debug_log,
3215 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3216 (IV)(locinput - loc_bostr),
3219 (docolor ? "" : "> <"),
3221 (int)(tlen > 19 ? 0 : 19 - tlen),
3228 /* reg_check_named_buff_matched()
3229 * Checks to see if a named buffer has matched. The data array of
3230 * buffer numbers corresponding to the buffer is expected to reside
3231 * in the regexp->data->data array in the slot stored in the ARG() of
3232 * node involved. Note that this routine doesn't actually care about the
3233 * name, that information is not preserved from compilation to execution.
3234 * Returns the index of the leftmost defined buffer with the given name
3235 * or 0 if non of the buffers matched.
3238 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3241 RXi_GET_DECL(rex,rexi);
3242 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3243 I32 *nums=(I32*)SvPVX(sv_dat);
3245 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3247 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3248 if ((I32)rex->lastparen >= nums[n] &&
3249 rex->offs[nums[n]].end != -1)
3258 /* free all slabs above current one - called during LEAVE_SCOPE */
3261 S_clear_backtrack_stack(pTHX_ void *p)
3263 regmatch_slab *s = PL_regmatch_slab->next;
3268 PL_regmatch_slab->next = NULL;
3270 regmatch_slab * const osl = s;
3276 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3278 /* This function determines if there are one or two characters that match
3279 * the first character of the passed-in EXACTish node <text_node>, and if
3280 * so, returns them in the passed-in pointers.
3282 * If it determines that no possible character in the target string can
3283 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3284 * the first character in <text_node> requires UTF-8 to represent, and the
3285 * target string isn't in UTF-8.)
3287 * If there are more than two characters that could match the beginning of
3288 * <text_node>, or if more context is required to determine a match or not,
3289 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3291 * The motiviation behind this function is to allow the caller to set up
3292 * tight loops for matching. If <text_node> is of type EXACT, there is
3293 * only one possible character that can match its first character, and so
3294 * the situation is quite simple. But things get much more complicated if
3295 * folding is involved. It may be that the first character of an EXACTFish
3296 * node doesn't participate in any possible fold, e.g., punctuation, so it
3297 * can be matched only by itself. The vast majority of characters that are
3298 * in folds match just two things, their lower and upper-case equivalents.
3299 * But not all are like that; some have multiple possible matches, or match
3300 * sequences of more than one character. This function sorts all that out.
3302 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3303 * loop of trying to match A*, we know we can't exit where the thing
3304 * following it isn't a B. And something can't be a B unless it is the
3305 * beginning of B. By putting a quick test for that beginning in a tight
3306 * loop, we can rule out things that can't possibly be B without having to
3307 * break out of the loop, thus avoiding work. Similarly, if A is a single
3308 * character, we can make a tight loop matching A*, using the outputs of
3311 * If the target string to match isn't in UTF-8, and there aren't
3312 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3313 * the one or two possible octets (which are characters in this situation)
3314 * that can match. In all cases, if there is only one character that can
3315 * match, *<c1p> and *<c2p> will be identical.
3317 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3318 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3319 * can match the beginning of <text_node>. They should be declared with at
3320 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3321 * undefined what these contain.) If one or both of the buffers are
3322 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3323 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3324 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3325 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3326 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3328 const bool utf8_target = PL_reg_match_utf8;
3330 UV c1 = CHRTEST_NOT_A_CP_1;
3331 UV c2 = CHRTEST_NOT_A_CP_2;
3332 bool use_chrtest_void = FALSE;
3334 /* Used when we have both utf8 input and utf8 output, to avoid converting
3335 * to/from code points */
3336 bool utf8_has_been_setup = FALSE;
3340 U8 *pat = (U8*)STRING(text_node);
3342 if (OP(text_node) == EXACT) {
3344 /* In an exact node, only one thing can be matched, that first
3345 * character. If both the pat and the target are UTF-8, we can just
3346 * copy the input to the output, avoiding finding the code point of
3348 if (! UTF_PATTERN) {
3351 else if (utf8_target) {
3352 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3353 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3354 utf8_has_been_setup = TRUE;
3357 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3360 else /* an EXACTFish node */
3362 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3363 pat + STR_LEN(text_node)))
3365 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3366 pat + STR_LEN(text_node))))
3368 /* Multi-character folds require more context to sort out. Also
3369 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3370 * handled outside this routine */
3371 use_chrtest_void = TRUE;
3373 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3374 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3376 /* Load the folds hash, if not already done */
3378 if (! PL_utf8_foldclosures) {
3379 if (! PL_utf8_tofold) {
3380 U8 dummy[UTF8_MAXBYTES+1];
3382 /* Force loading this by folding an above-Latin1 char */
3383 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3384 assert(PL_utf8_tofold); /* Verify that worked */
3386 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3389 /* The fold closures data structure is a hash with the keys being
3390 * the UTF-8 of every character that is folded to, like 'k', and
3391 * the values each an array of all code points that fold to its
3392 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3394 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3399 /* Not found in the hash, therefore there are no folds
3400 * containing it, so there is only a single character that
3404 else { /* Does participate in folds */
3405 AV* list = (AV*) *listp;
3406 if (av_len(list) != 1) {
3408 /* If there aren't exactly two folds to this, it is outside
3409 * the scope of this function */
3410 use_chrtest_void = TRUE;
3412 else { /* There are two. Get them */
3413 SV** c_p = av_fetch(list, 0, FALSE);
3415 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3419 c_p = av_fetch(list, 1, FALSE);
3421 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3425 /* Folds that cross the 255/256 boundary are forbidden if
3426 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3427 * pattern character is above 256, and its only other match
3428 * is below 256, the only legal match will be to itself.
3429 * We have thrown away the original, so have to compute
3430 * which is the one above 255 */
3431 if ((c1 < 256) != (c2 < 256)) {
3432 if (OP(text_node) == EXACTFL
3433 || (OP(text_node) == EXACTFA
3434 && (isASCII(c1) || isASCII(c2))))
3447 else /* Here, c1 is < 255 */
3449 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3450 && OP(text_node) != EXACTFL
3451 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3453 /* Here, there could be something above Latin1 in the target which
3454 * folds to this character in the pattern. All such cases except
3455 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3456 * involved in their folds, so are outside the scope of this
3458 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3459 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3462 use_chrtest_void = TRUE;
3465 else { /* Here nothing above Latin1 can fold to the pattern character */
3466 switch (OP(text_node)) {
3468 case EXACTFL: /* /l rules */
3469 c2 = PL_fold_locale[c1];
3473 if (! utf8_target) { /* /d rules */
3478 /* /u rules for all these. This happens to work for
3479 * EXACTFA as nothing in Latin1 folds to ASCII */
3481 case EXACTFU_TRICKYFOLD:
3484 c2 = PL_fold_latin1[c1];
3488 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3489 assert(0); /* NOTREACHED */
3494 /* Here have figured things out. Set up the returns */
3495 if (use_chrtest_void) {
3496 *c2p = *c1p = CHRTEST_VOID;
3498 else if (utf8_target) {
3499 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3500 uvchr_to_utf8(c1_utf8, c1);
3501 uvchr_to_utf8(c2_utf8, c2);
3504 /* Invariants are stored in both the utf8 and byte outputs; Use
3505 * negative numbers otherwise for the byte ones. Make sure that the
3506 * byte ones are the same iff the utf8 ones are the same */
3507 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3511 ? CHRTEST_NOT_A_CP_1
3512 : CHRTEST_NOT_A_CP_2;
3514 else if (c1 > 255) {
3515 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3520 *c1p = *c2p = c2; /* c2 is the only representable value */
3522 else { /* c1 is representable; see about c2 */
3524 *c2p = (c2 < 256) ? c2 : c1;
3530 /* returns -1 on failure, $+[0] on success */
3532 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3534 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3538 const bool utf8_target = PL_reg_match_utf8;
3539 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3540 REGEXP *rex_sv = reginfo->prog;
3541 regexp *rex = ReANY(rex_sv);
3542 RXi_GET_DECL(rex,rexi);
3544 /* the current state. This is a cached copy of PL_regmatch_state */
3546 /* cache heavy used fields of st in registers */
3549 U32 n = 0; /* general value; init to avoid compiler warning */
3550 I32 ln = 0; /* len or last; init to avoid compiler warning */
3551 char *locinput = startpos;
3552 char *pushinput; /* where to continue after a PUSH */
3553 I32 nextchr; /* is always set to UCHARAT(locinput) */
3555 bool result = 0; /* return value of S_regmatch */
3556 int depth = 0; /* depth of backtrack stack */
3557 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558 const U32 max_nochange_depth =
3559 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3561 regmatch_state *yes_state = NULL; /* state to pop to on success of
3563 /* mark_state piggy backs on the yes_state logic so that when we unwind
3564 the stack on success we can update the mark_state as we go */
3565 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3566 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3567 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3569 bool no_final = 0; /* prevent failure from backtracking? */
3570 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3571 char *startpoint = locinput;
3572 SV *popmark = NULL; /* are we looking for a mark? */
3573 SV *sv_commit = NULL; /* last mark name seen in failure */
3574 SV *sv_yes_mark = NULL; /* last mark name we have seen
3575 during a successful match */
3576 U32 lastopen = 0; /* last open we saw */
3577 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3578 SV* const oreplsv = GvSV(PL_replgv);
3579 /* these three flags are set by various ops to signal information to
3580 * the very next op. They have a useful lifetime of exactly one loop
3581 * iteration, and are not preserved or restored by state pushes/pops
3583 bool sw = 0; /* the condition value in (?(cond)a|b) */
3584 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3585 int logical = 0; /* the following EVAL is:
3589 or the following IFMATCH/UNLESSM is:
3590 false: plain (?=foo)
3591 true: used as a condition: (?(?=foo))
3593 PAD* last_pad = NULL;
3595 I32 gimme = G_SCALAR;
3596 CV *caller_cv = NULL; /* who called us */
3597 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3598 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3601 GET_RE_DEBUG_FLAGS_DECL;
3604 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605 multicall_oldcatch = 0;
3606 multicall_cv = NULL;
3608 PERL_UNUSED_VAR(multicall_cop);
3609 PERL_UNUSED_VAR(newsp);
3612 PERL_ARGS_ASSERT_REGMATCH;
3614 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3615 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3617 /* on first ever call to regmatch, allocate first slab */
3618 if (!PL_regmatch_slab) {
3619 Newx(PL_regmatch_slab, 1, regmatch_slab);
3620 PL_regmatch_slab->prev = NULL;
3621 PL_regmatch_slab->next = NULL;
3622 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3625 oldsave = PL_savestack_ix;
3626 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627 SAVEVPTR(PL_regmatch_slab);
3628 SAVEVPTR(PL_regmatch_state);
3630 /* grab next free state slot */
3631 st = ++PL_regmatch_state;
3632 if (st > SLAB_LAST(PL_regmatch_slab))
3633 st = PL_regmatch_state = S_push_slab(aTHX);
3635 /* Note that nextchr is a byte even in UTF */
3638 while (scan != NULL) {
3641 SV * const prop = sv_newmortal();
3642 regnode *rnext=regnext(scan);
3643 DUMP_EXEC_POS( locinput, scan, utf8_target );
3644 regprop(rex, prop, scan);
3646 PerlIO_printf(Perl_debug_log,
3647 "%3"IVdf":%*s%s(%"IVdf")\n",
3648 (IV)(scan - rexi->program), depth*2, "",
3650 (PL_regkind[OP(scan)] == END || !rnext) ?
3651 0 : (IV)(rnext - rexi->program));
3654 next = scan + NEXT_OFF(scan);
3657 state_num = OP(scan);
3662 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3664 switch (state_num) {
3665 case BOL: /* /^../ */
3666 if (locinput == PL_bostr)
3668 /* reginfo->till = reginfo->bol; */
3673 case MBOL: /* /^../m */
3674 if (locinput == PL_bostr ||
3675 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3681 case SBOL: /* /^../s */
3682 if (locinput == PL_bostr)
3687 if (locinput == reginfo->ganch)
3691 case KEEPS: /* \K */
3692 /* update the startpoint */
3693 st->u.keeper.val = rex->offs[0].start;
3694 rex->offs[0].start = locinput - PL_bostr;
3695 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3696 assert(0); /*NOTREACHED*/
3697 case KEEPS_next_fail:
3698 /* rollback the start point change */
3699 rex->offs[0].start = st->u.keeper.val;
3701 assert(0); /*NOTREACHED*/
3703 case EOL: /* /..$/ */
3706 case MEOL: /* /..$/m */
3707 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3711 case SEOL: /* /..$/s */
3713 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3715 if (PL_regeol - locinput > 1)
3720 if (!NEXTCHR_IS_EOS)
3724 case SANY: /* /./s */
3727 goto increment_locinput;
3735 case REG_ANY: /* /./ */
3736 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3738 goto increment_locinput;
3742 #define ST st->u.trie
3743 case TRIEC: /* (ab|cd) with known charclass */
3744 /* In this case the charclass data is available inline so
3745 we can fail fast without a lot of extra overhead.
3747 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3749 PerlIO_printf(Perl_debug_log,
3750 "%*s %sfailed to match trie start class...%s\n",
3751 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3754 assert(0); /* NOTREACHED */
3757 case TRIE: /* (ab|cd) */
3758 /* the basic plan of execution of the trie is:
3759 * At the beginning, run though all the states, and
3760 * find the longest-matching word. Also remember the position
3761 * of the shortest matching word. For example, this pattern:
3764 * when matched against the string "abcde", will generate
3765 * accept states for all words except 3, with the longest
3766 * matching word being 4, and the shortest being 2 (with
3767 * the position being after char 1 of the string).
3769 * Then for each matching word, in word order (i.e. 1,2,4,5),
3770 * we run the remainder of the pattern; on each try setting
3771 * the current position to the character following the word,
3772 * returning to try the next word on failure.
3774 * We avoid having to build a list of words at runtime by
3775 * using a compile-time structure, wordinfo[].prev, which
3776 * gives, for each word, the previous accepting word (if any).
3777 * In the case above it would contain the mappings 1->2, 2->0,
3778 * 3->0, 4->5, 5->1. We can use this table to generate, from
3779 * the longest word (4 above), a list of all words, by
3780 * following the list of prev pointers; this gives us the
3781 * unordered list 4,5,1,2. Then given the current word we have
3782 * just tried, we can go through the list and find the
3783 * next-biggest word to try (so if we just failed on word 2,
3784 * the next in the list is 4).
3786 * Since at runtime we don't record the matching position in
3787 * the string for each word, we have to work that out for
3788 * each word we're about to process. The wordinfo table holds
3789 * the character length of each word; given that we recorded
3790 * at the start: the position of the shortest word and its
3791 * length in chars, we just need to move the pointer the
3792 * difference between the two char lengths. Depending on
3793 * Unicode status and folding, that's cheap or expensive.
3795 * This algorithm is optimised for the case where are only a
3796 * small number of accept states, i.e. 0,1, or maybe 2.
3797 * With lots of accepts states, and having to try all of them,
3798 * it becomes quadratic on number of accept states to find all
3803 /* what type of TRIE am I? (utf8 makes this contextual) */
3804 DECL_TRIE_TYPE(scan);
3806 /* what trie are we using right now */
3807 reg_trie_data * const trie
3808 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3809 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3810 U32 state = trie->startstate;
3813 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3815 if (trie->states[ state ].wordnum) {
3817 PerlIO_printf(Perl_debug_log,
3818 "%*s %smatched empty string...%s\n",
3819 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3825 PerlIO_printf(Perl_debug_log,
3826 "%*s %sfailed to match trie start class...%s\n",
3827 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3834 U8 *uc = ( U8* )locinput;
3838 U8 *uscan = (U8*)NULL;
3839 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3840 U32 charcount = 0; /* how many input chars we have matched */
3841 U32 accepted = 0; /* have we seen any accepting states? */
3843 ST.jump = trie->jump;
3846 ST.longfold = FALSE; /* char longer if folded => it's harder */
3849 /* fully traverse the TRIE; note the position of the
3850 shortest accept state and the wordnum of the longest
3853 while ( state && uc <= (U8*)PL_regeol ) {
3854 U32 base = trie->states[ state ].trans.base;
3858 wordnum = trie->states[ state ].wordnum;
3860 if (wordnum) { /* it's an accept state */
3863 /* record first match position */
3865 ST.firstpos = (U8*)locinput;
3870 ST.firstchars = charcount;
3873 if (!ST.nextword || wordnum < ST.nextword)
3874 ST.nextword = wordnum;
3875 ST.topword = wordnum;
3878 DEBUG_TRIE_EXECUTE_r({
3879 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3880 PerlIO_printf( Perl_debug_log,
3881 "%*s %sState: %4"UVxf" Accepted: %c ",
3882 2+depth * 2, "", PL_colors[4],
3883 (UV)state, (accepted ? 'Y' : 'N'));
3886 /* read a char and goto next state */
3887 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3889 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3890 uscan, len, uvc, charid, foldlen,
3897 base + charid - 1 - trie->uniquecharcount)) >= 0)
3899 && ((U32)offset < trie->lasttrans)
3900 && trie->trans[offset].check == state)
3902 state = trie->trans[offset].next;
3913 DEBUG_TRIE_EXECUTE_r(
3914 PerlIO_printf( Perl_debug_log,
3915 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3916 charid, uvc, (UV)state, PL_colors[5] );
3922 /* calculate total number of accept states */
3927 w = trie->wordinfo[w].prev;
3930 ST.accepted = accepted;
3934 PerlIO_printf( Perl_debug_log,
3935 "%*s %sgot %"IVdf" possible matches%s\n",
3936 REPORT_CODE_OFF + depth * 2, "",
3937 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3939 goto trie_first_try; /* jump into the fail handler */
3941 assert(0); /* NOTREACHED */
3943 case TRIE_next_fail: /* we failed - try next alternative */
3947 REGCP_UNWIND(ST.cp);
3948 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3950 if (!--ST.accepted) {
3952 PerlIO_printf( Perl_debug_log,
3953 "%*s %sTRIE failed...%s\n",
3954 REPORT_CODE_OFF+depth*2, "",
3961 /* Find next-highest word to process. Note that this code
3962 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3965 U16 const nextword = ST.nextword;
3966 reg_trie_wordinfo * const wordinfo
3967 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3968 for (word=ST.topword; word; word=wordinfo[word].prev) {
3969 if (word > nextword && (!min || word < min))
3982 ST.lastparen = rex->lastparen;
3983 ST.lastcloseparen = rex->lastcloseparen;
3987 /* find start char of end of current word */
3989 U32 chars; /* how many chars to skip */
3990 reg_trie_data * const trie
3991 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3993 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3995 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4000 /* the hard option - fold each char in turn and find
4001 * its folded length (which may be different */
4002 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4010 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4018 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4023 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4039 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4040 ? ST.jump[ST.nextword]
4044 PerlIO_printf( Perl_debug_log,
4045 "%*s %sTRIE matched word #%d, continuing%s\n",
4046 REPORT_CODE_OFF+depth*2, "",
4053 if (ST.accepted > 1 || has_cutgroup) {
4054 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4055 assert(0); /* NOTREACHED */
4057 /* only one choice left - just continue */
4059 AV *const trie_words
4060 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4061 SV ** const tmp = av_fetch( trie_words,
4063 SV *sv= tmp ? sv_newmortal() : NULL;
4065 PerlIO_printf( Perl_debug_log,
4066 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4067 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4069 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4070 PL_colors[0], PL_colors[1],
4071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4073 : "not compiled under -Dr",
4077 locinput = (char*)uc;
4078 continue; /* execute rest of RE */
4079 assert(0); /* NOTREACHED */
4083 case EXACT: { /* /abc/ */
4084 char *s = STRING(scan);
4086 if (utf8_target != UTF_PATTERN) {
4087 /* The target and the pattern have differing utf8ness. */
4089 const char * const e = s + ln;
4092 /* The target is utf8, the pattern is not utf8.
4093 * Above-Latin1 code points can't match the pattern;
4094 * invariants match exactly, and the other Latin1 ones need
4095 * to be downgraded to a single byte in order to do the
4096 * comparison. (If we could be confident that the target
4097 * is not malformed, this could be refactored to have fewer
4098 * tests by just assuming that if the first bytes match, it
4099 * is an invariant, but there are tests in the test suite
4100 * dealing with (??{...}) which violate this) */
4104 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4107 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4114 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4123 /* The target is not utf8, the pattern is utf8. */
4125 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4129 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4136 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4147 /* The target and the pattern have the same utf8ness. */
4148 /* Inline the first character, for speed. */
4149 if (UCHARAT(s) != nextchr)
4151 if (PL_regeol - locinput < ln)
4153 if (ln > 1 && memNE(s, locinput, ln))
4159 case EXACTFL: { /* /abc/il */
4161 const U8 * fold_array;
4163 U32 fold_utf8_flags;
4165 PL_reg_flags |= RF_tainted;
4166 folder = foldEQ_locale;
4167 fold_array = PL_fold_locale;
4168 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4171 case EXACTFU_SS: /* /\x{df}/iu */
4172 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4173 case EXACTFU: /* /abc/iu */
4174 folder = foldEQ_latin1;
4175 fold_array = PL_fold_latin1;
4176 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4179 case EXACTFA: /* /abc/iaa */
4180 folder = foldEQ_latin1;
4181 fold_array = PL_fold_latin1;
4182 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4185 case EXACTF: /* /abc/i */
4187 fold_array = PL_fold;
4188 fold_utf8_flags = 0;
4194 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4195 /* Either target or the pattern are utf8, or has the issue where
4196 * the fold lengths may differ. */
4197 const char * const l = locinput;
4198 char *e = PL_regeol;
4200 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4201 l, &e, 0, utf8_target, fold_utf8_flags))
4209 /* Neither the target nor the pattern are utf8 */
4210 if (UCHARAT(s) != nextchr
4212 && UCHARAT(s) != fold_array[nextchr])
4216 if (PL_regeol - locinput < ln)
4218 if (ln > 1 && ! folder(s, locinput, ln))
4224 /* XXX Could improve efficiency by separating these all out using a
4225 * macro or in-line function. At that point regcomp.c would no longer
4226 * have to set the FLAGS fields of these */
4227 case BOUNDL: /* /\b/l */
4228 case NBOUNDL: /* /\B/l */
4229 PL_reg_flags |= RF_tainted;
4231 case BOUND: /* /\b/ */
4232 case BOUNDU: /* /\b/u */
4233 case BOUNDA: /* /\b/a */
4234 case NBOUND: /* /\B/ */
4235 case NBOUNDU: /* /\B/u */
4236 case NBOUNDA: /* /\B/a */
4237 /* was last char in word? */
4239 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4240 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4242 if (locinput == PL_bostr)
4245 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4247 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4249 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4250 ln = isALNUM_uni(ln);
4254 LOAD_UTF8_CHARCLASS_ALNUM();
4255 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4260 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4261 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4266 /* Here the string isn't utf8, or is utf8 and only ascii
4267 * characters are to match \w. In the latter case looking at
4268 * the byte just prior to the current one may be just the final
4269 * byte of a multi-byte character. This is ok. There are two
4271 * 1) it is a single byte character, and then the test is doing
4272 * just what it's supposed to.
4273 * 2) it is a multi-byte character, in which case the final
4274 * byte is never mistakable for ASCII, and so the test
4275 * will say it is not a word character, which is the
4276 * correct answer. */
4277 ln = (locinput != PL_bostr) ?
4278 UCHARAT(locinput - 1) : '\n';
4279 switch (FLAGS(scan)) {
4280 case REGEX_UNICODE_CHARSET:
4281 ln = isWORDCHAR_L1(ln);
4282 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4284 case REGEX_LOCALE_CHARSET:
4285 ln = isALNUM_LC(ln);
4286 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4288 case REGEX_DEPENDS_CHARSET:
4290 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4292 case REGEX_ASCII_RESTRICTED_CHARSET:
4293 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4294 ln = isWORDCHAR_A(ln);
4295 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4298 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4302 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4304 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4308 case ANYOF: /* /[abc]/ */
4312 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4314 locinput += UTF8SKIP(locinput);
4318 if (!REGINCLASS(rex, scan, (U8*)locinput))
4325 /* Special char classes: \d, \w etc.
4326 * The defines start on line 166 or so */
4327 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4328 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329 ALNUMU, NALNUMU, isWORDCHAR_L1,
4330 ALNUMA, NALNUMA, isWORDCHAR_A,
4333 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4334 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335 SPACEU, NSPACEU, isSPACE_L1,
4336 SPACEA, NSPACEA, isSPACE_A,
4339 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4340 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4341 DIGITA, NDIGITA, isDIGIT_A,
4344 case POSIXA: /* /[[:ascii:]]/ etc */
4345 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4348 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4352 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4353 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4356 goto increment_locinput;
4358 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4359 a Unicode extended Grapheme Cluster */
4360 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4361 extended Grapheme Cluster is:
4364 | Prepend* Begin Extend*
4367 Begin is: ( Special_Begin | ! Control )
4368 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4369 Extend is: ( Grapheme_Extend | Spacing_Mark )
4370 Control is: [ GCB_Control CR LF ]
4371 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4373 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4376 Begin is ( Regular_Begin + Special Begin )
4378 It turns out that 98.4% of all Unicode code points match
4379 Regular_Begin. Doing it this way eliminates a table match in
4380 the previous implementation for almost all Unicode code points.
4382 There is a subtlety with Prepend* which showed up in testing.
4383 Note that the Begin, and only the Begin is required in:
4384 | Prepend* Begin Extend*
4385 Also, Begin contains '! Control'. A Prepend must be a
4386 '! Control', which means it must also be a Begin. What it
4387 comes down to is that if we match Prepend* and then find no
4388 suitable Begin afterwards, that if we backtrack the last
4389 Prepend, that one will be a suitable Begin.
4394 if (! utf8_target) {
4396 /* Match either CR LF or '.', as all the other possibilities
4398 locinput++; /* Match the . or CR */
4399 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4401 && locinput < PL_regeol
4402 && UCHARAT(locinput) == '\n') locinput++;
4406 /* Utf8: See if is ( CR LF ); already know that locinput <
4407 * PL_regeol, so locinput+1 is in bounds */
4408 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409 && UCHARAT(locinput + 1) == '\n')
4416 /* In case have to backtrack to beginning, then match '.' */
4417 char *starting = locinput;
4419 /* In case have to backtrack the last prepend */
4420 char *previous_prepend = 0;
4422 LOAD_UTF8_CHARCLASS_GCB();
4424 /* Match (prepend)* */
4425 while (locinput < PL_regeol
4426 && (len = is_GCB_Prepend_utf8(locinput)))
4428 previous_prepend = locinput;
4432 /* As noted above, if we matched a prepend character, but
4433 * the next thing won't match, back off the last prepend we
4434 * matched, as it is guaranteed to match the begin */
4435 if (previous_prepend
4436 && (locinput >= PL_regeol
4437 || (! swash_fetch(PL_utf8_X_regular_begin,
4438 (U8*)locinput, utf8_target)
4439 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4442 locinput = previous_prepend;
4445 /* Note that here we know PL_regeol > locinput, as we
4446 * tested that upon input to this switch case, and if we
4447 * moved locinput forward, we tested the result just above
4448 * and it either passed, or we backed off so that it will
4450 if (swash_fetch(PL_utf8_X_regular_begin,
4451 (U8*)locinput, utf8_target)) {
4452 locinput += UTF8SKIP(locinput);
4454 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4456 /* Here did not match the required 'Begin' in the
4457 * second term. So just match the very first
4458 * character, the '.' of the final term of the regex */
4459 locinput = starting + UTF8SKIP(starting);
4463 /* Here is a special begin. It can be composed of
4464 * several individual characters. One possibility is
4466 if ((len = is_GCB_RI_utf8(locinput))) {
4468 while (locinput < PL_regeol
4469 && (len = is_GCB_RI_utf8(locinput)))
4473 } else if ((len = is_GCB_T_utf8(locinput))) {
4474 /* Another possibility is T+ */
4476 while (locinput < PL_regeol
4477 && (len = is_GCB_T_utf8(locinput)))
4483 /* Here, neither RI+ nor T+; must be some other
4484 * Hangul. That means it is one of the others: L,
4485 * LV, LVT or V, and matches:
4486 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4489 while (locinput < PL_regeol
4490 && (len = is_GCB_L_utf8(locinput)))
4495 /* Here, have exhausted L*. If the next character
4496 * is not an LV, LVT nor V, it means we had to have
4497 * at least one L, so matches L+ in the original
4498 * equation, we have a complete hangul syllable.
4501 if (locinput < PL_regeol
4502 && is_GCB_LV_LVT_V_utf8(locinput))
4505 /* Otherwise keep going. Must be LV, LVT or V.
4507 if (is_utf8_X_LVT((U8*)locinput)) {
4508 locinput += UTF8SKIP(locinput);
4511 /* Must be V or LV. Take it, then match
4513 locinput += UTF8SKIP(locinput);
4514 while (locinput < PL_regeol
4515 && (len = is_GCB_V_utf8(locinput)))
4521 /* And any of LV, LVT, or V can be followed
4523 while (locinput < PL_regeol
4524 && (len = is_GCB_T_utf8(locinput)))
4532 /* Match any extender */
4533 while (locinput < PL_regeol
4534 && swash_fetch(PL_utf8_X_extend,
4535 (U8*)locinput, utf8_target))
4537 locinput += UTF8SKIP(locinput);
4541 if (locinput > PL_regeol) sayNO;
4545 case NREFFL: /* /\g{name}/il */
4546 { /* The capture buffer cases. The ones beginning with N for the
4547 named buffers just convert to the equivalent numbered and
4548 pretend they were called as the corresponding numbered buffer
4550 /* don't initialize these in the declaration, it makes C++
4555 const U8 *fold_array;
4558 PL_reg_flags |= RF_tainted;
4559 folder = foldEQ_locale;
4560 fold_array = PL_fold_locale;
4562 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4565 case NREFFA: /* /\g{name}/iaa */
4566 folder = foldEQ_latin1;
4567 fold_array = PL_fold_latin1;
4569 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4572 case NREFFU: /* /\g{name}/iu */
4573 folder = foldEQ_latin1;
4574 fold_array = PL_fold_latin1;
4576 utf8_fold_flags = 0;
4579 case NREFF: /* /\g{name}/i */
4581 fold_array = PL_fold;
4583 utf8_fold_flags = 0;
4586 case NREF: /* /\g{name}/ */
4590 utf8_fold_flags = 0;
4593 /* For the named back references, find the corresponding buffer
4595 n = reg_check_named_buff_matched(rex,scan);
4600 goto do_nref_ref_common;
4602 case REFFL: /* /\1/il */
4603 PL_reg_flags |= RF_tainted;
4604 folder = foldEQ_locale;
4605 fold_array = PL_fold_locale;
4606 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4609 case REFFA: /* /\1/iaa */
4610 folder = foldEQ_latin1;
4611 fold_array = PL_fold_latin1;
4612 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4615 case REFFU: /* /\1/iu */
4616 folder = foldEQ_latin1;
4617 fold_array = PL_fold_latin1;
4618 utf8_fold_flags = 0;
4621 case REFF: /* /\1/i */
4623 fold_array = PL_fold;
4624 utf8_fold_flags = 0;
4627 case REF: /* /\1/ */
4630 utf8_fold_flags = 0;
4634 n = ARG(scan); /* which paren pair */
4637 ln = rex->offs[n].start;
4638 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4639 if (rex->lastparen < n || ln == -1)
4640 sayNO; /* Do not match unless seen CLOSEn. */
4641 if (ln == rex->offs[n].end)
4645 if (type != REF /* REF can do byte comparison */
4646 && (utf8_target || type == REFFU))
4647 { /* XXX handle REFFL better */
4648 char * limit = PL_regeol;
4650 /* This call case insensitively compares the entire buffer
4651 * at s, with the current input starting at locinput, but
4652 * not going off the end given by PL_regeol, and returns in
4653 * <limit> upon success, how much of the current input was
4655 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4656 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4664 /* Not utf8: Inline the first character, for speed. */
4665 if (!NEXTCHR_IS_EOS &&
4666 UCHARAT(s) != nextchr &&
4668 UCHARAT(s) != fold_array[nextchr]))
4670 ln = rex->offs[n].end - ln;
4671 if (locinput + ln > PL_regeol)
4673 if (ln > 1 && (type == REF
4674 ? memNE(s, locinput, ln)
4675 : ! folder(s, locinput, ln)))
4681 case NOTHING: /* null op; e.g. the 'nothing' following
4682 * the '*' in m{(a+|b)*}' */
4684 case TAIL: /* placeholder while compiling (A|B|C) */
4687 case BACK: /* ??? doesn't appear to be used ??? */
4691 #define ST st->u.eval
4696 regexp_internal *rei;
4697 regnode *startpoint;
4699 case GOSTART: /* (?R) */
4700 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4701 if (cur_eval && cur_eval->locinput==locinput) {
4702 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4703 Perl_croak(aTHX_ "Infinite recursion in regex");
4704 if ( ++nochange_depth > max_nochange_depth )
4706 "Pattern subroutine nesting without pos change"
4707 " exceeded limit in regex");
4714 if (OP(scan)==GOSUB) {
4715 startpoint = scan + ARG2L(scan);
4716 ST.close_paren = ARG(scan);
4718 startpoint = rei->program+1;
4721 goto eval_recurse_doit;
4722 assert(0); /* NOTREACHED */
4724 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4725 if (cur_eval && cur_eval->locinput==locinput) {
4726 if ( ++nochange_depth > max_nochange_depth )
4727 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4732 /* execute the code in the {...} */
4736 OP * const oop = PL_op;
4737 COP * const ocurcop = PL_curcop;
4739 char *saved_regeol = PL_regeol;
4740 struct re_save_state saved_state;
4743 /* save *all* paren positions */
4745 REGCP_SET(runops_cp);
4747 /* To not corrupt the existing regex state while executing the
4748 * eval we would normally put it on the save stack, like with
4749 * save_re_context. However, re-evals have a weird scoping so we
4750 * can't just add ENTER/LEAVE here. With that, things like
4752 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4754 * would break, as they expect the localisation to be unwound
4755 * only when the re-engine backtracks through the bit that
4758 * What we do instead is just saving the state in a local c
4761 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4763 PL_reg_state.re_reparsing = FALSE;
4766 caller_cv = find_runcv(NULL);
4770 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4772 (REGEXP*)(rexi->data->data[n])
4775 nop = (OP*)rexi->data->data[n+1];
4777 else if (rexi->data->what[n] == 'l') { /* literal code */
4779 nop = (OP*)rexi->data->data[n];
4780 assert(CvDEPTH(newcv));
4783 /* literal with own CV */
4784 assert(rexi->data->what[n] == 'L');
4785 newcv = rex->qr_anoncv;
4786 nop = (OP*)rexi->data->data[n];
4789 /* normally if we're about to execute code from the same
4790 * CV that we used previously, we just use the existing
4791 * CX stack entry. However, its possible that in the
4792 * meantime we may have backtracked, popped from the save
4793 * stack, and undone the SAVECOMPPAD(s) associated with
4794 * PUSH_MULTICALL; in which case PL_comppad no longer
4795 * points to newcv's pad. */
4796 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4798 I32 depth = (newcv == caller_cv) ? 0 : 1;
4799 if (last_pushed_cv) {
4800 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4803 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4805 last_pushed_cv = newcv;
4807 last_pad = PL_comppad;
4809 /* the initial nextstate you would normally execute
4810 * at the start of an eval (which would cause error
4811 * messages to come from the eval), may be optimised
4812 * away from the execution path in the regex code blocks;
4813 * so manually set PL_curcop to it initially */
4815 OP *o = cUNOPx(nop)->op_first;
4816 assert(o->op_type == OP_NULL);
4817 if (o->op_targ == OP_SCOPE) {
4818 o = cUNOPo->op_first;
4821 assert(o->op_targ == OP_LEAVE);
4822 o = cUNOPo->op_first;
4823 assert(o->op_type == OP_ENTER);
4827 if (o->op_type != OP_STUB) {
4828 assert( o->op_type == OP_NEXTSTATE
4829 || o->op_type == OP_DBSTATE
4830 || (o->op_type == OP_NULL
4831 && ( o->op_targ == OP_NEXTSTATE
4832 || o->op_targ == OP_DBSTATE
4836 PL_curcop = (COP*)o;
4841 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4842 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4844 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4847 SV *sv_mrk = get_sv("REGMARK", 1);
4848 sv_setsv(sv_mrk, sv_yes_mark);
4851 /* we don't use MULTICALL here as we want to call the
4852 * first op of the block of interest, rather than the
4853 * first op of the sub */
4854 before = SP-PL_stack_base;
4856 CALLRUNOPS(aTHX); /* Scalar context. */
4858 if (SP-PL_stack_base == before)
4859 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4865 /* before restoring everything, evaluate the returned
4866 * value, so that 'uninit' warnings don't use the wrong
4867 * PL_op or pad. Also need to process any magic vars
4868 * (e.g. $1) *before* parentheses are restored */
4873 if (logical == 0) /* (?{})/ */
4874 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4876 sw = cBOOL(SvTRUE(ret));
4879 else { /* /(??{}) */
4880 /* if its overloaded, let the regex compiler handle
4881 * it; otherwise extract regex, or stringify */
4882 if (!SvAMAGIC(ret)) {
4886 if (SvTYPE(sv) == SVt_REGEXP)
4887 re_sv = (REGEXP*) sv;
4888 else if (SvSMAGICAL(sv)) {
4889 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4891 re_sv = (REGEXP *) mg->mg_obj;
4894 /* force any magic, undef warnings here */
4896 ret = sv_mortalcopy(ret);
4897 (void) SvPV_force_nolen(ret);
4903 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4905 /* *** Note that at this point we don't restore
4906 * PL_comppad, (or pop the CxSUB) on the assumption it may
4907 * be used again soon. This is safe as long as nothing
4908 * in the regexp code uses the pad ! */
4910 PL_curcop = ocurcop;
4911 PL_regeol = saved_regeol;
4912 S_regcp_restore(aTHX_ rex, runops_cp);
4918 /* only /(??{})/ from now on */
4921 /* extract RE object from returned value; compiling if
4925 re_sv = reg_temp_copy(NULL, re_sv);
4929 const I32 osize = PL_regsize;
4931 if (SvUTF8(ret) && IN_BYTES) {
4932 /* In use 'bytes': make a copy of the octet
4933 * sequence, but without the flag on */
4935 const char *const p = SvPV(ret, len);
4936 ret = newSVpvn_flags(p, len, SVs_TEMP);
4938 if (rex->intflags & PREGf_USE_RE_EVAL)
4939 pm_flags |= PMf_USE_RE_EVAL;
4941 /* if we got here, it should be an engine which
4942 * supports compiling code blocks and stuff */
4943 assert(rex->engine && rex->engine->op_comp);
4944 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4945 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4946 rex->engine, NULL, NULL,
4947 /* copy /msix etc to inner pattern */
4952 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4954 /* This isn't a first class regexp. Instead, it's
4955 caching a regexp onto an existing, Perl visible
4957 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4960 /* safe to do now that any $1 etc has been
4961 * interpolated into the new pattern string and
4963 S_regcp_restore(aTHX_ rex, runops_cp);
4968 RXp_MATCH_COPIED_off(re);
4969 re->subbeg = rex->subbeg;
4970 re->sublen = rex->sublen;
4971 re->suboffset = rex->suboffset;
4972 re->subcoffset = rex->subcoffset;
4975 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4976 "Matching embedded");
4978 startpoint = rei->program + 1;
4979 ST.close_paren = 0; /* only used for GOSUB */
4981 eval_recurse_doit: /* Share code with GOSUB below this line */
4982 /* run the pattern returned from (??{...}) */
4983 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
4984 REGCP_SET(ST.lastcp);
4987 re->lastcloseparen = 0;
4991 /* XXXX This is too dramatic a measure... */
4994 ST.toggle_reg_flags = PL_reg_flags;
4996 PL_reg_flags |= RF_utf8;
4998 PL_reg_flags &= ~RF_utf8;
4999 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5001 ST.prev_rex = rex_sv;
5002 ST.prev_curlyx = cur_curlyx;
5004 SET_reg_curpm(rex_sv);
5009 ST.prev_eval = cur_eval;
5011 /* now continue from first node in postoned RE */
5012 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5013 assert(0); /* NOTREACHED */
5016 case EVAL_AB: /* cleanup after a successful (??{A})B */
5017 /* note: this is called twice; first after popping B, then A */
5018 PL_reg_flags ^= ST.toggle_reg_flags;
5019 rex_sv = ST.prev_rex;
5020 SET_reg_curpm(rex_sv);
5021 rex = ReANY(rex_sv);
5022 rexi = RXi_GET(rex);
5024 cur_eval = ST.prev_eval;
5025 cur_curlyx = ST.prev_curlyx;
5027 /* XXXX This is too dramatic a measure... */
5029 if ( nochange_depth )
5034 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5035 /* note: this is called twice; first after popping B, then A */
5036 PL_reg_flags ^= ST.toggle_reg_flags;
5037 rex_sv = ST.prev_rex;
5038 SET_reg_curpm(rex_sv);
5039 rex = ReANY(rex_sv);
5040 rexi = RXi_GET(rex);
5042 REGCP_UNWIND(ST.lastcp);
5044 cur_eval = ST.prev_eval;
5045 cur_curlyx = ST.prev_curlyx;
5046 /* XXXX This is too dramatic a measure... */
5048 if ( nochange_depth )
5054 n = ARG(scan); /* which paren pair */
5055 rex->offs[n].start_tmp = locinput - PL_bostr;
5058 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5059 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5063 (IV)rex->offs[n].start_tmp,
5069 /* XXX really need to log other places start/end are set too */
5070 #define CLOSE_CAPTURE \
5071 rex->offs[n].start = rex->offs[n].start_tmp; \
5072 rex->offs[n].end = locinput - PL_bostr; \
5073 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5074 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5076 PTR2UV(rex->offs), \
5078 (IV)rex->offs[n].start, \
5079 (IV)rex->offs[n].end \
5083 n = ARG(scan); /* which paren pair */
5085 /*if (n > PL_regsize)
5087 if (n > rex->lastparen)
5089 rex->lastcloseparen = n;
5090 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5095 case ACCEPT: /* (*ACCEPT) */
5099 cursor && OP(cursor)!=END;
5100 cursor=regnext(cursor))
5102 if ( OP(cursor)==CLOSE ){
5104 if ( n <= lastopen ) {
5106 /*if (n > PL_regsize)
5108 if (n > rex->lastparen)
5110 rex->lastcloseparen = n;
5111 if ( n == ARG(scan) || (cur_eval &&
5112 cur_eval->u.eval.close_paren == n))
5121 case GROUPP: /* (?(1)) */
5122 n = ARG(scan); /* which paren pair */
5123 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5126 case NGROUPP: /* (?(<name>)) */
5127 /* reg_check_named_buff_matched returns 0 for no match */
5128 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5131 case INSUBP: /* (?(R)) */
5133 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5136 case DEFINEP: /* (?(DEFINE)) */
5140 case IFTHEN: /* (?(cond)A|B) */
5141 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5143 next = NEXTOPER(NEXTOPER(scan));
5145 next = scan + ARG(scan);
5146 if (OP(next) == IFTHEN) /* Fake one. */
5147 next = NEXTOPER(NEXTOPER(next));
5151 case LOGICAL: /* modifier for EVAL and IFMATCH */
5152 logical = scan->flags;
5155 /*******************************************************************
5157 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5158 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5159 STAR/PLUS/CURLY/CURLYN are used instead.)
5161 A*B is compiled as <CURLYX><A><WHILEM><B>
5163 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5164 state, which contains the current count, initialised to -1. It also sets
5165 cur_curlyx to point to this state, with any previous value saved in the
5168 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5169 since the pattern may possibly match zero times (i.e. it's a while {} loop
5170 rather than a do {} while loop).
5172 Each entry to WHILEM represents a successful match of A. The count in the
5173 CURLYX block is incremented, another WHILEM state is pushed, and execution
5174 passes to A or B depending on greediness and the current count.
5176 For example, if matching against the string a1a2a3b (where the aN are
5177 substrings that match /A/), then the match progresses as follows: (the
5178 pushed states are interspersed with the bits of strings matched so far):
5181 <CURLYX cnt=0><WHILEM>
5182 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5183 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5184 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5185 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5187 (Contrast this with something like CURLYM, which maintains only a single
5191 a1 <CURLYM cnt=1> a2
5192 a1 a2 <CURLYM cnt=2> a3
5193 a1 a2 a3 <CURLYM cnt=3> b
5196 Each WHILEM state block marks a point to backtrack to upon partial failure
5197 of A or B, and also contains some minor state data related to that
5198 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5199 overall state, such as the count, and pointers to the A and B ops.
5201 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5202 must always point to the *current* CURLYX block, the rules are:
5204 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5205 and set cur_curlyx to point the new block.
5207 When popping the CURLYX block after a successful or unsuccessful match,
5208 restore the previous cur_curlyx.
5210 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5211 to the outer one saved in the CURLYX block.
5213 When popping the WHILEM block after a successful or unsuccessful B match,
5214 restore the previous cur_curlyx.
5216 Here's an example for the pattern (AI* BI)*BO
5217 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5220 curlyx backtrack stack
5221 ------ ---------------
5223 CO <CO prev=NULL> <WO>
5224 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5225 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5226 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5228 At this point the pattern succeeds, and we work back down the stack to
5229 clean up, restoring as we go:
5231 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5232 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5233 CO <CO prev=NULL> <WO>
5236 *******************************************************************/
5238 #define ST st->u.curlyx
5240 case CURLYX: /* start of /A*B/ (for complex A) */
5242 /* No need to save/restore up to this paren */
5243 I32 parenfloor = scan->flags;
5245 assert(next); /* keep Coverity happy */
5246 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5249 /* XXXX Probably it is better to teach regpush to support
5250 parenfloor > PL_regsize... */
5251 if (parenfloor > (I32)rex->lastparen)
5252 parenfloor = rex->lastparen; /* Pessimization... */
5254 ST.prev_curlyx= cur_curlyx;
5256 ST.cp = PL_savestack_ix;
5258 /* these fields contain the state of the current curly.
5259 * they are accessed by subsequent WHILEMs */
5260 ST.parenfloor = parenfloor;
5265 ST.count = -1; /* this will be updated by WHILEM */
5266 ST.lastloc = NULL; /* this will be updated by WHILEM */
5268 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5269 assert(0); /* NOTREACHED */
5272 case CURLYX_end: /* just finished matching all of A*B */
5273 cur_curlyx = ST.prev_curlyx;
5275 assert(0); /* NOTREACHED */
5277 case CURLYX_end_fail: /* just failed to match all of A*B */
5279 cur_curlyx = ST.prev_curlyx;
5281 assert(0); /* NOTREACHED */
5285 #define ST st->u.whilem
5287 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5289 /* see the discussion above about CURLYX/WHILEM */
5291 int min = ARG1(cur_curlyx->u.curlyx.me);
5292 int max = ARG2(cur_curlyx->u.curlyx.me);
5293 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5295 assert(cur_curlyx); /* keep Coverity happy */
5296 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5297 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5298 ST.cache_offset = 0;
5302 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5303 "%*s whilem: matched %ld out of %d..%d\n",
5304 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5307 /* First just match a string of min A's. */
5310 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5311 cur_curlyx->u.curlyx.lastloc = locinput;
5312 REGCP_SET(ST.lastcp);
5314 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5315 assert(0); /* NOTREACHED */
5318 /* If degenerate A matches "", assume A done. */
5320 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5321 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5322 "%*s whilem: empty match detected, trying continuation...\n",
5323 REPORT_CODE_OFF+depth*2, "")
5325 goto do_whilem_B_max;
5328 /* super-linear cache processing */
5332 if (!PL_reg_maxiter) {
5333 /* start the countdown: Postpone detection until we
5334 * know the match is not *that* much linear. */
5335 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5336 /* possible overflow for long strings and many CURLYX's */
5337 if (PL_reg_maxiter < 0)
5338 PL_reg_maxiter = I32_MAX;
5339 PL_reg_leftiter = PL_reg_maxiter;
5342 if (PL_reg_leftiter-- == 0) {
5343 /* initialise cache */
5344 const I32 size = (PL_reg_maxiter + 7)/8;
5345 if (PL_reg_poscache) {
5346 if ((I32)PL_reg_poscache_size < size) {
5347 Renew(PL_reg_poscache, size, char);
5348 PL_reg_poscache_size = size;
5350 Zero(PL_reg_poscache, size, char);
5353 PL_reg_poscache_size = size;
5354 Newxz(PL_reg_poscache, size, char);
5356 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5357 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5358 PL_colors[4], PL_colors[5])
5362 if (PL_reg_leftiter < 0) {
5363 /* have we already failed at this position? */
5365 offset = (scan->flags & 0xf) - 1
5366 + (locinput - PL_bostr) * (scan->flags>>4);
5367 mask = 1 << (offset % 8);
5369 if (PL_reg_poscache[offset] & mask) {
5370 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5371 "%*s whilem: (cache) already tried at this position...\n",
5372 REPORT_CODE_OFF+depth*2, "")
5374 sayNO; /* cache records failure */
5376 ST.cache_offset = offset;
5377 ST.cache_mask = mask;
5381 /* Prefer B over A for minimal matching. */
5383 if (cur_curlyx->u.curlyx.minmod) {
5384 ST.save_curlyx = cur_curlyx;
5385 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5386 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5387 REGCP_SET(ST.lastcp);
5388 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5390 assert(0); /* NOTREACHED */
5393 /* Prefer A over B for maximal matching. */
5395 if (n < max) { /* More greed allowed? */
5396 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5397 cur_curlyx->u.curlyx.lastloc = locinput;
5398 REGCP_SET(ST.lastcp);
5399 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5400 assert(0); /* NOTREACHED */
5402 goto do_whilem_B_max;
5404 assert(0); /* NOTREACHED */
5406 case WHILEM_B_min: /* just matched B in a minimal match */
5407 case WHILEM_B_max: /* just matched B in a maximal match */
5408 cur_curlyx = ST.save_curlyx;
5410 assert(0); /* NOTREACHED */
5412 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5413 cur_curlyx = ST.save_curlyx;
5414 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5415 cur_curlyx->u.curlyx.count--;
5417 assert(0); /* NOTREACHED */
5419 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5421 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5422 REGCP_UNWIND(ST.lastcp);
5424 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5425 cur_curlyx->u.curlyx.count--;
5427 assert(0); /* NOTREACHED */
5429 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5430 REGCP_UNWIND(ST.lastcp);
5431 regcppop(rex); /* Restore some previous $<digit>s? */
5432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5433 "%*s whilem: failed, trying continuation...\n",
5434 REPORT_CODE_OFF+depth*2, "")
5437 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5438 && ckWARN(WARN_REGEXP)
5439 && !(PL_reg_flags & RF_warned))
5441 PL_reg_flags |= RF_warned;
5442 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5443 "Complex regular subexpression recursion limit (%d) "
5449 ST.save_curlyx = cur_curlyx;
5450 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5451 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5453 assert(0); /* NOTREACHED */
5455 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5456 cur_curlyx = ST.save_curlyx;
5457 REGCP_UNWIND(ST.lastcp);
5460 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5461 /* Maximum greed exceeded */
5462 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5463 && ckWARN(WARN_REGEXP)
5464 && !(PL_reg_flags & RF_warned))
5466 PL_reg_flags |= RF_warned;
5467 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5468 "Complex regular subexpression recursion "
5469 "limit (%d) exceeded",
5472 cur_curlyx->u.curlyx.count--;
5476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5477 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5479 /* Try grabbing another A and see if it helps. */
5480 cur_curlyx->u.curlyx.lastloc = locinput;
5481 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5482 REGCP_SET(ST.lastcp);
5483 PUSH_STATE_GOTO(WHILEM_A_min,
5484 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5486 assert(0); /* NOTREACHED */
5489 #define ST st->u.branch
5491 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5492 next = scan + ARG(scan);
5495 scan = NEXTOPER(scan);
5498 case BRANCH: /* /(...|A|...)/ */
5499 scan = NEXTOPER(scan); /* scan now points to inner node */
5500 ST.lastparen = rex->lastparen;
5501 ST.lastcloseparen = rex->lastcloseparen;
5502 ST.next_branch = next;
5505 /* Now go into the branch */
5507 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5509 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5511 assert(0); /* NOTREACHED */
5513 case CUTGROUP: /* /(*THEN)/ */
5514 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5515 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5516 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5517 assert(0); /* NOTREACHED */
5519 case CUTGROUP_next_fail:
5522 if (st->u.mark.mark_name)
5523 sv_commit = st->u.mark.mark_name;
5525 assert(0); /* NOTREACHED */
5529 assert(0); /* NOTREACHED */
5531 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5536 REGCP_UNWIND(ST.cp);
5537 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5538 scan = ST.next_branch;
5539 /* no more branches? */
5540 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5542 PerlIO_printf( Perl_debug_log,
5543 "%*s %sBRANCH failed...%s\n",
5544 REPORT_CODE_OFF+depth*2, "",
5550 continue; /* execute next BRANCH[J] op */
5551 assert(0); /* NOTREACHED */
5553 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5558 #define ST st->u.curlym
5560 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5562 /* This is an optimisation of CURLYX that enables us to push
5563 * only a single backtracking state, no matter how many matches
5564 * there are in {m,n}. It relies on the pattern being constant
5565 * length, with no parens to influence future backrefs
5569 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5571 ST.lastparen = rex->lastparen;
5572 ST.lastcloseparen = rex->lastcloseparen;
5574 /* if paren positive, emulate an OPEN/CLOSE around A */
5576 U32 paren = ST.me->flags;
5577 if (paren > PL_regsize)
5579 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5587 ST.c1 = CHRTEST_UNINIT;
5590 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5593 curlym_do_A: /* execute the A in /A{m,n}B/ */
5594 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5595 assert(0); /* NOTREACHED */
5597 case CURLYM_A: /* we've just matched an A */
5599 /* after first match, determine A's length: u.curlym.alen */
5600 if (ST.count == 1) {
5601 if (PL_reg_match_utf8) {
5602 char *s = st->locinput;
5603 while (s < locinput) {
5609 ST.alen = locinput - st->locinput;
5612 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5615 PerlIO_printf(Perl_debug_log,
5616 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5617 (int)(REPORT_CODE_OFF+(depth*2)), "",
5618 (IV) ST.count, (IV)ST.alen)
5621 if (cur_eval && cur_eval->u.eval.close_paren &&
5622 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5626 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5627 if ( max == REG_INFTY || ST.count < max )
5628 goto curlym_do_A; /* try to match another A */
5630 goto curlym_do_B; /* try to match B */
5632 case CURLYM_A_fail: /* just failed to match an A */
5633 REGCP_UNWIND(ST.cp);
5635 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5636 || (cur_eval && cur_eval->u.eval.close_paren &&
5637 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5640 curlym_do_B: /* execute the B in /A{m,n}B/ */
5641 if (ST.c1 == CHRTEST_UNINIT) {
5642 /* calculate c1 and c2 for possible match of 1st char
5643 * following curly */
5644 ST.c1 = ST.c2 = CHRTEST_VOID;
5645 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5646 regnode *text_node = ST.B;
5647 if (! HAS_TEXT(text_node))
5648 FIND_NEXT_IMPT(text_node);
5651 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5653 But the former is redundant in light of the latter.
5655 if this changes back then the macro for
5656 IS_TEXT and friends need to change.
5658 if (PL_regkind[OP(text_node)] == EXACT) {
5659 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5660 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5669 PerlIO_printf(Perl_debug_log,
5670 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5671 (int)(REPORT_CODE_OFF+(depth*2)),
5674 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5675 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5676 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5677 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5679 /* simulate B failing */
5681 PerlIO_printf(Perl_debug_log,
5682 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5683 (int)(REPORT_CODE_OFF+(depth*2)),"",
5684 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5685 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5686 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5688 state_num = CURLYM_B_fail;
5689 goto reenter_switch;
5692 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5693 /* simulate B failing */
5695 PerlIO_printf(Perl_debug_log,
5696 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5697 (int)(REPORT_CODE_OFF+(depth*2)),"",
5698 (int) nextchr, ST.c1, ST.c2)
5700 state_num = CURLYM_B_fail;
5701 goto reenter_switch;
5706 /* emulate CLOSE: mark current A as captured */
5707 I32 paren = ST.me->flags;
5709 rex->offs[paren].start
5710 = HOPc(locinput, -ST.alen) - PL_bostr;
5711 rex->offs[paren].end = locinput - PL_bostr;
5712 if ((U32)paren > rex->lastparen)
5713 rex->lastparen = paren;
5714 rex->lastcloseparen = paren;
5717 rex->offs[paren].end = -1;
5718 if (cur_eval && cur_eval->u.eval.close_paren &&
5719 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5728 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5729 assert(0); /* NOTREACHED */
5731 case CURLYM_B_fail: /* just failed to match a B */
5732 REGCP_UNWIND(ST.cp);
5733 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5735 I32 max = ARG2(ST.me);
5736 if (max != REG_INFTY && ST.count == max)
5738 goto curlym_do_A; /* try to match a further A */
5740 /* backtrack one A */
5741 if (ST.count == ARG1(ST.me) /* min */)
5744 SET_locinput(HOPc(locinput, -ST.alen));
5745 goto curlym_do_B; /* try to match B */
5748 #define ST st->u.curly
5750 #define CURLY_SETPAREN(paren, success) \
5753 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5754 rex->offs[paren].end = locinput - PL_bostr; \
5755 if (paren > rex->lastparen) \
5756 rex->lastparen = paren; \
5757 rex->lastcloseparen = paren; \
5760 rex->offs[paren].end = -1; \
5761 rex->lastparen = ST.lastparen; \
5762 rex->lastcloseparen = ST.lastcloseparen; \
5766 case STAR: /* /A*B/ where A is width 1 char */
5770 scan = NEXTOPER(scan);
5773 case PLUS: /* /A+B/ where A is width 1 char */
5777 scan = NEXTOPER(scan);
5780 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5781 ST.paren = scan->flags; /* Which paren to set */
5782 ST.lastparen = rex->lastparen;
5783 ST.lastcloseparen = rex->lastcloseparen;
5784 if (ST.paren > PL_regsize)
5785 PL_regsize = ST.paren;
5786 ST.min = ARG1(scan); /* min to match */
5787 ST.max = ARG2(scan); /* max to match */
5788 if (cur_eval && cur_eval->u.eval.close_paren &&
5789 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5793 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5796 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5798 ST.min = ARG1(scan); /* min to match */
5799 ST.max = ARG2(scan); /* max to match */
5800 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5803 * Lookahead to avoid useless match attempts
5804 * when we know what character comes next.
5806 * Used to only do .*x and .*?x, but now it allows
5807 * for )'s, ('s and (?{ ... })'s to be in the way
5808 * of the quantifier and the EXACT-like node. -- japhy
5811 assert(ST.min <= ST.max);
5812 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5813 ST.c1 = ST.c2 = CHRTEST_VOID;
5816 regnode *text_node = next;
5818 if (! HAS_TEXT(text_node))
5819 FIND_NEXT_IMPT(text_node);
5821 if (! HAS_TEXT(text_node))
5822 ST.c1 = ST.c2 = CHRTEST_VOID;
5824 if ( PL_regkind[OP(text_node)] != EXACT ) {
5825 ST.c1 = ST.c2 = CHRTEST_VOID;
5829 /* Currently we only get here when
5831 PL_rekind[OP(text_node)] == EXACT
5833 if this changes back then the macro for IS_TEXT and
5834 friends need to change. */
5835 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5836 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5847 char *li = locinput;
5849 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5854 if (ST.c1 == CHRTEST_VOID)
5855 goto curly_try_B_min;
5857 ST.oldloc = locinput;
5859 /* set ST.maxpos to the furthest point along the
5860 * string that could possibly match */
5861 if (ST.max == REG_INFTY) {
5862 ST.maxpos = PL_regeol - 1;
5864 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5867 else if (utf8_target) {
5868 int m = ST.max - ST.min;
5869 for (ST.maxpos = locinput;
5870 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5871 ST.maxpos += UTF8SKIP(ST.maxpos);
5874 ST.maxpos = locinput + ST.max - ST.min;
5875 if (ST.maxpos >= PL_regeol)
5876 ST.maxpos = PL_regeol - 1;
5878 goto curly_try_B_min_known;
5882 /* avoid taking address of locinput, so it can remain
5884 char *li = locinput;
5885 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5886 if (ST.count < ST.min)
5889 if ((ST.count > ST.min)
5890 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5892 /* A{m,n} must come at the end of the string, there's
5893 * no point in backing off ... */
5895 /* ...except that $ and \Z can match before *and* after
5896 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5897 We may back off by one in this case. */
5898 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5902 goto curly_try_B_max;
5904 assert(0); /* NOTREACHED */
5907 case CURLY_B_min_known_fail:
5908 /* failed to find B in a non-greedy match where c1,c2 valid */
5910 REGCP_UNWIND(ST.cp);
5912 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5914 /* Couldn't or didn't -- move forward. */
5915 ST.oldloc = locinput;
5917 locinput += UTF8SKIP(locinput);
5921 curly_try_B_min_known:
5922 /* find the next place where 'B' could work, then call B */
5926 n = (ST.oldloc == locinput) ? 0 : 1;
5927 if (ST.c1 == ST.c2) {
5928 /* set n to utf8_distance(oldloc, locinput) */
5929 while (locinput <= ST.maxpos
5930 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5932 locinput += UTF8SKIP(locinput);
5937 /* set n to utf8_distance(oldloc, locinput) */
5938 while (locinput <= ST.maxpos
5939 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5940 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5942 locinput += UTF8SKIP(locinput);
5947 else { /* Not utf8_target */
5948 if (ST.c1 == ST.c2) {
5949 while (locinput <= ST.maxpos &&
5950 UCHARAT(locinput) != ST.c1)
5954 while (locinput <= ST.maxpos
5955 && UCHARAT(locinput) != ST.c1
5956 && UCHARAT(locinput) != ST.c2)
5959 n = locinput - ST.oldloc;
5961 if (locinput > ST.maxpos)
5964 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5965 * at b; check that everything between oldloc and
5966 * locinput matches */
5967 char *li = ST.oldloc;
5969 if (regrepeat(rex, &li, ST.A, n, depth) < n)
5971 assert(n == REG_INFTY || locinput == li);
5973 CURLY_SETPAREN(ST.paren, ST.count);
5974 if (cur_eval && cur_eval->u.eval.close_paren &&
5975 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5978 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5980 assert(0); /* NOTREACHED */
5983 case CURLY_B_min_fail:
5984 /* failed to find B in a non-greedy match where c1,c2 invalid */
5986 REGCP_UNWIND(ST.cp);
5988 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5990 /* failed -- move forward one */
5992 char *li = locinput;
5993 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6000 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6001 ST.count > 0)) /* count overflow ? */
6004 CURLY_SETPAREN(ST.paren, ST.count);
6005 if (cur_eval && cur_eval->u.eval.close_paren &&
6006 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6009 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6013 assert(0); /* NOTREACHED */
6017 /* a successful greedy match: now try to match B */
6018 if (cur_eval && cur_eval->u.eval.close_paren &&
6019 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6023 bool could_match = locinput < PL_regeol;
6025 /* If it could work, try it. */
6026 if (ST.c1 != CHRTEST_VOID && could_match) {
6027 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6029 could_match = memEQ(locinput,
6034 UTF8SKIP(locinput));
6037 could_match = UCHARAT(locinput) == ST.c1
6038 || UCHARAT(locinput) == ST.c2;
6041 if (ST.c1 == CHRTEST_VOID || could_match) {
6042 CURLY_SETPAREN(ST.paren, ST.count);
6043 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6044 assert(0); /* NOTREACHED */
6049 case CURLY_B_max_fail:
6050 /* failed to find B in a greedy match */
6052 REGCP_UNWIND(ST.cp);
6054 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6057 if (--ST.count < ST.min)
6059 locinput = HOPc(locinput, -1);
6060 goto curly_try_B_max;
6064 case END: /* last op of main pattern */
6067 /* we've just finished A in /(??{A})B/; now continue with B */
6068 st->u.eval.toggle_reg_flags
6069 = cur_eval->u.eval.toggle_reg_flags;
6070 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6072 st->u.eval.prev_rex = rex_sv; /* inner */
6073 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6074 rex_sv = cur_eval->u.eval.prev_rex;
6075 SET_reg_curpm(rex_sv);
6076 rex = ReANY(rex_sv);
6077 rexi = RXi_GET(rex);
6078 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6080 REGCP_SET(st->u.eval.lastcp);
6082 /* Restore parens of the outer rex without popping the
6084 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6086 st->u.eval.prev_eval = cur_eval;
6087 cur_eval = cur_eval->u.eval.prev_eval;
6089 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6090 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6091 if ( nochange_depth )
6094 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6095 locinput); /* match B */
6098 if (locinput < reginfo->till) {
6099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6100 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6102 (long)(locinput - PL_reg_starttry),
6103 (long)(reginfo->till - PL_reg_starttry),
6106 sayNO_SILENT; /* Cannot match: too short. */
6108 sayYES; /* Success! */
6110 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6112 PerlIO_printf(Perl_debug_log,
6113 "%*s %ssubpattern success...%s\n",
6114 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6115 sayYES; /* Success! */
6118 #define ST st->u.ifmatch
6123 case SUSPEND: /* (?>A) */
6125 newstart = locinput;
6128 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6130 goto ifmatch_trivial_fail_test;
6132 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6134 ifmatch_trivial_fail_test:
6136 char * const s = HOPBACKc(locinput, scan->flags);
6141 sw = 1 - cBOOL(ST.wanted);
6145 next = scan + ARG(scan);
6153 newstart = locinput;
6157 ST.logical = logical;
6158 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6160 /* execute body of (?...A) */
6161 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6162 assert(0); /* NOTREACHED */
6165 case IFMATCH_A_fail: /* body of (?...A) failed */
6166 ST.wanted = !ST.wanted;
6169 case IFMATCH_A: /* body of (?...A) succeeded */
6171 sw = cBOOL(ST.wanted);
6173 else if (!ST.wanted)
6176 if (OP(ST.me) != SUSPEND) {
6177 /* restore old position except for (?>...) */
6178 locinput = st->locinput;
6180 scan = ST.me + ARG(ST.me);
6183 continue; /* execute B */
6187 case LONGJMP: /* alternative with many branches compiles to
6188 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6189 next = scan + ARG(scan);
6194 case COMMIT: /* (*COMMIT) */
6195 reginfo->cutpoint = PL_regeol;
6198 case PRUNE: /* (*PRUNE) */
6200 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6201 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6202 assert(0); /* NOTREACHED */
6204 case COMMIT_next_fail:
6208 case OPFAIL: /* (*FAIL) */
6210 assert(0); /* NOTREACHED */
6212 #define ST st->u.mark
6213 case MARKPOINT: /* (*MARK:foo) */
6214 ST.prev_mark = mark_state;
6215 ST.mark_name = sv_commit = sv_yes_mark
6216 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6218 ST.mark_loc = locinput;
6219 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6220 assert(0); /* NOTREACHED */
6222 case MARKPOINT_next:
6223 mark_state = ST.prev_mark;
6225 assert(0); /* NOTREACHED */
6227 case MARKPOINT_next_fail:
6228 if (popmark && sv_eq(ST.mark_name,popmark))
6230 if (ST.mark_loc > startpoint)
6231 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6232 popmark = NULL; /* we found our mark */
6233 sv_commit = ST.mark_name;
6236 PerlIO_printf(Perl_debug_log,
6237 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6238 REPORT_CODE_OFF+depth*2, "",
6239 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6242 mark_state = ST.prev_mark;
6243 sv_yes_mark = mark_state ?
6244 mark_state->u.mark.mark_name : NULL;
6246 assert(0); /* NOTREACHED */
6248 case SKIP: /* (*SKIP) */
6250 /* (*SKIP) : if we fail we cut here*/
6251 ST.mark_name = NULL;
6252 ST.mark_loc = locinput;
6253 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6255 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6256 otherwise do nothing. Meaning we need to scan
6258 regmatch_state *cur = mark_state;
6259 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6262 if ( sv_eq( cur->u.mark.mark_name,
6265 ST.mark_name = find;
6266 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6268 cur = cur->u.mark.prev_mark;
6271 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6274 case SKIP_next_fail:
6276 /* (*CUT:NAME) - Set up to search for the name as we
6277 collapse the stack*/
6278 popmark = ST.mark_name;
6280 /* (*CUT) - No name, we cut here.*/
6281 if (ST.mark_loc > startpoint)
6282 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6283 /* but we set sv_commit to latest mark_name if there
6284 is one so they can test to see how things lead to this
6287 sv_commit=mark_state->u.mark.mark_name;
6291 assert(0); /* NOTREACHED */
6294 case LNBREAK: /* \R */
6295 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6301 #define CASE_CLASS(nAmE) \
6303 if (NEXTCHR_IS_EOS) \
6305 if ((n=is_##nAmE(locinput,utf8_target))) { \
6311 if (NEXTCHR_IS_EOS) \
6313 if ((n=is_##nAmE(locinput,utf8_target))) { \
6316 locinput += UTF8SKIP(locinput); \
6320 CASE_CLASS(VERTWS); /* \v \V */
6321 CASE_CLASS(HORIZWS); /* \h \H */
6325 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6326 PTR2UV(scan), OP(scan));
6327 Perl_croak(aTHX_ "regexp memory corruption");
6329 /* this is a point to jump to in order to increment
6330 * locinput by one character */
6332 assert(!NEXTCHR_IS_EOS);
6334 locinput += PL_utf8skip[nextchr];
6335 /* locinput is allowed to go 1 char off the end, but not 2+ */
6336 if (locinput > PL_regeol)
6345 /* switch break jumps here */
6346 scan = next; /* prepare to execute the next op and ... */
6347 continue; /* ... jump back to the top, reusing st */
6348 assert(0); /* NOTREACHED */
6351 /* push a state that backtracks on success */
6352 st->u.yes.prev_yes_state = yes_state;
6356 /* push a new regex state, then continue at scan */
6358 regmatch_state *newst;
6361 regmatch_state *cur = st;
6362 regmatch_state *curyes = yes_state;
6364 regmatch_slab *slab = PL_regmatch_slab;
6365 for (;curd > -1;cur--,curd--) {
6366 if (cur < SLAB_FIRST(slab)) {
6368 cur = SLAB_LAST(slab);
6370 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6371 REPORT_CODE_OFF + 2 + depth * 2,"",
6372 curd, PL_reg_name[cur->resume_state],
6373 (curyes == cur) ? "yes" : ""
6376 curyes = cur->u.yes.prev_yes_state;
6379 DEBUG_STATE_pp("push")
6382 st->locinput = locinput;
6384 if (newst > SLAB_LAST(PL_regmatch_slab))
6385 newst = S_push_slab(aTHX);
6386 PL_regmatch_state = newst;
6388 locinput = pushinput;
6391 assert(0); /* NOTREACHED */
6396 * We get here only if there's trouble -- normally "case END" is
6397 * the terminating point.
6399 Perl_croak(aTHX_ "corrupted regexp pointers");
6405 /* we have successfully completed a subexpression, but we must now
6406 * pop to the state marked by yes_state and continue from there */
6407 assert(st != yes_state);
6409 while (st != yes_state) {
6411 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6412 PL_regmatch_slab = PL_regmatch_slab->prev;
6413 st = SLAB_LAST(PL_regmatch_slab);
6417 DEBUG_STATE_pp("pop (no final)");
6419 DEBUG_STATE_pp("pop (yes)");
6425 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6426 || yes_state > SLAB_LAST(PL_regmatch_slab))
6428 /* not in this slab, pop slab */
6429 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6430 PL_regmatch_slab = PL_regmatch_slab->prev;
6431 st = SLAB_LAST(PL_regmatch_slab);
6433 depth -= (st - yes_state);
6436 yes_state = st->u.yes.prev_yes_state;
6437 PL_regmatch_state = st;
6440 locinput= st->locinput;
6441 state_num = st->resume_state + no_final;
6442 goto reenter_switch;
6445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6446 PL_colors[4], PL_colors[5]));
6448 if (PL_reg_state.re_state_eval_setup_done) {
6449 /* each successfully executed (?{...}) block does the equivalent of
6450 * local $^R = do {...}
6451 * When popping the save stack, all these locals would be undone;
6452 * bypass this by setting the outermost saved $^R to the latest
6454 if (oreplsv != GvSV(PL_replgv))
6455 sv_setsv(oreplsv, GvSV(PL_replgv));
6462 PerlIO_printf(Perl_debug_log,
6463 "%*s %sfailed...%s\n",
6464 REPORT_CODE_OFF+depth*2, "",
6465 PL_colors[4], PL_colors[5])
6477 /* there's a previous state to backtrack to */
6479 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6480 PL_regmatch_slab = PL_regmatch_slab->prev;
6481 st = SLAB_LAST(PL_regmatch_slab);
6483 PL_regmatch_state = st;
6484 locinput= st->locinput;
6486 DEBUG_STATE_pp("pop");
6488 if (yes_state == st)
6489 yes_state = st->u.yes.prev_yes_state;
6491 state_num = st->resume_state + 1; /* failure = success + 1 */
6492 goto reenter_switch;
6497 if (rex->intflags & PREGf_VERBARG_SEEN) {
6498 SV *sv_err = get_sv("REGERROR", 1);
6499 SV *sv_mrk = get_sv("REGMARK", 1);
6501 sv_commit = &PL_sv_no;
6503 sv_yes_mark = &PL_sv_yes;
6506 sv_commit = &PL_sv_yes;
6507 sv_yes_mark = &PL_sv_no;
6509 sv_setsv(sv_err, sv_commit);
6510 sv_setsv(sv_mrk, sv_yes_mark);
6514 if (last_pushed_cv) {
6517 PERL_UNUSED_VAR(SP);
6520 /* clean up; in particular, free all slabs above current one */
6521 LEAVE_SCOPE(oldsave);
6523 assert(!result || locinput - PL_bostr >= 0);
6524 return result ? locinput - PL_bostr : -1;
6528 - regrepeat - repeatedly match something simple, report how many
6530 * What 'simple' means is a node which can be the operand of a quantifier like
6533 * startposp - pointer a pointer to the start position. This is updated
6534 * to point to the byte following the highest successful
6536 * p - the regnode to be repeatedly matched against.
6537 * max - maximum number of things to match.
6538 * depth - (for debugging) backtracking depth.
6541 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6544 char *scan; /* Pointer to current position in target string */
6546 char *loceol = PL_regeol; /* local version */
6547 I32 hardcount = 0; /* How many matches so far */
6548 bool utf8_target = PL_reg_match_utf8;
6551 PERL_UNUSED_ARG(depth);
6554 PERL_ARGS_ASSERT_REGREPEAT;
6557 if (max == REG_INFTY)
6559 else if (! utf8_target && scan + max < loceol)
6560 loceol = scan + max;
6562 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6563 * to the maximum of how far we should go in it (leaving it set to the real
6564 * end, if the maximum permissible would take us beyond that). This allows
6565 * us to make the loop exit condition that we haven't gone past <loceol> to
6566 * also mean that we haven't exceeded the max permissible count, saving a
6567 * test each time through the loop. But it assumes that the OP matches a
6568 * single byte, which is true for most of the OPs below when applied to a
6569 * non-UTF-8 target. Those relatively few OPs that don't have this
6570 * characteristic will have to compensate.
6572 * There is no adjustment for UTF-8 targets, as the number of bytes per
6573 * character varies. OPs will have to test both that the count is less
6574 * than the max permissible (using <hardcount> to keep track), and that we
6575 * are still within the bounds of the string (using <loceol>. A few OPs
6576 * match a single byte no matter what the encoding. They can omit the max
6577 * test if, for the UTF-8 case, they do the adjustment that was skipped
6580 * Thus, the code above sets things up for the common case; and exceptional
6581 * cases need extra work; the common case is to make sure <scan> doesn't
6582 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6583 * count doesn't exceed the maximum permissible */
6588 while (scan < loceol && hardcount < max && *scan != '\n') {
6589 scan += UTF8SKIP(scan);
6593 while (scan < loceol && *scan != '\n')
6599 while (scan < loceol && hardcount < max) {
6600 scan += UTF8SKIP(scan);
6607 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6608 if (utf8_target && scan + max < loceol) {
6610 /* <loceol> hadn't been adjusted in the UTF-8 case */
6618 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6622 /* Can use a simple loop if the pattern char to match on is invariant
6623 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6624 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6625 * true iff it doesn't matter if the argument is in UTF-8 or not */
6626 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6627 if (utf8_target && scan + max < loceol) {
6628 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6629 * since here, to match at all, 1 char == 1 byte */
6630 loceol = scan + max;
6632 while (scan < loceol && UCHARAT(scan) == c) {
6636 else if (UTF_PATTERN) {
6638 STRLEN scan_char_len;
6640 /* When both target and pattern are UTF-8, we have to do
6642 while (hardcount < max
6643 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6644 && scan_char_len <= STR_LEN(p)
6645 && memEQ(scan, STRING(p), scan_char_len))
6647 scan += scan_char_len;
6651 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6653 /* Target isn't utf8; convert the character in the UTF-8
6654 * pattern to non-UTF8, and do a simple loop */
6655 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6656 while (scan < loceol && UCHARAT(scan) == c) {
6659 } /* else pattern char is above Latin1, can't possibly match the
6664 /* Here, the string must be utf8; pattern isn't, and <c> is
6665 * different in utf8 than not, so can't compare them directly.
6666 * Outside the loop, find the two utf8 bytes that represent c, and
6667 * then look for those in sequence in the utf8 string */
6668 U8 high = UTF8_TWO_BYTE_HI(c);
6669 U8 low = UTF8_TWO_BYTE_LO(c);
6671 while (hardcount < max
6672 && scan + 1 < loceol
6673 && UCHARAT(scan) == high
6674 && UCHARAT(scan + 1) == low)
6683 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6687 PL_reg_flags |= RF_tainted;
6688 utf8_flags = FOLDEQ_UTF8_LOCALE;
6696 case EXACTFU_TRICKYFOLD:
6698 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6702 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6704 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6706 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6707 if (c1 == CHRTEST_VOID) {
6708 /* Use full Unicode fold matching */
6709 char *tmpeol = PL_regeol;
6710 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6711 while (hardcount < max
6712 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6713 STRING(p), NULL, pat_len,
6714 cBOOL(UTF_PATTERN), utf8_flags))
6721 else if (utf8_target) {
6723 while (scan < loceol
6725 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6727 scan += UTF8SKIP(scan);
6732 while (scan < loceol
6734 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6735 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6737 scan += UTF8SKIP(scan);
6742 else if (c1 == c2) {
6743 while (scan < loceol && UCHARAT(scan) == c1) {
6748 while (scan < loceol &&
6749 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6760 while (hardcount < max
6761 && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6762 && reginclass(prog, p, (U8*)scan, utf8_target))
6768 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6775 LOAD_UTF8_CHARCLASS_ALNUM();
6776 while (hardcount < max && scan < loceol &&
6777 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6779 scan += UTF8SKIP(scan);
6783 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6791 while (scan < loceol && isALNUM((U8) *scan)) {
6796 if (utf8_target && scan + max < loceol) {
6798 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6799 * since here, to match, 1 char == 1 byte */
6800 loceol = scan + max;
6802 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6807 PL_reg_flags |= RF_tainted;
6809 while (hardcount < max && scan < loceol &&
6810 isALNUM_LC_utf8((U8*)scan)) {
6811 scan += UTF8SKIP(scan);
6815 while (scan < loceol && isALNUM_LC(*scan))
6824 LOAD_UTF8_CHARCLASS_ALNUM();
6825 while (hardcount < max && scan < loceol &&
6826 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6828 scan += UTF8SKIP(scan);
6832 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6839 goto utf8_Nwordchar;
6840 while (scan < loceol && ! isALNUM((U8) *scan)) {
6846 if (utf8_target && scan + max < loceol) {
6848 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6849 * since here, to match, 1 char == 1 byte */
6850 loceol = scan + max;
6852 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6858 while (scan < loceol && hardcount < max
6859 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6861 scan += UTF8SKIP(scan);
6866 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6873 while (scan < loceol && hardcount < max
6874 && ! isWORDCHAR_A((U8) *scan))
6876 scan += UTF8SKIP(scan);
6881 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6887 PL_reg_flags |= RF_tainted;
6889 while (hardcount < max && scan < loceol &&
6890 !isALNUM_LC_utf8((U8*)scan)) {
6891 scan += UTF8SKIP(scan);
6895 while (scan < loceol && !isALNUM_LC(*scan))
6904 LOAD_UTF8_CHARCLASS_SPACE();
6905 while (hardcount < max && scan < loceol &&
6907 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6909 scan += UTF8SKIP(scan);
6915 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6924 while (scan < loceol && isSPACE((U8) *scan)) {
6929 if (utf8_target && scan + max < loceol) {
6931 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6932 * since here, to match, 1 char == 1 byte */
6933 loceol = scan + max;
6935 while (scan < loceol && isSPACE_A((U8) *scan)) {
6940 PL_reg_flags |= RF_tainted;
6942 while (hardcount < max && scan < loceol &&
6943 isSPACE_LC_utf8((U8*)scan)) {
6944 scan += UTF8SKIP(scan);
6948 while (scan < loceol && isSPACE_LC(*scan))
6957 LOAD_UTF8_CHARCLASS_SPACE();
6958 while (hardcount < max && scan < loceol &&
6960 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6962 scan += UTF8SKIP(scan);
6968 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6977 while (scan < loceol && ! isSPACE((U8) *scan)) {
6983 while (hardcount < max && scan < loceol
6984 && ! isSPACE_A((U8) *scan))
6986 scan += UTF8SKIP(scan);
6991 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6997 PL_reg_flags |= RF_tainted;
6999 while (hardcount < max && scan < loceol &&
7000 !isSPACE_LC_utf8((U8*)scan)) {
7001 scan += UTF8SKIP(scan);
7005 while (scan < loceol && !isSPACE_LC(*scan))
7011 LOAD_UTF8_CHARCLASS_DIGIT();
7012 while (hardcount < max && scan < loceol &&
7013 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7014 scan += UTF8SKIP(scan);
7018 while (scan < loceol && isDIGIT(*scan))
7023 if (utf8_target && scan + max < loceol) {
7025 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7026 * since here, to match, 1 char == 1 byte */
7027 loceol = scan + max;
7029 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7034 PL_reg_flags |= RF_tainted;
7036 while (hardcount < max && scan < loceol &&
7037 isDIGIT_LC_utf8((U8*)scan)) {
7038 scan += UTF8SKIP(scan);
7042 while (scan < loceol && isDIGIT_LC(*scan))
7048 LOAD_UTF8_CHARCLASS_DIGIT();
7049 while (hardcount < max && scan < loceol &&
7050 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7051 scan += UTF8SKIP(scan);
7055 while (scan < loceol && !isDIGIT(*scan))
7061 while (hardcount < max && scan < loceol
7062 && ! isDIGIT_A((U8) *scan)) {
7063 scan += UTF8SKIP(scan);
7068 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7074 PL_reg_flags |= RF_tainted;
7076 while (hardcount < max && scan < loceol &&
7077 !isDIGIT_LC_utf8((U8*)scan)) {
7078 scan += UTF8SKIP(scan);
7082 while (scan < loceol && !isDIGIT_LC(*scan))
7088 while (hardcount < max && scan < loceol &&
7089 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7094 /* LNBREAK can match one or two latin chars, which is ok, but we
7095 * have to use hardcount in this situation, and throw away the
7096 * adjustment to <loceol> done before the switch statement */
7098 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7106 while (hardcount < max && scan < loceol &&
7107 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7113 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7119 while (hardcount < max && scan < loceol &&
7120 !is_HORIZWS_utf8_safe(scan, loceol))
7122 scan += UTF8SKIP(scan);
7126 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7133 while (hardcount < max && scan < loceol &&
7134 (c=is_VERTWS_utf8_safe(scan, loceol)))
7140 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7147 while (hardcount < max && scan < loceol &&
7148 !is_VERTWS_utf8_safe(scan, loceol))
7150 scan += UTF8SKIP(scan);
7154 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7174 /* These are all 0 width, so match right here or not at all. */
7178 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7179 assert(0); /* NOTREACHED */
7186 c = scan - *startposp;
7190 GET_RE_DEBUG_FLAGS_DECL;
7192 SV * const prop = sv_newmortal();
7193 regprop(prog, prop, p);
7194 PerlIO_printf(Perl_debug_log,
7195 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7196 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7204 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7206 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7207 create a copy so that changes the caller makes won't change the shared one.
7208 If <altsvp> is non-null, will return NULL in it, for back-compat.
7211 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7213 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7219 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7224 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7226 /* Returns the swash for the input 'node' in the regex 'prog'.
7227 * If <doinit> is true, will attempt to create the swash if not already
7229 * If <listsvp> is non-null, will return the swash initialization string in
7231 * Tied intimately to how regcomp.c sets up the data structure */
7238 RXi_GET_DECL(prog,progi);
7239 const struct reg_data * const data = prog ? progi->data : NULL;
7241 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7243 assert(ANYOF_NONBITMAP(node));
7245 if (data && data->count) {
7246 const U32 n = ARG(node);
7248 if (data->what[n] == 's') {
7249 SV * const rv = MUTABLE_SV(data->data[n]);
7250 AV * const av = MUTABLE_AV(SvRV(rv));
7251 SV **const ary = AvARRAY(av);
7252 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7254 si = *ary; /* ary[0] = the string to initialize the swash with */
7256 /* Elements 2 and 3 are either both present or both absent. [2] is
7257 * any inversion list generated at compile time; [3] indicates if
7258 * that inversion list has any user-defined properties in it. */
7259 if (av_len(av) >= 2) {
7262 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7269 /* Element [1] is reserved for the set-up swash. If already there,
7270 * return it; if not, create it and store it there */
7271 if (SvROK(ary[1])) {
7274 else if (si && doinit) {
7276 sw = _core_swash_init("utf8", /* the utf8 package */
7280 0, /* not from tr/// */
7283 (void)av_store(av, 1, sw);
7289 SV* matches_string = newSVpvn("", 0);
7291 /* Use the swash, if any, which has to have incorporated into it all
7293 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7294 && (si && si != &PL_sv_undef))
7297 /* If no swash, use the input initialization string, if available */
7298 sv_catsv(matches_string, si);
7301 /* Add the inversion list to whatever we have. This may have come from
7302 * the swash, or from an input parameter */
7304 sv_catsv(matches_string, _invlist_contents(invlist));
7306 *listsvp = matches_string;
7313 - reginclass - determine if a character falls into a character class
7315 n is the ANYOF regnode
7316 p is the target string
7317 utf8_target tells whether p is in UTF-8.
7319 Returns true if matched; false otherwise.
7321 Note that this can be a synthetic start class, a combination of various
7322 nodes, so things you think might be mutually exclusive, such as locale,
7323 aren't. It can match both locale and non-locale
7328 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
7331 const char flags = ANYOF_FLAGS(n);
7335 PERL_ARGS_ASSERT_REGINCLASS;
7337 /* If c is not already the code point, get it. Note that
7338 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7339 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7341 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7342 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7343 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7344 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7345 * UTF8_ALLOW_FFFF */
7346 if (c_len == (STRLEN)-1)
7347 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7350 /* If this character is potentially in the bitmap, check it */
7352 if (ANYOF_BITMAP_TEST(n, c))
7354 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7360 else if (flags & ANYOF_LOCALE) {
7361 PL_reg_flags |= RF_tainted;
7363 if ((flags & ANYOF_LOC_FOLD)
7364 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7368 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7369 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7370 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7371 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7372 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7373 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7374 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7375 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7376 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7377 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7378 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
7379 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7380 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
7381 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7382 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7383 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7384 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7385 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7386 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7387 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7388 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7389 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7390 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7391 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7392 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7393 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7394 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7395 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7396 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
7397 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7398 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
7399 ) /* How's that for a conditional? */
7406 /* If the bitmap didn't (or couldn't) match, and something outside the
7407 * bitmap could match, try that. Locale nodes specify completely the
7408 * behavior of code points in the bit map (otherwise, a utf8 target would
7409 * cause them to be treated as Unicode and not locale), except in
7410 * the very unlikely event when this node is a synthetic start class, which
7411 * could be a combination of locale and non-locale nodes. So allow locale
7412 * to match for the synthetic start class, which will give a false
7413 * positive that will be resolved when the match is done again as not part
7414 * of the synthetic start class */
7416 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7417 match = TRUE; /* Everything above 255 matches */
7419 else if (ANYOF_NONBITMAP(n)
7420 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7423 || (! (flags & ANYOF_LOCALE))
7424 || (flags & ANYOF_IS_SYNTHETIC)))))
7426 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7431 } else { /* Convert to utf8 */
7433 utf8_p = bytes_to_utf8(p, &len);
7436 if (swash_fetch(sw, utf8_p, TRUE)) {
7440 /* If we allocated a string above, free it */
7441 if (! utf8_target) Safefree(utf8_p);
7445 if (UNICODE_IS_SUPER(c)
7446 && (flags & ANYOF_WARN_SUPER)
7447 && ckWARN_d(WARN_NON_UNICODE))
7449 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7450 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7454 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7455 return cBOOL(flags & ANYOF_INVERT) ^ match;
7459 S_reghop3(U8 *s, I32 off, const U8* lim)
7461 /* return the position 'off' UTF-8 characters away from 's', forward if
7462 * 'off' >= 0, backwards if negative. But don't go outside of position
7463 * 'lim', which better be < s if off < 0 */
7467 PERL_ARGS_ASSERT_REGHOP3;
7470 while (off-- && s < lim) {
7471 /* XXX could check well-formedness here */
7476 while (off++ && s > lim) {
7478 if (UTF8_IS_CONTINUED(*s)) {
7479 while (s > lim && UTF8_IS_CONTINUATION(*s))
7482 /* XXX could check well-formedness here */
7489 /* there are a bunch of places where we use two reghop3's that should
7490 be replaced with this routine. but since thats not done yet
7491 we ifdef it out - dmq
7494 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7498 PERL_ARGS_ASSERT_REGHOP4;
7501 while (off-- && s < rlim) {
7502 /* XXX could check well-formedness here */
7507 while (off++ && s > llim) {
7509 if (UTF8_IS_CONTINUED(*s)) {
7510 while (s > llim && UTF8_IS_CONTINUATION(*s))
7513 /* XXX could check well-formedness here */
7521 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7525 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7528 while (off-- && s < lim) {
7529 /* XXX could check well-formedness here */
7536 while (off++ && s > lim) {
7538 if (UTF8_IS_CONTINUED(*s)) {
7539 while (s > lim && UTF8_IS_CONTINUATION(*s))
7542 /* XXX could check well-formedness here */
7551 restore_pos(pTHX_ void *arg)
7554 regexp * const rex = (regexp *)arg;
7555 if (PL_reg_state.re_state_eval_setup_done) {
7556 if (PL_reg_oldsaved) {
7557 rex->subbeg = PL_reg_oldsaved;
7558 rex->sublen = PL_reg_oldsavedlen;
7559 rex->suboffset = PL_reg_oldsavedoffset;
7560 rex->subcoffset = PL_reg_oldsavedcoffset;
7561 #ifdef PERL_OLD_COPY_ON_WRITE
7562 rex->saved_copy = PL_nrs;
7564 RXp_MATCH_COPIED_on(rex);
7566 PL_reg_magic->mg_len = PL_reg_oldpos;
7567 PL_reg_state.re_state_eval_setup_done = FALSE;
7568 PL_curpm = PL_reg_oldcurpm;
7573 S_to_utf8_substr(pTHX_ register regexp *prog)
7575 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7576 * on the converted value */
7580 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7583 if (prog->substrs->data[i].substr
7584 && !prog->substrs->data[i].utf8_substr) {
7585 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7586 prog->substrs->data[i].utf8_substr = sv;
7587 sv_utf8_upgrade(sv);
7588 if (SvVALID(prog->substrs->data[i].substr)) {
7589 if (SvTAIL(prog->substrs->data[i].substr)) {
7590 /* Trim the trailing \n that fbm_compile added last
7592 SvCUR_set(sv, SvCUR(sv) - 1);
7593 /* Whilst this makes the SV technically "invalid" (as its
7594 buffer is no longer followed by "\0") when fbm_compile()
7595 adds the "\n" back, a "\0" is restored. */
7596 fbm_compile(sv, FBMcf_TAIL);
7600 if (prog->substrs->data[i].substr == prog->check_substr)
7601 prog->check_utf8 = sv;
7607 S_to_byte_substr(pTHX_ register regexp *prog)
7609 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7610 * on the converted value; returns FALSE if can't be converted. */
7615 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7618 if (prog->substrs->data[i].utf8_substr
7619 && !prog->substrs->data[i].substr) {
7620 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7621 if (! sv_utf8_downgrade(sv, TRUE)) {
7624 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7625 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7626 /* Trim the trailing \n that fbm_compile added last
7628 SvCUR_set(sv, SvCUR(sv) - 1);
7629 fbm_compile(sv, FBMcf_TAIL);
7633 prog->substrs->data[i].substr = sv;
7634 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7635 prog->check_substr = sv;
7642 /* These constants are for finding GCB=LV and GCB=LVT. These are for the
7643 * pre-composed Hangul syllables, which are all in a contiguous block and
7644 * arranged there in such a way so as to facilitate alorithmic determination of
7645 * their characteristics. As such, they don't need a swash, but can be
7646 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7648 #define SBASE 0xAC00 /* Start of block */
7649 #define SCount 11172 /* Length of block */
7652 #if 0 /* This routine is not currently used */
7653 PERL_STATIC_INLINE bool
7654 S_is_utf8_X_LV(pTHX_ const U8 *p)
7656 /* Unlike most other similarly named routines here, this does not create a
7657 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7661 UV cp = valid_utf8_to_uvchr(p, NULL);
7663 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7665 /* The earliest Unicode releases did not have these precomposed Hangul
7666 * syllables. Set to point to undef in that case, so will return false on
7668 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7669 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7670 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7671 SvREFCNT_dec(PL_utf8_X_LV);
7672 PL_utf8_X_LV = &PL_sv_undef;
7676 return (PL_utf8_X_LV != &PL_sv_undef
7677 && cp >= SBASE && cp < SBASE + SCount
7678 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7682 PERL_STATIC_INLINE bool
7683 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7685 /* Unlike most other similarly named routines here, this does not create a
7686 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7690 UV cp = valid_utf8_to_uvchr(p, NULL);
7692 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7694 /* The earliest Unicode releases did not have these precomposed Hangul
7695 * syllables. Set to point to undef in that case, so will return false on
7697 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7698 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7699 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7700 SvREFCNT_dec(PL_utf8_X_LVT);
7701 PL_utf8_X_LVT = &PL_sv_undef;
7705 return (PL_utf8_X_LVT != &PL_sv_undef
7706 && cp >= SBASE && cp < SBASE + SCount
7707 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7712 * c-indentation-style: bsd
7714 * indent-tabs-mode: nil
7717 * ex: set ts=8 sts=4 sw=4 et: