5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #define RF_tainted 1 /* tainted information used? e.g. locale */
84 #define RF_warned 2 /* warned about big count? */
86 #define RF_utf8 8 /* Pattern contains multibyte chars? */
88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
90 #define RS_init 1 /* eval environment created */
91 #define RS_set 2 /* replsv value is set */
97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
100 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
110 #define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
114 #define HOPBACKc(pos, off) \
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 /* these are unrolled below in the CCC_TRY_XXX defined */
125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { \
128 ENTER; save_re_context(); \
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 assert(ok); LEAVE; } } STMT_END
132 /* Doesn't do an assert to verify that is correct */
133 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
134 if (!CAT2(PL_utf8_,class)) { \
135 bool throw_away __attribute__unused__; \
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
140 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
144 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
145 LOAD_UTF8_CHARCLASS(X_begin, " "); \
146 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
147 /* These are utf8 constants, and not utf-ebcdic constants, so the \
148 * assert should likely and hopefully fail on an EBCDIC machine */ \
149 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
151 /* No asserts are done for these, in case called on an early \
152 * Unicode version in which they map to nothing */ \
153 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
154 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
155 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
161 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
163 /* The actual code for CCC_TRY, which uses several variables from the routine
164 * it's callable from. It is designed to be the bulk of a case statement.
165 * FUNC is the macro or function to call on non-utf8 targets that indicate if
166 * nextchr matches the class.
167 * UTF8_TEST is the whole test string to use for utf8 targets
168 * LOAD is what to use to test, and if not present to load in the swash for the
170 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
172 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
173 * utf8 and a variant, load the swash if necessary and test using the utf8
174 * test. Advance to the next character if test is ok, otherwise fail; If not
175 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
176 * fails, or advance to the next character */
178 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
179 if (locinput >= PL_regeol) { \
182 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
183 LOAD_UTF8_CHARCLASS(CLASS, STR); \
184 if (POS_OR_NEG (UTF8_TEST)) { \
187 locinput += PL_utf8skip[nextchr]; \
188 nextchr = UCHARAT(locinput); \
191 if (POS_OR_NEG (FUNC(nextchr))) { \
194 nextchr = UCHARAT(++locinput); \
197 /* Handle the non-locale cases for a character class and its complement. It
198 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
199 * This is because that code fails when the test succeeds, so we want to have
200 * the test fail so that the code succeeds. The swash is stored in a
201 * predictable PL_ place */
202 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
205 _CCC_TRY_CODE( !, FUNC, \
206 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
207 (U8*)locinput, TRUE)), \
210 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
211 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
212 (U8*)locinput, TRUE)), \
215 /* Generate the case statements for both locale and non-locale character
216 * classes in regmatch for classes that don't have special unicode semantics.
217 * Locales don't use an immediate swash, but an intermediary special locale
218 * function that is called on the pointer to the current place in the input
219 * string. That function will resolve to needing the same swash. One might
220 * think that because we don't know what the locale will match, we shouldn't
221 * check with the swash loading function that it loaded properly; ie, that we
222 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
223 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
225 #define CCC_TRY(NAME, NNAME, FUNC, \
226 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
227 NAMEA, NNAMEA, FUNCA, \
230 PL_reg_flags |= RF_tainted; \
231 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
237 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
240 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
241 nextchr = UCHARAT(++locinput); \
244 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 locinput += PL_utf8skip[nextchr]; \
249 nextchr = UCHARAT(locinput); \
252 nextchr = UCHARAT(++locinput); \
255 /* Generate the non-locale cases */ \
256 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
258 /* This is like CCC_TRY, but has an extra set of parameters for generating case
259 * statements to handle separate Unicode semantics nodes */
260 #define CCC_TRY_U(NAME, NNAME, FUNC, \
261 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
262 NAMEU, NNAMEU, FUNCU, \
263 NAMEA, NNAMEA, FUNCA, \
265 CCC_TRY(NAME, NNAME, FUNC, \
266 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
267 NAMEA, NNAMEA, FUNCA, \
269 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
271 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
273 /* for use after a quantifier and before an EXACT-like node -- japhy */
274 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
276 * NOTE that *nothing* that affects backtracking should be in here, specifically
277 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
278 * node that is in between two EXACT like nodes when ascertaining what the required
279 * "follow" character is. This should probably be moved to regex compile time
280 * although it may be done at run time beause of the REF possibility - more
281 * investigation required. -- demerphq
283 #define JUMPABLE(rn) ( \
285 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
287 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
288 OP(rn) == PLUS || OP(rn) == MINMOD || \
290 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
292 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
294 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
298 we don't need this definition. */
299 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
300 #define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF )
301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304 /* ... so we use this as its faster. */
305 #define IS_TEXT(rn) ( OP(rn)==EXACT )
306 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
307 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
313 Search for mandatory following text node; for lookahead, the text must
314 follow but for lookbehind (rn->flags != 0) we skip to the next step.
316 #define FIND_NEXT_IMPT(rn) STMT_START { \
317 while (JUMPABLE(rn)) { \
318 const OPCODE type = OP(rn); \
319 if (type == SUSPEND || PL_regkind[type] == CURLY) \
320 rn = NEXTOPER(NEXTOPER(rn)); \
321 else if (type == PLUS) \
323 else if (type == IFMATCH) \
324 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
325 else rn += NEXT_OFF(rn); \
330 static void restore_pos(pTHX_ void *arg);
332 #define REGCP_PAREN_ELEMS 4
333 #define REGCP_OTHER_ELEMS 5
334 #define REGCP_FRAME_ELEMS 1
335 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
336 * are needed for the regexp context stack bookkeeping. */
339 S_regcppush(pTHX_ I32 parenfloor)
342 const int retval = PL_savestack_ix;
343 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
344 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
345 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
347 GET_RE_DEBUG_FLAGS_DECL;
349 if (paren_elems_to_push < 0)
350 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
352 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
353 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
354 " out of range (%lu-%ld)",
355 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
357 SSGROW(total_elems + REGCP_FRAME_ELEMS);
359 for (p = PL_regsize; p > parenfloor; p--) {
360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
361 SSPUSHINT(PL_regoffs[p].end);
362 SSPUSHINT(PL_regoffs[p].start);
363 SSPUSHPTR(PL_reg_start_tmp[p]);
365 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
366 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
367 (UV)p, (IV)PL_regoffs[p].start,
368 (IV)(PL_reg_start_tmp[p] - PL_bostr),
369 (IV)PL_regoffs[p].end
372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
373 SSPUSHPTR(PL_regoffs);
374 SSPUSHINT(PL_regsize);
375 SSPUSHINT(*PL_reglastparen);
376 SSPUSHINT(*PL_reglastcloseparen);
377 SSPUSHPTR(PL_reginput);
378 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
383 /* These are needed since we do not localize EVAL nodes: */
384 #define REGCP_SET(cp) \
386 PerlIO_printf(Perl_debug_log, \
387 " Setting an EVAL scope, savestack=%"IVdf"\n", \
388 (IV)PL_savestack_ix)); \
391 #define REGCP_UNWIND(cp) \
393 if (cp != PL_savestack_ix) \
394 PerlIO_printf(Perl_debug_log, \
395 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
396 (IV)(cp), (IV)PL_savestack_ix)); \
400 S_regcppop(pTHX_ const regexp *rex)
405 GET_RE_DEBUG_FLAGS_DECL;
407 PERL_ARGS_ASSERT_REGCPPOP;
409 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
411 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
412 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
413 input = (char *) SSPOPPTR;
414 *PL_reglastcloseparen = SSPOPINT;
415 *PL_reglastparen = SSPOPINT;
416 PL_regsize = SSPOPINT;
417 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
419 i -= REGCP_OTHER_ELEMS;
420 /* Now restore the parentheses context. */
421 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
423 U32 paren = (U32)SSPOPINT;
424 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
425 PL_regoffs[paren].start = SSPOPINT;
427 if (paren <= *PL_reglastparen)
428 PL_regoffs[paren].end = tmps;
430 PerlIO_printf(Perl_debug_log,
431 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
432 (UV)paren, (IV)PL_regoffs[paren].start,
433 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
434 (IV)PL_regoffs[paren].end,
435 (paren > *PL_reglastparen ? "(no)" : ""));
439 if (*PL_reglastparen + 1 <= rex->nparens) {
440 PerlIO_printf(Perl_debug_log,
441 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
442 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
446 /* It would seem that the similar code in regtry()
447 * already takes care of this, and in fact it is in
448 * a better location to since this code can #if 0-ed out
449 * but the code in regtry() is needed or otherwise tests
450 * requiring null fields (pat.t#187 and split.t#{13,14}
451 * (as of patchlevel 7877) will fail. Then again,
452 * this code seems to be necessary or otherwise
453 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
454 * --jhi updated by dapm */
455 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
457 PL_regoffs[i].start = -1;
458 PL_regoffs[i].end = -1;
464 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
467 * pregexec and friends
470 #ifndef PERL_IN_XSUB_RE
472 - pregexec - match a regexp against a string
475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
476 char *strbeg, I32 minend, SV *screamer, U32 nosave)
477 /* strend: pointer to null at end of string */
478 /* strbeg: real beginning of string */
479 /* minend: end of match must be >=minend after stringarg. */
480 /* nosave: For optimizations. */
482 PERL_ARGS_ASSERT_PREGEXEC;
485 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
486 nosave ? 0 : REXEC_COPY_STR);
491 * Need to implement the following flags for reg_anch:
493 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
495 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
496 * INTUIT_AUTORITATIVE_ML
497 * INTUIT_ONCE_NOML - Intuit can match in one location only.
500 * Another flag for this function: SECOND_TIME (so that float substrs
501 * with giant delta may be not rechecked).
504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
507 Otherwise, only SvCUR(sv) is used to get strbeg. */
509 /* XXXX We assume that strpos is strbeg unless sv. */
511 /* XXXX Some places assume that there is a fixed substring.
512 An update may be needed if optimizer marks as "INTUITable"
513 RExen without fixed substrings. Similarly, it is assumed that
514 lengths of all the strings are no more than minlen, thus they
515 cannot come from lookahead.
516 (Or minlen should take into account lookahead.)
517 NOTE: Some of this comment is not correct. minlen does now take account
518 of lookahead/behind. Further research is required. -- demerphq
522 /* A failure to find a constant substring means that there is no need to make
523 an expensive call to REx engine, thus we celebrate a failure. Similarly,
524 finding a substring too deep into the string means that less calls to
525 regtry() should be needed.
527 REx compiler's optimizer found 4 possible hints:
528 a) Anchored substring;
530 c) Whether we are anchored (beginning-of-line or \G);
531 d) First node (of those at offset 0) which may distinguish positions;
532 We use a)b)d) and multiline-part of c), and try to find a position in the
533 string which does not contradict any of them.
536 /* Most of decisions we do here should have been done at compile time.
537 The nodes of the REx which we used for the search should have been
538 deleted from the finite automaton. */
541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
542 char *strend, const U32 flags, re_scream_pos_data *data)
545 struct regexp *const prog = (struct regexp *)SvANY(rx);
546 register I32 start_shift = 0;
547 /* Should be nonnegative! */
548 register I32 end_shift = 0;
553 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
555 register char *other_last = NULL; /* other substr checked before this */
556 char *check_at = NULL; /* check substr found at this pos */
557 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
558 RXi_GET_DECL(prog,progi);
560 const char * const i_strpos = strpos;
562 GET_RE_DEBUG_FLAGS_DECL;
564 PERL_ARGS_ASSERT_RE_INTUIT_START;
566 RX_MATCH_UTF8_set(rx,utf8_target);
569 PL_reg_flags |= RF_utf8;
572 debug_start_match(rx, utf8_target, strpos, strend,
573 sv ? "Guessing start of match in sv for"
574 : "Guessing start of match in string for");
577 /* CHR_DIST() would be more correct here but it makes things slow. */
578 if (prog->minlen > strend - strpos) {
579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
580 "String too short... [re_intuit_start]\n"));
584 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
587 if (!prog->check_utf8 && prog->check_substr)
588 to_utf8_substr(prog);
589 check = prog->check_utf8;
591 if (!prog->check_substr && prog->check_utf8)
592 to_byte_substr(prog);
593 check = prog->check_substr;
595 if (check == &PL_sv_undef) {
596 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
597 "Non-utf8 string cannot match utf8 check string\n"));
600 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
601 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
602 || ( (prog->extflags & RXf_ANCH_BOL)
603 && !multiline ) ); /* Check after \n? */
606 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
607 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
608 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
610 && (strpos != strbeg)) {
611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
614 if (prog->check_offset_min == prog->check_offset_max &&
615 !(prog->extflags & RXf_CANY_SEEN)) {
616 /* Substring at constant offset from beg-of-str... */
619 s = HOP3c(strpos, prog->check_offset_min, strend);
622 slen = SvCUR(check); /* >= 1 */
624 if ( strend - s > slen || strend - s < slen - 1
625 || (strend - s == slen && strend[-1] != '\n')) {
626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
629 /* Now should match s[0..slen-2] */
631 if (slen && (*SvPVX_const(check) != *s
633 && memNE(SvPVX_const(check), s, slen)))) {
635 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
639 else if (*SvPVX_const(check) != *s
640 || ((slen = SvCUR(check)) > 1
641 && memNE(SvPVX_const(check), s, slen)))
644 goto success_at_start;
647 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
649 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
650 end_shift = prog->check_end_shift;
653 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
654 - (SvTAIL(check) != 0);
655 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
657 if (end_shift < eshift)
661 else { /* Can match at random position */
664 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
665 end_shift = prog->check_end_shift;
667 /* end shift should be non negative here */
670 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
672 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
673 (IV)end_shift, RX_PRECOMP(prog));
677 /* Find a possible match in the region s..strend by looking for
678 the "check" substring in the region corrected by start/end_shift. */
681 I32 srch_start_shift = start_shift;
682 I32 srch_end_shift = end_shift;
683 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
684 srch_end_shift -= ((strbeg - s) - srch_start_shift);
685 srch_start_shift = strbeg - s;
687 DEBUG_OPTIMISE_MORE_r({
688 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
689 (IV)prog->check_offset_min,
690 (IV)srch_start_shift,
692 (IV)prog->check_end_shift);
695 if (flags & REXEC_SCREAM) {
696 I32 p = -1; /* Internal iterator of scream. */
697 I32 * const pp = data ? data->scream_pos : &p;
699 if (PL_screamfirst[BmRARE(check)] >= 0
700 || ( BmRARE(check) == '\n'
701 && (BmPREVIOUS(check) == SvCUR(check) - 1)
703 s = screaminstr(sv, check,
704 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
707 /* we may be pointing at the wrong string */
708 if (s && RXp_MATCH_COPIED(prog))
709 s = strbeg + (s - SvPVX_const(sv));
711 *data->scream_olds = s;
716 if (prog->extflags & RXf_CANY_SEEN) {
717 start_point= (U8*)(s + srch_start_shift);
718 end_point= (U8*)(strend - srch_end_shift);
720 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
721 end_point= HOP3(strend, -srch_end_shift, strbeg);
723 DEBUG_OPTIMISE_MORE_r({
724 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
725 (int)(end_point - start_point),
726 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
730 s = fbm_instr( start_point, end_point,
731 check, multiline ? FBMrf_MULTILINE : 0);
734 /* Update the count-of-usability, remove useless subpatterns,
738 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
739 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
740 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
741 (s ? "Found" : "Did not find"),
742 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
743 ? "anchored" : "floating"),
746 (s ? " at offset " : "...\n") );
751 /* Finish the diagnostic message */
752 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
754 /* XXX dmq: first branch is for positive lookbehind...
755 Our check string is offset from the beginning of the pattern.
756 So we need to do any stclass tests offset forward from that
765 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
766 Start with the other substr.
767 XXXX no SCREAM optimization yet - and a very coarse implementation
768 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
769 *always* match. Probably should be marked during compile...
770 Probably it is right to do no SCREAM here...
773 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
774 : (prog->float_substr && prog->anchored_substr))
776 /* Take into account the "other" substring. */
777 /* XXXX May be hopelessly wrong for UTF... */
780 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
783 char * const last = HOP3c(s, -start_shift, strbeg);
785 char * const saved_s = s;
788 t = s - prog->check_offset_max;
789 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
791 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
796 t = HOP3c(t, prog->anchored_offset, strend);
797 if (t < other_last) /* These positions already checked */
799 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
802 /* XXXX It is not documented what units *_offsets are in.
803 We assume bytes, but this is clearly wrong.
804 Meaning this code needs to be carefully reviewed for errors.
808 /* On end-of-str: see comment below. */
809 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
810 if (must == &PL_sv_undef) {
812 DEBUG_r(must = prog->anchored_utf8); /* for debug */
817 HOP3(HOP3(last1, prog->anchored_offset, strend)
818 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
820 multiline ? FBMrf_MULTILINE : 0
823 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
824 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
825 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
826 (s ? "Found" : "Contradicts"),
827 quoted, RE_SV_TAIL(must));
832 if (last1 >= last2) {
833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
834 ", giving up...\n"));
837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
838 ", trying floating at offset %ld...\n",
839 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
840 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
841 s = HOP3c(last, 1, strend);
845 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
846 (long)(s - i_strpos)));
847 t = HOP3c(s, -prog->anchored_offset, strbeg);
848 other_last = HOP3c(s, 1, strend);
856 else { /* Take into account the floating substring. */
858 char * const saved_s = s;
861 t = HOP3c(s, -start_shift, strbeg);
863 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
864 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
865 last = HOP3c(t, prog->float_max_offset, strend);
866 s = HOP3c(t, prog->float_min_offset, strend);
869 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
870 must = utf8_target ? prog->float_utf8 : prog->float_substr;
871 /* fbm_instr() takes into account exact value of end-of-str
872 if the check is SvTAIL(ed). Since false positives are OK,
873 and end-of-str is not later than strend we are OK. */
874 if (must == &PL_sv_undef) {
876 DEBUG_r(must = prog->float_utf8); /* for debug message */
879 s = fbm_instr((unsigned char*)s,
880 (unsigned char*)last + SvCUR(must)
882 must, multiline ? FBMrf_MULTILINE : 0);
884 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
885 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
886 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
887 (s ? "Found" : "Contradicts"),
888 quoted, RE_SV_TAIL(must));
892 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
893 ", giving up...\n"));
896 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
897 ", trying anchored starting at offset %ld...\n",
898 (long)(saved_s + 1 - i_strpos)));
900 s = HOP3c(t, 1, strend);
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
905 (long)(s - i_strpos)));
906 other_last = s; /* Fix this later. --Hugo */
916 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
918 DEBUG_OPTIMISE_MORE_r(
919 PerlIO_printf(Perl_debug_log,
920 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
921 (IV)prog->check_offset_min,
922 (IV)prog->check_offset_max,
930 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
932 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
935 /* Fixed substring is found far enough so that the match
936 cannot start at strpos. */
938 if (ml_anch && t[-1] != '\n') {
939 /* Eventually fbm_*() should handle this, but often
940 anchored_offset is not 0, so this check will not be wasted. */
941 /* XXXX In the code below we prefer to look for "^" even in
942 presence of anchored substrings. And we search even
943 beyond the found float position. These pessimizations
944 are historical artefacts only. */
946 while (t < strend - prog->minlen) {
948 if (t < check_at - prog->check_offset_min) {
949 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
950 /* Since we moved from the found position,
951 we definitely contradict the found anchored
952 substr. Due to the above check we do not
953 contradict "check" substr.
954 Thus we can arrive here only if check substr
955 is float. Redo checking for "other"=="fixed".
958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
959 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
960 goto do_other_anchored;
962 /* We don't contradict the found floating substring. */
963 /* XXXX Why not check for STCLASS? */
965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
966 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
969 /* Position contradicts check-string */
970 /* XXXX probably better to look for check-string
971 than for "\n", so one should lower the limit for t? */
972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
973 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
974 other_last = strpos = s = t + 1;
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
980 PL_colors[0], PL_colors[1]));
984 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
985 PL_colors[0], PL_colors[1]));
989 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
992 /* The found string does not prohibit matching at strpos,
993 - no optimization of calling REx engine can be performed,
994 unless it was an MBOL and we are not after MBOL,
995 or a future STCLASS check will fail this. */
997 /* Even in this situation we may use MBOL flag if strpos is offset
998 wrt the start of the string. */
999 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000 && (strpos != strbeg) && strpos[-1] != '\n'
1001 /* May be due to an implicit anchor of m{.*foo} */
1002 && !(prog->intflags & PREGf_IMPLICIT))
1007 DEBUG_EXECUTE_r( if (ml_anch)
1008 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1012 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1014 prog->check_utf8 /* Could be deleted already */
1015 && --BmUSEFUL(prog->check_utf8) < 0
1016 && (prog->check_utf8 == prog->float_utf8)
1018 prog->check_substr /* Could be deleted already */
1019 && --BmUSEFUL(prog->check_substr) < 0
1020 && (prog->check_substr == prog->float_substr)
1023 /* If flags & SOMETHING - do not do it many times on the same match */
1024 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025 /* XXX Does the destruction order has to change with utf8_target? */
1026 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1029 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1030 check = NULL; /* abort */
1032 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1033 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034 if (prog->intflags & PREGf_IMPLICIT)
1035 prog->extflags &= ~RXf_ANCH_MBOL;
1036 /* XXXX This is a remnant of the old implementation. It
1037 looks wasteful, since now INTUIT can use many
1038 other heuristics. */
1039 prog->extflags &= ~RXf_USE_INTUIT;
1040 /* XXXX What other flags might need to be cleared in this branch? */
1046 /* Last resort... */
1047 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048 /* trie stclasses are too expensive to use here, we are better off to
1049 leave it to regmatch itself */
1050 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051 /* minlen == 0 is possible if regstclass is \b or \B,
1052 and the fixed substr is ''$.
1053 Since minlen is already taken into account, s+1 is before strend;
1054 accidentally, minlen >= 1 guaranties no false positives at s + 1
1055 even for \b or \B. But (minlen? 1 : 0) below assumes that
1056 regstclass does not come from lookahead... */
1057 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1059 const U8* const str = (U8*)STRING(progi->regstclass);
1060 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1064 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066 else if (prog->float_substr || prog->float_utf8)
1067 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1075 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1078 const char *what = NULL;
1080 if (endpos == strend) {
1081 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082 "Could not match STCLASS...\n") );
1085 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086 "This position contradicts STCLASS...\n") );
1087 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1089 /* Contradict one of substrings */
1090 if (prog->anchored_substr || prog->anchored_utf8) {
1091 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092 DEBUG_EXECUTE_r( what = "anchored" );
1094 s = HOP3c(t, 1, strend);
1095 if (s + start_shift + end_shift > strend) {
1096 /* XXXX Should be taken into account earlier? */
1097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098 "Could not match STCLASS...\n") );
1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104 "Looking for %s substr starting at offset %ld...\n",
1105 what, (long)(s + start_shift - i_strpos)) );
1108 /* Have both, check_string is floating */
1109 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110 goto retry_floating_check;
1111 /* Recheck anchored substring, but not floating... */
1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116 "Looking for anchored substr starting at offset %ld...\n",
1117 (long)(other_last - i_strpos)) );
1118 goto do_other_anchored;
1120 /* Another way we could have checked stclass at the
1121 current position only: */
1126 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127 "Looking for /%s^%s/m starting at offset %ld...\n",
1128 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1131 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1133 /* Check is floating substring. */
1134 retry_floating_check:
1135 t = check_at - start_shift;
1136 DEBUG_EXECUTE_r( what = "floating" );
1137 goto hop_and_restart;
1140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141 "By STCLASS: moving %ld --> %ld\n",
1142 (long)(t - i_strpos), (long)(s - i_strpos))
1146 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147 "Does not contradict STCLASS...\n");
1152 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153 PL_colors[4], (check ? "Guessed" : "Giving up"),
1154 PL_colors[5], (long)(s - i_strpos)) );
1157 fail_finish: /* Substring not found */
1158 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1159 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162 PL_colors[4], PL_colors[5]));
1166 #define DECL_TRIE_TYPE(scan) \
1167 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1168 trie_type = (scan->flags != EXACT) \
1169 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1170 : (utf8_target ? trie_utf8 : trie_plain)
1172 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1173 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1174 switch (trie_type) { \
1175 case trie_utf8_fold: \
1176 if ( foldlen>0 ) { \
1177 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1182 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1183 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1184 foldlen -= UNISKIP( uvc ); \
1185 uscan = foldbuf + UNISKIP( uvc ); \
1188 case trie_latin_utf8_fold: \
1189 if ( foldlen>0 ) { \
1190 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1196 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1197 foldlen -= UNISKIP( uvc ); \
1198 uscan = foldbuf + UNISKIP( uvc ); \
1202 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1209 charid = trie->charmap[ uvc ]; \
1213 if (widecharmap) { \
1214 SV** const svpp = hv_fetch(widecharmap, \
1215 (char*)&uvc, sizeof(UV), 0); \
1217 charid = (U16)SvIV(*svpp); \
1222 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1226 && (ln == 1 || folder(s, pat_string, ln)) \
1227 && (!reginfo || regtry(reginfo, &s)) ) \
1233 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1235 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1241 #define REXEC_FBC_SCAN(CoDe) \
1243 while (s < strend) { \
1249 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1250 REXEC_FBC_UTF8_SCAN( \
1252 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1261 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1264 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1273 #define REXEC_FBC_TRYIT \
1274 if ((!reginfo || regtry(reginfo, &s))) \
1277 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1278 if (utf8_target) { \
1279 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1282 REXEC_FBC_CLASS_SCAN(CoNd); \
1285 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1286 if (utf8_target) { \
1288 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1291 REXEC_FBC_CLASS_SCAN(CoNd); \
1294 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1295 PL_reg_flags |= RF_tainted; \
1296 if (utf8_target) { \
1297 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1300 REXEC_FBC_CLASS_SCAN(CoNd); \
1303 #define DUMP_EXEC_POS(li,s,doutf8) \
1304 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1307 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1308 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1309 tmp = TEST_NON_UTF8(tmp); \
1310 REXEC_FBC_UTF8_SCAN( \
1311 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1320 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1321 if (s == PL_bostr) { \
1325 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1326 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1329 LOAD_UTF8_CHARCLASS_ALNUM(); \
1330 REXEC_FBC_UTF8_SCAN( \
1331 if (tmp == ! (TeSt2_UtF8)) { \
1340 /* The only difference between the BOUND and NBOUND cases is that
1341 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1342 * NBOUND. This is accomplished by passing it in either the if or else clause,
1343 * with the other one being empty */
1344 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1345 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1347 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1348 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1350 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1351 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1353 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1354 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1357 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1358 * be passed in completely with the variable name being tested, which isn't
1359 * such a clean interface, but this is easier to read than it was before. We
1360 * are looking for the boundary (or non-boundary between a word and non-word
1361 * character. The utf8 and non-utf8 cases have the same logic, but the details
1362 * must be different. Find the "wordness" of the character just prior to this
1363 * one, and compare it with the wordness of this one. If they differ, we have
1364 * a boundary. At the beginning of the string, pretend that the previous
1365 * character was a new-line */
1366 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1367 if (utf8_target) { \
1370 else { /* Not utf8 */ \
1371 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1372 tmp = TEST_NON_UTF8(tmp); \
1374 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1383 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1386 /* We know what class REx starts with. Try to find this position... */
1387 /* if reginfo is NULL, its a dryrun */
1388 /* annoyingly all the vars in this routine have different names from their counterparts
1389 in regmatch. /grrr */
1392 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1393 const char *strend, regmatch_info *reginfo)
1396 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1397 char *pat_string; /* The pattern's exactish string */
1398 char *pat_end; /* ptr to end char of pat_string */
1399 re_fold_t folder; /* Function for computing non-utf8 folds */
1400 const U8 *fold_array; /* array for folding ords < 256 */
1403 register STRLEN uskip;
1407 register I32 tmp = 1; /* Scratch variable? */
1408 register const bool utf8_target = PL_reg_match_utf8;
1409 UV utf8_fold_flags = 0;
1410 RXi_GET_DECL(prog,progi);
1412 PERL_ARGS_ASSERT_FIND_BYCLASS;
1414 /* We know what class it must start with. */
1418 if (utf8_target || OP(c) == ANYOFV) {
1419 STRLEN inclasslen = strend - s;
1420 REXEC_FBC_UTF8_CLASS_SCAN(
1421 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1424 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1429 if (tmp && (!reginfo || regtry(reginfo, &s)))
1437 if (UTF_PATTERN || utf8_target) {
1438 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1439 goto do_exactf_utf8;
1441 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1442 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1443 goto do_exactf_non_utf8; /* isn't dealt with by these */
1446 if (UTF_PATTERN || utf8_target) {
1447 utf8_fold_flags = 0;
1448 goto do_exactf_utf8;
1450 fold_array = PL_fold_latin1;
1451 folder = foldEQ_latin1;
1452 /* XXX This uses the full utf8 fold because if the pattern contains
1453 * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1454 * There could be a new node type, say EXACTFU_SS, which is
1455 * generated by regcomp only if there is an 'ss', and then every
1456 * other case could goto do_exactf_non_utf8;*/
1457 goto do_exactf_utf8;
1460 if (UTF_PATTERN || utf8_target) {
1461 utf8_fold_flags = 0;
1462 goto do_exactf_utf8;
1464 fold_array = PL_fold;
1466 goto do_exactf_non_utf8;
1469 if (UTF_PATTERN || utf8_target) {
1470 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1471 goto do_exactf_utf8;
1473 fold_array = PL_fold_locale;
1474 folder = foldEQ_locale;
1478 do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1480 /* The idea in the non-utf8 EXACTF* cases is to first find the
1481 * first character of the EXACTF* node and then, if necessary,
1482 * case-insensitively compare the full text of the node. c1 is the
1483 * first character. c2 is its fold. This logic will not work for
1484 * Unicode semantics and the german sharp ss, which hence should
1485 * not be compiled into a node that gets here. */
1486 pat_string = STRING(c);
1487 ln = STR_LEN(c); /* length to match in octets/bytes */
1489 e = HOP3c(strend, -((I32)ln), s);
1491 if (!reginfo && e < s) {
1492 e = s; /* Due to minlen logic of intuit() */
1496 c2 = fold_array[c1];
1497 if (c1 == c2) { /* If char and fold are the same */
1498 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1501 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1507 /* If one of the operands is in utf8, we can't use the simpler
1508 * folding above, due to the fact that many different characters
1509 * can have the same fold, or portion of a fold, or different-
1511 pat_string = STRING(c);
1512 ln = STR_LEN(c); /* length to match in octets/bytes */
1513 pat_end = pat_string + ln;
1514 lnc = (UTF_PATTERN) /* length to match in characters */
1515 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1518 e = HOP3c(strend, -((I32)lnc), s);
1520 if (!reginfo && e < s) {
1521 e = s; /* Due to minlen logic of intuit() */
1525 char *my_strend= (char *)strend;
1526 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1527 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1528 && (!reginfo || regtry(reginfo, &s)) )
1536 PL_reg_flags |= RF_tainted;
1537 FBC_BOUND(isALNUM_LC,
1538 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1539 isALNUM_LC_utf8((U8*)s));
1542 PL_reg_flags |= RF_tainted;
1543 FBC_NBOUND(isALNUM_LC,
1544 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1545 isALNUM_LC_utf8((U8*)s));
1548 FBC_BOUND(isWORDCHAR,
1550 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1553 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1555 isWORDCHAR_A((U8*)s));
1558 FBC_NBOUND(isWORDCHAR,
1560 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1563 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1565 isWORDCHAR_A((U8*)s));
1568 FBC_BOUND(isWORDCHAR_L1,
1570 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1573 FBC_NBOUND(isWORDCHAR_L1,
1575 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1578 REXEC_FBC_CSCAN_TAINT(
1579 isALNUM_LC_utf8((U8*)s),
1584 REXEC_FBC_CSCAN_PRELOAD(
1585 LOAD_UTF8_CHARCLASS_ALNUM(),
1586 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1587 isWORDCHAR_L1((U8) *s)
1591 REXEC_FBC_CSCAN_PRELOAD(
1592 LOAD_UTF8_CHARCLASS_ALNUM(),
1593 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1598 /* Don't need to worry about utf8, as it can match only a single
1599 * byte invariant character */
1600 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1603 REXEC_FBC_CSCAN_PRELOAD(
1604 LOAD_UTF8_CHARCLASS_ALNUM(),
1605 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1606 ! isWORDCHAR_L1((U8) *s)
1610 REXEC_FBC_CSCAN_PRELOAD(
1611 LOAD_UTF8_CHARCLASS_ALNUM(),
1612 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1623 REXEC_FBC_CSCAN_TAINT(
1624 !isALNUM_LC_utf8((U8*)s),
1629 REXEC_FBC_CSCAN_PRELOAD(
1630 LOAD_UTF8_CHARCLASS_SPACE(),
1631 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1636 REXEC_FBC_CSCAN_PRELOAD(
1637 LOAD_UTF8_CHARCLASS_SPACE(),
1638 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1643 /* Don't need to worry about utf8, as it can match only a single
1644 * byte invariant character */
1645 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1648 REXEC_FBC_CSCAN_TAINT(
1649 isSPACE_LC_utf8((U8*)s),
1654 REXEC_FBC_CSCAN_PRELOAD(
1655 LOAD_UTF8_CHARCLASS_SPACE(),
1656 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1657 ! isSPACE_L1((U8) *s)
1661 REXEC_FBC_CSCAN_PRELOAD(
1662 LOAD_UTF8_CHARCLASS_SPACE(),
1663 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1674 REXEC_FBC_CSCAN_TAINT(
1675 !isSPACE_LC_utf8((U8*)s),
1680 REXEC_FBC_CSCAN_PRELOAD(
1681 LOAD_UTF8_CHARCLASS_DIGIT(),
1682 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1687 /* Don't need to worry about utf8, as it can match only a single
1688 * byte invariant character */
1689 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1692 REXEC_FBC_CSCAN_TAINT(
1693 isDIGIT_LC_utf8((U8*)s),
1698 REXEC_FBC_CSCAN_PRELOAD(
1699 LOAD_UTF8_CHARCLASS_DIGIT(),
1700 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1711 REXEC_FBC_CSCAN_TAINT(
1712 !isDIGIT_LC_utf8((U8*)s),
1719 is_LNBREAK_latin1(s)
1731 !is_VERTWS_latin1(s)
1737 is_HORIZWS_latin1(s)
1742 !is_HORIZWS_utf8(s),
1743 !is_HORIZWS_latin1(s)
1750 /* what trie are we using right now */
1752 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1754 = (reg_trie_data*)progi->data->data[ aho->trie ];
1755 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1757 const char *last_start = strend - trie->minlen;
1759 const char *real_start = s;
1761 STRLEN maxlen = trie->maxlen;
1763 U8 **points; /* map of where we were in the input string
1764 when reading a given char. For ASCII this
1765 is unnecessary overhead as the relationship
1766 is always 1:1, but for Unicode, especially
1767 case folded Unicode this is not true. */
1768 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1772 GET_RE_DEBUG_FLAGS_DECL;
1774 /* We can't just allocate points here. We need to wrap it in
1775 * an SV so it gets freed properly if there is a croak while
1776 * running the match */
1779 sv_points=newSV(maxlen * sizeof(U8 *));
1780 SvCUR_set(sv_points,
1781 maxlen * sizeof(U8 *));
1782 SvPOK_on(sv_points);
1783 sv_2mortal(sv_points);
1784 points=(U8**)SvPV_nolen(sv_points );
1785 if ( trie_type != trie_utf8_fold
1786 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1789 bitmap=(U8*)trie->bitmap;
1791 bitmap=(U8*)ANYOF_BITMAP(c);
1793 /* this is the Aho-Corasick algorithm modified a touch
1794 to include special handling for long "unknown char"
1795 sequences. The basic idea being that we use AC as long
1796 as we are dealing with a possible matching char, when
1797 we encounter an unknown char (and we have not encountered
1798 an accepting state) we scan forward until we find a legal
1800 AC matching is basically that of trie matching, except
1801 that when we encounter a failing transition, we fall back
1802 to the current states "fail state", and try the current char
1803 again, a process we repeat until we reach the root state,
1804 state 1, or a legal transition. If we fail on the root state
1805 then we can either terminate if we have reached an accepting
1806 state previously, or restart the entire process from the beginning
1810 while (s <= last_start) {
1811 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1819 U8 *uscan = (U8*)NULL;
1820 U8 *leftmost = NULL;
1822 U32 accepted_word= 0;
1826 while ( state && uc <= (U8*)strend ) {
1828 U32 word = aho->states[ state ].wordnum;
1832 DEBUG_TRIE_EXECUTE_r(
1833 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1834 dump_exec_pos( (char *)uc, c, strend, real_start,
1835 (char *)uc, utf8_target );
1836 PerlIO_printf( Perl_debug_log,
1837 " Scanning for legal start char...\n");
1841 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1845 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1851 if (uc >(U8*)last_start) break;
1855 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1856 if (!leftmost || lpos < leftmost) {
1857 DEBUG_r(accepted_word=word);
1863 points[pointpos++ % maxlen]= uc;
1864 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1865 uscan, len, uvc, charid, foldlen,
1867 DEBUG_TRIE_EXECUTE_r({
1868 dump_exec_pos( (char *)uc, c, strend, real_start,
1870 PerlIO_printf(Perl_debug_log,
1871 " Charid:%3u CP:%4"UVxf" ",
1877 word = aho->states[ state ].wordnum;
1879 base = aho->states[ state ].trans.base;
1881 DEBUG_TRIE_EXECUTE_r({
1883 dump_exec_pos( (char *)uc, c, strend, real_start,
1885 PerlIO_printf( Perl_debug_log,
1886 "%sState: %4"UVxf", word=%"UVxf,
1887 failed ? " Fail transition to " : "",
1888 (UV)state, (UV)word);
1894 ( ((offset = base + charid
1895 - 1 - trie->uniquecharcount)) >= 0)
1896 && ((U32)offset < trie->lasttrans)
1897 && trie->trans[offset].check == state
1898 && (tmp=trie->trans[offset].next))
1900 DEBUG_TRIE_EXECUTE_r(
1901 PerlIO_printf( Perl_debug_log," - legal\n"));
1906 DEBUG_TRIE_EXECUTE_r(
1907 PerlIO_printf( Perl_debug_log," - fail\n"));
1909 state = aho->fail[state];
1913 /* we must be accepting here */
1914 DEBUG_TRIE_EXECUTE_r(
1915 PerlIO_printf( Perl_debug_log," - accepting\n"));
1924 if (!state) state = 1;
1927 if ( aho->states[ state ].wordnum ) {
1928 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1929 if (!leftmost || lpos < leftmost) {
1930 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1935 s = (char*)leftmost;
1936 DEBUG_TRIE_EXECUTE_r({
1938 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1939 (UV)accepted_word, (IV)(s - real_start)
1942 if (!reginfo || regtry(reginfo, &s)) {
1948 DEBUG_TRIE_EXECUTE_r({
1949 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1952 DEBUG_TRIE_EXECUTE_r(
1953 PerlIO_printf( Perl_debug_log,"No match.\n"));
1962 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1972 - regexec_flags - match a regexp against a string
1975 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1976 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1977 /* strend: pointer to null at end of string */
1978 /* strbeg: real beginning of string */
1979 /* minend: end of match must be >=minend after stringarg. */
1980 /* data: May be used for some additional optimizations.
1981 Currently its only used, with a U32 cast, for transmitting
1982 the ganch offset when doing a /g match. This will change */
1983 /* nosave: For optimizations. */
1986 struct regexp *const prog = (struct regexp *)SvANY(rx);
1987 /*register*/ char *s;
1988 register regnode *c;
1989 /*register*/ char *startpos = stringarg;
1990 I32 minlen; /* must match at least this many chars */
1991 I32 dontbother = 0; /* how many characters not to try at end */
1992 I32 end_shift = 0; /* Same for the end. */ /* CC */
1993 I32 scream_pos = -1; /* Internal iterator of scream. */
1994 char *scream_olds = NULL;
1995 const bool utf8_target = cBOOL(DO_UTF8(sv));
1997 RXi_GET_DECL(prog,progi);
1998 regmatch_info reginfo; /* create some info to pass to regtry etc */
1999 regexp_paren_pair *swap = NULL;
2000 GET_RE_DEBUG_FLAGS_DECL;
2002 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2003 PERL_UNUSED_ARG(data);
2005 /* Be paranoid... */
2006 if (prog == NULL || startpos == NULL) {
2007 Perl_croak(aTHX_ "NULL regexp parameter");
2011 multiline = prog->extflags & RXf_PMf_MULTILINE;
2012 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2014 RX_MATCH_UTF8_set(rx, utf8_target);
2016 debug_start_match(rx, utf8_target, startpos, strend,
2020 minlen = prog->minlen;
2022 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2023 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2024 "String too short [regexec_flags]...\n"));
2029 /* Check validity of program. */
2030 if (UCHARAT(progi->program) != REG_MAGIC) {
2031 Perl_croak(aTHX_ "corrupted regexp program");
2035 PL_reg_eval_set = 0;
2039 PL_reg_flags |= RF_utf8;
2041 /* Mark beginning of line for ^ and lookbehind. */
2042 reginfo.bol = startpos; /* XXX not used ??? */
2046 /* Mark end of line for $ (and such) */
2049 /* see how far we have to get to not match where we matched before */
2050 reginfo.till = startpos+minend;
2052 /* If there is a "must appear" string, look for it. */
2055 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2057 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2058 reginfo.ganch = startpos + prog->gofs;
2059 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2060 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2061 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2063 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2064 && mg->mg_len >= 0) {
2065 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2066 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2067 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2069 if (prog->extflags & RXf_ANCH_GPOS) {
2070 if (s > reginfo.ganch)
2072 s = reginfo.ganch - prog->gofs;
2073 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2074 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2080 reginfo.ganch = strbeg + PTR2UV(data);
2081 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2082 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2084 } else { /* pos() not defined */
2085 reginfo.ganch = strbeg;
2086 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2087 "GPOS: reginfo.ganch = strbeg\n"));
2090 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2091 /* We have to be careful. If the previous successful match
2092 was from this regex we don't want a subsequent partially
2093 successful match to clobber the old results.
2094 So when we detect this possibility we add a swap buffer
2095 to the re, and switch the buffer each match. If we fail
2096 we switch it back, otherwise we leave it swapped.
2099 /* do we need a save destructor here for eval dies? */
2100 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2102 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2103 re_scream_pos_data d;
2105 d.scream_olds = &scream_olds;
2106 d.scream_pos = &scream_pos;
2107 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2110 goto phooey; /* not present */
2116 /* Simplest case: anchored match need be tried only once. */
2117 /* [unless only anchor is BOL and multiline is set] */
2118 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2119 if (s == startpos && regtry(®info, &startpos))
2121 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2122 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2127 dontbother = minlen - 1;
2128 end = HOP3c(strend, -dontbother, strbeg) - 1;
2129 /* for multiline we only have to try after newlines */
2130 if (prog->check_substr || prog->check_utf8) {
2131 /* because of the goto we can not easily reuse the macros for bifurcating the
2132 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2135 goto after_try_utf8;
2137 if (regtry(®info, &s)) {
2144 if (prog->extflags & RXf_USE_INTUIT) {
2145 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2154 } /* end search for check string in unicode */
2156 if (s == startpos) {
2157 goto after_try_latin;
2160 if (regtry(®info, &s)) {
2167 if (prog->extflags & RXf_USE_INTUIT) {
2168 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2177 } /* end search for check string in latin*/
2178 } /* end search for check string */
2179 else { /* search for newline */
2181 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2184 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2186 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2187 if (regtry(®info, &s))
2191 } /* end search for newline */
2192 } /* end anchored/multiline check string search */
2194 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2196 /* the warning about reginfo.ganch being used without initialization
2197 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2198 and we only enter this block when the same bit is set. */
2199 char *tmp_s = reginfo.ganch - prog->gofs;
2201 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2206 /* Messy cases: unanchored match. */
2207 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2208 /* we have /x+whatever/ */
2209 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2214 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2215 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2216 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2221 DEBUG_EXECUTE_r( did_match = 1 );
2222 if (regtry(®info, &s)) goto got_it;
2224 while (s < strend && *s == ch)
2232 DEBUG_EXECUTE_r( did_match = 1 );
2233 if (regtry(®info, &s)) goto got_it;
2235 while (s < strend && *s == ch)
2240 DEBUG_EXECUTE_r(if (!did_match)
2241 PerlIO_printf(Perl_debug_log,
2242 "Did not find anchored character...\n")
2245 else if (prog->anchored_substr != NULL
2246 || prog->anchored_utf8 != NULL
2247 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2248 && prog->float_max_offset < strend - s)) {
2253 char *last1; /* Last position checked before */
2257 if (prog->anchored_substr || prog->anchored_utf8) {
2258 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2259 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2260 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2261 back_max = back_min = prog->anchored_offset;
2263 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2264 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2265 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2266 back_max = prog->float_max_offset;
2267 back_min = prog->float_min_offset;
2271 if (must == &PL_sv_undef)
2272 /* could not downgrade utf8 check substring, so must fail */
2278 last = HOP3c(strend, /* Cannot start after this */
2279 -(I32)(CHR_SVLEN(must)
2280 - (SvTAIL(must) != 0) + back_min), strbeg);
2283 last1 = HOPc(s, -1);
2285 last1 = s - 1; /* bogus */
2287 /* XXXX check_substr already used to find "s", can optimize if
2288 check_substr==must. */
2290 dontbother = end_shift;
2291 strend = HOPc(strend, -dontbother);
2292 while ( (s <= last) &&
2293 ((flags & REXEC_SCREAM)
2294 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2295 end_shift, &scream_pos, 0))
2296 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2297 (unsigned char*)strend, must,
2298 multiline ? FBMrf_MULTILINE : 0))) ) {
2299 /* we may be pointing at the wrong string */
2300 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2301 s = strbeg + (s - SvPVX_const(sv));
2302 DEBUG_EXECUTE_r( did_match = 1 );
2303 if (HOPc(s, -back_max) > last1) {
2304 last1 = HOPc(s, -back_min);
2305 s = HOPc(s, -back_max);
2308 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2310 last1 = HOPc(s, -back_min);
2314 while (s <= last1) {
2315 if (regtry(®info, &s))
2321 while (s <= last1) {
2322 if (regtry(®info, &s))
2328 DEBUG_EXECUTE_r(if (!did_match) {
2329 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2330 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2331 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2332 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2333 ? "anchored" : "floating"),
2334 quoted, RE_SV_TAIL(must));
2338 else if ( (c = progi->regstclass) ) {
2340 const OPCODE op = OP(progi->regstclass);
2341 /* don't bother with what can't match */
2342 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2343 strend = HOPc(strend, -(minlen - 1));
2346 SV * const prop = sv_newmortal();
2347 regprop(prog, prop, c);
2349 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2351 PerlIO_printf(Perl_debug_log,
2352 "Matching stclass %.*s against %s (%d bytes)\n",
2353 (int)SvCUR(prop), SvPVX_const(prop),
2354 quoted, (int)(strend - s));
2357 if (find_byclass(prog, c, s, strend, ®info))
2359 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2363 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2368 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2369 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2370 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2372 if (flags & REXEC_SCREAM) {
2373 last = screaminstr(sv, float_real, s - strbeg,
2374 end_shift, &scream_pos, 1); /* last one */
2376 last = scream_olds; /* Only one occurrence. */
2377 /* we may be pointing at the wrong string */
2378 else if (RXp_MATCH_COPIED(prog))
2379 s = strbeg + (s - SvPVX_const(sv));
2383 const char * const little = SvPV_const(float_real, len);
2385 if (SvTAIL(float_real)) {
2386 if (memEQ(strend - len + 1, little, len - 1))
2387 last = strend - len + 1;
2388 else if (!multiline)
2389 last = memEQ(strend - len, little, len)
2390 ? strend - len : NULL;
2396 last = rninstr(s, strend, little, little + len);
2398 last = strend; /* matching "$" */
2403 PerlIO_printf(Perl_debug_log,
2404 "%sCan't trim the tail, match fails (should not happen)%s\n",
2405 PL_colors[4], PL_colors[5]));
2406 goto phooey; /* Should not happen! */
2408 dontbother = strend - last + prog->float_min_offset;
2410 if (minlen && (dontbother < minlen))
2411 dontbother = minlen - 1;
2412 strend -= dontbother; /* this one's always in bytes! */
2413 /* We don't know much -- general case. */
2416 if (regtry(®info, &s))
2425 if (regtry(®info, &s))
2427 } while (s++ < strend);
2436 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2438 if (PL_reg_eval_set)
2439 restore_pos(aTHX_ prog);
2440 if (RXp_PAREN_NAMES(prog))
2441 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2443 /* make sure $`, $&, $', and $digit will work later */
2444 if ( !(flags & REXEC_NOT_FIRST) ) {
2445 RX_MATCH_COPY_FREE(rx);
2446 if (flags & REXEC_COPY_STR) {
2447 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2448 #ifdef PERL_OLD_COPY_ON_WRITE
2450 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2452 PerlIO_printf(Perl_debug_log,
2453 "Copy on write: regexp capture, type %d\n",
2456 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2457 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2458 assert (SvPOKp(prog->saved_copy));
2462 RX_MATCH_COPIED_on(rx);
2463 s = savepvn(strbeg, i);
2469 prog->subbeg = strbeg;
2470 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2477 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2478 PL_colors[4], PL_colors[5]));
2479 if (PL_reg_eval_set)
2480 restore_pos(aTHX_ prog);
2482 /* we failed :-( roll it back */
2483 Safefree(prog->offs);
2492 - regtry - try match at specific point
2494 STATIC I32 /* 0 failure, 1 success */
2495 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2499 REGEXP *const rx = reginfo->prog;
2500 regexp *const prog = (struct regexp *)SvANY(rx);
2501 RXi_GET_DECL(prog,progi);
2502 GET_RE_DEBUG_FLAGS_DECL;
2504 PERL_ARGS_ASSERT_REGTRY;
2506 reginfo->cutpoint=NULL;
2508 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2511 PL_reg_eval_set = RS_init;
2512 DEBUG_EXECUTE_r(DEBUG_s(
2513 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2514 (IV)(PL_stack_sp - PL_stack_base));
2517 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2518 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2520 /* Apparently this is not needed, judging by wantarray. */
2521 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2522 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2525 /* Make $_ available to executed code. */
2526 if (reginfo->sv != DEFSV) {
2528 DEFSV_set(reginfo->sv);
2531 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2532 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2533 /* prepare for quick setting of pos */
2534 #ifdef PERL_OLD_COPY_ON_WRITE
2535 if (SvIsCOW(reginfo->sv))
2536 sv_force_normal_flags(reginfo->sv, 0);
2538 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2539 &PL_vtbl_mglob, NULL, 0);
2543 PL_reg_oldpos = mg->mg_len;
2544 SAVEDESTRUCTOR_X(restore_pos, prog);
2546 if (!PL_reg_curpm) {
2547 Newxz(PL_reg_curpm, 1, PMOP);
2550 SV* const repointer = &PL_sv_undef;
2551 /* this regexp is also owned by the new PL_reg_curpm, which
2552 will try to free it. */
2553 av_push(PL_regex_padav, repointer);
2554 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2555 PL_regex_pad = AvARRAY(PL_regex_padav);
2560 /* It seems that non-ithreads works both with and without this code.
2561 So for efficiency reasons it seems best not to have the code
2562 compiled when it is not needed. */
2563 /* This is safe against NULLs: */
2564 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2565 /* PM_reg_curpm owns a reference to this regexp. */
2566 (void)ReREFCNT_inc(rx);
2568 PM_SETRE(PL_reg_curpm, rx);
2569 PL_reg_oldcurpm = PL_curpm;
2570 PL_curpm = PL_reg_curpm;
2571 if (RXp_MATCH_COPIED(prog)) {
2572 /* Here is a serious problem: we cannot rewrite subbeg,
2573 since it may be needed if this match fails. Thus
2574 $` inside (?{}) could fail... */
2575 PL_reg_oldsaved = prog->subbeg;
2576 PL_reg_oldsavedlen = prog->sublen;
2577 #ifdef PERL_OLD_COPY_ON_WRITE
2578 PL_nrs = prog->saved_copy;
2580 RXp_MATCH_COPIED_off(prog);
2583 PL_reg_oldsaved = NULL;
2584 prog->subbeg = PL_bostr;
2585 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2587 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2588 prog->offs[0].start = *startpos - PL_bostr;
2589 PL_reginput = *startpos;
2590 PL_reglastparen = &prog->lastparen;
2591 PL_reglastcloseparen = &prog->lastcloseparen;
2592 prog->lastparen = 0;
2593 prog->lastcloseparen = 0;
2595 PL_regoffs = prog->offs;
2596 if (PL_reg_start_tmpl <= prog->nparens) {
2597 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2598 if(PL_reg_start_tmp)
2599 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2601 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2604 /* XXXX What this code is doing here?!!! There should be no need
2605 to do this again and again, PL_reglastparen should take care of
2608 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2609 * Actually, the code in regcppop() (which Ilya may be meaning by
2610 * PL_reglastparen), is not needed at all by the test suite
2611 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2612 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2613 * Meanwhile, this code *is* needed for the
2614 * above-mentioned test suite tests to succeed. The common theme
2615 * on those tests seems to be returning null fields from matches.
2616 * --jhi updated by dapm */
2618 if (prog->nparens) {
2619 regexp_paren_pair *pp = PL_regoffs;
2621 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2629 if (regmatch(reginfo, progi->program + 1)) {
2630 PL_regoffs[0].end = PL_reginput - PL_bostr;
2633 if (reginfo->cutpoint)
2634 *startpos= reginfo->cutpoint;
2635 REGCP_UNWIND(lastcp);
2640 #define sayYES goto yes
2641 #define sayNO goto no
2642 #define sayNO_SILENT goto no_silent
2644 /* we dont use STMT_START/END here because it leads to
2645 "unreachable code" warnings, which are bogus, but distracting. */
2646 #define CACHEsayNO \
2647 if (ST.cache_mask) \
2648 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2651 /* this is used to determine how far from the left messages like
2652 'failed...' are printed. It should be set such that messages
2653 are inline with the regop output that created them.
2655 #define REPORT_CODE_OFF 32
2658 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2659 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2661 #define SLAB_FIRST(s) (&(s)->states[0])
2662 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2664 /* grab a new slab and return the first slot in it */
2666 STATIC regmatch_state *
2669 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2672 regmatch_slab *s = PL_regmatch_slab->next;
2674 Newx(s, 1, regmatch_slab);
2675 s->prev = PL_regmatch_slab;
2677 PL_regmatch_slab->next = s;
2679 PL_regmatch_slab = s;
2680 return SLAB_FIRST(s);
2684 /* push a new state then goto it */
2686 #define PUSH_STATE_GOTO(state, node) \
2688 st->resume_state = state; \
2691 /* push a new state with success backtracking, then goto it */
2693 #define PUSH_YES_STATE_GOTO(state, node) \
2695 st->resume_state = state; \
2696 goto push_yes_state;
2702 regmatch() - main matching routine
2704 This is basically one big switch statement in a loop. We execute an op,
2705 set 'next' to point the next op, and continue. If we come to a point which
2706 we may need to backtrack to on failure such as (A|B|C), we push a
2707 backtrack state onto the backtrack stack. On failure, we pop the top
2708 state, and re-enter the loop at the state indicated. If there are no more
2709 states to pop, we return failure.
2711 Sometimes we also need to backtrack on success; for example /A+/, where
2712 after successfully matching one A, we need to go back and try to
2713 match another one; similarly for lookahead assertions: if the assertion
2714 completes successfully, we backtrack to the state just before the assertion
2715 and then carry on. In these cases, the pushed state is marked as
2716 'backtrack on success too'. This marking is in fact done by a chain of
2717 pointers, each pointing to the previous 'yes' state. On success, we pop to
2718 the nearest yes state, discarding any intermediate failure-only states.
2719 Sometimes a yes state is pushed just to force some cleanup code to be
2720 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2721 it to free the inner regex.
2723 Note that failure backtracking rewinds the cursor position, while
2724 success backtracking leaves it alone.
2726 A pattern is complete when the END op is executed, while a subpattern
2727 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2728 ops trigger the "pop to last yes state if any, otherwise return true"
2731 A common convention in this function is to use A and B to refer to the two
2732 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2733 the subpattern to be matched possibly multiple times, while B is the entire
2734 rest of the pattern. Variable and state names reflect this convention.
2736 The states in the main switch are the union of ops and failure/success of
2737 substates associated with with that op. For example, IFMATCH is the op
2738 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2739 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2740 successfully matched A and IFMATCH_A_fail is a state saying that we have
2741 just failed to match A. Resume states always come in pairs. The backtrack
2742 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2743 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2744 on success or failure.
2746 The struct that holds a backtracking state is actually a big union, with
2747 one variant for each major type of op. The variable st points to the
2748 top-most backtrack struct. To make the code clearer, within each
2749 block of code we #define ST to alias the relevant union.
2751 Here's a concrete example of a (vastly oversimplified) IFMATCH
2757 #define ST st->u.ifmatch
2759 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2760 ST.foo = ...; // some state we wish to save
2762 // push a yes backtrack state with a resume value of
2763 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2765 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2768 case IFMATCH_A: // we have successfully executed A; now continue with B
2770 bar = ST.foo; // do something with the preserved value
2773 case IFMATCH_A_fail: // A failed, so the assertion failed
2774 ...; // do some housekeeping, then ...
2775 sayNO; // propagate the failure
2782 For any old-timers reading this who are familiar with the old recursive
2783 approach, the code above is equivalent to:
2785 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2794 ...; // do some housekeeping, then ...
2795 sayNO; // propagate the failure
2798 The topmost backtrack state, pointed to by st, is usually free. If you
2799 want to claim it, populate any ST.foo fields in it with values you wish to
2800 save, then do one of
2802 PUSH_STATE_GOTO(resume_state, node);
2803 PUSH_YES_STATE_GOTO(resume_state, node);
2805 which sets that backtrack state's resume value to 'resume_state', pushes a
2806 new free entry to the top of the backtrack stack, then goes to 'node'.
2807 On backtracking, the free slot is popped, and the saved state becomes the
2808 new free state. An ST.foo field in this new top state can be temporarily
2809 accessed to retrieve values, but once the main loop is re-entered, it
2810 becomes available for reuse.
2812 Note that the depth of the backtrack stack constantly increases during the
2813 left-to-right execution of the pattern, rather than going up and down with
2814 the pattern nesting. For example the stack is at its maximum at Z at the
2815 end of the pattern, rather than at X in the following:
2817 /(((X)+)+)+....(Y)+....Z/
2819 The only exceptions to this are lookahead/behind assertions and the cut,
2820 (?>A), which pop all the backtrack states associated with A before
2823 Backtrack state structs are allocated in slabs of about 4K in size.
2824 PL_regmatch_state and st always point to the currently active state,
2825 and PL_regmatch_slab points to the slab currently containing
2826 PL_regmatch_state. The first time regmatch() is called, the first slab is
2827 allocated, and is never freed until interpreter destruction. When the slab
2828 is full, a new one is allocated and chained to the end. At exit from
2829 regmatch(), slabs allocated since entry are freed.
2834 #define DEBUG_STATE_pp(pp) \
2836 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2837 PerlIO_printf(Perl_debug_log, \
2838 " %*s"pp" %s%s%s%s%s\n", \
2840 PL_reg_name[st->resume_state], \
2841 ((st==yes_state||st==mark_state) ? "[" : ""), \
2842 ((st==yes_state) ? "Y" : ""), \
2843 ((st==mark_state) ? "M" : ""), \
2844 ((st==yes_state||st==mark_state) ? "]" : "") \
2849 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2854 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2855 const char *start, const char *end, const char *blurb)
2857 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2859 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2864 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2865 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2867 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2868 start, end - start, 60);
2870 PerlIO_printf(Perl_debug_log,
2871 "%s%s REx%s %s against %s\n",
2872 PL_colors[4], blurb, PL_colors[5], s0, s1);
2874 if (utf8_target||utf8_pat)
2875 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2876 utf8_pat ? "pattern" : "",
2877 utf8_pat && utf8_target ? " and " : "",
2878 utf8_target ? "string" : ""
2884 S_dump_exec_pos(pTHX_ const char *locinput,
2885 const regnode *scan,
2886 const char *loc_regeol,
2887 const char *loc_bostr,
2888 const char *loc_reg_starttry,
2889 const bool utf8_target)
2891 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2892 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2893 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2894 /* The part of the string before starttry has one color
2895 (pref0_len chars), between starttry and current
2896 position another one (pref_len - pref0_len chars),
2897 after the current position the third one.
2898 We assume that pref0_len <= pref_len, otherwise we
2899 decrease pref0_len. */
2900 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2901 ? (5 + taill) - l : locinput - loc_bostr;
2904 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2906 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2908 pref0_len = pref_len - (locinput - loc_reg_starttry);
2909 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2910 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2911 ? (5 + taill) - pref_len : loc_regeol - locinput);
2912 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2916 if (pref0_len > pref_len)
2917 pref0_len = pref_len;
2919 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2921 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2922 (locinput - pref_len),pref0_len, 60, 4, 5);
2924 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2925 (locinput - pref_len + pref0_len),
2926 pref_len - pref0_len, 60, 2, 3);
2928 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2929 locinput, loc_regeol - locinput, 10, 0, 1);
2931 const STRLEN tlen=len0+len1+len2;
2932 PerlIO_printf(Perl_debug_log,
2933 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2934 (IV)(locinput - loc_bostr),
2937 (docolor ? "" : "> <"),
2939 (int)(tlen > 19 ? 0 : 19 - tlen),
2946 /* reg_check_named_buff_matched()
2947 * Checks to see if a named buffer has matched. The data array of
2948 * buffer numbers corresponding to the buffer is expected to reside
2949 * in the regexp->data->data array in the slot stored in the ARG() of
2950 * node involved. Note that this routine doesn't actually care about the
2951 * name, that information is not preserved from compilation to execution.
2952 * Returns the index of the leftmost defined buffer with the given name
2953 * or 0 if non of the buffers matched.
2956 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2959 RXi_GET_DECL(rex,rexi);
2960 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2961 I32 *nums=(I32*)SvPVX(sv_dat);
2963 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2965 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2966 if ((I32)*PL_reglastparen >= nums[n] &&
2967 PL_regoffs[nums[n]].end != -1)
2976 /* free all slabs above current one - called during LEAVE_SCOPE */
2979 S_clear_backtrack_stack(pTHX_ void *p)
2981 regmatch_slab *s = PL_regmatch_slab->next;
2986 PL_regmatch_slab->next = NULL;
2988 regmatch_slab * const osl = s;
2995 #define SETREX(Re1,Re2) \
2996 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2999 STATIC I32 /* 0 failure, 1 success */
3000 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3002 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3006 register const bool utf8_target = PL_reg_match_utf8;
3007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3008 REGEXP *rex_sv = reginfo->prog;
3009 regexp *rex = (struct regexp *)SvANY(rex_sv);
3010 RXi_GET_DECL(rex,rexi);
3012 /* the current state. This is a cached copy of PL_regmatch_state */
3013 register regmatch_state *st;
3014 /* cache heavy used fields of st in registers */
3015 register regnode *scan;
3016 register regnode *next;
3017 register U32 n = 0; /* general value; init to avoid compiler warning */
3018 register I32 ln = 0; /* len or last; init to avoid compiler warning */
3019 register char *locinput = PL_reginput;
3020 register I32 nextchr; /* is always set to UCHARAT(locinput) */
3022 bool result = 0; /* return value of S_regmatch */
3023 int depth = 0; /* depth of backtrack stack */
3024 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3025 const U32 max_nochange_depth =
3026 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3027 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3028 regmatch_state *yes_state = NULL; /* state to pop to on success of
3030 /* mark_state piggy backs on the yes_state logic so that when we unwind
3031 the stack on success we can update the mark_state as we go */
3032 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3033 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3034 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3036 bool no_final = 0; /* prevent failure from backtracking? */
3037 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3038 char *startpoint = PL_reginput;
3039 SV *popmark = NULL; /* are we looking for a mark? */
3040 SV *sv_commit = NULL; /* last mark name seen in failure */
3041 SV *sv_yes_mark = NULL; /* last mark name we have seen
3042 during a successful match */
3043 U32 lastopen = 0; /* last open we saw */
3044 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3045 SV* const oreplsv = GvSV(PL_replgv);
3046 /* these three flags are set by various ops to signal information to
3047 * the very next op. They have a useful lifetime of exactly one loop
3048 * iteration, and are not preserved or restored by state pushes/pops
3050 bool sw = 0; /* the condition value in (?(cond)a|b) */
3051 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3052 int logical = 0; /* the following EVAL is:
3056 or the following IFMATCH/UNLESSM is:
3057 false: plain (?=foo)
3058 true: used as a condition: (?(?=foo))
3061 GET_RE_DEBUG_FLAGS_DECL;
3064 PERL_ARGS_ASSERT_REGMATCH;
3066 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3067 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3069 /* on first ever call to regmatch, allocate first slab */
3070 if (!PL_regmatch_slab) {
3071 Newx(PL_regmatch_slab, 1, regmatch_slab);
3072 PL_regmatch_slab->prev = NULL;
3073 PL_regmatch_slab->next = NULL;
3074 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3077 oldsave = PL_savestack_ix;
3078 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3079 SAVEVPTR(PL_regmatch_slab);
3080 SAVEVPTR(PL_regmatch_state);
3082 /* grab next free state slot */
3083 st = ++PL_regmatch_state;
3084 if (st > SLAB_LAST(PL_regmatch_slab))
3085 st = PL_regmatch_state = S_push_slab(aTHX);
3087 /* Note that nextchr is a byte even in UTF */
3088 nextchr = UCHARAT(locinput);
3090 while (scan != NULL) {
3093 SV * const prop = sv_newmortal();
3094 regnode *rnext=regnext(scan);
3095 DUMP_EXEC_POS( locinput, scan, utf8_target );
3096 regprop(rex, prop, scan);
3098 PerlIO_printf(Perl_debug_log,
3099 "%3"IVdf":%*s%s(%"IVdf")\n",
3100 (IV)(scan - rexi->program), depth*2, "",
3102 (PL_regkind[OP(scan)] == END || !rnext) ?
3103 0 : (IV)(rnext - rexi->program));
3106 next = scan + NEXT_OFF(scan);
3109 state_num = OP(scan);
3113 assert(PL_reglastparen == &rex->lastparen);
3114 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3115 assert(PL_regoffs == rex->offs);
3117 switch (state_num) {
3119 if (locinput == PL_bostr)
3121 /* reginfo->till = reginfo->bol; */
3126 if (locinput == PL_bostr ||
3127 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3133 if (locinput == PL_bostr)
3137 if (locinput == reginfo->ganch)
3142 /* update the startpoint */
3143 st->u.keeper.val = PL_regoffs[0].start;
3144 PL_reginput = locinput;
3145 PL_regoffs[0].start = locinput - PL_bostr;
3146 PUSH_STATE_GOTO(KEEPS_next, next);
3148 case KEEPS_next_fail:
3149 /* rollback the start point change */
3150 PL_regoffs[0].start = st->u.keeper.val;
3156 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3161 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3163 if (PL_regeol - locinput > 1)
3167 if (PL_regeol != locinput)
3171 if (!nextchr && locinput >= PL_regeol)
3174 locinput += PL_utf8skip[nextchr];
3175 if (locinput > PL_regeol)
3177 nextchr = UCHARAT(locinput);
3180 nextchr = UCHARAT(++locinput);
3183 if (!nextchr && locinput >= PL_regeol)
3185 nextchr = UCHARAT(++locinput);
3188 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3191 locinput += PL_utf8skip[nextchr];
3192 if (locinput > PL_regeol)
3194 nextchr = UCHARAT(locinput);
3197 nextchr = UCHARAT(++locinput);
3201 #define ST st->u.trie
3203 /* In this case the charclass data is available inline so
3204 we can fail fast without a lot of extra overhead.
3206 if (scan->flags == EXACT || !utf8_target) {
3207 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3209 PerlIO_printf(Perl_debug_log,
3210 "%*s %sfailed to match trie start class...%s\n",
3211 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3219 /* the basic plan of execution of the trie is:
3220 * At the beginning, run though all the states, and
3221 * find the longest-matching word. Also remember the position
3222 * of the shortest matching word. For example, this pattern:
3225 * when matched against the string "abcde", will generate
3226 * accept states for all words except 3, with the longest
3227 * matching word being 4, and the shortest being 1 (with
3228 * the position being after char 1 of the string).
3230 * Then for each matching word, in word order (i.e. 1,2,4,5),
3231 * we run the remainder of the pattern; on each try setting
3232 * the current position to the character following the word,
3233 * returning to try the next word on failure.
3235 * We avoid having to build a list of words at runtime by
3236 * using a compile-time structure, wordinfo[].prev, which
3237 * gives, for each word, the previous accepting word (if any).
3238 * In the case above it would contain the mappings 1->2, 2->0,
3239 * 3->0, 4->5, 5->1. We can use this table to generate, from
3240 * the longest word (4 above), a list of all words, by
3241 * following the list of prev pointers; this gives us the
3242 * unordered list 4,5,1,2. Then given the current word we have
3243 * just tried, we can go through the list and find the
3244 * next-biggest word to try (so if we just failed on word 2,
3245 * the next in the list is 4).
3247 * Since at runtime we don't record the matching position in
3248 * the string for each word, we have to work that out for
3249 * each word we're about to process. The wordinfo table holds
3250 * the character length of each word; given that we recorded
3251 * at the start: the position of the shortest word and its
3252 * length in chars, we just need to move the pointer the
3253 * difference between the two char lengths. Depending on
3254 * Unicode status and folding, that's cheap or expensive.
3256 * This algorithm is optimised for the case where are only a
3257 * small number of accept states, i.e. 0,1, or maybe 2.
3258 * With lots of accepts states, and having to try all of them,
3259 * it becomes quadratic on number of accept states to find all
3264 /* what type of TRIE am I? (utf8 makes this contextual) */
3265 DECL_TRIE_TYPE(scan);
3267 /* what trie are we using right now */
3268 reg_trie_data * const trie
3269 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3270 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3271 U32 state = trie->startstate;
3273 if (trie->bitmap && trie_type != trie_utf8_fold &&
3274 !TRIE_BITMAP_TEST(trie,*locinput)
3276 if (trie->states[ state ].wordnum) {
3278 PerlIO_printf(Perl_debug_log,
3279 "%*s %smatched empty string...%s\n",
3280 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3286 PerlIO_printf(Perl_debug_log,
3287 "%*s %sfailed to match trie start class...%s\n",
3288 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3295 U8 *uc = ( U8* )locinput;
3299 U8 *uscan = (U8*)NULL;
3300 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3301 U32 charcount = 0; /* how many input chars we have matched */
3302 U32 accepted = 0; /* have we seen any accepting states? */
3305 ST.jump = trie->jump;
3308 ST.longfold = FALSE; /* char longer if folded => it's harder */
3311 /* fully traverse the TRIE; note the position of the
3312 shortest accept state and the wordnum of the longest
3315 while ( state && uc <= (U8*)PL_regeol ) {
3316 U32 base = trie->states[ state ].trans.base;
3320 wordnum = trie->states[ state ].wordnum;
3322 if (wordnum) { /* it's an accept state */
3325 /* record first match position */
3327 ST.firstpos = (U8*)locinput;
3332 ST.firstchars = charcount;
3335 if (!ST.nextword || wordnum < ST.nextword)
3336 ST.nextword = wordnum;
3337 ST.topword = wordnum;
3340 DEBUG_TRIE_EXECUTE_r({
3341 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3342 PerlIO_printf( Perl_debug_log,
3343 "%*s %sState: %4"UVxf" Accepted: %c ",
3344 2+depth * 2, "", PL_colors[4],
3345 (UV)state, (accepted ? 'Y' : 'N'));
3348 /* read a char and goto next state */
3351 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3352 uscan, len, uvc, charid, foldlen,
3359 base + charid - 1 - trie->uniquecharcount)) >= 0)
3361 && ((U32)offset < trie->lasttrans)
3362 && trie->trans[offset].check == state)
3364 state = trie->trans[offset].next;
3375 DEBUG_TRIE_EXECUTE_r(
3376 PerlIO_printf( Perl_debug_log,
3377 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3378 charid, uvc, (UV)state, PL_colors[5] );
3384 /* calculate total number of accept states */
3389 w = trie->wordinfo[w].prev;
3392 ST.accepted = accepted;
3396 PerlIO_printf( Perl_debug_log,
3397 "%*s %sgot %"IVdf" possible matches%s\n",
3398 REPORT_CODE_OFF + depth * 2, "",
3399 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3401 goto trie_first_try; /* jump into the fail handler */
3405 case TRIE_next_fail: /* we failed - try next alternative */
3407 REGCP_UNWIND(ST.cp);
3408 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3409 PL_regoffs[n].end = -1;
3410 *PL_reglastparen = n;
3412 if (!--ST.accepted) {
3414 PerlIO_printf( Perl_debug_log,
3415 "%*s %sTRIE failed...%s\n",
3416 REPORT_CODE_OFF+depth*2, "",
3423 /* Find next-highest word to process. Note that this code
3424 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3425 register U16 min = 0;
3427 register U16 const nextword = ST.nextword;
3428 register reg_trie_wordinfo * const wordinfo
3429 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3430 for (word=ST.topword; word; word=wordinfo[word].prev) {
3431 if (word > nextword && (!min || word < min))
3444 ST.lastparen = *PL_reglastparen;
3448 /* find start char of end of current word */
3450 U32 chars; /* how many chars to skip */
3451 U8 *uc = ST.firstpos;
3452 reg_trie_data * const trie
3453 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3455 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3457 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3461 /* the hard option - fold each char in turn and find
3462 * its folded length (which may be different */
3463 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3471 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3479 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3484 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3498 PL_reginput = (char *)uc;
3501 scan = (ST.jump && ST.jump[ST.nextword])
3502 ? ST.me + ST.jump[ST.nextword]
3506 PerlIO_printf( Perl_debug_log,
3507 "%*s %sTRIE matched word #%d, continuing%s\n",
3508 REPORT_CODE_OFF+depth*2, "",
3515 if (ST.accepted > 1 || has_cutgroup) {
3516 PUSH_STATE_GOTO(TRIE_next, scan);
3519 /* only one choice left - just continue */
3521 AV *const trie_words
3522 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3523 SV ** const tmp = av_fetch( trie_words,
3525 SV *sv= tmp ? sv_newmortal() : NULL;
3527 PerlIO_printf( Perl_debug_log,
3528 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3529 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3531 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3532 PL_colors[0], PL_colors[1],
3533 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3535 : "not compiled under -Dr",
3539 locinput = PL_reginput;
3540 nextchr = UCHARAT(locinput);
3541 continue; /* execute rest of RE */
3546 char *s = STRING(scan);
3548 if (utf8_target != UTF_PATTERN) {
3549 /* The target and the pattern have differing utf8ness. */
3551 const char * const e = s + ln;
3554 /* The target is utf8, the pattern is not utf8. */
3559 if (NATIVE_TO_UNI(*(U8*)s) !=
3560 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3568 /* The target is not utf8, the pattern is utf8. */
3573 if (NATIVE_TO_UNI(*((U8*)l)) !=
3574 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3582 nextchr = UCHARAT(locinput);
3585 /* The target and the pattern have the same utf8ness. */
3586 /* Inline the first character, for speed. */
3587 if (UCHARAT(s) != nextchr)
3589 if (PL_regeol - locinput < ln)
3591 if (ln > 1 && memNE(s, locinput, ln))
3594 nextchr = UCHARAT(locinput);
3599 const U8 * fold_array;
3601 U32 fold_utf8_flags;
3603 PL_reg_flags |= RF_tainted;
3604 folder = foldEQ_locale;
3605 fold_array = PL_fold_locale;
3606 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3610 folder = foldEQ_latin1;
3611 fold_array = PL_fold_latin1;
3612 fold_utf8_flags = 0;
3616 folder = foldEQ_latin1;
3617 fold_array = PL_fold_latin1;
3618 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3623 fold_array = PL_fold;
3624 fold_utf8_flags = 0;
3630 if (utf8_target || UTF_PATTERN) {
3631 /* Either target or the pattern are utf8. */
3632 const char * const l = locinput;
3633 char *e = PL_regeol;
3635 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
3636 l, &e, 0, utf8_target, fold_utf8_flags))
3641 nextchr = UCHARAT(locinput);
3645 /* Neither the target nor the pattern are utf8 */
3646 if (UCHARAT(s) != nextchr &&
3647 UCHARAT(s) != fold_array[nextchr])
3651 if (PL_regeol - locinput < ln)
3653 if (ln > 1 && ! folder(s, locinput, ln))
3656 nextchr = UCHARAT(locinput);
3660 /* XXX Could improve efficiency by separating these all out using a
3661 * macro or in-line function. At that point regcomp.c would no longer
3662 * have to set the FLAGS fields of these */
3665 PL_reg_flags |= RF_tainted;
3673 /* was last char in word? */
3674 if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3675 if (locinput == PL_bostr)
3678 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3680 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3682 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3683 ln = isALNUM_uni(ln);
3684 LOAD_UTF8_CHARCLASS_ALNUM();
3685 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3688 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3689 n = isALNUM_LC_utf8((U8*)locinput);
3694 /* Here the string isn't utf8, or is utf8 and only ascii
3695 * characters are to match \w. In the latter case looking at
3696 * the byte just prior to the current one may be just the final
3697 * byte of a multi-byte character. This is ok. There are two
3699 * 1) it is a single byte character, and then the test is doing
3700 * just what it's supposed to.
3701 * 2) it is a multi-byte character, in which case the final
3702 * byte is never mistakable for ASCII, and so the test
3703 * will say it is not a word character, which is the
3704 * correct answer. */
3705 ln = (locinput != PL_bostr) ?
3706 UCHARAT(locinput - 1) : '\n';
3707 switch (FLAGS(scan)) {
3708 case REGEX_UNICODE_CHARSET:
3709 ln = isWORDCHAR_L1(ln);
3710 n = isWORDCHAR_L1(nextchr);
3712 case REGEX_LOCALE_CHARSET:
3713 ln = isALNUM_LC(ln);
3714 n = isALNUM_LC(nextchr);
3716 case REGEX_DEPENDS_CHARSET:
3718 n = isALNUM(nextchr);
3720 case REGEX_ASCII_RESTRICTED_CHARSET:
3721 ln = isWORDCHAR_A(ln);
3722 n = isWORDCHAR_A(nextchr);
3725 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3729 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3731 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3736 if (utf8_target || state_num == ANYOFV) {
3737 STRLEN inclasslen = PL_regeol - locinput;
3738 if (locinput >= PL_regeol)
3741 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3743 locinput += inclasslen;
3744 nextchr = UCHARAT(locinput);
3749 nextchr = UCHARAT(locinput);
3750 if (!nextchr && locinput >= PL_regeol)
3752 if (!REGINCLASS(rex, scan, (U8*)locinput))
3754 nextchr = UCHARAT(++locinput);
3758 /* Special char classes - The defines start on line 129 or so */
3759 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3760 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3761 ALNUMU, NALNUMU, isWORDCHAR_L1,
3762 ALNUMA, NALNUMA, isWORDCHAR_A,
3765 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3766 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3767 SPACEU, NSPACEU, isSPACE_L1,
3768 SPACEA, NSPACEA, isSPACE_A,
3771 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3772 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3773 DIGITA, NDIGITA, isDIGIT_A,
3776 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3777 a Unicode extended Grapheme Cluster */
3778 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3779 extended Grapheme Cluster is:
3782 | Prepend* Begin Extend*
3785 Begin is (Hangul-syllable | ! Control)
3786 Extend is (Grapheme_Extend | Spacing_Mark)
3787 Control is [ GCB_Control CR LF ]
3789 The discussion below shows how the code for CLUMP is derived
3790 from this regex. Note that most of these concepts are from
3791 property values of the Grapheme Cluster Boundary (GCB) property.
3792 No code point can have multiple property values for a given
3793 property. Thus a code point in Prepend can't be in Control, but
3794 it must be in !Control. This is why Control above includes
3795 GCB_Control plus CR plus LF. The latter two are used in the GCB
3796 property separately, and so can't be in GCB_Control, even though
3797 they logically are controls. Control is not the same as gc=cc,
3798 but includes format and other characters as well.
3800 The Unicode definition of Hangul-syllable is:
3802 | (L* ( ( V | LV ) V* | LVT ) T*)
3805 Each of these is a value for the GCB property, and hence must be
3806 disjoint, so the order they are tested is immaterial, so the
3807 above can safely be changed to
3810 | (L* ( LVT | ( V | LV ) V*) T*)
3812 The last two terms can be combined like this:
3814 | (( LVT | ( V | LV ) V*) T*))
3816 And refactored into this:
3817 L* (L | LVT T* | V V* T* | LV V* T*)
3819 That means that if we have seen any L's at all we can quit
3820 there, but if the next character is a LVT, a V or and LV we
3823 There is a subtlety with Prepend* which showed up in testing.
3824 Note that the Begin, and only the Begin is required in:
3825 | Prepend* Begin Extend*
3826 Also, Begin contains '! Control'. A Prepend must be a '!
3827 Control', which means it must be a Begin. What it comes down to
3828 is that if we match Prepend* and then find no suitable Begin
3829 afterwards, that if we backtrack the last Prepend, that one will
3830 be a suitable Begin.
3833 if (locinput >= PL_regeol)
3835 if (! utf8_target) {
3837 /* Match either CR LF or '.', as all the other possibilities
3839 locinput++; /* Match the . or CR */
3841 && locinput < PL_regeol
3842 && UCHARAT(locinput) == '\n') locinput++;
3846 /* Utf8: See if is ( CR LF ); already know that locinput <
3847 * PL_regeol, so locinput+1 is in bounds */
3848 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3852 /* In case have to backtrack to beginning, then match '.' */
3853 char *starting = locinput;
3855 /* In case have to backtrack the last prepend */
3856 char *previous_prepend = 0;
3858 LOAD_UTF8_CHARCLASS_GCB();
3860 /* Match (prepend)* */
3861 while (locinput < PL_regeol
3862 && swash_fetch(PL_utf8_X_prepend,
3863 (U8*)locinput, utf8_target))
3865 previous_prepend = locinput;
3866 locinput += UTF8SKIP(locinput);
3869 /* As noted above, if we matched a prepend character, but
3870 * the next thing won't match, back off the last prepend we
3871 * matched, as it is guaranteed to match the begin */
3872 if (previous_prepend
3873 && (locinput >= PL_regeol
3874 || ! swash_fetch(PL_utf8_X_begin,
3875 (U8*)locinput, utf8_target)))
3877 locinput = previous_prepend;
3880 /* Note that here we know PL_regeol > locinput, as we
3881 * tested that upon input to this switch case, and if we
3882 * moved locinput forward, we tested the result just above
3883 * and it either passed, or we backed off so that it will
3885 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3887 /* Here did not match the required 'Begin' in the
3888 * second term. So just match the very first
3889 * character, the '.' of the final term of the regex */
3890 locinput = starting + UTF8SKIP(starting);
3893 /* Here is the beginning of a character that can have
3894 * an extender. It is either a hangul syllable, or a
3896 if (swash_fetch(PL_utf8_X_non_hangul,
3897 (U8*)locinput, utf8_target))
3900 /* Here not a Hangul syllable, must be a
3901 * ('! * Control') */
3902 locinput += UTF8SKIP(locinput);
3905 /* Here is a Hangul syllable. It can be composed
3906 * of several individual characters. One
3907 * possibility is T+ */
3908 if (swash_fetch(PL_utf8_X_T,
3909 (U8*)locinput, utf8_target))
3911 while (locinput < PL_regeol
3912 && swash_fetch(PL_utf8_X_T,
3913 (U8*)locinput, utf8_target))
3915 locinput += UTF8SKIP(locinput);
3919 /* Here, not T+, but is a Hangul. That means
3920 * it is one of the others: L, LV, LVT or V,
3922 * L* (L | LVT T* | V V* T* | LV V* T*) */
3925 while (locinput < PL_regeol
3926 && swash_fetch(PL_utf8_X_L,
3927 (U8*)locinput, utf8_target))
3929 locinput += UTF8SKIP(locinput);
3932 /* Here, have exhausted L*. If the next
3933 * character is not an LV, LVT nor V, it means
3934 * we had to have at least one L, so matches L+
3935 * in the original equation, we have a complete
3936 * hangul syllable. Are done. */
3938 if (locinput < PL_regeol
3939 && swash_fetch(PL_utf8_X_LV_LVT_V,
3940 (U8*)locinput, utf8_target))
3943 /* Otherwise keep going. Must be LV, LVT
3944 * or V. See if LVT */
3945 if (swash_fetch(PL_utf8_X_LVT,
3946 (U8*)locinput, utf8_target))
3948 locinput += UTF8SKIP(locinput);
3951 /* Must be V or LV. Take it, then
3953 locinput += UTF8SKIP(locinput);
3954 while (locinput < PL_regeol
3955 && swash_fetch(PL_utf8_X_V,
3956 (U8*)locinput, utf8_target))
3958 locinput += UTF8SKIP(locinput);
3962 /* And any of LV, LVT, or V can be followed
3964 while (locinput < PL_regeol
3965 && swash_fetch(PL_utf8_X_T,
3969 locinput += UTF8SKIP(locinput);
3975 /* Match any extender */
3976 while (locinput < PL_regeol
3977 && swash_fetch(PL_utf8_X_extend,
3978 (U8*)locinput, utf8_target))
3980 locinput += UTF8SKIP(locinput);
3984 if (locinput > PL_regeol) sayNO;
3986 nextchr = UCHARAT(locinput);
3990 { /* The capture buffer cases. The ones beginning with N for the
3991 named buffers just convert to the equivalent numbered and
3992 pretend they were called as the corresponding numbered buffer
3994 /* don't initialize these in the declaration, it makes C++
3999 const U8 *fold_array;
4002 PL_reg_flags |= RF_tainted;
4003 folder = foldEQ_locale;
4004 fold_array = PL_fold_locale;
4006 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4010 folder = foldEQ_latin1;
4011 fold_array = PL_fold_latin1;
4013 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4017 folder = foldEQ_latin1;
4018 fold_array = PL_fold_latin1;
4020 utf8_fold_flags = 0;
4025 fold_array = PL_fold;
4027 utf8_fold_flags = 0;
4034 utf8_fold_flags = 0;
4037 /* For the named back references, find the corresponding buffer
4039 n = reg_check_named_buff_matched(rex,scan);
4044 goto do_nref_ref_common;
4047 PL_reg_flags |= RF_tainted;
4048 folder = foldEQ_locale;
4049 fold_array = PL_fold_locale;
4050 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4054 folder = foldEQ_latin1;
4055 fold_array = PL_fold_latin1;
4056 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4060 folder = foldEQ_latin1;
4061 fold_array = PL_fold_latin1;
4062 utf8_fold_flags = 0;
4067 fold_array = PL_fold;
4068 utf8_fold_flags = 0;
4074 utf8_fold_flags = 0;
4078 n = ARG(scan); /* which paren pair */
4081 ln = PL_regoffs[n].start;
4082 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4083 if (*PL_reglastparen < n || ln == -1)
4084 sayNO; /* Do not match unless seen CLOSEn. */
4085 if (ln == PL_regoffs[n].end)
4089 if (type != REF /* REF can do byte comparison */
4090 && (utf8_target || type == REFFU))
4091 { /* XXX handle REFFL better */
4092 char * limit = PL_regeol;
4094 /* This call case insensitively compares the entire buffer
4095 * at s, with the current input starting at locinput, but
4096 * not going off the end given by PL_regeol, and returns in
4097 * limit upon success, how much of the current input was
4099 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4100 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4105 nextchr = UCHARAT(locinput);
4109 /* Not utf8: Inline the first character, for speed. */
4110 if (UCHARAT(s) != nextchr &&
4112 UCHARAT(s) != fold_array[nextchr]))
4114 ln = PL_regoffs[n].end - ln;
4115 if (locinput + ln > PL_regeol)
4117 if (ln > 1 && (type == REF
4118 ? memNE(s, locinput, ln)
4119 : ! folder(s, locinput, ln)))
4122 nextchr = UCHARAT(locinput);
4132 #define ST st->u.eval
4137 regexp_internal *rei;
4138 regnode *startpoint;
4141 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4142 if (cur_eval && cur_eval->locinput==locinput) {
4143 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4144 Perl_croak(aTHX_ "Infinite recursion in regex");
4145 if ( ++nochange_depth > max_nochange_depth )
4147 "Pattern subroutine nesting without pos change"
4148 " exceeded limit in regex");
4155 (void)ReREFCNT_inc(rex_sv);
4156 if (OP(scan)==GOSUB) {
4157 startpoint = scan + ARG2L(scan);
4158 ST.close_paren = ARG(scan);
4160 startpoint = rei->program+1;
4163 goto eval_recurse_doit;
4165 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4166 if (cur_eval && cur_eval->locinput==locinput) {
4167 if ( ++nochange_depth > max_nochange_depth )
4168 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4173 /* execute the code in the {...} */
4175 SV ** const before = SP;
4176 OP_4tree * const oop = PL_op;
4177 COP * const ocurcop = PL_curcop;
4179 char *saved_regeol = PL_regeol;
4180 struct re_save_state saved_state;
4182 /* To not corrupt the existing regex state while executing the
4183 * eval we would normally put it on the save stack, like with
4184 * save_re_context. However, re-evals have a weird scoping so we
4185 * can't just add ENTER/LEAVE here. With that, things like
4187 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4189 * would break, as they expect the localisation to be unwound
4190 * only when the re-engine backtracks through the bit that
4193 * What we do instead is just saving the state in a local c
4196 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4199 PL_op = (OP_4tree*)rexi->data->data[n];
4200 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4201 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4202 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4203 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4206 SV *sv_mrk = get_sv("REGMARK", 1);
4207 sv_setsv(sv_mrk, sv_yes_mark);
4210 CALLRUNOPS(aTHX); /* Scalar context. */
4213 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4219 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4222 PAD_RESTORE_LOCAL(old_comppad);
4223 PL_curcop = ocurcop;
4224 PL_regeol = saved_regeol;
4227 sv_setsv(save_scalar(PL_replgv), ret);
4231 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4234 /* extract RE object from returned value; compiling if
4240 SV *const sv = SvRV(ret);
4242 if (SvTYPE(sv) == SVt_REGEXP) {
4244 } else if (SvSMAGICAL(sv)) {
4245 mg = mg_find(sv, PERL_MAGIC_qr);
4248 } else if (SvTYPE(ret) == SVt_REGEXP) {
4250 } else if (SvSMAGICAL(ret)) {
4251 if (SvGMAGICAL(ret)) {
4252 /* I don't believe that there is ever qr magic
4254 assert(!mg_find(ret, PERL_MAGIC_qr));
4255 sv_unmagic(ret, PERL_MAGIC_qr);
4258 mg = mg_find(ret, PERL_MAGIC_qr);
4259 /* testing suggests mg only ends up non-NULL for
4260 scalars who were upgraded and compiled in the
4261 else block below. In turn, this is only
4262 triggered in the "postponed utf8 string" tests
4268 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4272 rx = reg_temp_copy(NULL, rx);
4276 const I32 osize = PL_regsize;
4279 assert (SvUTF8(ret));
4280 } else if (SvUTF8(ret)) {
4281 /* Not doing UTF-8, despite what the SV says. Is
4282 this only if we're trapped in use 'bytes'? */
4283 /* Make a copy of the octet sequence, but without
4284 the flag on, as the compiler now honours the
4285 SvUTF8 flag on ret. */
4287 const char *const p = SvPV(ret, len);
4288 ret = newSVpvn_flags(p, len, SVs_TEMP);
4290 rx = CALLREGCOMP(ret, pm_flags);
4292 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4294 /* This isn't a first class regexp. Instead, it's
4295 caching a regexp onto an existing, Perl visible
4297 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4302 re = (struct regexp *)SvANY(rx);
4304 RXp_MATCH_COPIED_off(re);
4305 re->subbeg = rex->subbeg;
4306 re->sublen = rex->sublen;
4309 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4310 "Matching embedded");
4312 startpoint = rei->program + 1;
4313 ST.close_paren = 0; /* only used for GOSUB */
4314 /* borrowed from regtry */
4315 if (PL_reg_start_tmpl <= re->nparens) {
4316 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4317 if(PL_reg_start_tmp)
4318 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4320 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4323 eval_recurse_doit: /* Share code with GOSUB below this line */
4324 /* run the pattern returned from (??{...}) */
4325 ST.cp = regcppush(0); /* Save *all* the positions. */
4326 REGCP_SET(ST.lastcp);
4328 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4330 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4331 PL_reglastparen = &re->lastparen;
4332 PL_reglastcloseparen = &re->lastcloseparen;
4334 re->lastcloseparen = 0;
4336 PL_reginput = locinput;
4339 /* XXXX This is too dramatic a measure... */
4342 ST.toggle_reg_flags = PL_reg_flags;
4344 PL_reg_flags |= RF_utf8;
4346 PL_reg_flags &= ~RF_utf8;
4347 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4349 ST.prev_rex = rex_sv;
4350 ST.prev_curlyx = cur_curlyx;
4351 SETREX(rex_sv,re_sv);
4356 ST.prev_eval = cur_eval;
4358 /* now continue from first node in postoned RE */
4359 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4362 /* logical is 1, /(?(?{...})X|Y)/ */
4363 sw = cBOOL(SvTRUE(ret));
4368 case EVAL_AB: /* cleanup after a successful (??{A})B */
4369 /* note: this is called twice; first after popping B, then A */
4370 PL_reg_flags ^= ST.toggle_reg_flags;
4371 ReREFCNT_dec(rex_sv);
4372 SETREX(rex_sv,ST.prev_rex);
4373 rex = (struct regexp *)SvANY(rex_sv);
4374 rexi = RXi_GET(rex);
4376 cur_eval = ST.prev_eval;
4377 cur_curlyx = ST.prev_curlyx;
4379 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4380 PL_reglastparen = &rex->lastparen;
4381 PL_reglastcloseparen = &rex->lastcloseparen;
4382 /* also update PL_regoffs */
4383 PL_regoffs = rex->offs;
4385 /* XXXX This is too dramatic a measure... */
4387 if ( nochange_depth )
4392 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4393 /* note: this is called twice; first after popping B, then A */
4394 PL_reg_flags ^= ST.toggle_reg_flags;
4395 ReREFCNT_dec(rex_sv);
4396 SETREX(rex_sv,ST.prev_rex);
4397 rex = (struct regexp *)SvANY(rex_sv);
4398 rexi = RXi_GET(rex);
4399 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4400 PL_reglastparen = &rex->lastparen;
4401 PL_reglastcloseparen = &rex->lastcloseparen;
4403 PL_reginput = locinput;
4404 REGCP_UNWIND(ST.lastcp);
4406 cur_eval = ST.prev_eval;
4407 cur_curlyx = ST.prev_curlyx;
4408 /* XXXX This is too dramatic a measure... */
4410 if ( nochange_depth )
4416 n = ARG(scan); /* which paren pair */
4417 PL_reg_start_tmp[n] = locinput;
4423 n = ARG(scan); /* which paren pair */
4424 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4425 PL_regoffs[n].end = locinput - PL_bostr;
4426 /*if (n > PL_regsize)
4428 if (n > *PL_reglastparen)
4429 *PL_reglastparen = n;
4430 *PL_reglastcloseparen = n;
4431 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4439 cursor && OP(cursor)!=END;
4440 cursor=regnext(cursor))
4442 if ( OP(cursor)==CLOSE ){
4444 if ( n <= lastopen ) {
4446 = PL_reg_start_tmp[n] - PL_bostr;
4447 PL_regoffs[n].end = locinput - PL_bostr;
4448 /*if (n > PL_regsize)
4450 if (n > *PL_reglastparen)
4451 *PL_reglastparen = n;
4452 *PL_reglastcloseparen = n;
4453 if ( n == ARG(scan) || (cur_eval &&
4454 cur_eval->u.eval.close_paren == n))
4463 n = ARG(scan); /* which paren pair */
4464 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4467 /* reg_check_named_buff_matched returns 0 for no match */
4468 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4472 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4478 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4480 next = NEXTOPER(NEXTOPER(scan));
4482 next = scan + ARG(scan);
4483 if (OP(next) == IFTHEN) /* Fake one. */
4484 next = NEXTOPER(NEXTOPER(next));
4488 logical = scan->flags;
4491 /*******************************************************************
4493 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4494 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4495 STAR/PLUS/CURLY/CURLYN are used instead.)
4497 A*B is compiled as <CURLYX><A><WHILEM><B>
4499 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4500 state, which contains the current count, initialised to -1. It also sets
4501 cur_curlyx to point to this state, with any previous value saved in the
4504 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4505 since the pattern may possibly match zero times (i.e. it's a while {} loop
4506 rather than a do {} while loop).
4508 Each entry to WHILEM represents a successful match of A. The count in the
4509 CURLYX block is incremented, another WHILEM state is pushed, and execution
4510 passes to A or B depending on greediness and the current count.
4512 For example, if matching against the string a1a2a3b (where the aN are
4513 substrings that match /A/), then the match progresses as follows: (the
4514 pushed states are interspersed with the bits of strings matched so far):
4517 <CURLYX cnt=0><WHILEM>
4518 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4519 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4520 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4521 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4523 (Contrast this with something like CURLYM, which maintains only a single
4527 a1 <CURLYM cnt=1> a2
4528 a1 a2 <CURLYM cnt=2> a3
4529 a1 a2 a3 <CURLYM cnt=3> b
4532 Each WHILEM state block marks a point to backtrack to upon partial failure
4533 of A or B, and also contains some minor state data related to that
4534 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4535 overall state, such as the count, and pointers to the A and B ops.
4537 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4538 must always point to the *current* CURLYX block, the rules are:
4540 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4541 and set cur_curlyx to point the new block.
4543 When popping the CURLYX block after a successful or unsuccessful match,
4544 restore the previous cur_curlyx.
4546 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4547 to the outer one saved in the CURLYX block.
4549 When popping the WHILEM block after a successful or unsuccessful B match,
4550 restore the previous cur_curlyx.
4552 Here's an example for the pattern (AI* BI)*BO
4553 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4556 curlyx backtrack stack
4557 ------ ---------------
4559 CO <CO prev=NULL> <WO>
4560 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4561 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4562 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4564 At this point the pattern succeeds, and we work back down the stack to
4565 clean up, restoring as we go:
4567 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4568 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4569 CO <CO prev=NULL> <WO>
4572 *******************************************************************/
4574 #define ST st->u.curlyx
4576 case CURLYX: /* start of /A*B/ (for complex A) */
4578 /* No need to save/restore up to this paren */
4579 I32 parenfloor = scan->flags;
4581 assert(next); /* keep Coverity happy */
4582 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4585 /* XXXX Probably it is better to teach regpush to support
4586 parenfloor > PL_regsize... */
4587 if (parenfloor > (I32)*PL_reglastparen)
4588 parenfloor = *PL_reglastparen; /* Pessimization... */
4590 ST.prev_curlyx= cur_curlyx;
4592 ST.cp = PL_savestack_ix;
4594 /* these fields contain the state of the current curly.
4595 * they are accessed by subsequent WHILEMs */
4596 ST.parenfloor = parenfloor;
4601 ST.count = -1; /* this will be updated by WHILEM */
4602 ST.lastloc = NULL; /* this will be updated by WHILEM */
4604 PL_reginput = locinput;
4605 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4609 case CURLYX_end: /* just finished matching all of A*B */
4610 cur_curlyx = ST.prev_curlyx;
4614 case CURLYX_end_fail: /* just failed to match all of A*B */
4616 cur_curlyx = ST.prev_curlyx;
4622 #define ST st->u.whilem
4624 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4626 /* see the discussion above about CURLYX/WHILEM */
4628 int min = ARG1(cur_curlyx->u.curlyx.me);
4629 int max = ARG2(cur_curlyx->u.curlyx.me);
4630 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4632 assert(cur_curlyx); /* keep Coverity happy */
4633 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4634 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4635 ST.cache_offset = 0;
4638 PL_reginput = locinput;
4640 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4641 "%*s whilem: matched %ld out of %d..%d\n",
4642 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4645 /* First just match a string of min A's. */
4648 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4649 cur_curlyx->u.curlyx.lastloc = locinput;
4650 REGCP_SET(ST.lastcp);
4652 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4656 /* If degenerate A matches "", assume A done. */
4658 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4659 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4660 "%*s whilem: empty match detected, trying continuation...\n",
4661 REPORT_CODE_OFF+depth*2, "")
4663 goto do_whilem_B_max;
4666 /* super-linear cache processing */
4670 if (!PL_reg_maxiter) {
4671 /* start the countdown: Postpone detection until we
4672 * know the match is not *that* much linear. */
4673 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4674 /* possible overflow for long strings and many CURLYX's */
4675 if (PL_reg_maxiter < 0)
4676 PL_reg_maxiter = I32_MAX;
4677 PL_reg_leftiter = PL_reg_maxiter;
4680 if (PL_reg_leftiter-- == 0) {
4681 /* initialise cache */
4682 const I32 size = (PL_reg_maxiter + 7)/8;
4683 if (PL_reg_poscache) {
4684 if ((I32)PL_reg_poscache_size < size) {
4685 Renew(PL_reg_poscache, size, char);
4686 PL_reg_poscache_size = size;
4688 Zero(PL_reg_poscache, size, char);
4691 PL_reg_poscache_size = size;
4692 Newxz(PL_reg_poscache, size, char);
4694 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4695 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4696 PL_colors[4], PL_colors[5])
4700 if (PL_reg_leftiter < 0) {
4701 /* have we already failed at this position? */
4703 offset = (scan->flags & 0xf) - 1
4704 + (locinput - PL_bostr) * (scan->flags>>4);
4705 mask = 1 << (offset % 8);
4707 if (PL_reg_poscache[offset] & mask) {
4708 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4709 "%*s whilem: (cache) already tried at this position...\n",
4710 REPORT_CODE_OFF+depth*2, "")
4712 sayNO; /* cache records failure */
4714 ST.cache_offset = offset;
4715 ST.cache_mask = mask;
4719 /* Prefer B over A for minimal matching. */
4721 if (cur_curlyx->u.curlyx.minmod) {
4722 ST.save_curlyx = cur_curlyx;
4723 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4724 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4725 REGCP_SET(ST.lastcp);
4726 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4730 /* Prefer A over B for maximal matching. */
4732 if (n < max) { /* More greed allowed? */
4733 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4734 cur_curlyx->u.curlyx.lastloc = locinput;
4735 REGCP_SET(ST.lastcp);
4736 PUSH_STATE_GOTO(WHILEM_A_max, A);
4739 goto do_whilem_B_max;
4743 case WHILEM_B_min: /* just matched B in a minimal match */
4744 case WHILEM_B_max: /* just matched B in a maximal match */
4745 cur_curlyx = ST.save_curlyx;
4749 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4750 cur_curlyx = ST.save_curlyx;
4751 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4752 cur_curlyx->u.curlyx.count--;
4756 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4758 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4759 REGCP_UNWIND(ST.lastcp);
4761 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4762 cur_curlyx->u.curlyx.count--;
4766 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4767 REGCP_UNWIND(ST.lastcp);
4768 regcppop(rex); /* Restore some previous $<digit>s? */
4769 PL_reginput = locinput;
4770 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4771 "%*s whilem: failed, trying continuation...\n",
4772 REPORT_CODE_OFF+depth*2, "")
4775 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4776 && ckWARN(WARN_REGEXP)
4777 && !(PL_reg_flags & RF_warned))
4779 PL_reg_flags |= RF_warned;
4780 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4781 "Complex regular subexpression recursion",
4786 ST.save_curlyx = cur_curlyx;
4787 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4788 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4791 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4792 cur_curlyx = ST.save_curlyx;
4793 REGCP_UNWIND(ST.lastcp);
4796 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4797 /* Maximum greed exceeded */
4798 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4799 && ckWARN(WARN_REGEXP)
4800 && !(PL_reg_flags & RF_warned))
4802 PL_reg_flags |= RF_warned;
4803 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4804 "%s limit (%d) exceeded",
4805 "Complex regular subexpression recursion",
4808 cur_curlyx->u.curlyx.count--;
4812 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4813 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4815 /* Try grabbing another A and see if it helps. */
4816 PL_reginput = locinput;
4817 cur_curlyx->u.curlyx.lastloc = locinput;
4818 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4819 REGCP_SET(ST.lastcp);
4820 PUSH_STATE_GOTO(WHILEM_A_min,
4821 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4825 #define ST st->u.branch
4827 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4828 next = scan + ARG(scan);
4831 scan = NEXTOPER(scan);
4834 case BRANCH: /* /(...|A|...)/ */
4835 scan = NEXTOPER(scan); /* scan now points to inner node */
4836 ST.lastparen = *PL_reglastparen;
4837 ST.next_branch = next;
4839 PL_reginput = locinput;
4841 /* Now go into the branch */
4843 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4845 PUSH_STATE_GOTO(BRANCH_next, scan);
4849 PL_reginput = locinput;
4850 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4851 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4852 PUSH_STATE_GOTO(CUTGROUP_next,next);
4854 case CUTGROUP_next_fail:
4857 if (st->u.mark.mark_name)
4858 sv_commit = st->u.mark.mark_name;
4864 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4869 REGCP_UNWIND(ST.cp);
4870 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4871 PL_regoffs[n].end = -1;
4872 *PL_reglastparen = n;
4873 /*dmq: *PL_reglastcloseparen = n; */
4874 scan = ST.next_branch;
4875 /* no more branches? */
4876 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4878 PerlIO_printf( Perl_debug_log,
4879 "%*s %sBRANCH failed...%s\n",
4880 REPORT_CODE_OFF+depth*2, "",
4886 continue; /* execute next BRANCH[J] op */
4894 #define ST st->u.curlym
4896 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4898 /* This is an optimisation of CURLYX that enables us to push
4899 * only a single backtracking state, no matter how many matches
4900 * there are in {m,n}. It relies on the pattern being constant
4901 * length, with no parens to influence future backrefs
4905 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4907 /* if paren positive, emulate an OPEN/CLOSE around A */
4909 U32 paren = ST.me->flags;
4910 if (paren > PL_regsize)
4912 if (paren > *PL_reglastparen)
4913 *PL_reglastparen = paren;
4914 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4922 ST.c1 = CHRTEST_UNINIT;
4925 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4928 curlym_do_A: /* execute the A in /A{m,n}B/ */
4929 PL_reginput = locinput;
4930 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4933 case CURLYM_A: /* we've just matched an A */
4934 locinput = st->locinput;
4935 nextchr = UCHARAT(locinput);
4938 /* after first match, determine A's length: u.curlym.alen */
4939 if (ST.count == 1) {
4940 if (PL_reg_match_utf8) {
4942 while (s < PL_reginput) {
4948 ST.alen = PL_reginput - locinput;
4951 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4954 PerlIO_printf(Perl_debug_log,
4955 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4956 (int)(REPORT_CODE_OFF+(depth*2)), "",
4957 (IV) ST.count, (IV)ST.alen)
4960 locinput = PL_reginput;
4962 if (cur_eval && cur_eval->u.eval.close_paren &&
4963 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4967 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4968 if ( max == REG_INFTY || ST.count < max )
4969 goto curlym_do_A; /* try to match another A */
4971 goto curlym_do_B; /* try to match B */
4973 case CURLYM_A_fail: /* just failed to match an A */
4974 REGCP_UNWIND(ST.cp);
4976 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4977 || (cur_eval && cur_eval->u.eval.close_paren &&
4978 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4981 curlym_do_B: /* execute the B in /A{m,n}B/ */
4982 PL_reginput = locinput;
4983 if (ST.c1 == CHRTEST_UNINIT) {
4984 /* calculate c1 and c2 for possible match of 1st char
4985 * following curly */
4986 ST.c1 = ST.c2 = CHRTEST_VOID;
4987 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4988 regnode *text_node = ST.B;
4989 if (! HAS_TEXT(text_node))
4990 FIND_NEXT_IMPT(text_node);
4993 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4995 But the former is redundant in light of the latter.
4997 if this changes back then the macro for
4998 IS_TEXT and friends need to change.
5000 if (PL_regkind[OP(text_node)] == EXACT)
5003 ST.c1 = (U8)*STRING(text_node);
5004 switch (OP(text_node)) {
5005 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5007 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5008 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5009 default: ST.c2 = ST.c1;
5016 PerlIO_printf(Perl_debug_log,
5017 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5018 (int)(REPORT_CODE_OFF+(depth*2)),
5021 if (ST.c1 != CHRTEST_VOID
5022 && UCHARAT(PL_reginput) != ST.c1
5023 && UCHARAT(PL_reginput) != ST.c2)
5025 /* simulate B failing */
5027 PerlIO_printf(Perl_debug_log,
5028 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5029 (int)(REPORT_CODE_OFF+(depth*2)),"",
5032 state_num = CURLYM_B_fail;
5033 goto reenter_switch;
5037 /* mark current A as captured */
5038 I32 paren = ST.me->flags;
5040 PL_regoffs[paren].start
5041 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5042 PL_regoffs[paren].end = PL_reginput - PL_bostr;
5043 /*dmq: *PL_reglastcloseparen = paren; */
5046 PL_regoffs[paren].end = -1;
5047 if (cur_eval && cur_eval->u.eval.close_paren &&
5048 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5057 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5060 case CURLYM_B_fail: /* just failed to match a B */
5061 REGCP_UNWIND(ST.cp);
5063 I32 max = ARG2(ST.me);
5064 if (max != REG_INFTY && ST.count == max)
5066 goto curlym_do_A; /* try to match a further A */
5068 /* backtrack one A */
5069 if (ST.count == ARG1(ST.me) /* min */)
5072 locinput = HOPc(locinput, -ST.alen);
5073 goto curlym_do_B; /* try to match B */
5076 #define ST st->u.curly
5078 #define CURLY_SETPAREN(paren, success) \
5081 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5082 PL_regoffs[paren].end = locinput - PL_bostr; \
5083 *PL_reglastcloseparen = paren; \
5086 PL_regoffs[paren].end = -1; \
5089 case STAR: /* /A*B/ where A is width 1 */
5093 scan = NEXTOPER(scan);
5095 case PLUS: /* /A+B/ where A is width 1 */
5099 scan = NEXTOPER(scan);
5101 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5102 ST.paren = scan->flags; /* Which paren to set */
5103 if (ST.paren > PL_regsize)
5104 PL_regsize = ST.paren;
5105 if (ST.paren > *PL_reglastparen)
5106 *PL_reglastparen = ST.paren;
5107 ST.min = ARG1(scan); /* min to match */
5108 ST.max = ARG2(scan); /* max to match */
5109 if (cur_eval && cur_eval->u.eval.close_paren &&
5110 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5114 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5116 case CURLY: /* /A{m,n}B/ where A is width 1 */
5118 ST.min = ARG1(scan); /* min to match */
5119 ST.max = ARG2(scan); /* max to match */
5120 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5123 * Lookahead to avoid useless match attempts
5124 * when we know what character comes next.
5126 * Used to only do .*x and .*?x, but now it allows
5127 * for )'s, ('s and (?{ ... })'s to be in the way
5128 * of the quantifier and the EXACT-like node. -- japhy
5131 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5133 if (HAS_TEXT(next) || JUMPABLE(next)) {
5135 regnode *text_node = next;
5137 if (! HAS_TEXT(text_node))
5138 FIND_NEXT_IMPT(text_node);
5140 if (! HAS_TEXT(text_node))
5141 ST.c1 = ST.c2 = CHRTEST_VOID;
5143 if ( PL_regkind[OP(text_node)] != EXACT ) {
5144 ST.c1 = ST.c2 = CHRTEST_VOID;
5145 goto assume_ok_easy;
5148 s = (U8*)STRING(text_node);
5150 /* Currently we only get here when
5152 PL_rekind[OP(text_node)] == EXACT
5154 if this changes back then the macro for IS_TEXT and
5155 friends need to change. */
5158 switch (OP(text_node)) {
5159 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5161 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5162 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5163 default: ST.c2 = ST.c1; break;
5166 else { /* UTF_PATTERN */
5167 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5168 STRLEN ulen1, ulen2;
5169 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5170 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5172 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5173 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5175 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5177 0 : UTF8_ALLOW_ANY);
5178 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5180 0 : UTF8_ALLOW_ANY);
5182 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5184 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5189 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5196 ST.c1 = ST.c2 = CHRTEST_VOID;
5201 PL_reginput = locinput;
5204 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5207 locinput = PL_reginput;
5209 if (ST.c1 == CHRTEST_VOID)
5210 goto curly_try_B_min;
5212 ST.oldloc = locinput;
5214 /* set ST.maxpos to the furthest point along the
5215 * string that could possibly match */
5216 if (ST.max == REG_INFTY) {
5217 ST.maxpos = PL_regeol - 1;
5219 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5222 else if (utf8_target) {
5223 int m = ST.max - ST.min;
5224 for (ST.maxpos = locinput;
5225 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5226 ST.maxpos += UTF8SKIP(ST.maxpos);
5229 ST.maxpos = locinput + ST.max - ST.min;
5230 if (ST.maxpos >= PL_regeol)
5231 ST.maxpos = PL_regeol - 1;
5233 goto curly_try_B_min_known;
5237 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5238 locinput = PL_reginput;
5239 if (ST.count < ST.min)
5241 if ((ST.count > ST.min)
5242 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5244 /* A{m,n} must come at the end of the string, there's
5245 * no point in backing off ... */
5247 /* ...except that $ and \Z can match before *and* after
5248 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5249 We may back off by one in this case. */
5250 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5254 goto curly_try_B_max;
5259 case CURLY_B_min_known_fail:
5260 /* failed to find B in a non-greedy match where c1,c2 valid */
5261 if (ST.paren && ST.count)
5262 PL_regoffs[ST.paren].end = -1;
5264 PL_reginput = locinput; /* Could be reset... */
5265 REGCP_UNWIND(ST.cp);
5266 /* Couldn't or didn't -- move forward. */
5267 ST.oldloc = locinput;
5269 locinput += UTF8SKIP(locinput);
5273 curly_try_B_min_known:
5274 /* find the next place where 'B' could work, then call B */
5278 n = (ST.oldloc == locinput) ? 0 : 1;
5279 if (ST.c1 == ST.c2) {
5281 /* set n to utf8_distance(oldloc, locinput) */
5282 while (locinput <= ST.maxpos &&
5283 utf8n_to_uvchr((U8*)locinput,
5284 UTF8_MAXBYTES, &len,
5285 uniflags) != (UV)ST.c1) {
5291 /* set n to utf8_distance(oldloc, locinput) */
5292 while (locinput <= ST.maxpos) {
5294 const UV c = utf8n_to_uvchr((U8*)locinput,
5295 UTF8_MAXBYTES, &len,
5297 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5305 if (ST.c1 == ST.c2) {
5306 while (locinput <= ST.maxpos &&
5307 UCHARAT(locinput) != ST.c1)
5311 while (locinput <= ST.maxpos
5312 && UCHARAT(locinput) != ST.c1
5313 && UCHARAT(locinput) != ST.c2)
5316 n = locinput - ST.oldloc;
5318 if (locinput > ST.maxpos)
5320 /* PL_reginput == oldloc now */
5323 if (regrepeat(rex, ST.A, n, depth) < n)
5326 PL_reginput = locinput;
5327 CURLY_SETPAREN(ST.paren, ST.count);
5328 if (cur_eval && cur_eval->u.eval.close_paren &&
5329 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5332 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5337 case CURLY_B_min_fail:
5338 /* failed to find B in a non-greedy match where c1,c2 invalid */
5339 if (ST.paren && ST.count)
5340 PL_regoffs[ST.paren].end = -1;
5342 REGCP_UNWIND(ST.cp);
5343 /* failed -- move forward one */
5344 PL_reginput = locinput;
5345 if (regrepeat(rex, ST.A, 1, depth)) {
5347 locinput = PL_reginput;
5348 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5349 ST.count > 0)) /* count overflow ? */
5352 CURLY_SETPAREN(ST.paren, ST.count);
5353 if (cur_eval && cur_eval->u.eval.close_paren &&
5354 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5357 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5365 /* a successful greedy match: now try to match B */
5366 if (cur_eval && cur_eval->u.eval.close_paren &&
5367 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5372 if (ST.c1 != CHRTEST_VOID)
5373 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5374 UTF8_MAXBYTES, 0, uniflags)
5375 : (UV) UCHARAT(PL_reginput);
5376 /* If it could work, try it. */
5377 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5378 CURLY_SETPAREN(ST.paren, ST.count);
5379 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5384 case CURLY_B_max_fail:
5385 /* failed to find B in a greedy match */
5386 if (ST.paren && ST.count)
5387 PL_regoffs[ST.paren].end = -1;
5389 REGCP_UNWIND(ST.cp);
5391 if (--ST.count < ST.min)
5393 PL_reginput = locinput = HOPc(locinput, -1);
5394 goto curly_try_B_max;
5401 /* we've just finished A in /(??{A})B/; now continue with B */
5403 st->u.eval.toggle_reg_flags
5404 = cur_eval->u.eval.toggle_reg_flags;
5405 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5407 st->u.eval.prev_rex = rex_sv; /* inner */
5408 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5409 rex = (struct regexp *)SvANY(rex_sv);
5410 rexi = RXi_GET(rex);
5411 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5412 (void)ReREFCNT_inc(rex_sv);
5413 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5415 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5416 PL_reglastparen = &rex->lastparen;
5417 PL_reglastcloseparen = &rex->lastcloseparen;
5419 REGCP_SET(st->u.eval.lastcp);
5420 PL_reginput = locinput;
5422 /* Restore parens of the outer rex without popping the
5424 tmpix = PL_savestack_ix;
5425 PL_savestack_ix = cur_eval->u.eval.lastcp;
5427 PL_savestack_ix = tmpix;
5429 st->u.eval.prev_eval = cur_eval;
5430 cur_eval = cur_eval->u.eval.prev_eval;
5432 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5433 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5434 if ( nochange_depth )
5437 PUSH_YES_STATE_GOTO(EVAL_AB,
5438 st->u.eval.prev_eval->u.eval.B); /* match B */
5441 if (locinput < reginfo->till) {
5442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5443 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5445 (long)(locinput - PL_reg_starttry),
5446 (long)(reginfo->till - PL_reg_starttry),
5449 sayNO_SILENT; /* Cannot match: too short. */
5451 PL_reginput = locinput; /* put where regtry can find it */
5452 sayYES; /* Success! */
5454 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5456 PerlIO_printf(Perl_debug_log,
5457 "%*s %ssubpattern success...%s\n",
5458 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5459 PL_reginput = locinput; /* put where regtry can find it */
5460 sayYES; /* Success! */
5463 #define ST st->u.ifmatch
5465 case SUSPEND: /* (?>A) */
5467 PL_reginput = locinput;
5470 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5472 goto ifmatch_trivial_fail_test;
5474 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5476 ifmatch_trivial_fail_test:
5478 char * const s = HOPBACKc(locinput, scan->flags);
5483 sw = 1 - cBOOL(ST.wanted);
5487 next = scan + ARG(scan);
5495 PL_reginput = locinput;
5499 ST.logical = logical;
5500 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5502 /* execute body of (?...A) */
5503 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5506 case IFMATCH_A_fail: /* body of (?...A) failed */
5507 ST.wanted = !ST.wanted;
5510 case IFMATCH_A: /* body of (?...A) succeeded */
5512 sw = cBOOL(ST.wanted);
5514 else if (!ST.wanted)
5517 if (OP(ST.me) == SUSPEND)
5518 locinput = PL_reginput;
5520 locinput = PL_reginput = st->locinput;
5521 nextchr = UCHARAT(locinput);
5523 scan = ST.me + ARG(ST.me);
5526 continue; /* execute B */
5531 next = scan + ARG(scan);
5536 reginfo->cutpoint = PL_regeol;
5539 PL_reginput = locinput;
5541 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5542 PUSH_STATE_GOTO(COMMIT_next,next);
5544 case COMMIT_next_fail:
5551 #define ST st->u.mark
5553 ST.prev_mark = mark_state;
5554 ST.mark_name = sv_commit = sv_yes_mark
5555 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5557 ST.mark_loc = PL_reginput = locinput;
5558 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5560 case MARKPOINT_next:
5561 mark_state = ST.prev_mark;
5564 case MARKPOINT_next_fail:
5565 if (popmark && sv_eq(ST.mark_name,popmark))
5567 if (ST.mark_loc > startpoint)
5568 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5569 popmark = NULL; /* we found our mark */
5570 sv_commit = ST.mark_name;
5573 PerlIO_printf(Perl_debug_log,
5574 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5575 REPORT_CODE_OFF+depth*2, "",
5576 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5579 mark_state = ST.prev_mark;
5580 sv_yes_mark = mark_state ?
5581 mark_state->u.mark.mark_name : NULL;
5585 PL_reginput = locinput;
5587 /* (*SKIP) : if we fail we cut here*/
5588 ST.mark_name = NULL;
5589 ST.mark_loc = locinput;
5590 PUSH_STATE_GOTO(SKIP_next,next);
5592 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5593 otherwise do nothing. Meaning we need to scan
5595 regmatch_state *cur = mark_state;
5596 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5599 if ( sv_eq( cur->u.mark.mark_name,
5602 ST.mark_name = find;
5603 PUSH_STATE_GOTO( SKIP_next, next );
5605 cur = cur->u.mark.prev_mark;
5608 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5610 case SKIP_next_fail:
5612 /* (*CUT:NAME) - Set up to search for the name as we
5613 collapse the stack*/
5614 popmark = ST.mark_name;
5616 /* (*CUT) - No name, we cut here.*/
5617 if (ST.mark_loc > startpoint)
5618 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5619 /* but we set sv_commit to latest mark_name if there
5620 is one so they can test to see how things lead to this
5623 sv_commit=mark_state->u.mark.mark_name;
5631 if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5633 } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5636 U8 folded[UTF8_MAXBYTES_CASE+1];
5638 const char * const l = locinput;
5639 char *e = PL_regeol;
5640 to_uni_fold(n, folded, &foldlen);
5642 if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1,
5643 l, &e, 0, utf8_target)) {
5648 nextchr = UCHARAT(locinput);
5651 if ((n=is_LNBREAK(locinput,utf8_target))) {
5653 nextchr = UCHARAT(locinput);
5658 #define CASE_CLASS(nAmE) \
5660 if ((n=is_##nAmE(locinput,utf8_target))) { \
5662 nextchr = UCHARAT(locinput); \
5667 if ((n=is_##nAmE(locinput,utf8_target))) { \
5670 locinput += UTF8SKIP(locinput); \
5671 nextchr = UCHARAT(locinput); \
5676 CASE_CLASS(HORIZWS);
5680 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5681 PTR2UV(scan), OP(scan));
5682 Perl_croak(aTHX_ "regexp memory corruption");
5686 /* switch break jumps here */
5687 scan = next; /* prepare to execute the next op and ... */
5688 continue; /* ... jump back to the top, reusing st */
5692 /* push a state that backtracks on success */
5693 st->u.yes.prev_yes_state = yes_state;
5697 /* push a new regex state, then continue at scan */
5699 regmatch_state *newst;
5702 regmatch_state *cur = st;
5703 regmatch_state *curyes = yes_state;
5705 regmatch_slab *slab = PL_regmatch_slab;
5706 for (;curd > -1;cur--,curd--) {
5707 if (cur < SLAB_FIRST(slab)) {
5709 cur = SLAB_LAST(slab);
5711 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5712 REPORT_CODE_OFF + 2 + depth * 2,"",
5713 curd, PL_reg_name[cur->resume_state],
5714 (curyes == cur) ? "yes" : ""
5717 curyes = cur->u.yes.prev_yes_state;
5720 DEBUG_STATE_pp("push")
5723 st->locinput = locinput;
5725 if (newst > SLAB_LAST(PL_regmatch_slab))
5726 newst = S_push_slab(aTHX);
5727 PL_regmatch_state = newst;
5729 locinput = PL_reginput;
5730 nextchr = UCHARAT(locinput);
5738 * We get here only if there's trouble -- normally "case END" is
5739 * the terminating point.
5741 Perl_croak(aTHX_ "corrupted regexp pointers");
5747 /* we have successfully completed a subexpression, but we must now
5748 * pop to the state marked by yes_state and continue from there */
5749 assert(st != yes_state);
5751 while (st != yes_state) {
5753 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5754 PL_regmatch_slab = PL_regmatch_slab->prev;
5755 st = SLAB_LAST(PL_regmatch_slab);
5759 DEBUG_STATE_pp("pop (no final)");
5761 DEBUG_STATE_pp("pop (yes)");
5767 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5768 || yes_state > SLAB_LAST(PL_regmatch_slab))
5770 /* not in this slab, pop slab */
5771 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5772 PL_regmatch_slab = PL_regmatch_slab->prev;
5773 st = SLAB_LAST(PL_regmatch_slab);
5775 depth -= (st - yes_state);
5778 yes_state = st->u.yes.prev_yes_state;
5779 PL_regmatch_state = st;
5782 locinput= st->locinput;
5783 nextchr = UCHARAT(locinput);
5785 state_num = st->resume_state + no_final;
5786 goto reenter_switch;
5789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5790 PL_colors[4], PL_colors[5]));
5792 if (PL_reg_eval_set) {
5793 /* each successfully executed (?{...}) block does the equivalent of
5794 * local $^R = do {...}
5795 * When popping the save stack, all these locals would be undone;
5796 * bypass this by setting the outermost saved $^R to the latest
5798 if (oreplsv != GvSV(PL_replgv))
5799 sv_setsv(oreplsv, GvSV(PL_replgv));
5806 PerlIO_printf(Perl_debug_log,
5807 "%*s %sfailed...%s\n",
5808 REPORT_CODE_OFF+depth*2, "",
5809 PL_colors[4], PL_colors[5])
5821 /* there's a previous state to backtrack to */
5823 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5824 PL_regmatch_slab = PL_regmatch_slab->prev;
5825 st = SLAB_LAST(PL_regmatch_slab);
5827 PL_regmatch_state = st;
5828 locinput= st->locinput;
5829 nextchr = UCHARAT(locinput);
5831 DEBUG_STATE_pp("pop");
5833 if (yes_state == st)
5834 yes_state = st->u.yes.prev_yes_state;
5836 state_num = st->resume_state + 1; /* failure = success + 1 */
5837 goto reenter_switch;
5842 if (rex->intflags & PREGf_VERBARG_SEEN) {
5843 SV *sv_err = get_sv("REGERROR", 1);
5844 SV *sv_mrk = get_sv("REGMARK", 1);
5846 sv_commit = &PL_sv_no;
5848 sv_yes_mark = &PL_sv_yes;
5851 sv_commit = &PL_sv_yes;
5852 sv_yes_mark = &PL_sv_no;
5854 sv_setsv(sv_err, sv_commit);
5855 sv_setsv(sv_mrk, sv_yes_mark);
5858 /* clean up; in particular, free all slabs above current one */
5859 LEAVE_SCOPE(oldsave);
5865 - regrepeat - repeatedly match something simple, report how many
5868 * [This routine now assumes that it will only match on things of length 1.
5869 * That was true before, but now we assume scan - reginput is the count,
5870 * rather than incrementing count on every character. [Er, except utf8.]]
5873 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5876 register char *scan;
5878 register char *loceol = PL_regeol;
5879 register I32 hardcount = 0;
5880 register bool utf8_target = PL_reg_match_utf8;
5883 PERL_UNUSED_ARG(depth);
5886 PERL_ARGS_ASSERT_REGREPEAT;
5889 if (max == REG_INFTY)
5891 else if (max < loceol - scan)
5892 loceol = scan + max;
5897 while (scan < loceol && hardcount < max && *scan != '\n') {
5898 scan += UTF8SKIP(scan);
5902 while (scan < loceol && *scan != '\n')
5909 while (scan < loceol && hardcount < max) {
5910 scan += UTF8SKIP(scan);
5921 /* To get here, EXACTish nodes must have *byte* length == 1. That
5922 * means they match only characters in the string that can be expressed
5923 * as a single byte. For non-utf8 strings, that means a simple match.
5924 * For utf8 strings, the character matched must be an invariant, or
5925 * downgradable to a single byte. The pattern's utf8ness is
5926 * irrelevant, as since it's a single byte, it either isn't utf8, or if
5927 * it is, it's an invariant */
5930 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5932 if (! utf8_target || UNI_IS_INVARIANT(c)) {
5933 while (scan < loceol && UCHARAT(scan) == c) {
5939 /* Here, the string is utf8, and the pattern char is different
5940 * in utf8 than not, so can't compare them directly. Outside the
5941 * loop, find find the two utf8 bytes that represent c, and then
5942 * look for those in sequence in the utf8 string */
5943 U8 high = UTF8_TWO_BYTE_HI(c);
5944 U8 low = UTF8_TWO_BYTE_LO(c);
5947 while (hardcount < max
5948 && scan + 1 < loceol
5949 && UCHARAT(scan) == high
5950 && UCHARAT(scan + 1) == low)
5958 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5962 PL_reg_flags |= RF_tainted;
5963 utf8_flags = FOLDEQ_UTF8_LOCALE;
5970 /* The comments for the EXACT case above apply as well to these fold
5975 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5977 if (utf8_target) { /* Use full Unicode fold matching */
5978 char *tmpeol = loceol;
5979 while (hardcount < max
5980 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5981 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5988 /* XXX Note that the above handles properly the German sharp s in
5989 * the pattern matching ss in the string. But it doesn't handle
5990 * properly cases where the string contains say 'LIGATURE ff' and
5991 * the pattern is 'f+'. This would require, say, a new function or
5992 * revised interface to foldEQ_utf8(), in which the maximum number
5993 * of characters to match could be passed and it would return how
5994 * many actually did. This is just one of many cases where
5995 * multi-char folds don't work properly, and so the fix is being
6001 /* Here, the string isn't utf8 and c is a single byte; and either
6002 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6003 * doesn't affect c. Can just do simple comparisons for exact or
6006 case EXACTF: folded = PL_fold[c]; break;
6008 case EXACTFU: folded = PL_fold_latin1[c]; break;
6009 case EXACTFL: folded = PL_fold_locale[c]; break;
6010 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6012 while (scan < loceol &&
6013 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6021 if (utf8_target || OP(p) == ANYOFV) {
6024 inclasslen = loceol - scan;
6025 while (hardcount < max
6026 && ((inclasslen = loceol - scan) > 0)
6027 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6033 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6041 LOAD_UTF8_CHARCLASS_ALNUM();
6042 while (hardcount < max && scan < loceol &&
6043 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6045 scan += UTF8SKIP(scan);
6049 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6057 while (scan < loceol && isALNUM((U8) *scan)) {
6062 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6067 PL_reg_flags |= RF_tainted;
6070 while (hardcount < max && scan < loceol &&
6071 isALNUM_LC_utf8((U8*)scan)) {
6072 scan += UTF8SKIP(scan);
6076 while (scan < loceol && isALNUM_LC(*scan))
6086 LOAD_UTF8_CHARCLASS_ALNUM();
6087 while (hardcount < max && scan < loceol &&
6088 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6090 scan += UTF8SKIP(scan);
6094 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6101 goto utf8_Nwordchar;
6102 while (scan < loceol && ! isALNUM((U8) *scan)) {
6108 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6109 scan += UTF8SKIP(scan);
6113 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6119 PL_reg_flags |= RF_tainted;
6122 while (hardcount < max && scan < loceol &&
6123 !isALNUM_LC_utf8((U8*)scan)) {
6124 scan += UTF8SKIP(scan);
6128 while (scan < loceol && !isALNUM_LC(*scan))
6138 LOAD_UTF8_CHARCLASS_SPACE();
6139 while (hardcount < max && scan < loceol &&
6141 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6143 scan += UTF8SKIP(scan);
6149 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6158 while (scan < loceol && isSPACE((U8) *scan)) {
6163 while (scan < loceol && isSPACE_A((U8) *scan)) {
6168 PL_reg_flags |= RF_tainted;
6171 while (hardcount < max && scan < loceol &&
6172 isSPACE_LC_utf8((U8*)scan)) {
6173 scan += UTF8SKIP(scan);
6177 while (scan < loceol && isSPACE_LC(*scan))
6187 LOAD_UTF8_CHARCLASS_SPACE();
6188 while (hardcount < max && scan < loceol &&
6190 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6192 scan += UTF8SKIP(scan);
6198 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6207 while (scan < loceol && ! isSPACE((U8) *scan)) {
6213 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6214 scan += UTF8SKIP(scan);
6218 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6224 PL_reg_flags |= RF_tainted;
6227 while (hardcount < max && scan < loceol &&
6228 !isSPACE_LC_utf8((U8*)scan)) {
6229 scan += UTF8SKIP(scan);
6233 while (scan < loceol && !isSPACE_LC(*scan))
6240 LOAD_UTF8_CHARCLASS_DIGIT();
6241 while (hardcount < max && scan < loceol &&
6242 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6243 scan += UTF8SKIP(scan);
6247 while (scan < loceol && isDIGIT(*scan))
6252 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6257 PL_reg_flags |= RF_tainted;
6260 while (hardcount < max && scan < loceol &&
6261 isDIGIT_LC_utf8((U8*)scan)) {
6262 scan += UTF8SKIP(scan);
6266 while (scan < loceol && isDIGIT_LC(*scan))
6273 LOAD_UTF8_CHARCLASS_DIGIT();
6274 while (hardcount < max && scan < loceol &&
6275 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6276 scan += UTF8SKIP(scan);
6280 while (scan < loceol && !isDIGIT(*scan))
6286 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6287 scan += UTF8SKIP(scan);
6291 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6297 PL_reg_flags |= RF_tainted;
6300 while (hardcount < max && scan < loceol &&
6301 !isDIGIT_LC_utf8((U8*)scan)) {
6302 scan += UTF8SKIP(scan);
6306 while (scan < loceol && !isDIGIT_LC(*scan))
6313 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6319 LNBREAK can match two latin chars, which is ok,
6320 because we have a null terminated string, but we
6321 have to use hardcount in this situation
6323 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6332 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6337 while (scan < loceol && is_HORIZWS_latin1(scan))
6344 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6345 scan += UTF8SKIP(scan);
6349 while (scan < loceol && !is_HORIZWS_latin1(scan))
6357 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6362 while (scan < loceol && is_VERTWS_latin1(scan))
6370 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6371 scan += UTF8SKIP(scan);
6375 while (scan < loceol && !is_VERTWS_latin1(scan))
6381 default: /* Called on something of 0 width. */
6382 break; /* So match right here or not at all. */
6388 c = scan - PL_reginput;
6392 GET_RE_DEBUG_FLAGS_DECL;
6394 SV * const prop = sv_newmortal();
6395 regprop(prog, prop, p);
6396 PerlIO_printf(Perl_debug_log,
6397 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
6398 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6406 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6408 - regclass_swash - prepare the utf8 swash
6412 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6418 RXi_GET_DECL(prog,progi);
6419 const struct reg_data * const data = prog ? progi->data : NULL;
6421 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6423 assert(ANYOF_NONBITMAP(node));
6425 if (data && data->count) {
6426 const U32 n = ARG(node);
6428 if (data->what[n] == 's') {
6429 SV * const rv = MUTABLE_SV(data->data[n]);
6430 AV * const av = MUTABLE_AV(SvRV(rv));
6431 SV **const ary = AvARRAY(av);
6434 /* See the end of regcomp.c:S_regclass() for
6435 * documentation of these array elements. */
6438 a = SvROK(ary[1]) ? &ary[1] : NULL;
6439 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6443 else if (si && doinit) {
6444 sw = swash_init("utf8", "", si, 1, 0);
6445 (void)av_store(av, 1, sw);
6462 - reginclass - determine if a character falls into a character class
6464 n is the ANYOF regnode
6465 p is the target string
6466 lenp is pointer to the maximum number of bytes of how far to go in p
6467 (This is assumed wthout checking to always be at least the current
6469 utf8_target tells whether p is in UTF-8.
6471 Returns true if matched; false otherwise. If lenp is not NULL, on return
6472 from a successful match, the value it points to will be updated to how many
6473 bytes in p were matched. If there was no match, the value is undefined,
6474 possibly changed from the input.
6476 Note that this can be a synthetic start class, a combination of various
6477 nodes, so things you think might be mutually exclusive, such as locale,
6478 aren't. It can match both locale and non-locale
6483 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6486 const char flags = ANYOF_FLAGS(n);
6492 PERL_ARGS_ASSERT_REGINCLASS;
6494 /* If c is not already the code point, get it */
6495 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6496 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6497 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6498 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6499 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6500 * UTF8_ALLOW_FFFF */
6501 if (c_len == (STRLEN)-1)
6502 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6508 /* Use passed in max length, or one character if none passed in or less
6509 * than one character. And assume will match just one character. This is
6510 * overwritten later if matched more. */
6512 maxlen = (*lenp > c_len) ? *lenp : c_len;
6520 /* If this character is potentially in the bitmap, check it */
6522 if (ANYOF_BITMAP_TEST(n, c))
6524 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6531 else if (flags & ANYOF_LOCALE) {
6532 PL_reg_flags |= RF_tainted;
6534 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6535 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6539 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6540 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6541 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6542 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6543 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6544 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6545 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6546 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6547 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6548 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6549 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6550 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6551 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6552 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6553 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6554 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6555 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6556 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6557 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6558 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6559 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6560 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6561 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6562 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6563 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6564 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6565 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6566 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6567 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6568 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6569 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6570 ) /* How's that for a conditional? */
6577 /* If the bitmap didn't (or couldn't) match, and something outside the
6578 * bitmap could match, try that. Locale nodes specifiy completely the
6579 * behavior of code points in the bit map (otherwise, a utf8 target would
6580 * cause them to be treated as Unicode and not locale), except in
6581 * the very unlikely event when this node is a synthetic start class, which
6582 * could be a combination of locale and non-locale nodes. So allow locale
6583 * to match for the synthetic start class, which will give a false
6584 * positive that will be resolved when the match is done again as not part
6585 * of the synthetic start class */
6587 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6588 match = TRUE; /* Everything above 255 matches */
6590 else if (ANYOF_NONBITMAP(n)
6591 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6594 || (! (flags & ANYOF_LOCALE))
6595 || (flags & ANYOF_IS_SYNTHETIC)))))
6598 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6606 /* Not utf8. Convert as much of the string as available up
6607 * to the limit of how far the (single) character in the
6608 * pattern can possibly match (no need to go further). If
6609 * the node is a straight ANYOF or not folding, it can't
6610 * match more than one. Otherwise, It can match up to how
6611 * far a single char can fold to. Since not utf8, each
6612 * character is a single byte, so the max it can be in
6613 * bytes is the same as the max it can be in characters */
6614 STRLEN len = (OP(n) == ANYOF
6615 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6617 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6619 : UTF8_MAX_FOLD_CHAR_EXPAND;
6620 utf8_p = bytes_to_utf8(p, &len);
6623 if (swash_fetch(sw, utf8_p, TRUE))
6625 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6627 /* Here, we need to test if the fold of the target string
6628 * matches. The non-multi char folds have all been moved to
6629 * the compilation phase, and the multi-char folds have
6630 * been stored by regcomp into 'av'; we linearly check to
6631 * see if any match the target string (folded). We know
6632 * that the originals were each one character, but we don't
6633 * currently know how many characters/bytes each folded to,
6634 * except we do know that there are small limits imposed by
6635 * Unicode. XXX A performance enhancement would be to have
6636 * regcomp.c store the max number of chars/bytes that are
6637 * in an av entry, as, say the 0th element. Even better
6638 * would be to have a hash of the few characters that can
6639 * start a multi-char fold to the max number of chars of
6642 * If there is a match, we will need to advance (if lenp is
6643 * specified) the match pointer in the target string. But
6644 * what we are comparing here isn't that string directly,
6645 * but its fold, whose length may differ from the original.
6646 * As we go along in constructing the fold, therefore, we
6647 * create a map so that we know how many bytes in the
6648 * source to advance given that we have matched a certain
6649 * number of bytes in the fold. This map is stored in
6650 * 'map_fold_len_back'. Let n mean the number of bytes in
6651 * the fold of the first character that we are folding.
6652 * Then map_fold_len_back[n] is set to the number of bytes
6653 * in that first character. Similarly let m be the
6654 * corresponding number for the second character to be
6655 * folded. Then map_fold_len_back[n+m] is set to the
6656 * number of bytes occupied by the first two source
6657 * characters. ... */
6658 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6659 U8 folded[UTF8_MAXBYTES_CASE+1];
6660 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6661 STRLEN total_foldlen = 0; /* num bytes in fold of all
6664 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6666 /* Here, only need to fold the first char of the target
6667 * string. It the source wasn't utf8, is 1 byte long */
6668 to_utf8_fold(utf8_p, folded, &foldlen);
6669 total_foldlen = foldlen;
6670 map_fold_len_back[foldlen] = (utf8_target)
6676 /* Here, need to fold more than the first char. Do so
6677 * up to the limits */
6678 U8* source_ptr = utf8_p; /* The source for the fold
6681 U8* folded_ptr = folded;
6682 U8* e = utf8_p + maxlen; /* Can't go beyond last
6683 available byte in the
6687 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6691 /* Fold the next character */
6692 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6693 STRLEN this_char_foldlen;
6694 to_utf8_fold(source_ptr,
6696 &this_char_foldlen);
6698 /* Bail if it would exceed the byte limit for
6699 * folding a single char. */
6700 if (this_char_foldlen + folded_ptr - folded >
6706 /* Add the fold of this character */
6707 Copy(this_char_folded,
6711 source_ptr += UTF8SKIP(source_ptr);
6712 folded_ptr += this_char_foldlen;
6713 total_foldlen = folded_ptr - folded;
6715 /* Create map from the number of bytes in the fold
6716 * back to the number of bytes in the source. If
6717 * the source isn't utf8, the byte count is just
6718 * the number of characters so far */
6719 map_fold_len_back[total_foldlen]
6721 ? source_ptr - utf8_p
6728 /* Do the linear search to see if the fold is in the list
6729 * of multi-char folds. */
6732 for (i = 0; i <= av_len(av); i++) {
6733 SV* const sv = *av_fetch(av, i, FALSE);
6735 const char * const s = SvPV_const(sv, len);
6737 if (len <= total_foldlen
6738 && memEQ(s, (char*)folded, len)
6740 /* If 0, means matched a partial char. See
6742 && map_fold_len_back[len])
6745 /* Advance the target string ptr to account for
6746 * this fold, but have to translate from the
6747 * folded length to the corresponding source
6750 *lenp = map_fold_len_back[len];
6759 /* If we allocated a string above, free it */
6760 if (! utf8_target) Safefree(utf8_p);
6765 return (flags & ANYOF_INVERT) ? !match : match;
6769 S_reghop3(U8 *s, I32 off, const U8* lim)
6773 PERL_ARGS_ASSERT_REGHOP3;
6776 while (off-- && s < lim) {
6777 /* XXX could check well-formedness here */
6782 while (off++ && s > lim) {
6784 if (UTF8_IS_CONTINUED(*s)) {
6785 while (s > lim && UTF8_IS_CONTINUATION(*s))
6788 /* XXX could check well-formedness here */
6795 /* there are a bunch of places where we use two reghop3's that should
6796 be replaced with this routine. but since thats not done yet
6797 we ifdef it out - dmq
6800 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6804 PERL_ARGS_ASSERT_REGHOP4;
6807 while (off-- && s < rlim) {
6808 /* XXX could check well-formedness here */
6813 while (off++ && s > llim) {
6815 if (UTF8_IS_CONTINUED(*s)) {
6816 while (s > llim && UTF8_IS_CONTINUATION(*s))
6819 /* XXX could check well-formedness here */
6827 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6831 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6834 while (off-- && s < lim) {
6835 /* XXX could check well-formedness here */
6842 while (off++ && s > lim) {
6844 if (UTF8_IS_CONTINUED(*s)) {
6845 while (s > lim && UTF8_IS_CONTINUATION(*s))
6848 /* XXX could check well-formedness here */
6857 restore_pos(pTHX_ void *arg)
6860 regexp * const rex = (regexp *)arg;
6861 if (PL_reg_eval_set) {
6862 if (PL_reg_oldsaved) {
6863 rex->subbeg = PL_reg_oldsaved;
6864 rex->sublen = PL_reg_oldsavedlen;
6865 #ifdef PERL_OLD_COPY_ON_WRITE
6866 rex->saved_copy = PL_nrs;
6868 RXp_MATCH_COPIED_on(rex);
6870 PL_reg_magic->mg_len = PL_reg_oldpos;
6871 PL_reg_eval_set = 0;
6872 PL_curpm = PL_reg_oldcurpm;
6877 S_to_utf8_substr(pTHX_ register regexp *prog)
6881 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6884 if (prog->substrs->data[i].substr
6885 && !prog->substrs->data[i].utf8_substr) {
6886 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6887 prog->substrs->data[i].utf8_substr = sv;
6888 sv_utf8_upgrade(sv);
6889 if (SvVALID(prog->substrs->data[i].substr)) {
6890 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6891 if (flags & FBMcf_TAIL) {
6892 /* Trim the trailing \n that fbm_compile added last
6894 SvCUR_set(sv, SvCUR(sv) - 1);
6895 /* Whilst this makes the SV technically "invalid" (as its
6896 buffer is no longer followed by "\0") when fbm_compile()
6897 adds the "\n" back, a "\0" is restored. */
6899 fbm_compile(sv, flags);
6901 if (prog->substrs->data[i].substr == prog->check_substr)
6902 prog->check_utf8 = sv;
6908 S_to_byte_substr(pTHX_ register regexp *prog)
6913 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6916 if (prog->substrs->data[i].utf8_substr
6917 && !prog->substrs->data[i].substr) {
6918 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6919 if (sv_utf8_downgrade(sv, TRUE)) {
6920 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6922 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6923 if (flags & FBMcf_TAIL) {
6924 /* Trim the trailing \n that fbm_compile added last
6926 SvCUR_set(sv, SvCUR(sv) - 1);
6928 fbm_compile(sv, flags);
6934 prog->substrs->data[i].substr = sv;
6935 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6936 prog->check_substr = sv;
6943 * c-indentation-style: bsd
6945 * indent-tabs-mode: t
6948 * ex: set ts=8 sts=4 sw=4 noet: