5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #define RF_tainted 1 /* tainted information used? e.g. locale */
84 #define RF_warned 2 /* warned about big count? */
86 #define RF_utf8 8 /* Pattern contains multibyte chars? */
88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
90 #define RS_init 1 /* eval environment created */
91 #define RS_set 2 /* replsv value is set */
97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
100 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
110 #define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
114 #define HOPBACKc(pos, off) \
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 /* these are unrolled below in the CCC_TRY_XXX defined */
126 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
127 * skip the check on EBCDIC platforms */
128 # define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
130 # define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
131 if (!CAT2(PL_utf8_,class)) { \
133 ENTER; save_re_context(); \
134 ok=CAT2(is_utf8_,class)((const U8*)str); \
135 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
138 /* Doesn't do an assert to verify that is correct */
139 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
140 if (!CAT2(PL_utf8_,class)) { \
141 bool throw_away PERL_UNUSED_DECL; \
142 ENTER; save_re_context(); \
143 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
146 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
147 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
148 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
150 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
151 LOAD_UTF8_CHARCLASS(X_begin, " "); \
152 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
153 /* These are utf8 constants, and not utf-ebcdic constants, so the \
154 * assert should likely and hopefully fail on an EBCDIC machine */ \
155 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
157 /* No asserts are done for these, in case called on an early \
158 * Unicode version in which they map to nothing */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
163 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
164 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
165 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
167 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
169 /* The actual code for CCC_TRY, which uses several variables from the routine
170 * it's callable from. It is designed to be the bulk of a case statement.
171 * FUNC is the macro or function to call on non-utf8 targets that indicate if
172 * nextchr matches the class.
173 * UTF8_TEST is the whole test string to use for utf8 targets
174 * LOAD is what to use to test, and if not present to load in the swash for the
176 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
178 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
179 * utf8 and a variant, load the swash if necessary and test using the utf8
180 * test. Advance to the next character if test is ok, otherwise fail; If not
181 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
182 * fails, or advance to the next character */
184 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
185 if (locinput >= PL_regeol) { \
188 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
189 LOAD_UTF8_CHARCLASS(CLASS, STR); \
190 if (POS_OR_NEG (UTF8_TEST)) { \
193 locinput += PL_utf8skip[nextchr]; \
194 nextchr = UCHARAT(locinput); \
197 if (POS_OR_NEG (FUNC(nextchr))) { \
200 nextchr = UCHARAT(++locinput); \
203 /* Handle the non-locale cases for a character class and its complement. It
204 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
205 * This is because that code fails when the test succeeds, so we want to have
206 * the test fail so that the code succeeds. The swash is stored in a
207 * predictable PL_ place */
208 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
211 _CCC_TRY_CODE( !, FUNC, \
212 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
213 (U8*)locinput, TRUE)), \
216 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
217 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
218 (U8*)locinput, TRUE)), \
221 /* Generate the case statements for both locale and non-locale character
222 * classes in regmatch for classes that don't have special unicode semantics.
223 * Locales don't use an immediate swash, but an intermediary special locale
224 * function that is called on the pointer to the current place in the input
225 * string. That function will resolve to needing the same swash. One might
226 * think that because we don't know what the locale will match, we shouldn't
227 * check with the swash loading function that it loaded properly; ie, that we
228 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
229 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
231 #define CCC_TRY(NAME, NNAME, FUNC, \
232 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
233 NAMEA, NNAMEA, FUNCA, \
236 PL_reg_flags |= RF_tainted; \
237 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
243 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
246 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
247 nextchr = UCHARAT(++locinput); \
250 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
254 locinput += PL_utf8skip[nextchr]; \
255 nextchr = UCHARAT(locinput); \
258 nextchr = UCHARAT(++locinput); \
261 /* Generate the non-locale cases */ \
262 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
264 /* This is like CCC_TRY, but has an extra set of parameters for generating case
265 * statements to handle separate Unicode semantics nodes */
266 #define CCC_TRY_U(NAME, NNAME, FUNC, \
267 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
268 NAMEU, NNAMEU, FUNCU, \
269 NAMEA, NNAMEA, FUNCA, \
271 CCC_TRY(NAME, NNAME, FUNC, \
272 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
273 NAMEA, NNAMEA, FUNCA, \
275 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
277 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
279 /* for use after a quantifier and before an EXACT-like node -- japhy */
280 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
282 * NOTE that *nothing* that affects backtracking should be in here, specifically
283 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
284 * node that is in between two EXACT like nodes when ascertaining what the required
285 * "follow" character is. This should probably be moved to regex compile time
286 * although it may be done at run time beause of the REF possibility - more
287 * investigation required. -- demerphq
289 #define JUMPABLE(rn) ( \
291 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
293 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
294 OP(rn) == PLUS || OP(rn) == MINMOD || \
296 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
298 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
300 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
303 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
304 we don't need this definition. */
305 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
306 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
307 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
310 /* ... so we use this as its faster. */
311 #define IS_TEXT(rn) ( OP(rn)==EXACT )
312 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
313 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
314 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
319 Search for mandatory following text node; for lookahead, the text must
320 follow but for lookbehind (rn->flags != 0) we skip to the next step.
322 #define FIND_NEXT_IMPT(rn) STMT_START { \
323 while (JUMPABLE(rn)) { \
324 const OPCODE type = OP(rn); \
325 if (type == SUSPEND || PL_regkind[type] == CURLY) \
326 rn = NEXTOPER(NEXTOPER(rn)); \
327 else if (type == PLUS) \
329 else if (type == IFMATCH) \
330 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
331 else rn += NEXT_OFF(rn); \
336 static void restore_pos(pTHX_ void *arg);
338 #define REGCP_PAREN_ELEMS 4
339 #define REGCP_OTHER_ELEMS 5
340 #define REGCP_FRAME_ELEMS 1
341 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
342 * are needed for the regexp context stack bookkeeping. */
345 S_regcppush(pTHX_ I32 parenfloor)
348 const int retval = PL_savestack_ix;
349 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
350 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
351 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
353 GET_RE_DEBUG_FLAGS_DECL;
355 if (paren_elems_to_push < 0)
356 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
357 paren_elems_to_push);
359 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
360 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
361 " out of range (%lu-%ld)",
362 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
364 SSGROW(total_elems + REGCP_FRAME_ELEMS);
366 for (p = PL_regsize; p > parenfloor; p--) {
367 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
368 SSPUSHINT(PL_regoffs[p].end);
369 SSPUSHINT(PL_regoffs[p].start);
370 SSPUSHPTR(PL_reg_start_tmp[p]);
372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
373 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
374 (UV)p, (IV)PL_regoffs[p].start,
375 (IV)(PL_reg_start_tmp[p] - PL_bostr),
376 (IV)PL_regoffs[p].end
379 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
380 SSPUSHPTR(PL_regoffs);
381 SSPUSHINT(PL_regsize);
382 SSPUSHINT(*PL_reglastparen);
383 SSPUSHINT(*PL_reglastcloseparen);
384 SSPUSHPTR(PL_reginput);
385 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
390 /* These are needed since we do not localize EVAL nodes: */
391 #define REGCP_SET(cp) \
393 PerlIO_printf(Perl_debug_log, \
394 " Setting an EVAL scope, savestack=%"IVdf"\n", \
395 (IV)PL_savestack_ix)); \
398 #define REGCP_UNWIND(cp) \
400 if (cp != PL_savestack_ix) \
401 PerlIO_printf(Perl_debug_log, \
402 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
403 (IV)(cp), (IV)PL_savestack_ix)); \
407 S_regcppop(pTHX_ const regexp *rex)
412 GET_RE_DEBUG_FLAGS_DECL;
414 PERL_ARGS_ASSERT_REGCPPOP;
416 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
418 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
419 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
420 input = (char *) SSPOPPTR;
421 *PL_reglastcloseparen = SSPOPINT;
422 *PL_reglastparen = SSPOPINT;
423 PL_regsize = SSPOPINT;
424 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
426 i -= REGCP_OTHER_ELEMS;
427 /* Now restore the parentheses context. */
428 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
430 U32 paren = (U32)SSPOPINT;
431 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
432 PL_regoffs[paren].start = SSPOPINT;
434 if (paren <= *PL_reglastparen)
435 PL_regoffs[paren].end = tmps;
437 PerlIO_printf(Perl_debug_log,
438 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
439 (UV)paren, (IV)PL_regoffs[paren].start,
440 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
441 (IV)PL_regoffs[paren].end,
442 (paren > *PL_reglastparen ? "(no)" : ""));
446 if (*PL_reglastparen + 1 <= rex->nparens) {
447 PerlIO_printf(Perl_debug_log,
448 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
449 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
453 /* It would seem that the similar code in regtry()
454 * already takes care of this, and in fact it is in
455 * a better location to since this code can #if 0-ed out
456 * but the code in regtry() is needed or otherwise tests
457 * requiring null fields (pat.t#187 and split.t#{13,14}
458 * (as of patchlevel 7877) will fail. Then again,
459 * this code seems to be necessary or otherwise
460 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
461 * --jhi updated by dapm */
462 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
464 PL_regoffs[i].start = -1;
465 PL_regoffs[i].end = -1;
471 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
474 * pregexec and friends
477 #ifndef PERL_IN_XSUB_RE
479 - pregexec - match a regexp against a string
482 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
483 char *strbeg, I32 minend, SV *screamer, U32 nosave)
484 /* strend: pointer to null at end of string */
485 /* strbeg: real beginning of string */
486 /* minend: end of match must be >=minend after stringarg. */
487 /* nosave: For optimizations. */
489 PERL_ARGS_ASSERT_PREGEXEC;
492 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
493 nosave ? 0 : REXEC_COPY_STR);
498 * Need to implement the following flags for reg_anch:
500 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
502 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
503 * INTUIT_AUTORITATIVE_ML
504 * INTUIT_ONCE_NOML - Intuit can match in one location only.
507 * Another flag for this function: SECOND_TIME (so that float substrs
508 * with giant delta may be not rechecked).
511 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
513 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
514 Otherwise, only SvCUR(sv) is used to get strbeg. */
516 /* XXXX We assume that strpos is strbeg unless sv. */
518 /* XXXX Some places assume that there is a fixed substring.
519 An update may be needed if optimizer marks as "INTUITable"
520 RExen without fixed substrings. Similarly, it is assumed that
521 lengths of all the strings are no more than minlen, thus they
522 cannot come from lookahead.
523 (Or minlen should take into account lookahead.)
524 NOTE: Some of this comment is not correct. minlen does now take account
525 of lookahead/behind. Further research is required. -- demerphq
529 /* A failure to find a constant substring means that there is no need to make
530 an expensive call to REx engine, thus we celebrate a failure. Similarly,
531 finding a substring too deep into the string means that less calls to
532 regtry() should be needed.
534 REx compiler's optimizer found 4 possible hints:
535 a) Anchored substring;
537 c) Whether we are anchored (beginning-of-line or \G);
538 d) First node (of those at offset 0) which may distinguish positions;
539 We use a)b)d) and multiline-part of c), and try to find a position in the
540 string which does not contradict any of them.
543 /* Most of decisions we do here should have been done at compile time.
544 The nodes of the REx which we used for the search should have been
545 deleted from the finite automaton. */
548 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
549 char *strend, const U32 flags, re_scream_pos_data *data)
552 struct regexp *const prog = (struct regexp *)SvANY(rx);
553 register I32 start_shift = 0;
554 /* Should be nonnegative! */
555 register I32 end_shift = 0;
560 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
562 register char *other_last = NULL; /* other substr checked before this */
563 char *check_at = NULL; /* check substr found at this pos */
564 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
565 RXi_GET_DECL(prog,progi);
567 const char * const i_strpos = strpos;
569 GET_RE_DEBUG_FLAGS_DECL;
571 PERL_ARGS_ASSERT_RE_INTUIT_START;
572 PERL_UNUSED_ARG(flags);
573 PERL_UNUSED_ARG(data);
575 RX_MATCH_UTF8_set(rx,utf8_target);
578 PL_reg_flags |= RF_utf8;
581 debug_start_match(rx, utf8_target, strpos, strend,
582 sv ? "Guessing start of match in sv for"
583 : "Guessing start of match in string for");
586 /* CHR_DIST() would be more correct here but it makes things slow. */
587 if (prog->minlen > strend - strpos) {
588 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
589 "String too short... [re_intuit_start]\n"));
593 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
596 if (!prog->check_utf8 && prog->check_substr)
597 to_utf8_substr(prog);
598 check = prog->check_utf8;
600 if (!prog->check_substr && prog->check_utf8)
601 to_byte_substr(prog);
602 check = prog->check_substr;
604 if (check == &PL_sv_undef) {
605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
606 "Non-utf8 string cannot match utf8 check string\n"));
609 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
610 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
611 || ( (prog->extflags & RXf_ANCH_BOL)
612 && !multiline ) ); /* Check after \n? */
615 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
616 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
617 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
619 && (strpos != strbeg)) {
620 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
623 if (prog->check_offset_min == prog->check_offset_max &&
624 !(prog->extflags & RXf_CANY_SEEN)) {
625 /* Substring at constant offset from beg-of-str... */
628 s = HOP3c(strpos, prog->check_offset_min, strend);
631 slen = SvCUR(check); /* >= 1 */
633 if ( strend - s > slen || strend - s < slen - 1
634 || (strend - s == slen && strend[-1] != '\n')) {
635 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
638 /* Now should match s[0..slen-2] */
640 if (slen && (*SvPVX_const(check) != *s
642 && memNE(SvPVX_const(check), s, slen)))) {
644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
648 else if (*SvPVX_const(check) != *s
649 || ((slen = SvCUR(check)) > 1
650 && memNE(SvPVX_const(check), s, slen)))
653 goto success_at_start;
656 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
658 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
659 end_shift = prog->check_end_shift;
662 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
663 - (SvTAIL(check) != 0);
664 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
666 if (end_shift < eshift)
670 else { /* Can match at random position */
673 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
674 end_shift = prog->check_end_shift;
676 /* end shift should be non negative here */
679 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
681 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
682 (IV)end_shift, RX_PRECOMP(prog));
686 /* Find a possible match in the region s..strend by looking for
687 the "check" substring in the region corrected by start/end_shift. */
690 I32 srch_start_shift = start_shift;
691 I32 srch_end_shift = end_shift;
694 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
695 srch_end_shift -= ((strbeg - s) - srch_start_shift);
696 srch_start_shift = strbeg - s;
698 DEBUG_OPTIMISE_MORE_r({
699 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
700 (IV)prog->check_offset_min,
701 (IV)srch_start_shift,
703 (IV)prog->check_end_shift);
706 if (prog->extflags & RXf_CANY_SEEN) {
707 start_point= (U8*)(s + srch_start_shift);
708 end_point= (U8*)(strend - srch_end_shift);
710 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
711 end_point= HOP3(strend, -srch_end_shift, strbeg);
713 DEBUG_OPTIMISE_MORE_r({
714 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
715 (int)(end_point - start_point),
716 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
720 s = fbm_instr( start_point, end_point,
721 check, multiline ? FBMrf_MULTILINE : 0);
723 /* Update the count-of-usability, remove useless subpatterns,
727 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
728 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
729 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
730 (s ? "Found" : "Did not find"),
731 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
732 ? "anchored" : "floating"),
735 (s ? " at offset " : "...\n") );
740 /* Finish the diagnostic message */
741 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
743 /* XXX dmq: first branch is for positive lookbehind...
744 Our check string is offset from the beginning of the pattern.
745 So we need to do any stclass tests offset forward from that
754 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
755 Start with the other substr.
756 XXXX no SCREAM optimization yet - and a very coarse implementation
757 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
758 *always* match. Probably should be marked during compile...
759 Probably it is right to do no SCREAM here...
762 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
763 : (prog->float_substr && prog->anchored_substr))
765 /* Take into account the "other" substring. */
766 /* XXXX May be hopelessly wrong for UTF... */
769 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
772 char * const last = HOP3c(s, -start_shift, strbeg);
774 char * const saved_s = s;
777 t = s - prog->check_offset_max;
778 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
780 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
785 t = HOP3c(t, prog->anchored_offset, strend);
786 if (t < other_last) /* These positions already checked */
788 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
791 /* XXXX It is not documented what units *_offsets are in.
792 We assume bytes, but this is clearly wrong.
793 Meaning this code needs to be carefully reviewed for errors.
797 /* On end-of-str: see comment below. */
798 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
799 if (must == &PL_sv_undef) {
801 DEBUG_r(must = prog->anchored_utf8); /* for debug */
806 HOP3(HOP3(last1, prog->anchored_offset, strend)
807 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
809 multiline ? FBMrf_MULTILINE : 0
812 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
813 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
814 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
815 (s ? "Found" : "Contradicts"),
816 quoted, RE_SV_TAIL(must));
821 if (last1 >= last2) {
822 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
823 ", giving up...\n"));
826 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
827 ", trying floating at offset %ld...\n",
828 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
829 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
830 s = HOP3c(last, 1, strend);
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
835 (long)(s - i_strpos)));
836 t = HOP3c(s, -prog->anchored_offset, strbeg);
837 other_last = HOP3c(s, 1, strend);
845 else { /* Take into account the floating substring. */
847 char * const saved_s = s;
850 t = HOP3c(s, -start_shift, strbeg);
852 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
853 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
854 last = HOP3c(t, prog->float_max_offset, strend);
855 s = HOP3c(t, prog->float_min_offset, strend);
858 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
859 must = utf8_target ? prog->float_utf8 : prog->float_substr;
860 /* fbm_instr() takes into account exact value of end-of-str
861 if the check is SvTAIL(ed). Since false positives are OK,
862 and end-of-str is not later than strend we are OK. */
863 if (must == &PL_sv_undef) {
865 DEBUG_r(must = prog->float_utf8); /* for debug message */
868 s = fbm_instr((unsigned char*)s,
869 (unsigned char*)last + SvCUR(must)
871 must, multiline ? FBMrf_MULTILINE : 0);
873 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
874 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
875 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
876 (s ? "Found" : "Contradicts"),
877 quoted, RE_SV_TAIL(must));
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
882 ", giving up...\n"));
885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
886 ", trying anchored starting at offset %ld...\n",
887 (long)(saved_s + 1 - i_strpos)));
889 s = HOP3c(t, 1, strend);
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
894 (long)(s - i_strpos)));
895 other_last = s; /* Fix this later. --Hugo */
905 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
907 DEBUG_OPTIMISE_MORE_r(
908 PerlIO_printf(Perl_debug_log,
909 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
910 (IV)prog->check_offset_min,
911 (IV)prog->check_offset_max,
919 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
921 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
924 /* Fixed substring is found far enough so that the match
925 cannot start at strpos. */
927 if (ml_anch && t[-1] != '\n') {
928 /* Eventually fbm_*() should handle this, but often
929 anchored_offset is not 0, so this check will not be wasted. */
930 /* XXXX In the code below we prefer to look for "^" even in
931 presence of anchored substrings. And we search even
932 beyond the found float position. These pessimizations
933 are historical artefacts only. */
935 while (t < strend - prog->minlen) {
937 if (t < check_at - prog->check_offset_min) {
938 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
939 /* Since we moved from the found position,
940 we definitely contradict the found anchored
941 substr. Due to the above check we do not
942 contradict "check" substr.
943 Thus we can arrive here only if check substr
944 is float. Redo checking for "other"=="fixed".
947 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
948 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
949 goto do_other_anchored;
951 /* We don't contradict the found floating substring. */
952 /* XXXX Why not check for STCLASS? */
954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
955 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
958 /* Position contradicts check-string */
959 /* XXXX probably better to look for check-string
960 than for "\n", so one should lower the limit for t? */
961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
962 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
963 other_last = strpos = s = t + 1;
968 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
969 PL_colors[0], PL_colors[1]));
973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
974 PL_colors[0], PL_colors[1]));
978 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
981 /* The found string does not prohibit matching at strpos,
982 - no optimization of calling REx engine can be performed,
983 unless it was an MBOL and we are not after MBOL,
984 or a future STCLASS check will fail this. */
986 /* Even in this situation we may use MBOL flag if strpos is offset
987 wrt the start of the string. */
988 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
989 && (strpos != strbeg) && strpos[-1] != '\n'
990 /* May be due to an implicit anchor of m{.*foo} */
991 && !(prog->intflags & PREGf_IMPLICIT))
996 DEBUG_EXECUTE_r( if (ml_anch)
997 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
998 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1001 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1003 prog->check_utf8 /* Could be deleted already */
1004 && --BmUSEFUL(prog->check_utf8) < 0
1005 && (prog->check_utf8 == prog->float_utf8)
1007 prog->check_substr /* Could be deleted already */
1008 && --BmUSEFUL(prog->check_substr) < 0
1009 && (prog->check_substr == prog->float_substr)
1012 /* If flags & SOMETHING - do not do it many times on the same match */
1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1014 /* XXX Does the destruction order has to change with utf8_target? */
1015 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1016 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1017 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1018 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1019 check = NULL; /* abort */
1021 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1022 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1023 if (prog->intflags & PREGf_IMPLICIT)
1024 prog->extflags &= ~RXf_ANCH_MBOL;
1025 /* XXXX This is a remnant of the old implementation. It
1026 looks wasteful, since now INTUIT can use many
1027 other heuristics. */
1028 prog->extflags &= ~RXf_USE_INTUIT;
1029 /* XXXX What other flags might need to be cleared in this branch? */
1035 /* Last resort... */
1036 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1037 /* trie stclasses are too expensive to use here, we are better off to
1038 leave it to regmatch itself */
1039 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1040 /* minlen == 0 is possible if regstclass is \b or \B,
1041 and the fixed substr is ''$.
1042 Since minlen is already taken into account, s+1 is before strend;
1043 accidentally, minlen >= 1 guaranties no false positives at s + 1
1044 even for \b or \B. But (minlen? 1 : 0) below assumes that
1045 regstclass does not come from lookahead... */
1046 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1047 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1048 const U8* const str = (U8*)STRING(progi->regstclass);
1049 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1050 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1053 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1054 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1055 else if (prog->float_substr || prog->float_utf8)
1056 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1061 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1064 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1067 const char *what = NULL;
1069 if (endpos == strend) {
1070 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1071 "Could not match STCLASS...\n") );
1074 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1075 "This position contradicts STCLASS...\n") );
1076 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1078 /* Contradict one of substrings */
1079 if (prog->anchored_substr || prog->anchored_utf8) {
1080 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1081 DEBUG_EXECUTE_r( what = "anchored" );
1083 s = HOP3c(t, 1, strend);
1084 if (s + start_shift + end_shift > strend) {
1085 /* XXXX Should be taken into account earlier? */
1086 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1087 "Could not match STCLASS...\n") );
1092 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1093 "Looking for %s substr starting at offset %ld...\n",
1094 what, (long)(s + start_shift - i_strpos)) );
1097 /* Have both, check_string is floating */
1098 if (t + start_shift >= check_at) /* Contradicts floating=check */
1099 goto retry_floating_check;
1100 /* Recheck anchored substring, but not floating... */
1104 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105 "Looking for anchored substr starting at offset %ld...\n",
1106 (long)(other_last - i_strpos)) );
1107 goto do_other_anchored;
1109 /* Another way we could have checked stclass at the
1110 current position only: */
1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116 "Looking for /%s^%s/m starting at offset %ld...\n",
1117 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1120 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1122 /* Check is floating substring. */
1123 retry_floating_check:
1124 t = check_at - start_shift;
1125 DEBUG_EXECUTE_r( what = "floating" );
1126 goto hop_and_restart;
1129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1130 "By STCLASS: moving %ld --> %ld\n",
1131 (long)(t - i_strpos), (long)(s - i_strpos))
1135 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1136 "Does not contradict STCLASS...\n");
1141 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1142 PL_colors[4], (check ? "Guessed" : "Giving up"),
1143 PL_colors[5], (long)(s - i_strpos)) );
1146 fail_finish: /* Substring not found */
1147 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1148 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1151 PL_colors[4], PL_colors[5]));
1155 #define DECL_TRIE_TYPE(scan) \
1156 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1157 trie_type = ((scan->flags == EXACT) \
1158 ? (utf8_target ? trie_utf8 : trie_plain) \
1159 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1161 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1162 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1164 switch (trie_type) { \
1165 case trie_utf8_fold: \
1166 if ( foldlen>0 ) { \
1167 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1172 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1173 len = UTF8SKIP(uc); \
1174 skiplen = UNISKIP( uvc ); \
1175 foldlen -= skiplen; \
1176 uscan = foldbuf + skiplen; \
1179 case trie_latin_utf8_fold: \
1180 if ( foldlen>0 ) { \
1181 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1187 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1188 skiplen = UNISKIP( uvc ); \
1189 foldlen -= skiplen; \
1190 uscan = foldbuf + skiplen; \
1194 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1201 charid = trie->charmap[ uvc ]; \
1205 if (widecharmap) { \
1206 SV** const svpp = hv_fetch(widecharmap, \
1207 (char*)&uvc, sizeof(UV), 0); \
1209 charid = (U16)SvIV(*svpp); \
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1218 && (ln == 1 || folder(s, pat_string, ln)) \
1219 && (!reginfo || regtry(reginfo, &s)) ) \
1225 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1227 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1233 #define REXEC_FBC_SCAN(CoDe) \
1235 while (s < strend) { \
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1242 REXEC_FBC_UTF8_SCAN( \
1244 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1253 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1256 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1265 #define REXEC_FBC_TRYIT \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1270 if (utf8_target) { \
1271 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1274 REXEC_FBC_CLASS_SCAN(CoNd); \
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1278 if (utf8_target) { \
1280 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1283 REXEC_FBC_CLASS_SCAN(CoNd); \
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1287 PL_reg_flags |= RF_tainted; \
1288 if (utf8_target) { \
1289 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1292 REXEC_FBC_CLASS_SCAN(CoNd); \
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1301 tmp = TEST_NON_UTF8(tmp); \
1302 REXEC_FBC_UTF8_SCAN( \
1303 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313 if (s == PL_bostr) { \
1317 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1318 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1321 LOAD_UTF8_CHARCLASS_ALNUM(); \
1322 REXEC_FBC_UTF8_SCAN( \
1323 if (tmp == ! (TeSt2_UtF8)) { \
1332 /* The only difference between the BOUND and NBOUND cases is that
1333 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334 * NBOUND. This is accomplished by passing it in either the if or else clause,
1335 * with the other one being empty */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1349 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1350 * be passed in completely with the variable name being tested, which isn't
1351 * such a clean interface, but this is easier to read than it was before. We
1352 * are looking for the boundary (or non-boundary between a word and non-word
1353 * character. The utf8 and non-utf8 cases have the same logic, but the details
1354 * must be different. Find the "wordness" of the character just prior to this
1355 * one, and compare it with the wordness of this one. If they differ, we have
1356 * a boundary. At the beginning of the string, pretend that the previous
1357 * character was a new-line */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359 if (utf8_target) { \
1362 else { /* Not utf8 */ \
1363 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1364 tmp = TEST_NON_UTF8(tmp); \
1366 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1375 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1378 /* We know what class REx starts with. Try to find this position... */
1379 /* if reginfo is NULL, its a dryrun */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381 in regmatch. /grrr */
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385 const char *strend, regmatch_info *reginfo)
1388 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389 char *pat_string; /* The pattern's exactish string */
1390 char *pat_end; /* ptr to end char of pat_string */
1391 re_fold_t folder; /* Function for computing non-utf8 folds */
1392 const U8 *fold_array; /* array for folding ords < 256 */
1395 register STRLEN uskip;
1399 register I32 tmp = 1; /* Scratch variable? */
1400 register const bool utf8_target = PL_reg_match_utf8;
1401 UV utf8_fold_flags = 0;
1402 RXi_GET_DECL(prog,progi);
1404 PERL_ARGS_ASSERT_FIND_BYCLASS;
1406 /* We know what class it must start with. */
1410 if (utf8_target || OP(c) == ANYOFV) {
1411 STRLEN inclasslen = strend - s;
1412 REXEC_FBC_UTF8_CLASS_SCAN(
1413 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1416 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1421 if (tmp && (!reginfo || regtry(reginfo, &s)))
1429 if (UTF_PATTERN || utf8_target) {
1430 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1431 goto do_exactf_utf8;
1433 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1434 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1435 goto do_exactf_non_utf8; /* isn't dealt with by these */
1440 /* regcomp.c already folded this if pattern is in UTF-8 */
1441 utf8_fold_flags = 0;
1442 goto do_exactf_utf8;
1444 fold_array = PL_fold;
1446 goto do_exactf_non_utf8;
1449 if (UTF_PATTERN || utf8_target) {
1450 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1451 goto do_exactf_utf8;
1453 fold_array = PL_fold_locale;
1454 folder = foldEQ_locale;
1455 goto do_exactf_non_utf8;
1459 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1461 goto do_exactf_utf8;
1463 case EXACTFU_TRICKYFOLD:
1465 if (UTF_PATTERN || utf8_target) {
1466 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1467 goto do_exactf_utf8;
1470 /* Any 'ss' in the pattern should have been replaced by regcomp,
1471 * so we don't have to worry here about this single special case
1472 * in the Latin1 range */
1473 fold_array = PL_fold_latin1;
1474 folder = foldEQ_latin1;
1478 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1479 are no glitches with fold-length differences
1480 between the target string and pattern */
1482 /* The idea in the non-utf8 EXACTF* cases is to first find the
1483 * first character of the EXACTF* node and then, if necessary,
1484 * case-insensitively compare the full text of the node. c1 is the
1485 * first character. c2 is its fold. This logic will not work for
1486 * Unicode semantics and the german sharp ss, which hence should
1487 * not be compiled into a node that gets here. */
1488 pat_string = STRING(c);
1489 ln = STR_LEN(c); /* length to match in octets/bytes */
1491 /* We know that we have to match at least 'ln' bytes (which is the
1492 * same as characters, since not utf8). If we have to match 3
1493 * characters, and there are only 2 availabe, we know without
1494 * trying that it will fail; so don't start a match past the
1495 * required minimum number from the far end */
1496 e = HOP3c(strend, -((I32)ln), s);
1498 if (!reginfo && e < s) {
1499 e = s; /* Due to minlen logic of intuit() */
1503 c2 = fold_array[c1];
1504 if (c1 == c2) { /* If char and fold are the same */
1505 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1508 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1517 /* If one of the operands is in utf8, we can't use the simpler
1518 * folding above, due to the fact that many different characters
1519 * can have the same fold, or portion of a fold, or different-
1521 pat_string = STRING(c);
1522 ln = STR_LEN(c); /* length to match in octets/bytes */
1523 pat_end = pat_string + ln;
1524 lnc = (UTF_PATTERN) /* length to match in characters */
1525 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1528 /* We have 'lnc' characters to match in the pattern, but because of
1529 * multi-character folding, each character in the target can match
1530 * up to 3 characters (Unicode guarantees it will never exceed
1531 * this) if it is utf8-encoded; and up to 2 if not (based on the
1532 * fact that the Latin 1 folds are already determined, and the
1533 * only multi-char fold in that range is the sharp-s folding to
1534 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1535 * string character. Adjust lnc accordingly, rounding up, so that
1536 * if we need to match at least 4+1/3 chars, that really is 5. */
1537 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1538 lnc = (lnc + expansion - 1) / expansion;
1540 /* As in the non-UTF8 case, if we have to match 3 characters, and
1541 * only 2 are left, it's guaranteed to fail, so don't start a
1542 * match that would require us to go beyond the end of the string
1544 e = HOP3c(strend, -((I32)lnc), s);
1546 if (!reginfo && e < s) {
1547 e = s; /* Due to minlen logic of intuit() */
1550 /* XXX Note that we could recalculate e to stop the loop earlier,
1551 * as the worst case expansion above will rarely be met, and as we
1552 * go along we would usually find that e moves further to the left.
1553 * This would happen only after we reached the point in the loop
1554 * where if there were no expansion we should fail. Unclear if
1555 * worth the expense */
1558 char *my_strend= (char *)strend;
1559 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1560 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1561 && (!reginfo || regtry(reginfo, &s)) )
1565 s += (utf8_target) ? UTF8SKIP(s) : 1;
1570 PL_reg_flags |= RF_tainted;
1571 FBC_BOUND(isALNUM_LC,
1572 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1573 isALNUM_LC_utf8((U8*)s));
1576 PL_reg_flags |= RF_tainted;
1577 FBC_NBOUND(isALNUM_LC,
1578 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1579 isALNUM_LC_utf8((U8*)s));
1582 FBC_BOUND(isWORDCHAR,
1584 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1587 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1589 isWORDCHAR_A((U8*)s));
1592 FBC_NBOUND(isWORDCHAR,
1594 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1597 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1599 isWORDCHAR_A((U8*)s));
1602 FBC_BOUND(isWORDCHAR_L1,
1604 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1607 FBC_NBOUND(isWORDCHAR_L1,
1609 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1612 REXEC_FBC_CSCAN_TAINT(
1613 isALNUM_LC_utf8((U8*)s),
1618 REXEC_FBC_CSCAN_PRELOAD(
1619 LOAD_UTF8_CHARCLASS_ALNUM(),
1620 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1621 isWORDCHAR_L1((U8) *s)
1625 REXEC_FBC_CSCAN_PRELOAD(
1626 LOAD_UTF8_CHARCLASS_ALNUM(),
1627 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1632 /* Don't need to worry about utf8, as it can match only a single
1633 * byte invariant character */
1634 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1637 REXEC_FBC_CSCAN_PRELOAD(
1638 LOAD_UTF8_CHARCLASS_ALNUM(),
1639 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1640 ! isWORDCHAR_L1((U8) *s)
1644 REXEC_FBC_CSCAN_PRELOAD(
1645 LOAD_UTF8_CHARCLASS_ALNUM(),
1646 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1657 REXEC_FBC_CSCAN_TAINT(
1658 !isALNUM_LC_utf8((U8*)s),
1663 REXEC_FBC_CSCAN_PRELOAD(
1664 LOAD_UTF8_CHARCLASS_SPACE(),
1665 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_SPACE(),
1672 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1677 /* Don't need to worry about utf8, as it can match only a single
1678 * byte invariant character */
1679 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1682 REXEC_FBC_CSCAN_TAINT(
1683 isSPACE_LC_utf8((U8*)s),
1688 REXEC_FBC_CSCAN_PRELOAD(
1689 LOAD_UTF8_CHARCLASS_SPACE(),
1690 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1691 ! isSPACE_L1((U8) *s)
1695 REXEC_FBC_CSCAN_PRELOAD(
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1708 REXEC_FBC_CSCAN_TAINT(
1709 !isSPACE_LC_utf8((U8*)s),
1714 REXEC_FBC_CSCAN_PRELOAD(
1715 LOAD_UTF8_CHARCLASS_DIGIT(),
1716 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1721 /* Don't need to worry about utf8, as it can match only a single
1722 * byte invariant character */
1723 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1726 REXEC_FBC_CSCAN_TAINT(
1727 isDIGIT_LC_utf8((U8*)s),
1732 REXEC_FBC_CSCAN_PRELOAD(
1733 LOAD_UTF8_CHARCLASS_DIGIT(),
1734 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1745 REXEC_FBC_CSCAN_TAINT(
1746 !isDIGIT_LC_utf8((U8*)s),
1753 is_LNBREAK_latin1(s)
1765 !is_VERTWS_latin1(s)
1771 is_HORIZWS_latin1(s)
1776 !is_HORIZWS_utf8(s),
1777 !is_HORIZWS_latin1(s)
1784 /* what trie are we using right now */
1786 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1788 = (reg_trie_data*)progi->data->data[ aho->trie ];
1789 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1791 const char *last_start = strend - trie->minlen;
1793 const char *real_start = s;
1795 STRLEN maxlen = trie->maxlen;
1797 U8 **points; /* map of where we were in the input string
1798 when reading a given char. For ASCII this
1799 is unnecessary overhead as the relationship
1800 is always 1:1, but for Unicode, especially
1801 case folded Unicode this is not true. */
1802 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1806 GET_RE_DEBUG_FLAGS_DECL;
1808 /* We can't just allocate points here. We need to wrap it in
1809 * an SV so it gets freed properly if there is a croak while
1810 * running the match */
1813 sv_points=newSV(maxlen * sizeof(U8 *));
1814 SvCUR_set(sv_points,
1815 maxlen * sizeof(U8 *));
1816 SvPOK_on(sv_points);
1817 sv_2mortal(sv_points);
1818 points=(U8**)SvPV_nolen(sv_points );
1819 if ( trie_type != trie_utf8_fold
1820 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1823 bitmap=(U8*)trie->bitmap;
1825 bitmap=(U8*)ANYOF_BITMAP(c);
1827 /* this is the Aho-Corasick algorithm modified a touch
1828 to include special handling for long "unknown char"
1829 sequences. The basic idea being that we use AC as long
1830 as we are dealing with a possible matching char, when
1831 we encounter an unknown char (and we have not encountered
1832 an accepting state) we scan forward until we find a legal
1834 AC matching is basically that of trie matching, except
1835 that when we encounter a failing transition, we fall back
1836 to the current states "fail state", and try the current char
1837 again, a process we repeat until we reach the root state,
1838 state 1, or a legal transition. If we fail on the root state
1839 then we can either terminate if we have reached an accepting
1840 state previously, or restart the entire process from the beginning
1844 while (s <= last_start) {
1845 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1853 U8 *uscan = (U8*)NULL;
1854 U8 *leftmost = NULL;
1856 U32 accepted_word= 0;
1860 while ( state && uc <= (U8*)strend ) {
1862 U32 word = aho->states[ state ].wordnum;
1866 DEBUG_TRIE_EXECUTE_r(
1867 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1868 dump_exec_pos( (char *)uc, c, strend, real_start,
1869 (char *)uc, utf8_target );
1870 PerlIO_printf( Perl_debug_log,
1871 " Scanning for legal start char...\n");
1875 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1879 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1885 if (uc >(U8*)last_start) break;
1889 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1890 if (!leftmost || lpos < leftmost) {
1891 DEBUG_r(accepted_word=word);
1897 points[pointpos++ % maxlen]= uc;
1898 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1899 uscan, len, uvc, charid, foldlen,
1901 DEBUG_TRIE_EXECUTE_r({
1902 dump_exec_pos( (char *)uc, c, strend, real_start,
1904 PerlIO_printf(Perl_debug_log,
1905 " Charid:%3u CP:%4"UVxf" ",
1911 word = aho->states[ state ].wordnum;
1913 base = aho->states[ state ].trans.base;
1915 DEBUG_TRIE_EXECUTE_r({
1917 dump_exec_pos( (char *)uc, c, strend, real_start,
1919 PerlIO_printf( Perl_debug_log,
1920 "%sState: %4"UVxf", word=%"UVxf,
1921 failed ? " Fail transition to " : "",
1922 (UV)state, (UV)word);
1928 ( ((offset = base + charid
1929 - 1 - trie->uniquecharcount)) >= 0)
1930 && ((U32)offset < trie->lasttrans)
1931 && trie->trans[offset].check == state
1932 && (tmp=trie->trans[offset].next))
1934 DEBUG_TRIE_EXECUTE_r(
1935 PerlIO_printf( Perl_debug_log," - legal\n"));
1940 DEBUG_TRIE_EXECUTE_r(
1941 PerlIO_printf( Perl_debug_log," - fail\n"));
1943 state = aho->fail[state];
1947 /* we must be accepting here */
1948 DEBUG_TRIE_EXECUTE_r(
1949 PerlIO_printf( Perl_debug_log," - accepting\n"));
1958 if (!state) state = 1;
1961 if ( aho->states[ state ].wordnum ) {
1962 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1963 if (!leftmost || lpos < leftmost) {
1964 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1969 s = (char*)leftmost;
1970 DEBUG_TRIE_EXECUTE_r({
1972 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1973 (UV)accepted_word, (IV)(s - real_start)
1976 if (!reginfo || regtry(reginfo, &s)) {
1982 DEBUG_TRIE_EXECUTE_r({
1983 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log,"No match.\n"));
1996 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2006 - regexec_flags - match a regexp against a string
2009 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2010 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2011 /* strend: pointer to null at end of string */
2012 /* strbeg: real beginning of string */
2013 /* minend: end of match must be >=minend after stringarg. */
2014 /* data: May be used for some additional optimizations.
2015 Currently its only used, with a U32 cast, for transmitting
2016 the ganch offset when doing a /g match. This will change */
2017 /* nosave: For optimizations. */
2020 struct regexp *const prog = (struct regexp *)SvANY(rx);
2021 /*register*/ char *s;
2022 register regnode *c;
2023 /*register*/ char *startpos = stringarg;
2024 I32 minlen; /* must match at least this many chars */
2025 I32 dontbother = 0; /* how many characters not to try at end */
2026 I32 end_shift = 0; /* Same for the end. */ /* CC */
2027 I32 scream_pos = -1; /* Internal iterator of scream. */
2028 char *scream_olds = NULL;
2029 const bool utf8_target = cBOOL(DO_UTF8(sv));
2031 RXi_GET_DECL(prog,progi);
2032 regmatch_info reginfo; /* create some info to pass to regtry etc */
2033 regexp_paren_pair *swap = NULL;
2034 GET_RE_DEBUG_FLAGS_DECL;
2036 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2037 PERL_UNUSED_ARG(data);
2039 /* Be paranoid... */
2040 if (prog == NULL || startpos == NULL) {
2041 Perl_croak(aTHX_ "NULL regexp parameter");
2045 multiline = prog->extflags & RXf_PMf_MULTILINE;
2046 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2048 RX_MATCH_UTF8_set(rx, utf8_target);
2050 debug_start_match(rx, utf8_target, startpos, strend,
2054 minlen = prog->minlen;
2056 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2057 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2058 "String too short [regexec_flags]...\n"));
2063 /* Check validity of program. */
2064 if (UCHARAT(progi->program) != REG_MAGIC) {
2065 Perl_croak(aTHX_ "corrupted regexp program");
2069 PL_reg_eval_set = 0;
2073 PL_reg_flags |= RF_utf8;
2075 /* Mark beginning of line for ^ and lookbehind. */
2076 reginfo.bol = startpos; /* XXX not used ??? */
2080 /* Mark end of line for $ (and such) */
2083 /* see how far we have to get to not match where we matched before */
2084 reginfo.till = startpos+minend;
2086 /* If there is a "must appear" string, look for it. */
2089 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2091 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2092 reginfo.ganch = startpos + prog->gofs;
2093 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2094 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2095 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2097 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2098 && mg->mg_len >= 0) {
2099 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2100 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2101 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2103 if (prog->extflags & RXf_ANCH_GPOS) {
2104 if (s > reginfo.ganch)
2106 s = reginfo.ganch - prog->gofs;
2107 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2108 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2114 reginfo.ganch = strbeg + PTR2UV(data);
2115 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2116 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2118 } else { /* pos() not defined */
2119 reginfo.ganch = strbeg;
2120 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2121 "GPOS: reginfo.ganch = strbeg\n"));
2124 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2125 /* We have to be careful. If the previous successful match
2126 was from this regex we don't want a subsequent partially
2127 successful match to clobber the old results.
2128 So when we detect this possibility we add a swap buffer
2129 to the re, and switch the buffer each match. If we fail
2130 we switch it back, otherwise we leave it swapped.
2133 /* do we need a save destructor here for eval dies? */
2134 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2136 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2137 re_scream_pos_data d;
2139 d.scream_olds = &scream_olds;
2140 d.scream_pos = &scream_pos;
2141 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2144 goto phooey; /* not present */
2150 /* Simplest case: anchored match need be tried only once. */
2151 /* [unless only anchor is BOL and multiline is set] */
2152 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2153 if (s == startpos && regtry(®info, &startpos))
2155 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2156 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2161 dontbother = minlen - 1;
2162 end = HOP3c(strend, -dontbother, strbeg) - 1;
2163 /* for multiline we only have to try after newlines */
2164 if (prog->check_substr || prog->check_utf8) {
2165 /* because of the goto we can not easily reuse the macros for bifurcating the
2166 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2169 goto after_try_utf8;
2171 if (regtry(®info, &s)) {
2178 if (prog->extflags & RXf_USE_INTUIT) {
2179 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2188 } /* end search for check string in unicode */
2190 if (s == startpos) {
2191 goto after_try_latin;
2194 if (regtry(®info, &s)) {
2201 if (prog->extflags & RXf_USE_INTUIT) {
2202 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2211 } /* end search for check string in latin*/
2212 } /* end search for check string */
2213 else { /* search for newline */
2215 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2218 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2219 while (s <= end) { /* note it could be possible to match at the end of the string */
2220 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2221 if (regtry(®info, &s))
2225 } /* end search for newline */
2226 } /* end anchored/multiline check string search */
2228 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2230 /* the warning about reginfo.ganch being used without initialization
2231 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2232 and we only enter this block when the same bit is set. */
2233 char *tmp_s = reginfo.ganch - prog->gofs;
2235 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2240 /* Messy cases: unanchored match. */
2241 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2242 /* we have /x+whatever/ */
2243 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2248 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2249 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2250 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2255 DEBUG_EXECUTE_r( did_match = 1 );
2256 if (regtry(®info, &s)) goto got_it;
2258 while (s < strend && *s == ch)
2266 DEBUG_EXECUTE_r( did_match = 1 );
2267 if (regtry(®info, &s)) goto got_it;
2269 while (s < strend && *s == ch)
2274 DEBUG_EXECUTE_r(if (!did_match)
2275 PerlIO_printf(Perl_debug_log,
2276 "Did not find anchored character...\n")
2279 else if (prog->anchored_substr != NULL
2280 || prog->anchored_utf8 != NULL
2281 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2282 && prog->float_max_offset < strend - s)) {
2287 char *last1; /* Last position checked before */
2291 if (prog->anchored_substr || prog->anchored_utf8) {
2292 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2293 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2294 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2295 back_max = back_min = prog->anchored_offset;
2297 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2300 back_max = prog->float_max_offset;
2301 back_min = prog->float_min_offset;
2305 if (must == &PL_sv_undef)
2306 /* could not downgrade utf8 check substring, so must fail */
2312 last = HOP3c(strend, /* Cannot start after this */
2313 -(I32)(CHR_SVLEN(must)
2314 - (SvTAIL(must) != 0) + back_min), strbeg);
2317 last1 = HOPc(s, -1);
2319 last1 = s - 1; /* bogus */
2321 /* XXXX check_substr already used to find "s", can optimize if
2322 check_substr==must. */
2324 dontbother = end_shift;
2325 strend = HOPc(strend, -dontbother);
2326 while ( (s <= last) &&
2327 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2328 (unsigned char*)strend, must,
2329 multiline ? FBMrf_MULTILINE : 0)) ) {
2330 DEBUG_EXECUTE_r( did_match = 1 );
2331 if (HOPc(s, -back_max) > last1) {
2332 last1 = HOPc(s, -back_min);
2333 s = HOPc(s, -back_max);
2336 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2338 last1 = HOPc(s, -back_min);
2342 while (s <= last1) {
2343 if (regtry(®info, &s))
2349 while (s <= last1) {
2350 if (regtry(®info, &s))
2356 DEBUG_EXECUTE_r(if (!did_match) {
2357 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2358 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2359 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2360 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2361 ? "anchored" : "floating"),
2362 quoted, RE_SV_TAIL(must));
2366 else if ( (c = progi->regstclass) ) {
2368 const OPCODE op = OP(progi->regstclass);
2369 /* don't bother with what can't match */
2370 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2371 strend = HOPc(strend, -(minlen - 1));
2374 SV * const prop = sv_newmortal();
2375 regprop(prog, prop, c);
2377 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2379 PerlIO_printf(Perl_debug_log,
2380 "Matching stclass %.*s against %s (%d bytes)\n",
2381 (int)SvCUR(prop), SvPVX_const(prop),
2382 quoted, (int)(strend - s));
2385 if (find_byclass(prog, c, s, strend, ®info))
2387 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2391 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2398 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2399 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2400 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2402 little = SvPV_const(float_real, len);
2403 if (SvTAIL(float_real)) {
2404 /* This means that float_real contains an artificial \n on the end
2405 * due to the presence of something like this: /foo$/
2406 * where we can match both "foo" and "foo\n" at the end of the string.
2407 * So we have to compare the end of the string first against the float_real
2408 * without the \n and then against the full float_real with the string.
2409 * We have to watch out for cases where the string might be smaller
2410 * than the float_real or the float_real without the \n.
2412 char *checkpos= strend - len;
2414 PerlIO_printf(Perl_debug_log,
2415 "%sChecking for float_real.%s\n",
2416 PL_colors[4], PL_colors[5]));
2417 if (checkpos + 1 < strbeg) {
2418 /* can't match, even if we remove the trailing \n string is too short to match */
2420 PerlIO_printf(Perl_debug_log,
2421 "%sString shorter than required trailing substring, cannot match.%s\n",
2422 PL_colors[4], PL_colors[5]));
2424 } else if (memEQ(checkpos + 1, little, len - 1)) {
2425 /* can match, the end of the string matches without the "\n" */
2426 last = checkpos + 1;
2427 } else if (checkpos < strbeg) {
2428 /* cant match, string is too short when the "\n" is included */
2430 PerlIO_printf(Perl_debug_log,
2431 "%sString does not contain required trailing substring, cannot match.%s\n",
2432 PL_colors[4], PL_colors[5]));
2434 } else if (!multiline) {
2435 /* non multiline match, so compare with the "\n" at the end of the string */
2436 if (memEQ(checkpos, little, len)) {
2440 PerlIO_printf(Perl_debug_log,
2441 "%sString does not contain required trailing substring, cannot match.%s\n",
2442 PL_colors[4], PL_colors[5]));
2446 /* multiline match, so we have to search for a place where the full string is located */
2452 last = rninstr(s, strend, little, little + len);
2454 last = strend; /* matching "$" */
2457 /* at one point this block contained a comment which was probably
2458 * incorrect, which said that this was a "should not happen" case.
2459 * Even if it was true when it was written I am pretty sure it is
2460 * not anymore, so I have removed the comment and replaced it with
2463 PerlIO_printf(Perl_debug_log,
2464 "String does not contain required substring, cannot match.\n"
2468 dontbother = strend - last + prog->float_min_offset;
2470 if (minlen && (dontbother < minlen))
2471 dontbother = minlen - 1;
2472 strend -= dontbother; /* this one's always in bytes! */
2473 /* We don't know much -- general case. */
2476 if (regtry(®info, &s))
2485 if (regtry(®info, &s))
2487 } while (s++ < strend);
2496 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2498 if (PL_reg_eval_set)
2499 restore_pos(aTHX_ prog);
2500 if (RXp_PAREN_NAMES(prog))
2501 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2503 /* make sure $`, $&, $', and $digit will work later */
2504 if ( !(flags & REXEC_NOT_FIRST) ) {
2505 RX_MATCH_COPY_FREE(rx);
2506 if (flags & REXEC_COPY_STR) {
2507 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2508 #ifdef PERL_OLD_COPY_ON_WRITE
2510 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2512 PerlIO_printf(Perl_debug_log,
2513 "Copy on write: regexp capture, type %d\n",
2516 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2517 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2518 assert (SvPOKp(prog->saved_copy));
2522 RX_MATCH_COPIED_on(rx);
2523 s = savepvn(strbeg, i);
2529 prog->subbeg = strbeg;
2530 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2538 PL_colors[4], PL_colors[5]));
2539 if (PL_reg_eval_set)
2540 restore_pos(aTHX_ prog);
2542 /* we failed :-( roll it back */
2543 Safefree(prog->offs);
2552 - regtry - try match at specific point
2554 STATIC I32 /* 0 failure, 1 success */
2555 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2559 REGEXP *const rx = reginfo->prog;
2560 regexp *const prog = (struct regexp *)SvANY(rx);
2561 RXi_GET_DECL(prog,progi);
2562 GET_RE_DEBUG_FLAGS_DECL;
2564 PERL_ARGS_ASSERT_REGTRY;
2566 reginfo->cutpoint=NULL;
2568 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2571 PL_reg_eval_set = RS_init;
2572 DEBUG_EXECUTE_r(DEBUG_s(
2573 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2574 (IV)(PL_stack_sp - PL_stack_base));
2577 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2578 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2580 /* Apparently this is not needed, judging by wantarray. */
2581 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2582 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2585 /* Make $_ available to executed code. */
2586 if (reginfo->sv != DEFSV) {
2588 DEFSV_set(reginfo->sv);
2591 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2592 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2593 /* prepare for quick setting of pos */
2594 #ifdef PERL_OLD_COPY_ON_WRITE
2595 if (SvIsCOW(reginfo->sv))
2596 sv_force_normal_flags(reginfo->sv, 0);
2598 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2599 &PL_vtbl_mglob, NULL, 0);
2603 PL_reg_oldpos = mg->mg_len;
2604 SAVEDESTRUCTOR_X(restore_pos, prog);
2606 if (!PL_reg_curpm) {
2607 Newxz(PL_reg_curpm, 1, PMOP);
2610 SV* const repointer = &PL_sv_undef;
2611 /* this regexp is also owned by the new PL_reg_curpm, which
2612 will try to free it. */
2613 av_push(PL_regex_padav, repointer);
2614 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2615 PL_regex_pad = AvARRAY(PL_regex_padav);
2620 /* It seems that non-ithreads works both with and without this code.
2621 So for efficiency reasons it seems best not to have the code
2622 compiled when it is not needed. */
2623 /* This is safe against NULLs: */
2624 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2625 /* PM_reg_curpm owns a reference to this regexp. */
2626 (void)ReREFCNT_inc(rx);
2628 PM_SETRE(PL_reg_curpm, rx);
2629 PL_reg_oldcurpm = PL_curpm;
2630 PL_curpm = PL_reg_curpm;
2631 if (RXp_MATCH_COPIED(prog)) {
2632 /* Here is a serious problem: we cannot rewrite subbeg,
2633 since it may be needed if this match fails. Thus
2634 $` inside (?{}) could fail... */
2635 PL_reg_oldsaved = prog->subbeg;
2636 PL_reg_oldsavedlen = prog->sublen;
2637 #ifdef PERL_OLD_COPY_ON_WRITE
2638 PL_nrs = prog->saved_copy;
2640 RXp_MATCH_COPIED_off(prog);
2643 PL_reg_oldsaved = NULL;
2644 prog->subbeg = PL_bostr;
2645 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2647 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2648 prog->offs[0].start = *startpos - PL_bostr;
2649 PL_reginput = *startpos;
2650 PL_reglastparen = &prog->lastparen;
2651 PL_reglastcloseparen = &prog->lastcloseparen;
2652 prog->lastparen = 0;
2653 prog->lastcloseparen = 0;
2655 PL_regoffs = prog->offs;
2656 if (PL_reg_start_tmpl <= prog->nparens) {
2657 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2658 if(PL_reg_start_tmp)
2659 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2661 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2664 /* XXXX What this code is doing here?!!! There should be no need
2665 to do this again and again, PL_reglastparen should take care of
2668 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2669 * Actually, the code in regcppop() (which Ilya may be meaning by
2670 * PL_reglastparen), is not needed at all by the test suite
2671 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2672 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2673 * Meanwhile, this code *is* needed for the
2674 * above-mentioned test suite tests to succeed. The common theme
2675 * on those tests seems to be returning null fields from matches.
2676 * --jhi updated by dapm */
2678 if (prog->nparens) {
2679 regexp_paren_pair *pp = PL_regoffs;
2681 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2689 if (regmatch(reginfo, progi->program + 1)) {
2690 PL_regoffs[0].end = PL_reginput - PL_bostr;
2693 if (reginfo->cutpoint)
2694 *startpos= reginfo->cutpoint;
2695 REGCP_UNWIND(lastcp);
2700 #define sayYES goto yes
2701 #define sayNO goto no
2702 #define sayNO_SILENT goto no_silent
2704 /* we dont use STMT_START/END here because it leads to
2705 "unreachable code" warnings, which are bogus, but distracting. */
2706 #define CACHEsayNO \
2707 if (ST.cache_mask) \
2708 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2711 /* this is used to determine how far from the left messages like
2712 'failed...' are printed. It should be set such that messages
2713 are inline with the regop output that created them.
2715 #define REPORT_CODE_OFF 32
2718 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2719 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2721 #define SLAB_FIRST(s) (&(s)->states[0])
2722 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2724 /* grab a new slab and return the first slot in it */
2726 STATIC regmatch_state *
2729 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2732 regmatch_slab *s = PL_regmatch_slab->next;
2734 Newx(s, 1, regmatch_slab);
2735 s->prev = PL_regmatch_slab;
2737 PL_regmatch_slab->next = s;
2739 PL_regmatch_slab = s;
2740 return SLAB_FIRST(s);
2744 /* push a new state then goto it */
2746 #define PUSH_STATE_GOTO(state, node) \
2748 st->resume_state = state; \
2751 /* push a new state with success backtracking, then goto it */
2753 #define PUSH_YES_STATE_GOTO(state, node) \
2755 st->resume_state = state; \
2756 goto push_yes_state;
2762 regmatch() - main matching routine
2764 This is basically one big switch statement in a loop. We execute an op,
2765 set 'next' to point the next op, and continue. If we come to a point which
2766 we may need to backtrack to on failure such as (A|B|C), we push a
2767 backtrack state onto the backtrack stack. On failure, we pop the top
2768 state, and re-enter the loop at the state indicated. If there are no more
2769 states to pop, we return failure.
2771 Sometimes we also need to backtrack on success; for example /A+/, where
2772 after successfully matching one A, we need to go back and try to
2773 match another one; similarly for lookahead assertions: if the assertion
2774 completes successfully, we backtrack to the state just before the assertion
2775 and then carry on. In these cases, the pushed state is marked as
2776 'backtrack on success too'. This marking is in fact done by a chain of
2777 pointers, each pointing to the previous 'yes' state. On success, we pop to
2778 the nearest yes state, discarding any intermediate failure-only states.
2779 Sometimes a yes state is pushed just to force some cleanup code to be
2780 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2781 it to free the inner regex.
2783 Note that failure backtracking rewinds the cursor position, while
2784 success backtracking leaves it alone.
2786 A pattern is complete when the END op is executed, while a subpattern
2787 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2788 ops trigger the "pop to last yes state if any, otherwise return true"
2791 A common convention in this function is to use A and B to refer to the two
2792 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2793 the subpattern to be matched possibly multiple times, while B is the entire
2794 rest of the pattern. Variable and state names reflect this convention.
2796 The states in the main switch are the union of ops and failure/success of
2797 substates associated with with that op. For example, IFMATCH is the op
2798 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2799 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2800 successfully matched A and IFMATCH_A_fail is a state saying that we have
2801 just failed to match A. Resume states always come in pairs. The backtrack
2802 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2803 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2804 on success or failure.
2806 The struct that holds a backtracking state is actually a big union, with
2807 one variant for each major type of op. The variable st points to the
2808 top-most backtrack struct. To make the code clearer, within each
2809 block of code we #define ST to alias the relevant union.
2811 Here's a concrete example of a (vastly oversimplified) IFMATCH
2817 #define ST st->u.ifmatch
2819 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2820 ST.foo = ...; // some state we wish to save
2822 // push a yes backtrack state with a resume value of
2823 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2825 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2828 case IFMATCH_A: // we have successfully executed A; now continue with B
2830 bar = ST.foo; // do something with the preserved value
2833 case IFMATCH_A_fail: // A failed, so the assertion failed
2834 ...; // do some housekeeping, then ...
2835 sayNO; // propagate the failure
2842 For any old-timers reading this who are familiar with the old recursive
2843 approach, the code above is equivalent to:
2845 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2854 ...; // do some housekeeping, then ...
2855 sayNO; // propagate the failure
2858 The topmost backtrack state, pointed to by st, is usually free. If you
2859 want to claim it, populate any ST.foo fields in it with values you wish to
2860 save, then do one of
2862 PUSH_STATE_GOTO(resume_state, node);
2863 PUSH_YES_STATE_GOTO(resume_state, node);
2865 which sets that backtrack state's resume value to 'resume_state', pushes a
2866 new free entry to the top of the backtrack stack, then goes to 'node'.
2867 On backtracking, the free slot is popped, and the saved state becomes the
2868 new free state. An ST.foo field in this new top state can be temporarily
2869 accessed to retrieve values, but once the main loop is re-entered, it
2870 becomes available for reuse.
2872 Note that the depth of the backtrack stack constantly increases during the
2873 left-to-right execution of the pattern, rather than going up and down with
2874 the pattern nesting. For example the stack is at its maximum at Z at the
2875 end of the pattern, rather than at X in the following:
2877 /(((X)+)+)+....(Y)+....Z/
2879 The only exceptions to this are lookahead/behind assertions and the cut,
2880 (?>A), which pop all the backtrack states associated with A before
2883 Backtrack state structs are allocated in slabs of about 4K in size.
2884 PL_regmatch_state and st always point to the currently active state,
2885 and PL_regmatch_slab points to the slab currently containing
2886 PL_regmatch_state. The first time regmatch() is called, the first slab is
2887 allocated, and is never freed until interpreter destruction. When the slab
2888 is full, a new one is allocated and chained to the end. At exit from
2889 regmatch(), slabs allocated since entry are freed.
2894 #define DEBUG_STATE_pp(pp) \
2896 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2897 PerlIO_printf(Perl_debug_log, \
2898 " %*s"pp" %s%s%s%s%s\n", \
2900 PL_reg_name[st->resume_state], \
2901 ((st==yes_state||st==mark_state) ? "[" : ""), \
2902 ((st==yes_state) ? "Y" : ""), \
2903 ((st==mark_state) ? "M" : ""), \
2904 ((st==yes_state||st==mark_state) ? "]" : "") \
2909 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2914 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2915 const char *start, const char *end, const char *blurb)
2917 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2919 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2924 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2925 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2927 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2928 start, end - start, 60);
2930 PerlIO_printf(Perl_debug_log,
2931 "%s%s REx%s %s against %s\n",
2932 PL_colors[4], blurb, PL_colors[5], s0, s1);
2934 if (utf8_target||utf8_pat)
2935 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2936 utf8_pat ? "pattern" : "",
2937 utf8_pat && utf8_target ? " and " : "",
2938 utf8_target ? "string" : ""
2944 S_dump_exec_pos(pTHX_ const char *locinput,
2945 const regnode *scan,
2946 const char *loc_regeol,
2947 const char *loc_bostr,
2948 const char *loc_reg_starttry,
2949 const bool utf8_target)
2951 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2952 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2953 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2954 /* The part of the string before starttry has one color
2955 (pref0_len chars), between starttry and current
2956 position another one (pref_len - pref0_len chars),
2957 after the current position the third one.
2958 We assume that pref0_len <= pref_len, otherwise we
2959 decrease pref0_len. */
2960 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2961 ? (5 + taill) - l : locinput - loc_bostr;
2964 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2966 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2968 pref0_len = pref_len - (locinput - loc_reg_starttry);
2969 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2970 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2971 ? (5 + taill) - pref_len : loc_regeol - locinput);
2972 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2976 if (pref0_len > pref_len)
2977 pref0_len = pref_len;
2979 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2981 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2982 (locinput - pref_len),pref0_len, 60, 4, 5);
2984 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2985 (locinput - pref_len + pref0_len),
2986 pref_len - pref0_len, 60, 2, 3);
2988 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2989 locinput, loc_regeol - locinput, 10, 0, 1);
2991 const STRLEN tlen=len0+len1+len2;
2992 PerlIO_printf(Perl_debug_log,
2993 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2994 (IV)(locinput - loc_bostr),
2997 (docolor ? "" : "> <"),
2999 (int)(tlen > 19 ? 0 : 19 - tlen),
3006 /* reg_check_named_buff_matched()
3007 * Checks to see if a named buffer has matched. The data array of
3008 * buffer numbers corresponding to the buffer is expected to reside
3009 * in the regexp->data->data array in the slot stored in the ARG() of
3010 * node involved. Note that this routine doesn't actually care about the
3011 * name, that information is not preserved from compilation to execution.
3012 * Returns the index of the leftmost defined buffer with the given name
3013 * or 0 if non of the buffers matched.
3016 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3019 RXi_GET_DECL(rex,rexi);
3020 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3021 I32 *nums=(I32*)SvPVX(sv_dat);
3023 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3025 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3026 if ((I32)*PL_reglastparen >= nums[n] &&
3027 PL_regoffs[nums[n]].end != -1)
3036 /* free all slabs above current one - called during LEAVE_SCOPE */
3039 S_clear_backtrack_stack(pTHX_ void *p)
3041 regmatch_slab *s = PL_regmatch_slab->next;
3046 PL_regmatch_slab->next = NULL;
3048 regmatch_slab * const osl = s;
3055 #define SETREX(Re1,Re2) \
3056 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3059 STATIC I32 /* 0 failure, 1 success */
3060 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3062 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3066 register const bool utf8_target = PL_reg_match_utf8;
3067 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3068 REGEXP *rex_sv = reginfo->prog;
3069 regexp *rex = (struct regexp *)SvANY(rex_sv);
3070 RXi_GET_DECL(rex,rexi);
3072 /* the current state. This is a cached copy of PL_regmatch_state */
3073 register regmatch_state *st;
3074 /* cache heavy used fields of st in registers */
3075 register regnode *scan;
3076 register regnode *next;
3077 register U32 n = 0; /* general value; init to avoid compiler warning */
3078 register I32 ln = 0; /* len or last; init to avoid compiler warning */
3079 register char *locinput = PL_reginput;
3080 register I32 nextchr; /* is always set to UCHARAT(locinput) */
3082 bool result = 0; /* return value of S_regmatch */
3083 int depth = 0; /* depth of backtrack stack */
3084 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3085 const U32 max_nochange_depth =
3086 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3087 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3088 regmatch_state *yes_state = NULL; /* state to pop to on success of
3090 /* mark_state piggy backs on the yes_state logic so that when we unwind
3091 the stack on success we can update the mark_state as we go */
3092 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3093 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3094 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3096 bool no_final = 0; /* prevent failure from backtracking? */
3097 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3098 char *startpoint = PL_reginput;
3099 SV *popmark = NULL; /* are we looking for a mark? */
3100 SV *sv_commit = NULL; /* last mark name seen in failure */
3101 SV *sv_yes_mark = NULL; /* last mark name we have seen
3102 during a successful match */
3103 U32 lastopen = 0; /* last open we saw */
3104 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3105 SV* const oreplsv = GvSV(PL_replgv);
3106 /* these three flags are set by various ops to signal information to
3107 * the very next op. They have a useful lifetime of exactly one loop
3108 * iteration, and are not preserved or restored by state pushes/pops
3110 bool sw = 0; /* the condition value in (?(cond)a|b) */
3111 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3112 int logical = 0; /* the following EVAL is:
3116 or the following IFMATCH/UNLESSM is:
3117 false: plain (?=foo)
3118 true: used as a condition: (?(?=foo))
3121 GET_RE_DEBUG_FLAGS_DECL;
3124 PERL_ARGS_ASSERT_REGMATCH;
3126 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3127 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3129 /* on first ever call to regmatch, allocate first slab */
3130 if (!PL_regmatch_slab) {
3131 Newx(PL_regmatch_slab, 1, regmatch_slab);
3132 PL_regmatch_slab->prev = NULL;
3133 PL_regmatch_slab->next = NULL;
3134 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3137 oldsave = PL_savestack_ix;
3138 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3139 SAVEVPTR(PL_regmatch_slab);
3140 SAVEVPTR(PL_regmatch_state);
3142 /* grab next free state slot */
3143 st = ++PL_regmatch_state;
3144 if (st > SLAB_LAST(PL_regmatch_slab))
3145 st = PL_regmatch_state = S_push_slab(aTHX);
3147 /* Note that nextchr is a byte even in UTF */
3148 nextchr = UCHARAT(locinput);
3150 while (scan != NULL) {
3153 SV * const prop = sv_newmortal();
3154 regnode *rnext=regnext(scan);
3155 DUMP_EXEC_POS( locinput, scan, utf8_target );
3156 regprop(rex, prop, scan);
3158 PerlIO_printf(Perl_debug_log,
3159 "%3"IVdf":%*s%s(%"IVdf")\n",
3160 (IV)(scan - rexi->program), depth*2, "",
3162 (PL_regkind[OP(scan)] == END || !rnext) ?
3163 0 : (IV)(rnext - rexi->program));
3166 next = scan + NEXT_OFF(scan);
3169 state_num = OP(scan);
3173 assert(PL_reglastparen == &rex->lastparen);
3174 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3175 assert(PL_regoffs == rex->offs);
3177 switch (state_num) {
3179 if (locinput == PL_bostr)
3181 /* reginfo->till = reginfo->bol; */
3186 if (locinput == PL_bostr ||
3187 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3193 if (locinput == PL_bostr)
3197 if (locinput == reginfo->ganch)
3202 /* update the startpoint */
3203 st->u.keeper.val = PL_regoffs[0].start;
3204 PL_reginput = locinput;
3205 PL_regoffs[0].start = locinput - PL_bostr;
3206 PUSH_STATE_GOTO(KEEPS_next, next);
3208 case KEEPS_next_fail:
3209 /* rollback the start point change */
3210 PL_regoffs[0].start = st->u.keeper.val;
3216 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3221 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3223 if (PL_regeol - locinput > 1)
3227 if (PL_regeol != locinput)
3231 if (!nextchr && locinput >= PL_regeol)
3234 locinput += PL_utf8skip[nextchr];
3235 if (locinput > PL_regeol)
3237 nextchr = UCHARAT(locinput);
3240 nextchr = UCHARAT(++locinput);
3243 if (!nextchr && locinput >= PL_regeol)
3245 nextchr = UCHARAT(++locinput);
3248 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3251 locinput += PL_utf8skip[nextchr];
3252 if (locinput > PL_regeol)
3254 nextchr = UCHARAT(locinput);
3257 nextchr = UCHARAT(++locinput);
3261 #define ST st->u.trie
3263 /* In this case the charclass data is available inline so
3264 we can fail fast without a lot of extra overhead.
3266 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3268 PerlIO_printf(Perl_debug_log,
3269 "%*s %sfailed to match trie start class...%s\n",
3270 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3277 /* the basic plan of execution of the trie is:
3278 * At the beginning, run though all the states, and
3279 * find the longest-matching word. Also remember the position
3280 * of the shortest matching word. For example, this pattern:
3283 * when matched against the string "abcde", will generate
3284 * accept states for all words except 3, with the longest
3285 * matching word being 4, and the shortest being 1 (with
3286 * the position being after char 1 of the string).
3288 * Then for each matching word, in word order (i.e. 1,2,4,5),
3289 * we run the remainder of the pattern; on each try setting
3290 * the current position to the character following the word,
3291 * returning to try the next word on failure.
3293 * We avoid having to build a list of words at runtime by
3294 * using a compile-time structure, wordinfo[].prev, which
3295 * gives, for each word, the previous accepting word (if any).
3296 * In the case above it would contain the mappings 1->2, 2->0,
3297 * 3->0, 4->5, 5->1. We can use this table to generate, from
3298 * the longest word (4 above), a list of all words, by
3299 * following the list of prev pointers; this gives us the
3300 * unordered list 4,5,1,2. Then given the current word we have
3301 * just tried, we can go through the list and find the
3302 * next-biggest word to try (so if we just failed on word 2,
3303 * the next in the list is 4).
3305 * Since at runtime we don't record the matching position in
3306 * the string for each word, we have to work that out for
3307 * each word we're about to process. The wordinfo table holds
3308 * the character length of each word; given that we recorded
3309 * at the start: the position of the shortest word and its
3310 * length in chars, we just need to move the pointer the
3311 * difference between the two char lengths. Depending on
3312 * Unicode status and folding, that's cheap or expensive.
3314 * This algorithm is optimised for the case where are only a
3315 * small number of accept states, i.e. 0,1, or maybe 2.
3316 * With lots of accepts states, and having to try all of them,
3317 * it becomes quadratic on number of accept states to find all
3322 /* what type of TRIE am I? (utf8 makes this contextual) */
3323 DECL_TRIE_TYPE(scan);
3325 /* what trie are we using right now */
3326 reg_trie_data * const trie
3327 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3328 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3329 U32 state = trie->startstate;
3331 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3332 if (trie->states[ state ].wordnum) {
3334 PerlIO_printf(Perl_debug_log,
3335 "%*s %smatched empty string...%s\n",
3336 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3342 PerlIO_printf(Perl_debug_log,
3343 "%*s %sfailed to match trie start class...%s\n",
3344 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3351 U8 *uc = ( U8* )locinput;
3355 U8 *uscan = (U8*)NULL;
3356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3357 U32 charcount = 0; /* how many input chars we have matched */
3358 U32 accepted = 0; /* have we seen any accepting states? */
3361 ST.jump = trie->jump;
3364 ST.longfold = FALSE; /* char longer if folded => it's harder */
3367 /* fully traverse the TRIE; note the position of the
3368 shortest accept state and the wordnum of the longest
3371 while ( state && uc <= (U8*)PL_regeol ) {
3372 U32 base = trie->states[ state ].trans.base;
3376 wordnum = trie->states[ state ].wordnum;
3378 if (wordnum) { /* it's an accept state */
3381 /* record first match position */
3383 ST.firstpos = (U8*)locinput;
3388 ST.firstchars = charcount;
3391 if (!ST.nextword || wordnum < ST.nextword)
3392 ST.nextword = wordnum;
3393 ST.topword = wordnum;
3396 DEBUG_TRIE_EXECUTE_r({
3397 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3398 PerlIO_printf( Perl_debug_log,
3399 "%*s %sState: %4"UVxf" Accepted: %c ",
3400 2+depth * 2, "", PL_colors[4],
3401 (UV)state, (accepted ? 'Y' : 'N'));
3404 /* read a char and goto next state */
3407 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3408 uscan, len, uvc, charid, foldlen,
3415 base + charid - 1 - trie->uniquecharcount)) >= 0)
3417 && ((U32)offset < trie->lasttrans)
3418 && trie->trans[offset].check == state)
3420 state = trie->trans[offset].next;
3431 DEBUG_TRIE_EXECUTE_r(
3432 PerlIO_printf( Perl_debug_log,
3433 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3434 charid, uvc, (UV)state, PL_colors[5] );
3440 /* calculate total number of accept states */
3445 w = trie->wordinfo[w].prev;
3448 ST.accepted = accepted;
3452 PerlIO_printf( Perl_debug_log,
3453 "%*s %sgot %"IVdf" possible matches%s\n",
3454 REPORT_CODE_OFF + depth * 2, "",
3455 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3457 goto trie_first_try; /* jump into the fail handler */
3461 case TRIE_next_fail: /* we failed - try next alternative */
3463 REGCP_UNWIND(ST.cp);
3464 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3465 PL_regoffs[n].end = -1;
3466 *PL_reglastparen = n;
3468 if (!--ST.accepted) {
3470 PerlIO_printf( Perl_debug_log,
3471 "%*s %sTRIE failed...%s\n",
3472 REPORT_CODE_OFF+depth*2, "",
3479 /* Find next-highest word to process. Note that this code
3480 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3481 register U16 min = 0;
3483 register U16 const nextword = ST.nextword;
3484 register reg_trie_wordinfo * const wordinfo
3485 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3486 for (word=ST.topword; word; word=wordinfo[word].prev) {
3487 if (word > nextword && (!min || word < min))
3500 ST.lastparen = *PL_reglastparen;
3504 /* find start char of end of current word */
3506 U32 chars; /* how many chars to skip */
3507 U8 *uc = ST.firstpos;
3508 reg_trie_data * const trie
3509 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3511 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3513 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3517 /* the hard option - fold each char in turn and find
3518 * its folded length (which may be different */
3519 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3527 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3535 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3540 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3554 PL_reginput = (char *)uc;
3557 scan = (ST.jump && ST.jump[ST.nextword])
3558 ? ST.me + ST.jump[ST.nextword]
3562 PerlIO_printf( Perl_debug_log,
3563 "%*s %sTRIE matched word #%d, continuing%s\n",
3564 REPORT_CODE_OFF+depth*2, "",
3571 if (ST.accepted > 1 || has_cutgroup) {
3572 PUSH_STATE_GOTO(TRIE_next, scan);
3575 /* only one choice left - just continue */
3577 AV *const trie_words
3578 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3579 SV ** const tmp = av_fetch( trie_words,
3581 SV *sv= tmp ? sv_newmortal() : NULL;
3583 PerlIO_printf( Perl_debug_log,
3584 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3585 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3587 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3588 PL_colors[0], PL_colors[1],
3589 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3591 : "not compiled under -Dr",
3595 locinput = PL_reginput;
3596 nextchr = UCHARAT(locinput);
3597 continue; /* execute rest of RE */
3602 char *s = STRING(scan);
3604 if (utf8_target != UTF_PATTERN) {
3605 /* The target and the pattern have differing utf8ness. */
3607 const char * const e = s + ln;
3610 /* The target is utf8, the pattern is not utf8. */
3615 if (NATIVE_TO_UNI(*(U8*)s) !=
3616 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3624 /* The target is not utf8, the pattern is utf8. */
3629 if (NATIVE_TO_UNI(*((U8*)l)) !=
3630 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3638 nextchr = UCHARAT(locinput);
3641 /* The target and the pattern have the same utf8ness. */
3642 /* Inline the first character, for speed. */
3643 if (UCHARAT(s) != nextchr)
3645 if (PL_regeol - locinput < ln)
3647 if (ln > 1 && memNE(s, locinput, ln))
3650 nextchr = UCHARAT(locinput);
3655 const U8 * fold_array;
3657 U32 fold_utf8_flags;
3659 PL_reg_flags |= RF_tainted;
3660 folder = foldEQ_locale;
3661 fold_array = PL_fold_locale;
3662 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3666 case EXACTFU_TRICKYFOLD:
3668 folder = foldEQ_latin1;
3669 fold_array = PL_fold_latin1;
3670 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3674 folder = foldEQ_latin1;
3675 fold_array = PL_fold_latin1;
3676 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3681 fold_array = PL_fold;
3682 fold_utf8_flags = 0;
3688 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3689 /* Either target or the pattern are utf8, or has the issue where
3690 * the fold lengths may differ. */
3691 const char * const l = locinput;
3692 char *e = PL_regeol;
3694 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
3695 l, &e, 0, utf8_target, fold_utf8_flags))
3700 nextchr = UCHARAT(locinput);
3704 /* Neither the target nor the pattern are utf8 */
3705 if (UCHARAT(s) != nextchr &&
3706 UCHARAT(s) != fold_array[nextchr])
3710 if (PL_regeol - locinput < ln)
3712 if (ln > 1 && ! folder(s, locinput, ln))
3715 nextchr = UCHARAT(locinput);
3719 /* XXX Could improve efficiency by separating these all out using a
3720 * macro or in-line function. At that point regcomp.c would no longer
3721 * have to set the FLAGS fields of these */
3724 PL_reg_flags |= RF_tainted;
3732 /* was last char in word? */
3734 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3735 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3737 if (locinput == PL_bostr)
3740 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3742 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3744 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3745 ln = isALNUM_uni(ln);
3746 LOAD_UTF8_CHARCLASS_ALNUM();
3747 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3750 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3751 n = isALNUM_LC_utf8((U8*)locinput);
3756 /* Here the string isn't utf8, or is utf8 and only ascii
3757 * characters are to match \w. In the latter case looking at
3758 * the byte just prior to the current one may be just the final
3759 * byte of a multi-byte character. This is ok. There are two
3761 * 1) it is a single byte character, and then the test is doing
3762 * just what it's supposed to.
3763 * 2) it is a multi-byte character, in which case the final
3764 * byte is never mistakable for ASCII, and so the test
3765 * will say it is not a word character, which is the
3766 * correct answer. */
3767 ln = (locinput != PL_bostr) ?
3768 UCHARAT(locinput - 1) : '\n';
3769 switch (FLAGS(scan)) {
3770 case REGEX_UNICODE_CHARSET:
3771 ln = isWORDCHAR_L1(ln);
3772 n = isWORDCHAR_L1(nextchr);
3774 case REGEX_LOCALE_CHARSET:
3775 ln = isALNUM_LC(ln);
3776 n = isALNUM_LC(nextchr);
3778 case REGEX_DEPENDS_CHARSET:
3780 n = isALNUM(nextchr);
3782 case REGEX_ASCII_RESTRICTED_CHARSET:
3783 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3784 ln = isWORDCHAR_A(ln);
3785 n = isWORDCHAR_A(nextchr);
3788 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3792 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3794 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3799 if (utf8_target || state_num == ANYOFV) {
3800 STRLEN inclasslen = PL_regeol - locinput;
3801 if (locinput >= PL_regeol)
3804 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3806 locinput += inclasslen;
3807 nextchr = UCHARAT(locinput);
3812 nextchr = UCHARAT(locinput);
3813 if (!nextchr && locinput >= PL_regeol)
3815 if (!REGINCLASS(rex, scan, (U8*)locinput))
3817 nextchr = UCHARAT(++locinput);
3821 /* Special char classes - The defines start on line 129 or so */
3822 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3823 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3824 ALNUMU, NALNUMU, isWORDCHAR_L1,
3825 ALNUMA, NALNUMA, isWORDCHAR_A,
3828 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3829 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3830 SPACEU, NSPACEU, isSPACE_L1,
3831 SPACEA, NSPACEA, isSPACE_A,
3834 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3835 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3836 DIGITA, NDIGITA, isDIGIT_A,
3839 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3840 a Unicode extended Grapheme Cluster */
3841 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3842 extended Grapheme Cluster is:
3845 | Prepend* Begin Extend*
3848 Begin is (Hangul-syllable | ! Control)
3849 Extend is (Grapheme_Extend | Spacing_Mark)
3850 Control is [ GCB_Control CR LF ]
3852 The discussion below shows how the code for CLUMP is derived
3853 from this regex. Note that most of these concepts are from
3854 property values of the Grapheme Cluster Boundary (GCB) property.
3855 No code point can have multiple property values for a given
3856 property. Thus a code point in Prepend can't be in Control, but
3857 it must be in !Control. This is why Control above includes
3858 GCB_Control plus CR plus LF. The latter two are used in the GCB
3859 property separately, and so can't be in GCB_Control, even though
3860 they logically are controls. Control is not the same as gc=cc,
3861 but includes format and other characters as well.
3863 The Unicode definition of Hangul-syllable is:
3865 | (L* ( ( V | LV ) V* | LVT ) T*)
3868 Each of these is a value for the GCB property, and hence must be
3869 disjoint, so the order they are tested is immaterial, so the
3870 above can safely be changed to
3873 | (L* ( LVT | ( V | LV ) V*) T*)
3875 The last two terms can be combined like this:
3877 | (( LVT | ( V | LV ) V*) T*))
3879 And refactored into this:
3880 L* (L | LVT T* | V V* T* | LV V* T*)
3882 That means that if we have seen any L's at all we can quit
3883 there, but if the next character is an LVT, a V, or an LV we
3886 There is a subtlety with Prepend* which showed up in testing.
3887 Note that the Begin, and only the Begin is required in:
3888 | Prepend* Begin Extend*
3889 Also, Begin contains '! Control'. A Prepend must be a
3890 '! Control', which means it must also be a Begin. What it
3891 comes down to is that if we match Prepend* and then find no
3892 suitable Begin afterwards, that if we backtrack the last
3893 Prepend, that one will be a suitable Begin.
3896 if (locinput >= PL_regeol)
3898 if (! utf8_target) {
3900 /* Match either CR LF or '.', as all the other possibilities
3902 locinput++; /* Match the . or CR */
3903 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3905 && locinput < PL_regeol
3906 && UCHARAT(locinput) == '\n') locinput++;
3910 /* Utf8: See if is ( CR LF ); already know that locinput <
3911 * PL_regeol, so locinput+1 is in bounds */
3912 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3916 /* In case have to backtrack to beginning, then match '.' */
3917 char *starting = locinput;
3919 /* In case have to backtrack the last prepend */
3920 char *previous_prepend = 0;
3922 LOAD_UTF8_CHARCLASS_GCB();
3924 /* Match (prepend)* */
3925 while (locinput < PL_regeol
3926 && swash_fetch(PL_utf8_X_prepend,
3927 (U8*)locinput, utf8_target))
3929 previous_prepend = locinput;
3930 locinput += UTF8SKIP(locinput);
3933 /* As noted above, if we matched a prepend character, but
3934 * the next thing won't match, back off the last prepend we
3935 * matched, as it is guaranteed to match the begin */
3936 if (previous_prepend
3937 && (locinput >= PL_regeol
3938 || ! swash_fetch(PL_utf8_X_begin,
3939 (U8*)locinput, utf8_target)))
3941 locinput = previous_prepend;
3944 /* Note that here we know PL_regeol > locinput, as we
3945 * tested that upon input to this switch case, and if we
3946 * moved locinput forward, we tested the result just above
3947 * and it either passed, or we backed off so that it will
3949 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3951 /* Here did not match the required 'Begin' in the
3952 * second term. So just match the very first
3953 * character, the '.' of the final term of the regex */
3954 locinput = starting + UTF8SKIP(starting);
3957 /* Here is the beginning of a character that can have
3958 * an extender. It is either a hangul syllable, or a
3960 if (swash_fetch(PL_utf8_X_non_hangul,
3961 (U8*)locinput, utf8_target))
3964 /* Here not a Hangul syllable, must be a
3965 * ('! * Control') */
3966 locinput += UTF8SKIP(locinput);
3969 /* Here is a Hangul syllable. It can be composed
3970 * of several individual characters. One
3971 * possibility is T+ */
3972 if (swash_fetch(PL_utf8_X_T,
3973 (U8*)locinput, utf8_target))
3975 while (locinput < PL_regeol
3976 && swash_fetch(PL_utf8_X_T,
3977 (U8*)locinput, utf8_target))
3979 locinput += UTF8SKIP(locinput);
3983 /* Here, not T+, but is a Hangul. That means
3984 * it is one of the others: L, LV, LVT or V,
3986 * L* (L | LVT T* | V V* T* | LV V* T*) */
3989 while (locinput < PL_regeol
3990 && swash_fetch(PL_utf8_X_L,
3991 (U8*)locinput, utf8_target))
3993 locinput += UTF8SKIP(locinput);
3996 /* Here, have exhausted L*. If the next
3997 * character is not an LV, LVT nor V, it means
3998 * we had to have at least one L, so matches L+
3999 * in the original equation, we have a complete
4000 * hangul syllable. Are done. */
4002 if (locinput < PL_regeol
4003 && swash_fetch(PL_utf8_X_LV_LVT_V,
4004 (U8*)locinput, utf8_target))
4007 /* Otherwise keep going. Must be LV, LVT
4008 * or V. See if LVT */
4009 if (swash_fetch(PL_utf8_X_LVT,
4010 (U8*)locinput, utf8_target))
4012 locinput += UTF8SKIP(locinput);
4015 /* Must be V or LV. Take it, then
4017 locinput += UTF8SKIP(locinput);
4018 while (locinput < PL_regeol
4019 && swash_fetch(PL_utf8_X_V,
4020 (U8*)locinput, utf8_target))
4022 locinput += UTF8SKIP(locinput);
4026 /* And any of LV, LVT, or V can be followed
4028 while (locinput < PL_regeol
4029 && swash_fetch(PL_utf8_X_T,
4033 locinput += UTF8SKIP(locinput);
4039 /* Match any extender */
4040 while (locinput < PL_regeol
4041 && swash_fetch(PL_utf8_X_extend,
4042 (U8*)locinput, utf8_target))
4044 locinput += UTF8SKIP(locinput);
4048 if (locinput > PL_regeol) sayNO;
4050 nextchr = UCHARAT(locinput);
4054 { /* The capture buffer cases. The ones beginning with N for the
4055 named buffers just convert to the equivalent numbered and
4056 pretend they were called as the corresponding numbered buffer
4058 /* don't initialize these in the declaration, it makes C++
4063 const U8 *fold_array;
4066 PL_reg_flags |= RF_tainted;
4067 folder = foldEQ_locale;
4068 fold_array = PL_fold_locale;
4070 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4074 folder = foldEQ_latin1;
4075 fold_array = PL_fold_latin1;
4077 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4081 folder = foldEQ_latin1;
4082 fold_array = PL_fold_latin1;
4084 utf8_fold_flags = 0;
4089 fold_array = PL_fold;
4091 utf8_fold_flags = 0;
4098 utf8_fold_flags = 0;
4101 /* For the named back references, find the corresponding buffer
4103 n = reg_check_named_buff_matched(rex,scan);
4108 goto do_nref_ref_common;
4111 PL_reg_flags |= RF_tainted;
4112 folder = foldEQ_locale;
4113 fold_array = PL_fold_locale;
4114 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4118 folder = foldEQ_latin1;
4119 fold_array = PL_fold_latin1;
4120 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4124 folder = foldEQ_latin1;
4125 fold_array = PL_fold_latin1;
4126 utf8_fold_flags = 0;
4131 fold_array = PL_fold;
4132 utf8_fold_flags = 0;
4138 utf8_fold_flags = 0;
4142 n = ARG(scan); /* which paren pair */
4145 ln = PL_regoffs[n].start;
4146 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4147 if (*PL_reglastparen < n || ln == -1)
4148 sayNO; /* Do not match unless seen CLOSEn. */
4149 if (ln == PL_regoffs[n].end)
4153 if (type != REF /* REF can do byte comparison */
4154 && (utf8_target || type == REFFU))
4155 { /* XXX handle REFFL better */
4156 char * limit = PL_regeol;
4158 /* This call case insensitively compares the entire buffer
4159 * at s, with the current input starting at locinput, but
4160 * not going off the end given by PL_regeol, and returns in
4161 * limit upon success, how much of the current input was
4163 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4164 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4169 nextchr = UCHARAT(locinput);
4173 /* Not utf8: Inline the first character, for speed. */
4174 if (UCHARAT(s) != nextchr &&
4176 UCHARAT(s) != fold_array[nextchr]))
4178 ln = PL_regoffs[n].end - ln;
4179 if (locinput + ln > PL_regeol)
4181 if (ln > 1 && (type == REF
4182 ? memNE(s, locinput, ln)
4183 : ! folder(s, locinput, ln)))
4186 nextchr = UCHARAT(locinput);
4196 #define ST st->u.eval
4201 regexp_internal *rei;
4202 regnode *startpoint;
4205 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4206 if (cur_eval && cur_eval->locinput==locinput) {
4207 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4208 Perl_croak(aTHX_ "Infinite recursion in regex");
4209 if ( ++nochange_depth > max_nochange_depth )
4211 "Pattern subroutine nesting without pos change"
4212 " exceeded limit in regex");
4219 (void)ReREFCNT_inc(rex_sv);
4220 if (OP(scan)==GOSUB) {
4221 startpoint = scan + ARG2L(scan);
4222 ST.close_paren = ARG(scan);
4224 startpoint = rei->program+1;
4227 goto eval_recurse_doit;
4229 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4230 if (cur_eval && cur_eval->locinput==locinput) {
4231 if ( ++nochange_depth > max_nochange_depth )
4232 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4237 /* execute the code in the {...} */
4239 SV ** const before = SP;
4240 OP_4tree * const oop = PL_op;
4241 COP * const ocurcop = PL_curcop;
4243 char *saved_regeol = PL_regeol;
4244 struct re_save_state saved_state;
4246 /* To not corrupt the existing regex state while executing the
4247 * eval we would normally put it on the save stack, like with
4248 * save_re_context. However, re-evals have a weird scoping so we
4249 * can't just add ENTER/LEAVE here. With that, things like
4251 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4253 * would break, as they expect the localisation to be unwound
4254 * only when the re-engine backtracks through the bit that
4257 * What we do instead is just saving the state in a local c
4260 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4263 PL_op = (OP_4tree*)rexi->data->data[n];
4264 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4265 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4266 /* wrap the call in two SAVECOMPPADs. This ensures that
4267 * when the save stack is eventually unwound, all the
4268 * accumulated SAVEt_CLEARSV's will be processed with
4269 * interspersed SAVEt_COMPPAD's to ensure that lexicals
4270 * are cleared in the right pad */
4272 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4273 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4276 SV *sv_mrk = get_sv("REGMARK", 1);
4277 sv_setsv(sv_mrk, sv_yes_mark);
4280 CALLRUNOPS(aTHX); /* Scalar context. */
4283 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4289 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4293 PAD_RESTORE_LOCAL(old_comppad);
4294 PL_curcop = ocurcop;
4295 PL_regeol = saved_regeol;
4298 sv_setsv(save_scalar(PL_replgv), ret);
4302 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4305 /* extract RE object from returned value; compiling if
4311 SV *const sv = SvRV(ret);
4313 if (SvTYPE(sv) == SVt_REGEXP) {
4315 } else if (SvSMAGICAL(sv)) {
4316 mg = mg_find(sv, PERL_MAGIC_qr);
4319 } else if (SvTYPE(ret) == SVt_REGEXP) {
4321 } else if (SvSMAGICAL(ret)) {
4322 if (SvGMAGICAL(ret)) {
4323 /* I don't believe that there is ever qr magic
4325 assert(!mg_find(ret, PERL_MAGIC_qr));
4326 sv_unmagic(ret, PERL_MAGIC_qr);
4329 mg = mg_find(ret, PERL_MAGIC_qr);
4330 /* testing suggests mg only ends up non-NULL for
4331 scalars who were upgraded and compiled in the
4332 else block below. In turn, this is only
4333 triggered in the "postponed utf8 string" tests
4339 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4343 rx = reg_temp_copy(NULL, rx);
4347 const I32 osize = PL_regsize;
4350 assert (SvUTF8(ret));
4351 } else if (SvUTF8(ret)) {
4352 /* Not doing UTF-8, despite what the SV says. Is
4353 this only if we're trapped in use 'bytes'? */
4354 /* Make a copy of the octet sequence, but without
4355 the flag on, as the compiler now honours the
4356 SvUTF8 flag on ret. */
4358 const char *const p = SvPV(ret, len);
4359 ret = newSVpvn_flags(p, len, SVs_TEMP);
4361 rx = CALLREGCOMP(ret, pm_flags);
4363 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4365 /* This isn't a first class regexp. Instead, it's
4366 caching a regexp onto an existing, Perl visible
4368 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4373 re = (struct regexp *)SvANY(rx);
4375 RXp_MATCH_COPIED_off(re);
4376 re->subbeg = rex->subbeg;
4377 re->sublen = rex->sublen;
4380 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4381 "Matching embedded");
4383 startpoint = rei->program + 1;
4384 ST.close_paren = 0; /* only used for GOSUB */
4385 /* borrowed from regtry */
4386 if (PL_reg_start_tmpl <= re->nparens) {
4387 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4388 if(PL_reg_start_tmp)
4389 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4391 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4394 eval_recurse_doit: /* Share code with GOSUB below this line */
4395 /* run the pattern returned from (??{...}) */
4396 ST.cp = regcppush(0); /* Save *all* the positions. */
4397 REGCP_SET(ST.lastcp);
4399 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4401 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4402 PL_reglastparen = &re->lastparen;
4403 PL_reglastcloseparen = &re->lastcloseparen;
4405 re->lastcloseparen = 0;
4407 PL_reginput = locinput;
4410 /* XXXX This is too dramatic a measure... */
4413 ST.toggle_reg_flags = PL_reg_flags;
4415 PL_reg_flags |= RF_utf8;
4417 PL_reg_flags &= ~RF_utf8;
4418 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4420 ST.prev_rex = rex_sv;
4421 ST.prev_curlyx = cur_curlyx;
4422 SETREX(rex_sv,re_sv);
4427 ST.prev_eval = cur_eval;
4429 /* now continue from first node in postoned RE */
4430 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4433 /* logical is 1, /(?(?{...})X|Y)/ */
4434 sw = cBOOL(SvTRUE(ret));
4439 case EVAL_AB: /* cleanup after a successful (??{A})B */
4440 /* note: this is called twice; first after popping B, then A */
4441 PL_reg_flags ^= ST.toggle_reg_flags;
4442 ReREFCNT_dec(rex_sv);
4443 SETREX(rex_sv,ST.prev_rex);
4444 rex = (struct regexp *)SvANY(rex_sv);
4445 rexi = RXi_GET(rex);
4447 cur_eval = ST.prev_eval;
4448 cur_curlyx = ST.prev_curlyx;
4450 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4451 PL_reglastparen = &rex->lastparen;
4452 PL_reglastcloseparen = &rex->lastcloseparen;
4453 /* also update PL_regoffs */
4454 PL_regoffs = rex->offs;
4456 /* XXXX This is too dramatic a measure... */
4458 if ( nochange_depth )
4463 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4464 /* note: this is called twice; first after popping B, then A */
4465 PL_reg_flags ^= ST.toggle_reg_flags;
4466 ReREFCNT_dec(rex_sv);
4467 SETREX(rex_sv,ST.prev_rex);
4468 rex = (struct regexp *)SvANY(rex_sv);
4469 rexi = RXi_GET(rex);
4470 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4471 PL_reglastparen = &rex->lastparen;
4472 PL_reglastcloseparen = &rex->lastcloseparen;
4474 PL_reginput = locinput;
4475 REGCP_UNWIND(ST.lastcp);
4477 cur_eval = ST.prev_eval;
4478 cur_curlyx = ST.prev_curlyx;
4479 /* XXXX This is too dramatic a measure... */
4481 if ( nochange_depth )
4487 n = ARG(scan); /* which paren pair */
4488 PL_reg_start_tmp[n] = locinput;
4494 n = ARG(scan); /* which paren pair */
4495 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4496 PL_regoffs[n].end = locinput - PL_bostr;
4497 /*if (n > PL_regsize)
4499 if (n > *PL_reglastparen)
4500 *PL_reglastparen = n;
4501 *PL_reglastcloseparen = n;
4502 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4510 cursor && OP(cursor)!=END;
4511 cursor=regnext(cursor))
4513 if ( OP(cursor)==CLOSE ){
4515 if ( n <= lastopen ) {
4517 = PL_reg_start_tmp[n] - PL_bostr;
4518 PL_regoffs[n].end = locinput - PL_bostr;
4519 /*if (n > PL_regsize)
4521 if (n > *PL_reglastparen)
4522 *PL_reglastparen = n;
4523 *PL_reglastcloseparen = n;
4524 if ( n == ARG(scan) || (cur_eval &&
4525 cur_eval->u.eval.close_paren == n))
4534 n = ARG(scan); /* which paren pair */
4535 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4538 /* reg_check_named_buff_matched returns 0 for no match */
4539 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4543 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4549 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4551 next = NEXTOPER(NEXTOPER(scan));
4553 next = scan + ARG(scan);
4554 if (OP(next) == IFTHEN) /* Fake one. */
4555 next = NEXTOPER(NEXTOPER(next));
4559 logical = scan->flags;
4562 /*******************************************************************
4564 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4565 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4566 STAR/PLUS/CURLY/CURLYN are used instead.)
4568 A*B is compiled as <CURLYX><A><WHILEM><B>
4570 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4571 state, which contains the current count, initialised to -1. It also sets
4572 cur_curlyx to point to this state, with any previous value saved in the
4575 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4576 since the pattern may possibly match zero times (i.e. it's a while {} loop
4577 rather than a do {} while loop).
4579 Each entry to WHILEM represents a successful match of A. The count in the
4580 CURLYX block is incremented, another WHILEM state is pushed, and execution
4581 passes to A or B depending on greediness and the current count.
4583 For example, if matching against the string a1a2a3b (where the aN are
4584 substrings that match /A/), then the match progresses as follows: (the
4585 pushed states are interspersed with the bits of strings matched so far):
4588 <CURLYX cnt=0><WHILEM>
4589 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4590 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4591 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4592 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4594 (Contrast this with something like CURLYM, which maintains only a single
4598 a1 <CURLYM cnt=1> a2
4599 a1 a2 <CURLYM cnt=2> a3
4600 a1 a2 a3 <CURLYM cnt=3> b
4603 Each WHILEM state block marks a point to backtrack to upon partial failure
4604 of A or B, and also contains some minor state data related to that
4605 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4606 overall state, such as the count, and pointers to the A and B ops.
4608 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4609 must always point to the *current* CURLYX block, the rules are:
4611 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4612 and set cur_curlyx to point the new block.
4614 When popping the CURLYX block after a successful or unsuccessful match,
4615 restore the previous cur_curlyx.
4617 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4618 to the outer one saved in the CURLYX block.
4620 When popping the WHILEM block after a successful or unsuccessful B match,
4621 restore the previous cur_curlyx.
4623 Here's an example for the pattern (AI* BI)*BO
4624 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4627 curlyx backtrack stack
4628 ------ ---------------
4630 CO <CO prev=NULL> <WO>
4631 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4632 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4633 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4635 At this point the pattern succeeds, and we work back down the stack to
4636 clean up, restoring as we go:
4638 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4639 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4640 CO <CO prev=NULL> <WO>
4643 *******************************************************************/
4645 #define ST st->u.curlyx
4647 case CURLYX: /* start of /A*B/ (for complex A) */
4649 /* No need to save/restore up to this paren */
4650 I32 parenfloor = scan->flags;
4652 assert(next); /* keep Coverity happy */
4653 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4656 /* XXXX Probably it is better to teach regpush to support
4657 parenfloor > PL_regsize... */
4658 if (parenfloor > (I32)*PL_reglastparen)
4659 parenfloor = *PL_reglastparen; /* Pessimization... */
4661 ST.prev_curlyx= cur_curlyx;
4663 ST.cp = PL_savestack_ix;
4665 /* these fields contain the state of the current curly.
4666 * they are accessed by subsequent WHILEMs */
4667 ST.parenfloor = parenfloor;
4672 ST.count = -1; /* this will be updated by WHILEM */
4673 ST.lastloc = NULL; /* this will be updated by WHILEM */
4675 PL_reginput = locinput;
4676 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4680 case CURLYX_end: /* just finished matching all of A*B */
4681 cur_curlyx = ST.prev_curlyx;
4685 case CURLYX_end_fail: /* just failed to match all of A*B */
4687 cur_curlyx = ST.prev_curlyx;
4693 #define ST st->u.whilem
4695 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4697 /* see the discussion above about CURLYX/WHILEM */
4699 int min = ARG1(cur_curlyx->u.curlyx.me);
4700 int max = ARG2(cur_curlyx->u.curlyx.me);
4701 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4703 assert(cur_curlyx); /* keep Coverity happy */
4704 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4705 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4706 ST.cache_offset = 0;
4709 PL_reginput = locinput;
4711 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4712 "%*s whilem: matched %ld out of %d..%d\n",
4713 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4716 /* First just match a string of min A's. */
4719 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4720 cur_curlyx->u.curlyx.lastloc = locinput;
4721 REGCP_SET(ST.lastcp);
4723 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4727 /* If degenerate A matches "", assume A done. */
4729 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4730 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4731 "%*s whilem: empty match detected, trying continuation...\n",
4732 REPORT_CODE_OFF+depth*2, "")
4734 goto do_whilem_B_max;
4737 /* super-linear cache processing */
4741 if (!PL_reg_maxiter) {
4742 /* start the countdown: Postpone detection until we
4743 * know the match is not *that* much linear. */
4744 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4745 /* possible overflow for long strings and many CURLYX's */
4746 if (PL_reg_maxiter < 0)
4747 PL_reg_maxiter = I32_MAX;
4748 PL_reg_leftiter = PL_reg_maxiter;
4751 if (PL_reg_leftiter-- == 0) {
4752 /* initialise cache */
4753 const I32 size = (PL_reg_maxiter + 7)/8;
4754 if (PL_reg_poscache) {
4755 if ((I32)PL_reg_poscache_size < size) {
4756 Renew(PL_reg_poscache, size, char);
4757 PL_reg_poscache_size = size;
4759 Zero(PL_reg_poscache, size, char);
4762 PL_reg_poscache_size = size;
4763 Newxz(PL_reg_poscache, size, char);
4765 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4766 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4767 PL_colors[4], PL_colors[5])
4771 if (PL_reg_leftiter < 0) {
4772 /* have we already failed at this position? */
4774 offset = (scan->flags & 0xf) - 1
4775 + (locinput - PL_bostr) * (scan->flags>>4);
4776 mask = 1 << (offset % 8);
4778 if (PL_reg_poscache[offset] & mask) {
4779 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4780 "%*s whilem: (cache) already tried at this position...\n",
4781 REPORT_CODE_OFF+depth*2, "")
4783 sayNO; /* cache records failure */
4785 ST.cache_offset = offset;
4786 ST.cache_mask = mask;
4790 /* Prefer B over A for minimal matching. */
4792 if (cur_curlyx->u.curlyx.minmod) {
4793 ST.save_curlyx = cur_curlyx;
4794 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4795 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4796 REGCP_SET(ST.lastcp);
4797 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4801 /* Prefer A over B for maximal matching. */
4803 if (n < max) { /* More greed allowed? */
4804 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4805 cur_curlyx->u.curlyx.lastloc = locinput;
4806 REGCP_SET(ST.lastcp);
4807 PUSH_STATE_GOTO(WHILEM_A_max, A);
4810 goto do_whilem_B_max;
4814 case WHILEM_B_min: /* just matched B in a minimal match */
4815 case WHILEM_B_max: /* just matched B in a maximal match */
4816 cur_curlyx = ST.save_curlyx;
4820 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4821 cur_curlyx = ST.save_curlyx;
4822 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4823 cur_curlyx->u.curlyx.count--;
4827 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4829 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4830 REGCP_UNWIND(ST.lastcp);
4832 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4833 cur_curlyx->u.curlyx.count--;
4837 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4838 REGCP_UNWIND(ST.lastcp);
4839 regcppop(rex); /* Restore some previous $<digit>s? */
4840 PL_reginput = locinput;
4841 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4842 "%*s whilem: failed, trying continuation...\n",
4843 REPORT_CODE_OFF+depth*2, "")
4846 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4847 && ckWARN(WARN_REGEXP)
4848 && !(PL_reg_flags & RF_warned))
4850 PL_reg_flags |= RF_warned;
4851 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4852 "Complex regular subexpression recursion limit (%d) "
4858 ST.save_curlyx = cur_curlyx;
4859 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4860 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4863 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4864 cur_curlyx = ST.save_curlyx;
4865 REGCP_UNWIND(ST.lastcp);
4868 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4869 /* Maximum greed exceeded */
4870 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4871 && ckWARN(WARN_REGEXP)
4872 && !(PL_reg_flags & RF_warned))
4874 PL_reg_flags |= RF_warned;
4875 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4876 "Complex regular subexpression recursion "
4877 "limit (%d) exceeded",
4880 cur_curlyx->u.curlyx.count--;
4884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4885 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4887 /* Try grabbing another A and see if it helps. */
4888 PL_reginput = locinput;
4889 cur_curlyx->u.curlyx.lastloc = locinput;
4890 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4891 REGCP_SET(ST.lastcp);
4892 PUSH_STATE_GOTO(WHILEM_A_min,
4893 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4897 #define ST st->u.branch
4899 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4900 next = scan + ARG(scan);
4903 scan = NEXTOPER(scan);
4906 case BRANCH: /* /(...|A|...)/ */
4907 scan = NEXTOPER(scan); /* scan now points to inner node */
4908 ST.lastparen = *PL_reglastparen;
4909 ST.next_branch = next;
4911 PL_reginput = locinput;
4913 /* Now go into the branch */
4915 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4917 PUSH_STATE_GOTO(BRANCH_next, scan);
4921 PL_reginput = locinput;
4922 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4923 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4924 PUSH_STATE_GOTO(CUTGROUP_next,next);
4926 case CUTGROUP_next_fail:
4929 if (st->u.mark.mark_name)
4930 sv_commit = st->u.mark.mark_name;
4936 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4941 REGCP_UNWIND(ST.cp);
4942 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4943 PL_regoffs[n].end = -1;
4944 *PL_reglastparen = n;
4945 /*dmq: *PL_reglastcloseparen = n; */
4946 scan = ST.next_branch;
4947 /* no more branches? */
4948 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4950 PerlIO_printf( Perl_debug_log,
4951 "%*s %sBRANCH failed...%s\n",
4952 REPORT_CODE_OFF+depth*2, "",
4958 continue; /* execute next BRANCH[J] op */
4966 #define ST st->u.curlym
4968 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4970 /* This is an optimisation of CURLYX that enables us to push
4971 * only a single backtracking state, no matter how many matches
4972 * there are in {m,n}. It relies on the pattern being constant
4973 * length, with no parens to influence future backrefs
4977 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4979 /* if paren positive, emulate an OPEN/CLOSE around A */
4981 U32 paren = ST.me->flags;
4982 if (paren > PL_regsize)
4984 if (paren > *PL_reglastparen)
4985 *PL_reglastparen = paren;
4986 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4994 ST.c1 = CHRTEST_UNINIT;
4997 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5000 curlym_do_A: /* execute the A in /A{m,n}B/ */
5001 PL_reginput = locinput;
5002 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5005 case CURLYM_A: /* we've just matched an A */
5006 locinput = st->locinput;
5007 nextchr = UCHARAT(locinput);
5010 /* after first match, determine A's length: u.curlym.alen */
5011 if (ST.count == 1) {
5012 if (PL_reg_match_utf8) {
5014 while (s < PL_reginput) {
5020 ST.alen = PL_reginput - locinput;
5023 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5026 PerlIO_printf(Perl_debug_log,
5027 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5028 (int)(REPORT_CODE_OFF+(depth*2)), "",
5029 (IV) ST.count, (IV)ST.alen)
5032 locinput = PL_reginput;
5034 if (cur_eval && cur_eval->u.eval.close_paren &&
5035 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5039 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5040 if ( max == REG_INFTY || ST.count < max )
5041 goto curlym_do_A; /* try to match another A */
5043 goto curlym_do_B; /* try to match B */
5045 case CURLYM_A_fail: /* just failed to match an A */
5046 REGCP_UNWIND(ST.cp);
5048 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5049 || (cur_eval && cur_eval->u.eval.close_paren &&
5050 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5053 curlym_do_B: /* execute the B in /A{m,n}B/ */
5054 PL_reginput = locinput;
5055 if (ST.c1 == CHRTEST_UNINIT) {
5056 /* calculate c1 and c2 for possible match of 1st char
5057 * following curly */
5058 ST.c1 = ST.c2 = CHRTEST_VOID;
5059 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5060 regnode *text_node = ST.B;
5061 if (! HAS_TEXT(text_node))
5062 FIND_NEXT_IMPT(text_node);
5065 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5067 But the former is redundant in light of the latter.
5069 if this changes back then the macro for
5070 IS_TEXT and friends need to change.
5072 if (PL_regkind[OP(text_node)] == EXACT)
5075 ST.c1 = (U8)*STRING(text_node);
5076 switch (OP(text_node)) {
5077 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5080 case EXACTFU_TRICKYFOLD:
5081 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5082 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5083 default: ST.c2 = ST.c1;
5090 PerlIO_printf(Perl_debug_log,
5091 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5092 (int)(REPORT_CODE_OFF+(depth*2)),
5095 if (ST.c1 != CHRTEST_VOID
5096 && UCHARAT(PL_reginput) != ST.c1
5097 && UCHARAT(PL_reginput) != ST.c2)
5099 /* simulate B failing */
5101 PerlIO_printf(Perl_debug_log,
5102 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5103 (int)(REPORT_CODE_OFF+(depth*2)),"",
5106 state_num = CURLYM_B_fail;
5107 goto reenter_switch;
5111 /* mark current A as captured */
5112 I32 paren = ST.me->flags;
5114 PL_regoffs[paren].start
5115 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5116 PL_regoffs[paren].end = PL_reginput - PL_bostr;
5117 /*dmq: *PL_reglastcloseparen = paren; */
5120 PL_regoffs[paren].end = -1;
5121 if (cur_eval && cur_eval->u.eval.close_paren &&
5122 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5131 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5134 case CURLYM_B_fail: /* just failed to match a B */
5135 REGCP_UNWIND(ST.cp);
5137 I32 max = ARG2(ST.me);
5138 if (max != REG_INFTY && ST.count == max)
5140 goto curlym_do_A; /* try to match a further A */
5142 /* backtrack one A */
5143 if (ST.count == ARG1(ST.me) /* min */)
5146 locinput = HOPc(locinput, -ST.alen);
5147 goto curlym_do_B; /* try to match B */
5150 #define ST st->u.curly
5152 #define CURLY_SETPAREN(paren, success) \
5155 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5156 PL_regoffs[paren].end = locinput - PL_bostr; \
5157 *PL_reglastcloseparen = paren; \
5160 PL_regoffs[paren].end = -1; \
5163 case STAR: /* /A*B/ where A is width 1 */
5167 scan = NEXTOPER(scan);
5169 case PLUS: /* /A+B/ where A is width 1 */
5173 scan = NEXTOPER(scan);
5175 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5176 ST.paren = scan->flags; /* Which paren to set */
5177 if (ST.paren > PL_regsize)
5178 PL_regsize = ST.paren;
5179 if (ST.paren > *PL_reglastparen)
5180 *PL_reglastparen = ST.paren;
5181 ST.min = ARG1(scan); /* min to match */
5182 ST.max = ARG2(scan); /* max to match */
5183 if (cur_eval && cur_eval->u.eval.close_paren &&
5184 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5188 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5190 case CURLY: /* /A{m,n}B/ where A is width 1 */
5192 ST.min = ARG1(scan); /* min to match */
5193 ST.max = ARG2(scan); /* max to match */
5194 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5197 * Lookahead to avoid useless match attempts
5198 * when we know what character comes next.
5200 * Used to only do .*x and .*?x, but now it allows
5201 * for )'s, ('s and (?{ ... })'s to be in the way
5202 * of the quantifier and the EXACT-like node. -- japhy
5205 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5207 if (HAS_TEXT(next) || JUMPABLE(next)) {
5209 regnode *text_node = next;
5211 if (! HAS_TEXT(text_node))
5212 FIND_NEXT_IMPT(text_node);
5214 if (! HAS_TEXT(text_node))
5215 ST.c1 = ST.c2 = CHRTEST_VOID;
5217 if ( PL_regkind[OP(text_node)] != EXACT ) {
5218 ST.c1 = ST.c2 = CHRTEST_VOID;
5219 goto assume_ok_easy;
5222 s = (U8*)STRING(text_node);
5224 /* Currently we only get here when
5226 PL_rekind[OP(text_node)] == EXACT
5228 if this changes back then the macro for IS_TEXT and
5229 friends need to change. */
5232 switch (OP(text_node)) {
5233 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5236 case EXACTFU_TRICKYFOLD:
5237 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5238 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5239 default: ST.c2 = ST.c1; break;
5242 else { /* UTF_PATTERN */
5243 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5245 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5247 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5248 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5252 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5259 ST.c1 = ST.c2 = CHRTEST_VOID;
5264 PL_reginput = locinput;
5267 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5270 locinput = PL_reginput;
5272 if (ST.c1 == CHRTEST_VOID)
5273 goto curly_try_B_min;
5275 ST.oldloc = locinput;
5277 /* set ST.maxpos to the furthest point along the
5278 * string that could possibly match */
5279 if (ST.max == REG_INFTY) {
5280 ST.maxpos = PL_regeol - 1;
5282 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5285 else if (utf8_target) {
5286 int m = ST.max - ST.min;
5287 for (ST.maxpos = locinput;
5288 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5289 ST.maxpos += UTF8SKIP(ST.maxpos);
5292 ST.maxpos = locinput + ST.max - ST.min;
5293 if (ST.maxpos >= PL_regeol)
5294 ST.maxpos = PL_regeol - 1;
5296 goto curly_try_B_min_known;
5300 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5301 locinput = PL_reginput;
5302 if (ST.count < ST.min)
5304 if ((ST.count > ST.min)
5305 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5307 /* A{m,n} must come at the end of the string, there's
5308 * no point in backing off ... */
5310 /* ...except that $ and \Z can match before *and* after
5311 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5312 We may back off by one in this case. */
5313 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5317 goto curly_try_B_max;
5322 case CURLY_B_min_known_fail:
5323 /* failed to find B in a non-greedy match where c1,c2 valid */
5324 if (ST.paren && ST.count)
5325 PL_regoffs[ST.paren].end = -1;
5327 PL_reginput = locinput; /* Could be reset... */
5328 REGCP_UNWIND(ST.cp);
5329 /* Couldn't or didn't -- move forward. */
5330 ST.oldloc = locinput;
5332 locinput += UTF8SKIP(locinput);
5336 curly_try_B_min_known:
5337 /* find the next place where 'B' could work, then call B */
5341 n = (ST.oldloc == locinput) ? 0 : 1;
5342 if (ST.c1 == ST.c2) {
5344 /* set n to utf8_distance(oldloc, locinput) */
5345 while (locinput <= ST.maxpos &&
5346 utf8n_to_uvchr((U8*)locinput,
5347 UTF8_MAXBYTES, &len,
5348 uniflags) != (UV)ST.c1) {
5354 /* set n to utf8_distance(oldloc, locinput) */
5355 while (locinput <= ST.maxpos) {
5357 const UV c = utf8n_to_uvchr((U8*)locinput,
5358 UTF8_MAXBYTES, &len,
5360 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5368 if (ST.c1 == ST.c2) {
5369 while (locinput <= ST.maxpos &&
5370 UCHARAT(locinput) != ST.c1)
5374 while (locinput <= ST.maxpos
5375 && UCHARAT(locinput) != ST.c1
5376 && UCHARAT(locinput) != ST.c2)
5379 n = locinput - ST.oldloc;
5381 if (locinput > ST.maxpos)
5383 /* PL_reginput == oldloc now */
5386 if (regrepeat(rex, ST.A, n, depth) < n)
5389 PL_reginput = locinput;
5390 CURLY_SETPAREN(ST.paren, ST.count);
5391 if (cur_eval && cur_eval->u.eval.close_paren &&
5392 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5395 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5400 case CURLY_B_min_fail:
5401 /* failed to find B in a non-greedy match where c1,c2 invalid */
5402 if (ST.paren && ST.count)
5403 PL_regoffs[ST.paren].end = -1;
5405 REGCP_UNWIND(ST.cp);
5406 /* failed -- move forward one */
5407 PL_reginput = locinput;
5408 if (regrepeat(rex, ST.A, 1, depth)) {
5410 locinput = PL_reginput;
5411 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5412 ST.count > 0)) /* count overflow ? */
5415 CURLY_SETPAREN(ST.paren, ST.count);
5416 if (cur_eval && cur_eval->u.eval.close_paren &&
5417 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5420 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5428 /* a successful greedy match: now try to match B */
5429 if (cur_eval && cur_eval->u.eval.close_paren &&
5430 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5435 if (ST.c1 != CHRTEST_VOID)
5436 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5437 UTF8_MAXBYTES, 0, uniflags)
5438 : (UV) UCHARAT(PL_reginput);
5439 /* If it could work, try it. */
5440 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5441 CURLY_SETPAREN(ST.paren, ST.count);
5442 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5447 case CURLY_B_max_fail:
5448 /* failed to find B in a greedy match */
5449 if (ST.paren && ST.count)
5450 PL_regoffs[ST.paren].end = -1;
5452 REGCP_UNWIND(ST.cp);
5454 if (--ST.count < ST.min)
5456 PL_reginput = locinput = HOPc(locinput, -1);
5457 goto curly_try_B_max;
5464 /* we've just finished A in /(??{A})B/; now continue with B */
5466 st->u.eval.toggle_reg_flags
5467 = cur_eval->u.eval.toggle_reg_flags;
5468 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5470 st->u.eval.prev_rex = rex_sv; /* inner */
5471 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5472 rex = (struct regexp *)SvANY(rex_sv);
5473 rexi = RXi_GET(rex);
5474 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5475 (void)ReREFCNT_inc(rex_sv);
5476 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5478 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5479 PL_reglastparen = &rex->lastparen;
5480 PL_reglastcloseparen = &rex->lastcloseparen;
5482 REGCP_SET(st->u.eval.lastcp);
5483 PL_reginput = locinput;
5485 /* Restore parens of the outer rex without popping the
5487 tmpix = PL_savestack_ix;
5488 PL_savestack_ix = cur_eval->u.eval.lastcp;
5490 PL_savestack_ix = tmpix;
5492 st->u.eval.prev_eval = cur_eval;
5493 cur_eval = cur_eval->u.eval.prev_eval;
5495 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5496 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5497 if ( nochange_depth )
5500 PUSH_YES_STATE_GOTO(EVAL_AB,
5501 st->u.eval.prev_eval->u.eval.B); /* match B */
5504 if (locinput < reginfo->till) {
5505 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5506 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5508 (long)(locinput - PL_reg_starttry),
5509 (long)(reginfo->till - PL_reg_starttry),
5512 sayNO_SILENT; /* Cannot match: too short. */
5514 PL_reginput = locinput; /* put where regtry can find it */
5515 sayYES; /* Success! */
5517 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5519 PerlIO_printf(Perl_debug_log,
5520 "%*s %ssubpattern success...%s\n",
5521 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5522 PL_reginput = locinput; /* put where regtry can find it */
5523 sayYES; /* Success! */
5526 #define ST st->u.ifmatch
5528 case SUSPEND: /* (?>A) */
5530 PL_reginput = locinput;
5533 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5535 goto ifmatch_trivial_fail_test;
5537 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5539 ifmatch_trivial_fail_test:
5541 char * const s = HOPBACKc(locinput, scan->flags);
5546 sw = 1 - cBOOL(ST.wanted);
5550 next = scan + ARG(scan);
5558 PL_reginput = locinput;
5562 ST.logical = logical;
5563 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5565 /* execute body of (?...A) */
5566 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5569 case IFMATCH_A_fail: /* body of (?...A) failed */
5570 ST.wanted = !ST.wanted;
5573 case IFMATCH_A: /* body of (?...A) succeeded */
5575 sw = cBOOL(ST.wanted);
5577 else if (!ST.wanted)
5580 if (OP(ST.me) == SUSPEND)
5581 locinput = PL_reginput;
5583 locinput = PL_reginput = st->locinput;
5584 nextchr = UCHARAT(locinput);
5586 scan = ST.me + ARG(ST.me);
5589 continue; /* execute B */
5594 next = scan + ARG(scan);
5599 reginfo->cutpoint = PL_regeol;
5602 PL_reginput = locinput;
5604 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5605 PUSH_STATE_GOTO(COMMIT_next,next);
5607 case COMMIT_next_fail:
5614 #define ST st->u.mark
5616 ST.prev_mark = mark_state;
5617 ST.mark_name = sv_commit = sv_yes_mark
5618 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5620 ST.mark_loc = PL_reginput = locinput;
5621 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5623 case MARKPOINT_next:
5624 mark_state = ST.prev_mark;
5627 case MARKPOINT_next_fail:
5628 if (popmark && sv_eq(ST.mark_name,popmark))
5630 if (ST.mark_loc > startpoint)
5631 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5632 popmark = NULL; /* we found our mark */
5633 sv_commit = ST.mark_name;
5636 PerlIO_printf(Perl_debug_log,
5637 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5638 REPORT_CODE_OFF+depth*2, "",
5639 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5642 mark_state = ST.prev_mark;
5643 sv_yes_mark = mark_state ?
5644 mark_state->u.mark.mark_name : NULL;
5648 PL_reginput = locinput;
5650 /* (*SKIP) : if we fail we cut here*/
5651 ST.mark_name = NULL;
5652 ST.mark_loc = locinput;
5653 PUSH_STATE_GOTO(SKIP_next,next);
5655 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5656 otherwise do nothing. Meaning we need to scan
5658 regmatch_state *cur = mark_state;
5659 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5662 if ( sv_eq( cur->u.mark.mark_name,
5665 ST.mark_name = find;
5666 PUSH_STATE_GOTO( SKIP_next, next );
5668 cur = cur->u.mark.prev_mark;
5671 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5673 case SKIP_next_fail:
5675 /* (*CUT:NAME) - Set up to search for the name as we
5676 collapse the stack*/
5677 popmark = ST.mark_name;
5679 /* (*CUT) - No name, we cut here.*/
5680 if (ST.mark_loc > startpoint)
5681 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5682 /* but we set sv_commit to latest mark_name if there
5683 is one so they can test to see how things lead to this
5686 sv_commit=mark_state->u.mark.mark_name;
5693 if ((n=is_LNBREAK(locinput,utf8_target))) {
5695 nextchr = UCHARAT(locinput);
5700 #define CASE_CLASS(nAmE) \
5702 if (locinput >= PL_regeol) \
5704 if ((n=is_##nAmE(locinput,utf8_target))) { \
5706 nextchr = UCHARAT(locinput); \
5711 if (locinput >= PL_regeol) \
5713 if ((n=is_##nAmE(locinput,utf8_target))) { \
5716 locinput += UTF8SKIP(locinput); \
5717 nextchr = UCHARAT(locinput); \
5722 CASE_CLASS(HORIZWS);
5726 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5727 PTR2UV(scan), OP(scan));
5728 Perl_croak(aTHX_ "regexp memory corruption");
5732 /* switch break jumps here */
5733 scan = next; /* prepare to execute the next op and ... */
5734 continue; /* ... jump back to the top, reusing st */
5738 /* push a state that backtracks on success */
5739 st->u.yes.prev_yes_state = yes_state;
5743 /* push a new regex state, then continue at scan */
5745 regmatch_state *newst;
5748 regmatch_state *cur = st;
5749 regmatch_state *curyes = yes_state;
5751 regmatch_slab *slab = PL_regmatch_slab;
5752 for (;curd > -1;cur--,curd--) {
5753 if (cur < SLAB_FIRST(slab)) {
5755 cur = SLAB_LAST(slab);
5757 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5758 REPORT_CODE_OFF + 2 + depth * 2,"",
5759 curd, PL_reg_name[cur->resume_state],
5760 (curyes == cur) ? "yes" : ""
5763 curyes = cur->u.yes.prev_yes_state;
5766 DEBUG_STATE_pp("push")
5769 st->locinput = locinput;
5771 if (newst > SLAB_LAST(PL_regmatch_slab))
5772 newst = S_push_slab(aTHX);
5773 PL_regmatch_state = newst;
5775 locinput = PL_reginput;
5776 nextchr = UCHARAT(locinput);
5784 * We get here only if there's trouble -- normally "case END" is
5785 * the terminating point.
5787 Perl_croak(aTHX_ "corrupted regexp pointers");
5793 /* we have successfully completed a subexpression, but we must now
5794 * pop to the state marked by yes_state and continue from there */
5795 assert(st != yes_state);
5797 while (st != yes_state) {
5799 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5800 PL_regmatch_slab = PL_regmatch_slab->prev;
5801 st = SLAB_LAST(PL_regmatch_slab);
5805 DEBUG_STATE_pp("pop (no final)");
5807 DEBUG_STATE_pp("pop (yes)");
5813 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5814 || yes_state > SLAB_LAST(PL_regmatch_slab))
5816 /* not in this slab, pop slab */
5817 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5818 PL_regmatch_slab = PL_regmatch_slab->prev;
5819 st = SLAB_LAST(PL_regmatch_slab);
5821 depth -= (st - yes_state);
5824 yes_state = st->u.yes.prev_yes_state;
5825 PL_regmatch_state = st;
5828 locinput= st->locinput;
5829 nextchr = UCHARAT(locinput);
5831 state_num = st->resume_state + no_final;
5832 goto reenter_switch;
5835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5836 PL_colors[4], PL_colors[5]));
5838 if (PL_reg_eval_set) {
5839 /* each successfully executed (?{...}) block does the equivalent of
5840 * local $^R = do {...}
5841 * When popping the save stack, all these locals would be undone;
5842 * bypass this by setting the outermost saved $^R to the latest
5844 if (oreplsv != GvSV(PL_replgv))
5845 sv_setsv(oreplsv, GvSV(PL_replgv));
5852 PerlIO_printf(Perl_debug_log,
5853 "%*s %sfailed...%s\n",
5854 REPORT_CODE_OFF+depth*2, "",
5855 PL_colors[4], PL_colors[5])
5867 /* there's a previous state to backtrack to */
5869 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5870 PL_regmatch_slab = PL_regmatch_slab->prev;
5871 st = SLAB_LAST(PL_regmatch_slab);
5873 PL_regmatch_state = st;
5874 locinput= st->locinput;
5875 nextchr = UCHARAT(locinput);
5877 DEBUG_STATE_pp("pop");
5879 if (yes_state == st)
5880 yes_state = st->u.yes.prev_yes_state;
5882 state_num = st->resume_state + 1; /* failure = success + 1 */
5883 goto reenter_switch;
5888 if (rex->intflags & PREGf_VERBARG_SEEN) {
5889 SV *sv_err = get_sv("REGERROR", 1);
5890 SV *sv_mrk = get_sv("REGMARK", 1);
5892 sv_commit = &PL_sv_no;
5894 sv_yes_mark = &PL_sv_yes;
5897 sv_commit = &PL_sv_yes;
5898 sv_yes_mark = &PL_sv_no;
5900 sv_setsv(sv_err, sv_commit);
5901 sv_setsv(sv_mrk, sv_yes_mark);
5904 /* clean up; in particular, free all slabs above current one */
5905 LEAVE_SCOPE(oldsave);
5911 - regrepeat - repeatedly match something simple, report how many
5914 * [This routine now assumes that it will only match on things of length 1.
5915 * That was true before, but now we assume scan - reginput is the count,
5916 * rather than incrementing count on every character. [Er, except utf8.]]
5919 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5922 register char *scan;
5924 register char *loceol = PL_regeol;
5925 register I32 hardcount = 0;
5926 register bool utf8_target = PL_reg_match_utf8;
5929 PERL_UNUSED_ARG(depth);
5932 PERL_ARGS_ASSERT_REGREPEAT;
5935 if (max == REG_INFTY)
5937 else if (max < loceol - scan)
5938 loceol = scan + max;
5943 while (scan < loceol && hardcount < max && *scan != '\n') {
5944 scan += UTF8SKIP(scan);
5948 while (scan < loceol && *scan != '\n')
5955 while (scan < loceol && hardcount < max) {
5956 scan += UTF8SKIP(scan);
5967 /* To get here, EXACTish nodes must have *byte* length == 1. That
5968 * means they match only characters in the string that can be expressed
5969 * as a single byte. For non-utf8 strings, that means a simple match.
5970 * For utf8 strings, the character matched must be an invariant, or
5971 * downgradable to a single byte. The pattern's utf8ness is
5972 * irrelevant, as since it's a single byte, it either isn't utf8, or if
5973 * it is, it's an invariant */
5976 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5978 if (! utf8_target || UNI_IS_INVARIANT(c)) {
5979 while (scan < loceol && UCHARAT(scan) == c) {
5985 /* Here, the string is utf8, and the pattern char is different
5986 * in utf8 than not, so can't compare them directly. Outside the
5987 * loop, find the two utf8 bytes that represent c, and then
5988 * look for those in sequence in the utf8 string */
5989 U8 high = UTF8_TWO_BYTE_HI(c);
5990 U8 low = UTF8_TWO_BYTE_LO(c);
5993 while (hardcount < max
5994 && scan + 1 < loceol
5995 && UCHARAT(scan) == high
5996 && UCHARAT(scan + 1) == low)
6004 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6008 PL_reg_flags |= RF_tainted;
6009 utf8_flags = FOLDEQ_UTF8_LOCALE;
6017 case EXACTFU_TRICKYFOLD:
6019 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6021 /* The comments for the EXACT case above apply as well to these fold
6026 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6028 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6029 char *tmpeol = loceol;
6030 while (hardcount < max
6031 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6032 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6039 /* XXX Note that the above handles properly the German sharp s in
6040 * the pattern matching ss in the string. But it doesn't handle
6041 * properly cases where the string contains say 'LIGATURE ff' and
6042 * the pattern is 'f+'. This would require, say, a new function or
6043 * revised interface to foldEQ_utf8(), in which the maximum number
6044 * of characters to match could be passed and it would return how
6045 * many actually did. This is just one of many cases where
6046 * multi-char folds don't work properly, and so the fix is being
6052 /* Here, the string isn't utf8 and c is a single byte; and either
6053 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6054 * doesn't affect c. Can just do simple comparisons for exact or
6057 case EXACTF: folded = PL_fold[c]; break;
6059 case EXACTFU_TRICKYFOLD:
6060 case EXACTFU: folded = PL_fold_latin1[c]; break;
6061 case EXACTFL: folded = PL_fold_locale[c]; break;
6062 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6064 while (scan < loceol &&
6065 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6073 if (utf8_target || OP(p) == ANYOFV) {
6076 inclasslen = loceol - scan;
6077 while (hardcount < max
6078 && ((inclasslen = loceol - scan) > 0)
6079 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6085 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6093 LOAD_UTF8_CHARCLASS_ALNUM();
6094 while (hardcount < max && scan < loceol &&
6095 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6097 scan += UTF8SKIP(scan);
6101 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6109 while (scan < loceol && isALNUM((U8) *scan)) {
6114 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6119 PL_reg_flags |= RF_tainted;
6122 while (hardcount < max && scan < loceol &&
6123 isALNUM_LC_utf8((U8*)scan)) {
6124 scan += UTF8SKIP(scan);
6128 while (scan < loceol && isALNUM_LC(*scan))
6138 LOAD_UTF8_CHARCLASS_ALNUM();
6139 while (hardcount < max && scan < loceol &&
6140 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6142 scan += UTF8SKIP(scan);
6146 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6153 goto utf8_Nwordchar;
6154 while (scan < loceol && ! isALNUM((U8) *scan)) {
6160 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6161 scan += UTF8SKIP(scan);
6165 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6171 PL_reg_flags |= RF_tainted;
6174 while (hardcount < max && scan < loceol &&
6175 !isALNUM_LC_utf8((U8*)scan)) {
6176 scan += UTF8SKIP(scan);
6180 while (scan < loceol && !isALNUM_LC(*scan))
6190 LOAD_UTF8_CHARCLASS_SPACE();
6191 while (hardcount < max && scan < loceol &&
6193 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6195 scan += UTF8SKIP(scan);
6201 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6210 while (scan < loceol && isSPACE((U8) *scan)) {
6215 while (scan < loceol && isSPACE_A((U8) *scan)) {
6220 PL_reg_flags |= RF_tainted;
6223 while (hardcount < max && scan < loceol &&
6224 isSPACE_LC_utf8((U8*)scan)) {
6225 scan += UTF8SKIP(scan);
6229 while (scan < loceol && isSPACE_LC(*scan))
6239 LOAD_UTF8_CHARCLASS_SPACE();
6240 while (hardcount < max && scan < loceol &&
6242 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6244 scan += UTF8SKIP(scan);
6250 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6259 while (scan < loceol && ! isSPACE((U8) *scan)) {
6265 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6266 scan += UTF8SKIP(scan);
6270 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6276 PL_reg_flags |= RF_tainted;
6279 while (hardcount < max && scan < loceol &&
6280 !isSPACE_LC_utf8((U8*)scan)) {
6281 scan += UTF8SKIP(scan);
6285 while (scan < loceol && !isSPACE_LC(*scan))
6292 LOAD_UTF8_CHARCLASS_DIGIT();
6293 while (hardcount < max && scan < loceol &&
6294 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6295 scan += UTF8SKIP(scan);
6299 while (scan < loceol && isDIGIT(*scan))
6304 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6309 PL_reg_flags |= RF_tainted;
6312 while (hardcount < max && scan < loceol &&
6313 isDIGIT_LC_utf8((U8*)scan)) {
6314 scan += UTF8SKIP(scan);
6318 while (scan < loceol && isDIGIT_LC(*scan))
6325 LOAD_UTF8_CHARCLASS_DIGIT();
6326 while (hardcount < max && scan < loceol &&
6327 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6328 scan += UTF8SKIP(scan);
6332 while (scan < loceol && !isDIGIT(*scan))
6338 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6339 scan += UTF8SKIP(scan);
6343 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6349 PL_reg_flags |= RF_tainted;
6352 while (hardcount < max && scan < loceol &&
6353 !isDIGIT_LC_utf8((U8*)scan)) {
6354 scan += UTF8SKIP(scan);
6358 while (scan < loceol && !isDIGIT_LC(*scan))
6365 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6371 LNBREAK can match two latin chars, which is ok,
6372 because we have a null terminated string, but we
6373 have to use hardcount in this situation
6375 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6384 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6389 while (scan < loceol && is_HORIZWS_latin1(scan))
6396 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6397 scan += UTF8SKIP(scan);
6401 while (scan < loceol && !is_HORIZWS_latin1(scan))
6409 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6414 while (scan < loceol && is_VERTWS_latin1(scan))
6422 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6423 scan += UTF8SKIP(scan);
6427 while (scan < loceol && !is_VERTWS_latin1(scan))
6433 default: /* Called on something of 0 width. */
6434 break; /* So match right here or not at all. */
6440 c = scan - PL_reginput;
6444 GET_RE_DEBUG_FLAGS_DECL;
6446 SV * const prop = sv_newmortal();
6447 regprop(prog, prop, p);
6448 PerlIO_printf(Perl_debug_log,
6449 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
6450 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6458 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6460 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6461 create a copy so that changes the caller makes won't change the shared one
6464 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6466 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6467 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6472 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6474 /* Returns the swash for the input 'node' in the regex 'prog'.
6475 * If <doinit> is true, will attempt to create the swash if not already
6477 * If <listsvp> is non-null, will return the swash initialization string in
6479 * If <altsvp> is non-null, will return the alternates to the regular swash
6481 * Tied intimately to how regcomp.c sets up the data structure */
6489 RXi_GET_DECL(prog,progi);
6490 const struct reg_data * const data = prog ? progi->data : NULL;
6492 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6494 assert(ANYOF_NONBITMAP(node));
6496 if (data && data->count) {
6497 const U32 n = ARG(node);
6499 if (data->what[n] == 's') {
6500 SV * const rv = MUTABLE_SV(data->data[n]);
6501 AV * const av = MUTABLE_AV(SvRV(rv));
6502 SV **const ary = AvARRAY(av);
6503 bool invlist_has_user_defined_property;
6505 si = *ary; /* ary[0] = the string to initialize the swash with */
6507 /* Elements 3 and 4 are either both present or both absent. [3] is
6508 * any inversion list generated at compile time; [4] indicates if
6509 * that inversion list has any user-defined properties in it. */
6510 if (av_len(av) >= 3) {
6512 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6516 invlist_has_user_defined_property = FALSE;
6519 /* Element [1] is reserved for the set-up swash. If already there,
6520 * return it; if not, create it and store it there */
6521 if (SvROK(ary[1])) {
6524 else if (si && doinit) {
6526 sw = _core_swash_init("utf8", /* the utf8 package */
6530 0, /* not from tr/// */
6531 FALSE, /* is error if can't find
6534 invlist_has_user_defined_property);
6535 (void)av_store(av, 1, sw);
6538 /* Element [2] is for any multi-char folds. Note that is a
6539 * fundamentally flawed design, because can't backtrack and try
6540 * again. See [perl #89774] */
6541 if (SvTYPE(ary[2]) == SVt_PVAV) {
6548 SV* matches_string = newSVpvn("", 0);
6551 /* Use the swash, if any, which has to have incorporated into it all
6555 && SvTYPE(SvRV(sw)) == SVt_PVHV
6556 && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6558 invlist = *invlistsvp;
6560 else if (si && si != &PL_sv_undef) {
6562 /* If no swash, use the input nitialization string, if available */
6563 sv_catsv(matches_string, si);
6566 /* Add the inversion list to whatever we have. This may have come from
6567 * the swash, or from an input parameter */
6569 sv_catsv(matches_string, _invlist_contents(invlist));
6571 *listsvp = matches_string;
6581 - reginclass - determine if a character falls into a character class
6583 n is the ANYOF regnode
6584 p is the target string
6585 lenp is pointer to the maximum number of bytes of how far to go in p
6586 (This is assumed wthout checking to always be at least the current
6588 utf8_target tells whether p is in UTF-8.
6590 Returns true if matched; false otherwise. If lenp is not NULL, on return
6591 from a successful match, the value it points to will be updated to how many
6592 bytes in p were matched. If there was no match, the value is undefined,
6593 possibly changed from the input.
6595 Note that this can be a synthetic start class, a combination of various
6596 nodes, so things you think might be mutually exclusive, such as locale,
6597 aren't. It can match both locale and non-locale
6602 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6605 const char flags = ANYOF_FLAGS(n);
6611 PERL_ARGS_ASSERT_REGINCLASS;
6613 /* If c is not already the code point, get it */
6614 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6615 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6616 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6617 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6618 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6619 * UTF8_ALLOW_FFFF */
6620 if (c_len == (STRLEN)-1)
6621 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6627 /* Use passed in max length, or one character if none passed in or less
6628 * than one character. And assume will match just one character. This is
6629 * overwritten later if matched more. */
6631 maxlen = (*lenp > c_len) ? *lenp : c_len;
6639 /* If this character is potentially in the bitmap, check it */
6641 if (ANYOF_BITMAP_TEST(n, c))
6643 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6650 else if (flags & ANYOF_LOCALE) {
6651 PL_reg_flags |= RF_tainted;
6653 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6654 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6658 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6659 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6660 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6661 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6662 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6663 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6664 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6665 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6666 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6667 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6668 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6669 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6670 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
6671 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6672 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6673 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6674 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6675 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6676 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6677 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6678 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6679 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6680 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6681 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6682 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6683 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6684 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6685 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6686 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6687 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6688 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
6689 ) /* How's that for a conditional? */
6696 /* If the bitmap didn't (or couldn't) match, and something outside the
6697 * bitmap could match, try that. Locale nodes specifiy completely the
6698 * behavior of code points in the bit map (otherwise, a utf8 target would
6699 * cause them to be treated as Unicode and not locale), except in
6700 * the very unlikely event when this node is a synthetic start class, which
6701 * could be a combination of locale and non-locale nodes. So allow locale
6702 * to match for the synthetic start class, which will give a false
6703 * positive that will be resolved when the match is done again as not part
6704 * of the synthetic start class */
6706 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6707 match = TRUE; /* Everything above 255 matches */
6709 else if (ANYOF_NONBITMAP(n)
6710 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6713 || (! (flags & ANYOF_LOCALE))
6714 || (flags & ANYOF_IS_SYNTHETIC)))))
6717 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6725 /* Not utf8. Convert as much of the string as available up
6726 * to the limit of how far the (single) character in the
6727 * pattern can possibly match (no need to go further). If
6728 * the node is a straight ANYOF or not folding, it can't
6729 * match more than one. Otherwise, It can match up to how
6730 * far a single char can fold to. Since not utf8, each
6731 * character is a single byte, so the max it can be in
6732 * bytes is the same as the max it can be in characters */
6733 STRLEN len = (OP(n) == ANYOF
6734 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6736 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6738 : UTF8_MAX_FOLD_CHAR_EXPAND;
6739 utf8_p = bytes_to_utf8(p, &len);
6742 if (swash_fetch(sw, utf8_p, TRUE))
6744 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6746 /* Here, we need to test if the fold of the target string
6747 * matches. The non-multi char folds have all been moved to
6748 * the compilation phase, and the multi-char folds have
6749 * been stored by regcomp into 'av'; we linearly check to
6750 * see if any match the target string (folded). We know
6751 * that the originals were each one character, but we don't
6752 * currently know how many characters/bytes each folded to,
6753 * except we do know that there are small limits imposed by
6754 * Unicode. XXX A performance enhancement would be to have
6755 * regcomp.c store the max number of chars/bytes that are
6756 * in an av entry, as, say the 0th element. Even better
6757 * would be to have a hash of the few characters that can
6758 * start a multi-char fold to the max number of chars of
6761 * If there is a match, we will need to advance (if lenp is
6762 * specified) the match pointer in the target string. But
6763 * what we are comparing here isn't that string directly,
6764 * but its fold, whose length may differ from the original.
6765 * As we go along in constructing the fold, therefore, we
6766 * create a map so that we know how many bytes in the
6767 * source to advance given that we have matched a certain
6768 * number of bytes in the fold. This map is stored in
6769 * 'map_fold_len_back'. Let n mean the number of bytes in
6770 * the fold of the first character that we are folding.
6771 * Then map_fold_len_back[n] is set to the number of bytes
6772 * in that first character. Similarly let m be the
6773 * corresponding number for the second character to be
6774 * folded. Then map_fold_len_back[n+m] is set to the
6775 * number of bytes occupied by the first two source
6776 * characters. ... */
6777 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6778 U8 folded[UTF8_MAXBYTES_CASE+1];
6779 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6780 STRLEN total_foldlen = 0; /* num bytes in fold of all
6783 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6785 /* Here, only need to fold the first char of the target
6786 * string. It the source wasn't utf8, is 1 byte long */
6787 to_utf8_fold(utf8_p, folded, &foldlen);
6788 total_foldlen = foldlen;
6789 map_fold_len_back[foldlen] = (utf8_target)
6795 /* Here, need to fold more than the first char. Do so
6796 * up to the limits */
6797 U8* source_ptr = utf8_p; /* The source for the fold
6800 U8* folded_ptr = folded;
6801 U8* e = utf8_p + maxlen; /* Can't go beyond last
6802 available byte in the
6806 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6810 /* Fold the next character */
6811 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6812 STRLEN this_char_foldlen;
6813 to_utf8_fold(source_ptr,
6815 &this_char_foldlen);
6817 /* Bail if it would exceed the byte limit for
6818 * folding a single char. */
6819 if (this_char_foldlen + folded_ptr - folded >
6825 /* Add the fold of this character */
6826 Copy(this_char_folded,
6830 source_ptr += UTF8SKIP(source_ptr);
6831 folded_ptr += this_char_foldlen;
6832 total_foldlen = folded_ptr - folded;
6834 /* Create map from the number of bytes in the fold
6835 * back to the number of bytes in the source. If
6836 * the source isn't utf8, the byte count is just
6837 * the number of characters so far */
6838 map_fold_len_back[total_foldlen]
6840 ? source_ptr - utf8_p
6847 /* Do the linear search to see if the fold is in the list
6848 * of multi-char folds. */
6851 for (i = 0; i <= av_len(av); i++) {
6852 SV* const sv = *av_fetch(av, i, FALSE);
6854 const char * const s = SvPV_const(sv, len);
6856 if (len <= total_foldlen
6857 && memEQ(s, (char*)folded, len)
6859 /* If 0, means matched a partial char. See
6861 && map_fold_len_back[len])
6864 /* Advance the target string ptr to account for
6865 * this fold, but have to translate from the
6866 * folded length to the corresponding source
6869 *lenp = map_fold_len_back[len];
6878 /* If we allocated a string above, free it */
6879 if (! utf8_target) Safefree(utf8_p);
6884 return (flags & ANYOF_INVERT) ? !match : match;
6888 S_reghop3(U8 *s, I32 off, const U8* lim)
6890 /* return the position 'off' UTF-8 characters away from 's', forward if
6891 * 'off' >= 0, backwards if negative. But don't go outside of position
6892 * 'lim', which better be < s if off < 0 */
6896 PERL_ARGS_ASSERT_REGHOP3;
6899 while (off-- && s < lim) {
6900 /* XXX could check well-formedness here */
6905 while (off++ && s > lim) {
6907 if (UTF8_IS_CONTINUED(*s)) {
6908 while (s > lim && UTF8_IS_CONTINUATION(*s))
6911 /* XXX could check well-formedness here */
6918 /* there are a bunch of places where we use two reghop3's that should
6919 be replaced with this routine. but since thats not done yet
6920 we ifdef it out - dmq
6923 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6927 PERL_ARGS_ASSERT_REGHOP4;
6930 while (off-- && s < rlim) {
6931 /* XXX could check well-formedness here */
6936 while (off++ && s > llim) {
6938 if (UTF8_IS_CONTINUED(*s)) {
6939 while (s > llim && UTF8_IS_CONTINUATION(*s))
6942 /* XXX could check well-formedness here */
6950 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6954 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6957 while (off-- && s < lim) {
6958 /* XXX could check well-formedness here */
6965 while (off++ && s > lim) {
6967 if (UTF8_IS_CONTINUED(*s)) {
6968 while (s > lim && UTF8_IS_CONTINUATION(*s))
6971 /* XXX could check well-formedness here */
6980 restore_pos(pTHX_ void *arg)
6983 regexp * const rex = (regexp *)arg;
6984 if (PL_reg_eval_set) {
6985 if (PL_reg_oldsaved) {
6986 rex->subbeg = PL_reg_oldsaved;
6987 rex->sublen = PL_reg_oldsavedlen;
6988 #ifdef PERL_OLD_COPY_ON_WRITE
6989 rex->saved_copy = PL_nrs;
6991 RXp_MATCH_COPIED_on(rex);
6993 PL_reg_magic->mg_len = PL_reg_oldpos;
6994 PL_reg_eval_set = 0;
6995 PL_curpm = PL_reg_oldcurpm;
7000 S_to_utf8_substr(pTHX_ register regexp *prog)
7004 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7007 if (prog->substrs->data[i].substr
7008 && !prog->substrs->data[i].utf8_substr) {
7009 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7010 prog->substrs->data[i].utf8_substr = sv;
7011 sv_utf8_upgrade(sv);
7012 if (SvVALID(prog->substrs->data[i].substr)) {
7013 if (SvTAIL(prog->substrs->data[i].substr)) {
7014 /* Trim the trailing \n that fbm_compile added last
7016 SvCUR_set(sv, SvCUR(sv) - 1);
7017 /* Whilst this makes the SV technically "invalid" (as its
7018 buffer is no longer followed by "\0") when fbm_compile()
7019 adds the "\n" back, a "\0" is restored. */
7020 fbm_compile(sv, FBMcf_TAIL);
7024 if (prog->substrs->data[i].substr == prog->check_substr)
7025 prog->check_utf8 = sv;
7031 S_to_byte_substr(pTHX_ register regexp *prog)
7036 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7039 if (prog->substrs->data[i].utf8_substr
7040 && !prog->substrs->data[i].substr) {
7041 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7042 if (sv_utf8_downgrade(sv, TRUE)) {
7043 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7044 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7045 /* Trim the trailing \n that fbm_compile added last
7047 SvCUR_set(sv, SvCUR(sv) - 1);
7048 fbm_compile(sv, FBMcf_TAIL);
7056 prog->substrs->data[i].substr = sv;
7057 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7058 prog->check_substr = sv;
7065 * c-indentation-style: bsd
7067 * indent-tabs-mode: nil
7070 * ex: set ts=8 sts=4 sw=4 et: