5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define RF_tainted 1 /* tainted information used? e.g. locale */
97 #define RF_warned 2 /* warned about big count? */
99 #define RF_utf8 8 /* Pattern contains multibyte chars? */
101 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
103 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
106 #define STATIC static
109 /* Valid for non-utf8 strings: avoids the reginclass
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
112 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
113 : ANYOF_BITMAP_TEST(p,*(c)))
119 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
120 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
122 #define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
126 #define HOPBACKc(pos, off) \
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
133 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
134 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
137 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138 #define NEXTCHR_IS_EOS (nextchr < 0)
140 #define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
143 #define SET_locinput(p) \
148 /* these are unrolled below in the CCC_TRY_XXX defined */
149 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
150 if (!CAT2(PL_utf8_,class)) { \
152 ENTER; save_re_context(); \
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
156 /* Doesn't do an assert to verify that is correct */
157 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
158 if (!CAT2(PL_utf8_,class)) { \
160 PERL_UNUSED_VAR(throw_away); \
161 ENTER; save_re_context(); \
162 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
163 PERL_UNUSED_VAR(throw_away); \
166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
167 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
169 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 /* No asserts are done for some of these, in case called on a */ \
171 /* Unicode version in which they map to nothing */ \
172 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
173 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
175 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
177 /* The actual code for CCC_TRY, which uses several variables from the routine
178 * it's callable from. It is designed to be the bulk of a case statement.
179 * FUNC is the macro or function to call on non-utf8 targets that indicate if
180 * nextchr matches the class.
181 * UTF8_TEST is the whole test string to use for utf8 targets
182 * LOAD is what to use to test, and if not present to load in the swash for the
184 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
186 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
187 * utf8 and a variant, load the swash if necessary and test using the utf8
188 * test. Advance to the next character if test is ok, otherwise fail; If not
189 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
190 * fails, or advance to the next character */
192 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
193 if (NEXTCHR_IS_EOS) { \
196 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
197 LOAD_UTF8_CHARCLASS(CLASS, STR); \
198 if (POS_OR_NEG (UTF8_TEST)) { \
202 else if (POS_OR_NEG (FUNC(nextchr))) { \
205 goto increment_locinput;
207 /* Handle the non-locale cases for a character class and its complement. It
208 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
209 * This is because that code fails when the test succeeds, so we want to have
210 * the test fail so that the code succeeds. The swash is stored in a
211 * predictable PL_ place */
212 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
215 _CCC_TRY_CODE( !, FUNC, \
216 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
217 (U8*)locinput, TRUE)), \
220 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
221 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
222 (U8*)locinput, TRUE)), \
224 /* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
234 #define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
236 NAMEA, NNAMEA, FUNCA, \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
242 PL_reg_flags |= RF_tainted; \
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
256 goto increment_locinput; \
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260 /* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262 #define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
265 NAMEA, NNAMEA, FUNCA, \
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
273 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275 /* for use after a quantifier and before an EXACT-like node -- japhy */
276 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
285 #define JUMPABLE(rn) ( \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
294 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
296 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
299 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
302 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
306 /* ... so we use this as its faster. */
307 #define IS_TEXT(rn) ( OP(rn)==EXACT )
308 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
309 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318 #define FIND_NEXT_IMPT(rn) STMT_START { \
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
322 rn = NEXTOPER(NEXTOPER(rn)); \
323 else if (type == PLUS) \
325 else if (type == IFMATCH) \
326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
327 else rn += NEXT_OFF(rn); \
331 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
332 * These are for the pre-composed Hangul syllables, which are all in a
333 * contiguous block and arranged there in such a way so as to facilitate
334 * alorithmic determination of their characteristics. As such, they don't need
335 * a swash, but can be determined by simple arithmetic. Almost all are
336 * GCB=LVT, but every 28th one is a GCB=LV */
337 #define SBASE 0xAC00 /* Start of block */
338 #define SCount 11172 /* Length of block */
341 static void restore_pos(pTHX_ void *arg);
343 #define REGCP_PAREN_ELEMS 3
344 #define REGCP_OTHER_ELEMS 3
345 #define REGCP_FRAME_ELEMS 1
346 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
347 * are needed for the regexp context stack bookkeeping. */
350 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
353 const int retval = PL_savestack_ix;
354 const int paren_elems_to_push =
355 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
356 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
357 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
359 GET_RE_DEBUG_FLAGS_DECL;
361 PERL_ARGS_ASSERT_REGCPPUSH;
363 if (paren_elems_to_push < 0)
364 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
365 paren_elems_to_push);
367 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
368 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
369 " out of range (%lu-%ld)",
371 (unsigned long)maxopenparen,
374 SSGROW(total_elems + REGCP_FRAME_ELEMS);
377 if ((int)maxopenparen > (int)parenfloor)
378 PerlIO_printf(Perl_debug_log,
379 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
384 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
385 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
386 SSPUSHINT(rex->offs[p].end);
387 SSPUSHINT(rex->offs[p].start);
388 SSPUSHINT(rex->offs[p].start_tmp);
389 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
390 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
392 (IV)rex->offs[p].start,
393 (IV)rex->offs[p].start_tmp,
397 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
398 SSPUSHINT(maxopenparen);
399 SSPUSHINT(rex->lastparen);
400 SSPUSHINT(rex->lastcloseparen);
401 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
406 /* These are needed since we do not localize EVAL nodes: */
407 #define REGCP_SET(cp) \
409 PerlIO_printf(Perl_debug_log, \
410 " Setting an EVAL scope, savestack=%"IVdf"\n", \
411 (IV)PL_savestack_ix)); \
414 #define REGCP_UNWIND(cp) \
416 if (cp != PL_savestack_ix) \
417 PerlIO_printf(Perl_debug_log, \
418 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
419 (IV)(cp), (IV)PL_savestack_ix)); \
422 #define UNWIND_PAREN(lp, lcp) \
423 for (n = rex->lastparen; n > lp; n--) \
424 rex->offs[n].end = -1; \
425 rex->lastparen = n; \
426 rex->lastcloseparen = lcp;
430 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
435 GET_RE_DEBUG_FLAGS_DECL;
437 PERL_ARGS_ASSERT_REGCPPOP;
439 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
441 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
442 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
443 rex->lastcloseparen = SSPOPINT;
444 rex->lastparen = SSPOPINT;
445 *maxopenparen_p = SSPOPINT;
447 i -= REGCP_OTHER_ELEMS;
448 /* Now restore the parentheses context. */
450 if (i || rex->lastparen + 1 <= rex->nparens)
451 PerlIO_printf(Perl_debug_log,
452 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
457 paren = *maxopenparen_p;
458 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
460 rex->offs[paren].start_tmp = SSPOPINT;
461 rex->offs[paren].start = SSPOPINT;
463 if (paren <= rex->lastparen)
464 rex->offs[paren].end = tmps;
465 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
466 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
468 (IV)rex->offs[paren].start,
469 (IV)rex->offs[paren].start_tmp,
470 (IV)rex->offs[paren].end,
471 (paren > rex->lastparen ? "(skipped)" : ""));
476 /* It would seem that the similar code in regtry()
477 * already takes care of this, and in fact it is in
478 * a better location to since this code can #if 0-ed out
479 * but the code in regtry() is needed or otherwise tests
480 * requiring null fields (pat.t#187 and split.t#{13,14}
481 * (as of patchlevel 7877) will fail. Then again,
482 * this code seems to be necessary or otherwise
483 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
484 * --jhi updated by dapm */
485 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
486 if (i > *maxopenparen_p)
487 rex->offs[i].start = -1;
488 rex->offs[i].end = -1;
489 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
490 " \\%"UVuf": %s ..-1 undeffing\n",
492 (i > *maxopenparen_p) ? "-1" : " "
498 /* restore the parens and associated vars at savestack position ix,
499 * but without popping the stack */
502 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
504 I32 tmpix = PL_savestack_ix;
505 PL_savestack_ix = ix;
506 regcppop(rex, maxopenparen_p);
507 PL_savestack_ix = tmpix;
510 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
513 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
515 /* Returns a boolean as to whether or not 'character' is a member of the
516 * Posix character class given by 'classnum' that should be equivalent to a
517 * value in the typedef '_char_class_number'.
519 * Ideally this could be replaced by a just an array of function pointers
520 * to the C library functions that implement the macros this calls.
521 * However, to compile, the precise function signatures are required, and
522 * these may vary from platform to to platform. To avoid having to figure
523 * out what those all are on each platform, I (khw) am using this method,
524 * which adds an extra layer of function call overhead. But we don't
525 * particularly care about performance with locales anyway. */
527 switch ((_char_class_number) classnum) {
528 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
529 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
530 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
531 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
532 case _CC_ENUM_LOWER: return isLOWER_LC(character);
533 case _CC_ENUM_PRINT: return isPRINT_LC(character);
534 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
535 case _CC_ENUM_UPPER: return isUPPER_LC(character);
536 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
537 case _CC_ENUM_SPACE: return isSPACE_LC(character);
538 case _CC_ENUM_BLANK: return isBLANK_LC(character);
539 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
540 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
541 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
542 case _CC_ENUM_ASCII: return isASCII_LC(character);
543 default: /* VERTSPACE should never occur in locales */
544 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
547 assert(0); /* NOTREACHED */
552 * pregexec and friends
555 #ifndef PERL_IN_XSUB_RE
557 - pregexec - match a regexp against a string
560 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
561 char *strbeg, I32 minend, SV *screamer, U32 nosave)
562 /* stringarg: the point in the string at which to begin matching */
563 /* strend: pointer to null at end of string */
564 /* strbeg: real beginning of string */
565 /* minend: end of match must be >= minend bytes after stringarg. */
566 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
567 * itself is accessed via the pointers above */
568 /* nosave: For optimizations. */
570 PERL_ARGS_ASSERT_PREGEXEC;
573 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
574 nosave ? 0 : REXEC_COPY_STR);
579 * Need to implement the following flags for reg_anch:
581 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
583 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
584 * INTUIT_AUTORITATIVE_ML
585 * INTUIT_ONCE_NOML - Intuit can match in one location only.
588 * Another flag for this function: SECOND_TIME (so that float substrs
589 * with giant delta may be not rechecked).
592 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
594 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
595 Otherwise, only SvCUR(sv) is used to get strbeg. */
597 /* XXXX We assume that strpos is strbeg unless sv. */
599 /* XXXX Some places assume that there is a fixed substring.
600 An update may be needed if optimizer marks as "INTUITable"
601 RExen without fixed substrings. Similarly, it is assumed that
602 lengths of all the strings are no more than minlen, thus they
603 cannot come from lookahead.
604 (Or minlen should take into account lookahead.)
605 NOTE: Some of this comment is not correct. minlen does now take account
606 of lookahead/behind. Further research is required. -- demerphq
610 /* A failure to find a constant substring means that there is no need to make
611 an expensive call to REx engine, thus we celebrate a failure. Similarly,
612 finding a substring too deep into the string means that less calls to
613 regtry() should be needed.
615 REx compiler's optimizer found 4 possible hints:
616 a) Anchored substring;
618 c) Whether we are anchored (beginning-of-line or \G);
619 d) First node (of those at offset 0) which may distinguish positions;
620 We use a)b)d) and multiline-part of c), and try to find a position in the
621 string which does not contradict any of them.
624 /* Most of decisions we do here should have been done at compile time.
625 The nodes of the REx which we used for the search should have been
626 deleted from the finite automaton. */
629 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
630 char *strend, const U32 flags, re_scream_pos_data *data)
633 struct regexp *const prog = ReANY(rx);
635 /* Should be nonnegative! */
641 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
643 char *other_last = NULL; /* other substr checked before this */
644 char *check_at = NULL; /* check substr found at this pos */
645 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
646 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
647 RXi_GET_DECL(prog,progi);
649 const char * const i_strpos = strpos;
651 GET_RE_DEBUG_FLAGS_DECL;
653 PERL_ARGS_ASSERT_RE_INTUIT_START;
654 PERL_UNUSED_ARG(flags);
655 PERL_UNUSED_ARG(data);
657 RX_MATCH_UTF8_set(rx,utf8_target);
660 PL_reg_flags |= RF_utf8;
663 debug_start_match(rx, utf8_target, strpos, strend,
664 sv ? "Guessing start of match in sv for"
665 : "Guessing start of match in string for");
668 /* CHR_DIST() would be more correct here but it makes things slow. */
669 if (prog->minlen > strend - strpos) {
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
671 "String too short... [re_intuit_start]\n"));
675 /* XXX we need to pass strbeg as a separate arg: the following is
676 * guesswork and can be wrong... */
677 if (sv && SvPOK(sv)) {
678 char * p = SvPVX(sv);
679 STRLEN cur = SvCUR(sv);
680 if (p <= strpos && strpos < p + cur) {
682 assert(p <= strend && strend <= p + cur);
685 strbeg = strend - cur;
692 if (!prog->check_utf8 && prog->check_substr)
693 to_utf8_substr(prog);
694 check = prog->check_utf8;
696 if (!prog->check_substr && prog->check_utf8) {
697 if (! to_byte_substr(prog)) {
698 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
701 check = prog->check_substr;
703 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
704 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
705 || ( (prog->extflags & RXf_ANCH_BOL)
706 && !multiline ) ); /* Check after \n? */
709 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
710 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
711 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
713 && (strpos != strbeg)) {
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
717 if (prog->check_offset_min == prog->check_offset_max
718 && !(prog->extflags & RXf_CANY_SEEN)
719 && ! multiline) /* /m can cause \n's to match that aren't
720 accounted for in the string max length.
721 See [perl #115242] */
723 /* Substring at constant offset from beg-of-str... */
726 s = HOP3c(strpos, prog->check_offset_min, strend);
729 slen = SvCUR(check); /* >= 1 */
731 if ( strend - s > slen || strend - s < slen - 1
732 || (strend - s == slen && strend[-1] != '\n')) {
733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
736 /* Now should match s[0..slen-2] */
738 if (slen && (*SvPVX_const(check) != *s
740 && memNE(SvPVX_const(check), s, slen)))) {
742 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
746 else if (*SvPVX_const(check) != *s
747 || ((slen = SvCUR(check)) > 1
748 && memNE(SvPVX_const(check), s, slen)))
751 goto success_at_start;
754 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
756 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
757 end_shift = prog->check_end_shift;
760 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
761 - (SvTAIL(check) != 0);
762 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
764 if (end_shift < eshift)
768 else { /* Can match at random position */
771 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
772 end_shift = prog->check_end_shift;
774 /* end shift should be non negative here */
777 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
779 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
780 (IV)end_shift, RX_PRECOMP(prog));
784 /* Find a possible match in the region s..strend by looking for
785 the "check" substring in the region corrected by start/end_shift. */
788 I32 srch_start_shift = start_shift;
789 I32 srch_end_shift = end_shift;
792 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
793 srch_end_shift -= ((strbeg - s) - srch_start_shift);
794 srch_start_shift = strbeg - s;
796 DEBUG_OPTIMISE_MORE_r({
797 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
798 (IV)prog->check_offset_min,
799 (IV)srch_start_shift,
801 (IV)prog->check_end_shift);
804 if (prog->extflags & RXf_CANY_SEEN) {
805 start_point= (U8*)(s + srch_start_shift);
806 end_point= (U8*)(strend - srch_end_shift);
808 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
809 end_point= HOP3(strend, -srch_end_shift, strbeg);
811 DEBUG_OPTIMISE_MORE_r({
812 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
813 (int)(end_point - start_point),
814 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
818 s = fbm_instr( start_point, end_point,
819 check, multiline ? FBMrf_MULTILINE : 0);
821 /* Update the count-of-usability, remove useless subpatterns,
825 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
826 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
827 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
828 (s ? "Found" : "Did not find"),
829 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
830 ? "anchored" : "floating"),
833 (s ? " at offset " : "...\n") );
838 /* Finish the diagnostic message */
839 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
841 /* XXX dmq: first branch is for positive lookbehind...
842 Our check string is offset from the beginning of the pattern.
843 So we need to do any stclass tests offset forward from that
852 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
853 Start with the other substr.
854 XXXX no SCREAM optimization yet - and a very coarse implementation
855 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
856 *always* match. Probably should be marked during compile...
857 Probably it is right to do no SCREAM here...
860 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
861 : (prog->float_substr && prog->anchored_substr))
863 /* Take into account the "other" substring. */
864 /* XXXX May be hopelessly wrong for UTF... */
867 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
870 char * const last = HOP3c(s, -start_shift, strbeg);
872 char * const saved_s = s;
875 t = s - prog->check_offset_max;
876 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
878 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
883 t = HOP3c(t, prog->anchored_offset, strend);
884 if (t < other_last) /* These positions already checked */
886 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
889 /* XXXX It is not documented what units *_offsets are in.
890 We assume bytes, but this is clearly wrong.
891 Meaning this code needs to be carefully reviewed for errors.
895 /* On end-of-str: see comment below. */
896 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
897 if (must == &PL_sv_undef) {
899 DEBUG_r(must = prog->anchored_utf8); /* for debug */
904 HOP3(HOP3(last1, prog->anchored_offset, strend)
905 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
907 multiline ? FBMrf_MULTILINE : 0
910 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
911 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
912 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
913 (s ? "Found" : "Contradicts"),
914 quoted, RE_SV_TAIL(must));
919 if (last1 >= last2) {
920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
921 ", giving up...\n"));
924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
925 ", trying floating at offset %ld...\n",
926 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
927 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
928 s = HOP3c(last, 1, strend);
932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
933 (long)(s - i_strpos)));
934 t = HOP3c(s, -prog->anchored_offset, strbeg);
935 other_last = HOP3c(s, 1, strend);
943 else { /* Take into account the floating substring. */
945 char * const saved_s = s;
948 t = HOP3c(s, -start_shift, strbeg);
950 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
951 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
952 last = HOP3c(t, prog->float_max_offset, strend);
953 s = HOP3c(t, prog->float_min_offset, strend);
956 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
957 must = utf8_target ? prog->float_utf8 : prog->float_substr;
958 /* fbm_instr() takes into account exact value of end-of-str
959 if the check is SvTAIL(ed). Since false positives are OK,
960 and end-of-str is not later than strend we are OK. */
961 if (must == &PL_sv_undef) {
963 DEBUG_r(must = prog->float_utf8); /* for debug message */
966 s = fbm_instr((unsigned char*)s,
967 (unsigned char*)last + SvCUR(must)
969 must, multiline ? FBMrf_MULTILINE : 0);
971 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
972 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
973 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
974 (s ? "Found" : "Contradicts"),
975 quoted, RE_SV_TAIL(must));
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
980 ", giving up...\n"));
983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
984 ", trying anchored starting at offset %ld...\n",
985 (long)(saved_s + 1 - i_strpos)));
987 s = HOP3c(t, 1, strend);
991 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
992 (long)(s - i_strpos)));
993 other_last = s; /* Fix this later. --Hugo */
1003 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1005 DEBUG_OPTIMISE_MORE_r(
1006 PerlIO_printf(Perl_debug_log,
1007 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1008 (IV)prog->check_offset_min,
1009 (IV)prog->check_offset_max,
1017 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1019 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1022 /* Fixed substring is found far enough so that the match
1023 cannot start at strpos. */
1025 if (ml_anch && t[-1] != '\n') {
1026 /* Eventually fbm_*() should handle this, but often
1027 anchored_offset is not 0, so this check will not be wasted. */
1028 /* XXXX In the code below we prefer to look for "^" even in
1029 presence of anchored substrings. And we search even
1030 beyond the found float position. These pessimizations
1031 are historical artefacts only. */
1033 while (t < strend - prog->minlen) {
1035 if (t < check_at - prog->check_offset_min) {
1036 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1037 /* Since we moved from the found position,
1038 we definitely contradict the found anchored
1039 substr. Due to the above check we do not
1040 contradict "check" substr.
1041 Thus we can arrive here only if check substr
1042 is float. Redo checking for "other"=="fixed".
1045 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1046 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1047 goto do_other_anchored;
1049 /* We don't contradict the found floating substring. */
1050 /* XXXX Why not check for STCLASS? */
1052 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1053 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1056 /* Position contradicts check-string */
1057 /* XXXX probably better to look for check-string
1058 than for "\n", so one should lower the limit for t? */
1059 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1060 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1061 other_last = strpos = s = t + 1;
1066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1067 PL_colors[0], PL_colors[1]));
1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1072 PL_colors[0], PL_colors[1]));
1076 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1079 /* The found string does not prohibit matching at strpos,
1080 - no optimization of calling REx engine can be performed,
1081 unless it was an MBOL and we are not after MBOL,
1082 or a future STCLASS check will fail this. */
1084 /* Even in this situation we may use MBOL flag if strpos is offset
1085 wrt the start of the string. */
1086 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1087 && (strpos != strbeg) && strpos[-1] != '\n'
1088 /* May be due to an implicit anchor of m{.*foo} */
1089 && !(prog->intflags & PREGf_IMPLICIT))
1094 DEBUG_EXECUTE_r( if (ml_anch)
1095 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1096 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1099 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1101 prog->check_utf8 /* Could be deleted already */
1102 && --BmUSEFUL(prog->check_utf8) < 0
1103 && (prog->check_utf8 == prog->float_utf8)
1105 prog->check_substr /* Could be deleted already */
1106 && --BmUSEFUL(prog->check_substr) < 0
1107 && (prog->check_substr == prog->float_substr)
1110 /* If flags & SOMETHING - do not do it many times on the same match */
1111 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1112 /* XXX Does the destruction order has to change with utf8_target? */
1113 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1114 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1115 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1116 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1117 check = NULL; /* abort */
1119 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1120 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1121 if (prog->intflags & PREGf_IMPLICIT)
1122 prog->extflags &= ~RXf_ANCH_MBOL;
1123 /* XXXX This is a remnant of the old implementation. It
1124 looks wasteful, since now INTUIT can use many
1125 other heuristics. */
1126 prog->extflags &= ~RXf_USE_INTUIT;
1127 /* XXXX What other flags might need to be cleared in this branch? */
1133 /* Last resort... */
1134 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1135 /* trie stclasses are too expensive to use here, we are better off to
1136 leave it to regmatch itself */
1137 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1138 /* minlen == 0 is possible if regstclass is \b or \B,
1139 and the fixed substr is ''$.
1140 Since minlen is already taken into account, s+1 is before strend;
1141 accidentally, minlen >= 1 guaranties no false positives at s + 1
1142 even for \b or \B. But (minlen? 1 : 0) below assumes that
1143 regstclass does not come from lookahead... */
1144 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1145 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1146 const U8* const str = (U8*)STRING(progi->regstclass);
1147 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1148 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1151 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1152 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1153 else if (prog->float_substr || prog->float_utf8)
1154 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1158 if (checked_upto < s)
1160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1161 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1164 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1169 const char *what = NULL;
1171 if (endpos == strend) {
1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1173 "Could not match STCLASS...\n") );
1176 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1177 "This position contradicts STCLASS...\n") );
1178 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1180 checked_upto = HOPBACKc(endpos, start_shift);
1181 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1182 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1183 /* Contradict one of substrings */
1184 if (prog->anchored_substr || prog->anchored_utf8) {
1185 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1186 DEBUG_EXECUTE_r( what = "anchored" );
1188 s = HOP3c(t, 1, strend);
1189 if (s + start_shift + end_shift > strend) {
1190 /* XXXX Should be taken into account earlier? */
1191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1192 "Could not match STCLASS...\n") );
1197 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1198 "Looking for %s substr starting at offset %ld...\n",
1199 what, (long)(s + start_shift - i_strpos)) );
1202 /* Have both, check_string is floating */
1203 if (t + start_shift >= check_at) /* Contradicts floating=check */
1204 goto retry_floating_check;
1205 /* Recheck anchored substring, but not floating... */
1209 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1210 "Looking for anchored substr starting at offset %ld...\n",
1211 (long)(other_last - i_strpos)) );
1212 goto do_other_anchored;
1214 /* Another way we could have checked stclass at the
1215 current position only: */
1220 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1221 "Looking for /%s^%s/m starting at offset %ld...\n",
1222 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1225 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1227 /* Check is floating substring. */
1228 retry_floating_check:
1229 t = check_at - start_shift;
1230 DEBUG_EXECUTE_r( what = "floating" );
1231 goto hop_and_restart;
1234 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1235 "By STCLASS: moving %ld --> %ld\n",
1236 (long)(t - i_strpos), (long)(s - i_strpos))
1240 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1241 "Does not contradict STCLASS...\n");
1246 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1247 PL_colors[4], (check ? "Guessed" : "Giving up"),
1248 PL_colors[5], (long)(s - i_strpos)) );
1251 fail_finish: /* Substring not found */
1252 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1253 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1255 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1256 PL_colors[4], PL_colors[5]));
1260 #define DECL_TRIE_TYPE(scan) \
1261 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1262 trie_type = ((scan->flags == EXACT) \
1263 ? (utf8_target ? trie_utf8 : trie_plain) \
1264 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1266 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1267 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1269 switch (trie_type) { \
1270 case trie_utf8_fold: \
1271 if ( foldlen>0 ) { \
1272 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1277 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1278 len = UTF8SKIP(uc); \
1279 skiplen = UNISKIP( uvc ); \
1280 foldlen -= skiplen; \
1281 uscan = foldbuf + skiplen; \
1284 case trie_latin_utf8_fold: \
1285 if ( foldlen>0 ) { \
1286 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1292 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1293 skiplen = UNISKIP( uvc ); \
1294 foldlen -= skiplen; \
1295 uscan = foldbuf + skiplen; \
1299 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1306 charid = trie->charmap[ uvc ]; \
1310 if (widecharmap) { \
1311 SV** const svpp = hv_fetch(widecharmap, \
1312 (char*)&uvc, sizeof(UV), 0); \
1314 charid = (U16)SvIV(*svpp); \
1319 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1323 && (ln == 1 || folder(s, pat_string, ln)) \
1324 && (!reginfo || regtry(reginfo, &s)) ) \
1330 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1332 while (s < strend) { \
1338 #define REXEC_FBC_SCAN(CoDe) \
1340 while (s < strend) { \
1346 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1347 REXEC_FBC_UTF8_SCAN( \
1349 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1358 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1361 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1370 #define REXEC_FBC_TRYIT \
1371 if ((!reginfo || regtry(reginfo, &s))) \
1374 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1375 if (utf8_target) { \
1376 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1379 REXEC_FBC_CLASS_SCAN(CoNd); \
1382 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1383 if (utf8_target) { \
1385 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1388 REXEC_FBC_CLASS_SCAN(CoNd); \
1391 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1392 PL_reg_flags |= RF_tainted; \
1393 if (utf8_target) { \
1394 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1397 REXEC_FBC_CLASS_SCAN(CoNd); \
1400 #define DUMP_EXEC_POS(li,s,doutf8) \
1401 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1404 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1405 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1406 tmp = TEST_NON_UTF8(tmp); \
1407 REXEC_FBC_UTF8_SCAN( \
1408 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1417 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1418 if (s == PL_bostr) { \
1422 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1423 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1426 LOAD_UTF8_CHARCLASS_ALNUM(); \
1427 REXEC_FBC_UTF8_SCAN( \
1428 if (tmp == ! (TeSt2_UtF8)) { \
1437 /* The only difference between the BOUND and NBOUND cases is that
1438 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1439 * NBOUND. This is accomplished by passing it in either the if or else clause,
1440 * with the other one being empty */
1441 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1442 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1444 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1445 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1447 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1448 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1450 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1451 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1454 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1455 * be passed in completely with the variable name being tested, which isn't
1456 * such a clean interface, but this is easier to read than it was before. We
1457 * are looking for the boundary (or non-boundary between a word and non-word
1458 * character. The utf8 and non-utf8 cases have the same logic, but the details
1459 * must be different. Find the "wordness" of the character just prior to this
1460 * one, and compare it with the wordness of this one. If they differ, we have
1461 * a boundary. At the beginning of the string, pretend that the previous
1462 * character was a new-line */
1463 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1464 if (utf8_target) { \
1467 else { /* Not utf8 */ \
1468 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1469 tmp = TEST_NON_UTF8(tmp); \
1471 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1480 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1483 /* We know what class REx starts with. Try to find this position... */
1484 /* if reginfo is NULL, its a dryrun */
1485 /* annoyingly all the vars in this routine have different names from their counterparts
1486 in regmatch. /grrr */
1489 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1490 const char *strend, regmatch_info *reginfo)
1493 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1494 char *pat_string; /* The pattern's exactish string */
1495 char *pat_end; /* ptr to end char of pat_string */
1496 re_fold_t folder; /* Function for computing non-utf8 folds */
1497 const U8 *fold_array; /* array for folding ords < 256 */
1504 I32 tmp = 1; /* Scratch variable? */
1505 const bool utf8_target = PL_reg_match_utf8;
1506 UV utf8_fold_flags = 0;
1507 RXi_GET_DECL(prog,progi);
1509 PERL_ARGS_ASSERT_FIND_BYCLASS;
1511 /* We know what class it must start with. */
1515 REXEC_FBC_UTF8_CLASS_SCAN(
1516 reginclass(prog, c, (U8*)s, utf8_target));
1519 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1524 if (tmp && (!reginfo || regtry(reginfo, &s)))
1532 if (UTF_PATTERN || utf8_target) {
1533 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1534 goto do_exactf_utf8;
1536 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1537 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1538 goto do_exactf_non_utf8; /* isn't dealt with by these */
1543 /* regcomp.c already folded this if pattern is in UTF-8 */
1544 utf8_fold_flags = 0;
1545 goto do_exactf_utf8;
1547 fold_array = PL_fold;
1549 goto do_exactf_non_utf8;
1552 if (UTF_PATTERN || utf8_target) {
1553 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1554 goto do_exactf_utf8;
1556 fold_array = PL_fold_locale;
1557 folder = foldEQ_locale;
1558 goto do_exactf_non_utf8;
1562 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1564 goto do_exactf_utf8;
1566 case EXACTFU_TRICKYFOLD:
1568 if (UTF_PATTERN || utf8_target) {
1569 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1570 goto do_exactf_utf8;
1573 /* Any 'ss' in the pattern should have been replaced by regcomp,
1574 * so we don't have to worry here about this single special case
1575 * in the Latin1 range */
1576 fold_array = PL_fold_latin1;
1577 folder = foldEQ_latin1;
1581 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1582 are no glitches with fold-length differences
1583 between the target string and pattern */
1585 /* The idea in the non-utf8 EXACTF* cases is to first find the
1586 * first character of the EXACTF* node and then, if necessary,
1587 * case-insensitively compare the full text of the node. c1 is the
1588 * first character. c2 is its fold. This logic will not work for
1589 * Unicode semantics and the german sharp ss, which hence should
1590 * not be compiled into a node that gets here. */
1591 pat_string = STRING(c);
1592 ln = STR_LEN(c); /* length to match in octets/bytes */
1594 /* We know that we have to match at least 'ln' bytes (which is the
1595 * same as characters, since not utf8). If we have to match 3
1596 * characters, and there are only 2 availabe, we know without
1597 * trying that it will fail; so don't start a match past the
1598 * required minimum number from the far end */
1599 e = HOP3c(strend, -((I32)ln), s);
1601 if (!reginfo && e < s) {
1602 e = s; /* Due to minlen logic of intuit() */
1606 c2 = fold_array[c1];
1607 if (c1 == c2) { /* If char and fold are the same */
1608 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1611 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1619 /* If one of the operands is in utf8, we can't use the simpler folding
1620 * above, due to the fact that many different characters can have the
1621 * same fold, or portion of a fold, or different- length fold */
1622 pat_string = STRING(c);
1623 ln = STR_LEN(c); /* length to match in octets/bytes */
1624 pat_end = pat_string + ln;
1625 lnc = (UTF_PATTERN) /* length to match in characters */
1626 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1629 /* We have 'lnc' characters to match in the pattern, but because of
1630 * multi-character folding, each character in the target can match
1631 * up to 3 characters (Unicode guarantees it will never exceed
1632 * this) if it is utf8-encoded; and up to 2 if not (based on the
1633 * fact that the Latin 1 folds are already determined, and the
1634 * only multi-char fold in that range is the sharp-s folding to
1635 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1636 * string character. Adjust lnc accordingly, rounding up, so that
1637 * if we need to match at least 4+1/3 chars, that really is 5. */
1638 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1639 lnc = (lnc + expansion - 1) / expansion;
1641 /* As in the non-UTF8 case, if we have to match 3 characters, and
1642 * only 2 are left, it's guaranteed to fail, so don't start a
1643 * match that would require us to go beyond the end of the string
1645 e = HOP3c(strend, -((I32)lnc), s);
1647 if (!reginfo && e < s) {
1648 e = s; /* Due to minlen logic of intuit() */
1651 /* XXX Note that we could recalculate e to stop the loop earlier,
1652 * as the worst case expansion above will rarely be met, and as we
1653 * go along we would usually find that e moves further to the left.
1654 * This would happen only after we reached the point in the loop
1655 * where if there were no expansion we should fail. Unclear if
1656 * worth the expense */
1659 char *my_strend= (char *)strend;
1660 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1661 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1662 && (!reginfo || regtry(reginfo, &s)) )
1666 s += (utf8_target) ? UTF8SKIP(s) : 1;
1671 PL_reg_flags |= RF_tainted;
1672 FBC_BOUND(isALNUM_LC,
1673 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1674 isALNUM_LC_utf8((U8*)s));
1677 PL_reg_flags |= RF_tainted;
1678 FBC_NBOUND(isALNUM_LC,
1679 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1680 isALNUM_LC_utf8((U8*)s));
1683 FBC_BOUND(isWORDCHAR,
1685 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1688 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1690 isWORDCHAR_A((U8*)s));
1693 FBC_NBOUND(isWORDCHAR,
1695 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1698 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1700 isWORDCHAR_A((U8*)s));
1703 FBC_BOUND(isWORDCHAR_L1,
1705 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1708 FBC_NBOUND(isWORDCHAR_L1,
1710 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1713 REXEC_FBC_CSCAN_TAINT(
1714 isALNUM_LC_utf8((U8*)s),
1719 REXEC_FBC_CSCAN_PRELOAD(
1720 LOAD_UTF8_CHARCLASS_ALNUM(),
1721 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1722 isWORDCHAR_L1((U8) *s)
1726 REXEC_FBC_CSCAN_PRELOAD(
1727 LOAD_UTF8_CHARCLASS_ALNUM(),
1728 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1733 /* Don't need to worry about utf8, as it can match only a single
1734 * byte invariant character */
1735 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1738 REXEC_FBC_CSCAN_PRELOAD(
1739 LOAD_UTF8_CHARCLASS_ALNUM(),
1740 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1741 ! isWORDCHAR_L1((U8) *s)
1745 REXEC_FBC_CSCAN_PRELOAD(
1746 LOAD_UTF8_CHARCLASS_ALNUM(),
1747 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1758 REXEC_FBC_CSCAN_TAINT(
1759 !isALNUM_LC_utf8((U8*)s),
1765 is_XPERLSPACE_utf8(s),
1771 is_XPERLSPACE_utf8(s),
1776 /* Don't need to worry about utf8, as it can match only a single
1777 * byte invariant character */
1778 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1781 REXEC_FBC_CSCAN_TAINT(
1782 isSPACE_LC_utf8((U8*)s),
1788 ! is_XPERLSPACE_utf8(s),
1789 ! isSPACE_L1((U8) *s)
1794 ! is_XPERLSPACE_utf8(s),
1805 REXEC_FBC_CSCAN_TAINT(
1806 !isSPACE_LC_utf8((U8*)s),
1811 REXEC_FBC_CSCAN_PRELOAD(
1812 LOAD_UTF8_CHARCLASS_DIGIT(),
1813 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1818 /* Don't need to worry about utf8, as it can match only a single
1819 * byte invariant character */
1820 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1823 REXEC_FBC_CSCAN_TAINT(
1824 isDIGIT_LC_utf8((U8*)s),
1829 REXEC_FBC_CSCAN_PRELOAD(
1830 LOAD_UTF8_CHARCLASS_DIGIT(),
1831 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1842 REXEC_FBC_CSCAN_TAINT(
1843 !isDIGIT_LC_utf8((U8*)s),
1848 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1849 is_LNBREAK_latin1_safe(s, strend)
1854 is_VERTWS_utf8_safe(s, strend),
1855 is_VERTWS_latin1_safe(s, strend)
1860 !is_VERTWS_utf8_safe(s, strend),
1861 !is_VERTWS_latin1_safe(s, strend)
1866 is_HORIZWS_utf8_safe(s, strend),
1867 is_HORIZWS_latin1_safe(s, strend)
1872 !is_HORIZWS_utf8_safe(s, strend),
1873 !is_HORIZWS_latin1_safe(s, strend)
1877 /* Don't need to worry about utf8, as it can match only a single
1878 * byte invariant character. The flag in this node type is the
1879 * class number to pass to _generic_isCC() to build a mask for
1880 * searching in PL_charclass[] */
1881 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1885 !_generic_isCC_A(*s, FLAGS(c)),
1886 !_generic_isCC_A(*s, FLAGS(c))
1894 /* what trie are we using right now */
1895 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1896 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1897 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1899 const char *last_start = strend - trie->minlen;
1901 const char *real_start = s;
1903 STRLEN maxlen = trie->maxlen;
1905 U8 **points; /* map of where we were in the input string
1906 when reading a given char. For ASCII this
1907 is unnecessary overhead as the relationship
1908 is always 1:1, but for Unicode, especially
1909 case folded Unicode this is not true. */
1910 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1914 GET_RE_DEBUG_FLAGS_DECL;
1916 /* We can't just allocate points here. We need to wrap it in
1917 * an SV so it gets freed properly if there is a croak while
1918 * running the match */
1921 sv_points=newSV(maxlen * sizeof(U8 *));
1922 SvCUR_set(sv_points,
1923 maxlen * sizeof(U8 *));
1924 SvPOK_on(sv_points);
1925 sv_2mortal(sv_points);
1926 points=(U8**)SvPV_nolen(sv_points );
1927 if ( trie_type != trie_utf8_fold
1928 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1931 bitmap=(U8*)trie->bitmap;
1933 bitmap=(U8*)ANYOF_BITMAP(c);
1935 /* this is the Aho-Corasick algorithm modified a touch
1936 to include special handling for long "unknown char" sequences.
1937 The basic idea being that we use AC as long as we are dealing
1938 with a possible matching char, when we encounter an unknown char
1939 (and we have not encountered an accepting state) we scan forward
1940 until we find a legal starting char.
1941 AC matching is basically that of trie matching, except that when
1942 we encounter a failing transition, we fall back to the current
1943 states "fail state", and try the current char again, a process
1944 we repeat until we reach the root state, state 1, or a legal
1945 transition. If we fail on the root state then we can either
1946 terminate if we have reached an accepting state previously, or
1947 restart the entire process from the beginning if we have not.
1950 while (s <= last_start) {
1951 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1959 U8 *uscan = (U8*)NULL;
1960 U8 *leftmost = NULL;
1962 U32 accepted_word= 0;
1966 while ( state && uc <= (U8*)strend ) {
1968 U32 word = aho->states[ state ].wordnum;
1972 DEBUG_TRIE_EXECUTE_r(
1973 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1974 dump_exec_pos( (char *)uc, c, strend, real_start,
1975 (char *)uc, utf8_target );
1976 PerlIO_printf( Perl_debug_log,
1977 " Scanning for legal start char...\n");
1981 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1985 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1991 if (uc >(U8*)last_start) break;
1995 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1996 if (!leftmost || lpos < leftmost) {
1997 DEBUG_r(accepted_word=word);
2003 points[pointpos++ % maxlen]= uc;
2004 if (foldlen || uc < (U8*)strend) {
2005 REXEC_TRIE_READ_CHAR(trie_type, trie,
2007 uscan, len, uvc, charid, foldlen,
2009 DEBUG_TRIE_EXECUTE_r({
2010 dump_exec_pos( (char *)uc, c, strend,
2011 real_start, s, utf8_target);
2012 PerlIO_printf(Perl_debug_log,
2013 " Charid:%3u CP:%4"UVxf" ",
2025 word = aho->states[ state ].wordnum;
2027 base = aho->states[ state ].trans.base;
2029 DEBUG_TRIE_EXECUTE_r({
2031 dump_exec_pos( (char *)uc, c, strend, real_start,
2033 PerlIO_printf( Perl_debug_log,
2034 "%sState: %4"UVxf", word=%"UVxf,
2035 failed ? " Fail transition to " : "",
2036 (UV)state, (UV)word);
2042 ( ((offset = base + charid
2043 - 1 - trie->uniquecharcount)) >= 0)
2044 && ((U32)offset < trie->lasttrans)
2045 && trie->trans[offset].check == state
2046 && (tmp=trie->trans[offset].next))
2048 DEBUG_TRIE_EXECUTE_r(
2049 PerlIO_printf( Perl_debug_log," - legal\n"));
2054 DEBUG_TRIE_EXECUTE_r(
2055 PerlIO_printf( Perl_debug_log," - fail\n"));
2057 state = aho->fail[state];
2061 /* we must be accepting here */
2062 DEBUG_TRIE_EXECUTE_r(
2063 PerlIO_printf( Perl_debug_log," - accepting\n"));
2072 if (!state) state = 1;
2075 if ( aho->states[ state ].wordnum ) {
2076 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2077 if (!leftmost || lpos < leftmost) {
2078 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2083 s = (char*)leftmost;
2084 DEBUG_TRIE_EXECUTE_r({
2086 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2087 (UV)accepted_word, (IV)(s - real_start)
2090 if (!reginfo || regtry(reginfo, &s)) {
2096 DEBUG_TRIE_EXECUTE_r({
2097 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2100 DEBUG_TRIE_EXECUTE_r(
2101 PerlIO_printf( Perl_debug_log,"No match.\n"));
2110 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2120 - regexec_flags - match a regexp against a string
2123 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2124 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2125 /* stringarg: the point in the string at which to begin matching */
2126 /* strend: pointer to null at end of string */
2127 /* strbeg: real beginning of string */
2128 /* minend: end of match must be >= minend bytes after stringarg. */
2129 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2130 * itself is accessed via the pointers above */
2131 /* data: May be used for some additional optimizations.
2132 Currently its only used, with a U32 cast, for transmitting
2133 the ganch offset when doing a /g match. This will change */
2134 /* nosave: For optimizations. */
2138 struct regexp *const prog = ReANY(rx);
2141 char *startpos = stringarg;
2142 I32 minlen; /* must match at least this many chars */
2143 I32 dontbother = 0; /* how many characters not to try at end */
2144 I32 end_shift = 0; /* Same for the end. */ /* CC */
2145 I32 scream_pos = -1; /* Internal iterator of scream. */
2146 char *scream_olds = NULL;
2147 const bool utf8_target = cBOOL(DO_UTF8(sv));
2149 RXi_GET_DECL(prog,progi);
2150 regmatch_info reginfo; /* create some info to pass to regtry etc */
2151 regexp_paren_pair *swap = NULL;
2152 GET_RE_DEBUG_FLAGS_DECL;
2154 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2155 PERL_UNUSED_ARG(data);
2157 /* Be paranoid... */
2158 if (prog == NULL || startpos == NULL) {
2159 Perl_croak(aTHX_ "NULL regexp parameter");
2163 multiline = prog->extflags & RXf_PMf_MULTILINE;
2164 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2166 RX_MATCH_UTF8_set(rx, utf8_target);
2168 debug_start_match(rx, utf8_target, startpos, strend,
2172 minlen = prog->minlen;
2174 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2176 "String too short [regexec_flags]...\n"));
2181 /* Check validity of program. */
2182 if (UCHARAT(progi->program) != REG_MAGIC) {
2183 Perl_croak(aTHX_ "corrupted regexp program");
2187 PL_reg_state.re_state_eval_setup_done = FALSE;
2191 PL_reg_flags |= RF_utf8;
2193 /* Mark beginning of line for ^ and lookbehind. */
2194 reginfo.bol = startpos; /* XXX not used ??? */
2198 /* Mark end of line for $ (and such) */
2201 /* see how far we have to get to not match where we matched before */
2202 reginfo.till = startpos+minend;
2204 /* If there is a "must appear" string, look for it. */
2207 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2209 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2210 reginfo.ganch = startpos + prog->gofs;
2211 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2212 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2213 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2215 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2216 && mg->mg_len >= 0) {
2217 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2218 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2219 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2221 if (prog->extflags & RXf_ANCH_GPOS) {
2222 if (s > reginfo.ganch)
2224 s = reginfo.ganch - prog->gofs;
2225 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2226 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2232 reginfo.ganch = strbeg + PTR2UV(data);
2233 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2234 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2236 } else { /* pos() not defined */
2237 reginfo.ganch = strbeg;
2238 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2239 "GPOS: reginfo.ganch = strbeg\n"));
2242 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2243 /* We have to be careful. If the previous successful match
2244 was from this regex we don't want a subsequent partially
2245 successful match to clobber the old results.
2246 So when we detect this possibility we add a swap buffer
2247 to the re, and switch the buffer each match. If we fail
2248 we switch it back, otherwise we leave it swapped.
2251 /* do we need a save destructor here for eval dies? */
2252 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2253 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2254 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2260 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2261 re_scream_pos_data d;
2263 d.scream_olds = &scream_olds;
2264 d.scream_pos = &scream_pos;
2265 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2267 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2268 goto phooey; /* not present */
2274 /* Simplest case: anchored match need be tried only once. */
2275 /* [unless only anchor is BOL and multiline is set] */
2276 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2277 if (s == startpos && regtry(®info, &startpos))
2279 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2280 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2285 dontbother = minlen - 1;
2286 end = HOP3c(strend, -dontbother, strbeg) - 1;
2287 /* for multiline we only have to try after newlines */
2288 if (prog->check_substr || prog->check_utf8) {
2289 /* because of the goto we can not easily reuse the macros for bifurcating the
2290 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2293 goto after_try_utf8;
2295 if (regtry(®info, &s)) {
2302 if (prog->extflags & RXf_USE_INTUIT) {
2303 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2312 } /* end search for check string in unicode */
2314 if (s == startpos) {
2315 goto after_try_latin;
2318 if (regtry(®info, &s)) {
2325 if (prog->extflags & RXf_USE_INTUIT) {
2326 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2335 } /* end search for check string in latin*/
2336 } /* end search for check string */
2337 else { /* search for newline */
2339 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2342 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2343 while (s <= end) { /* note it could be possible to match at the end of the string */
2344 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2345 if (regtry(®info, &s))
2349 } /* end search for newline */
2350 } /* end anchored/multiline check string search */
2352 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2354 /* the warning about reginfo.ganch being used without initialization
2355 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2356 and we only enter this block when the same bit is set. */
2357 char *tmp_s = reginfo.ganch - prog->gofs;
2359 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2364 /* Messy cases: unanchored match. */
2365 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2366 /* we have /x+whatever/ */
2367 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2373 if (! prog->anchored_utf8) {
2374 to_utf8_substr(prog);
2376 ch = SvPVX_const(prog->anchored_utf8)[0];
2379 DEBUG_EXECUTE_r( did_match = 1 );
2380 if (regtry(®info, &s)) goto got_it;
2382 while (s < strend && *s == ch)
2389 if (! prog->anchored_substr) {
2390 if (! to_byte_substr(prog)) {
2391 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2394 ch = SvPVX_const(prog->anchored_substr)[0];
2397 DEBUG_EXECUTE_r( did_match = 1 );
2398 if (regtry(®info, &s)) goto got_it;
2400 while (s < strend && *s == ch)
2405 DEBUG_EXECUTE_r(if (!did_match)
2406 PerlIO_printf(Perl_debug_log,
2407 "Did not find anchored character...\n")
2410 else if (prog->anchored_substr != NULL
2411 || prog->anchored_utf8 != NULL
2412 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2413 && prog->float_max_offset < strend - s)) {
2418 char *last1; /* Last position checked before */
2422 if (prog->anchored_substr || prog->anchored_utf8) {
2424 if (! prog->anchored_utf8) {
2425 to_utf8_substr(prog);
2427 must = prog->anchored_utf8;
2430 if (! prog->anchored_substr) {
2431 if (! to_byte_substr(prog)) {
2432 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2435 must = prog->anchored_substr;
2437 back_max = back_min = prog->anchored_offset;
2440 if (! prog->float_utf8) {
2441 to_utf8_substr(prog);
2443 must = prog->float_utf8;
2446 if (! prog->float_substr) {
2447 if (! to_byte_substr(prog)) {
2448 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2451 must = prog->float_substr;
2453 back_max = prog->float_max_offset;
2454 back_min = prog->float_min_offset;
2460 last = HOP3c(strend, /* Cannot start after this */
2461 -(I32)(CHR_SVLEN(must)
2462 - (SvTAIL(must) != 0) + back_min), strbeg);
2465 last1 = HOPc(s, -1);
2467 last1 = s - 1; /* bogus */
2469 /* XXXX check_substr already used to find "s", can optimize if
2470 check_substr==must. */
2472 dontbother = end_shift;
2473 strend = HOPc(strend, -dontbother);
2474 while ( (s <= last) &&
2475 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2476 (unsigned char*)strend, must,
2477 multiline ? FBMrf_MULTILINE : 0)) ) {
2478 DEBUG_EXECUTE_r( did_match = 1 );
2479 if (HOPc(s, -back_max) > last1) {
2480 last1 = HOPc(s, -back_min);
2481 s = HOPc(s, -back_max);
2484 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2486 last1 = HOPc(s, -back_min);
2490 while (s <= last1) {
2491 if (regtry(®info, &s))
2494 s++; /* to break out of outer loop */
2501 while (s <= last1) {
2502 if (regtry(®info, &s))
2508 DEBUG_EXECUTE_r(if (!did_match) {
2509 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2510 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2511 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2512 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2513 ? "anchored" : "floating"),
2514 quoted, RE_SV_TAIL(must));
2518 else if ( (c = progi->regstclass) ) {
2520 const OPCODE op = OP(progi->regstclass);
2521 /* don't bother with what can't match */
2522 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2523 strend = HOPc(strend, -(minlen - 1));
2526 SV * const prop = sv_newmortal();
2527 regprop(prog, prop, c);
2529 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2531 PerlIO_printf(Perl_debug_log,
2532 "Matching stclass %.*s against %s (%d bytes)\n",
2533 (int)SvCUR(prop), SvPVX_const(prop),
2534 quoted, (int)(strend - s));
2537 if (find_byclass(prog, c, s, strend, ®info))
2539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2543 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2551 if (! prog->float_utf8) {
2552 to_utf8_substr(prog);
2554 float_real = prog->float_utf8;
2557 if (! prog->float_substr) {
2558 if (! to_byte_substr(prog)) {
2559 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2562 float_real = prog->float_substr;
2565 little = SvPV_const(float_real, len);
2566 if (SvTAIL(float_real)) {
2567 /* This means that float_real contains an artificial \n on
2568 * the end due to the presence of something like this:
2569 * /foo$/ where we can match both "foo" and "foo\n" at the
2570 * end of the string. So we have to compare the end of the
2571 * string first against the float_real without the \n and
2572 * then against the full float_real with the string. We
2573 * have to watch out for cases where the string might be
2574 * smaller than the float_real or the float_real without
2576 char *checkpos= strend - len;
2578 PerlIO_printf(Perl_debug_log,
2579 "%sChecking for float_real.%s\n",
2580 PL_colors[4], PL_colors[5]));
2581 if (checkpos + 1 < strbeg) {
2582 /* can't match, even if we remove the trailing \n
2583 * string is too short to match */
2585 PerlIO_printf(Perl_debug_log,
2586 "%sString shorter than required trailing substring, cannot match.%s\n",
2587 PL_colors[4], PL_colors[5]));
2589 } else if (memEQ(checkpos + 1, little, len - 1)) {
2590 /* can match, the end of the string matches without the
2592 last = checkpos + 1;
2593 } else if (checkpos < strbeg) {
2594 /* cant match, string is too short when the "\n" is
2597 PerlIO_printf(Perl_debug_log,
2598 "%sString does not contain required trailing substring, cannot match.%s\n",
2599 PL_colors[4], PL_colors[5]));
2601 } else if (!multiline) {
2602 /* non multiline match, so compare with the "\n" at the
2603 * end of the string */
2604 if (memEQ(checkpos, little, len)) {
2608 PerlIO_printf(Perl_debug_log,
2609 "%sString does not contain required trailing substring, cannot match.%s\n",
2610 PL_colors[4], PL_colors[5]));
2614 /* multiline match, so we have to search for a place
2615 * where the full string is located */
2621 last = rninstr(s, strend, little, little + len);
2623 last = strend; /* matching "$" */
2626 /* at one point this block contained a comment which was
2627 * probably incorrect, which said that this was a "should not
2628 * happen" case. Even if it was true when it was written I am
2629 * pretty sure it is not anymore, so I have removed the comment
2630 * and replaced it with this one. Yves */
2632 PerlIO_printf(Perl_debug_log,
2633 "String does not contain required substring, cannot match.\n"
2637 dontbother = strend - last + prog->float_min_offset;
2639 if (minlen && (dontbother < minlen))
2640 dontbother = minlen - 1;
2641 strend -= dontbother; /* this one's always in bytes! */
2642 /* We don't know much -- general case. */
2645 if (regtry(®info, &s))
2654 if (regtry(®info, &s))
2656 } while (s++ < strend);
2666 PerlIO_printf(Perl_debug_log,
2667 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2673 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2675 if (PL_reg_state.re_state_eval_setup_done)
2676 restore_pos(aTHX_ prog);
2677 if (RXp_PAREN_NAMES(prog))
2678 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2680 /* make sure $`, $&, $', and $digit will work later */
2681 if ( !(flags & REXEC_NOT_FIRST) ) {
2682 if (flags & REXEC_COPY_STR) {
2686 PerlIO_printf(Perl_debug_log,
2687 "Copy on write: regexp capture, type %d\n",
2690 RX_MATCH_COPY_FREE(rx);
2691 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2692 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2693 assert (SvPOKp(prog->saved_copy));
2694 prog->sublen = PL_regeol - strbeg;
2695 prog->suboffset = 0;
2696 prog->subcoffset = 0;
2701 I32 max = PL_regeol - strbeg;
2704 if ( (flags & REXEC_COPY_SKIP_POST)
2705 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2706 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2707 ) { /* don't copy $' part of string */
2710 /* calculate the right-most part of the string covered
2711 * by a capture. Due to look-ahead, this may be to
2712 * the right of $&, so we have to scan all captures */
2713 while (n <= prog->lastparen) {
2714 if (prog->offs[n].end > max)
2715 max = prog->offs[n].end;
2719 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2720 ? prog->offs[0].start
2722 assert(max >= 0 && max <= PL_regeol - strbeg);
2725 if ( (flags & REXEC_COPY_SKIP_PRE)
2726 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2727 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2728 ) { /* don't copy $` part of string */
2731 /* calculate the left-most part of the string covered
2732 * by a capture. Due to look-behind, this may be to
2733 * the left of $&, so we have to scan all captures */
2734 while (min && n <= prog->lastparen) {
2735 if ( prog->offs[n].start != -1
2736 && prog->offs[n].start < min)
2738 min = prog->offs[n].start;
2742 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2743 && min > prog->offs[0].end
2745 min = prog->offs[0].end;
2749 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2752 if (RX_MATCH_COPIED(rx)) {
2753 if (sublen > prog->sublen)
2755 (char*)saferealloc(prog->subbeg, sublen+1);
2758 prog->subbeg = (char*)safemalloc(sublen+1);
2759 Copy(strbeg + min, prog->subbeg, sublen, char);
2760 prog->subbeg[sublen] = '\0';
2761 prog->suboffset = min;
2762 prog->sublen = sublen;
2763 RX_MATCH_COPIED_on(rx);
2765 prog->subcoffset = prog->suboffset;
2766 if (prog->suboffset && utf8_target) {
2767 /* Convert byte offset to chars.
2768 * XXX ideally should only compute this if @-/@+
2769 * has been seen, a la PL_sawampersand ??? */
2771 /* If there's a direct correspondence between the
2772 * string which we're matching and the original SV,
2773 * then we can use the utf8 len cache associated with
2774 * the SV. In particular, it means that under //g,
2775 * sv_pos_b2u() will use the previously cached
2776 * position to speed up working out the new length of
2777 * subcoffset, rather than counting from the start of
2778 * the string each time. This stops
2779 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2780 * from going quadratic */
2781 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2782 sv_pos_b2u(sv, &(prog->subcoffset));
2784 prog->subcoffset = utf8_length((U8*)strbeg,
2785 (U8*)(strbeg+prog->suboffset));
2789 RX_MATCH_COPY_FREE(rx);
2790 prog->subbeg = strbeg;
2791 prog->suboffset = 0;
2792 prog->subcoffset = 0;
2793 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2801 PL_colors[4], PL_colors[5]));
2802 if (PL_reg_state.re_state_eval_setup_done)
2803 restore_pos(aTHX_ prog);
2805 /* we failed :-( roll it back */
2806 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2812 Safefree(prog->offs);
2819 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2820 * Do inc before dec, in case old and new rex are the same */
2821 #define SET_reg_curpm(Re2) \
2822 if (PL_reg_state.re_state_eval_setup_done) { \
2823 (void)ReREFCNT_inc(Re2); \
2824 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2825 PM_SETRE((PL_reg_curpm), (Re2)); \
2830 - regtry - try match at specific point
2832 STATIC I32 /* 0 failure, 1 success */
2833 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2837 REGEXP *const rx = reginfo->prog;
2838 regexp *const prog = ReANY(rx);
2840 RXi_GET_DECL(prog,progi);
2841 GET_RE_DEBUG_FLAGS_DECL;
2843 PERL_ARGS_ASSERT_REGTRY;
2845 reginfo->cutpoint=NULL;
2847 if ((prog->extflags & RXf_EVAL_SEEN)
2848 && !PL_reg_state.re_state_eval_setup_done)
2852 PL_reg_state.re_state_eval_setup_done = TRUE;
2854 /* Make $_ available to executed code. */
2855 if (reginfo->sv != DEFSV) {
2857 DEFSV_set(reginfo->sv);
2860 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2861 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2862 /* prepare for quick setting of pos */
2863 #ifdef PERL_OLD_COPY_ON_WRITE
2864 if (SvIsCOW(reginfo->sv))
2865 sv_force_normal_flags(reginfo->sv, 0);
2867 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2868 &PL_vtbl_mglob, NULL, 0);
2872 PL_reg_oldpos = mg->mg_len;
2873 SAVEDESTRUCTOR_X(restore_pos, prog);
2875 if (!PL_reg_curpm) {
2876 Newxz(PL_reg_curpm, 1, PMOP);
2879 SV* const repointer = &PL_sv_undef;
2880 /* this regexp is also owned by the new PL_reg_curpm, which
2881 will try to free it. */
2882 av_push(PL_regex_padav, repointer);
2883 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2884 PL_regex_pad = AvARRAY(PL_regex_padav);
2889 PL_reg_oldcurpm = PL_curpm;
2890 PL_curpm = PL_reg_curpm;
2891 if (RXp_MATCH_COPIED(prog)) {
2892 /* Here is a serious problem: we cannot rewrite subbeg,
2893 since it may be needed if this match fails. Thus
2894 $` inside (?{}) could fail... */
2895 PL_reg_oldsaved = prog->subbeg;
2896 PL_reg_oldsavedlen = prog->sublen;
2897 PL_reg_oldsavedoffset = prog->suboffset;
2898 PL_reg_oldsavedcoffset = prog->suboffset;
2900 PL_nrs = prog->saved_copy;
2902 RXp_MATCH_COPIED_off(prog);
2905 PL_reg_oldsaved = NULL;
2906 prog->subbeg = PL_bostr;
2907 prog->suboffset = 0;
2908 prog->subcoffset = 0;
2909 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2912 PL_reg_starttry = *startposp;
2914 prog->offs[0].start = *startposp - PL_bostr;
2915 prog->lastparen = 0;
2916 prog->lastcloseparen = 0;
2918 /* XXXX What this code is doing here?!!! There should be no need
2919 to do this again and again, prog->lastparen should take care of
2922 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2923 * Actually, the code in regcppop() (which Ilya may be meaning by
2924 * prog->lastparen), is not needed at all by the test suite
2925 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2926 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2927 * Meanwhile, this code *is* needed for the
2928 * above-mentioned test suite tests to succeed. The common theme
2929 * on those tests seems to be returning null fields from matches.
2930 * --jhi updated by dapm */
2932 if (prog->nparens) {
2933 regexp_paren_pair *pp = prog->offs;
2935 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2943 result = regmatch(reginfo, *startposp, progi->program + 1);
2945 prog->offs[0].end = result;
2948 if (reginfo->cutpoint)
2949 *startposp= reginfo->cutpoint;
2950 REGCP_UNWIND(lastcp);
2955 #define sayYES goto yes
2956 #define sayNO goto no
2957 #define sayNO_SILENT goto no_silent
2959 /* we dont use STMT_START/END here because it leads to
2960 "unreachable code" warnings, which are bogus, but distracting. */
2961 #define CACHEsayNO \
2962 if (ST.cache_mask) \
2963 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2966 /* this is used to determine how far from the left messages like
2967 'failed...' are printed. It should be set such that messages
2968 are inline with the regop output that created them.
2970 #define REPORT_CODE_OFF 32
2973 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2974 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2975 #define CHRTEST_NOT_A_CP_1 -999
2976 #define CHRTEST_NOT_A_CP_2 -998
2978 #define SLAB_FIRST(s) (&(s)->states[0])
2979 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2981 /* grab a new slab and return the first slot in it */
2983 STATIC regmatch_state *
2986 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2989 regmatch_slab *s = PL_regmatch_slab->next;
2991 Newx(s, 1, regmatch_slab);
2992 s->prev = PL_regmatch_slab;
2994 PL_regmatch_slab->next = s;
2996 PL_regmatch_slab = s;
2997 return SLAB_FIRST(s);
3001 /* push a new state then goto it */
3003 #define PUSH_STATE_GOTO(state, node, input) \
3004 pushinput = input; \
3006 st->resume_state = state; \
3009 /* push a new state with success backtracking, then goto it */
3011 #define PUSH_YES_STATE_GOTO(state, node, input) \
3012 pushinput = input; \
3014 st->resume_state = state; \
3015 goto push_yes_state;
3022 regmatch() - main matching routine
3024 This is basically one big switch statement in a loop. We execute an op,
3025 set 'next' to point the next op, and continue. If we come to a point which
3026 we may need to backtrack to on failure such as (A|B|C), we push a
3027 backtrack state onto the backtrack stack. On failure, we pop the top
3028 state, and re-enter the loop at the state indicated. If there are no more
3029 states to pop, we return failure.
3031 Sometimes we also need to backtrack on success; for example /A+/, where
3032 after successfully matching one A, we need to go back and try to
3033 match another one; similarly for lookahead assertions: if the assertion
3034 completes successfully, we backtrack to the state just before the assertion
3035 and then carry on. In these cases, the pushed state is marked as
3036 'backtrack on success too'. This marking is in fact done by a chain of
3037 pointers, each pointing to the previous 'yes' state. On success, we pop to
3038 the nearest yes state, discarding any intermediate failure-only states.
3039 Sometimes a yes state is pushed just to force some cleanup code to be
3040 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3041 it to free the inner regex.
3043 Note that failure backtracking rewinds the cursor position, while
3044 success backtracking leaves it alone.
3046 A pattern is complete when the END op is executed, while a subpattern
3047 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3048 ops trigger the "pop to last yes state if any, otherwise return true"
3051 A common convention in this function is to use A and B to refer to the two
3052 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3053 the subpattern to be matched possibly multiple times, while B is the entire
3054 rest of the pattern. Variable and state names reflect this convention.
3056 The states in the main switch are the union of ops and failure/success of
3057 substates associated with with that op. For example, IFMATCH is the op
3058 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3059 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3060 successfully matched A and IFMATCH_A_fail is a state saying that we have
3061 just failed to match A. Resume states always come in pairs. The backtrack
3062 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3063 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3064 on success or failure.
3066 The struct that holds a backtracking state is actually a big union, with
3067 one variant for each major type of op. The variable st points to the
3068 top-most backtrack struct. To make the code clearer, within each
3069 block of code we #define ST to alias the relevant union.
3071 Here's a concrete example of a (vastly oversimplified) IFMATCH
3077 #define ST st->u.ifmatch
3079 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3080 ST.foo = ...; // some state we wish to save
3082 // push a yes backtrack state with a resume value of
3083 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3085 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3088 case IFMATCH_A: // we have successfully executed A; now continue with B
3090 bar = ST.foo; // do something with the preserved value
3093 case IFMATCH_A_fail: // A failed, so the assertion failed
3094 ...; // do some housekeeping, then ...
3095 sayNO; // propagate the failure
3102 For any old-timers reading this who are familiar with the old recursive
3103 approach, the code above is equivalent to:
3105 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3114 ...; // do some housekeeping, then ...
3115 sayNO; // propagate the failure
3118 The topmost backtrack state, pointed to by st, is usually free. If you
3119 want to claim it, populate any ST.foo fields in it with values you wish to
3120 save, then do one of
3122 PUSH_STATE_GOTO(resume_state, node, newinput);
3123 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3125 which sets that backtrack state's resume value to 'resume_state', pushes a
3126 new free entry to the top of the backtrack stack, then goes to 'node'.
3127 On backtracking, the free slot is popped, and the saved state becomes the
3128 new free state. An ST.foo field in this new top state can be temporarily
3129 accessed to retrieve values, but once the main loop is re-entered, it
3130 becomes available for reuse.
3132 Note that the depth of the backtrack stack constantly increases during the
3133 left-to-right execution of the pattern, rather than going up and down with
3134 the pattern nesting. For example the stack is at its maximum at Z at the
3135 end of the pattern, rather than at X in the following:
3137 /(((X)+)+)+....(Y)+....Z/
3139 The only exceptions to this are lookahead/behind assertions and the cut,
3140 (?>A), which pop all the backtrack states associated with A before
3143 Backtrack state structs are allocated in slabs of about 4K in size.
3144 PL_regmatch_state and st always point to the currently active state,
3145 and PL_regmatch_slab points to the slab currently containing
3146 PL_regmatch_state. The first time regmatch() is called, the first slab is
3147 allocated, and is never freed until interpreter destruction. When the slab
3148 is full, a new one is allocated and chained to the end. At exit from
3149 regmatch(), slabs allocated since entry are freed.
3154 #define DEBUG_STATE_pp(pp) \
3156 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3157 PerlIO_printf(Perl_debug_log, \
3158 " %*s"pp" %s%s%s%s%s\n", \
3160 PL_reg_name[st->resume_state], \
3161 ((st==yes_state||st==mark_state) ? "[" : ""), \
3162 ((st==yes_state) ? "Y" : ""), \
3163 ((st==mark_state) ? "M" : ""), \
3164 ((st==yes_state||st==mark_state) ? "]" : "") \
3169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3174 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3175 const char *start, const char *end, const char *blurb)
3177 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3179 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3184 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3185 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3187 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3188 start, end - start, 60);
3190 PerlIO_printf(Perl_debug_log,
3191 "%s%s REx%s %s against %s\n",
3192 PL_colors[4], blurb, PL_colors[5], s0, s1);
3194 if (utf8_target||utf8_pat)
3195 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3196 utf8_pat ? "pattern" : "",
3197 utf8_pat && utf8_target ? " and " : "",
3198 utf8_target ? "string" : ""
3204 S_dump_exec_pos(pTHX_ const char *locinput,
3205 const regnode *scan,
3206 const char *loc_regeol,
3207 const char *loc_bostr,
3208 const char *loc_reg_starttry,
3209 const bool utf8_target)
3211 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3212 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3213 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3214 /* The part of the string before starttry has one color
3215 (pref0_len chars), between starttry and current
3216 position another one (pref_len - pref0_len chars),
3217 after the current position the third one.
3218 We assume that pref0_len <= pref_len, otherwise we
3219 decrease pref0_len. */
3220 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3221 ? (5 + taill) - l : locinput - loc_bostr;
3224 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3226 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3228 pref0_len = pref_len - (locinput - loc_reg_starttry);
3229 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3230 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3231 ? (5 + taill) - pref_len : loc_regeol - locinput);
3232 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3236 if (pref0_len > pref_len)
3237 pref0_len = pref_len;
3239 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3241 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3242 (locinput - pref_len),pref0_len, 60, 4, 5);
3244 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3245 (locinput - pref_len + pref0_len),
3246 pref_len - pref0_len, 60, 2, 3);
3248 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3249 locinput, loc_regeol - locinput, 10, 0, 1);
3251 const STRLEN tlen=len0+len1+len2;
3252 PerlIO_printf(Perl_debug_log,
3253 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3254 (IV)(locinput - loc_bostr),
3257 (docolor ? "" : "> <"),
3259 (int)(tlen > 19 ? 0 : 19 - tlen),
3266 /* reg_check_named_buff_matched()
3267 * Checks to see if a named buffer has matched. The data array of
3268 * buffer numbers corresponding to the buffer is expected to reside
3269 * in the regexp->data->data array in the slot stored in the ARG() of
3270 * node involved. Note that this routine doesn't actually care about the
3271 * name, that information is not preserved from compilation to execution.
3272 * Returns the index of the leftmost defined buffer with the given name
3273 * or 0 if non of the buffers matched.
3276 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3279 RXi_GET_DECL(rex,rexi);
3280 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3281 I32 *nums=(I32*)SvPVX(sv_dat);
3283 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3285 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3286 if ((I32)rex->lastparen >= nums[n] &&
3287 rex->offs[nums[n]].end != -1)
3296 /* free all slabs above current one - called during LEAVE_SCOPE */
3299 S_clear_backtrack_stack(pTHX_ void *p)
3301 regmatch_slab *s = PL_regmatch_slab->next;
3306 PL_regmatch_slab->next = NULL;
3308 regmatch_slab * const osl = s;
3314 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3316 /* This function determines if there are one or two characters that match
3317 * the first character of the passed-in EXACTish node <text_node>, and if
3318 * so, returns them in the passed-in pointers.
3320 * If it determines that no possible character in the target string can
3321 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3322 * the first character in <text_node> requires UTF-8 to represent, and the
3323 * target string isn't in UTF-8.)
3325 * If there are more than two characters that could match the beginning of
3326 * <text_node>, or if more context is required to determine a match or not,
3327 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3329 * The motiviation behind this function is to allow the caller to set up
3330 * tight loops for matching. If <text_node> is of type EXACT, there is
3331 * only one possible character that can match its first character, and so
3332 * the situation is quite simple. But things get much more complicated if
3333 * folding is involved. It may be that the first character of an EXACTFish
3334 * node doesn't participate in any possible fold, e.g., punctuation, so it
3335 * can be matched only by itself. The vast majority of characters that are
3336 * in folds match just two things, their lower and upper-case equivalents.
3337 * But not all are like that; some have multiple possible matches, or match
3338 * sequences of more than one character. This function sorts all that out.
3340 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3341 * loop of trying to match A*, we know we can't exit where the thing
3342 * following it isn't a B. And something can't be a B unless it is the
3343 * beginning of B. By putting a quick test for that beginning in a tight
3344 * loop, we can rule out things that can't possibly be B without having to
3345 * break out of the loop, thus avoiding work. Similarly, if A is a single
3346 * character, we can make a tight loop matching A*, using the outputs of
3349 * If the target string to match isn't in UTF-8, and there aren't
3350 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3351 * the one or two possible octets (which are characters in this situation)
3352 * that can match. In all cases, if there is only one character that can
3353 * match, *<c1p> and *<c2p> will be identical.
3355 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3356 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3357 * can match the beginning of <text_node>. They should be declared with at
3358 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3359 * undefined what these contain.) If one or both of the buffers are
3360 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3361 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3362 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3363 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3364 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3366 const bool utf8_target = PL_reg_match_utf8;
3368 UV c1 = CHRTEST_NOT_A_CP_1;
3369 UV c2 = CHRTEST_NOT_A_CP_2;
3370 bool use_chrtest_void = FALSE;
3372 /* Used when we have both utf8 input and utf8 output, to avoid converting
3373 * to/from code points */
3374 bool utf8_has_been_setup = FALSE;
3378 U8 *pat = (U8*)STRING(text_node);
3380 if (OP(text_node) == EXACT) {
3382 /* In an exact node, only one thing can be matched, that first
3383 * character. If both the pat and the target are UTF-8, we can just
3384 * copy the input to the output, avoiding finding the code point of
3386 if (! UTF_PATTERN) {
3389 else if (utf8_target) {
3390 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3391 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3392 utf8_has_been_setup = TRUE;
3395 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3398 else /* an EXACTFish node */
3400 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3401 pat + STR_LEN(text_node)))
3403 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3404 pat + STR_LEN(text_node))))
3406 /* Multi-character folds require more context to sort out. Also
3407 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3408 * handled outside this routine */
3409 use_chrtest_void = TRUE;
3411 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3412 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3414 /* Load the folds hash, if not already done */
3416 if (! PL_utf8_foldclosures) {
3417 if (! PL_utf8_tofold) {
3418 U8 dummy[UTF8_MAXBYTES+1];
3420 /* Force loading this by folding an above-Latin1 char */
3421 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3422 assert(PL_utf8_tofold); /* Verify that worked */
3424 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3427 /* The fold closures data structure is a hash with the keys being
3428 * the UTF-8 of every character that is folded to, like 'k', and
3429 * the values each an array of all code points that fold to its
3430 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3432 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3437 /* Not found in the hash, therefore there are no folds
3438 * containing it, so there is only a single character that
3442 else { /* Does participate in folds */
3443 AV* list = (AV*) *listp;
3444 if (av_len(list) != 1) {
3446 /* If there aren't exactly two folds to this, it is outside
3447 * the scope of this function */
3448 use_chrtest_void = TRUE;
3450 else { /* There are two. Get them */
3451 SV** c_p = av_fetch(list, 0, FALSE);
3453 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3457 c_p = av_fetch(list, 1, FALSE);
3459 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3463 /* Folds that cross the 255/256 boundary are forbidden if
3464 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3465 * pattern character is above 256, and its only other match
3466 * is below 256, the only legal match will be to itself.
3467 * We have thrown away the original, so have to compute
3468 * which is the one above 255 */
3469 if ((c1 < 256) != (c2 < 256)) {
3470 if (OP(text_node) == EXACTFL
3471 || (OP(text_node) == EXACTFA
3472 && (isASCII(c1) || isASCII(c2))))
3485 else /* Here, c1 is < 255 */
3487 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3488 && OP(text_node) != EXACTFL
3489 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3491 /* Here, there could be something above Latin1 in the target which
3492 * folds to this character in the pattern. All such cases except
3493 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3494 * involved in their folds, so are outside the scope of this
3496 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3497 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3500 use_chrtest_void = TRUE;
3503 else { /* Here nothing above Latin1 can fold to the pattern character */
3504 switch (OP(text_node)) {
3506 case EXACTFL: /* /l rules */
3507 c2 = PL_fold_locale[c1];
3511 if (! utf8_target) { /* /d rules */
3516 /* /u rules for all these. This happens to work for
3517 * EXACTFA as nothing in Latin1 folds to ASCII */
3519 case EXACTFU_TRICKYFOLD:
3522 c2 = PL_fold_latin1[c1];
3526 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3527 assert(0); /* NOTREACHED */
3532 /* Here have figured things out. Set up the returns */
3533 if (use_chrtest_void) {
3534 *c2p = *c1p = CHRTEST_VOID;
3536 else if (utf8_target) {
3537 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3538 uvchr_to_utf8(c1_utf8, c1);
3539 uvchr_to_utf8(c2_utf8, c2);
3542 /* Invariants are stored in both the utf8 and byte outputs; Use
3543 * negative numbers otherwise for the byte ones. Make sure that the
3544 * byte ones are the same iff the utf8 ones are the same */
3545 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3546 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3549 ? CHRTEST_NOT_A_CP_1
3550 : CHRTEST_NOT_A_CP_2;
3552 else if (c1 > 255) {
3553 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3558 *c1p = *c2p = c2; /* c2 is the only representable value */
3560 else { /* c1 is representable; see about c2 */
3562 *c2p = (c2 < 256) ? c2 : c1;
3568 /* returns -1 on failure, $+[0] on success */
3570 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3572 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3576 const bool utf8_target = PL_reg_match_utf8;
3577 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3578 REGEXP *rex_sv = reginfo->prog;
3579 regexp *rex = ReANY(rex_sv);
3580 RXi_GET_DECL(rex,rexi);
3582 /* the current state. This is a cached copy of PL_regmatch_state */
3584 /* cache heavy used fields of st in registers */
3587 U32 n = 0; /* general value; init to avoid compiler warning */
3588 I32 ln = 0; /* len or last; init to avoid compiler warning */
3589 char *locinput = startpos;
3590 char *pushinput; /* where to continue after a PUSH */
3591 I32 nextchr; /* is always set to UCHARAT(locinput) */
3593 bool result = 0; /* return value of S_regmatch */
3594 int depth = 0; /* depth of backtrack stack */
3595 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3596 const U32 max_nochange_depth =
3597 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3598 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3599 regmatch_state *yes_state = NULL; /* state to pop to on success of
3601 /* mark_state piggy backs on the yes_state logic so that when we unwind
3602 the stack on success we can update the mark_state as we go */
3603 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3604 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3605 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3607 bool no_final = 0; /* prevent failure from backtracking? */
3608 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3609 char *startpoint = locinput;
3610 SV *popmark = NULL; /* are we looking for a mark? */
3611 SV *sv_commit = NULL; /* last mark name seen in failure */
3612 SV *sv_yes_mark = NULL; /* last mark name we have seen
3613 during a successful match */
3614 U32 lastopen = 0; /* last open we saw */
3615 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3616 SV* const oreplsv = GvSV(PL_replgv);
3617 /* these three flags are set by various ops to signal information to
3618 * the very next op. They have a useful lifetime of exactly one loop
3619 * iteration, and are not preserved or restored by state pushes/pops
3621 bool sw = 0; /* the condition value in (?(cond)a|b) */
3622 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3623 int logical = 0; /* the following EVAL is:
3627 or the following IFMATCH/UNLESSM is:
3628 false: plain (?=foo)
3629 true: used as a condition: (?(?=foo))
3631 PAD* last_pad = NULL;
3633 I32 gimme = G_SCALAR;
3634 CV *caller_cv = NULL; /* who called us */
3635 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3636 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3637 U32 maxopenparen = 0; /* max '(' index seen so far */
3640 GET_RE_DEBUG_FLAGS_DECL;
3643 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3644 multicall_oldcatch = 0;
3645 multicall_cv = NULL;
3647 PERL_UNUSED_VAR(multicall_cop);
3648 PERL_UNUSED_VAR(newsp);
3651 PERL_ARGS_ASSERT_REGMATCH;
3653 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3654 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3656 /* on first ever call to regmatch, allocate first slab */
3657 if (!PL_regmatch_slab) {
3658 Newx(PL_regmatch_slab, 1, regmatch_slab);
3659 PL_regmatch_slab->prev = NULL;
3660 PL_regmatch_slab->next = NULL;
3661 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3664 oldsave = PL_savestack_ix;
3665 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3666 SAVEVPTR(PL_regmatch_slab);
3667 SAVEVPTR(PL_regmatch_state);
3669 /* grab next free state slot */
3670 st = ++PL_regmatch_state;
3671 if (st > SLAB_LAST(PL_regmatch_slab))
3672 st = PL_regmatch_state = S_push_slab(aTHX);
3674 /* Note that nextchr is a byte even in UTF */
3677 while (scan != NULL) {
3680 SV * const prop = sv_newmortal();
3681 regnode *rnext=regnext(scan);
3682 DUMP_EXEC_POS( locinput, scan, utf8_target );
3683 regprop(rex, prop, scan);
3685 PerlIO_printf(Perl_debug_log,
3686 "%3"IVdf":%*s%s(%"IVdf")\n",
3687 (IV)(scan - rexi->program), depth*2, "",
3689 (PL_regkind[OP(scan)] == END || !rnext) ?
3690 0 : (IV)(rnext - rexi->program));
3693 next = scan + NEXT_OFF(scan);
3696 state_num = OP(scan);
3701 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3703 switch (state_num) {
3704 case BOL: /* /^../ */
3705 if (locinput == PL_bostr)
3707 /* reginfo->till = reginfo->bol; */
3712 case MBOL: /* /^../m */
3713 if (locinput == PL_bostr ||
3714 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3720 case SBOL: /* /^../s */
3721 if (locinput == PL_bostr)
3726 if (locinput == reginfo->ganch)
3730 case KEEPS: /* \K */
3731 /* update the startpoint */
3732 st->u.keeper.val = rex->offs[0].start;
3733 rex->offs[0].start = locinput - PL_bostr;
3734 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3735 assert(0); /*NOTREACHED*/
3736 case KEEPS_next_fail:
3737 /* rollback the start point change */
3738 rex->offs[0].start = st->u.keeper.val;
3740 assert(0); /*NOTREACHED*/
3742 case EOL: /* /..$/ */
3745 case MEOL: /* /..$/m */
3746 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3750 case SEOL: /* /..$/s */
3752 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3754 if (PL_regeol - locinput > 1)
3759 if (!NEXTCHR_IS_EOS)
3763 case SANY: /* /./s */
3766 goto increment_locinput;
3774 case REG_ANY: /* /./ */
3775 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3777 goto increment_locinput;
3781 #define ST st->u.trie
3782 case TRIEC: /* (ab|cd) with known charclass */
3783 /* In this case the charclass data is available inline so
3784 we can fail fast without a lot of extra overhead.
3786 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3788 PerlIO_printf(Perl_debug_log,
3789 "%*s %sfailed to match trie start class...%s\n",
3790 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3793 assert(0); /* NOTREACHED */
3796 case TRIE: /* (ab|cd) */
3797 /* the basic plan of execution of the trie is:
3798 * At the beginning, run though all the states, and
3799 * find the longest-matching word. Also remember the position
3800 * of the shortest matching word. For example, this pattern:
3803 * when matched against the string "abcde", will generate
3804 * accept states for all words except 3, with the longest
3805 * matching word being 4, and the shortest being 2 (with
3806 * the position being after char 1 of the string).
3808 * Then for each matching word, in word order (i.e. 1,2,4,5),
3809 * we run the remainder of the pattern; on each try setting
3810 * the current position to the character following the word,
3811 * returning to try the next word on failure.
3813 * We avoid having to build a list of words at runtime by
3814 * using a compile-time structure, wordinfo[].prev, which
3815 * gives, for each word, the previous accepting word (if any).
3816 * In the case above it would contain the mappings 1->2, 2->0,
3817 * 3->0, 4->5, 5->1. We can use this table to generate, from
3818 * the longest word (4 above), a list of all words, by
3819 * following the list of prev pointers; this gives us the
3820 * unordered list 4,5,1,2. Then given the current word we have
3821 * just tried, we can go through the list and find the
3822 * next-biggest word to try (so if we just failed on word 2,
3823 * the next in the list is 4).
3825 * Since at runtime we don't record the matching position in
3826 * the string for each word, we have to work that out for
3827 * each word we're about to process. The wordinfo table holds
3828 * the character length of each word; given that we recorded
3829 * at the start: the position of the shortest word and its
3830 * length in chars, we just need to move the pointer the
3831 * difference between the two char lengths. Depending on
3832 * Unicode status and folding, that's cheap or expensive.
3834 * This algorithm is optimised for the case where are only a
3835 * small number of accept states, i.e. 0,1, or maybe 2.
3836 * With lots of accepts states, and having to try all of them,
3837 * it becomes quadratic on number of accept states to find all
3842 /* what type of TRIE am I? (utf8 makes this contextual) */
3843 DECL_TRIE_TYPE(scan);
3845 /* what trie are we using right now */
3846 reg_trie_data * const trie
3847 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3848 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3849 U32 state = trie->startstate;
3852 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3854 if (trie->states[ state ].wordnum) {
3856 PerlIO_printf(Perl_debug_log,
3857 "%*s %smatched empty string...%s\n",
3858 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3864 PerlIO_printf(Perl_debug_log,
3865 "%*s %sfailed to match trie start class...%s\n",
3866 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3873 U8 *uc = ( U8* )locinput;
3877 U8 *uscan = (U8*)NULL;
3878 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3879 U32 charcount = 0; /* how many input chars we have matched */
3880 U32 accepted = 0; /* have we seen any accepting states? */
3882 ST.jump = trie->jump;
3885 ST.longfold = FALSE; /* char longer if folded => it's harder */
3888 /* fully traverse the TRIE; note the position of the
3889 shortest accept state and the wordnum of the longest
3892 while ( state && uc <= (U8*)PL_regeol ) {
3893 U32 base = trie->states[ state ].trans.base;
3897 wordnum = trie->states[ state ].wordnum;
3899 if (wordnum) { /* it's an accept state */
3902 /* record first match position */
3904 ST.firstpos = (U8*)locinput;
3909 ST.firstchars = charcount;
3912 if (!ST.nextword || wordnum < ST.nextword)
3913 ST.nextword = wordnum;
3914 ST.topword = wordnum;
3917 DEBUG_TRIE_EXECUTE_r({
3918 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3919 PerlIO_printf( Perl_debug_log,
3920 "%*s %sState: %4"UVxf" Accepted: %c ",
3921 2+depth * 2, "", PL_colors[4],
3922 (UV)state, (accepted ? 'Y' : 'N'));
3925 /* read a char and goto next state */
3926 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3928 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3929 uscan, len, uvc, charid, foldlen,
3936 base + charid - 1 - trie->uniquecharcount)) >= 0)
3938 && ((U32)offset < trie->lasttrans)
3939 && trie->trans[offset].check == state)
3941 state = trie->trans[offset].next;
3952 DEBUG_TRIE_EXECUTE_r(
3953 PerlIO_printf( Perl_debug_log,
3954 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3955 charid, uvc, (UV)state, PL_colors[5] );
3961 /* calculate total number of accept states */
3966 w = trie->wordinfo[w].prev;
3969 ST.accepted = accepted;
3973 PerlIO_printf( Perl_debug_log,
3974 "%*s %sgot %"IVdf" possible matches%s\n",
3975 REPORT_CODE_OFF + depth * 2, "",
3976 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3978 goto trie_first_try; /* jump into the fail handler */
3980 assert(0); /* NOTREACHED */
3982 case TRIE_next_fail: /* we failed - try next alternative */
3986 REGCP_UNWIND(ST.cp);
3987 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3989 if (!--ST.accepted) {
3991 PerlIO_printf( Perl_debug_log,
3992 "%*s %sTRIE failed...%s\n",
3993 REPORT_CODE_OFF+depth*2, "",
4000 /* Find next-highest word to process. Note that this code
4001 * is O(N^2) per trie run (O(N) per branch), so keep tight */
4004 U16 const nextword = ST.nextword;
4005 reg_trie_wordinfo * const wordinfo
4006 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4007 for (word=ST.topword; word; word=wordinfo[word].prev) {
4008 if (word > nextword && (!min || word < min))
4021 ST.lastparen = rex->lastparen;
4022 ST.lastcloseparen = rex->lastcloseparen;
4026 /* find start char of end of current word */
4028 U32 chars; /* how many chars to skip */
4029 reg_trie_data * const trie
4030 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4032 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4034 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4039 /* the hard option - fold each char in turn and find
4040 * its folded length (which may be different */
4041 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4049 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4057 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4062 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4078 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4079 ? ST.jump[ST.nextword]
4083 PerlIO_printf( Perl_debug_log,
4084 "%*s %sTRIE matched word #%d, continuing%s\n",
4085 REPORT_CODE_OFF+depth*2, "",
4092 if (ST.accepted > 1 || has_cutgroup) {
4093 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4094 assert(0); /* NOTREACHED */
4096 /* only one choice left - just continue */
4098 AV *const trie_words
4099 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4100 SV ** const tmp = av_fetch( trie_words,
4102 SV *sv= tmp ? sv_newmortal() : NULL;
4104 PerlIO_printf( Perl_debug_log,
4105 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4106 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4108 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4109 PL_colors[0], PL_colors[1],
4110 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4112 : "not compiled under -Dr",
4116 locinput = (char*)uc;
4117 continue; /* execute rest of RE */
4118 assert(0); /* NOTREACHED */
4122 case EXACT: { /* /abc/ */
4123 char *s = STRING(scan);
4125 if (utf8_target != UTF_PATTERN) {
4126 /* The target and the pattern have differing utf8ness. */
4128 const char * const e = s + ln;
4131 /* The target is utf8, the pattern is not utf8.
4132 * Above-Latin1 code points can't match the pattern;
4133 * invariants match exactly, and the other Latin1 ones need
4134 * to be downgraded to a single byte in order to do the
4135 * comparison. (If we could be confident that the target
4136 * is not malformed, this could be refactored to have fewer
4137 * tests by just assuming that if the first bytes match, it
4138 * is an invariant, but there are tests in the test suite
4139 * dealing with (??{...}) which violate this) */
4143 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4146 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4153 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4162 /* The target is not utf8, the pattern is utf8. */
4164 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4168 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4175 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4186 /* The target and the pattern have the same utf8ness. */
4187 /* Inline the first character, for speed. */
4188 if (UCHARAT(s) != nextchr)
4190 if (PL_regeol - locinput < ln)
4192 if (ln > 1 && memNE(s, locinput, ln))
4198 case EXACTFL: { /* /abc/il */
4200 const U8 * fold_array;
4202 U32 fold_utf8_flags;
4204 PL_reg_flags |= RF_tainted;
4205 folder = foldEQ_locale;
4206 fold_array = PL_fold_locale;
4207 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4210 case EXACTFU_SS: /* /\x{df}/iu */
4211 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4212 case EXACTFU: /* /abc/iu */
4213 folder = foldEQ_latin1;
4214 fold_array = PL_fold_latin1;
4215 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4218 case EXACTFA: /* /abc/iaa */
4219 folder = foldEQ_latin1;
4220 fold_array = PL_fold_latin1;
4221 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4224 case EXACTF: /* /abc/i */
4226 fold_array = PL_fold;
4227 fold_utf8_flags = 0;
4233 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4234 /* Either target or the pattern are utf8, or has the issue where
4235 * the fold lengths may differ. */
4236 const char * const l = locinput;
4237 char *e = PL_regeol;
4239 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4240 l, &e, 0, utf8_target, fold_utf8_flags))
4248 /* Neither the target nor the pattern are utf8 */
4249 if (UCHARAT(s) != nextchr
4251 && UCHARAT(s) != fold_array[nextchr])
4255 if (PL_regeol - locinput < ln)
4257 if (ln > 1 && ! folder(s, locinput, ln))
4263 /* XXX Could improve efficiency by separating these all out using a
4264 * macro or in-line function. At that point regcomp.c would no longer
4265 * have to set the FLAGS fields of these */
4266 case BOUNDL: /* /\b/l */
4267 case NBOUNDL: /* /\B/l */
4268 PL_reg_flags |= RF_tainted;
4270 case BOUND: /* /\b/ */
4271 case BOUNDU: /* /\b/u */
4272 case BOUNDA: /* /\b/a */
4273 case NBOUND: /* /\B/ */
4274 case NBOUNDU: /* /\B/u */
4275 case NBOUNDA: /* /\B/a */
4276 /* was last char in word? */
4278 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4279 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4281 if (locinput == PL_bostr)
4284 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4286 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4288 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4289 ln = isALNUM_uni(ln);
4293 LOAD_UTF8_CHARCLASS_ALNUM();
4294 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4299 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4300 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4305 /* Here the string isn't utf8, or is utf8 and only ascii
4306 * characters are to match \w. In the latter case looking at
4307 * the byte just prior to the current one may be just the final
4308 * byte of a multi-byte character. This is ok. There are two
4310 * 1) it is a single byte character, and then the test is doing
4311 * just what it's supposed to.
4312 * 2) it is a multi-byte character, in which case the final
4313 * byte is never mistakable for ASCII, and so the test
4314 * will say it is not a word character, which is the
4315 * correct answer. */
4316 ln = (locinput != PL_bostr) ?
4317 UCHARAT(locinput - 1) : '\n';
4318 switch (FLAGS(scan)) {
4319 case REGEX_UNICODE_CHARSET:
4320 ln = isWORDCHAR_L1(ln);
4321 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4323 case REGEX_LOCALE_CHARSET:
4324 ln = isALNUM_LC(ln);
4325 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4327 case REGEX_DEPENDS_CHARSET:
4329 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4331 case REGEX_ASCII_RESTRICTED_CHARSET:
4332 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4333 ln = isWORDCHAR_A(ln);
4334 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4337 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4341 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4343 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4347 case ANYOF: /* /[abc]/ */
4351 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4353 locinput += UTF8SKIP(locinput);
4357 if (!REGINCLASS(rex, scan, (U8*)locinput))
4364 /* Special char classes: \d, \w etc.
4365 * The defines start on line 166 or so */
4366 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4367 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4368 ALNUMU, NALNUMU, isWORDCHAR_L1,
4369 ALNUMA, NALNUMA, isWORDCHAR_A,
4373 PL_reg_flags |= RF_tainted;
4374 if (NEXTCHR_IS_EOS) {
4377 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4378 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4382 else if (! isSPACE_LC((U8) nextchr)) {
4385 goto increment_locinput;
4388 PL_reg_flags |= RF_tainted;
4389 if (NEXTCHR_IS_EOS) {
4392 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4393 if (isSPACE_LC_utf8((U8 *) locinput)) {
4397 else if (isSPACE_LC(nextchr)) {
4400 goto increment_locinput;
4408 if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4411 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4421 if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4424 goto increment_locinput;
4428 if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4431 goto increment_locinput;
4435 if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4438 goto increment_locinput;
4440 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4441 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4442 DIGITA, NDIGITA, isDIGIT_A,
4445 case POSIXA: /* /[[:ascii:]]/ etc */
4446 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4449 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4453 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4454 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4457 goto increment_locinput;
4459 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4460 a Unicode extended Grapheme Cluster */
4461 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4462 extended Grapheme Cluster is:
4465 | Prepend* Begin Extend*
4468 Begin is: ( Special_Begin | ! Control )
4469 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4470 Extend is: ( Grapheme_Extend | Spacing_Mark )
4471 Control is: [ GCB_Control | CR | LF ]
4472 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4474 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4477 Begin is ( Regular_Begin + Special Begin )
4479 It turns out that 98.4% of all Unicode code points match
4480 Regular_Begin. Doing it this way eliminates a table match in
4481 the previous implementation for almost all Unicode code points.
4483 There is a subtlety with Prepend* which showed up in testing.
4484 Note that the Begin, and only the Begin is required in:
4485 | Prepend* Begin Extend*
4486 Also, Begin contains '! Control'. A Prepend must be a
4487 '! Control', which means it must also be a Begin. What it
4488 comes down to is that if we match Prepend* and then find no
4489 suitable Begin afterwards, that if we backtrack the last
4490 Prepend, that one will be a suitable Begin.
4495 if (! utf8_target) {
4497 /* Match either CR LF or '.', as all the other possibilities
4499 locinput++; /* Match the . or CR */
4500 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4502 && locinput < PL_regeol
4503 && UCHARAT(locinput) == '\n')
4510 /* Utf8: See if is ( CR LF ); already know that locinput <
4511 * PL_regeol, so locinput+1 is in bounds */
4512 if ( nextchr == '\r' && locinput+1 < PL_regeol
4513 && UCHARAT(locinput + 1) == '\n')
4520 /* In case have to backtrack to beginning, then match '.' */
4521 char *starting = locinput;
4523 /* In case have to backtrack the last prepend */
4524 char *previous_prepend = NULL;
4526 LOAD_UTF8_CHARCLASS_GCB();
4528 /* Match (prepend)* */
4529 while (locinput < PL_regeol
4530 && (len = is_GCB_Prepend_utf8(locinput)))
4532 previous_prepend = locinput;
4536 /* As noted above, if we matched a prepend character, but
4537 * the next thing won't match, back off the last prepend we
4538 * matched, as it is guaranteed to match the begin */
4539 if (previous_prepend
4540 && (locinput >= PL_regeol
4541 || (! swash_fetch(PL_utf8_X_regular_begin,
4542 (U8*)locinput, utf8_target)
4543 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4546 locinput = previous_prepend;
4549 /* Note that here we know PL_regeol > locinput, as we
4550 * tested that upon input to this switch case, and if we
4551 * moved locinput forward, we tested the result just above
4552 * and it either passed, or we backed off so that it will
4554 if (swash_fetch(PL_utf8_X_regular_begin,
4555 (U8*)locinput, utf8_target)) {
4556 locinput += UTF8SKIP(locinput);
4558 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4560 /* Here did not match the required 'Begin' in the
4561 * second term. So just match the very first
4562 * character, the '.' of the final term of the regex */
4563 locinput = starting + UTF8SKIP(starting);
4567 /* Here is a special begin. It can be composed of
4568 * several individual characters. One possibility is
4570 if ((len = is_GCB_RI_utf8(locinput))) {
4572 while (locinput < PL_regeol
4573 && (len = is_GCB_RI_utf8(locinput)))
4577 } else if ((len = is_GCB_T_utf8(locinput))) {
4578 /* Another possibility is T+ */
4580 while (locinput < PL_regeol
4581 && (len = is_GCB_T_utf8(locinput)))
4587 /* Here, neither RI+ nor T+; must be some other
4588 * Hangul. That means it is one of the others: L,
4589 * LV, LVT or V, and matches:
4590 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4593 while (locinput < PL_regeol
4594 && (len = is_GCB_L_utf8(locinput)))
4599 /* Here, have exhausted L*. If the next character
4600 * is not an LV, LVT nor V, it means we had to have
4601 * at least one L, so matches L+ in the original
4602 * equation, we have a complete hangul syllable.
4605 if (locinput < PL_regeol
4606 && is_GCB_LV_LVT_V_utf8(locinput))
4608 /* Otherwise keep going. Must be LV, LVT or V.
4609 * See if LVT, by first ruling out V, then LV */
4610 if (! is_GCB_V_utf8(locinput)
4611 /* All but every TCount one is LV */
4612 && (valid_utf8_to_uvchr((U8 *) locinput,
4617 locinput += UTF8SKIP(locinput);
4620 /* Must be V or LV. Take it, then match
4622 locinput += UTF8SKIP(locinput);
4623 while (locinput < PL_regeol
4624 && (len = is_GCB_V_utf8(locinput)))
4630 /* And any of LV, LVT, or V can be followed
4632 while (locinput < PL_regeol
4633 && (len = is_GCB_T_utf8(locinput)))
4641 /* Match any extender */
4642 while (locinput < PL_regeol
4643 && swash_fetch(PL_utf8_X_extend,
4644 (U8*)locinput, utf8_target))
4646 locinput += UTF8SKIP(locinput);
4650 if (locinput > PL_regeol) sayNO;
4654 case NREFFL: /* /\g{name}/il */
4655 { /* The capture buffer cases. The ones beginning with N for the
4656 named buffers just convert to the equivalent numbered and
4657 pretend they were called as the corresponding numbered buffer
4659 /* don't initialize these in the declaration, it makes C++
4664 const U8 *fold_array;
4667 PL_reg_flags |= RF_tainted;
4668 folder = foldEQ_locale;
4669 fold_array = PL_fold_locale;
4671 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4674 case NREFFA: /* /\g{name}/iaa */
4675 folder = foldEQ_latin1;
4676 fold_array = PL_fold_latin1;
4678 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4681 case NREFFU: /* /\g{name}/iu */
4682 folder = foldEQ_latin1;
4683 fold_array = PL_fold_latin1;
4685 utf8_fold_flags = 0;
4688 case NREFF: /* /\g{name}/i */
4690 fold_array = PL_fold;
4692 utf8_fold_flags = 0;
4695 case NREF: /* /\g{name}/ */
4699 utf8_fold_flags = 0;
4702 /* For the named back references, find the corresponding buffer
4704 n = reg_check_named_buff_matched(rex,scan);
4709 goto do_nref_ref_common;
4711 case REFFL: /* /\1/il */
4712 PL_reg_flags |= RF_tainted;
4713 folder = foldEQ_locale;
4714 fold_array = PL_fold_locale;
4715 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4718 case REFFA: /* /\1/iaa */
4719 folder = foldEQ_latin1;
4720 fold_array = PL_fold_latin1;
4721 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4724 case REFFU: /* /\1/iu */
4725 folder = foldEQ_latin1;
4726 fold_array = PL_fold_latin1;
4727 utf8_fold_flags = 0;
4730 case REFF: /* /\1/i */
4732 fold_array = PL_fold;
4733 utf8_fold_flags = 0;
4736 case REF: /* /\1/ */
4739 utf8_fold_flags = 0;
4743 n = ARG(scan); /* which paren pair */
4746 ln = rex->offs[n].start;
4747 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4748 if (rex->lastparen < n || ln == -1)
4749 sayNO; /* Do not match unless seen CLOSEn. */
4750 if (ln == rex->offs[n].end)
4754 if (type != REF /* REF can do byte comparison */
4755 && (utf8_target || type == REFFU))
4756 { /* XXX handle REFFL better */
4757 char * limit = PL_regeol;
4759 /* This call case insensitively compares the entire buffer
4760 * at s, with the current input starting at locinput, but
4761 * not going off the end given by PL_regeol, and returns in
4762 * <limit> upon success, how much of the current input was
4764 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4765 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4773 /* Not utf8: Inline the first character, for speed. */
4774 if (!NEXTCHR_IS_EOS &&
4775 UCHARAT(s) != nextchr &&
4777 UCHARAT(s) != fold_array[nextchr]))
4779 ln = rex->offs[n].end - ln;
4780 if (locinput + ln > PL_regeol)
4782 if (ln > 1 && (type == REF
4783 ? memNE(s, locinput, ln)
4784 : ! folder(s, locinput, ln)))
4790 case NOTHING: /* null op; e.g. the 'nothing' following
4791 * the '*' in m{(a+|b)*}' */
4793 case TAIL: /* placeholder while compiling (A|B|C) */
4796 case BACK: /* ??? doesn't appear to be used ??? */
4800 #define ST st->u.eval
4805 regexp_internal *rei;
4806 regnode *startpoint;
4808 case GOSTART: /* (?R) */
4809 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4810 if (cur_eval && cur_eval->locinput==locinput) {
4811 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4812 Perl_croak(aTHX_ "Infinite recursion in regex");
4813 if ( ++nochange_depth > max_nochange_depth )
4815 "Pattern subroutine nesting without pos change"
4816 " exceeded limit in regex");
4823 if (OP(scan)==GOSUB) {
4824 startpoint = scan + ARG2L(scan);
4825 ST.close_paren = ARG(scan);
4827 startpoint = rei->program+1;
4830 goto eval_recurse_doit;
4831 assert(0); /* NOTREACHED */
4833 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4834 if (cur_eval && cur_eval->locinput==locinput) {
4835 if ( ++nochange_depth > max_nochange_depth )
4836 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4841 /* execute the code in the {...} */
4845 OP * const oop = PL_op;
4846 COP * const ocurcop = PL_curcop;
4848 char *saved_regeol = PL_regeol;
4849 struct re_save_state saved_state;
4852 /* save *all* paren positions */
4853 regcppush(rex, 0, maxopenparen);
4854 REGCP_SET(runops_cp);
4856 /* To not corrupt the existing regex state while executing the
4857 * eval we would normally put it on the save stack, like with
4858 * save_re_context. However, re-evals have a weird scoping so we
4859 * can't just add ENTER/LEAVE here. With that, things like
4861 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4863 * would break, as they expect the localisation to be unwound
4864 * only when the re-engine backtracks through the bit that
4867 * What we do instead is just saving the state in a local c
4870 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4872 PL_reg_state.re_reparsing = FALSE;
4875 caller_cv = find_runcv(NULL);
4879 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4881 (REGEXP*)(rexi->data->data[n])
4884 nop = (OP*)rexi->data->data[n+1];
4886 else if (rexi->data->what[n] == 'l') { /* literal code */
4888 nop = (OP*)rexi->data->data[n];
4889 assert(CvDEPTH(newcv));
4892 /* literal with own CV */
4893 assert(rexi->data->what[n] == 'L');
4894 newcv = rex->qr_anoncv;
4895 nop = (OP*)rexi->data->data[n];
4898 /* normally if we're about to execute code from the same
4899 * CV that we used previously, we just use the existing
4900 * CX stack entry. However, its possible that in the
4901 * meantime we may have backtracked, popped from the save
4902 * stack, and undone the SAVECOMPPAD(s) associated with
4903 * PUSH_MULTICALL; in which case PL_comppad no longer
4904 * points to newcv's pad. */
4905 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4907 I32 depth = (newcv == caller_cv) ? 0 : 1;
4908 if (last_pushed_cv) {
4909 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4912 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4914 last_pushed_cv = newcv;
4917 /* these assignments are just to silence compiler
4919 multicall_cop = NULL;
4922 last_pad = PL_comppad;
4924 /* the initial nextstate you would normally execute
4925 * at the start of an eval (which would cause error
4926 * messages to come from the eval), may be optimised
4927 * away from the execution path in the regex code blocks;
4928 * so manually set PL_curcop to it initially */
4930 OP *o = cUNOPx(nop)->op_first;
4931 assert(o->op_type == OP_NULL);
4932 if (o->op_targ == OP_SCOPE) {
4933 o = cUNOPo->op_first;
4936 assert(o->op_targ == OP_LEAVE);
4937 o = cUNOPo->op_first;
4938 assert(o->op_type == OP_ENTER);
4942 if (o->op_type != OP_STUB) {
4943 assert( o->op_type == OP_NEXTSTATE
4944 || o->op_type == OP_DBSTATE
4945 || (o->op_type == OP_NULL
4946 && ( o->op_targ == OP_NEXTSTATE
4947 || o->op_targ == OP_DBSTATE
4951 PL_curcop = (COP*)o;
4956 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4957 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4959 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4962 SV *sv_mrk = get_sv("REGMARK", 1);
4963 sv_setsv(sv_mrk, sv_yes_mark);
4966 /* we don't use MULTICALL here as we want to call the
4967 * first op of the block of interest, rather than the
4968 * first op of the sub */
4969 before = (IV)(SP-PL_stack_base);
4971 CALLRUNOPS(aTHX); /* Scalar context. */
4973 if ((IV)(SP-PL_stack_base) == before)
4974 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4980 /* before restoring everything, evaluate the returned
4981 * value, so that 'uninit' warnings don't use the wrong
4982 * PL_op or pad. Also need to process any magic vars
4983 * (e.g. $1) *before* parentheses are restored */
4988 if (logical == 0) /* (?{})/ */
4989 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4990 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4991 sw = cBOOL(SvTRUE(ret));
4994 else { /* /(??{}) */
4995 /* if its overloaded, let the regex compiler handle
4996 * it; otherwise extract regex, or stringify */
4997 if (!SvAMAGIC(ret)) {
5001 if (SvTYPE(sv) == SVt_REGEXP)
5002 re_sv = (REGEXP*) sv;
5003 else if (SvSMAGICAL(sv)) {
5004 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5006 re_sv = (REGEXP *) mg->mg_obj;
5009 /* force any magic, undef warnings here */
5011 ret = sv_mortalcopy(ret);
5012 (void) SvPV_force_nolen(ret);
5018 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5020 /* *** Note that at this point we don't restore
5021 * PL_comppad, (or pop the CxSUB) on the assumption it may
5022 * be used again soon. This is safe as long as nothing
5023 * in the regexp code uses the pad ! */
5025 PL_curcop = ocurcop;
5026 PL_regeol = saved_regeol;
5027 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5033 /* only /(??{})/ from now on */
5036 /* extract RE object from returned value; compiling if
5040 re_sv = reg_temp_copy(NULL, re_sv);
5045 if (SvUTF8(ret) && IN_BYTES) {
5046 /* In use 'bytes': make a copy of the octet
5047 * sequence, but without the flag on */
5049 const char *const p = SvPV(ret, len);
5050 ret = newSVpvn_flags(p, len, SVs_TEMP);
5052 if (rex->intflags & PREGf_USE_RE_EVAL)
5053 pm_flags |= PMf_USE_RE_EVAL;
5055 /* if we got here, it should be an engine which
5056 * supports compiling code blocks and stuff */
5057 assert(rex->engine && rex->engine->op_comp);
5058 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5059 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5060 rex->engine, NULL, NULL,
5061 /* copy /msix etc to inner pattern */
5066 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5068 /* This isn't a first class regexp. Instead, it's
5069 caching a regexp onto an existing, Perl visible
5071 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5073 /* safe to do now that any $1 etc has been
5074 * interpolated into the new pattern string and
5076 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5081 RXp_MATCH_COPIED_off(re);
5082 re->subbeg = rex->subbeg;
5083 re->sublen = rex->sublen;
5084 re->suboffset = rex->suboffset;
5085 re->subcoffset = rex->subcoffset;
5088 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5089 "Matching embedded");
5091 startpoint = rei->program + 1;
5092 ST.close_paren = 0; /* only used for GOSUB */
5094 eval_recurse_doit: /* Share code with GOSUB below this line */
5095 /* run the pattern returned from (??{...}) */
5097 /* Save *all* the positions. */
5098 ST.cp = regcppush(rex, 0, maxopenparen);
5099 REGCP_SET(ST.lastcp);
5102 re->lastcloseparen = 0;
5106 /* XXXX This is too dramatic a measure... */
5109 ST.toggle_reg_flags = PL_reg_flags;
5111 PL_reg_flags |= RF_utf8;
5113 PL_reg_flags &= ~RF_utf8;
5114 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5116 ST.prev_rex = rex_sv;
5117 ST.prev_curlyx = cur_curlyx;
5119 SET_reg_curpm(rex_sv);
5124 ST.prev_eval = cur_eval;
5126 /* now continue from first node in postoned RE */
5127 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5128 assert(0); /* NOTREACHED */
5131 case EVAL_AB: /* cleanup after a successful (??{A})B */
5132 /* note: this is called twice; first after popping B, then A */
5133 PL_reg_flags ^= ST.toggle_reg_flags;
5134 rex_sv = ST.prev_rex;
5135 SET_reg_curpm(rex_sv);
5136 rex = ReANY(rex_sv);
5137 rexi = RXi_GET(rex);
5139 cur_eval = ST.prev_eval;
5140 cur_curlyx = ST.prev_curlyx;
5142 /* XXXX This is too dramatic a measure... */
5144 if ( nochange_depth )
5149 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5150 /* note: this is called twice; first after popping B, then A */
5151 PL_reg_flags ^= ST.toggle_reg_flags;
5152 rex_sv = ST.prev_rex;
5153 SET_reg_curpm(rex_sv);
5154 rex = ReANY(rex_sv);
5155 rexi = RXi_GET(rex);
5157 REGCP_UNWIND(ST.lastcp);
5158 regcppop(rex, &maxopenparen);
5159 cur_eval = ST.prev_eval;
5160 cur_curlyx = ST.prev_curlyx;
5161 /* XXXX This is too dramatic a measure... */
5163 if ( nochange_depth )
5169 n = ARG(scan); /* which paren pair */
5170 rex->offs[n].start_tmp = locinput - PL_bostr;
5171 if (n > maxopenparen)
5173 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5174 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5178 (IV)rex->offs[n].start_tmp,
5184 /* XXX really need to log other places start/end are set too */
5185 #define CLOSE_CAPTURE \
5186 rex->offs[n].start = rex->offs[n].start_tmp; \
5187 rex->offs[n].end = locinput - PL_bostr; \
5188 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5189 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5191 PTR2UV(rex->offs), \
5193 (IV)rex->offs[n].start, \
5194 (IV)rex->offs[n].end \
5198 n = ARG(scan); /* which paren pair */
5200 if (n > rex->lastparen)
5202 rex->lastcloseparen = n;
5203 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5208 case ACCEPT: /* (*ACCEPT) */
5212 cursor && OP(cursor)!=END;
5213 cursor=regnext(cursor))
5215 if ( OP(cursor)==CLOSE ){
5217 if ( n <= lastopen ) {
5219 if (n > rex->lastparen)
5221 rex->lastcloseparen = n;
5222 if ( n == ARG(scan) || (cur_eval &&
5223 cur_eval->u.eval.close_paren == n))
5232 case GROUPP: /* (?(1)) */
5233 n = ARG(scan); /* which paren pair */
5234 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5237 case NGROUPP: /* (?(<name>)) */
5238 /* reg_check_named_buff_matched returns 0 for no match */
5239 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5242 case INSUBP: /* (?(R)) */
5244 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5247 case DEFINEP: /* (?(DEFINE)) */
5251 case IFTHEN: /* (?(cond)A|B) */
5252 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5254 next = NEXTOPER(NEXTOPER(scan));
5256 next = scan + ARG(scan);
5257 if (OP(next) == IFTHEN) /* Fake one. */
5258 next = NEXTOPER(NEXTOPER(next));
5262 case LOGICAL: /* modifier for EVAL and IFMATCH */
5263 logical = scan->flags;
5266 /*******************************************************************
5268 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5269 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5270 STAR/PLUS/CURLY/CURLYN are used instead.)
5272 A*B is compiled as <CURLYX><A><WHILEM><B>
5274 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5275 state, which contains the current count, initialised to -1. It also sets
5276 cur_curlyx to point to this state, with any previous value saved in the
5279 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5280 since the pattern may possibly match zero times (i.e. it's a while {} loop
5281 rather than a do {} while loop).
5283 Each entry to WHILEM represents a successful match of A. The count in the
5284 CURLYX block is incremented, another WHILEM state is pushed, and execution
5285 passes to A or B depending on greediness and the current count.
5287 For example, if matching against the string a1a2a3b (where the aN are
5288 substrings that match /A/), then the match progresses as follows: (the
5289 pushed states are interspersed with the bits of strings matched so far):
5292 <CURLYX cnt=0><WHILEM>
5293 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5294 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5295 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5296 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5298 (Contrast this with something like CURLYM, which maintains only a single
5302 a1 <CURLYM cnt=1> a2
5303 a1 a2 <CURLYM cnt=2> a3
5304 a1 a2 a3 <CURLYM cnt=3> b
5307 Each WHILEM state block marks a point to backtrack to upon partial failure
5308 of A or B, and also contains some minor state data related to that
5309 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5310 overall state, such as the count, and pointers to the A and B ops.
5312 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5313 must always point to the *current* CURLYX block, the rules are:
5315 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5316 and set cur_curlyx to point the new block.
5318 When popping the CURLYX block after a successful or unsuccessful match,
5319 restore the previous cur_curlyx.
5321 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5322 to the outer one saved in the CURLYX block.
5324 When popping the WHILEM block after a successful or unsuccessful B match,
5325 restore the previous cur_curlyx.
5327 Here's an example for the pattern (AI* BI)*BO
5328 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5331 curlyx backtrack stack
5332 ------ ---------------
5334 CO <CO prev=NULL> <WO>
5335 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5336 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5337 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5339 At this point the pattern succeeds, and we work back down the stack to
5340 clean up, restoring as we go:
5342 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5343 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5344 CO <CO prev=NULL> <WO>
5347 *******************************************************************/
5349 #define ST st->u.curlyx
5351 case CURLYX: /* start of /A*B/ (for complex A) */
5353 /* No need to save/restore up to this paren */
5354 I32 parenfloor = scan->flags;
5356 assert(next); /* keep Coverity happy */
5357 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5360 /* XXXX Probably it is better to teach regpush to support
5361 parenfloor > maxopenparen ... */
5362 if (parenfloor > (I32)rex->lastparen)
5363 parenfloor = rex->lastparen; /* Pessimization... */
5365 ST.prev_curlyx= cur_curlyx;
5367 ST.cp = PL_savestack_ix;
5369 /* these fields contain the state of the current curly.
5370 * they are accessed by subsequent WHILEMs */
5371 ST.parenfloor = parenfloor;
5376 ST.count = -1; /* this will be updated by WHILEM */
5377 ST.lastloc = NULL; /* this will be updated by WHILEM */
5379 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5380 assert(0); /* NOTREACHED */
5383 case CURLYX_end: /* just finished matching all of A*B */
5384 cur_curlyx = ST.prev_curlyx;
5386 assert(0); /* NOTREACHED */
5388 case CURLYX_end_fail: /* just failed to match all of A*B */
5390 cur_curlyx = ST.prev_curlyx;
5392 assert(0); /* NOTREACHED */
5396 #define ST st->u.whilem
5398 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5400 /* see the discussion above about CURLYX/WHILEM */
5402 int min = ARG1(cur_curlyx->u.curlyx.me);
5403 int max = ARG2(cur_curlyx->u.curlyx.me);
5404 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5406 assert(cur_curlyx); /* keep Coverity happy */
5407 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5408 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5409 ST.cache_offset = 0;
5413 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5414 "%*s whilem: matched %ld out of %d..%d\n",
5415 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5418 /* First just match a string of min A's. */
5421 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5423 cur_curlyx->u.curlyx.lastloc = locinput;
5424 REGCP_SET(ST.lastcp);
5426 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5427 assert(0); /* NOTREACHED */
5430 /* If degenerate A matches "", assume A done. */
5432 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5433 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5434 "%*s whilem: empty match detected, trying continuation...\n",
5435 REPORT_CODE_OFF+depth*2, "")
5437 goto do_whilem_B_max;
5440 /* super-linear cache processing */
5444 if (!PL_reg_maxiter) {
5445 /* start the countdown: Postpone detection until we
5446 * know the match is not *that* much linear. */
5447 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5448 /* possible overflow for long strings and many CURLYX's */
5449 if (PL_reg_maxiter < 0)
5450 PL_reg_maxiter = I32_MAX;
5451 PL_reg_leftiter = PL_reg_maxiter;
5454 if (PL_reg_leftiter-- == 0) {
5455 /* initialise cache */
5456 const I32 size = (PL_reg_maxiter + 7)/8;
5457 if (PL_reg_poscache) {
5458 if ((I32)PL_reg_poscache_size < size) {
5459 Renew(PL_reg_poscache, size, char);
5460 PL_reg_poscache_size = size;
5462 Zero(PL_reg_poscache, size, char);
5465 PL_reg_poscache_size = size;
5466 Newxz(PL_reg_poscache, size, char);
5468 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470 PL_colors[4], PL_colors[5])
5474 if (PL_reg_leftiter < 0) {
5475 /* have we already failed at this position? */
5477 offset = (scan->flags & 0xf) - 1
5478 + (locinput - PL_bostr) * (scan->flags>>4);
5479 mask = 1 << (offset % 8);
5481 if (PL_reg_poscache[offset] & mask) {
5482 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5483 "%*s whilem: (cache) already tried at this position...\n",
5484 REPORT_CODE_OFF+depth*2, "")
5486 sayNO; /* cache records failure */
5488 ST.cache_offset = offset;
5489 ST.cache_mask = mask;
5493 /* Prefer B over A for minimal matching. */
5495 if (cur_curlyx->u.curlyx.minmod) {
5496 ST.save_curlyx = cur_curlyx;
5497 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5498 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5500 REGCP_SET(ST.lastcp);
5501 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5503 assert(0); /* NOTREACHED */
5506 /* Prefer A over B for maximal matching. */
5508 if (n < max) { /* More greed allowed? */
5509 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5511 cur_curlyx->u.curlyx.lastloc = locinput;
5512 REGCP_SET(ST.lastcp);
5513 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5514 assert(0); /* NOTREACHED */
5516 goto do_whilem_B_max;
5518 assert(0); /* NOTREACHED */
5520 case WHILEM_B_min: /* just matched B in a minimal match */
5521 case WHILEM_B_max: /* just matched B in a maximal match */
5522 cur_curlyx = ST.save_curlyx;
5524 assert(0); /* NOTREACHED */
5526 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5527 cur_curlyx = ST.save_curlyx;
5528 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5529 cur_curlyx->u.curlyx.count--;
5531 assert(0); /* NOTREACHED */
5533 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5535 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5536 REGCP_UNWIND(ST.lastcp);
5537 regcppop(rex, &maxopenparen);
5538 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5539 cur_curlyx->u.curlyx.count--;
5541 assert(0); /* NOTREACHED */
5543 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5544 REGCP_UNWIND(ST.lastcp);
5545 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5546 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5547 "%*s whilem: failed, trying continuation...\n",
5548 REPORT_CODE_OFF+depth*2, "")
5551 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5552 && ckWARN(WARN_REGEXP)
5553 && !(PL_reg_flags & RF_warned))
5555 PL_reg_flags |= RF_warned;
5556 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5557 "Complex regular subexpression recursion limit (%d) "
5563 ST.save_curlyx = cur_curlyx;
5564 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5565 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5567 assert(0); /* NOTREACHED */
5569 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5570 cur_curlyx = ST.save_curlyx;
5571 REGCP_UNWIND(ST.lastcp);
5572 regcppop(rex, &maxopenparen);
5574 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5575 /* Maximum greed exceeded */
5576 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5577 && ckWARN(WARN_REGEXP)
5578 && !(PL_reg_flags & RF_warned))
5580 PL_reg_flags |= RF_warned;
5581 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5582 "Complex regular subexpression recursion "
5583 "limit (%d) exceeded",
5586 cur_curlyx->u.curlyx.count--;
5590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5591 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5593 /* Try grabbing another A and see if it helps. */
5594 cur_curlyx->u.curlyx.lastloc = locinput;
5595 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5597 REGCP_SET(ST.lastcp);
5598 PUSH_STATE_GOTO(WHILEM_A_min,
5599 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5601 assert(0); /* NOTREACHED */
5604 #define ST st->u.branch
5606 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5607 next = scan + ARG(scan);
5610 scan = NEXTOPER(scan);
5613 case BRANCH: /* /(...|A|...)/ */
5614 scan = NEXTOPER(scan); /* scan now points to inner node */
5615 ST.lastparen = rex->lastparen;
5616 ST.lastcloseparen = rex->lastcloseparen;
5617 ST.next_branch = next;
5620 /* Now go into the branch */
5622 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5624 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5626 assert(0); /* NOTREACHED */
5628 case CUTGROUP: /* /(*THEN)/ */
5629 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5630 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5631 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5632 assert(0); /* NOTREACHED */
5634 case CUTGROUP_next_fail:
5637 if (st->u.mark.mark_name)
5638 sv_commit = st->u.mark.mark_name;
5640 assert(0); /* NOTREACHED */
5644 assert(0); /* NOTREACHED */
5646 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5651 REGCP_UNWIND(ST.cp);
5652 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5653 scan = ST.next_branch;
5654 /* no more branches? */
5655 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5657 PerlIO_printf( Perl_debug_log,
5658 "%*s %sBRANCH failed...%s\n",
5659 REPORT_CODE_OFF+depth*2, "",
5665 continue; /* execute next BRANCH[J] op */
5666 assert(0); /* NOTREACHED */
5668 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5673 #define ST st->u.curlym
5675 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5677 /* This is an optimisation of CURLYX that enables us to push
5678 * only a single backtracking state, no matter how many matches
5679 * there are in {m,n}. It relies on the pattern being constant
5680 * length, with no parens to influence future backrefs
5684 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5686 ST.lastparen = rex->lastparen;
5687 ST.lastcloseparen = rex->lastcloseparen;
5689 /* if paren positive, emulate an OPEN/CLOSE around A */
5691 U32 paren = ST.me->flags;
5692 if (paren > maxopenparen)
5693 maxopenparen = paren;
5694 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5702 ST.c1 = CHRTEST_UNINIT;
5705 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5708 curlym_do_A: /* execute the A in /A{m,n}B/ */
5709 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5710 assert(0); /* NOTREACHED */
5712 case CURLYM_A: /* we've just matched an A */
5714 /* after first match, determine A's length: u.curlym.alen */
5715 if (ST.count == 1) {
5716 if (PL_reg_match_utf8) {
5717 char *s = st->locinput;
5718 while (s < locinput) {
5724 ST.alen = locinput - st->locinput;
5727 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5730 PerlIO_printf(Perl_debug_log,
5731 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5732 (int)(REPORT_CODE_OFF+(depth*2)), "",
5733 (IV) ST.count, (IV)ST.alen)
5736 if (cur_eval && cur_eval->u.eval.close_paren &&
5737 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5741 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5742 if ( max == REG_INFTY || ST.count < max )
5743 goto curlym_do_A; /* try to match another A */
5745 goto curlym_do_B; /* try to match B */
5747 case CURLYM_A_fail: /* just failed to match an A */
5748 REGCP_UNWIND(ST.cp);
5750 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5751 || (cur_eval && cur_eval->u.eval.close_paren &&
5752 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5755 curlym_do_B: /* execute the B in /A{m,n}B/ */
5756 if (ST.c1 == CHRTEST_UNINIT) {
5757 /* calculate c1 and c2 for possible match of 1st char
5758 * following curly */
5759 ST.c1 = ST.c2 = CHRTEST_VOID;
5760 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5761 regnode *text_node = ST.B;
5762 if (! HAS_TEXT(text_node))
5763 FIND_NEXT_IMPT(text_node);
5766 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5768 But the former is redundant in light of the latter.
5770 if this changes back then the macro for
5771 IS_TEXT and friends need to change.
5773 if (PL_regkind[OP(text_node)] == EXACT) {
5774 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5775 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5784 PerlIO_printf(Perl_debug_log,
5785 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5786 (int)(REPORT_CODE_OFF+(depth*2)),
5789 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5790 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5791 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5792 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5794 /* simulate B failing */
5796 PerlIO_printf(Perl_debug_log,
5797 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5798 (int)(REPORT_CODE_OFF+(depth*2)),"",
5799 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5800 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5801 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5803 state_num = CURLYM_B_fail;
5804 goto reenter_switch;
5807 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5808 /* simulate B failing */
5810 PerlIO_printf(Perl_debug_log,
5811 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5812 (int)(REPORT_CODE_OFF+(depth*2)),"",
5813 (int) nextchr, ST.c1, ST.c2)
5815 state_num = CURLYM_B_fail;
5816 goto reenter_switch;
5821 /* emulate CLOSE: mark current A as captured */
5822 I32 paren = ST.me->flags;
5824 rex->offs[paren].start
5825 = HOPc(locinput, -ST.alen) - PL_bostr;
5826 rex->offs[paren].end = locinput - PL_bostr;
5827 if ((U32)paren > rex->lastparen)
5828 rex->lastparen = paren;
5829 rex->lastcloseparen = paren;
5832 rex->offs[paren].end = -1;
5833 if (cur_eval && cur_eval->u.eval.close_paren &&
5834 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5843 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5844 assert(0); /* NOTREACHED */
5846 case CURLYM_B_fail: /* just failed to match a B */
5847 REGCP_UNWIND(ST.cp);
5848 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5850 I32 max = ARG2(ST.me);
5851 if (max != REG_INFTY && ST.count == max)
5853 goto curlym_do_A; /* try to match a further A */
5855 /* backtrack one A */
5856 if (ST.count == ARG1(ST.me) /* min */)
5859 SET_locinput(HOPc(locinput, -ST.alen));
5860 goto curlym_do_B; /* try to match B */
5863 #define ST st->u.curly
5865 #define CURLY_SETPAREN(paren, success) \
5868 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5869 rex->offs[paren].end = locinput - PL_bostr; \
5870 if (paren > rex->lastparen) \
5871 rex->lastparen = paren; \
5872 rex->lastcloseparen = paren; \
5875 rex->offs[paren].end = -1; \
5876 rex->lastparen = ST.lastparen; \
5877 rex->lastcloseparen = ST.lastcloseparen; \
5881 case STAR: /* /A*B/ where A is width 1 char */
5885 scan = NEXTOPER(scan);
5888 case PLUS: /* /A+B/ where A is width 1 char */
5892 scan = NEXTOPER(scan);
5895 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5896 ST.paren = scan->flags; /* Which paren to set */
5897 ST.lastparen = rex->lastparen;
5898 ST.lastcloseparen = rex->lastcloseparen;
5899 if (ST.paren > maxopenparen)
5900 maxopenparen = ST.paren;
5901 ST.min = ARG1(scan); /* min to match */
5902 ST.max = ARG2(scan); /* max to match */
5903 if (cur_eval && cur_eval->u.eval.close_paren &&
5904 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5908 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5911 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5913 ST.min = ARG1(scan); /* min to match */
5914 ST.max = ARG2(scan); /* max to match */
5915 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5918 * Lookahead to avoid useless match attempts
5919 * when we know what character comes next.
5921 * Used to only do .*x and .*?x, but now it allows
5922 * for )'s, ('s and (?{ ... })'s to be in the way
5923 * of the quantifier and the EXACT-like node. -- japhy
5926 assert(ST.min <= ST.max);
5927 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5928 ST.c1 = ST.c2 = CHRTEST_VOID;
5931 regnode *text_node = next;
5933 if (! HAS_TEXT(text_node))
5934 FIND_NEXT_IMPT(text_node);
5936 if (! HAS_TEXT(text_node))
5937 ST.c1 = ST.c2 = CHRTEST_VOID;
5939 if ( PL_regkind[OP(text_node)] != EXACT ) {
5940 ST.c1 = ST.c2 = CHRTEST_VOID;
5944 /* Currently we only get here when
5946 PL_rekind[OP(text_node)] == EXACT
5948 if this changes back then the macro for IS_TEXT and
5949 friends need to change. */
5950 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5951 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5962 char *li = locinput;
5964 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5969 if (ST.c1 == CHRTEST_VOID)
5970 goto curly_try_B_min;
5972 ST.oldloc = locinput;
5974 /* set ST.maxpos to the furthest point along the
5975 * string that could possibly match */
5976 if (ST.max == REG_INFTY) {
5977 ST.maxpos = PL_regeol - 1;
5979 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5982 else if (utf8_target) {
5983 int m = ST.max - ST.min;
5984 for (ST.maxpos = locinput;
5985 m >0 && ST.maxpos < PL_regeol; m--)
5986 ST.maxpos += UTF8SKIP(ST.maxpos);
5989 ST.maxpos = locinput + ST.max - ST.min;
5990 if (ST.maxpos >= PL_regeol)
5991 ST.maxpos = PL_regeol - 1;
5993 goto curly_try_B_min_known;
5997 /* avoid taking address of locinput, so it can remain
5999 char *li = locinput;
6000 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
6001 if (ST.count < ST.min)
6004 if ((ST.count > ST.min)
6005 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6007 /* A{m,n} must come at the end of the string, there's
6008 * no point in backing off ... */
6010 /* ...except that $ and \Z can match before *and* after
6011 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6012 We may back off by one in this case. */
6013 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6017 goto curly_try_B_max;
6019 assert(0); /* NOTREACHED */
6022 case CURLY_B_min_known_fail:
6023 /* failed to find B in a non-greedy match where c1,c2 valid */
6025 REGCP_UNWIND(ST.cp);
6027 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6029 /* Couldn't or didn't -- move forward. */
6030 ST.oldloc = locinput;
6032 locinput += UTF8SKIP(locinput);
6036 curly_try_B_min_known:
6037 /* find the next place where 'B' could work, then call B */
6041 n = (ST.oldloc == locinput) ? 0 : 1;
6042 if (ST.c1 == ST.c2) {
6043 /* set n to utf8_distance(oldloc, locinput) */
6044 while (locinput <= ST.maxpos
6045 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6047 locinput += UTF8SKIP(locinput);
6052 /* set n to utf8_distance(oldloc, locinput) */
6053 while (locinput <= ST.maxpos
6054 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6055 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6057 locinput += UTF8SKIP(locinput);
6062 else { /* Not utf8_target */
6063 if (ST.c1 == ST.c2) {
6064 while (locinput <= ST.maxpos &&
6065 UCHARAT(locinput) != ST.c1)
6069 while (locinput <= ST.maxpos
6070 && UCHARAT(locinput) != ST.c1
6071 && UCHARAT(locinput) != ST.c2)
6074 n = locinput - ST.oldloc;
6076 if (locinput > ST.maxpos)
6079 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6080 * at b; check that everything between oldloc and
6081 * locinput matches */
6082 char *li = ST.oldloc;
6084 if (regrepeat(rex, &li, ST.A, n, depth) < n)
6086 assert(n == REG_INFTY || locinput == li);
6088 CURLY_SETPAREN(ST.paren, ST.count);
6089 if (cur_eval && cur_eval->u.eval.close_paren &&
6090 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6093 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6095 assert(0); /* NOTREACHED */
6098 case CURLY_B_min_fail:
6099 /* failed to find B in a non-greedy match where c1,c2 invalid */
6101 REGCP_UNWIND(ST.cp);
6103 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6105 /* failed -- move forward one */
6107 char *li = locinput;
6108 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6115 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6116 ST.count > 0)) /* count overflow ? */
6119 CURLY_SETPAREN(ST.paren, ST.count);
6120 if (cur_eval && cur_eval->u.eval.close_paren &&
6121 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6124 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6128 assert(0); /* NOTREACHED */
6132 /* a successful greedy match: now try to match B */
6133 if (cur_eval && cur_eval->u.eval.close_paren &&
6134 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6138 bool could_match = locinput < PL_regeol;
6140 /* If it could work, try it. */
6141 if (ST.c1 != CHRTEST_VOID && could_match) {
6142 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6144 could_match = memEQ(locinput,
6149 UTF8SKIP(locinput));
6152 could_match = UCHARAT(locinput) == ST.c1
6153 || UCHARAT(locinput) == ST.c2;
6156 if (ST.c1 == CHRTEST_VOID || could_match) {
6157 CURLY_SETPAREN(ST.paren, ST.count);
6158 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6159 assert(0); /* NOTREACHED */
6164 case CURLY_B_max_fail:
6165 /* failed to find B in a greedy match */
6167 REGCP_UNWIND(ST.cp);
6169 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6172 if (--ST.count < ST.min)
6174 locinput = HOPc(locinput, -1);
6175 goto curly_try_B_max;
6179 case END: /* last op of main pattern */
6182 /* we've just finished A in /(??{A})B/; now continue with B */
6183 st->u.eval.toggle_reg_flags
6184 = cur_eval->u.eval.toggle_reg_flags;
6185 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6187 st->u.eval.prev_rex = rex_sv; /* inner */
6189 /* Save *all* the positions. */
6190 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6191 rex_sv = cur_eval->u.eval.prev_rex;
6192 SET_reg_curpm(rex_sv);
6193 rex = ReANY(rex_sv);
6194 rexi = RXi_GET(rex);
6195 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6197 REGCP_SET(st->u.eval.lastcp);
6199 /* Restore parens of the outer rex without popping the
6201 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6204 st->u.eval.prev_eval = cur_eval;
6205 cur_eval = cur_eval->u.eval.prev_eval;
6207 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6208 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6209 if ( nochange_depth )
6212 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6213 locinput); /* match B */
6216 if (locinput < reginfo->till) {
6217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6218 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6220 (long)(locinput - PL_reg_starttry),
6221 (long)(reginfo->till - PL_reg_starttry),
6224 sayNO_SILENT; /* Cannot match: too short. */
6226 sayYES; /* Success! */
6228 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6230 PerlIO_printf(Perl_debug_log,
6231 "%*s %ssubpattern success...%s\n",
6232 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6233 sayYES; /* Success! */
6236 #define ST st->u.ifmatch
6241 case SUSPEND: /* (?>A) */
6243 newstart = locinput;
6246 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6248 goto ifmatch_trivial_fail_test;
6250 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6252 ifmatch_trivial_fail_test:
6254 char * const s = HOPBACKc(locinput, scan->flags);
6259 sw = 1 - cBOOL(ST.wanted);
6263 next = scan + ARG(scan);
6271 newstart = locinput;
6275 ST.logical = logical;
6276 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6278 /* execute body of (?...A) */
6279 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6280 assert(0); /* NOTREACHED */
6283 case IFMATCH_A_fail: /* body of (?...A) failed */
6284 ST.wanted = !ST.wanted;
6287 case IFMATCH_A: /* body of (?...A) succeeded */
6289 sw = cBOOL(ST.wanted);
6291 else if (!ST.wanted)
6294 if (OP(ST.me) != SUSPEND) {
6295 /* restore old position except for (?>...) */
6296 locinput = st->locinput;
6298 scan = ST.me + ARG(ST.me);
6301 continue; /* execute B */
6305 case LONGJMP: /* alternative with many branches compiles to
6306 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6307 next = scan + ARG(scan);
6312 case COMMIT: /* (*COMMIT) */
6313 reginfo->cutpoint = PL_regeol;
6316 case PRUNE: /* (*PRUNE) */
6318 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6319 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6320 assert(0); /* NOTREACHED */
6322 case COMMIT_next_fail:
6326 case OPFAIL: /* (*FAIL) */
6328 assert(0); /* NOTREACHED */
6330 #define ST st->u.mark
6331 case MARKPOINT: /* (*MARK:foo) */
6332 ST.prev_mark = mark_state;
6333 ST.mark_name = sv_commit = sv_yes_mark
6334 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6336 ST.mark_loc = locinput;
6337 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6338 assert(0); /* NOTREACHED */
6340 case MARKPOINT_next:
6341 mark_state = ST.prev_mark;
6343 assert(0); /* NOTREACHED */
6345 case MARKPOINT_next_fail:
6346 if (popmark && sv_eq(ST.mark_name,popmark))
6348 if (ST.mark_loc > startpoint)
6349 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6350 popmark = NULL; /* we found our mark */
6351 sv_commit = ST.mark_name;
6354 PerlIO_printf(Perl_debug_log,
6355 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6356 REPORT_CODE_OFF+depth*2, "",
6357 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6360 mark_state = ST.prev_mark;
6361 sv_yes_mark = mark_state ?
6362 mark_state->u.mark.mark_name : NULL;
6364 assert(0); /* NOTREACHED */
6366 case SKIP: /* (*SKIP) */
6368 /* (*SKIP) : if we fail we cut here*/
6369 ST.mark_name = NULL;
6370 ST.mark_loc = locinput;
6371 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6373 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6374 otherwise do nothing. Meaning we need to scan
6376 regmatch_state *cur = mark_state;
6377 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6380 if ( sv_eq( cur->u.mark.mark_name,
6383 ST.mark_name = find;
6384 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6386 cur = cur->u.mark.prev_mark;
6389 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6392 case SKIP_next_fail:
6394 /* (*CUT:NAME) - Set up to search for the name as we
6395 collapse the stack*/
6396 popmark = ST.mark_name;
6398 /* (*CUT) - No name, we cut here.*/
6399 if (ST.mark_loc > startpoint)
6400 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6401 /* but we set sv_commit to latest mark_name if there
6402 is one so they can test to see how things lead to this
6405 sv_commit=mark_state->u.mark.mark_name;
6409 assert(0); /* NOTREACHED */
6412 case LNBREAK: /* \R */
6413 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6419 #define CASE_CLASS(nAmE) \
6421 if (NEXTCHR_IS_EOS) \
6423 if ((n=is_##nAmE(locinput,utf8_target))) { \
6429 if (NEXTCHR_IS_EOS) \
6431 if ((n=is_##nAmE(locinput,utf8_target))) { \
6434 locinput += UTF8SKIP(locinput); \
6438 CASE_CLASS(VERTWS); /* \v \V */
6439 CASE_CLASS(HORIZWS); /* \h \H */
6443 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6444 PTR2UV(scan), OP(scan));
6445 Perl_croak(aTHX_ "regexp memory corruption");
6447 /* this is a point to jump to in order to increment
6448 * locinput by one character */
6450 assert(!NEXTCHR_IS_EOS);
6452 locinput += PL_utf8skip[nextchr];
6453 /* locinput is allowed to go 1 char off the end, but not 2+ */
6454 if (locinput > PL_regeol)
6463 /* switch break jumps here */
6464 scan = next; /* prepare to execute the next op and ... */
6465 continue; /* ... jump back to the top, reusing st */
6466 assert(0); /* NOTREACHED */
6469 /* push a state that backtracks on success */
6470 st->u.yes.prev_yes_state = yes_state;
6474 /* push a new regex state, then continue at scan */
6476 regmatch_state *newst;
6479 regmatch_state *cur = st;
6480 regmatch_state *curyes = yes_state;
6482 regmatch_slab *slab = PL_regmatch_slab;
6483 for (;curd > -1;cur--,curd--) {
6484 if (cur < SLAB_FIRST(slab)) {
6486 cur = SLAB_LAST(slab);
6488 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6489 REPORT_CODE_OFF + 2 + depth * 2,"",
6490 curd, PL_reg_name[cur->resume_state],
6491 (curyes == cur) ? "yes" : ""
6494 curyes = cur->u.yes.prev_yes_state;
6497 DEBUG_STATE_pp("push")
6500 st->locinput = locinput;
6502 if (newst > SLAB_LAST(PL_regmatch_slab))
6503 newst = S_push_slab(aTHX);
6504 PL_regmatch_state = newst;
6506 locinput = pushinput;
6509 assert(0); /* NOTREACHED */
6514 * We get here only if there's trouble -- normally "case END" is
6515 * the terminating point.
6517 Perl_croak(aTHX_ "corrupted regexp pointers");
6523 /* we have successfully completed a subexpression, but we must now
6524 * pop to the state marked by yes_state and continue from there */
6525 assert(st != yes_state);
6527 while (st != yes_state) {
6529 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6530 PL_regmatch_slab = PL_regmatch_slab->prev;
6531 st = SLAB_LAST(PL_regmatch_slab);
6535 DEBUG_STATE_pp("pop (no final)");
6537 DEBUG_STATE_pp("pop (yes)");
6543 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6544 || yes_state > SLAB_LAST(PL_regmatch_slab))
6546 /* not in this slab, pop slab */
6547 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6548 PL_regmatch_slab = PL_regmatch_slab->prev;
6549 st = SLAB_LAST(PL_regmatch_slab);
6551 depth -= (st - yes_state);
6554 yes_state = st->u.yes.prev_yes_state;
6555 PL_regmatch_state = st;
6558 locinput= st->locinput;
6559 state_num = st->resume_state + no_final;
6560 goto reenter_switch;
6563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6564 PL_colors[4], PL_colors[5]));
6566 if (PL_reg_state.re_state_eval_setup_done) {
6567 /* each successfully executed (?{...}) block does the equivalent of
6568 * local $^R = do {...}
6569 * When popping the save stack, all these locals would be undone;
6570 * bypass this by setting the outermost saved $^R to the latest
6572 if (oreplsv != GvSV(PL_replgv))
6573 sv_setsv(oreplsv, GvSV(PL_replgv));
6580 PerlIO_printf(Perl_debug_log,
6581 "%*s %sfailed...%s\n",
6582 REPORT_CODE_OFF+depth*2, "",
6583 PL_colors[4], PL_colors[5])
6595 /* there's a previous state to backtrack to */
6597 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6598 PL_regmatch_slab = PL_regmatch_slab->prev;
6599 st = SLAB_LAST(PL_regmatch_slab);
6601 PL_regmatch_state = st;
6602 locinput= st->locinput;
6604 DEBUG_STATE_pp("pop");
6606 if (yes_state == st)
6607 yes_state = st->u.yes.prev_yes_state;
6609 state_num = st->resume_state + 1; /* failure = success + 1 */
6610 goto reenter_switch;
6615 if (rex->intflags & PREGf_VERBARG_SEEN) {
6616 SV *sv_err = get_sv("REGERROR", 1);
6617 SV *sv_mrk = get_sv("REGMARK", 1);
6619 sv_commit = &PL_sv_no;
6621 sv_yes_mark = &PL_sv_yes;
6624 sv_commit = &PL_sv_yes;
6625 sv_yes_mark = &PL_sv_no;
6627 sv_setsv(sv_err, sv_commit);
6628 sv_setsv(sv_mrk, sv_yes_mark);
6632 if (last_pushed_cv) {
6635 PERL_UNUSED_VAR(SP);
6638 /* clean up; in particular, free all slabs above current one */
6639 LEAVE_SCOPE(oldsave);
6641 assert(!result || locinput - PL_bostr >= 0);
6642 return result ? locinput - PL_bostr : -1;
6646 - regrepeat - repeatedly match something simple, report how many
6648 * What 'simple' means is a node which can be the operand of a quantifier like
6651 * startposp - pointer a pointer to the start position. This is updated
6652 * to point to the byte following the highest successful
6654 * p - the regnode to be repeatedly matched against.
6655 * max - maximum number of things to match.
6656 * depth - (for debugging) backtracking depth.
6659 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6662 char *scan; /* Pointer to current position in target string */
6664 char *loceol = PL_regeol; /* local version */
6665 I32 hardcount = 0; /* How many matches so far */
6666 bool utf8_target = PL_reg_match_utf8;
6669 PERL_UNUSED_ARG(depth);
6672 PERL_ARGS_ASSERT_REGREPEAT;
6675 if (max == REG_INFTY)
6677 else if (! utf8_target && scan + max < loceol)
6678 loceol = scan + max;
6680 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6681 * to the maximum of how far we should go in it (leaving it set to the real
6682 * end, if the maximum permissible would take us beyond that). This allows
6683 * us to make the loop exit condition that we haven't gone past <loceol> to
6684 * also mean that we haven't exceeded the max permissible count, saving a
6685 * test each time through the loop. But it assumes that the OP matches a
6686 * single byte, which is true for most of the OPs below when applied to a
6687 * non-UTF-8 target. Those relatively few OPs that don't have this
6688 * characteristic will have to compensate.
6690 * There is no adjustment for UTF-8 targets, as the number of bytes per
6691 * character varies. OPs will have to test both that the count is less
6692 * than the max permissible (using <hardcount> to keep track), and that we
6693 * are still within the bounds of the string (using <loceol>. A few OPs
6694 * match a single byte no matter what the encoding. They can omit the max
6695 * test if, for the UTF-8 case, they do the adjustment that was skipped
6698 * Thus, the code above sets things up for the common case; and exceptional
6699 * cases need extra work; the common case is to make sure <scan> doesn't
6700 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6701 * count doesn't exceed the maximum permissible */
6706 while (scan < loceol && hardcount < max && *scan != '\n') {
6707 scan += UTF8SKIP(scan);
6711 while (scan < loceol && *scan != '\n')
6717 while (scan < loceol && hardcount < max) {
6718 scan += UTF8SKIP(scan);
6725 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6726 if (utf8_target && scan + max < loceol) {
6728 /* <loceol> hadn't been adjusted in the UTF-8 case */
6736 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6740 /* Can use a simple loop if the pattern char to match on is invariant
6741 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6742 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6743 * true iff it doesn't matter if the argument is in UTF-8 or not */
6744 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6745 if (utf8_target && scan + max < loceol) {
6746 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6747 * since here, to match at all, 1 char == 1 byte */
6748 loceol = scan + max;
6750 while (scan < loceol && UCHARAT(scan) == c) {
6754 else if (UTF_PATTERN) {
6756 STRLEN scan_char_len;
6758 /* When both target and pattern are UTF-8, we have to do
6760 while (hardcount < max
6762 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6763 && memEQ(scan, STRING(p), scan_char_len))
6765 scan += scan_char_len;
6769 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6771 /* Target isn't utf8; convert the character in the UTF-8
6772 * pattern to non-UTF8, and do a simple loop */
6773 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6774 while (scan < loceol && UCHARAT(scan) == c) {
6777 } /* else pattern char is above Latin1, can't possibly match the
6782 /* Here, the string must be utf8; pattern isn't, and <c> is
6783 * different in utf8 than not, so can't compare them directly.
6784 * Outside the loop, find the two utf8 bytes that represent c, and
6785 * then look for those in sequence in the utf8 string */
6786 U8 high = UTF8_TWO_BYTE_HI(c);
6787 U8 low = UTF8_TWO_BYTE_LO(c);
6789 while (hardcount < max
6790 && scan + 1 < loceol
6791 && UCHARAT(scan) == high
6792 && UCHARAT(scan + 1) == low)
6801 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6805 PL_reg_flags |= RF_tainted;
6806 utf8_flags = FOLDEQ_UTF8_LOCALE;
6814 case EXACTFU_TRICKYFOLD:
6816 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6820 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6822 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6824 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6825 if (c1 == CHRTEST_VOID) {
6826 /* Use full Unicode fold matching */
6827 char *tmpeol = PL_regeol;
6828 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6829 while (hardcount < max
6830 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6831 STRING(p), NULL, pat_len,
6832 cBOOL(UTF_PATTERN), utf8_flags))
6839 else if (utf8_target) {
6841 while (scan < loceol
6843 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6845 scan += UTF8SKIP(scan);
6850 while (scan < loceol
6852 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6853 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6855 scan += UTF8SKIP(scan);
6860 else if (c1 == c2) {
6861 while (scan < loceol && UCHARAT(scan) == c1) {
6866 while (scan < loceol &&
6867 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6877 while (hardcount < max
6879 && reginclass(prog, p, (U8*)scan, utf8_target))
6881 scan += UTF8SKIP(scan);
6885 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6892 LOAD_UTF8_CHARCLASS_ALNUM();
6893 while (hardcount < max && scan < loceol &&
6894 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6896 scan += UTF8SKIP(scan);
6900 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6908 while (scan < loceol && isALNUM((U8) *scan)) {
6913 if (utf8_target && scan + max < loceol) {
6915 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6916 * since here, to match, 1 char == 1 byte */
6917 loceol = scan + max;
6919 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6924 PL_reg_flags |= RF_tainted;
6926 while (hardcount < max && scan < loceol &&
6927 isALNUM_LC_utf8((U8*)scan)) {
6928 scan += UTF8SKIP(scan);
6932 while (scan < loceol && isALNUM_LC(*scan))
6941 LOAD_UTF8_CHARCLASS_ALNUM();
6942 while (hardcount < max && scan < loceol &&
6943 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6945 scan += UTF8SKIP(scan);
6949 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6956 goto utf8_Nwordchar;
6957 while (scan < loceol && ! isALNUM((U8) *scan)) {
6963 if (utf8_target && scan + max < loceol) {
6965 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6966 * since here, to match, 1 char == 1 byte */
6967 loceol = scan + max;
6969 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6975 while (scan < loceol && hardcount < max
6976 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6978 scan += UTF8SKIP(scan);
6983 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6990 while (scan < loceol && hardcount < max
6991 && ! isWORDCHAR_A((U8) *scan))
6993 scan += UTF8SKIP(scan);
6998 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
7004 PL_reg_flags |= RF_tainted;
7006 while (hardcount < max && scan < loceol &&
7007 !isALNUM_LC_utf8((U8*)scan)) {
7008 scan += UTF8SKIP(scan);
7012 while (scan < loceol && !isALNUM_LC(*scan))
7021 while (hardcount < max && scan < loceol
7022 && is_XPERLSPACE_utf8((U8*)scan))
7024 scan += UTF8SKIP(scan);
7030 while (scan < loceol && isSPACE_L1((U8) *scan)) {
7039 while (scan < loceol && isSPACE((U8) *scan)) {
7044 if (utf8_target && scan + max < loceol) {
7046 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7047 * since here, to match, 1 char == 1 byte */
7048 loceol = scan + max;
7050 while (scan < loceol && isSPACE_A((U8) *scan)) {
7055 PL_reg_flags |= RF_tainted;
7057 while (hardcount < max && scan < loceol &&
7058 isSPACE_LC_utf8((U8*)scan)) {
7059 scan += UTF8SKIP(scan);
7063 while (scan < loceol && isSPACE_LC(*scan))
7072 while (hardcount < max && scan < loceol
7073 && ! is_XPERLSPACE_utf8((U8*)scan))
7075 scan += UTF8SKIP(scan);
7081 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7090 while (scan < loceol && ! isSPACE((U8) *scan)) {
7096 while (hardcount < max && scan < loceol
7097 && ! isSPACE_A((U8) *scan))
7099 scan += UTF8SKIP(scan);
7104 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7110 PL_reg_flags |= RF_tainted;
7112 while (hardcount < max && scan < loceol &&
7113 !isSPACE_LC_utf8((U8*)scan)) {
7114 scan += UTF8SKIP(scan);
7118 while (scan < loceol && !isSPACE_LC(*scan))
7124 LOAD_UTF8_CHARCLASS_DIGIT();
7125 while (hardcount < max && scan < loceol &&
7126 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7127 scan += UTF8SKIP(scan);
7131 while (scan < loceol && isDIGIT(*scan))
7136 if (utf8_target && scan + max < loceol) {
7138 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7139 * since here, to match, 1 char == 1 byte */
7140 loceol = scan + max;
7142 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7147 PL_reg_flags |= RF_tainted;
7149 while (hardcount < max && scan < loceol &&
7150 isDIGIT_LC_utf8((U8*)scan)) {
7151 scan += UTF8SKIP(scan);
7155 while (scan < loceol && isDIGIT_LC(*scan))
7161 LOAD_UTF8_CHARCLASS_DIGIT();
7162 while (hardcount < max && scan < loceol &&
7163 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7164 scan += UTF8SKIP(scan);
7168 while (scan < loceol && !isDIGIT(*scan))
7174 while (hardcount < max && scan < loceol
7175 && ! isDIGIT_A((U8) *scan)) {
7176 scan += UTF8SKIP(scan);
7181 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7187 PL_reg_flags |= RF_tainted;
7189 while (hardcount < max && scan < loceol &&
7190 !isDIGIT_LC_utf8((U8*)scan)) {
7191 scan += UTF8SKIP(scan);
7195 while (scan < loceol && !isDIGIT_LC(*scan))
7201 while (hardcount < max && scan < loceol &&
7202 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7207 /* LNBREAK can match one or two latin chars, which is ok, but we
7208 * have to use hardcount in this situation, and throw away the
7209 * adjustment to <loceol> done before the switch statement */
7211 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7219 while (hardcount < max && scan < loceol &&
7220 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7226 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7232 while (hardcount < max && scan < loceol &&
7233 !is_HORIZWS_utf8_safe(scan, loceol))
7235 scan += UTF8SKIP(scan);
7239 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7246 while (hardcount < max && scan < loceol &&
7247 (c=is_VERTWS_utf8_safe(scan, loceol)))
7253 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7260 while (hardcount < max && scan < loceol &&
7261 !is_VERTWS_utf8_safe(scan, loceol))
7263 scan += UTF8SKIP(scan);
7267 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7287 /* These are all 0 width, so match right here or not at all. */
7291 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7292 assert(0); /* NOTREACHED */
7299 c = scan - *startposp;
7303 GET_RE_DEBUG_FLAGS_DECL;
7305 SV * const prop = sv_newmortal();
7306 regprop(prog, prop, p);
7307 PerlIO_printf(Perl_debug_log,
7308 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7309 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7317 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7319 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7320 create a copy so that changes the caller makes won't change the shared one.
7321 If <altsvp> is non-null, will return NULL in it, for back-compat.
7324 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7326 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7332 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7337 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7339 /* Returns the swash for the input 'node' in the regex 'prog'.
7340 * If <doinit> is true, will attempt to create the swash if not already
7342 * If <listsvp> is non-null, will return the swash initialization string in
7344 * Tied intimately to how regcomp.c sets up the data structure */
7351 RXi_GET_DECL(prog,progi);
7352 const struct reg_data * const data = prog ? progi->data : NULL;
7354 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7356 assert(ANYOF_NONBITMAP(node));
7358 if (data && data->count) {
7359 const U32 n = ARG(node);
7361 if (data->what[n] == 's') {
7362 SV * const rv = MUTABLE_SV(data->data[n]);
7363 AV * const av = MUTABLE_AV(SvRV(rv));
7364 SV **const ary = AvARRAY(av);
7365 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7367 si = *ary; /* ary[0] = the string to initialize the swash with */
7369 /* Elements 2 and 3 are either both present or both absent. [2] is
7370 * any inversion list generated at compile time; [3] indicates if
7371 * that inversion list has any user-defined properties in it. */
7372 if (av_len(av) >= 2) {
7375 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7382 /* Element [1] is reserved for the set-up swash. If already there,
7383 * return it; if not, create it and store it there */
7384 if (SvROK(ary[1])) {
7387 else if (si && doinit) {
7389 sw = _core_swash_init("utf8", /* the utf8 package */
7393 0, /* not from tr/// */
7396 (void)av_store(av, 1, sw);
7402 SV* matches_string = newSVpvn("", 0);
7404 /* Use the swash, if any, which has to have incorporated into it all
7406 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7407 && (si && si != &PL_sv_undef))
7410 /* If no swash, use the input initialization string, if available */
7411 sv_catsv(matches_string, si);
7414 /* Add the inversion list to whatever we have. This may have come from
7415 * the swash, or from an input parameter */
7417 sv_catsv(matches_string, _invlist_contents(invlist));
7419 *listsvp = matches_string;
7426 - reginclass - determine if a character falls into a character class
7428 n is the ANYOF regnode
7429 p is the target string
7430 utf8_target tells whether p is in UTF-8.
7432 Returns true if matched; false otherwise.
7434 Note that this can be a synthetic start class, a combination of various
7435 nodes, so things you think might be mutually exclusive, such as locale,
7436 aren't. It can match both locale and non-locale
7441 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7444 const char flags = ANYOF_FLAGS(n);
7448 PERL_ARGS_ASSERT_REGINCLASS;
7450 /* If c is not already the code point, get it. Note that
7451 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7452 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7454 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7455 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7456 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7457 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7458 * UTF8_ALLOW_FFFF */
7459 if (c_len == (STRLEN)-1)
7460 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7463 /* If this character is potentially in the bitmap, check it */
7465 if (ANYOF_BITMAP_TEST(n, c))
7467 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7473 else if (flags & ANYOF_LOCALE) {
7474 PL_reg_flags |= RF_tainted;
7476 if ((flags & ANYOF_LOC_FOLD)
7477 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7481 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7483 /* The data structure is arranged so bits 0, 2, 4, ... are set
7484 * if the class includes the Posix character class given by
7485 * bit/2; and 1, 3, 5, ... are set if the class includes the
7486 * complemented Posix class given by int(bit/2). So we loop
7487 * through the bits, each time changing whether we complement
7488 * the result or not. Suppose for the sake of illustration
7489 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7490 * is set, it means there is a match for this ANYOF node if the
7491 * character is in the class given by the expression (0 / 2 = 0
7492 * = \w). If it is in that class, isFOO_lc() will return 1,
7493 * and since 'to_complement' is 0, the result will stay TRUE,
7494 * and we exit the loop. Suppose instead that bit 0 is 0, but
7495 * bit 1 is 1. That means there is a match if the character
7496 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7497 * but will on bit 1. On the second iteration 'to_complement'
7498 * will be 1, so the exclusive or will reverse things, so we
7499 * are testing for \W. On the third iteration, 'to_complement'
7500 * will be 0, and we would be testing for \s; the fourth
7501 * iteration would test for \S, etc. */
7504 int to_complement = 0;
7505 while (count < ANYOF_MAX) {
7506 if (ANYOF_CLASS_TEST(n, count)
7507 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7519 /* If the bitmap didn't (or couldn't) match, and something outside the
7520 * bitmap could match, try that. Locale nodes specify completely the
7521 * behavior of code points in the bit map (otherwise, a utf8 target would
7522 * cause them to be treated as Unicode and not locale), except in
7523 * the very unlikely event when this node is a synthetic start class, which
7524 * could be a combination of locale and non-locale nodes. So allow locale
7525 * to match for the synthetic start class, which will give a false
7526 * positive that will be resolved when the match is done again as not part
7527 * of the synthetic start class */
7529 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7530 match = TRUE; /* Everything above 255 matches */
7532 else if (ANYOF_NONBITMAP(n)
7533 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7536 || (! (flags & ANYOF_LOCALE))
7537 || (flags & ANYOF_IS_SYNTHETIC)))))
7539 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7544 } else { /* Convert to utf8 */
7546 utf8_p = bytes_to_utf8(p, &len);
7549 if (swash_fetch(sw, utf8_p, TRUE)) {
7553 /* If we allocated a string above, free it */
7554 if (! utf8_target) Safefree(utf8_p);
7558 if (UNICODE_IS_SUPER(c)
7559 && (flags & ANYOF_WARN_SUPER)
7560 && ckWARN_d(WARN_NON_UNICODE))
7562 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7563 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7567 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7568 return cBOOL(flags & ANYOF_INVERT) ^ match;
7572 S_reghop3(U8 *s, I32 off, const U8* lim)
7574 /* return the position 'off' UTF-8 characters away from 's', forward if
7575 * 'off' >= 0, backwards if negative. But don't go outside of position
7576 * 'lim', which better be < s if off < 0 */
7580 PERL_ARGS_ASSERT_REGHOP3;
7583 while (off-- && s < lim) {
7584 /* XXX could check well-formedness here */
7589 while (off++ && s > lim) {
7591 if (UTF8_IS_CONTINUED(*s)) {
7592 while (s > lim && UTF8_IS_CONTINUATION(*s))
7595 /* XXX could check well-formedness here */
7602 /* there are a bunch of places where we use two reghop3's that should
7603 be replaced with this routine. but since thats not done yet
7604 we ifdef it out - dmq
7607 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7611 PERL_ARGS_ASSERT_REGHOP4;
7614 while (off-- && s < rlim) {
7615 /* XXX could check well-formedness here */
7620 while (off++ && s > llim) {
7622 if (UTF8_IS_CONTINUED(*s)) {
7623 while (s > llim && UTF8_IS_CONTINUATION(*s))
7626 /* XXX could check well-formedness here */
7634 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7638 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7641 while (off-- && s < lim) {
7642 /* XXX could check well-formedness here */
7649 while (off++ && s > lim) {
7651 if (UTF8_IS_CONTINUED(*s)) {
7652 while (s > lim && UTF8_IS_CONTINUATION(*s))
7655 /* XXX could check well-formedness here */
7664 restore_pos(pTHX_ void *arg)
7667 regexp * const rex = (regexp *)arg;
7668 if (PL_reg_state.re_state_eval_setup_done) {
7669 if (PL_reg_oldsaved) {
7670 rex->subbeg = PL_reg_oldsaved;
7671 rex->sublen = PL_reg_oldsavedlen;
7672 rex->suboffset = PL_reg_oldsavedoffset;
7673 rex->subcoffset = PL_reg_oldsavedcoffset;
7675 rex->saved_copy = PL_nrs;
7677 RXp_MATCH_COPIED_on(rex);
7679 PL_reg_magic->mg_len = PL_reg_oldpos;
7680 PL_reg_state.re_state_eval_setup_done = FALSE;
7681 PL_curpm = PL_reg_oldcurpm;
7686 S_to_utf8_substr(pTHX_ regexp *prog)
7688 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7689 * on the converted value */
7693 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7696 if (prog->substrs->data[i].substr
7697 && !prog->substrs->data[i].utf8_substr) {
7698 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7699 prog->substrs->data[i].utf8_substr = sv;
7700 sv_utf8_upgrade(sv);
7701 if (SvVALID(prog->substrs->data[i].substr)) {
7702 if (SvTAIL(prog->substrs->data[i].substr)) {
7703 /* Trim the trailing \n that fbm_compile added last
7705 SvCUR_set(sv, SvCUR(sv) - 1);
7706 /* Whilst this makes the SV technically "invalid" (as its
7707 buffer is no longer followed by "\0") when fbm_compile()
7708 adds the "\n" back, a "\0" is restored. */
7709 fbm_compile(sv, FBMcf_TAIL);
7713 if (prog->substrs->data[i].substr == prog->check_substr)
7714 prog->check_utf8 = sv;
7720 S_to_byte_substr(pTHX_ regexp *prog)
7722 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7723 * on the converted value; returns FALSE if can't be converted. */
7728 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7731 if (prog->substrs->data[i].utf8_substr
7732 && !prog->substrs->data[i].substr) {
7733 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7734 if (! sv_utf8_downgrade(sv, TRUE)) {
7737 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7738 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7739 /* Trim the trailing \n that fbm_compile added last
7741 SvCUR_set(sv, SvCUR(sv) - 1);
7742 fbm_compile(sv, FBMcf_TAIL);
7746 prog->substrs->data[i].substr = sv;
7747 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7748 prog->check_substr = sv;
7757 * c-indentation-style: bsd
7759 * indent-tabs-mode: nil
7762 * ex: set ts=8 sts=4 sw=4 et: