5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "inline_invlist.c"
84 #include "unicode_constants.h"
87 /* At least one required character in the target string is expressible only in
89 static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
93 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define STATIC static
104 /* Valid only for non-utf8 strings: avoids the reginclass
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
107 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
117 #define HOPc(pos,off) \
118 (char *)(reginfo->is_utf8_target \
119 ? reghop3((U8*)pos, off, \
120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
122 #define HOPBACKc(pos, off) \
123 (char*)(reginfo->is_utf8_target \
124 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
125 : (pos - off >= reginfo->strbeg) \
129 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
130 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
132 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
133 * off must be >=0; args should be vars rather than expressions */
134 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
135 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
136 : (U8*)((pos + off) > lim ? lim : (pos + off)))
138 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
139 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
141 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
143 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
144 #define NEXTCHR_IS_EOS (nextchr < 0)
146 #define SET_nextchr \
147 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
149 #define SET_locinput(p) \
154 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
156 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
157 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
158 1, 0, invlist, &flags); \
163 /* If in debug mode, we test that a known character properly matches */
165 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
168 utf8_char_in_property) \
169 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
170 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
172 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
175 utf8_char_in_property) \
176 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
179 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
180 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
182 PL_XPosix_ptrs[_CC_WORDCHAR], \
183 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
185 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
187 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
188 "_X_regular_begin", \
190 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
191 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
194 COMBINING_GRAVE_ACCENT_UTF8); \
197 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
198 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
200 /* for use after a quantifier and before an EXACT-like node -- japhy */
201 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
203 * NOTE that *nothing* that affects backtracking should be in here, specifically
204 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
205 * node that is in between two EXACT like nodes when ascertaining what the required
206 * "follow" character is. This should probably be moved to regex compile time
207 * although it may be done at run time beause of the REF possibility - more
208 * investigation required. -- demerphq
210 #define JUMPABLE(rn) ( \
212 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
214 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
215 OP(rn) == PLUS || OP(rn) == MINMOD || \
217 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
219 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
221 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
224 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
225 we don't need this definition. */
226 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
227 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
228 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
231 /* ... so we use this as its faster. */
232 #define IS_TEXT(rn) ( OP(rn)==EXACT )
233 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
234 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
235 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
240 Search for mandatory following text node; for lookahead, the text must
241 follow but for lookbehind (rn->flags != 0) we skip to the next step.
243 #define FIND_NEXT_IMPT(rn) STMT_START { \
244 while (JUMPABLE(rn)) { \
245 const OPCODE type = OP(rn); \
246 if (type == SUSPEND || PL_regkind[type] == CURLY) \
247 rn = NEXTOPER(NEXTOPER(rn)); \
248 else if (type == PLUS) \
250 else if (type == IFMATCH) \
251 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
252 else rn += NEXT_OFF(rn); \
256 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
257 * These are for the pre-composed Hangul syllables, which are all in a
258 * contiguous block and arranged there in such a way so as to facilitate
259 * alorithmic determination of their characteristics. As such, they don't need
260 * a swash, but can be determined by simple arithmetic. Almost all are
261 * GCB=LVT, but every 28th one is a GCB=LV */
262 #define SBASE 0xAC00 /* Start of block */
263 #define SCount 11172 /* Length of block */
266 #define SLAB_FIRST(s) (&(s)->states[0])
267 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
269 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
270 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
271 static regmatch_state * S_push_slab(pTHX);
273 #define REGCP_PAREN_ELEMS 3
274 #define REGCP_OTHER_ELEMS 3
275 #define REGCP_FRAME_ELEMS 1
276 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
277 * are needed for the regexp context stack bookkeeping. */
280 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
283 const int retval = PL_savestack_ix;
284 const int paren_elems_to_push =
285 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
286 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
287 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
289 GET_RE_DEBUG_FLAGS_DECL;
291 PERL_ARGS_ASSERT_REGCPPUSH;
293 if (paren_elems_to_push < 0)
294 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
295 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
297 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
298 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
299 " out of range (%lu-%ld)",
301 (unsigned long)maxopenparen,
304 SSGROW(total_elems + REGCP_FRAME_ELEMS);
307 if ((int)maxopenparen > (int)parenfloor)
308 PerlIO_printf(Perl_debug_log,
309 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
314 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
315 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
316 SSPUSHIV(rex->offs[p].end);
317 SSPUSHIV(rex->offs[p].start);
318 SSPUSHINT(rex->offs[p].start_tmp);
319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
320 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
322 (IV)rex->offs[p].start,
323 (IV)rex->offs[p].start_tmp,
327 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
328 SSPUSHINT(maxopenparen);
329 SSPUSHINT(rex->lastparen);
330 SSPUSHINT(rex->lastcloseparen);
331 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
336 /* These are needed since we do not localize EVAL nodes: */
337 #define REGCP_SET(cp) \
339 PerlIO_printf(Perl_debug_log, \
340 " Setting an EVAL scope, savestack=%"IVdf"\n", \
341 (IV)PL_savestack_ix)); \
344 #define REGCP_UNWIND(cp) \
346 if (cp != PL_savestack_ix) \
347 PerlIO_printf(Perl_debug_log, \
348 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
349 (IV)(cp), (IV)PL_savestack_ix)); \
352 #define UNWIND_PAREN(lp, lcp) \
353 for (n = rex->lastparen; n > lp; n--) \
354 rex->offs[n].end = -1; \
355 rex->lastparen = n; \
356 rex->lastcloseparen = lcp;
360 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
365 GET_RE_DEBUG_FLAGS_DECL;
367 PERL_ARGS_ASSERT_REGCPPOP;
369 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
371 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
372 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
373 rex->lastcloseparen = SSPOPINT;
374 rex->lastparen = SSPOPINT;
375 *maxopenparen_p = SSPOPINT;
377 i -= REGCP_OTHER_ELEMS;
378 /* Now restore the parentheses context. */
380 if (i || rex->lastparen + 1 <= rex->nparens)
381 PerlIO_printf(Perl_debug_log,
382 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
387 paren = *maxopenparen_p;
388 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
390 rex->offs[paren].start_tmp = SSPOPINT;
391 rex->offs[paren].start = SSPOPIV;
393 if (paren <= rex->lastparen)
394 rex->offs[paren].end = tmps;
395 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
396 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
398 (IV)rex->offs[paren].start,
399 (IV)rex->offs[paren].start_tmp,
400 (IV)rex->offs[paren].end,
401 (paren > rex->lastparen ? "(skipped)" : ""));
406 /* It would seem that the similar code in regtry()
407 * already takes care of this, and in fact it is in
408 * a better location to since this code can #if 0-ed out
409 * but the code in regtry() is needed or otherwise tests
410 * requiring null fields (pat.t#187 and split.t#{13,14}
411 * (as of patchlevel 7877) will fail. Then again,
412 * this code seems to be necessary or otherwise
413 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
414 * --jhi updated by dapm */
415 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
416 if (i > *maxopenparen_p)
417 rex->offs[i].start = -1;
418 rex->offs[i].end = -1;
419 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
420 " \\%"UVuf": %s ..-1 undeffing\n",
422 (i > *maxopenparen_p) ? "-1" : " "
428 /* restore the parens and associated vars at savestack position ix,
429 * but without popping the stack */
432 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
434 I32 tmpix = PL_savestack_ix;
435 PL_savestack_ix = ix;
436 regcppop(rex, maxopenparen_p);
437 PL_savestack_ix = tmpix;
440 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
443 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
445 /* Returns a boolean as to whether or not 'character' is a member of the
446 * Posix character class given by 'classnum' that should be equivalent to a
447 * value in the typedef '_char_class_number'.
449 * Ideally this could be replaced by a just an array of function pointers
450 * to the C library functions that implement the macros this calls.
451 * However, to compile, the precise function signatures are required, and
452 * these may vary from platform to to platform. To avoid having to figure
453 * out what those all are on each platform, I (khw) am using this method,
454 * which adds an extra layer of function call overhead (unless the C
455 * optimizer strips it away). But we don't particularly care about
456 * performance with locales anyway. */
458 switch ((_char_class_number) classnum) {
459 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
460 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
461 case _CC_ENUM_ASCII: return isASCII_LC(character);
462 case _CC_ENUM_BLANK: return isBLANK_LC(character);
463 case _CC_ENUM_CASED: return isLOWER_LC(character)
464 || isUPPER_LC(character);
465 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
466 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
467 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
468 case _CC_ENUM_LOWER: return isLOWER_LC(character);
469 case _CC_ENUM_PRINT: return isPRINT_LC(character);
470 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
471 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
472 case _CC_ENUM_SPACE: return isSPACE_LC(character);
473 case _CC_ENUM_UPPER: return isUPPER_LC(character);
474 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
475 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
476 default: /* VERTSPACE should never occur in locales */
477 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
480 assert(0); /* NOTREACHED */
485 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
487 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
488 * 'character' is a member of the Posix character class given by 'classnum'
489 * that should be equivalent to a value in the typedef
490 * '_char_class_number'.
492 * This just calls isFOO_lc on the code point for the character if it is in
493 * the range 0-255. Outside that range, all characters avoid Unicode
494 * rules, ignoring any locale. So use the Unicode function if this class
495 * requires a swash, and use the Unicode macro otherwise. */
497 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
499 if (UTF8_IS_INVARIANT(*character)) {
500 return isFOO_lc(classnum, *character);
502 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
503 return isFOO_lc(classnum,
504 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
507 if (classnum < _FIRST_NON_SWASH_CC) {
509 /* Initialize the swash unless done already */
510 if (! PL_utf8_swash_ptrs[classnum]) {
511 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
512 PL_utf8_swash_ptrs[classnum] =
513 _core_swash_init("utf8",
516 PL_XPosix_ptrs[classnum], &flags);
519 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
521 TRUE /* is UTF */ ));
524 switch ((_char_class_number) classnum) {
526 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
528 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
529 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
530 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
531 default: return 0; /* Things like CNTRL are always
535 assert(0); /* NOTREACHED */
540 * pregexec and friends
543 #ifndef PERL_IN_XSUB_RE
545 - pregexec - match a regexp against a string
548 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
549 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
550 /* stringarg: the point in the string at which to begin matching */
551 /* strend: pointer to null at end of string */
552 /* strbeg: real beginning of string */
553 /* minend: end of match must be >= minend bytes after stringarg. */
554 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
555 * itself is accessed via the pointers above */
556 /* nosave: For optimizations. */
558 PERL_ARGS_ASSERT_PREGEXEC;
561 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
562 nosave ? 0 : REXEC_COPY_STR);
567 * Need to implement the following flags for reg_anch:
569 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
571 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
572 * INTUIT_AUTORITATIVE_ML
573 * INTUIT_ONCE_NOML - Intuit can match in one location only.
576 * Another flag for this function: SECOND_TIME (so that float substrs
577 * with giant delta may be not rechecked).
580 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
581 Otherwise, only SvCUR(sv) is used to get strbeg. */
583 /* XXXX Some places assume that there is a fixed substring.
584 An update may be needed if optimizer marks as "INTUITable"
585 RExen without fixed substrings. Similarly, it is assumed that
586 lengths of all the strings are no more than minlen, thus they
587 cannot come from lookahead.
588 (Or minlen should take into account lookahead.)
589 NOTE: Some of this comment is not correct. minlen does now take account
590 of lookahead/behind. Further research is required. -- demerphq
594 /* A failure to find a constant substring means that there is no need to make
595 an expensive call to REx engine, thus we celebrate a failure. Similarly,
596 finding a substring too deep into the string means that fewer calls to
597 regtry() should be needed.
599 REx compiler's optimizer found 4 possible hints:
600 a) Anchored substring;
602 c) Whether we are anchored (beginning-of-line or \G);
603 d) First node (of those at offset 0) which may distinguish positions;
604 We use a)b)d) and multiline-part of c), and try to find a position in the
605 string which does not contradict any of them.
608 /* Most of decisions we do here should have been done at compile time.
609 The nodes of the REx which we used for the search should have been
610 deleted from the finite automaton. */
613 * rx: the regex to match against
614 * sv: the SV being matched: only used for utf8 flag; the string
615 * itself is accessed via the pointers below. Note that on
616 * something like an overloaded SV, SvPOK(sv) may be false
617 * and the string pointers may point to something unrelated to
619 * strbeg: real beginning of string
620 * strpos: the point in the string at which to begin matching
621 * strend: pointer to the byte following the last char of the string
622 * flags currently unused; set to 0
623 * data: currently unused; set to NULL
627 Perl_re_intuit_start(pTHX_
630 const char * const strbeg,
634 re_scream_pos_data *data)
637 struct regexp *const prog = ReANY(rx);
638 SSize_t start_shift = 0;
639 /* Should be nonnegative! */
640 SSize_t end_shift = 0;
641 /* current lowest pos in string where the regex can start matching */
642 char *rx_origin = strpos;
644 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
645 U8 other_ix = 1 - prog->substrs->check_ix;
647 char *other_last = strpos;/* latest pos 'other' substr already checked to */
648 char *check_at = NULL; /* check substr found at this pos */
649 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
650 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
651 RXi_GET_DECL(prog,progi);
652 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
653 regmatch_info *const reginfo = ®info_buf;
654 GET_RE_DEBUG_FLAGS_DECL;
656 PERL_ARGS_ASSERT_RE_INTUIT_START;
657 PERL_UNUSED_ARG(flags);
658 PERL_UNUSED_ARG(data);
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
661 "Intuit: trying to determine minimum start position...\n"));
663 /* for now, assume that all substr offsets are positive. If at some point
664 * in the future someone wants to do clever things with look-behind and
665 * -ve offsets, they'll need to fix up any code in this function
666 * which uses these offsets. See the thread beginning
667 * <20140113145929.GF27210@iabyn.com>
669 assert(prog->substrs->data[0].min_offset >= 0);
670 assert(prog->substrs->data[0].max_offset >= 0);
671 assert(prog->substrs->data[1].min_offset >= 0);
672 assert(prog->substrs->data[1].max_offset >= 0);
673 assert(prog->substrs->data[2].min_offset >= 0);
674 assert(prog->substrs->data[2].max_offset >= 0);
676 /* for now, assume that if both present, that the floating substring
677 * follows the anchored substring, and that they don't overlap.
678 * If you break this assumption (e.g. doing better optimisations
679 * with lookahead/behind), then you'll need to audit the code in this
680 * function carefully first
683 ! ( (prog->anchored_utf8 || prog->anchored_substr)
684 && (prog->float_utf8 || prog->float_substr))
685 || (prog->float_min_offset >= prog->anchored_offset));
687 /* CHR_DIST() would be more correct here but it makes things slow. */
688 if (prog->minlen > strend - strpos) {
689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
690 " String too short...\n"));
694 reginfo->is_utf8_target = cBOOL(utf8_target);
695 reginfo->info_aux = NULL;
696 reginfo->strbeg = strbeg;
697 reginfo->strend = strend;
698 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
700 /* not actually used within intuit, but zero for safety anyway */
701 reginfo->poscache_maxiter = 0;
704 if (!prog->check_utf8 && prog->check_substr)
705 to_utf8_substr(prog);
706 check = prog->check_utf8;
708 if (!prog->check_substr && prog->check_utf8) {
709 if (! to_byte_substr(prog)) {
710 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
713 check = prog->check_substr;
716 /* dump the various substring data */
717 DEBUG_OPTIMISE_MORE_r({
719 for (i=0; i<=2; i++) {
720 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
721 : prog->substrs->data[i].substr);
725 PerlIO_printf(Perl_debug_log,
726 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
727 " useful=%"IVdf" utf8=%d [%s]\n",
729 (IV)prog->substrs->data[i].min_offset,
730 (IV)prog->substrs->data[i].max_offset,
731 (IV)prog->substrs->data[i].end_shift,
738 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
739 /* Check after \n? */
740 ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
741 || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
744 /* we are only allowed to match at BOS or \G */
746 /* trivially reject if there's a BOS anchor and we're not at BOS.
747 * In the case of \G, we hope(!) that the caller has already
748 * set strpos to pos()-gofs, and will already have checked
749 * that this anchor position is legal. So we can skip it here.
751 if ( !(prog->intflags & PREGf_ANCH_GPOS)
752 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
753 && (strpos != strbeg))
755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
756 " Not at start...\n"));
760 /* in the presence of an anchor, the anchored (relative to the
761 * start of the regex) substr must also be anchored relative
762 * to strpos. So quickly reject if substr isn't found there */
764 if (prog->check_offset_min == prog->check_offset_max
765 && !(prog->intflags & PREGf_CANY_SEEN)
766 && ! multiline) /* /m can cause \n's to match that aren't
767 accounted for in the string max length.
768 See [perl #115242] */
770 /* Substring at constant offset from beg-of-str... */
771 SSize_t slen = SvCUR(check);
774 s = HOP3c(strpos, prog->check_offset_min, strend);
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
777 " Looking for check substr at fixed offset %"IVdf"...\n",
778 (IV)prog->check_offset_min));
781 /* In this case, the regex is anchored at the end too,
782 * so the lengths must match exactly, give or take a \n.
783 * NB: slen >= 1 since the last char of check is \n */
784 if ( strend - s > slen || strend - s < slen - 1
785 || (strend - s == slen && strend[-1] != '\n'))
787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
788 " String too long...\n"));
791 /* Now should match s[0..slen-2] */
794 if (slen && (*SvPVX_const(check) != *s
795 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
798 " String not equal...\n"));
803 goto success_at_start;
808 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
809 end_shift = prog->check_end_shift;
811 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
813 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
814 (IV)end_shift, RX_PRECOMP(prog));
818 /* Find a candidate regex origin in the region rx_origin..strend
819 * by looking for the "check" substring in that region, corrected by
827 DEBUG_OPTIMISE_MORE_r({
828 PerlIO_printf(Perl_debug_log,
829 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
830 " Start shift: %"IVdf" End shift %"IVdf
831 " Real end Shift: %"IVdf"\n",
832 (IV)(rx_origin - strpos),
833 (IV)prog->check_offset_min,
836 (IV)prog->check_end_shift);
839 if (prog->intflags & PREGf_CANY_SEEN) {
840 start_point= (U8*)(rx_origin + start_shift);
841 end_point= (U8*)(strend - end_shift);
843 start_point= HOP3(rx_origin, start_shift, strend);
844 end_point= HOP3(strend, -end_shift, strbeg);
847 /* if the regex is absolutely anchored to the start of the string,
848 * then check_offset_max represents an upper bound on the string
849 * where the substr could start */
851 && prog->intflags & PREGf_ANCH
852 && prog->check_offset_max != SSize_t_MAX
853 && start_shift < prog->check_offset_max)
855 SSize_t len = SvCUR(check) - !!SvTAIL(check);
856 end_point = HOP3lim(start_point,
857 prog->check_offset_max - start_shift,
862 DEBUG_OPTIMISE_MORE_r({
863 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
864 (int)(end_point - start_point),
865 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
869 check_at = fbm_instr( start_point, end_point,
870 check, multiline ? FBMrf_MULTILINE : 0);
873 /* Update the count-of-usability, remove useless subpatterns,
877 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
878 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
879 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
880 (check_at ? "Found" : "Did not find"),
881 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
882 ? "anchored" : "floating"),
885 (check_at ? " at offset " : "...\n") );
890 /* Finish the diagnostic message */
891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
893 /* set rx_origin to the minimum position where the regex could start
894 * matching, given the constraint of the just-matched check substring.
895 * But don't set it lower than previously.
898 if (check_at - rx_origin > prog->check_offset_max)
899 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
902 /* XXX dmq: first branch is for positive lookbehind...
903 Our check string is offset from the beginning of the pattern.
904 So we need to do any stclass tests offset forward from that
908 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
909 Start with the other substr.
910 XXXX no SCREAM optimization yet - and a very coarse implementation
911 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
912 *always* match. Probably should be marked during compile...
913 Probably it is right to do no SCREAM here...
916 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
917 : prog->substrs->data[other_ix].substr)
919 /* Take into account the "other" substring. */
923 struct reg_substr_datum *other;
926 other = &prog->substrs->data[other_ix];
928 /* if "other" is anchored:
929 * we've previously found a floating substr starting at check_at.
930 * This means that the regex origin must lie somewhere
931 * between min (rx_origin): HOP3(check_at, -check_offset_max)
932 * and max: HOP3(check_at, -check_offset_min)
933 * (except that min will be >= strpos)
934 * So the fixed substr must lie somewhere between
935 * HOP3(min, anchored_offset)
936 * HOP3(max, anchored_offset) + SvCUR(substr)
939 /* if "other" is floating
940 * Calculate last1, the absolute latest point where the
941 * floating substr could start in the string, ignoring any
942 * constraints from the earlier fixed match. It is calculated
945 * strend - prog->minlen (in chars) is the absolute latest
946 * position within the string where the origin of the regex
947 * could appear. The latest start point for the floating
948 * substr is float_min_offset(*) on from the start of the
949 * regex. last1 simply combines thee two offsets.
951 * (*) You might think the latest start point should be
952 * float_max_offset from the regex origin, and technically
953 * you'd be correct. However, consider
955 * Here, float min, max are 3,5 and minlen is 7.
956 * This can match either
960 * In the first case, the regex matches minlen chars; in the
961 * second, minlen+1, in the third, minlen+2.
962 * In the first case, the floating offset is 3 (which equals
963 * float_min), in the second, 4, and in the third, 5 (which
964 * equals float_max). In all cases, the floating string bcd
965 * can never start more than 4 chars from the end of the
966 * string, which equals minlen - float_min. As the substring
967 * starts to match more than float_min from the start of the
968 * regex, it makes the regex match more than minlen chars,
969 * and the two cancel each other out. So we can always use
970 * float_min - minlen, rather than float_max - minlen for the
971 * latest position in the string.
973 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
974 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
977 assert(prog->minlen >= other->min_offset);
978 last1 = HOP3c(strend,
979 other->min_offset - prog->minlen, strbeg);
981 if (other_ix) {/* i.e. if (other-is-float) */
982 /* last is the latest point where the floating substr could
983 * start, *given* any constraints from the earlier fixed
984 * match. This constraint is that the floating string starts
985 * <= float_max_offset chars from the regex origin (rx_origin).
986 * If this value is less than last1, use it instead.
988 assert(rx_origin <= last1);
990 /* this condition handles the offset==infinity case, and
991 * is a short-cut otherwise. Although it's comparing a
992 * byte offset to a char length, it does so in a safe way,
993 * since 1 char always occupies 1 or more bytes,
994 * so if a string range is (last1 - rx_origin) bytes,
995 * it will be less than or equal to (last1 - rx_origin)
996 * chars; meaning it errs towards doing the accurate HOP3
997 * rather than just using last1 as a short-cut */
998 (last1 - rx_origin) < other->max_offset
1000 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1003 assert(strpos + start_shift <= check_at);
1004 last = HOP4c(check_at, other->min_offset - start_shift,
1008 s = HOP3c(rx_origin, other->min_offset, strend);
1009 if (s < other_last) /* These positions already checked */
1012 must = utf8_target ? other->utf8_substr : other->substr;
1013 assert(SvPOK(must));
1016 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1018 multiline ? FBMrf_MULTILINE : 0
1021 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1022 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1023 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1024 s ? "Found" : "Contradicts",
1025 other_ix ? "floating" : "anchored",
1026 quoted, RE_SV_TAIL(must));
1031 /* last1 is latest possible substr location. If we didn't
1032 * find it before there, we never will */
1033 if (last >= last1) {
1034 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1035 ", giving up...\n"));
1039 /* try to find the check substr again at a later
1040 * position. Maybe next time we'll find the "other" substr
1042 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1043 ", trying %s at offset %ld...\n",
1044 (other_ix ? "floating" : "anchored"),
1045 (long)(HOP3c(check_at, 1, strend) - strpos)));
1047 other_last = HOP3c(last, 1, strend) /* highest failure */;
1049 other_ix /* i.e. if other-is-float */
1050 ? HOP3c(rx_origin, 1, strend)
1051 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1055 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1056 (long)(s - strpos)));
1058 if (other_ix) { /* if (other-is-float) */
1059 /* other_last is set to s, not s+1, since its possible for
1060 * a floating substr to fail first time, then succeed
1061 * second time at the same floating position; e.g.:
1062 * "-AB--AABZ" =~ /\wAB\d*Z/
1063 * The first time round, anchored and float match at
1064 * "-(AB)--AAB(Z)" then fail on the initial \w character
1065 * class. Second time round, they match at "-AB--A(AB)(Z)".
1070 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1071 other_last = HOP3c(s, 1, strend);
1076 DEBUG_OPTIMISE_MORE_r(
1077 PerlIO_printf(Perl_debug_log,
1078 " Check-only match: offset min:%"IVdf" max:%"IVdf
1079 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1080 " strend-strpos:%"IVdf"\n",
1081 (IV)prog->check_offset_min,
1082 (IV)prog->check_offset_max,
1083 (IV)(check_at-strpos),
1084 (IV)(rx_origin-strpos),
1085 (IV)(rx_origin-check_at),
1091 postprocess_substr_matches:
1093 /* handle the extra constraint of /^/m */
1095 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
1096 /* May be due to an implicit anchor of m{.*foo} */
1097 && !(prog->intflags & PREGf_IMPLICIT))
1101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1102 " looking for /^/m anchor"));
1104 /* we have failed the constraint of a \n before rx_origin.
1105 * Find the next \n, if any, even if it's beyond the current
1106 * anchored and/or floating substrings. Whether we should be
1107 * scanning ahead for the next \n or the next substr is debatable.
1108 * On the one hand you'd expect rare substrings to appear less
1109 * often than \n's. On the other hand, searching for \n means
1110 * we're effectively flipping been check_substr and "\n" on each
1111 * iteration as the current "rarest" string candidate, which
1112 * means for example that we'll quickly reject the whole string if
1113 * hasn't got a \n, rather than trying every substr position
1117 s = HOP3c(strend, - prog->minlen, strpos);
1118 if (s <= rx_origin ||
1119 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1122 " Did not find /%s^%s/m...\n",
1123 PL_colors[0], PL_colors[1]));
1127 /* earliest possible origin is 1 char after the \n.
1128 * (since *rx_origin == '\n', it's safe to ++ here rather than
1129 * HOP(rx_origin, 1)) */
1132 if (prog->substrs->check_ix == 0 /* check is anchored */
1133 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1135 /* Position contradicts check-string; either because
1136 * check was anchored (and thus has no wiggle room),
1137 * or check was float and rx_origin is above the float range */
1138 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1140 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1144 /* if we get here, the check substr must have been float,
1145 * is in range, and we may or may not have had an anchored
1146 * "other" substr which still contradicts */
1147 assert(prog->substrs->check_ix); /* check is float */
1149 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1150 /* whoops, the anchored "other" substr exists, so we still
1151 * contradict. On the other hand, the float "check" substr
1152 * didn't contradict, so just retry the anchored "other"
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1156 PL_colors[0], PL_colors[1],
1157 (long)(rx_origin - strpos),
1158 (long)(rx_origin - strpos + prog->anchored_offset)));
1159 goto do_other_substr;
1162 /* success: we don't contradict the found floating substring
1163 * (and there's no anchored substr). */
1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1165 " Found /%s^%s/m at offset %ld...\n",
1166 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1169 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1170 " Starting position does not contradict /%s^%s/m...\n",
1171 PL_colors[0], PL_colors[1]));
1175 /* Decide whether using the substrings helped */
1177 if (rx_origin != strpos) {
1178 /* Fixed substring is found far enough so that the match
1179 cannot start at strpos. */
1181 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1182 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1185 /* The found string does not prohibit matching at strpos,
1186 - no optimization of calling REx engine can be performed,
1187 unless it was an MBOL and we are not after MBOL,
1188 or a future STCLASS check will fail this. */
1190 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1192 prog->check_utf8 /* Could be deleted already */
1193 && --BmUSEFUL(prog->check_utf8) < 0
1194 && (prog->check_utf8 == prog->float_utf8)
1196 prog->check_substr /* Could be deleted already */
1197 && --BmUSEFUL(prog->check_substr) < 0
1198 && (prog->check_substr == prog->float_substr)
1201 /* If flags & SOMETHING - do not do it many times on the same match */
1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1203 /* XXX Does the destruction order has to change with utf8_target? */
1204 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1205 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1206 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1207 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1208 check = NULL; /* abort */
1209 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1210 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1211 if (prog->intflags & PREGf_IMPLICIT) {
1212 prog->intflags &= ~PREGf_ANCH_MBOL;
1213 /* maybe we have no anchors left after this... */
1214 if (!(prog->intflags & PREGf_ANCH))
1215 prog->extflags &= ~RXf_IS_ANCHORED;
1217 /* XXXX This is a remnant of the old implementation. It
1218 looks wasteful, since now INTUIT can use many
1219 other heuristics. */
1220 prog->extflags &= ~RXf_USE_INTUIT;
1221 /* XXXX What other flags might need to be cleared in this branch? */
1225 /* Last resort... */
1226 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1227 /* trie stclasses are too expensive to use here, we are better off to
1228 leave it to regmatch itself */
1229 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1230 /* minlen == 0 is possible if regstclass is \b or \B,
1231 and the fixed substr is ''$.
1232 Since minlen is already taken into account, rx_origin+1 is before strend;
1233 accidentally, minlen >= 1 guaranties no false positives at rx_origin + 1
1234 even for \b or \B. But (minlen? 1 : 0) below assumes that
1235 regstclass does not come from lookahead... */
1236 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1237 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1238 const U8* const str = (U8*)STRING(progi->regstclass);
1241 /* XXX this value could be pre-computed */
1242 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1243 ? (reginfo->is_utf8_pat
1244 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1245 : STR_LEN(progi->regstclass))
1248 char *s = rx_origin;
1249 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1250 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1251 else if (prog->float_substr || prog->float_utf8)
1252 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1256 if (checked_upto < s)
1258 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1259 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1260 " s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1261 (IV)start_shift, (IV)(check_at - strbeg),
1262 (IV)(s - strbeg), (IV)(endpos - strbeg),
1263 (IV)(checked_upto- strbeg)));
1266 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1272 const char *what = NULL;
1274 if (endpos == strend) {
1275 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1276 " Could not match STCLASS...\n") );
1279 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1280 " This position contradicts STCLASS...\n") );
1281 if ((prog->intflags & PREGf_ANCH) && !ml_anch)
1283 checked_upto = HOPBACKc(endpos, start_shift);
1284 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1285 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1286 /* Contradict one of substrings */
1287 if (prog->anchored_substr || prog->anchored_utf8) {
1288 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1289 DEBUG_EXECUTE_r( what = "anchored" );
1291 s = HOP3c(t, 1, strend);
1292 if (s + start_shift + end_shift > strend) {
1293 /* XXXX Should be taken into account earlier? */
1294 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1295 " Could not match STCLASS...\n") );
1301 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1302 " Looking for %s substr starting at offset %ld...\n",
1303 what, (long)(rx_origin + start_shift - strpos)) );
1306 /* Have both, check_string is floating */
1307 if (t + start_shift >= check_at) /* Contradicts floating=check */
1308 goto retry_floating_check;
1309 /* Recheck anchored substring, but not floating... */
1314 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1315 " Looking for anchored substr starting at offset %ld...\n",
1316 (long)(other_last - strpos)) );
1317 assert(prog->substrs->check_ix); /* other is float */
1318 goto do_other_substr;
1320 /* Another way we could have checked stclass at the
1321 current position only: */
1323 s = rx_origin = t + 1;
1326 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1327 " Looking for /%s^%s/m starting at offset %ld...\n",
1328 PL_colors[0], PL_colors[1],
1329 (long)(rx_origin - strpos)) );
1330 /* XXX DAPM I don't yet know why this is true, but the code
1331 * assumed it when it used to do goto try_at_offset */
1332 assert(rx_origin != strpos);
1333 goto postprocess_substr_matches;
1335 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1337 /* Check is floating substring. */
1338 retry_floating_check:
1339 t = check_at - start_shift;
1340 DEBUG_EXECUTE_r( what = "floating" );
1341 goto hop_and_restart;
1344 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1345 " By STCLASS: moving %ld --> %ld\n",
1346 (long)(t - strpos), (long)(s - strpos))
1350 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1351 " Does not contradict STCLASS...\n");
1356 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Intuit: %s%s:%s match at offset %ld\n",
1357 PL_colors[4], (check ? "Successfully guessed" : "Giving up"),
1358 PL_colors[5], (long)(rx_origin - strpos)) );
1361 fail_finish: /* Substring not found */
1362 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1363 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1365 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1366 PL_colors[4], PL_colors[5]));
1370 #define DECL_TRIE_TYPE(scan) \
1371 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1372 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1373 trie_type = ((scan->flags == EXACT) \
1374 ? (utf8_target ? trie_utf8 : trie_plain) \
1375 : (scan->flags == EXACTFA) \
1376 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1377 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1379 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1382 U8 flags = FOLD_FLAGS_FULL; \
1383 switch (trie_type) { \
1384 case trie_utf8_exactfa_fold: \
1385 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1386 /* FALL THROUGH */ \
1387 case trie_utf8_fold: \
1388 if ( foldlen>0 ) { \
1389 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1394 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1395 len = UTF8SKIP(uc); \
1396 skiplen = UNISKIP( uvc ); \
1397 foldlen -= skiplen; \
1398 uscan = foldbuf + skiplen; \
1401 case trie_latin_utf8_exactfa_fold: \
1402 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1403 /* FALL THROUGH */ \
1404 case trie_latin_utf8_fold: \
1405 if ( foldlen>0 ) { \
1406 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1412 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1413 skiplen = UNISKIP( uvc ); \
1414 foldlen -= skiplen; \
1415 uscan = foldbuf + skiplen; \
1419 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1426 charid = trie->charmap[ uvc ]; \
1430 if (widecharmap) { \
1431 SV** const svpp = hv_fetch(widecharmap, \
1432 (char*)&uvc, sizeof(UV), 0); \
1434 charid = (U16)SvIV(*svpp); \
1439 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1443 && (ln == 1 || folder(s, pat_string, ln)) \
1444 && (reginfo->intuit || regtry(reginfo, &s)) )\
1450 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1452 while (s < strend) { \
1458 #define REXEC_FBC_SCAN(CoDe) \
1460 while (s < strend) { \
1466 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1467 REXEC_FBC_UTF8_SCAN( \
1469 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1478 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1481 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1490 #define REXEC_FBC_TRYIT \
1491 if ((reginfo->intuit || regtry(reginfo, &s))) \
1494 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1495 if (utf8_target) { \
1496 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1499 REXEC_FBC_CLASS_SCAN(CoNd); \
1502 #define DUMP_EXEC_POS(li,s,doutf8) \
1503 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1507 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1508 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1509 tmp = TEST_NON_UTF8(tmp); \
1510 REXEC_FBC_UTF8_SCAN( \
1511 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1520 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1521 if (s == reginfo->strbeg) { \
1525 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1526 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1527 0, UTF8_ALLOW_DEFAULT); \
1530 LOAD_UTF8_CHARCLASS_ALNUM(); \
1531 REXEC_FBC_UTF8_SCAN( \
1532 if (tmp == ! (TeSt2_UtF8)) { \
1541 /* The only difference between the BOUND and NBOUND cases is that
1542 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1543 * NBOUND. This is accomplished by passing it in either the if or else clause,
1544 * with the other one being empty */
1545 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1546 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1548 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1549 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1551 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1552 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1554 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1555 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1558 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1559 * be passed in completely with the variable name being tested, which isn't
1560 * such a clean interface, but this is easier to read than it was before. We
1561 * are looking for the boundary (or non-boundary between a word and non-word
1562 * character. The utf8 and non-utf8 cases have the same logic, but the details
1563 * must be different. Find the "wordness" of the character just prior to this
1564 * one, and compare it with the wordness of this one. If they differ, we have
1565 * a boundary. At the beginning of the string, pretend that the previous
1566 * character was a new-line */
1567 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1568 if (utf8_target) { \
1571 else { /* Not utf8 */ \
1572 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1573 tmp = TEST_NON_UTF8(tmp); \
1575 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1584 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1587 /* We know what class REx starts with. Try to find this position... */
1588 /* if reginfo->intuit, its a dryrun */
1589 /* annoyingly all the vars in this routine have different names from their counterparts
1590 in regmatch. /grrr */
1593 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1594 const char *strend, regmatch_info *reginfo)
1597 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1598 char *pat_string; /* The pattern's exactish string */
1599 char *pat_end; /* ptr to end char of pat_string */
1600 re_fold_t folder; /* Function for computing non-utf8 folds */
1601 const U8 *fold_array; /* array for folding ords < 256 */
1607 I32 tmp = 1; /* Scratch variable? */
1608 const bool utf8_target = reginfo->is_utf8_target;
1609 UV utf8_fold_flags = 0;
1610 const bool is_utf8_pat = reginfo->is_utf8_pat;
1611 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1612 with a result inverts that result, as 0^1 =
1614 _char_class_number classnum;
1616 RXi_GET_DECL(prog,progi);
1618 PERL_ARGS_ASSERT_FIND_BYCLASS;
1620 /* We know what class it must start with. */
1624 REXEC_FBC_UTF8_CLASS_SCAN(
1625 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1628 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1633 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1640 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1641 assert(! is_utf8_pat);
1644 if (is_utf8_pat || utf8_target) {
1645 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1646 goto do_exactf_utf8;
1648 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1649 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1650 goto do_exactf_non_utf8; /* isn't dealt with by these */
1652 case EXACTF: /* This node only generated for non-utf8 patterns */
1653 assert(! is_utf8_pat);
1655 utf8_fold_flags = 0;
1656 goto do_exactf_utf8;
1658 fold_array = PL_fold;
1660 goto do_exactf_non_utf8;
1663 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1664 utf8_fold_flags = FOLDEQ_LOCALE;
1665 goto do_exactf_utf8;
1667 fold_array = PL_fold_locale;
1668 folder = foldEQ_locale;
1669 goto do_exactf_non_utf8;
1673 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1675 goto do_exactf_utf8;
1678 if (is_utf8_pat || utf8_target) {
1679 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1680 goto do_exactf_utf8;
1683 /* Any 'ss' in the pattern should have been replaced by regcomp,
1684 * so we don't have to worry here about this single special case
1685 * in the Latin1 range */
1686 fold_array = PL_fold_latin1;
1687 folder = foldEQ_latin1;
1691 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1692 are no glitches with fold-length differences
1693 between the target string and pattern */
1695 /* The idea in the non-utf8 EXACTF* cases is to first find the
1696 * first character of the EXACTF* node and then, if necessary,
1697 * case-insensitively compare the full text of the node. c1 is the
1698 * first character. c2 is its fold. This logic will not work for
1699 * Unicode semantics and the german sharp ss, which hence should
1700 * not be compiled into a node that gets here. */
1701 pat_string = STRING(c);
1702 ln = STR_LEN(c); /* length to match in octets/bytes */
1704 /* We know that we have to match at least 'ln' bytes (which is the
1705 * same as characters, since not utf8). If we have to match 3
1706 * characters, and there are only 2 availabe, we know without
1707 * trying that it will fail; so don't start a match past the
1708 * required minimum number from the far end */
1709 e = HOP3c(strend, -((SSize_t)ln), s);
1711 if (reginfo->intuit && e < s) {
1712 e = s; /* Due to minlen logic of intuit() */
1716 c2 = fold_array[c1];
1717 if (c1 == c2) { /* If char and fold are the same */
1718 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1721 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1729 /* If one of the operands is in utf8, we can't use the simpler folding
1730 * above, due to the fact that many different characters can have the
1731 * same fold, or portion of a fold, or different- length fold */
1732 pat_string = STRING(c);
1733 ln = STR_LEN(c); /* length to match in octets/bytes */
1734 pat_end = pat_string + ln;
1735 lnc = is_utf8_pat /* length to match in characters */
1736 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1739 /* We have 'lnc' characters to match in the pattern, but because of
1740 * multi-character folding, each character in the target can match
1741 * up to 3 characters (Unicode guarantees it will never exceed
1742 * this) if it is utf8-encoded; and up to 2 if not (based on the
1743 * fact that the Latin 1 folds are already determined, and the
1744 * only multi-char fold in that range is the sharp-s folding to
1745 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1746 * string character. Adjust lnc accordingly, rounding up, so that
1747 * if we need to match at least 4+1/3 chars, that really is 5. */
1748 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1749 lnc = (lnc + expansion - 1) / expansion;
1751 /* As in the non-UTF8 case, if we have to match 3 characters, and
1752 * only 2 are left, it's guaranteed to fail, so don't start a
1753 * match that would require us to go beyond the end of the string
1755 e = HOP3c(strend, -((SSize_t)lnc), s);
1757 if (reginfo->intuit && e < s) {
1758 e = s; /* Due to minlen logic of intuit() */
1761 /* XXX Note that we could recalculate e to stop the loop earlier,
1762 * as the worst case expansion above will rarely be met, and as we
1763 * go along we would usually find that e moves further to the left.
1764 * This would happen only after we reached the point in the loop
1765 * where if there were no expansion we should fail. Unclear if
1766 * worth the expense */
1769 char *my_strend= (char *)strend;
1770 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1771 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1772 && (reginfo->intuit || regtry(reginfo, &s)) )
1776 s += (utf8_target) ? UTF8SKIP(s) : 1;
1781 FBC_BOUND(isWORDCHAR_LC,
1782 isWORDCHAR_LC_uvchr(tmp),
1783 isWORDCHAR_LC_utf8((U8*)s));
1786 FBC_NBOUND(isWORDCHAR_LC,
1787 isWORDCHAR_LC_uvchr(tmp),
1788 isWORDCHAR_LC_utf8((U8*)s));
1791 FBC_BOUND(isWORDCHAR,
1792 isWORDCHAR_uni(tmp),
1793 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1796 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1798 isWORDCHAR_A((U8*)s));
1801 FBC_NBOUND(isWORDCHAR,
1802 isWORDCHAR_uni(tmp),
1803 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1806 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1808 isWORDCHAR_A((U8*)s));
1811 FBC_BOUND(isWORDCHAR_L1,
1812 isWORDCHAR_uni(tmp),
1813 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1816 FBC_NBOUND(isWORDCHAR_L1,
1817 isWORDCHAR_uni(tmp),
1818 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1821 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1822 is_LNBREAK_latin1_safe(s, strend)
1826 /* The argument to all the POSIX node types is the class number to pass to
1827 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1834 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1835 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1850 /* The complement of something that matches only ASCII matches all
1851 * UTF-8 variant code points, plus everything in ASCII that isn't
1853 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1854 || ! _generic_isCC_A(*s, FLAGS(c)));
1863 /* Don't need to worry about utf8, as it can match only a single
1864 * byte invariant character. */
1865 REXEC_FBC_CLASS_SCAN(
1866 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1874 if (! utf8_target) {
1875 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1881 classnum = (_char_class_number) FLAGS(c);
1882 if (classnum < _FIRST_NON_SWASH_CC) {
1883 while (s < strend) {
1885 /* We avoid loading in the swash as long as possible, but
1886 * should we have to, we jump to a separate loop. This
1887 * extra 'if' statement is what keeps this code from being
1888 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1889 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1890 goto found_above_latin1;
1892 if ((UTF8_IS_INVARIANT(*s)
1893 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1895 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1896 && to_complement ^ cBOOL(
1897 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1901 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1913 else switch (classnum) { /* These classes are implemented as
1915 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1916 revert the change of \v matching this */
1919 case _CC_ENUM_PSXSPC:
1920 REXEC_FBC_UTF8_CLASS_SCAN(
1921 to_complement ^ cBOOL(isSPACE_utf8(s)));
1924 case _CC_ENUM_BLANK:
1925 REXEC_FBC_UTF8_CLASS_SCAN(
1926 to_complement ^ cBOOL(isBLANK_utf8(s)));
1929 case _CC_ENUM_XDIGIT:
1930 REXEC_FBC_UTF8_CLASS_SCAN(
1931 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1934 case _CC_ENUM_VERTSPACE:
1935 REXEC_FBC_UTF8_CLASS_SCAN(
1936 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1939 case _CC_ENUM_CNTRL:
1940 REXEC_FBC_UTF8_CLASS_SCAN(
1941 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1945 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1946 assert(0); /* NOTREACHED */
1951 found_above_latin1: /* Here we have to load a swash to get the result
1952 for the current code point */
1953 if (! PL_utf8_swash_ptrs[classnum]) {
1954 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1955 PL_utf8_swash_ptrs[classnum] =
1956 _core_swash_init("utf8",
1959 PL_XPosix_ptrs[classnum], &flags);
1962 /* This is a copy of the loop above for swash classes, though using the
1963 * FBC macro instead of being expanded out. Since we've loaded the
1964 * swash, we don't have to check for that each time through the loop */
1965 REXEC_FBC_UTF8_CLASS_SCAN(
1966 to_complement ^ cBOOL(_generic_utf8(
1969 swash_fetch(PL_utf8_swash_ptrs[classnum],
1977 /* what trie are we using right now */
1978 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1979 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1980 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1982 const char *last_start = strend - trie->minlen;
1984 const char *real_start = s;
1986 STRLEN maxlen = trie->maxlen;
1988 U8 **points; /* map of where we were in the input string
1989 when reading a given char. For ASCII this
1990 is unnecessary overhead as the relationship
1991 is always 1:1, but for Unicode, especially
1992 case folded Unicode this is not true. */
1993 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1997 GET_RE_DEBUG_FLAGS_DECL;
1999 /* We can't just allocate points here. We need to wrap it in
2000 * an SV so it gets freed properly if there is a croak while
2001 * running the match */
2004 sv_points=newSV(maxlen * sizeof(U8 *));
2005 SvCUR_set(sv_points,
2006 maxlen * sizeof(U8 *));
2007 SvPOK_on(sv_points);
2008 sv_2mortal(sv_points);
2009 points=(U8**)SvPV_nolen(sv_points );
2010 if ( trie_type != trie_utf8_fold
2011 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2014 bitmap=(U8*)trie->bitmap;
2016 bitmap=(U8*)ANYOF_BITMAP(c);
2018 /* this is the Aho-Corasick algorithm modified a touch
2019 to include special handling for long "unknown char" sequences.
2020 The basic idea being that we use AC as long as we are dealing
2021 with a possible matching char, when we encounter an unknown char
2022 (and we have not encountered an accepting state) we scan forward
2023 until we find a legal starting char.
2024 AC matching is basically that of trie matching, except that when
2025 we encounter a failing transition, we fall back to the current
2026 states "fail state", and try the current char again, a process
2027 we repeat until we reach the root state, state 1, or a legal
2028 transition. If we fail on the root state then we can either
2029 terminate if we have reached an accepting state previously, or
2030 restart the entire process from the beginning if we have not.
2033 while (s <= last_start) {
2034 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2042 U8 *uscan = (U8*)NULL;
2043 U8 *leftmost = NULL;
2045 U32 accepted_word= 0;
2049 while ( state && uc <= (U8*)strend ) {
2051 U32 word = aho->states[ state ].wordnum;
2055 DEBUG_TRIE_EXECUTE_r(
2056 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2057 dump_exec_pos( (char *)uc, c, strend, real_start,
2058 (char *)uc, utf8_target );
2059 PerlIO_printf( Perl_debug_log,
2060 " Scanning for legal start char...\n");
2064 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2068 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2074 if (uc >(U8*)last_start) break;
2078 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2079 if (!leftmost || lpos < leftmost) {
2080 DEBUG_r(accepted_word=word);
2086 points[pointpos++ % maxlen]= uc;
2087 if (foldlen || uc < (U8*)strend) {
2088 REXEC_TRIE_READ_CHAR(trie_type, trie,
2090 uscan, len, uvc, charid, foldlen,
2092 DEBUG_TRIE_EXECUTE_r({
2093 dump_exec_pos( (char *)uc, c, strend,
2094 real_start, s, utf8_target);
2095 PerlIO_printf(Perl_debug_log,
2096 " Charid:%3u CP:%4"UVxf" ",
2108 word = aho->states[ state ].wordnum;
2110 base = aho->states[ state ].trans.base;
2112 DEBUG_TRIE_EXECUTE_r({
2114 dump_exec_pos( (char *)uc, c, strend, real_start,
2116 PerlIO_printf( Perl_debug_log,
2117 "%sState: %4"UVxf", word=%"UVxf,
2118 failed ? " Fail transition to " : "",
2119 (UV)state, (UV)word);
2125 ( ((offset = base + charid
2126 - 1 - trie->uniquecharcount)) >= 0)
2127 && ((U32)offset < trie->lasttrans)
2128 && trie->trans[offset].check == state
2129 && (tmp=trie->trans[offset].next))
2131 DEBUG_TRIE_EXECUTE_r(
2132 PerlIO_printf( Perl_debug_log," - legal\n"));
2137 DEBUG_TRIE_EXECUTE_r(
2138 PerlIO_printf( Perl_debug_log," - fail\n"));
2140 state = aho->fail[state];
2144 /* we must be accepting here */
2145 DEBUG_TRIE_EXECUTE_r(
2146 PerlIO_printf( Perl_debug_log," - accepting\n"));
2155 if (!state) state = 1;
2158 if ( aho->states[ state ].wordnum ) {
2159 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2160 if (!leftmost || lpos < leftmost) {
2161 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2166 s = (char*)leftmost;
2167 DEBUG_TRIE_EXECUTE_r({
2169 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2170 (UV)accepted_word, (IV)(s - real_start)
2173 if (reginfo->intuit || regtry(reginfo, &s)) {
2179 DEBUG_TRIE_EXECUTE_r({
2180 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2183 DEBUG_TRIE_EXECUTE_r(
2184 PerlIO_printf( Perl_debug_log,"No match.\n"));
2193 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2201 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2202 * flags have same meanings as with regexec_flags() */
2205 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2212 struct regexp *const prog = ReANY(rx);
2214 if (flags & REXEC_COPY_STR) {
2218 PerlIO_printf(Perl_debug_log,
2219 "Copy on write: regexp capture, type %d\n",
2222 /* Create a new COW SV to share the match string and store
2223 * in saved_copy, unless the current COW SV in saved_copy
2224 * is valid and suitable for our purpose */
2225 if (( prog->saved_copy
2226 && SvIsCOW(prog->saved_copy)
2227 && SvPOKp(prog->saved_copy)
2230 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2232 /* just reuse saved_copy SV */
2233 if (RXp_MATCH_COPIED(prog)) {
2234 Safefree(prog->subbeg);
2235 RXp_MATCH_COPIED_off(prog);
2239 /* create new COW SV to share string */
2240 RX_MATCH_COPY_FREE(rx);
2241 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2243 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2244 assert (SvPOKp(prog->saved_copy));
2245 prog->sublen = strend - strbeg;
2246 prog->suboffset = 0;
2247 prog->subcoffset = 0;
2252 SSize_t max = strend - strbeg;
2255 if ( (flags & REXEC_COPY_SKIP_POST)
2256 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2257 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2258 ) { /* don't copy $' part of string */
2261 /* calculate the right-most part of the string covered
2262 * by a capture. Due to look-ahead, this may be to
2263 * the right of $&, so we have to scan all captures */
2264 while (n <= prog->lastparen) {
2265 if (prog->offs[n].end > max)
2266 max = prog->offs[n].end;
2270 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2271 ? prog->offs[0].start
2273 assert(max >= 0 && max <= strend - strbeg);
2276 if ( (flags & REXEC_COPY_SKIP_PRE)
2277 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2278 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2279 ) { /* don't copy $` part of string */
2282 /* calculate the left-most part of the string covered
2283 * by a capture. Due to look-behind, this may be to
2284 * the left of $&, so we have to scan all captures */
2285 while (min && n <= prog->lastparen) {
2286 if ( prog->offs[n].start != -1
2287 && prog->offs[n].start < min)
2289 min = prog->offs[n].start;
2293 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2294 && min > prog->offs[0].end
2296 min = prog->offs[0].end;
2300 assert(min >= 0 && min <= max && min <= strend - strbeg);
2303 if (RX_MATCH_COPIED(rx)) {
2304 if (sublen > prog->sublen)
2306 (char*)saferealloc(prog->subbeg, sublen+1);
2309 prog->subbeg = (char*)safemalloc(sublen+1);
2310 Copy(strbeg + min, prog->subbeg, sublen, char);
2311 prog->subbeg[sublen] = '\0';
2312 prog->suboffset = min;
2313 prog->sublen = sublen;
2314 RX_MATCH_COPIED_on(rx);
2316 prog->subcoffset = prog->suboffset;
2317 if (prog->suboffset && utf8_target) {
2318 /* Convert byte offset to chars.
2319 * XXX ideally should only compute this if @-/@+
2320 * has been seen, a la PL_sawampersand ??? */
2322 /* If there's a direct correspondence between the
2323 * string which we're matching and the original SV,
2324 * then we can use the utf8 len cache associated with
2325 * the SV. In particular, it means that under //g,
2326 * sv_pos_b2u() will use the previously cached
2327 * position to speed up working out the new length of
2328 * subcoffset, rather than counting from the start of
2329 * the string each time. This stops
2330 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2331 * from going quadratic */
2332 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2333 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2334 SV_GMAGIC|SV_CONST_RETURN);
2336 prog->subcoffset = utf8_length((U8*)strbeg,
2337 (U8*)(strbeg+prog->suboffset));
2341 RX_MATCH_COPY_FREE(rx);
2342 prog->subbeg = strbeg;
2343 prog->suboffset = 0;
2344 prog->subcoffset = 0;
2345 prog->sublen = strend - strbeg;
2353 - regexec_flags - match a regexp against a string
2356 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2357 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2358 /* stringarg: the point in the string at which to begin matching */
2359 /* strend: pointer to null at end of string */
2360 /* strbeg: real beginning of string */
2361 /* minend: end of match must be >= minend bytes after stringarg. */
2362 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2363 * itself is accessed via the pointers above */
2364 /* data: May be used for some additional optimizations.
2365 Currently unused. */
2366 /* flags: For optimizations. See REXEC_* in regexp.h */
2370 struct regexp *const prog = ReANY(rx);
2374 SSize_t minlen; /* must match at least this many chars */
2375 SSize_t dontbother = 0; /* how many characters not to try at end */
2376 const bool utf8_target = cBOOL(DO_UTF8(sv));
2378 RXi_GET_DECL(prog,progi);
2379 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2380 regmatch_info *const reginfo = ®info_buf;
2381 regexp_paren_pair *swap = NULL;
2383 GET_RE_DEBUG_FLAGS_DECL;
2385 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2386 PERL_UNUSED_ARG(data);
2388 /* Be paranoid... */
2389 if (prog == NULL || stringarg == NULL) {
2390 Perl_croak(aTHX_ "NULL regexp parameter");
2395 debug_start_match(rx, utf8_target, stringarg, strend,
2399 startpos = stringarg;
2401 if (prog->intflags & PREGf_GPOS_SEEN) {
2404 /* set reginfo->ganch, the position where \G can match */
2407 (flags & REXEC_IGNOREPOS)
2408 ? stringarg /* use start pos rather than pos() */
2409 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2410 /* Defined pos(): */
2411 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2412 : strbeg; /* pos() not defined; use start of string */
2414 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2415 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2417 /* in the presence of \G, we may need to start looking earlier in
2418 * the string than the suggested start point of stringarg:
2419 * if prog->gofs is set, then that's a known, fixed minimum
2422 * /ab|c\G/: gofs = 1
2423 * or if the minimum offset isn't known, then we have to go back
2424 * to the start of the string, e.g. /w+\G/
2427 if (prog->intflags & PREGf_ANCH_GPOS) {
2428 startpos = reginfo->ganch - prog->gofs;
2430 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2432 DEBUG_r(PerlIO_printf(Perl_debug_log,
2433 "fail: ganch-gofs before earliest possible start\n"));
2437 else if (prog->gofs) {
2438 if (startpos - prog->gofs < strbeg)
2441 startpos -= prog->gofs;
2443 else if (prog->intflags & PREGf_GPOS_FLOAT)
2447 minlen = prog->minlen;
2448 if ((startpos + minlen) > strend || startpos < strbeg) {
2449 DEBUG_r(PerlIO_printf(Perl_debug_log,
2450 "Regex match can't succeed, so not even tried\n"));
2454 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2455 * which will call destuctors to reset PL_regmatch_state, free higher
2456 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2457 * regmatch_info_aux_eval */
2459 oldsave = PL_savestack_ix;
2463 if ((prog->extflags & RXf_USE_INTUIT)
2464 && !(flags & REXEC_CHECKED))
2466 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2471 if (prog->extflags & RXf_CHECK_ALL) {
2472 /* we can match based purely on the result of INTUIT.
2473 * Set up captures etc just for $& and $-[0]
2474 * (an intuit-only match wont have $1,$2,..) */
2475 assert(!prog->nparens);
2477 /* s/// doesn't like it if $& is earlier than where we asked it to
2478 * start searching (which can happen on something like /.\G/) */
2479 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2482 /* this should only be possible under \G */
2483 assert(prog->intflags & PREGf_GPOS_SEEN);
2484 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2485 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2489 /* match via INTUIT shouldn't have any captures.
2490 * Let @-, @+, $^N know */
2491 prog->lastparen = prog->lastcloseparen = 0;
2492 RX_MATCH_UTF8_set(rx, utf8_target);
2493 prog->offs[0].start = s - strbeg;
2494 prog->offs[0].end = utf8_target
2495 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2496 : s - strbeg + prog->minlenret;
2497 if ( !(flags & REXEC_NOT_FIRST) )
2498 S_reg_set_capture_string(aTHX_ rx,
2500 sv, flags, utf8_target);
2506 multiline = prog->extflags & RXf_PMf_MULTILINE;
2508 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2509 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2510 "String too short [regexec_flags]...\n"));
2514 /* Check validity of program. */
2515 if (UCHARAT(progi->program) != REG_MAGIC) {
2516 Perl_croak(aTHX_ "corrupted regexp program");
2519 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2520 reginfo->intuit = 0;
2521 reginfo->is_utf8_target = cBOOL(utf8_target);
2522 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2523 reginfo->warned = FALSE;
2524 reginfo->strbeg = strbeg;
2526 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2527 reginfo->strend = strend;
2528 /* see how far we have to get to not match where we matched before */
2529 reginfo->till = stringarg + minend;
2531 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2532 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2533 S_cleanup_regmatch_info_aux has executed (registered by
2534 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2535 magic belonging to this SV.
2536 Not newSVsv, either, as it does not COW.
2538 reginfo->sv = newSV(0);
2539 SvSetSV_nosteal(reginfo->sv, sv);
2540 SAVEFREESV(reginfo->sv);
2543 /* reserve next 2 or 3 slots in PL_regmatch_state:
2544 * slot N+0: may currently be in use: skip it
2545 * slot N+1: use for regmatch_info_aux struct
2546 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2547 * slot N+3: ready for use by regmatch()
2551 regmatch_state *old_regmatch_state;
2552 regmatch_slab *old_regmatch_slab;
2553 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2555 /* on first ever match, allocate first slab */
2556 if (!PL_regmatch_slab) {
2557 Newx(PL_regmatch_slab, 1, regmatch_slab);
2558 PL_regmatch_slab->prev = NULL;
2559 PL_regmatch_slab->next = NULL;
2560 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2563 old_regmatch_state = PL_regmatch_state;
2564 old_regmatch_slab = PL_regmatch_slab;
2566 for (i=0; i <= max; i++) {
2568 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2570 reginfo->info_aux_eval =
2571 reginfo->info_aux->info_aux_eval =
2572 &(PL_regmatch_state->u.info_aux_eval);
2574 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2575 PL_regmatch_state = S_push_slab(aTHX);
2578 /* note initial PL_regmatch_state position; at end of match we'll
2579 * pop back to there and free any higher slabs */
2581 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2582 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2583 reginfo->info_aux->poscache = NULL;
2585 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2587 if ((prog->extflags & RXf_EVAL_SEEN))
2588 S_setup_eval_state(aTHX_ reginfo);
2590 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2593 /* If there is a "must appear" string, look for it. */
2595 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2596 /* We have to be careful. If the previous successful match
2597 was from this regex we don't want a subsequent partially
2598 successful match to clobber the old results.
2599 So when we detect this possibility we add a swap buffer
2600 to the re, and switch the buffer each match. If we fail,
2601 we switch it back; otherwise we leave it swapped.
2604 /* do we need a save destructor here for eval dies? */
2605 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2606 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2607 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2614 /* Simplest case: anchored match need be tried only once. */
2615 /* [unless only anchor is BOL and multiline is set] */
2616 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2617 if (s == startpos && regtry(reginfo, &s))
2619 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2624 dontbother = minlen - 1;
2625 end = HOP3c(strend, -dontbother, strbeg) - 1;
2626 /* for multiline we only have to try after newlines */
2627 if (prog->check_substr || prog->check_utf8) {
2628 /* because of the goto we can not easily reuse the macros for bifurcating the
2629 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2632 goto after_try_utf8;
2634 if (regtry(reginfo, &s)) {
2641 if (prog->extflags & RXf_USE_INTUIT) {
2642 s = re_intuit_start(rx, sv, strbeg,
2643 s + UTF8SKIP(s), strend, flags, NULL);
2652 } /* end search for check string in unicode */
2654 if (s == startpos) {
2655 goto after_try_latin;
2658 if (regtry(reginfo, &s)) {
2665 if (prog->extflags & RXf_USE_INTUIT) {
2666 s = re_intuit_start(rx, sv, strbeg,
2667 s + 1, strend, flags, NULL);
2676 } /* end search for check string in latin*/
2677 } /* end search for check string */
2678 else { /* search for newline */
2680 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2683 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2684 while (s <= end) { /* note it could be possible to match at the end of the string */
2685 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2686 if (regtry(reginfo, &s))
2690 } /* end search for newline */
2691 } /* end anchored/multiline check string search */
2693 } else if (prog->intflags & PREGf_ANCH_GPOS)
2695 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2696 assert(prog->intflags & PREGf_GPOS_SEEN);
2697 /* For anchored \G, the only position it can match from is
2698 * (ganch-gofs); we already set startpos to this above; if intuit
2699 * moved us on from there, we can't possibly succeed */
2700 assert(startpos == reginfo->ganch - prog->gofs);
2701 if (s == startpos && regtry(reginfo, &s))
2706 /* Messy cases: unanchored match. */
2707 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2708 /* we have /x+whatever/ */
2709 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2715 if (! prog->anchored_utf8) {
2716 to_utf8_substr(prog);
2718 ch = SvPVX_const(prog->anchored_utf8)[0];
2721 DEBUG_EXECUTE_r( did_match = 1 );
2722 if (regtry(reginfo, &s)) goto got_it;
2724 while (s < strend && *s == ch)
2731 if (! prog->anchored_substr) {
2732 if (! to_byte_substr(prog)) {
2733 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2736 ch = SvPVX_const(prog->anchored_substr)[0];
2739 DEBUG_EXECUTE_r( did_match = 1 );
2740 if (regtry(reginfo, &s)) goto got_it;
2742 while (s < strend && *s == ch)
2747 DEBUG_EXECUTE_r(if (!did_match)
2748 PerlIO_printf(Perl_debug_log,
2749 "Did not find anchored character...\n")
2752 else if (prog->anchored_substr != NULL
2753 || prog->anchored_utf8 != NULL
2754 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2755 && prog->float_max_offset < strend - s)) {
2760 char *last1; /* Last position checked before */
2764 if (prog->anchored_substr || prog->anchored_utf8) {
2766 if (! prog->anchored_utf8) {
2767 to_utf8_substr(prog);
2769 must = prog->anchored_utf8;
2772 if (! prog->anchored_substr) {
2773 if (! to_byte_substr(prog)) {
2774 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2777 must = prog->anchored_substr;
2779 back_max = back_min = prog->anchored_offset;
2782 if (! prog->float_utf8) {
2783 to_utf8_substr(prog);
2785 must = prog->float_utf8;
2788 if (! prog->float_substr) {
2789 if (! to_byte_substr(prog)) {
2790 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2793 must = prog->float_substr;
2795 back_max = prog->float_max_offset;
2796 back_min = prog->float_min_offset;
2802 last = HOP3c(strend, /* Cannot start after this */
2803 -(SSize_t)(CHR_SVLEN(must)
2804 - (SvTAIL(must) != 0) + back_min), strbeg);
2806 if (s > reginfo->strbeg)
2807 last1 = HOPc(s, -1);
2809 last1 = s - 1; /* bogus */
2811 /* XXXX check_substr already used to find "s", can optimize if
2812 check_substr==must. */
2814 strend = HOPc(strend, -dontbother);
2815 while ( (s <= last) &&
2816 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
2817 (unsigned char*)strend, must,
2818 multiline ? FBMrf_MULTILINE : 0)) ) {
2819 DEBUG_EXECUTE_r( did_match = 1 );
2820 if (HOPc(s, -back_max) > last1) {
2821 last1 = HOPc(s, -back_min);
2822 s = HOPc(s, -back_max);
2825 char * const t = (last1 >= reginfo->strbeg)
2826 ? HOPc(last1, 1) : last1 + 1;
2828 last1 = HOPc(s, -back_min);
2832 while (s <= last1) {
2833 if (regtry(reginfo, &s))
2836 s++; /* to break out of outer loop */
2843 while (s <= last1) {
2844 if (regtry(reginfo, &s))
2850 DEBUG_EXECUTE_r(if (!did_match) {
2851 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2852 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2853 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2854 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2855 ? "anchored" : "floating"),
2856 quoted, RE_SV_TAIL(must));
2860 else if ( (c = progi->regstclass) ) {
2862 const OPCODE op = OP(progi->regstclass);
2863 /* don't bother with what can't match */
2864 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2865 strend = HOPc(strend, -(minlen - 1));
2868 SV * const prop = sv_newmortal();
2869 regprop(prog, prop, c, reginfo);
2871 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2873 PerlIO_printf(Perl_debug_log,
2874 "Matching stclass %.*s against %s (%d bytes)\n",
2875 (int)SvCUR(prop), SvPVX_const(prop),
2876 quoted, (int)(strend - s));
2879 if (find_byclass(prog, c, s, strend, reginfo))
2881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2885 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2893 if (! prog->float_utf8) {
2894 to_utf8_substr(prog);
2896 float_real = prog->float_utf8;
2899 if (! prog->float_substr) {
2900 if (! to_byte_substr(prog)) {
2901 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2904 float_real = prog->float_substr;
2907 little = SvPV_const(float_real, len);
2908 if (SvTAIL(float_real)) {
2909 /* This means that float_real contains an artificial \n on
2910 * the end due to the presence of something like this:
2911 * /foo$/ where we can match both "foo" and "foo\n" at the
2912 * end of the string. So we have to compare the end of the
2913 * string first against the float_real without the \n and
2914 * then against the full float_real with the string. We
2915 * have to watch out for cases where the string might be
2916 * smaller than the float_real or the float_real without
2918 char *checkpos= strend - len;
2920 PerlIO_printf(Perl_debug_log,
2921 "%sChecking for float_real.%s\n",
2922 PL_colors[4], PL_colors[5]));
2923 if (checkpos + 1 < strbeg) {
2924 /* can't match, even if we remove the trailing \n
2925 * string is too short to match */
2927 PerlIO_printf(Perl_debug_log,
2928 "%sString shorter than required trailing substring, cannot match.%s\n",
2929 PL_colors[4], PL_colors[5]));
2931 } else if (memEQ(checkpos + 1, little, len - 1)) {
2932 /* can match, the end of the string matches without the
2934 last = checkpos + 1;
2935 } else if (checkpos < strbeg) {
2936 /* cant match, string is too short when the "\n" is
2939 PerlIO_printf(Perl_debug_log,
2940 "%sString does not contain required trailing substring, cannot match.%s\n",
2941 PL_colors[4], PL_colors[5]));
2943 } else if (!multiline) {
2944 /* non multiline match, so compare with the "\n" at the
2945 * end of the string */
2946 if (memEQ(checkpos, little, len)) {
2950 PerlIO_printf(Perl_debug_log,
2951 "%sString does not contain required trailing substring, cannot match.%s\n",
2952 PL_colors[4], PL_colors[5]));
2956 /* multiline match, so we have to search for a place
2957 * where the full string is located */
2963 last = rninstr(s, strend, little, little + len);
2965 last = strend; /* matching "$" */
2968 /* at one point this block contained a comment which was
2969 * probably incorrect, which said that this was a "should not
2970 * happen" case. Even if it was true when it was written I am
2971 * pretty sure it is not anymore, so I have removed the comment
2972 * and replaced it with this one. Yves */
2974 PerlIO_printf(Perl_debug_log,
2975 "String does not contain required substring, cannot match.\n"
2979 dontbother = strend - last + prog->float_min_offset;
2981 if (minlen && (dontbother < minlen))
2982 dontbother = minlen - 1;
2983 strend -= dontbother; /* this one's always in bytes! */
2984 /* We don't know much -- general case. */
2987 if (regtry(reginfo, &s))
2996 if (regtry(reginfo, &s))
2998 } while (s++ < strend);
3006 /* s/// doesn't like it if $& is earlier than where we asked it to
3007 * start searching (which can happen on something like /.\G/) */
3008 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3009 && (prog->offs[0].start < stringarg - strbeg))
3011 /* this should only be possible under \G */
3012 assert(prog->intflags & PREGf_GPOS_SEEN);
3013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3014 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3020 PerlIO_printf(Perl_debug_log,
3021 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3028 /* clean up; this will trigger destructors that will free all slabs
3029 * above the current one, and cleanup the regmatch_info_aux
3030 * and regmatch_info_aux_eval sructs */
3032 LEAVE_SCOPE(oldsave);
3034 if (RXp_PAREN_NAMES(prog))
3035 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3037 RX_MATCH_UTF8_set(rx, utf8_target);
3039 /* make sure $`, $&, $', and $digit will work later */
3040 if ( !(flags & REXEC_NOT_FIRST) )
3041 S_reg_set_capture_string(aTHX_ rx,
3042 strbeg, reginfo->strend,
3043 sv, flags, utf8_target);
3048 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3049 PL_colors[4], PL_colors[5]));
3051 /* clean up; this will trigger destructors that will free all slabs
3052 * above the current one, and cleanup the regmatch_info_aux
3053 * and regmatch_info_aux_eval sructs */
3055 LEAVE_SCOPE(oldsave);
3058 /* we failed :-( roll it back */
3059 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3060 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3065 Safefree(prog->offs);
3072 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3073 * Do inc before dec, in case old and new rex are the same */
3074 #define SET_reg_curpm(Re2) \
3075 if (reginfo->info_aux_eval) { \
3076 (void)ReREFCNT_inc(Re2); \
3077 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3078 PM_SETRE((PL_reg_curpm), (Re2)); \
3083 - regtry - try match at specific point
3085 STATIC I32 /* 0 failure, 1 success */
3086 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3090 REGEXP *const rx = reginfo->prog;
3091 regexp *const prog = ReANY(rx);
3093 RXi_GET_DECL(prog,progi);
3094 GET_RE_DEBUG_FLAGS_DECL;
3096 PERL_ARGS_ASSERT_REGTRY;
3098 reginfo->cutpoint=NULL;
3100 prog->offs[0].start = *startposp - reginfo->strbeg;
3101 prog->lastparen = 0;
3102 prog->lastcloseparen = 0;
3104 /* XXXX What this code is doing here?!!! There should be no need
3105 to do this again and again, prog->lastparen should take care of
3108 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3109 * Actually, the code in regcppop() (which Ilya may be meaning by
3110 * prog->lastparen), is not needed at all by the test suite
3111 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3112 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3113 * Meanwhile, this code *is* needed for the
3114 * above-mentioned test suite tests to succeed. The common theme
3115 * on those tests seems to be returning null fields from matches.
3116 * --jhi updated by dapm */
3118 if (prog->nparens) {
3119 regexp_paren_pair *pp = prog->offs;
3121 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3129 result = regmatch(reginfo, *startposp, progi->program + 1);
3131 prog->offs[0].end = result;
3134 if (reginfo->cutpoint)
3135 *startposp= reginfo->cutpoint;
3136 REGCP_UNWIND(lastcp);
3141 #define sayYES goto yes
3142 #define sayNO goto no
3143 #define sayNO_SILENT goto no_silent
3145 /* we dont use STMT_START/END here because it leads to
3146 "unreachable code" warnings, which are bogus, but distracting. */
3147 #define CACHEsayNO \
3148 if (ST.cache_mask) \
3149 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3152 /* this is used to determine how far from the left messages like
3153 'failed...' are printed. It should be set such that messages
3154 are inline with the regop output that created them.
3156 #define REPORT_CODE_OFF 32
3159 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3160 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3161 #define CHRTEST_NOT_A_CP_1 -999
3162 #define CHRTEST_NOT_A_CP_2 -998
3164 /* grab a new slab and return the first slot in it */
3166 STATIC regmatch_state *
3169 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3172 regmatch_slab *s = PL_regmatch_slab->next;
3174 Newx(s, 1, regmatch_slab);
3175 s->prev = PL_regmatch_slab;
3177 PL_regmatch_slab->next = s;
3179 PL_regmatch_slab = s;
3180 return SLAB_FIRST(s);
3184 /* push a new state then goto it */
3186 #define PUSH_STATE_GOTO(state, node, input) \
3187 pushinput = input; \
3189 st->resume_state = state; \
3192 /* push a new state with success backtracking, then goto it */
3194 #define PUSH_YES_STATE_GOTO(state, node, input) \
3195 pushinput = input; \
3197 st->resume_state = state; \
3198 goto push_yes_state;
3205 regmatch() - main matching routine
3207 This is basically one big switch statement in a loop. We execute an op,
3208 set 'next' to point the next op, and continue. If we come to a point which
3209 we may need to backtrack to on failure such as (A|B|C), we push a
3210 backtrack state onto the backtrack stack. On failure, we pop the top
3211 state, and re-enter the loop at the state indicated. If there are no more
3212 states to pop, we return failure.
3214 Sometimes we also need to backtrack on success; for example /A+/, where
3215 after successfully matching one A, we need to go back and try to
3216 match another one; similarly for lookahead assertions: if the assertion
3217 completes successfully, we backtrack to the state just before the assertion
3218 and then carry on. In these cases, the pushed state is marked as
3219 'backtrack on success too'. This marking is in fact done by a chain of
3220 pointers, each pointing to the previous 'yes' state. On success, we pop to
3221 the nearest yes state, discarding any intermediate failure-only states.
3222 Sometimes a yes state is pushed just to force some cleanup code to be
3223 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3224 it to free the inner regex.
3226 Note that failure backtracking rewinds the cursor position, while
3227 success backtracking leaves it alone.
3229 A pattern is complete when the END op is executed, while a subpattern
3230 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3231 ops trigger the "pop to last yes state if any, otherwise return true"
3234 A common convention in this function is to use A and B to refer to the two
3235 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3236 the subpattern to be matched possibly multiple times, while B is the entire
3237 rest of the pattern. Variable and state names reflect this convention.
3239 The states in the main switch are the union of ops and failure/success of
3240 substates associated with with that op. For example, IFMATCH is the op
3241 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3242 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3243 successfully matched A and IFMATCH_A_fail is a state saying that we have
3244 just failed to match A. Resume states always come in pairs. The backtrack
3245 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3246 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3247 on success or failure.
3249 The struct that holds a backtracking state is actually a big union, with
3250 one variant for each major type of op. The variable st points to the
3251 top-most backtrack struct. To make the code clearer, within each
3252 block of code we #define ST to alias the relevant union.
3254 Here's a concrete example of a (vastly oversimplified) IFMATCH
3260 #define ST st->u.ifmatch
3262 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3263 ST.foo = ...; // some state we wish to save
3265 // push a yes backtrack state with a resume value of
3266 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3268 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3271 case IFMATCH_A: // we have successfully executed A; now continue with B
3273 bar = ST.foo; // do something with the preserved value
3276 case IFMATCH_A_fail: // A failed, so the assertion failed
3277 ...; // do some housekeeping, then ...
3278 sayNO; // propagate the failure
3285 For any old-timers reading this who are familiar with the old recursive
3286 approach, the code above is equivalent to:
3288 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3297 ...; // do some housekeeping, then ...
3298 sayNO; // propagate the failure
3301 The topmost backtrack state, pointed to by st, is usually free. If you
3302 want to claim it, populate any ST.foo fields in it with values you wish to
3303 save, then do one of
3305 PUSH_STATE_GOTO(resume_state, node, newinput);
3306 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3308 which sets that backtrack state's resume value to 'resume_state', pushes a
3309 new free entry to the top of the backtrack stack, then goes to 'node'.
3310 On backtracking, the free slot is popped, and the saved state becomes the
3311 new free state. An ST.foo field in this new top state can be temporarily
3312 accessed to retrieve values, but once the main loop is re-entered, it
3313 becomes available for reuse.
3315 Note that the depth of the backtrack stack constantly increases during the
3316 left-to-right execution of the pattern, rather than going up and down with
3317 the pattern nesting. For example the stack is at its maximum at Z at the
3318 end of the pattern, rather than at X in the following:
3320 /(((X)+)+)+....(Y)+....Z/
3322 The only exceptions to this are lookahead/behind assertions and the cut,
3323 (?>A), which pop all the backtrack states associated with A before
3326 Backtrack state structs are allocated in slabs of about 4K in size.
3327 PL_regmatch_state and st always point to the currently active state,
3328 and PL_regmatch_slab points to the slab currently containing
3329 PL_regmatch_state. The first time regmatch() is called, the first slab is
3330 allocated, and is never freed until interpreter destruction. When the slab
3331 is full, a new one is allocated and chained to the end. At exit from
3332 regmatch(), slabs allocated since entry are freed.
3337 #define DEBUG_STATE_pp(pp) \
3339 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3340 PerlIO_printf(Perl_debug_log, \
3341 " %*s"pp" %s%s%s%s%s\n", \
3343 PL_reg_name[st->resume_state], \
3344 ((st==yes_state||st==mark_state) ? "[" : ""), \
3345 ((st==yes_state) ? "Y" : ""), \
3346 ((st==mark_state) ? "M" : ""), \
3347 ((st==yes_state||st==mark_state) ? "]" : "") \
3352 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3357 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3358 const char *start, const char *end, const char *blurb)
3360 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3362 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3367 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3368 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3370 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3371 start, end - start, 60);
3373 PerlIO_printf(Perl_debug_log,
3374 "%s%s REx%s %s against %s\n",
3375 PL_colors[4], blurb, PL_colors[5], s0, s1);
3377 if (utf8_target||utf8_pat)
3378 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3379 utf8_pat ? "pattern" : "",
3380 utf8_pat && utf8_target ? " and " : "",
3381 utf8_target ? "string" : ""
3387 S_dump_exec_pos(pTHX_ const char *locinput,
3388 const regnode *scan,
3389 const char *loc_regeol,
3390 const char *loc_bostr,
3391 const char *loc_reg_starttry,
3392 const bool utf8_target)
3394 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3395 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3396 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3397 /* The part of the string before starttry has one color
3398 (pref0_len chars), between starttry and current
3399 position another one (pref_len - pref0_len chars),
3400 after the current position the third one.
3401 We assume that pref0_len <= pref_len, otherwise we
3402 decrease pref0_len. */
3403 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3404 ? (5 + taill) - l : locinput - loc_bostr;
3407 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3409 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3411 pref0_len = pref_len - (locinput - loc_reg_starttry);
3412 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3413 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3414 ? (5 + taill) - pref_len : loc_regeol - locinput);
3415 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3419 if (pref0_len > pref_len)
3420 pref0_len = pref_len;
3422 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3424 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3425 (locinput - pref_len),pref0_len, 60, 4, 5);
3427 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3428 (locinput - pref_len + pref0_len),
3429 pref_len - pref0_len, 60, 2, 3);
3431 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3432 locinput, loc_regeol - locinput, 10, 0, 1);
3434 const STRLEN tlen=len0+len1+len2;
3435 PerlIO_printf(Perl_debug_log,
3436 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3437 (IV)(locinput - loc_bostr),
3440 (docolor ? "" : "> <"),
3442 (int)(tlen > 19 ? 0 : 19 - tlen),
3449 /* reg_check_named_buff_matched()
3450 * Checks to see if a named buffer has matched. The data array of
3451 * buffer numbers corresponding to the buffer is expected to reside
3452 * in the regexp->data->data array in the slot stored in the ARG() of
3453 * node involved. Note that this routine doesn't actually care about the
3454 * name, that information is not preserved from compilation to execution.
3455 * Returns the index of the leftmost defined buffer with the given name
3456 * or 0 if non of the buffers matched.
3459 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3462 RXi_GET_DECL(rex,rexi);
3463 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3464 I32 *nums=(I32*)SvPVX(sv_dat);
3466 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3468 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3469 if ((I32)rex->lastparen >= nums[n] &&
3470 rex->offs[nums[n]].end != -1)
3480 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3481 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3483 /* This function determines if there are one or two characters that match
3484 * the first character of the passed-in EXACTish node <text_node>, and if
3485 * so, returns them in the passed-in pointers.
3487 * If it determines that no possible character in the target string can
3488 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3489 * the first character in <text_node> requires UTF-8 to represent, and the
3490 * target string isn't in UTF-8.)
3492 * If there are more than two characters that could match the beginning of
3493 * <text_node>, or if more context is required to determine a match or not,
3494 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3496 * The motiviation behind this function is to allow the caller to set up
3497 * tight loops for matching. If <text_node> is of type EXACT, there is
3498 * only one possible character that can match its first character, and so
3499 * the situation is quite simple. But things get much more complicated if
3500 * folding is involved. It may be that the first character of an EXACTFish
3501 * node doesn't participate in any possible fold, e.g., punctuation, so it
3502 * can be matched only by itself. The vast majority of characters that are
3503 * in folds match just two things, their lower and upper-case equivalents.
3504 * But not all are like that; some have multiple possible matches, or match
3505 * sequences of more than one character. This function sorts all that out.
3507 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3508 * loop of trying to match A*, we know we can't exit where the thing
3509 * following it isn't a B. And something can't be a B unless it is the
3510 * beginning of B. By putting a quick test for that beginning in a tight
3511 * loop, we can rule out things that can't possibly be B without having to
3512 * break out of the loop, thus avoiding work. Similarly, if A is a single
3513 * character, we can make a tight loop matching A*, using the outputs of
3516 * If the target string to match isn't in UTF-8, and there aren't
3517 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3518 * the one or two possible octets (which are characters in this situation)
3519 * that can match. In all cases, if there is only one character that can
3520 * match, *<c1p> and *<c2p> will be identical.
3522 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3523 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3524 * can match the beginning of <text_node>. They should be declared with at
3525 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3526 * undefined what these contain.) If one or both of the buffers are
3527 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3528 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3529 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3530 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3531 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3533 const bool utf8_target = reginfo->is_utf8_target;
3535 UV c1 = CHRTEST_NOT_A_CP_1;
3536 UV c2 = CHRTEST_NOT_A_CP_2;
3537 bool use_chrtest_void = FALSE;
3538 const bool is_utf8_pat = reginfo->is_utf8_pat;
3540 /* Used when we have both utf8 input and utf8 output, to avoid converting
3541 * to/from code points */
3542 bool utf8_has_been_setup = FALSE;
3546 U8 *pat = (U8*)STRING(text_node);
3547 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3549 if (OP(text_node) == EXACT) {
3551 /* In an exact node, only one thing can be matched, that first
3552 * character. If both the pat and the target are UTF-8, we can just
3553 * copy the input to the output, avoiding finding the code point of
3558 else if (utf8_target) {
3559 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3560 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3561 utf8_has_been_setup = TRUE;
3564 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3567 else { /* an EXACTFish node */
3568 U8 *pat_end = pat + STR_LEN(text_node);
3570 /* An EXACTFL node has at least some characters unfolded, because what
3571 * they match is not known until now. So, now is the time to fold
3572 * the first few of them, as many as are needed to determine 'c1' and
3573 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3574 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3575 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3576 * need to fold as many characters as a single character can fold to,
3577 * so that later we can check if the first ones are such a multi-char
3578 * fold. But, in such a pattern only locale-problematic characters
3579 * aren't folded, so we can skip this completely if the first character
3580 * in the node isn't one of the tricky ones */
3581 if (OP(text_node) == EXACTFL) {
3583 if (! is_utf8_pat) {
3584 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3586 folded[0] = folded[1] = 's';
3588 pat_end = folded + 2;
3591 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3596 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3598 *(d++) = (U8) toFOLD_LC(*s);
3603 _to_utf8_fold_flags(s,
3606 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3617 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8(pat))
3618 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1(pat)))
3620 /* Multi-character folds require more context to sort out. Also
3621 * PL_utf8_foldclosures used below doesn't handle them, so have to
3622 * be handled outside this routine */
3623 use_chrtest_void = TRUE;
3625 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3626 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3628 /* Load the folds hash, if not already done */
3630 if (! PL_utf8_foldclosures) {
3631 if (! PL_utf8_tofold) {
3632 U8 dummy[UTF8_MAXBYTES_CASE+1];
3634 /* Force loading this by folding an above-Latin1 char */
3635 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3636 assert(PL_utf8_tofold); /* Verify that worked */
3638 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3641 /* The fold closures data structure is a hash with the keys
3642 * being the UTF-8 of every character that is folded to, like
3643 * 'k', and the values each an array of all code points that
3644 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
3645 * Multi-character folds are not included */
3646 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3651 /* Not found in the hash, therefore there are no folds
3652 * containing it, so there is only a single character that
3656 else { /* Does participate in folds */
3657 AV* list = (AV*) *listp;
3658 if (av_tindex(list) != 1) {
3660 /* If there aren't exactly two folds to this, it is
3661 * outside the scope of this function */
3662 use_chrtest_void = TRUE;
3664 else { /* There are two. Get them */
3665 SV** c_p = av_fetch(list, 0, FALSE);
3667 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3671 c_p = av_fetch(list, 1, FALSE);
3673 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3677 /* Folds that cross the 255/256 boundary are forbidden
3678 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3679 * one is ASCIII. Since the pattern character is above
3680 * 256, and its only other match is below 256, the only
3681 * legal match will be to itself. We have thrown away
3682 * the original, so have to compute which is the one
3684 if ((c1 < 256) != (c2 < 256)) {
3685 if ((OP(text_node) == EXACTFL
3686 && ! IN_UTF8_CTYPE_LOCALE)
3687 || ((OP(text_node) == EXACTFA
3688 || OP(text_node) == EXACTFA_NO_TRIE)
3689 && (isASCII(c1) || isASCII(c2))))
3702 else /* Here, c1 is < 255 */
3704 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3705 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3706 && ((OP(text_node) != EXACTFA
3707 && OP(text_node) != EXACTFA_NO_TRIE)
3710 /* Here, there could be something above Latin1 in the target
3711 * which folds to this character in the pattern. All such
3712 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3713 * than two characters involved in their folds, so are outside
3714 * the scope of this function */
3715 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3716 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3719 use_chrtest_void = TRUE;
3722 else { /* Here nothing above Latin1 can fold to the pattern
3724 switch (OP(text_node)) {
3726 case EXACTFL: /* /l rules */
3727 c2 = PL_fold_locale[c1];
3730 case EXACTF: /* This node only generated for non-utf8
3732 assert(! is_utf8_pat);
3733 if (! utf8_target) { /* /d rules */
3738 /* /u rules for all these. This happens to work for
3739 * EXACTFA as nothing in Latin1 folds to ASCII */
3740 case EXACTFA_NO_TRIE: /* This node only generated for
3741 non-utf8 patterns */
3742 assert(! is_utf8_pat);
3747 c2 = PL_fold_latin1[c1];
3751 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3752 assert(0); /* NOTREACHED */
3758 /* Here have figured things out. Set up the returns */
3759 if (use_chrtest_void) {
3760 *c2p = *c1p = CHRTEST_VOID;
3762 else if (utf8_target) {
3763 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3764 uvchr_to_utf8(c1_utf8, c1);
3765 uvchr_to_utf8(c2_utf8, c2);
3768 /* Invariants are stored in both the utf8 and byte outputs; Use
3769 * negative numbers otherwise for the byte ones. Make sure that the
3770 * byte ones are the same iff the utf8 ones are the same */
3771 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3772 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3775 ? CHRTEST_NOT_A_CP_1
3776 : CHRTEST_NOT_A_CP_2;
3778 else if (c1 > 255) {
3779 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3784 *c1p = *c2p = c2; /* c2 is the only representable value */
3786 else { /* c1 is representable; see about c2 */
3788 *c2p = (c2 < 256) ? c2 : c1;
3794 /* returns -1 on failure, $+[0] on success */
3796 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3798 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3802 const bool utf8_target = reginfo->is_utf8_target;
3803 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3804 REGEXP *rex_sv = reginfo->prog;
3805 regexp *rex = ReANY(rex_sv);
3806 RXi_GET_DECL(rex,rexi);
3807 /* the current state. This is a cached copy of PL_regmatch_state */
3809 /* cache heavy used fields of st in registers */
3812 U32 n = 0; /* general value; init to avoid compiler warning */
3813 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
3814 char *locinput = startpos;
3815 char *pushinput; /* where to continue after a PUSH */
3816 I32 nextchr; /* is always set to UCHARAT(locinput) */
3818 bool result = 0; /* return value of S_regmatch */
3819 int depth = 0; /* depth of backtrack stack */
3820 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3821 const U32 max_nochange_depth =
3822 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3823 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3824 regmatch_state *yes_state = NULL; /* state to pop to on success of
3826 /* mark_state piggy backs on the yes_state logic so that when we unwind
3827 the stack on success we can update the mark_state as we go */
3828 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3829 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3830 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3832 bool no_final = 0; /* prevent failure from backtracking? */
3833 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3834 char *startpoint = locinput;
3835 SV *popmark = NULL; /* are we looking for a mark? */
3836 SV *sv_commit = NULL; /* last mark name seen in failure */
3837 SV *sv_yes_mark = NULL; /* last mark name we have seen
3838 during a successful match */
3839 U32 lastopen = 0; /* last open we saw */
3840 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3841 SV* const oreplsv = GvSVn(PL_replgv);
3842 /* these three flags are set by various ops to signal information to
3843 * the very next op. They have a useful lifetime of exactly one loop
3844 * iteration, and are not preserved or restored by state pushes/pops
3846 bool sw = 0; /* the condition value in (?(cond)a|b) */
3847 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3848 int logical = 0; /* the following EVAL is:
3852 or the following IFMATCH/UNLESSM is:
3853 false: plain (?=foo)
3854 true: used as a condition: (?(?=foo))
3856 PAD* last_pad = NULL;
3858 I32 gimme = G_SCALAR;
3859 CV *caller_cv = NULL; /* who called us */
3860 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3861 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3862 U32 maxopenparen = 0; /* max '(' index seen so far */
3863 int to_complement; /* Invert the result? */
3864 _char_class_number classnum;
3865 bool is_utf8_pat = reginfo->is_utf8_pat;
3868 GET_RE_DEBUG_FLAGS_DECL;
3871 /* protect against undef(*^R) */
3872 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3874 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3875 multicall_oldcatch = 0;
3876 multicall_cv = NULL;
3878 PERL_UNUSED_VAR(multicall_cop);
3879 PERL_UNUSED_VAR(newsp);
3882 PERL_ARGS_ASSERT_REGMATCH;
3884 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3885 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3888 st = PL_regmatch_state;
3890 /* Note that nextchr is a byte even in UTF */
3893 while (scan != NULL) {
3896 SV * const prop = sv_newmortal();
3897 regnode *rnext=regnext(scan);
3898 DUMP_EXEC_POS( locinput, scan, utf8_target );
3899 regprop(rex, prop, scan, reginfo);
3901 PerlIO_printf(Perl_debug_log,
3902 "%3"IVdf":%*s%s(%"IVdf")\n",
3903 (IV)(scan - rexi->program), depth*2, "",
3905 (PL_regkind[OP(scan)] == END || !rnext) ?
3906 0 : (IV)(rnext - rexi->program));
3909 next = scan + NEXT_OFF(scan);
3912 state_num = OP(scan);
3918 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3920 switch (state_num) {
3921 case BOL: /* /^../ */
3922 if (locinput == reginfo->strbeg)
3926 case MBOL: /* /^../m */
3927 if (locinput == reginfo->strbeg ||
3928 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3934 case SBOL: /* /^../s */
3935 if (locinput == reginfo->strbeg)
3940 if (locinput == reginfo->ganch)
3944 case KEEPS: /* \K */
3945 /* update the startpoint */
3946 st->u.keeper.val = rex->offs[0].start;
3947 rex->offs[0].start = locinput - reginfo->strbeg;
3948 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3949 assert(0); /*NOTREACHED*/
3950 case KEEPS_next_fail:
3951 /* rollback the start point change */
3952 rex->offs[0].start = st->u.keeper.val;
3954 assert(0); /*NOTREACHED*/
3956 case MEOL: /* /..$/m */
3957 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3961 case EOL: /* /..$/ */
3963 case SEOL: /* /..$/s */
3964 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3966 if (reginfo->strend - locinput > 1)
3971 if (!NEXTCHR_IS_EOS)
3975 case SANY: /* /./s */
3978 goto increment_locinput;
3986 case REG_ANY: /* /./ */
3987 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3989 goto increment_locinput;
3993 #define ST st->u.trie
3994 case TRIEC: /* (ab|cd) with known charclass */
3995 /* In this case the charclass data is available inline so
3996 we can fail fast without a lot of extra overhead.
3998 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4000 PerlIO_printf(Perl_debug_log,
4001 "%*s %sfailed to match trie start class...%s\n",
4002 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4005 assert(0); /* NOTREACHED */
4008 case TRIE: /* (ab|cd) */
4009 /* the basic plan of execution of the trie is:
4010 * At the beginning, run though all the states, and
4011 * find the longest-matching word. Also remember the position
4012 * of the shortest matching word. For example, this pattern:
4015 * when matched against the string "abcde", will generate
4016 * accept states for all words except 3, with the longest
4017 * matching word being 4, and the shortest being 2 (with
4018 * the position being after char 1 of the string).
4020 * Then for each matching word, in word order (i.e. 1,2,4,5),
4021 * we run the remainder of the pattern; on each try setting
4022 * the current position to the character following the word,
4023 * returning to try the next word on failure.
4025 * We avoid having to build a list of words at runtime by
4026 * using a compile-time structure, wordinfo[].prev, which
4027 * gives, for each word, the previous accepting word (if any).
4028 * In the case above it would contain the mappings 1->2, 2->0,
4029 * 3->0, 4->5, 5->1. We can use this table to generate, from
4030 * the longest word (4 above), a list of all words, by
4031 * following the list of prev pointers; this gives us the
4032 * unordered list 4,5,1,2. Then given the current word we have
4033 * just tried, we can go through the list and find the
4034 * next-biggest word to try (so if we just failed on word 2,
4035 * the next in the list is 4).
4037 * Since at runtime we don't record the matching position in
4038 * the string for each word, we have to work that out for
4039 * each word we're about to process. The wordinfo table holds
4040 * the character length of each word; given that we recorded
4041 * at the start: the position of the shortest word and its
4042 * length in chars, we just need to move the pointer the
4043 * difference between the two char lengths. Depending on
4044 * Unicode status and folding, that's cheap or expensive.
4046 * This algorithm is optimised for the case where are only a
4047 * small number of accept states, i.e. 0,1, or maybe 2.
4048 * With lots of accepts states, and having to try all of them,
4049 * it becomes quadratic on number of accept states to find all
4054 /* what type of TRIE am I? (utf8 makes this contextual) */
4055 DECL_TRIE_TYPE(scan);
4057 /* what trie are we using right now */
4058 reg_trie_data * const trie
4059 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4060 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4061 U32 state = trie->startstate;
4064 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4066 if (trie->states[ state ].wordnum) {
4068 PerlIO_printf(Perl_debug_log,
4069 "%*s %smatched empty string...%s\n",
4070 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4076 PerlIO_printf(Perl_debug_log,
4077 "%*s %sfailed to match trie start class...%s\n",
4078 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4085 U8 *uc = ( U8* )locinput;
4089 U8 *uscan = (U8*)NULL;
4090 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4091 U32 charcount = 0; /* how many input chars we have matched */
4092 U32 accepted = 0; /* have we seen any accepting states? */
4094 ST.jump = trie->jump;
4097 ST.longfold = FALSE; /* char longer if folded => it's harder */
4100 /* fully traverse the TRIE; note the position of the
4101 shortest accept state and the wordnum of the longest
4104 while ( state && uc <= (U8*)(reginfo->strend) ) {
4105 U32 base = trie->states[ state ].trans.base;
4109 wordnum = trie->states[ state ].wordnum;
4111 if (wordnum) { /* it's an accept state */
4114 /* record first match position */
4116 ST.firstpos = (U8*)locinput;
4121 ST.firstchars = charcount;
4124 if (!ST.nextword || wordnum < ST.nextword)
4125 ST.nextword = wordnum;
4126 ST.topword = wordnum;
4129 DEBUG_TRIE_EXECUTE_r({
4130 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4131 PerlIO_printf( Perl_debug_log,
4132 "%*s %sState: %4"UVxf" Accepted: %c ",
4133 2+depth * 2, "", PL_colors[4],
4134 (UV)state, (accepted ? 'Y' : 'N'));
4137 /* read a char and goto next state */
4138 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4140 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4141 uscan, len, uvc, charid, foldlen,
4148 base + charid - 1 - trie->uniquecharcount)) >= 0)
4150 && ((U32)offset < trie->lasttrans)
4151 && trie->trans[offset].check == state)
4153 state = trie->trans[offset].next;
4164 DEBUG_TRIE_EXECUTE_r(
4165 PerlIO_printf( Perl_debug_log,
4166 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4167 charid, uvc, (UV)state, PL_colors[5] );
4173 /* calculate total number of accept states */
4178 w = trie->wordinfo[w].prev;
4181 ST.accepted = accepted;
4185 PerlIO_printf( Perl_debug_log,
4186 "%*s %sgot %"IVdf" possible matches%s\n",
4187 REPORT_CODE_OFF + depth * 2, "",
4188 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4190 goto trie_first_try; /* jump into the fail handler */
4192 assert(0); /* NOTREACHED */
4194 case TRIE_next_fail: /* we failed - try next alternative */
4198 REGCP_UNWIND(ST.cp);
4199 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4201 if (!--ST.accepted) {
4203 PerlIO_printf( Perl_debug_log,
4204 "%*s %sTRIE failed...%s\n",
4205 REPORT_CODE_OFF+depth*2, "",
4212 /* Find next-highest word to process. Note that this code
4213 * is O(N^2) per trie run (O(N) per branch), so keep tight */
4216 U16 const nextword = ST.nextword;
4217 reg_trie_wordinfo * const wordinfo
4218 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4219 for (word=ST.topword; word; word=wordinfo[word].prev) {
4220 if (word > nextword && (!min || word < min))
4233 ST.lastparen = rex->lastparen;
4234 ST.lastcloseparen = rex->lastcloseparen;
4238 /* find start char of end of current word */
4240 U32 chars; /* how many chars to skip */
4241 reg_trie_data * const trie
4242 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4244 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4246 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4251 /* the hard option - fold each char in turn and find
4252 * its folded length (which may be different */
4253 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4261 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4269 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4274 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4290 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4291 ? ST.jump[ST.nextword]
4295 PerlIO_printf( Perl_debug_log,
4296 "%*s %sTRIE matched word #%d, continuing%s\n",
4297 REPORT_CODE_OFF+depth*2, "",
4304 if (ST.accepted > 1 || has_cutgroup) {
4305 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4306 assert(0); /* NOTREACHED */
4308 /* only one choice left - just continue */
4310 AV *const trie_words
4311 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4312 SV ** const tmp = av_fetch( trie_words,
4314 SV *sv= tmp ? sv_newmortal() : NULL;
4316 PerlIO_printf( Perl_debug_log,
4317 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4318 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4320 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4321 PL_colors[0], PL_colors[1],
4322 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4324 : "not compiled under -Dr",
4328 locinput = (char*)uc;
4329 continue; /* execute rest of RE */
4330 assert(0); /* NOTREACHED */
4334 case EXACT: { /* /abc/ */
4335 char *s = STRING(scan);
4337 if (utf8_target != is_utf8_pat) {
4338 /* The target and the pattern have differing utf8ness. */
4340 const char * const e = s + ln;
4343 /* The target is utf8, the pattern is not utf8.
4344 * Above-Latin1 code points can't match the pattern;
4345 * invariants match exactly, and the other Latin1 ones need
4346 * to be downgraded to a single byte in order to do the
4347 * comparison. (If we could be confident that the target
4348 * is not malformed, this could be refactored to have fewer
4349 * tests by just assuming that if the first bytes match, it
4350 * is an invariant, but there are tests in the test suite
4351 * dealing with (??{...}) which violate this) */
4353 if (l >= reginfo->strend
4354 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4358 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4365 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4375 /* The target is not utf8, the pattern is utf8. */
4377 if (l >= reginfo->strend
4378 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4382 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4389 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4401 /* The target and the pattern have the same utf8ness. */
4402 /* Inline the first character, for speed. */
4403 if (reginfo->strend - locinput < ln
4404 || UCHARAT(s) != nextchr
4405 || (ln > 1 && memNE(s, locinput, ln)))
4414 case EXACTFL: { /* /abc/il */
4416 const U8 * fold_array;
4418 U32 fold_utf8_flags;
4420 folder = foldEQ_locale;
4421 fold_array = PL_fold_locale;
4422 fold_utf8_flags = FOLDEQ_LOCALE;
4425 case EXACTFU_SS: /* /\x{df}/iu */
4426 case EXACTFU: /* /abc/iu */
4427 folder = foldEQ_latin1;
4428 fold_array = PL_fold_latin1;
4429 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4432 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4434 assert(! is_utf8_pat);
4436 case EXACTFA: /* /abc/iaa */
4437 folder = foldEQ_latin1;
4438 fold_array = PL_fold_latin1;
4439 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4442 case EXACTF: /* /abc/i This node only generated for
4443 non-utf8 patterns */
4444 assert(! is_utf8_pat);
4446 fold_array = PL_fold;
4447 fold_utf8_flags = 0;
4455 || state_num == EXACTFU_SS
4456 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4458 /* Either target or the pattern are utf8, or has the issue where
4459 * the fold lengths may differ. */
4460 const char * const l = locinput;
4461 char *e = reginfo->strend;
4463 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4464 l, &e, 0, utf8_target, fold_utf8_flags))
4472 /* Neither the target nor the pattern are utf8 */
4473 if (UCHARAT(s) != nextchr
4475 && UCHARAT(s) != fold_array[nextchr])
4479 if (reginfo->strend - locinput < ln)
4481 if (ln > 1 && ! folder(s, locinput, ln))
4487 /* XXX Could improve efficiency by separating these all out using a
4488 * macro or in-line function. At that point regcomp.c would no longer
4489 * have to set the FLAGS fields of these */
4490 case BOUNDL: /* /\b/l */
4491 case NBOUNDL: /* /\B/l */
4492 case BOUND: /* /\b/ */
4493 case BOUNDU: /* /\b/u */
4494 case BOUNDA: /* /\b/a */
4495 case NBOUND: /* /\B/ */
4496 case NBOUNDU: /* /\B/u */
4497 case NBOUNDA: /* /\B/a */
4498 /* was last char in word? */
4500 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4501 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4503 if (locinput == reginfo->strbeg)
4506 const U8 * const r =
4507 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4509 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4512 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4513 ln = isWORDCHAR_uni(ln);
4517 LOAD_UTF8_CHARCLASS_ALNUM();
4518 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4523 ln = isWORDCHAR_LC_uvchr(ln);
4524 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4529 /* Here the string isn't utf8, or is utf8 and only ascii
4530 * characters are to match \w. In the latter case looking at
4531 * the byte just prior to the current one may be just the final
4532 * byte of a multi-byte character. This is ok. There are two
4534 * 1) it is a single byte character, and then the test is doing
4535 * just what it's supposed to.
4536 * 2) it is a multi-byte character, in which case the final
4537 * byte is never mistakable for ASCII, and so the test
4538 * will say it is not a word character, which is the
4539 * correct answer. */
4540 ln = (locinput != reginfo->strbeg) ?
4541 UCHARAT(locinput - 1) : '\n';
4542 switch (FLAGS(scan)) {
4543 case REGEX_UNICODE_CHARSET:
4544 ln = isWORDCHAR_L1(ln);
4545 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4547 case REGEX_LOCALE_CHARSET:
4548 ln = isWORDCHAR_LC(ln);
4549 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4551 case REGEX_DEPENDS_CHARSET:
4552 ln = isWORDCHAR(ln);
4553 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4555 case REGEX_ASCII_RESTRICTED_CHARSET:
4556 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4557 ln = isWORDCHAR_A(ln);
4558 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4561 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4565 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4567 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4571 case ANYOF: /* /[abc]/ */
4575 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4578 locinput += UTF8SKIP(locinput);
4581 if (!REGINCLASS(rex, scan, (U8*)locinput))
4587 /* The argument (FLAGS) to all the POSIX node types is the class number
4590 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4594 case POSIXL: /* \w or [:punct:] etc. under /l */
4598 /* Use isFOO_lc() for characters within Latin1. (Note that
4599 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4600 * wouldn't be invariant) */
4601 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4602 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4606 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4607 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4608 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4609 *(locinput + 1))))))
4614 else { /* Here, must be an above Latin-1 code point */
4615 goto utf8_posix_not_eos;
4618 /* Here, must be utf8 */
4619 locinput += UTF8SKIP(locinput);
4622 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4626 case POSIXD: /* \w or [:punct:] etc. under /d */
4632 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4634 if (NEXTCHR_IS_EOS) {
4638 /* All UTF-8 variants match */
4639 if (! UTF8_IS_INVARIANT(nextchr)) {
4640 goto increment_locinput;
4646 case POSIXA: /* \w or [:punct:] etc. under /a */
4649 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4650 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4651 * character is a single byte */
4654 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4660 /* Here we are either not in utf8, or we matched a utf8-invariant,
4661 * so the next char is the next byte */
4665 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4669 case POSIXU: /* \w or [:punct:] etc. under /u */
4671 if (NEXTCHR_IS_EOS) {
4676 /* Use _generic_isCC() for characters within Latin1. (Note that
4677 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4678 * wouldn't be invariant) */
4679 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4680 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4687 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4688 if (! (to_complement
4689 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4697 else { /* Handle above Latin-1 code points */
4698 classnum = (_char_class_number) FLAGS(scan);
4699 if (classnum < _FIRST_NON_SWASH_CC) {
4701 /* Here, uses a swash to find such code points. Load if if
4702 * not done already */
4703 if (! PL_utf8_swash_ptrs[classnum]) {
4704 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4705 PL_utf8_swash_ptrs[classnum]
4706 = _core_swash_init("utf8",
4709 PL_XPosix_ptrs[classnum], &flags);
4711 if (! (to_complement
4712 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4713 (U8 *) locinput, TRUE))))
4718 else { /* Here, uses macros to find above Latin-1 code points */
4720 case _CC_ENUM_SPACE: /* XXX would require separate
4721 code if we revert the change
4722 of \v matching this */
4723 case _CC_ENUM_PSXSPC:
4724 if (! (to_complement
4725 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4730 case _CC_ENUM_BLANK:
4731 if (! (to_complement
4732 ^ cBOOL(is_HORIZWS_high(locinput))))
4737 case _CC_ENUM_XDIGIT:
4738 if (! (to_complement
4739 ^ cBOOL(is_XDIGIT_high(locinput))))
4744 case _CC_ENUM_VERTSPACE:
4745 if (! (to_complement
4746 ^ cBOOL(is_VERTWS_high(locinput))))
4751 default: /* The rest, e.g. [:cntrl:], can't match
4753 if (! to_complement) {
4759 locinput += UTF8SKIP(locinput);
4763 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4764 a Unicode extended Grapheme Cluster */
4765 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4766 extended Grapheme Cluster is:
4769 | Prepend* Begin Extend*
4772 Begin is: ( Special_Begin | ! Control )
4773 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4774 Extend is: ( Grapheme_Extend | Spacing_Mark )
4775 Control is: [ GCB_Control | CR | LF ]
4776 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4778 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4781 Begin is ( Regular_Begin + Special Begin )
4783 It turns out that 98.4% of all Unicode code points match
4784 Regular_Begin. Doing it this way eliminates a table match in
4785 the previous implementation for almost all Unicode code points.
4787 There is a subtlety with Prepend* which showed up in testing.
4788 Note that the Begin, and only the Begin is required in:
4789 | Prepend* Begin Extend*
4790 Also, Begin contains '! Control'. A Prepend must be a
4791 '! Control', which means it must also be a Begin. What it
4792 comes down to is that if we match Prepend* and then find no
4793 suitable Begin afterwards, that if we backtrack the last
4794 Prepend, that one will be a suitable Begin.
4799 if (! utf8_target) {
4801 /* Match either CR LF or '.', as all the other possibilities
4803 locinput++; /* Match the . or CR */
4804 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4806 && locinput < reginfo->strend
4807 && UCHARAT(locinput) == '\n')
4814 /* Utf8: See if is ( CR LF ); already know that locinput <
4815 * reginfo->strend, so locinput+1 is in bounds */
4816 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4817 && UCHARAT(locinput + 1) == '\n')
4824 /* In case have to backtrack to beginning, then match '.' */
4825 char *starting = locinput;
4827 /* In case have to backtrack the last prepend */
4828 char *previous_prepend = NULL;
4830 LOAD_UTF8_CHARCLASS_GCB();
4832 /* Match (prepend)* */
4833 while (locinput < reginfo->strend
4834 && (len = is_GCB_Prepend_utf8(locinput)))
4836 previous_prepend = locinput;
4840 /* As noted above, if we matched a prepend character, but
4841 * the next thing won't match, back off the last prepend we
4842 * matched, as it is guaranteed to match the begin */
4843 if (previous_prepend
4844 && (locinput >= reginfo->strend
4845 || (! swash_fetch(PL_utf8_X_regular_begin,
4846 (U8*)locinput, utf8_target)
4847 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4850 locinput = previous_prepend;
4853 /* Note that here we know reginfo->strend > locinput, as we
4854 * tested that upon input to this switch case, and if we
4855 * moved locinput forward, we tested the result just above
4856 * and it either passed, or we backed off so that it will
4858 if (swash_fetch(PL_utf8_X_regular_begin,
4859 (U8*)locinput, utf8_target)) {
4860 locinput += UTF8SKIP(locinput);
4862 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4864 /* Here did not match the required 'Begin' in the
4865 * second term. So just match the very first
4866 * character, the '.' of the final term of the regex */
4867 locinput = starting + UTF8SKIP(starting);
4871 /* Here is a special begin. It can be composed of
4872 * several individual characters. One possibility is
4874 if ((len = is_GCB_RI_utf8(locinput))) {
4876 while (locinput < reginfo->strend
4877 && (len = is_GCB_RI_utf8(locinput)))
4881 } else if ((len = is_GCB_T_utf8(locinput))) {
4882 /* Another possibility is T+ */
4884 while (locinput < reginfo->strend
4885 && (len = is_GCB_T_utf8(locinput)))
4891 /* Here, neither RI+ nor T+; must be some other
4892 * Hangul. That means it is one of the others: L,
4893 * LV, LVT or V, and matches:
4894 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4897 while (locinput < reginfo->strend
4898 && (len = is_GCB_L_utf8(locinput)))
4903 /* Here, have exhausted L*. If the next character
4904 * is not an LV, LVT nor V, it means we had to have
4905 * at least one L, so matches L+ in the original
4906 * equation, we have a complete hangul syllable.
4909 if (locinput < reginfo->strend
4910 && is_GCB_LV_LVT_V_utf8(locinput))
4912 /* Otherwise keep going. Must be LV, LVT or V.
4913 * See if LVT, by first ruling out V, then LV */
4914 if (! is_GCB_V_utf8(locinput)
4915 /* All but every TCount one is LV */
4916 && (valid_utf8_to_uvchr((U8 *) locinput,
4921 locinput += UTF8SKIP(locinput);
4924 /* Must be V or LV. Take it, then match
4926 locinput += UTF8SKIP(locinput);
4927 while (locinput < reginfo->strend
4928 && (len = is_GCB_V_utf8(locinput)))
4934 /* And any of LV, LVT, or V can be followed
4936 while (locinput < reginfo->strend
4937 && (len = is_GCB_T_utf8(locinput)))
4945 /* Match any extender */
4946 while (locinput < reginfo->strend
4947 && swash_fetch(PL_utf8_X_extend,
4948 (U8*)locinput, utf8_target))
4950 locinput += UTF8SKIP(locinput);
4954 if (locinput > reginfo->strend) sayNO;
4958 case NREFFL: /* /\g{name}/il */
4959 { /* The capture buffer cases. The ones beginning with N for the
4960 named buffers just convert to the equivalent numbered and
4961 pretend they were called as the corresponding numbered buffer
4963 /* don't initialize these in the declaration, it makes C++
4968 const U8 *fold_array;
4971 folder = foldEQ_locale;
4972 fold_array = PL_fold_locale;
4974 utf8_fold_flags = FOLDEQ_LOCALE;
4977 case NREFFA: /* /\g{name}/iaa */
4978 folder = foldEQ_latin1;
4979 fold_array = PL_fold_latin1;
4981 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4984 case NREFFU: /* /\g{name}/iu */
4985 folder = foldEQ_latin1;
4986 fold_array = PL_fold_latin1;
4988 utf8_fold_flags = 0;
4991 case NREFF: /* /\g{name}/i */
4993 fold_array = PL_fold;
4995 utf8_fold_flags = 0;
4998 case NREF: /* /\g{name}/ */
5002 utf8_fold_flags = 0;
5005 /* For the named back references, find the corresponding buffer
5007 n = reg_check_named_buff_matched(rex,scan);
5012 goto do_nref_ref_common;
5014 case REFFL: /* /\1/il */
5015 folder = foldEQ_locale;
5016 fold_array = PL_fold_locale;
5017 utf8_fold_flags = FOLDEQ_LOCALE;
5020 case REFFA: /* /\1/iaa */
5021 folder = foldEQ_latin1;
5022 fold_array = PL_fold_latin1;
5023 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5026 case REFFU: /* /\1/iu */
5027 folder = foldEQ_latin1;
5028 fold_array = PL_fold_latin1;
5029 utf8_fold_flags = 0;
5032 case REFF: /* /\1/i */
5034 fold_array = PL_fold;
5035 utf8_fold_flags = 0;
5038 case REF: /* /\1/ */
5041 utf8_fold_flags = 0;
5045 n = ARG(scan); /* which paren pair */
5048 ln = rex->offs[n].start;
5049 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5050 if (rex->lastparen < n || ln == -1)
5051 sayNO; /* Do not match unless seen CLOSEn. */
5052 if (ln == rex->offs[n].end)
5055 s = reginfo->strbeg + ln;
5056 if (type != REF /* REF can do byte comparison */
5057 && (utf8_target || type == REFFU || type == REFFL))
5059 char * limit = reginfo->strend;
5061 /* This call case insensitively compares the entire buffer
5062 * at s, with the current input starting at locinput, but
5063 * not going off the end given by reginfo->strend, and
5064 * returns in <limit> upon success, how much of the
5065 * current input was matched */
5066 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5067 locinput, &limit, 0, utf8_target, utf8_fold_flags))
5075 /* Not utf8: Inline the first character, for speed. */
5076 if (!NEXTCHR_IS_EOS &&
5077 UCHARAT(s) != nextchr &&
5079 UCHARAT(s) != fold_array[nextchr]))
5081 ln = rex->offs[n].end - ln;
5082 if (locinput + ln > reginfo->strend)
5084 if (ln > 1 && (type == REF
5085 ? memNE(s, locinput, ln)
5086 : ! folder(s, locinput, ln)))
5092 case NOTHING: /* null op; e.g. the 'nothing' following
5093 * the '*' in m{(a+|b)*}' */
5095 case TAIL: /* placeholder while compiling (A|B|C) */
5098 case BACK: /* ??? doesn't appear to be used ??? */
5102 #define ST st->u.eval
5107 regexp_internal *rei;
5108 regnode *startpoint;
5110 case GOSTART: /* (?R) */
5111 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
5112 if (cur_eval && cur_eval->locinput==locinput) {
5113 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5114 Perl_croak(aTHX_ "Infinite recursion in regex");
5115 if ( ++nochange_depth > max_nochange_depth )
5117 "Pattern subroutine nesting without pos change"
5118 " exceeded limit in regex");
5125 if (OP(scan)==GOSUB) {
5126 startpoint = scan + ARG2L(scan);
5127 ST.close_paren = ARG(scan);
5129 startpoint = rei->program+1;
5133 /* Save all the positions seen so far. */
5134 ST.cp = regcppush(rex, 0, maxopenparen);
5135 REGCP_SET(ST.lastcp);
5137 /* and then jump to the code we share with EVAL */
5138 goto eval_recurse_doit;
5140 assert(0); /* NOTREACHED */
5142 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
5143 if (cur_eval && cur_eval->locinput==locinput) {
5144 if ( ++nochange_depth > max_nochange_depth )
5145 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5150 /* execute the code in the {...} */
5154 OP * const oop = PL_op;
5155 COP * const ocurcop = PL_curcop;
5159 /* save *all* paren positions */
5160 regcppush(rex, 0, maxopenparen);
5161 REGCP_SET(runops_cp);
5164 caller_cv = find_runcv(NULL);
5168 if (rexi->data->what[n] == 'r') { /* code from an external qr */
5170 (REGEXP*)(rexi->data->data[n])
5173 nop = (OP*)rexi->data->data[n+1];
5175 else if (rexi->data->what[n] == 'l') { /* literal code */
5177 nop = (OP*)rexi->data->data[n];
5178 assert(CvDEPTH(newcv));
5181 /* literal with own CV */
5182 assert(rexi->data->what[n] == 'L');
5183 newcv = rex->qr_anoncv;
5184 nop = (OP*)rexi->data->data[n];
5187 /* normally if we're about to execute code from the same
5188 * CV that we used previously, we just use the existing
5189 * CX stack entry. However, its possible that in the
5190 * meantime we may have backtracked, popped from the save
5191 * stack, and undone the SAVECOMPPAD(s) associated with
5192 * PUSH_MULTICALL; in which case PL_comppad no longer
5193 * points to newcv's pad. */
5194 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5196 U8 flags = (CXp_SUB_RE |
5197 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5198 if (last_pushed_cv) {
5199 CHANGE_MULTICALL_FLAGS(newcv, flags);
5202 PUSH_MULTICALL_FLAGS(newcv, flags);
5204 last_pushed_cv = newcv;
5207 /* these assignments are just to silence compiler
5209 multicall_cop = NULL;
5212 last_pad = PL_comppad;
5214 /* the initial nextstate you would normally execute
5215 * at the start of an eval (which would cause error
5216 * messages to come from the eval), may be optimised
5217 * away from the execution path in the regex code blocks;
5218 * so manually set PL_curcop to it initially */
5220 OP *o = cUNOPx(nop)->op_first;
5221 assert(o->op_type == OP_NULL);
5222 if (o->op_targ == OP_SCOPE) {
5223 o = cUNOPo->op_first;
5226 assert(o->op_targ == OP_LEAVE);
5227 o = cUNOPo->op_first;
5228 assert(o->op_type == OP_ENTER);
5232 if (o->op_type != OP_STUB) {
5233 assert( o->op_type == OP_NEXTSTATE
5234 || o->op_type == OP_DBSTATE
5235 || (o->op_type == OP_NULL
5236 && ( o->op_targ == OP_NEXTSTATE
5237 || o->op_targ == OP_DBSTATE
5241 PL_curcop = (COP*)o;
5246 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5247 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5249 rex->offs[0].end = locinput - reginfo->strbeg;
5250 if (reginfo->info_aux_eval->pos_magic)
5251 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5252 reginfo->sv, reginfo->strbeg,
5253 locinput - reginfo->strbeg);
5256 SV *sv_mrk = get_sv("REGMARK", 1);
5257 sv_setsv(sv_mrk, sv_yes_mark);
5260 /* we don't use MULTICALL here as we want to call the
5261 * first op of the block of interest, rather than the
5262 * first op of the sub */
5263 before = (IV)(SP-PL_stack_base);
5265 CALLRUNOPS(aTHX); /* Scalar context. */
5267 if ((IV)(SP-PL_stack_base) == before)
5268 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
5274 /* before restoring everything, evaluate the returned
5275 * value, so that 'uninit' warnings don't use the wrong
5276 * PL_op or pad. Also need to process any magic vars
5277 * (e.g. $1) *before* parentheses are restored */
5282 if (logical == 0) /* (?{})/ */
5283 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5284 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5285 sw = cBOOL(SvTRUE(ret));
5288 else { /* /(??{}) */
5289 /* if its overloaded, let the regex compiler handle
5290 * it; otherwise extract regex, or stringify */
5291 if (SvGMAGICAL(ret))
5292 ret = sv_mortalcopy(ret);
5293 if (!SvAMAGIC(ret)) {
5297 if (SvTYPE(sv) == SVt_REGEXP)
5298 re_sv = (REGEXP*) sv;
5299 else if (SvSMAGICAL(ret)) {
5300 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5302 re_sv = (REGEXP *) mg->mg_obj;
5305 /* force any undef warnings here */
5306 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5307 ret = sv_mortalcopy(ret);
5308 (void) SvPV_force_nolen(ret);
5314 /* *** Note that at this point we don't restore
5315 * PL_comppad, (or pop the CxSUB) on the assumption it may
5316 * be used again soon. This is safe as long as nothing
5317 * in the regexp code uses the pad ! */
5319 PL_curcop = ocurcop;
5320 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5321 PL_curpm = PL_reg_curpm;
5327 /* only /(??{})/ from now on */
5330 /* extract RE object from returned value; compiling if
5334 re_sv = reg_temp_copy(NULL, re_sv);
5339 if (SvUTF8(ret) && IN_BYTES) {
5340 /* In use 'bytes': make a copy of the octet
5341 * sequence, but without the flag on */
5343 const char *const p = SvPV(ret, len);
5344 ret = newSVpvn_flags(p, len, SVs_TEMP);
5346 if (rex->intflags & PREGf_USE_RE_EVAL)
5347 pm_flags |= PMf_USE_RE_EVAL;
5349 /* if we got here, it should be an engine which
5350 * supports compiling code blocks and stuff */
5351 assert(rex->engine && rex->engine->op_comp);
5352 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5353 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5354 rex->engine, NULL, NULL,
5355 /* copy /msix etc to inner pattern */
5360 & (SVs_TEMP | SVs_GMG | SVf_ROK))
5361 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5362 /* This isn't a first class regexp. Instead, it's
5363 caching a regexp onto an existing, Perl visible
5365 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5371 RXp_MATCH_COPIED_off(re);
5372 re->subbeg = rex->subbeg;
5373 re->sublen = rex->sublen;
5374 re->suboffset = rex->suboffset;
5375 re->subcoffset = rex->subcoffset;
5377 re->lastcloseparen = 0;
5380 debug_start_match(re_sv, utf8_target, locinput,
5381 reginfo->strend, "Matching embedded");
5383 startpoint = rei->program + 1;
5384 ST.close_paren = 0; /* only used for GOSUB */
5385 /* Save all the seen positions so far. */
5386 ST.cp = regcppush(rex, 0, maxopenparen);
5387 REGCP_SET(ST.lastcp);
5388 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5390 /* run the pattern returned from (??{...}) */
5392 eval_recurse_doit: /* Share code with GOSUB below this line
5393 * At this point we expect the stack context to be
5394 * set up correctly */
5396 /* invalidate the S-L poscache. We're now executing a
5397 * different set of WHILEM ops (and their associated
5398 * indexes) against the same string, so the bits in the
5399 * cache are meaningless. Setting maxiter to zero forces
5400 * the cache to be invalidated and zeroed before reuse.
5401 * XXX This is too dramatic a measure. Ideally we should
5402 * save the old cache and restore when running the outer
5404 reginfo->poscache_maxiter = 0;
5406 /* the new regexp might have a different is_utf8_pat than we do */
5407 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5409 ST.prev_rex = rex_sv;
5410 ST.prev_curlyx = cur_curlyx;
5412 SET_reg_curpm(rex_sv);
5417 ST.prev_eval = cur_eval;
5419 /* now continue from first node in postoned RE */
5420 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5421 assert(0); /* NOTREACHED */
5424 case EVAL_AB: /* cleanup after a successful (??{A})B */
5425 /* note: this is called twice; first after popping B, then A */
5426 rex_sv = ST.prev_rex;
5427 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5428 SET_reg_curpm(rex_sv);
5429 rex = ReANY(rex_sv);
5430 rexi = RXi_GET(rex);
5432 /* preserve $^R across LEAVE's. See Bug 121070. */
5433 SV *save_sv= GvSV(PL_replgv);
5434 SvREFCNT_inc(save_sv);
5435 regcpblow(ST.cp); /* LEAVE in disguise */
5436 sv_setsv(GvSV(PL_replgv), save_sv);
5437 SvREFCNT_dec(save_sv);
5439 cur_eval = ST.prev_eval;
5440 cur_curlyx = ST.prev_curlyx;
5442 /* Invalidate cache. See "invalidate" comment above. */
5443 reginfo->poscache_maxiter = 0;
5444 if ( nochange_depth )
5449 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5450 /* note: this is called twice; first after popping B, then A */
5451 rex_sv = ST.prev_rex;
5452 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5453 SET_reg_curpm(rex_sv);
5454 rex = ReANY(rex_sv);
5455 rexi = RXi_GET(rex);
5457 REGCP_UNWIND(ST.lastcp);
5458 regcppop(rex, &maxopenparen);
5459 cur_eval = ST.prev_eval;
5460 cur_curlyx = ST.prev_curlyx;
5461 /* Invalidate cache. See "invalidate" comment above. */
5462 reginfo->poscache_maxiter = 0;
5463 if ( nochange_depth )
5469 n = ARG(scan); /* which paren pair */
5470 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5471 if (n > maxopenparen)
5473 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5474 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5478 (IV)rex->offs[n].start_tmp,
5484 /* XXX really need to log other places start/end are set too */
5485 #define CLOSE_CAPTURE \
5486 rex->offs[n].start = rex->offs[n].start_tmp; \
5487 rex->offs[n].end = locinput - reginfo->strbeg; \
5488 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5489 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5491 PTR2UV(rex->offs), \
5493 (IV)rex->offs[n].start, \
5494 (IV)rex->offs[n].end \
5498 n = ARG(scan); /* which paren pair */
5500 if (n > rex->lastparen)
5502 rex->lastcloseparen = n;
5503 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5508 case ACCEPT: /* (*ACCEPT) */
5512 cursor && OP(cursor)!=END;
5513 cursor=regnext(cursor))
5515 if ( OP(cursor)==CLOSE ){
5517 if ( n <= lastopen ) {
5519 if (n > rex->lastparen)
5521 rex->lastcloseparen = n;
5522 if ( n == ARG(scan) || (cur_eval &&
5523 cur_eval->u.eval.close_paren == n))
5532 case GROUPP: /* (?(1)) */
5533 n = ARG(scan); /* which paren pair */
5534 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5537 case NGROUPP: /* (?(<name>)) */
5538 /* reg_check_named_buff_matched returns 0 for no match */
5539 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5542 case INSUBP: /* (?(R)) */
5544 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5547 case DEFINEP: /* (?(DEFINE)) */
5551 case IFTHEN: /* (?(cond)A|B) */
5552 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5554 next = NEXTOPER(NEXTOPER(scan));
5556 next = scan + ARG(scan);
5557 if (OP(next) == IFTHEN) /* Fake one. */
5558 next = NEXTOPER(NEXTOPER(next));
5562 case LOGICAL: /* modifier for EVAL and IFMATCH */
5563 logical = scan->flags;
5566 /*******************************************************************
5568 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5569 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5570 STAR/PLUS/CURLY/CURLYN are used instead.)
5572 A*B is compiled as <CURLYX><A><WHILEM><B>
5574 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5575 state, which contains the current count, initialised to -1. It also sets
5576 cur_curlyx to point to this state, with any previous value saved in the
5579 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5580 since the pattern may possibly match zero times (i.e. it's a while {} loop
5581 rather than a do {} while loop).
5583 Each entry to WHILEM represents a successful match of A. The count in the
5584 CURLYX block is incremented, another WHILEM state is pushed, and execution
5585 passes to A or B depending on greediness and the current count.
5587 For example, if matching against the string a1a2a3b (where the aN are
5588 substrings that match /A/), then the match progresses as follows: (the
5589 pushed states are interspersed with the bits of strings matched so far):
5592 <CURLYX cnt=0><WHILEM>
5593 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5594 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5595 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5596 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5598 (Contrast this with something like CURLYM, which maintains only a single
5602 a1 <CURLYM cnt=1> a2
5603 a1 a2 <CURLYM cnt=2> a3
5604 a1 a2 a3 <CURLYM cnt=3> b
5607 Each WHILEM state block marks a point to backtrack to upon partial failure
5608 of A or B, and also contains some minor state data related to that
5609 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5610 overall state, such as the count, and pointers to the A and B ops.
5612 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5613 must always point to the *current* CURLYX block, the rules are:
5615 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5616 and set cur_curlyx to point the new block.
5618 When popping the CURLYX block after a successful or unsuccessful match,
5619 restore the previous cur_curlyx.
5621 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5622 to the outer one saved in the CURLYX block.
5624 When popping the WHILEM block after a successful or unsuccessful B match,
5625 restore the previous cur_curlyx.
5627 Here's an example for the pattern (AI* BI)*BO
5628 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5631 curlyx backtrack stack
5632 ------ ---------------
5634 CO <CO prev=NULL> <WO>
5635 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5636 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5637 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5639 At this point the pattern succeeds, and we work back down the stack to
5640 clean up, restoring as we go:
5642 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5643 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5644 CO <CO prev=NULL> <WO>
5647 *******************************************************************/
5649 #define ST st->u.curlyx
5651 case CURLYX: /* start of /A*B/ (for complex A) */
5653 /* No need to save/restore up to this paren */
5654 I32 parenfloor = scan->flags;
5656 assert(next); /* keep Coverity happy */
5657 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5660 /* XXXX Probably it is better to teach regpush to support
5661 parenfloor > maxopenparen ... */
5662 if (parenfloor > (I32)rex->lastparen)
5663 parenfloor = rex->lastparen; /* Pessimization... */
5665 ST.prev_curlyx= cur_curlyx;
5667 ST.cp = PL_savestack_ix;
5669 /* these fields contain the state of the current curly.
5670 * they are accessed by subsequent WHILEMs */
5671 ST.parenfloor = parenfloor;
5676 ST.count = -1; /* this will be updated by WHILEM */
5677 ST.lastloc = NULL; /* this will be updated by WHILEM */
5679 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5680 assert(0); /* NOTREACHED */
5683 case CURLYX_end: /* just finished matching all of A*B */
5684 cur_curlyx = ST.prev_curlyx;
5686 assert(0); /* NOTREACHED */
5688 case CURLYX_end_fail: /* just failed to match all of A*B */
5690 cur_curlyx = ST.prev_curlyx;
5692 assert(0); /* NOTREACHED */
5696 #define ST st->u.whilem
5698 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5700 /* see the discussion above about CURLYX/WHILEM */
5702 int min = ARG1(cur_curlyx->u.curlyx.me);
5703 int max = ARG2(cur_curlyx->u.curlyx.me);
5704 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5706 assert(cur_curlyx); /* keep Coverity happy */
5707 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5708 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5709 ST.cache_offset = 0;
5713 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5714 "%*s whilem: matched %ld out of %d..%d\n",
5715 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5718 /* First just match a string of min A's. */
5721 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5723 cur_curlyx->u.curlyx.lastloc = locinput;
5724 REGCP_SET(ST.lastcp);
5726 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5727 assert(0); /* NOTREACHED */
5730 /* If degenerate A matches "", assume A done. */
5732 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5733 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5734 "%*s whilem: empty match detected, trying continuation...\n",
5735 REPORT_CODE_OFF+depth*2, "")
5737 goto do_whilem_B_max;
5740 /* super-linear cache processing.
5742 * The idea here is that for certain types of CURLYX/WHILEM -
5743 * principally those whose upper bound is infinity (and
5744 * excluding regexes that have things like \1 and other very
5745 * non-regular expresssiony things), then if a pattern like
5746 * /....A*.../ fails and we backtrack to the WHILEM, then we
5747 * make a note that this particular WHILEM op was at string
5748 * position 47 (say) when the rest of pattern failed. Then, if
5749 * we ever find ourselves back at that WHILEM, and at string
5750 * position 47 again, we can just fail immediately rather than
5751 * running the rest of the pattern again.
5753 * This is very handy when patterns start to go
5754 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5755 * with a combinatorial explosion of backtracking.
5757 * The cache is implemented as a bit array, with one bit per
5758 * string byte position per WHILEM op (up to 16) - so its
5759 * between 0.25 and 2x the string size.
5761 * To avoid allocating a poscache buffer every time, we do an
5762 * initially countdown; only after we have executed a WHILEM
5763 * op (string-length x #WHILEMs) times do we allocate the
5766 * The top 4 bits of scan->flags byte say how many different
5767 * relevant CURLLYX/WHILEM op pairs there are, while the
5768 * bottom 4-bits is the identifying index number of this
5774 if (!reginfo->poscache_maxiter) {
5775 /* start the countdown: Postpone detection until we
5776 * know the match is not *that* much linear. */
5777 reginfo->poscache_maxiter
5778 = (reginfo->strend - reginfo->strbeg + 1)
5780 /* possible overflow for long strings and many CURLYX's */
5781 if (reginfo->poscache_maxiter < 0)
5782 reginfo->poscache_maxiter = I32_MAX;
5783 reginfo->poscache_iter = reginfo->poscache_maxiter;
5786 if (reginfo->poscache_iter-- == 0) {
5787 /* initialise cache */
5788 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5789 regmatch_info_aux *const aux = reginfo->info_aux;
5790 if (aux->poscache) {
5791 if ((SSize_t)reginfo->poscache_size < size) {
5792 Renew(aux->poscache, size, char);
5793 reginfo->poscache_size = size;
5795 Zero(aux->poscache, size, char);
5798 reginfo->poscache_size = size;
5799 Newxz(aux->poscache, size, char);
5801 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5802 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5803 PL_colors[4], PL_colors[5])
5807 if (reginfo->poscache_iter < 0) {
5808 /* have we already failed at this position? */
5809 SSize_t offset, mask;
5811 reginfo->poscache_iter = -1; /* stop eventual underflow */
5812 offset = (scan->flags & 0xf) - 1
5813 + (locinput - reginfo->strbeg)
5815 mask = 1 << (offset % 8);
5817 if (reginfo->info_aux->poscache[offset] & mask) {
5818 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5819 "%*s whilem: (cache) already tried at this position...\n",
5820 REPORT_CODE_OFF+depth*2, "")
5822 sayNO; /* cache records failure */
5824 ST.cache_offset = offset;
5825 ST.cache_mask = mask;
5829 /* Prefer B over A for minimal matching. */
5831 if (cur_curlyx->u.curlyx.minmod) {
5832 ST.save_curlyx = cur_curlyx;
5833 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5834 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5836 REGCP_SET(ST.lastcp);
5837 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5839 assert(0); /* NOTREACHED */
5842 /* Prefer A over B for maximal matching. */
5844 if (n < max) { /* More greed allowed? */
5845 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5847 cur_curlyx->u.curlyx.lastloc = locinput;
5848 REGCP_SET(ST.lastcp);
5849 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5850 assert(0); /* NOTREACHED */
5852 goto do_whilem_B_max;
5854 assert(0); /* NOTREACHED */
5856 case WHILEM_B_min: /* just matched B in a minimal match */
5857 case WHILEM_B_max: /* just matched B in a maximal match */
5858 cur_curlyx = ST.save_curlyx;
5860 assert(0); /* NOTREACHED */
5862 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5863 cur_curlyx = ST.save_curlyx;
5864 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5865 cur_curlyx->u.curlyx.count--;
5867 assert(0); /* NOTREACHED */
5869 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5871 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5872 REGCP_UNWIND(ST.lastcp);
5873 regcppop(rex, &maxopenparen);
5874 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5875 cur_curlyx->u.curlyx.count--;
5877 assert(0); /* NOTREACHED */
5879 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5880 REGCP_UNWIND(ST.lastcp);
5881 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5883 "%*s whilem: failed, trying continuation...\n",
5884 REPORT_CODE_OFF+depth*2, "")
5887 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5888 && ckWARN(WARN_REGEXP)
5889 && !reginfo->warned)
5891 reginfo->warned = TRUE;
5892 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5893 "Complex regular subexpression recursion limit (%d) "
5899 ST.save_curlyx = cur_curlyx;
5900 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5901 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5903 assert(0); /* NOTREACHED */
5905 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5906 cur_curlyx = ST.save_curlyx;
5907 REGCP_UNWIND(ST.lastcp);
5908 regcppop(rex, &maxopenparen);
5910 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5911 /* Maximum greed exceeded */
5912 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5913 && ckWARN(WARN_REGEXP)
5914 && !reginfo->warned)
5916 reginfo->warned = TRUE;
5917 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5918 "Complex regular subexpression recursion "
5919 "limit (%d) exceeded",
5922 cur_curlyx->u.curlyx.count--;
5926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5927 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5929 /* Try grabbing another A and see if it helps. */
5930 cur_curlyx->u.curlyx.lastloc = locinput;
5931 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5933 REGCP_SET(ST.lastcp);
5934 PUSH_STATE_GOTO(WHILEM_A_min,
5935 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5937 assert(0); /* NOTREACHED */
5940 #define ST st->u.branch
5942 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5943 next = scan + ARG(scan);
5946 scan = NEXTOPER(scan);
5949 case BRANCH: /* /(...|A|...)/ */
5950 scan = NEXTOPER(scan); /* scan now points to inner node */
5951 ST.lastparen = rex->lastparen;
5952 ST.lastcloseparen = rex->lastcloseparen;
5953 ST.next_branch = next;
5956 /* Now go into the branch */
5958 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5960 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5962 assert(0); /* NOTREACHED */
5964 case CUTGROUP: /* /(*THEN)/ */
5965 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5966 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5967 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5968 assert(0); /* NOTREACHED */
5970 case CUTGROUP_next_fail:
5973 if (st->u.mark.mark_name)
5974 sv_commit = st->u.mark.mark_name;
5976 assert(0); /* NOTREACHED */
5980 assert(0); /* NOTREACHED */
5982 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5987 REGCP_UNWIND(ST.cp);
5988 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5989 scan = ST.next_branch;
5990 /* no more branches? */
5991 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5993 PerlIO_printf( Perl_debug_log,
5994 "%*s %sBRANCH failed...%s\n",
5995 REPORT_CODE_OFF+depth*2, "",
6001 continue; /* execute next BRANCH[J] op */
6002 assert(0); /* NOTREACHED */
6004 case MINMOD: /* next op will be non-greedy, e.g. A*? */
6009 #define ST st->u.curlym
6011 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6013 /* This is an optimisation of CURLYX that enables us to push
6014 * only a single backtracking state, no matter how many matches
6015 * there are in {m,n}. It relies on the pattern being constant
6016 * length, with no parens to influence future backrefs
6020 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6022 ST.lastparen = rex->lastparen;
6023 ST.lastcloseparen = rex->lastcloseparen;
6025 /* if paren positive, emulate an OPEN/CLOSE around A */
6027 U32 paren = ST.me->flags;
6028 if (paren > maxopenparen)
6029 maxopenparen = paren;
6030 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6038 ST.c1 = CHRTEST_UNINIT;
6041 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6044 curlym_do_A: /* execute the A in /A{m,n}B/ */
6045 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6046 assert(0); /* NOTREACHED */
6048 case CURLYM_A: /* we've just matched an A */
6050 /* after first match, determine A's length: u.curlym.alen */
6051 if (ST.count == 1) {
6052 if (reginfo->is_utf8_target) {
6053 char *s = st->locinput;
6054 while (s < locinput) {
6060 ST.alen = locinput - st->locinput;
6063 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6066 PerlIO_printf(Perl_debug_log,
6067 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6068 (int)(REPORT_CODE_OFF+(depth*2)), "",
6069 (IV) ST.count, (IV)ST.alen)
6072 if (cur_eval && cur_eval->u.eval.close_paren &&
6073 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6077 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6078 if ( max == REG_INFTY || ST.count < max )
6079 goto curlym_do_A; /* try to match another A */
6081 goto curlym_do_B; /* try to match B */
6083 case CURLYM_A_fail: /* just failed to match an A */
6084 REGCP_UNWIND(ST.cp);
6086 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6087 || (cur_eval && cur_eval->u.eval.close_paren &&
6088 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6091 curlym_do_B: /* execute the B in /A{m,n}B/ */
6092 if (ST.c1 == CHRTEST_UNINIT) {
6093 /* calculate c1 and c2 for possible match of 1st char
6094 * following curly */
6095 ST.c1 = ST.c2 = CHRTEST_VOID;
6096 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6097 regnode *text_node = ST.B;
6098 if (! HAS_TEXT(text_node))
6099 FIND_NEXT_IMPT(text_node);
6102 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6104 But the former is redundant in light of the latter.
6106 if this changes back then the macro for
6107 IS_TEXT and friends need to change.
6109 if (PL_regkind[OP(text_node)] == EXACT) {
6110 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6111 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6121 PerlIO_printf(Perl_debug_log,
6122 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
6123 (int)(REPORT_CODE_OFF+(depth*2)),
6126 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6127 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6128 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6129 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6131 /* simulate B failing */
6133 PerlIO_printf(Perl_debug_log,
6134 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6135 (int)(REPORT_CODE_OFF+(depth*2)),"",
6136 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6137 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6138 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6140 state_num = CURLYM_B_fail;
6141 goto reenter_switch;
6144 else if (nextchr != ST.c1 && nextchr != ST.c2) {
6145 /* simulate B failing */
6147 PerlIO_printf(Perl_debug_log,
6148 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6149 (int)(REPORT_CODE_OFF+(depth*2)),"",
6150 (int) nextchr, ST.c1, ST.c2)
6152 state_num = CURLYM_B_fail;
6153 goto reenter_switch;
6158 /* emulate CLOSE: mark current A as captured */
6159 I32 paren = ST.me->flags;
6161 rex->offs[paren].start
6162 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6163 rex->offs[paren].end = locinput - reginfo->strbeg;
6164 if ((U32)paren > rex->lastparen)
6165 rex->lastparen = paren;
6166 rex->lastcloseparen = paren;
6169 rex->offs[paren].end = -1;
6170 if (cur_eval && cur_eval->u.eval.close_paren &&
6171 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6180 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6181 assert(0); /* NOTREACHED */
6183 case CURLYM_B_fail: /* just failed to match a B */
6184 REGCP_UNWIND(ST.cp);
6185 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6187 I32 max = ARG2(ST.me);
6188 if (max != REG_INFTY && ST.count == max)
6190 goto curlym_do_A; /* try to match a further A */
6192 /* backtrack one A */
6193 if (ST.count == ARG1(ST.me) /* min */)
6196 SET_locinput(HOPc(locinput, -ST.alen));
6197 goto curlym_do_B; /* try to match B */
6200 #define ST st->u.curly
6202 #define CURLY_SETPAREN(paren, success) \
6205 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6206 rex->offs[paren].end = locinput - reginfo->strbeg; \
6207 if (paren > rex->lastparen) \
6208 rex->lastparen = paren; \
6209 rex->lastcloseparen = paren; \
6212 rex->offs[paren].end = -1; \
6213 rex->lastparen = ST.lastparen; \
6214 rex->lastcloseparen = ST.lastcloseparen; \
6218 case STAR: /* /A*B/ where A is width 1 char */
6222 scan = NEXTOPER(scan);
6225 case PLUS: /* /A+B/ where A is width 1 char */
6229 scan = NEXTOPER(scan);
6232 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
6233 ST.paren = scan->flags; /* Which paren to set */
6234 ST.lastparen = rex->lastparen;
6235 ST.lastcloseparen = rex->lastcloseparen;
6236 if (ST.paren > maxopenparen)
6237 maxopenparen = ST.paren;
6238 ST.min = ARG1(scan); /* min to match */
6239 ST.max = ARG2(scan); /* max to match */
6240 if (cur_eval && cur_eval->u.eval.close_paren &&
6241 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6245 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6248 case CURLY: /* /A{m,n}B/ where A is width 1 char */
6250 ST.min = ARG1(scan); /* min to match */
6251 ST.max = ARG2(scan); /* max to match */
6252 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6255 * Lookahead to avoid useless match attempts
6256 * when we know what character comes next.
6258 * Used to only do .*x and .*?x, but now it allows
6259 * for )'s, ('s and (?{ ... })'s to be in the way
6260 * of the quantifier and the EXACT-like node. -- japhy
6263 assert(ST.min <= ST.max);
6264 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6265 ST.c1 = ST.c2 = CHRTEST_VOID;
6268 regnode *text_node = next;
6270 if (! HAS_TEXT(text_node))
6271 FIND_NEXT_IMPT(text_node);
6273 if (! HAS_TEXT(text_node))
6274 ST.c1 = ST.c2 = CHRTEST_VOID;
6276 if ( PL_regkind[OP(text_node)] != EXACT ) {
6277 ST.c1 = ST.c2 = CHRTEST_VOID;
6281 /* Currently we only get here when
6283 PL_rekind[OP(text_node)] == EXACT
6285 if this changes back then the macro for IS_TEXT and
6286 friends need to change. */
6287 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6288 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6300 char *li = locinput;
6303 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6309 if (ST.c1 == CHRTEST_VOID)
6310 goto curly_try_B_min;
6312 ST.oldloc = locinput;
6314 /* set ST.maxpos to the furthest point along the
6315 * string that could possibly match */
6316 if (ST.max == REG_INFTY) {
6317 ST.maxpos = reginfo->strend - 1;
6319 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6322 else if (utf8_target) {
6323 int m = ST.max - ST.min;
6324 for (ST.maxpos = locinput;
6325 m >0 && ST.maxpos < reginfo->strend; m--)
6326 ST.maxpos += UTF8SKIP(ST.maxpos);
6329 ST.maxpos = locinput + ST.max - ST.min;
6330 if (ST.maxpos >= reginfo->strend)
6331 ST.maxpos = reginfo->strend - 1;
6333 goto curly_try_B_min_known;
6337 /* avoid taking address of locinput, so it can remain
6339 char *li = locinput;
6340 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6341 if (ST.count < ST.min)
6344 if ((ST.count > ST.min)
6345 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6347 /* A{m,n} must come at the end of the string, there's
6348 * no point in backing off ... */
6350 /* ...except that $ and \Z can match before *and* after
6351 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6352 We may back off by one in this case. */
6353 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6357 goto curly_try_B_max;
6359 assert(0); /* NOTREACHED */
6362 case CURLY_B_min_known_fail:
6363 /* failed to find B in a non-greedy match where c1,c2 valid */
6365 REGCP_UNWIND(ST.cp);
6367 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6369 /* Couldn't or didn't -- move forward. */
6370 ST.oldloc = locinput;
6372 locinput += UTF8SKIP(locinput);
6376 curly_try_B_min_known:
6377 /* find the next place where 'B' could work, then call B */
6381 n = (ST.oldloc == locinput) ? 0 : 1;
6382 if (ST.c1 == ST.c2) {
6383 /* set n to utf8_distance(oldloc, locinput) */
6384 while (locinput <= ST.maxpos
6385 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6387 locinput += UTF8SKIP(locinput);
6392 /* set n to utf8_distance(oldloc, locinput) */
6393 while (locinput <= ST.maxpos
6394 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6395 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6397 locinput += UTF8SKIP(locinput);
6402 else { /* Not utf8_target */
6403 if (ST.c1 == ST.c2) {
6404 while (locinput <= ST.maxpos &&
6405 UCHARAT(locinput) != ST.c1)
6409 while (locinput <= ST.maxpos
6410 && UCHARAT(locinput) != ST.c1
6411 && UCHARAT(locinput) != ST.c2)
6414 n = locinput - ST.oldloc;
6416 if (locinput > ST.maxpos)
6419 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6420 * at b; check that everything between oldloc and
6421 * locinput matches */
6422 char *li = ST.oldloc;
6424 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6426 assert(n == REG_INFTY || locinput == li);
6428 CURLY_SETPAREN(ST.paren, ST.count);
6429 if (cur_eval && cur_eval->u.eval.close_paren &&
6430 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6433 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6435 assert(0); /* NOTREACHED */
6438 case CURLY_B_min_fail:
6439 /* failed to find B in a non-greedy match where c1,c2 invalid */
6441 REGCP_UNWIND(ST.cp);
6443 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6445 /* failed -- move forward one */
6447 char *li = locinput;
6448 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6455 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6456 ST.count > 0)) /* count overflow ? */
6459 CURLY_SETPAREN(ST.paren, ST.count);
6460 if (cur_eval && cur_eval->u.eval.close_paren &&
6461 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6464 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6468 assert(0); /* NOTREACHED */
6472 /* a successful greedy match: now try to match B */
6473 if (cur_eval && cur_eval->u.eval.close_paren &&
6474 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6478 bool could_match = locinput < reginfo->strend;
6480 /* If it could work, try it. */
6481 if (ST.c1 != CHRTEST_VOID && could_match) {
6482 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6484 could_match = memEQ(locinput,
6489 UTF8SKIP(locinput));
6492 could_match = UCHARAT(locinput) == ST.c1
6493 || UCHARAT(locinput) == ST.c2;
6496 if (ST.c1 == CHRTEST_VOID || could_match) {
6497 CURLY_SETPAREN(ST.paren, ST.count);
6498 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6499 assert(0); /* NOTREACHED */
6504 case CURLY_B_max_fail:
6505 /* failed to find B in a greedy match */
6507 REGCP_UNWIND(ST.cp);
6509 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6512 if (--ST.count < ST.min)
6514 locinput = HOPc(locinput, -1);
6515 goto curly_try_B_max;
6519 case END: /* last op of main pattern */
6522 /* we've just finished A in /(??{A})B/; now continue with B */
6524 st->u.eval.prev_rex = rex_sv; /* inner */
6526 /* Save *all* the positions. */
6527 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6528 rex_sv = cur_eval->u.eval.prev_rex;
6529 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6530 SET_reg_curpm(rex_sv);
6531 rex = ReANY(rex_sv);
6532 rexi = RXi_GET(rex);
6533 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6535 REGCP_SET(st->u.eval.lastcp);
6537 /* Restore parens of the outer rex without popping the
6539 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6542 st->u.eval.prev_eval = cur_eval;
6543 cur_eval = cur_eval->u.eval.prev_eval;
6545 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6546 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6547 if ( nochange_depth )
6550 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6551 locinput); /* match B */
6554 if (locinput < reginfo->till) {
6555 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6556 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6558 (long)(locinput - startpos),
6559 (long)(reginfo->till - startpos),
6562 sayNO_SILENT; /* Cannot match: too short. */
6564 sayYES; /* Success! */
6566 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6568 PerlIO_printf(Perl_debug_log,
6569 "%*s %ssubpattern success...%s\n",
6570 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6571 sayYES; /* Success! */
6574 #define ST st->u.ifmatch
6579 case SUSPEND: /* (?>A) */
6581 newstart = locinput;
6584 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6586 goto ifmatch_trivial_fail_test;
6588 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6590 ifmatch_trivial_fail_test:
6592 char * const s = HOPBACKc(locinput, scan->flags);
6597 sw = 1 - cBOOL(ST.wanted);
6601 next = scan + ARG(scan);
6609 newstart = locinput;
6613 ST.logical = logical;
6614 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6616 /* execute body of (?...A) */
6617 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6618 assert(0); /* NOTREACHED */
6621 case IFMATCH_A_fail: /* body of (?...A) failed */
6622 ST.wanted = !ST.wanted;
6625 case IFMATCH_A: /* body of (?...A) succeeded */
6627 sw = cBOOL(ST.wanted);
6629 else if (!ST.wanted)
6632 if (OP(ST.me) != SUSPEND) {
6633 /* restore old position except for (?>...) */
6634 locinput = st->locinput;
6636 scan = ST.me + ARG(ST.me);
6639 continue; /* execute B */
6643 case LONGJMP: /* alternative with many branches compiles to
6644 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6645 next = scan + ARG(scan);
6650 case COMMIT: /* (*COMMIT) */
6651 reginfo->cutpoint = reginfo->strend;
6654 case PRUNE: /* (*PRUNE) */
6656 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6657 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6658 assert(0); /* NOTREACHED */
6660 case COMMIT_next_fail:
6664 case OPFAIL: /* (*FAIL) */
6666 assert(0); /* NOTREACHED */
6668 #define ST st->u.mark
6669 case MARKPOINT: /* (*MARK:foo) */
6670 ST.prev_mark = mark_state;
6671 ST.mark_name = sv_commit = sv_yes_mark
6672 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6674 ST.mark_loc = locinput;
6675 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6676 assert(0); /* NOTREACHED */
6678 case MARKPOINT_next:
6679 mark_state = ST.prev_mark;
6681 assert(0); /* NOTREACHED */
6683 case MARKPOINT_next_fail:
6684 if (popmark && sv_eq(ST.mark_name,popmark))
6686 if (ST.mark_loc > startpoint)
6687 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6688 popmark = NULL; /* we found our mark */
6689 sv_commit = ST.mark_name;
6692 PerlIO_printf(Perl_debug_log,
6693 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6694 REPORT_CODE_OFF+depth*2, "",
6695 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6698 mark_state = ST.prev_mark;
6699 sv_yes_mark = mark_state ?
6700 mark_state->u.mark.mark_name : NULL;
6702 assert(0); /* NOTREACHED */
6704 case SKIP: /* (*SKIP) */
6706 /* (*SKIP) : if we fail we cut here*/
6707 ST.mark_name = NULL;
6708 ST.mark_loc = locinput;
6709 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6711 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6712 otherwise do nothing. Meaning we need to scan
6714 regmatch_state *cur = mark_state;
6715 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6718 if ( sv_eq( cur->u.mark.mark_name,
6721 ST.mark_name = find;
6722 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6724 cur = cur->u.mark.prev_mark;
6727 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6730 case SKIP_next_fail:
6732 /* (*CUT:NAME) - Set up to search for the name as we
6733 collapse the stack*/
6734 popmark = ST.mark_name;
6736 /* (*CUT) - No name, we cut here.*/
6737 if (ST.mark_loc > startpoint)
6738 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6739 /* but we set sv_commit to latest mark_name if there
6740 is one so they can test to see how things lead to this
6743 sv_commit=mark_state->u.mark.mark_name;
6747 assert(0); /* NOTREACHED */
6750 case LNBREAK: /* \R */
6751 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6758 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6759 PTR2UV(scan), OP(scan));
6760 Perl_croak(aTHX_ "regexp memory corruption");
6762 /* this is a point to jump to in order to increment
6763 * locinput by one character */
6765 assert(!NEXTCHR_IS_EOS);
6767 locinput += PL_utf8skip[nextchr];
6768 /* locinput is allowed to go 1 char off the end, but not 2+ */
6769 if (locinput > reginfo->strend)
6778 /* switch break jumps here */
6779 scan = next; /* prepare to execute the next op and ... */
6780 continue; /* ... jump back to the top, reusing st */
6781 assert(0); /* NOTREACHED */
6784 /* push a state that backtracks on success */
6785 st->u.yes.prev_yes_state = yes_state;
6789 /* push a new regex state, then continue at scan */
6791 regmatch_state *newst;
6794 regmatch_state *cur = st;
6795 regmatch_state *curyes = yes_state;
6797 regmatch_slab *slab = PL_regmatch_slab;
6798 for (;curd > -1;cur--,curd--) {
6799 if (cur < SLAB_FIRST(slab)) {
6801 cur = SLAB_LAST(slab);
6803 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6804 REPORT_CODE_OFF + 2 + depth * 2,"",
6805 curd, PL_reg_name[cur->resume_state],
6806 (curyes == cur) ? "yes" : ""
6809 curyes = cur->u.yes.prev_yes_state;
6812 DEBUG_STATE_pp("push")
6815 st->locinput = locinput;
6817 if (newst > SLAB_LAST(PL_regmatch_slab))
6818 newst = S_push_slab(aTHX);
6819 PL_regmatch_state = newst;
6821 locinput = pushinput;
6824 assert(0); /* NOTREACHED */
6829 * We get here only if there's trouble -- normally "case END" is
6830 * the terminating point.
6832 Perl_croak(aTHX_ "corrupted regexp pointers");
6838 /* we have successfully completed a subexpression, but we must now
6839 * pop to the state marked by yes_state and continue from there */
6840 assert(st != yes_state);
6842 while (st != yes_state) {
6844 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6845 PL_regmatch_slab = PL_regmatch_slab->prev;
6846 st = SLAB_LAST(PL_regmatch_slab);
6850 DEBUG_STATE_pp("pop (no final)");
6852 DEBUG_STATE_pp("pop (yes)");
6858 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6859 || yes_state > SLAB_LAST(PL_regmatch_slab))
6861 /* not in this slab, pop slab */
6862 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6863 PL_regmatch_slab = PL_regmatch_slab->prev;
6864 st = SLAB_LAST(PL_regmatch_slab);
6866 depth -= (st - yes_state);
6869 yes_state = st->u.yes.prev_yes_state;
6870 PL_regmatch_state = st;
6873 locinput= st->locinput;
6874 state_num = st->resume_state + no_final;
6875 goto reenter_switch;
6878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6879 PL_colors[4], PL_colors[5]));
6881 if (reginfo->info_aux_eval) {
6882 /* each successfully executed (?{...}) block does the equivalent of
6883 * local $^R = do {...}
6884 * When popping the save stack, all these locals would be undone;
6885 * bypass this by setting the outermost saved $^R to the latest
6887 /* I dont know if this is needed or works properly now.
6888 * see code related to PL_replgv elsewhere in this file.
6891 if (oreplsv != GvSV(PL_replgv))
6892 sv_setsv(oreplsv, GvSV(PL_replgv));
6899 PerlIO_printf(Perl_debug_log,
6900 "%*s %sfailed...%s\n",
6901 REPORT_CODE_OFF+depth*2, "",
6902 PL_colors[4], PL_colors[5])
6914 /* there's a previous state to backtrack to */
6916 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6917 PL_regmatch_slab = PL_regmatch_slab->prev;
6918 st = SLAB_LAST(PL_regmatch_slab);
6920 PL_regmatch_state = st;
6921 locinput= st->locinput;
6923 DEBUG_STATE_pp("pop");
6925 if (yes_state == st)
6926 yes_state = st->u.yes.prev_yes_state;
6928 state_num = st->resume_state + 1; /* failure = success + 1 */
6929 goto reenter_switch;
6934 if (rex->intflags & PREGf_VERBARG_SEEN) {
6935 SV *sv_err = get_sv("REGERROR", 1);
6936 SV *sv_mrk = get_sv("REGMARK", 1);
6938 sv_commit = &PL_sv_no;
6940 sv_yes_mark = &PL_sv_yes;
6943 sv_commit = &PL_sv_yes;
6944 sv_yes_mark = &PL_sv_no;
6946 sv_setsv(sv_err, sv_commit);
6947 sv_setsv(sv_mrk, sv_yes_mark);
6951 if (last_pushed_cv) {
6954 PERL_UNUSED_VAR(SP);
6957 assert(!result || locinput - reginfo->strbeg >= 0);
6958 return result ? locinput - reginfo->strbeg : -1;
6962 - regrepeat - repeatedly match something simple, report how many
6964 * What 'simple' means is a node which can be the operand of a quantifier like
6967 * startposp - pointer a pointer to the start position. This is updated
6968 * to point to the byte following the highest successful
6970 * p - the regnode to be repeatedly matched against.
6971 * reginfo - struct holding match state, such as strend
6972 * max - maximum number of things to match.
6973 * depth - (for debugging) backtracking depth.
6976 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6977 regmatch_info *const reginfo, I32 max, int depth)
6980 char *scan; /* Pointer to current position in target string */
6982 char *loceol = reginfo->strend; /* local version */
6983 I32 hardcount = 0; /* How many matches so far */
6984 bool utf8_target = reginfo->is_utf8_target;
6985 int to_complement = 0; /* Invert the result? */
6987 _char_class_number classnum;
6989 PERL_UNUSED_ARG(depth);
6992 PERL_ARGS_ASSERT_REGREPEAT;
6995 if (max == REG_INFTY)
6997 else if (! utf8_target && loceol - scan > max)
6998 loceol = scan + max;
7000 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7001 * to the maximum of how far we should go in it (leaving it set to the real
7002 * end, if the maximum permissible would take us beyond that). This allows
7003 * us to make the loop exit condition that we haven't gone past <loceol> to
7004 * also mean that we haven't exceeded the max permissible count, saving a
7005 * test each time through the loop. But it assumes that the OP matches a
7006 * single byte, which is true for most of the OPs below when applied to a
7007 * non-UTF-8 target. Those relatively few OPs that don't have this
7008 * characteristic will have to compensate.
7010 * There is no adjustment for UTF-8 targets, as the number of bytes per
7011 * character varies. OPs will have to test both that the count is less
7012 * than the max permissible (using <hardcount> to keep track), and that we
7013 * are still within the bounds of the string (using <loceol>. A few OPs
7014 * match a single byte no matter what the encoding. They can omit the max
7015 * test if, for the UTF-8 case, they do the adjustment that was skipped
7018 * Thus, the code above sets things up for the common case; and exceptional
7019 * cases need extra work; the common case is to make sure <scan> doesn't
7020 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7021 * count doesn't exceed the maximum permissible */
7026 while (scan < loceol && hardcount < max && *scan != '\n') {
7027 scan += UTF8SKIP(scan);
7031 while (scan < loceol && *scan != '\n')
7037 while (scan < loceol && hardcount < max) {
7038 scan += UTF8SKIP(scan);
7045 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
7046 if (utf8_target && loceol - scan > max) {
7048 /* <loceol> hadn't been adjusted in the UTF-8 case */
7056 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7060 /* Can use a simple loop if the pattern char to match on is invariant
7061 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
7062 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7063 * true iff it doesn't matter if the argument is in UTF-8 or not */
7064 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7065 if (utf8_target && loceol - scan > max) {
7066 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7067 * since here, to match at all, 1 char == 1 byte */
7068 loceol = scan + max;
7070 while (scan < loceol && UCHARAT(scan) == c) {
7074 else if (reginfo->is_utf8_pat) {
7076 STRLEN scan_char_len;
7078 /* When both target and pattern are UTF-8, we have to do
7080 while (hardcount < max
7082 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7083 && memEQ(scan, STRING(p), scan_char_len))
7085 scan += scan_char_len;
7089 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7091 /* Target isn't utf8; convert the character in the UTF-8
7092 * pattern to non-UTF8, and do a simple loop */
7093 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7094 while (scan < loceol && UCHARAT(scan) == c) {
7097 } /* else pattern char is above Latin1, can't possibly match the
7102 /* Here, the string must be utf8; pattern isn't, and <c> is
7103 * different in utf8 than not, so can't compare them directly.
7104 * Outside the loop, find the two utf8 bytes that represent c, and
7105 * then look for those in sequence in the utf8 string */
7106 U8 high = UTF8_TWO_BYTE_HI(c);
7107 U8 low = UTF8_TWO_BYTE_LO(c);
7109 while (hardcount < max
7110 && scan + 1 < loceol
7111 && UCHARAT(scan) == high
7112 && UCHARAT(scan + 1) == low)
7120 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
7121 assert(! reginfo->is_utf8_pat);
7124 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7128 utf8_flags = FOLDEQ_LOCALE;
7131 case EXACTF: /* This node only generated for non-utf8 patterns */
7132 assert(! reginfo->is_utf8_pat);
7138 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7142 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7144 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7146 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7149 if (c1 == CHRTEST_VOID) {
7150 /* Use full Unicode fold matching */
7151 char *tmpeol = reginfo->strend;
7152 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7153 while (hardcount < max
7154 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7155 STRING(p), NULL, pat_len,
7156 reginfo->is_utf8_pat, utf8_flags))
7159 tmpeol = reginfo->strend;
7163 else if (utf8_target) {
7165 while (scan < loceol
7167 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7169 scan += UTF8SKIP(scan);
7174 while (scan < loceol
7176 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7177 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7179 scan += UTF8SKIP(scan);
7184 else if (c1 == c2) {
7185 while (scan < loceol && UCHARAT(scan) == c1) {
7190 while (scan < loceol &&
7191 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7201 while (hardcount < max
7203 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7205 scan += UTF8SKIP(scan);
7209 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7214 /* The argument (FLAGS) to all the POSIX node types is the class number */
7221 if (! utf8_target) {
7222 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7228 while (hardcount < max && scan < loceol
7229 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7232 scan += UTF8SKIP(scan);
7245 if (utf8_target && loceol - scan > max) {
7247 /* We didn't adjust <loceol> at the beginning of this routine
7248 * because is UTF-8, but it is actually ok to do so, since here, to
7249 * match, 1 char == 1 byte. */
7250 loceol = scan + max;
7252 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7265 if (! utf8_target) {
7266 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7272 /* The complement of something that matches only ASCII matches all
7273 * UTF-8 variant code points, plus everything in ASCII that isn't
7275 while (hardcount < max && scan < loceol
7276 && (! UTF8_IS_INVARIANT(*scan)
7277 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7279 scan += UTF8SKIP(scan);
7290 if (! utf8_target) {
7291 while (scan < loceol && to_complement
7292 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7299 classnum = (_char_class_number) FLAGS(p);
7300 if (classnum < _FIRST_NON_SWASH_CC) {
7302 /* Here, a swash is needed for above-Latin1 code points.
7303 * Process as many Latin1 code points using the built-in rules.
7304 * Go to another loop to finish processing upon encountering
7305 * the first Latin1 code point. We could do that in this loop
7306 * as well, but the other way saves having to test if the swash
7307 * has been loaded every time through the loop: extra space to
7309 while (hardcount < max && scan < loceol) {
7310 if (UTF8_IS_INVARIANT(*scan)) {
7311 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7318 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7319 if (! (to_complement
7320 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7329 goto found_above_latin1;
7336 /* For these character classes, the knowledge of how to handle
7337 * every code point is compiled in to Perl via a macro. This
7338 * code is written for making the loops as tight as possible.
7339 * It could be refactored to save space instead */
7341 case _CC_ENUM_SPACE: /* XXX would require separate code
7342 if we revert the change of \v
7345 case _CC_ENUM_PSXSPC:
7346 while (hardcount < max
7348 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7350 scan += UTF8SKIP(scan);
7354 case _CC_ENUM_BLANK:
7355 while (hardcount < max
7357 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7359 scan += UTF8SKIP(scan);
7363 case _CC_ENUM_XDIGIT:
7364 while (hardcount < max
7366 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7368 scan += UTF8SKIP(scan);
7372 case _CC_ENUM_VERTSPACE:
7373 while (hardcount < max
7375 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7377 scan += UTF8SKIP(scan);
7381 case _CC_ENUM_CNTRL:
7382 while (hardcount < max
7384 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7386 scan += UTF8SKIP(scan);
7391 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7397 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7399 /* Load the swash if not already present */
7400 if (! PL_utf8_swash_ptrs[classnum]) {
7401 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7402 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7406 PL_XPosix_ptrs[classnum], &flags);
7409 while (hardcount < max && scan < loceol
7410 && to_complement ^ cBOOL(_generic_utf8(
7413 swash_fetch(PL_utf8_swash_ptrs[classnum],
7417 scan += UTF8SKIP(scan);
7424 while (hardcount < max && scan < loceol &&
7425 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7430 /* LNBREAK can match one or two latin chars, which is ok, but we
7431 * have to use hardcount in this situation, and throw away the
7432 * adjustment to <loceol> done before the switch statement */
7433 loceol = reginfo->strend;
7434 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7455 /* These are all 0 width, so match right here or not at all. */
7459 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7460 assert(0); /* NOTREACHED */
7467 c = scan - *startposp;
7471 GET_RE_DEBUG_FLAGS_DECL;
7473 SV * const prop = sv_newmortal();
7474 regprop(prog, prop, p, reginfo);
7475 PerlIO_printf(Perl_debug_log,
7476 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7477 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7485 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7487 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7488 create a copy so that changes the caller makes won't change the shared one.
7489 If <altsvp> is non-null, will return NULL in it, for back-compat.
7492 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7494 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7500 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
7504 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7505 const regnode* node,
7508 SV** only_utf8_locale_ptr)
7510 /* For internal core use only.
7511 * Returns the swash for the input 'node' in the regex 'prog'.
7512 * If <doinit> is 'true', will attempt to create the swash if not already
7514 * If <listsvp> is non-null, will return the printable contents of the
7515 * swash. This can be used to get debugging information even before the
7516 * swash exists, by calling this function with 'doinit' set to false, in
7517 * which case the components that will be used to eventually create the
7518 * swash are returned (in a printable form).
7519 * Tied intimately to how regcomp.c sets up the data structure */
7523 SV *si = NULL; /* Input swash initialization string */
7526 RXi_GET_DECL(prog,progi);
7527 const struct reg_data * const data = prog ? progi->data : NULL;
7529 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7531 assert(ANYOF_FLAGS(node)
7532 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
7534 if (data && data->count) {
7535 const U32 n = ARG(node);
7537 if (data->what[n] == 's') {
7538 SV * const rv = MUTABLE_SV(data->data[n]);
7539 AV * const av = MUTABLE_AV(SvRV(rv));
7540 SV **const ary = AvARRAY(av);
7541 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7543 si = *ary; /* ary[0] = the string to initialize the swash with */
7545 /* Elements 3 and 4 are either both present or both absent. [3] is
7546 * any inversion list generated at compile time; [4] indicates if
7547 * that inversion list has any user-defined properties in it. */
7548 if (av_tindex(av) >= 2) {
7549 if (only_utf8_locale_ptr
7551 && ary[2] != &PL_sv_undef)
7553 *only_utf8_locale_ptr = ary[2];
7556 *only_utf8_locale_ptr = NULL;
7559 if (av_tindex(av) >= 3) {
7562 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7570 /* Element [1] is reserved for the set-up swash. If already there,
7571 * return it; if not, create it and store it there */
7572 if (ary[1] && SvROK(ary[1])) {
7575 else if (doinit && ((si && si != &PL_sv_undef)
7576 || (invlist && invlist != &PL_sv_undef))) {
7578 sw = _core_swash_init("utf8", /* the utf8 package */
7582 0, /* not from tr/// */
7585 (void)av_store(av, 1, sw);
7590 /* If requested, return a printable version of what this swash matches */
7592 SV* matches_string = newSVpvn("", 0);
7594 /* The swash should be used, if possible, to get the data, as it
7595 * contains the resolved data. But this function can be called at
7596 * compile-time, before everything gets resolved, in which case we
7597 * return the currently best available information, which is the string
7598 * that will eventually be used to do that resolving, 'si' */
7599 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7600 && (si && si != &PL_sv_undef))
7602 sv_catsv(matches_string, si);
7605 /* Add the inversion list to whatever we have. This may have come from
7606 * the swash, or from an input parameter */
7608 sv_catsv(matches_string, _invlist_contents(invlist));
7610 *listsvp = matches_string;
7615 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7618 - reginclass - determine if a character falls into a character class
7620 n is the ANYOF regnode
7621 p is the target string
7622 p_end points to one byte beyond the end of the target string
7623 utf8_target tells whether p is in UTF-8.
7625 Returns true if matched; false otherwise.
7627 Note that this can be a synthetic start class, a combination of various
7628 nodes, so things you think might be mutually exclusive, such as locale,
7629 aren't. It can match both locale and non-locale
7634 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7637 const char flags = ANYOF_FLAGS(n);
7641 PERL_ARGS_ASSERT_REGINCLASS;
7643 /* If c is not already the code point, get it. Note that
7644 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7645 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7647 c = utf8n_to_uvchr(p, p_end - p, &c_len,
7648 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7649 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7650 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7651 * UTF8_ALLOW_FFFF */
7652 if (c_len == (STRLEN)-1)
7653 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7656 /* If this character is potentially in the bitmap, check it */
7658 if (ANYOF_BITMAP_TEST(n, c))
7660 else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
7666 else if (flags & ANYOF_LOCALE_FLAGS) {
7667 if (flags & ANYOF_LOC_FOLD) {
7668 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7672 if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
7674 /* The data structure is arranged so bits 0, 2, 4, ... are set
7675 * if the class includes the Posix character class given by
7676 * bit/2; and 1, 3, 5, ... are set if the class includes the
7677 * complemented Posix class given by int(bit/2). So we loop
7678 * through the bits, each time changing whether we complement
7679 * the result or not. Suppose for the sake of illustration
7680 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7681 * is set, it means there is a match for this ANYOF node if the
7682 * character is in the class given by the expression (0 / 2 = 0
7683 * = \w). If it is in that class, isFOO_lc() will return 1,
7684 * and since 'to_complement' is 0, the result will stay TRUE,
7685 * and we exit the loop. Suppose instead that bit 0 is 0, but
7686 * bit 1 is 1. That means there is a match if the character
7687 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7688 * but will on bit 1. On the second iteration 'to_complement'
7689 * will be 1, so the exclusive or will reverse things, so we
7690 * are testing for \W. On the third iteration, 'to_complement'
7691 * will be 0, and we would be testing for \s; the fourth
7692 * iteration would test for \S, etc.
7694 * Note that this code assumes that all the classes are closed
7695 * under folding. For example, if a character matches \w, then
7696 * its fold does too; and vice versa. This should be true for
7697 * any well-behaved locale for all the currently defined Posix
7698 * classes, except for :lower: and :upper:, which are handled
7699 * by the pseudo-class :cased: which matches if either of the
7700 * other two does. To get rid of this assumption, an outer
7701 * loop could be used below to iterate over both the source
7702 * character, and its fold (if different) */
7705 int to_complement = 0;
7707 while (count < ANYOF_MAX) {
7708 if (ANYOF_POSIXL_TEST(n, count)
7709 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7722 /* If the bitmap didn't (or couldn't) match, and something outside the
7723 * bitmap could match, try that. */
7725 if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
7726 match = TRUE; /* Everything above 255 matches */
7728 else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
7729 || (utf8_target && (flags & ANYOF_UTF8))
7730 || ((flags & ANYOF_LOC_FOLD)
7731 && IN_UTF8_CTYPE_LOCALE
7732 && ARG(n) != ANYOF_NONBITMAP_EMPTY))
7734 SV* only_utf8_locale = NULL;
7735 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7741 } else { /* Convert to utf8 */
7743 utf8_p = bytes_to_utf8(p, &len);
7746 if (swash_fetch(sw, utf8_p, TRUE)) {
7750 /* If we allocated a string above, free it */
7751 if (! utf8_target) Safefree(utf8_p);
7753 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7754 match = _invlist_contains_cp(only_utf8_locale, c);
7758 if (UNICODE_IS_SUPER(c)
7759 && (flags & ANYOF_WARN_SUPER)
7760 && ckWARN_d(WARN_NON_UNICODE))
7762 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7763 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7767 #if ANYOF_INVERT != 1
7768 /* Depending on compiler optimization cBOOL takes time, so if don't have to
7770 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7773 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7774 return (flags & ANYOF_INVERT) ^ match;
7778 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7780 /* return the position 'off' UTF-8 characters away from 's', forward if
7781 * 'off' >= 0, backwards if negative. But don't go outside of position
7782 * 'lim', which better be < s if off < 0 */
7786 PERL_ARGS_ASSERT_REGHOP3;
7789 while (off-- && s < lim) {
7790 /* XXX could check well-formedness here */
7795 while (off++ && s > lim) {
7797 if (UTF8_IS_CONTINUED(*s)) {
7798 while (s > lim && UTF8_IS_CONTINUATION(*s))
7801 /* XXX could check well-formedness here */
7808 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7812 PERL_ARGS_ASSERT_REGHOP4;
7815 while (off-- && s < rlim) {
7816 /* XXX could check well-formedness here */
7821 while (off++ && s > llim) {
7823 if (UTF8_IS_CONTINUED(*s)) {
7824 while (s > llim && UTF8_IS_CONTINUATION(*s))
7827 /* XXX could check well-formedness here */
7834 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7838 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7841 while (off-- && s < lim) {
7842 /* XXX could check well-formedness here */
7849 while (off++ && s > lim) {
7851 if (UTF8_IS_CONTINUED(*s)) {
7852 while (s > lim && UTF8_IS_CONTINUATION(*s))
7855 /* XXX could check well-formedness here */
7864 /* when executing a regex that may have (?{}), extra stuff needs setting
7865 up that will be visible to the called code, even before the current
7866 match has finished. In particular:
7868 * $_ is localised to the SV currently being matched;
7869 * pos($_) is created if necessary, ready to be updated on each call-out
7871 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7872 isn't set until the current pattern is successfully finished), so that
7873 $1 etc of the match-so-far can be seen;
7874 * save the old values of subbeg etc of the current regex, and set then
7875 to the current string (again, this is normally only done at the end
7880 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7883 regexp *const rex = ReANY(reginfo->prog);
7884 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7886 eval_state->rex = rex;
7889 /* Make $_ available to executed code. */
7890 if (reginfo->sv != DEFSV) {
7892 DEFSV_set(reginfo->sv);
7895 if (!(mg = mg_find_mglob(reginfo->sv))) {
7896 /* prepare for quick setting of pos */
7897 mg = sv_magicext_mglob(reginfo->sv);
7900 eval_state->pos_magic = mg;
7901 eval_state->pos = mg->mg_len;
7902 eval_state->pos_flags = mg->mg_flags;
7905 eval_state->pos_magic = NULL;
7907 if (!PL_reg_curpm) {
7908 /* PL_reg_curpm is a fake PMOP that we can attach the current
7909 * regex to and point PL_curpm at, so that $1 et al are visible
7910 * within a /(?{})/. It's just allocated once per interpreter the
7911 * first time its needed */
7912 Newxz(PL_reg_curpm, 1, PMOP);
7915 SV* const repointer = &PL_sv_undef;
7916 /* this regexp is also owned by the new PL_reg_curpm, which
7917 will try to free it. */
7918 av_push(PL_regex_padav, repointer);
7919 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
7920 PL_regex_pad = AvARRAY(PL_regex_padav);
7924 SET_reg_curpm(reginfo->prog);
7925 eval_state->curpm = PL_curpm;
7926 PL_curpm = PL_reg_curpm;
7927 if (RXp_MATCH_COPIED(rex)) {
7928 /* Here is a serious problem: we cannot rewrite subbeg,
7929 since it may be needed if this match fails. Thus
7930 $` inside (?{}) could fail... */
7931 eval_state->subbeg = rex->subbeg;
7932 eval_state->sublen = rex->sublen;
7933 eval_state->suboffset = rex->suboffset;
7934 eval_state->subcoffset = rex->subcoffset;
7936 eval_state->saved_copy = rex->saved_copy;
7938 RXp_MATCH_COPIED_off(rex);
7941 eval_state->subbeg = NULL;
7942 rex->subbeg = (char *)reginfo->strbeg;
7944 rex->subcoffset = 0;
7945 rex->sublen = reginfo->strend - reginfo->strbeg;
7949 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7952 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7955 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7956 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
7959 Safefree(aux->poscache);
7963 /* undo the effects of S_setup_eval_state() */
7965 if (eval_state->subbeg) {
7966 regexp * const rex = eval_state->rex;
7967 rex->subbeg = eval_state->subbeg;
7968 rex->sublen = eval_state->sublen;
7969 rex->suboffset = eval_state->suboffset;
7970 rex->subcoffset = eval_state->subcoffset;
7972 rex->saved_copy = eval_state->saved_copy;
7974 RXp_MATCH_COPIED_on(rex);
7976 if (eval_state->pos_magic)
7978 eval_state->pos_magic->mg_len = eval_state->pos;
7979 eval_state->pos_magic->mg_flags =
7980 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7981 | (eval_state->pos_flags & MGf_BYTES);
7984 PL_curpm = eval_state->curpm;
7987 PL_regmatch_state = aux->old_regmatch_state;
7988 PL_regmatch_slab = aux->old_regmatch_slab;
7990 /* free all slabs above current one - this must be the last action
7991 * of this function, as aux and eval_state are allocated within
7992 * slabs and may be freed here */
7994 s = PL_regmatch_slab->next;
7996 PL_regmatch_slab->next = NULL;
7998 regmatch_slab * const osl = s;
8007 S_to_utf8_substr(pTHX_ regexp *prog)
8009 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8010 * on the converted value */
8014 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8017 if (prog->substrs->data[i].substr
8018 && !prog->substrs->data[i].utf8_substr) {
8019 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8020 prog->substrs->data[i].utf8_substr = sv;
8021 sv_utf8_upgrade(sv);
8022 if (SvVALID(prog->substrs->data[i].substr)) {
8023 if (SvTAIL(prog->substrs->data[i].substr)) {
8024 /* Trim the trailing \n that fbm_compile added last
8026 SvCUR_set(sv, SvCUR(sv) - 1);
8027 /* Whilst this makes the SV technically "invalid" (as its
8028 buffer is no longer followed by "\0") when fbm_compile()
8029 adds the "\n" back, a "\0" is restored. */
8030 fbm_compile(sv, FBMcf_TAIL);
8034 if (prog->substrs->data[i].substr == prog->check_substr)
8035 prog->check_utf8 = sv;
8041 S_to_byte_substr(pTHX_ regexp *prog)
8043 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8044 * on the converted value; returns FALSE if can't be converted. */
8049 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8052 if (prog->substrs->data[i].utf8_substr
8053 && !prog->substrs->data[i].substr) {
8054 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8055 if (! sv_utf8_downgrade(sv, TRUE)) {
8058 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8059 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8060 /* Trim the trailing \n that fbm_compile added last
8062 SvCUR_set(sv, SvCUR(sv) - 1);
8063 fbm_compile(sv, FBMcf_TAIL);
8067 prog->substrs->data[i].substr = sv;
8068 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8069 prog->check_substr = sv;
8078 * c-indentation-style: bsd
8080 * indent-tabs-mode: nil
8083 * ex: set ts=8 sts=4 sw=4 et: