5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "inline_invlist.c"
84 #include "unicode_constants.h"
87 /* At least one required character in the target string is expressible only in
89 static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
93 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define STATIC static
104 /* Valid only for non-utf8 strings: avoids the reginclass
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
107 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
117 #define HOPc(pos,off) \
118 (char *)(reginfo->is_utf8_target \
119 ? reghop3((U8*)pos, off, \
120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
123 #define HOPBACKc(pos, off) \
124 (char*)(reginfo->is_utf8_target \
125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
126 : (pos - off >= reginfo->strbeg) \
130 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
131 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 /* lim must be +ve. Returns NULL on overshoot */
134 #define HOPMAYBE3(pos,off,lim) \
135 (reginfo->is_utf8_target \
136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
137 : ((U8*)pos + off <= lim) \
141 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
142 * off must be >=0; args should be vars rather than expressions */
143 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
145 : (U8*)((pos + off) > lim ? lim : (pos + off)))
147 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
150 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
152 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
153 #define NEXTCHR_IS_EOS (nextchr < 0)
155 #define SET_nextchr \
156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
158 #define SET_locinput(p) \
163 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
167 1, 0, invlist, &flags); \
172 /* If in debug mode, we test that a known character properly matches */
174 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
177 utf8_char_in_property) \
178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
181 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
184 utf8_char_in_property) \
185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
188 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
191 PL_XPosix_ptrs[_CC_WORDCHAR], \
192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
194 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
197 "_X_regular_begin", \
199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
203 COMBINING_GRAVE_ACCENT_UTF8); \
206 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
207 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
209 /* for use after a quantifier and before an EXACT-like node -- japhy */
210 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
212 * NOTE that *nothing* that affects backtracking should be in here, specifically
213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
214 * node that is in between two EXACT like nodes when ascertaining what the required
215 * "follow" character is. This should probably be moved to regex compile time
216 * although it may be done at run time beause of the REF possibility - more
217 * investigation required. -- demerphq
219 #define JUMPABLE(rn) ( \
221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
224 OP(rn) == PLUS || OP(rn) == MINMOD || \
226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
228 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
230 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
233 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
234 we don't need this definition. */
235 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
236 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
237 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
240 /* ... so we use this as its faster. */
241 #define IS_TEXT(rn) ( OP(rn)==EXACT )
242 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
243 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
244 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
249 Search for mandatory following text node; for lookahead, the text must
250 follow but for lookbehind (rn->flags != 0) we skip to the next step.
252 #define FIND_NEXT_IMPT(rn) STMT_START { \
253 while (JUMPABLE(rn)) { \
254 const OPCODE type = OP(rn); \
255 if (type == SUSPEND || PL_regkind[type] == CURLY) \
256 rn = NEXTOPER(NEXTOPER(rn)); \
257 else if (type == PLUS) \
259 else if (type == IFMATCH) \
260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
261 else rn += NEXT_OFF(rn); \
265 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
266 * These are for the pre-composed Hangul syllables, which are all in a
267 * contiguous block and arranged there in such a way so as to facilitate
268 * alorithmic determination of their characteristics. As such, they don't need
269 * a swash, but can be determined by simple arithmetic. Almost all are
270 * GCB=LVT, but every 28th one is a GCB=LV */
271 #define SBASE 0xAC00 /* Start of block */
272 #define SCount 11172 /* Length of block */
275 #define SLAB_FIRST(s) (&(s)->states[0])
276 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
278 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
279 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
280 static regmatch_state * S_push_slab(pTHX);
282 #define REGCP_PAREN_ELEMS 3
283 #define REGCP_OTHER_ELEMS 3
284 #define REGCP_FRAME_ELEMS 1
285 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
286 * are needed for the regexp context stack bookkeeping. */
289 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
291 const int retval = PL_savestack_ix;
292 const int paren_elems_to_push =
293 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
294 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
295 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
297 GET_RE_DEBUG_FLAGS_DECL;
299 PERL_ARGS_ASSERT_REGCPPUSH;
301 if (paren_elems_to_push < 0)
302 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
303 (int)paren_elems_to_push, (int)maxopenparen,
304 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
308 " out of range (%lu-%ld)",
310 (unsigned long)maxopenparen,
313 SSGROW(total_elems + REGCP_FRAME_ELEMS);
316 if ((int)maxopenparen > (int)parenfloor)
317 PerlIO_printf(Perl_debug_log,
318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
324 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
325 SSPUSHIV(rex->offs[p].end);
326 SSPUSHIV(rex->offs[p].start);
327 SSPUSHINT(rex->offs[p].start_tmp);
328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
331 (IV)rex->offs[p].start,
332 (IV)rex->offs[p].start_tmp,
336 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
337 SSPUSHINT(maxopenparen);
338 SSPUSHINT(rex->lastparen);
339 SSPUSHINT(rex->lastcloseparen);
340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
345 /* These are needed since we do not localize EVAL nodes: */
346 #define REGCP_SET(cp) \
348 PerlIO_printf(Perl_debug_log, \
349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
350 (IV)PL_savestack_ix)); \
353 #define REGCP_UNWIND(cp) \
355 if (cp != PL_savestack_ix) \
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
358 (IV)(cp), (IV)PL_savestack_ix)); \
361 #define UNWIND_PAREN(lp, lcp) \
362 for (n = rex->lastparen; n > lp; n--) \
363 rex->offs[n].end = -1; \
364 rex->lastparen = n; \
365 rex->lastcloseparen = lcp;
369 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
373 GET_RE_DEBUG_FLAGS_DECL;
375 PERL_ARGS_ASSERT_REGCPPOP;
377 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
379 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
380 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
381 rex->lastcloseparen = SSPOPINT;
382 rex->lastparen = SSPOPINT;
383 *maxopenparen_p = SSPOPINT;
385 i -= REGCP_OTHER_ELEMS;
386 /* Now restore the parentheses context. */
388 if (i || rex->lastparen + 1 <= rex->nparens)
389 PerlIO_printf(Perl_debug_log,
390 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
395 paren = *maxopenparen_p;
396 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
398 rex->offs[paren].start_tmp = SSPOPINT;
399 rex->offs[paren].start = SSPOPIV;
401 if (paren <= rex->lastparen)
402 rex->offs[paren].end = tmps;
403 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
404 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
406 (IV)rex->offs[paren].start,
407 (IV)rex->offs[paren].start_tmp,
408 (IV)rex->offs[paren].end,
409 (paren > rex->lastparen ? "(skipped)" : ""));
414 /* It would seem that the similar code in regtry()
415 * already takes care of this, and in fact it is in
416 * a better location to since this code can #if 0-ed out
417 * but the code in regtry() is needed or otherwise tests
418 * requiring null fields (pat.t#187 and split.t#{13,14}
419 * (as of patchlevel 7877) will fail. Then again,
420 * this code seems to be necessary or otherwise
421 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
422 * --jhi updated by dapm */
423 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
424 if (i > *maxopenparen_p)
425 rex->offs[i].start = -1;
426 rex->offs[i].end = -1;
427 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
428 " \\%"UVuf": %s ..-1 undeffing\n",
430 (i > *maxopenparen_p) ? "-1" : " "
436 /* restore the parens and associated vars at savestack position ix,
437 * but without popping the stack */
440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
442 I32 tmpix = PL_savestack_ix;
443 PL_savestack_ix = ix;
444 regcppop(rex, maxopenparen_p);
445 PL_savestack_ix = tmpix;
448 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
451 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
453 /* Returns a boolean as to whether or not 'character' is a member of the
454 * Posix character class given by 'classnum' that should be equivalent to a
455 * value in the typedef '_char_class_number'.
457 * Ideally this could be replaced by a just an array of function pointers
458 * to the C library functions that implement the macros this calls.
459 * However, to compile, the precise function signatures are required, and
460 * these may vary from platform to to platform. To avoid having to figure
461 * out what those all are on each platform, I (khw) am using this method,
462 * which adds an extra layer of function call overhead (unless the C
463 * optimizer strips it away). But we don't particularly care about
464 * performance with locales anyway. */
466 switch ((_char_class_number) classnum) {
467 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
468 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
469 case _CC_ENUM_ASCII: return isASCII_LC(character);
470 case _CC_ENUM_BLANK: return isBLANK_LC(character);
471 case _CC_ENUM_CASED: return isLOWER_LC(character)
472 || isUPPER_LC(character);
473 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
474 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
475 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
476 case _CC_ENUM_LOWER: return isLOWER_LC(character);
477 case _CC_ENUM_PRINT: return isPRINT_LC(character);
478 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
479 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
480 case _CC_ENUM_SPACE: return isSPACE_LC(character);
481 case _CC_ENUM_UPPER: return isUPPER_LC(character);
482 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
483 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
484 default: /* VERTSPACE should never occur in locales */
485 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
488 assert(0); /* NOTREACHED */
493 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
495 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
496 * 'character' is a member of the Posix character class given by 'classnum'
497 * that should be equivalent to a value in the typedef
498 * '_char_class_number'.
500 * This just calls isFOO_lc on the code point for the character if it is in
501 * the range 0-255. Outside that range, all characters avoid Unicode
502 * rules, ignoring any locale. So use the Unicode function if this class
503 * requires a swash, and use the Unicode macro otherwise. */
505 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
507 if (UTF8_IS_INVARIANT(*character)) {
508 return isFOO_lc(classnum, *character);
510 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
511 return isFOO_lc(classnum,
512 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
515 if (classnum < _FIRST_NON_SWASH_CC) {
517 /* Initialize the swash unless done already */
518 if (! PL_utf8_swash_ptrs[classnum]) {
519 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
520 PL_utf8_swash_ptrs[classnum] =
521 _core_swash_init("utf8",
524 PL_XPosix_ptrs[classnum], &flags);
527 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
529 TRUE /* is UTF */ ));
532 switch ((_char_class_number) classnum) {
534 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
536 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
537 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
538 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
539 default: return 0; /* Things like CNTRL are always
543 assert(0); /* NOTREACHED */
548 * pregexec and friends
551 #ifndef PERL_IN_XSUB_RE
553 - pregexec - match a regexp against a string
556 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
557 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
558 /* stringarg: the point in the string at which to begin matching */
559 /* strend: pointer to null at end of string */
560 /* strbeg: real beginning of string */
561 /* minend: end of match must be >= minend bytes after stringarg. */
562 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
563 * itself is accessed via the pointers above */
564 /* nosave: For optimizations. */
566 PERL_ARGS_ASSERT_PREGEXEC;
569 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
570 nosave ? 0 : REXEC_COPY_STR);
576 /* re_intuit_start():
578 * Based on some optimiser hints, try to find the earliest position in the
579 * string where the regex could match.
581 * rx: the regex to match against
582 * sv: the SV being matched: only used for utf8 flag; the string
583 * itself is accessed via the pointers below. Note that on
584 * something like an overloaded SV, SvPOK(sv) may be false
585 * and the string pointers may point to something unrelated to
587 * strbeg: real beginning of string
588 * strpos: the point in the string at which to begin matching
589 * strend: pointer to the byte following the last char of the string
590 * flags currently unused; set to 0
591 * data: currently unused; set to NULL
593 * The basic idea of re_intuit_start() is to use some known information
594 * about the pattern, namely:
596 * a) the longest known anchored substring (i.e. one that's at a
597 * constant offset from the beginning of the pattern; but not
598 * necessarily at a fixed offset from the beginning of the
600 * b) the longest floating substring (i.e. one that's not at a constant
601 * offset from the beginning of the pattern);
602 * c) Whether the pattern is anchored to the string; either
603 * an absolute anchor: /^../, or anchored to \n: /^.../m,
604 * or anchored to pos(): /\G/;
605 * d) A start class: a real or synthetic character class which
606 * represents which characters are legal at the start of the pattern;
608 * to either quickly reject the match, or to find the earliest position
609 * within the string at which the pattern might match, thus avoiding
610 * running the full NFA engine at those earlier locations, only to
611 * eventually fail and retry further along.
613 * Returns NULL if the pattern can't match, or returns the address within
614 * the string which is the earliest place the match could occur.
616 * The longest of the anchored and floating substrings is called 'check'
617 * and is checked first. The other is called 'other' and is checked
618 * second. The 'other' substring may not be present. For example,
620 * /(abc|xyz)ABC\d{0,3}DEFG/
624 * check substr (float) = "DEFG", offset 6..9 chars
625 * other substr (anchored) = "ABC", offset 3..3 chars
628 * Be aware that during the course of this function, sometimes 'anchored'
629 * refers to a substring being anchored relative to the start of the
630 * pattern, and sometimes to the pattern itself being anchored relative to
631 * the string. For example:
633 * /\dabc/: "abc" is anchored to the pattern;
634 * /^\dabc/: "abc" is anchored to the pattern and the string;
635 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
636 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
637 * but the pattern is anchored to the string.
641 Perl_re_intuit_start(pTHX_
644 const char * const strbeg,
648 re_scream_pos_data *data)
650 struct regexp *const prog = ReANY(rx);
651 SSize_t start_shift = prog->check_offset_min;
652 /* Should be nonnegative! */
653 SSize_t end_shift = 0;
654 /* current lowest pos in string where the regex can start matching */
655 char *rx_origin = strpos;
657 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
658 U8 other_ix = 1 - prog->substrs->check_ix;
660 char *other_last = strpos;/* latest pos 'other' substr already checked to */
661 char *check_at = NULL; /* check substr found at this pos */
662 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
663 RXi_GET_DECL(prog,progi);
664 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
665 regmatch_info *const reginfo = ®info_buf;
666 GET_RE_DEBUG_FLAGS_DECL;
668 PERL_ARGS_ASSERT_RE_INTUIT_START;
669 PERL_UNUSED_ARG(flags);
670 PERL_UNUSED_ARG(data);
672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
673 "Intuit: trying to determine minimum start position...\n"));
675 /* for now, assume that all substr offsets are positive. If at some point
676 * in the future someone wants to do clever things with look-behind and
677 * -ve offsets, they'll need to fix up any code in this function
678 * which uses these offsets. See the thread beginning
679 * <20140113145929.GF27210@iabyn.com>
681 assert(prog->substrs->data[0].min_offset >= 0);
682 assert(prog->substrs->data[0].max_offset >= 0);
683 assert(prog->substrs->data[1].min_offset >= 0);
684 assert(prog->substrs->data[1].max_offset >= 0);
685 assert(prog->substrs->data[2].min_offset >= 0);
686 assert(prog->substrs->data[2].max_offset >= 0);
688 /* for now, assume that if both present, that the floating substring
689 * doesn't start before the anchored substring.
690 * If you break this assumption (e.g. doing better optimisations
691 * with lookahead/behind), then you'll need to audit the code in this
692 * function carefully first
695 ! ( (prog->anchored_utf8 || prog->anchored_substr)
696 && (prog->float_utf8 || prog->float_substr))
697 || (prog->float_min_offset >= prog->anchored_offset));
699 /* byte rather than char calculation for efficiency. It fails
700 * to quickly reject some cases that can't match, but will reject
701 * them later after doing full char arithmetic */
702 if (prog->minlen > strend - strpos) {
703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
704 " String too short...\n"));
708 reginfo->is_utf8_target = cBOOL(utf8_target);
709 reginfo->info_aux = NULL;
710 reginfo->strbeg = strbeg;
711 reginfo->strend = strend;
712 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
714 /* not actually used within intuit, but zero for safety anyway */
715 reginfo->poscache_maxiter = 0;
718 if (!prog->check_utf8 && prog->check_substr)
719 to_utf8_substr(prog);
720 check = prog->check_utf8;
722 if (!prog->check_substr && prog->check_utf8) {
723 if (! to_byte_substr(prog)) {
724 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
727 check = prog->check_substr;
730 /* dump the various substring data */
731 DEBUG_OPTIMISE_MORE_r({
733 for (i=0; i<=2; i++) {
734 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
735 : prog->substrs->data[i].substr);
739 PerlIO_printf(Perl_debug_log,
740 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
741 " useful=%"IVdf" utf8=%d [%s]\n",
743 (IV)prog->substrs->data[i].min_offset,
744 (IV)prog->substrs->data[i].max_offset,
745 (IV)prog->substrs->data[i].end_shift,
752 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
754 /* ml_anch: check after \n?
756 * A note about IMPLICIT: on an un-anchored pattern beginning
757 * with /.*.../, these flags will have been added by the
759 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
760 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
762 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
763 && !(prog->intflags & PREGf_IMPLICIT);
765 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
766 /* we are only allowed to match at BOS or \G */
768 /* trivially reject if there's a BOS anchor and we're not at BOS.
770 * Note that we don't try to do a similar quick reject for
771 * \G, since generally the caller will have calculated strpos
772 * based on pos() and gofs, so the string is already correctly
773 * anchored by definition; and handling the exceptions would
774 * be too fiddly (e.g. REXEC_IGNOREPOS).
776 if ( strpos != strbeg
777 && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
780 " Not at start...\n"));
784 /* in the presence of an anchor, the anchored (relative to the
785 * start of the regex) substr must also be anchored relative
786 * to strpos. So quickly reject if substr isn't found there.
787 * This works for \G too, because the caller will already have
788 * subtracted gofs from pos, and gofs is the offset from the
789 * \G to the start of the regex. For example, in /.abc\Gdef/,
790 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
791 * caller will have set strpos=pos()-4; we look for the substr
792 * at position pos()-4+1, which lines up with the "a" */
794 if (prog->check_offset_min == prog->check_offset_max
795 && !(prog->intflags & PREGf_CANY_SEEN))
797 /* Substring at constant offset from beg-of-str... */
798 SSize_t slen = SvCUR(check);
799 char *s = HOP3c(strpos, prog->check_offset_min, strend);
801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
802 " Looking for check substr at fixed offset %"IVdf"...\n",
803 (IV)prog->check_offset_min));
806 /* In this case, the regex is anchored at the end too.
807 * Unless it's a multiline match, the lengths must match
808 * exactly, give or take a \n. NB: slen >= 1 since
809 * the last char of check is \n */
811 && ( strend - s > slen
812 || strend - s < slen - 1
813 || (strend - s == slen && strend[-1] != '\n')))
815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
816 " String too long...\n"));
819 /* Now should match s[0..slen-2] */
822 if (slen && (*SvPVX_const(check) != *s
823 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
825 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
826 " String not equal...\n"));
831 goto success_at_start;
836 end_shift = prog->check_end_shift;
838 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
840 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
841 (IV)end_shift, RX_PRECOMP(prog));
846 /* This is the (re)entry point of the main loop in this function.
847 * The goal of this loop is to:
848 * 1) find the "check" substring in the region rx_origin..strend
849 * (adjusted by start_shift / end_shift). If not found, reject
851 * 2) If it exists, look for the "other" substr too if defined; for
852 * example, if the check substr maps to the anchored substr, then
853 * check the floating substr, and vice-versa. If not found, go
854 * back to (1) with rx_origin suitably incremented.
855 * 3) If we find an rx_origin position that doesn't contradict
856 * either of the substrings, then check the possible additional
857 * constraints on rx_origin of /^.../m or a known start class.
858 * If these fail, then depending on which constraints fail, jump
859 * back to here, or to various other re-entry points further along
860 * that skip some of the first steps.
861 * 4) If we pass all those tests, update the BmUSEFUL() count on the
862 * substring. If the start position was determined to be at the
863 * beginning of the string - so, not rejected, but not optimised,
864 * since we have to run regmatch from position 0 - decrement the
865 * BmUSEFUL() count. Otherwise increment it.
869 /* first, look for the 'check' substring */
875 DEBUG_OPTIMISE_MORE_r({
876 PerlIO_printf(Perl_debug_log,
877 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
878 " Start shift: %"IVdf" End shift %"IVdf
879 " Real end Shift: %"IVdf"\n",
880 (IV)(rx_origin - strpos),
881 (IV)prog->check_offset_min,
884 (IV)prog->check_end_shift);
887 if (prog->intflags & PREGf_CANY_SEEN) {
888 start_point= (U8*)(rx_origin + start_shift);
889 end_point= (U8*)(strend - end_shift);
890 if (start_point > end_point)
893 end_point = HOP3(strend, -end_shift, strbeg);
894 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
900 /* If the regex is absolutely anchored to either the start of the
901 * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
902 * check_offset_max represents an upper bound on the string where
903 * the substr could start. For the ANCH_GPOS case, we assume that
904 * the caller of intuit will have already set strpos to
905 * pos()-gofs, so in this case strpos + offset_max will still be
906 * an upper bound on the substr.
909 && prog->intflags & PREGf_ANCH
910 && prog->check_offset_max != SSize_t_MAX)
912 SSize_t len = SvCUR(check) - !!SvTAIL(check);
913 const char * const anchor =
914 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
916 /* do a bytes rather than chars comparison. It's conservative;
917 * so it skips doing the HOP if the result can't possibly end
918 * up earlier than the old value of end_point.
920 if ((char*)end_point - anchor > prog->check_offset_max) {
921 end_point = HOP3lim((U8*)anchor,
922 prog->check_offset_max,
928 DEBUG_OPTIMISE_MORE_r({
929 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
930 (int)(end_point - start_point),
931 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
935 check_at = fbm_instr( start_point, end_point,
936 check, multiline ? FBMrf_MULTILINE : 0);
938 /* Update the count-of-usability, remove useless subpatterns,
942 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
943 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
944 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
945 (check_at ? "Found" : "Did not find"),
946 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
947 ? "anchored" : "floating"),
950 (check_at ? " at offset " : "...\n") );
955 /* Finish the diagnostic message */
956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
958 /* set rx_origin to the minimum position where the regex could start
959 * matching, given the constraint of the just-matched check substring.
960 * But don't set it lower than previously.
963 if (check_at - rx_origin > prog->check_offset_max)
964 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
968 /* now look for the 'other' substring if defined */
970 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
971 : prog->substrs->data[other_ix].substr)
973 /* Take into account the "other" substring. */
977 struct reg_substr_datum *other;
980 other = &prog->substrs->data[other_ix];
982 /* if "other" is anchored:
983 * we've previously found a floating substr starting at check_at.
984 * This means that the regex origin must lie somewhere
985 * between min (rx_origin): HOP3(check_at, -check_offset_max)
986 * and max: HOP3(check_at, -check_offset_min)
987 * (except that min will be >= strpos)
988 * So the fixed substr must lie somewhere between
989 * HOP3(min, anchored_offset)
990 * HOP3(max, anchored_offset) + SvCUR(substr)
993 /* if "other" is floating
994 * Calculate last1, the absolute latest point where the
995 * floating substr could start in the string, ignoring any
996 * constraints from the earlier fixed match. It is calculated
999 * strend - prog->minlen (in chars) is the absolute latest
1000 * position within the string where the origin of the regex
1001 * could appear. The latest start point for the floating
1002 * substr is float_min_offset(*) on from the start of the
1003 * regex. last1 simply combines thee two offsets.
1005 * (*) You might think the latest start point should be
1006 * float_max_offset from the regex origin, and technically
1007 * you'd be correct. However, consider
1009 * Here, float min, max are 3,5 and minlen is 7.
1010 * This can match either
1014 * In the first case, the regex matches minlen chars; in the
1015 * second, minlen+1, in the third, minlen+2.
1016 * In the first case, the floating offset is 3 (which equals
1017 * float_min), in the second, 4, and in the third, 5 (which
1018 * equals float_max). In all cases, the floating string bcd
1019 * can never start more than 4 chars from the end of the
1020 * string, which equals minlen - float_min. As the substring
1021 * starts to match more than float_min from the start of the
1022 * regex, it makes the regex match more than minlen chars,
1023 * and the two cancel each other out. So we can always use
1024 * float_min - minlen, rather than float_max - minlen for the
1025 * latest position in the string.
1027 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1028 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1031 assert(prog->minlen >= other->min_offset);
1032 last1 = HOP3c(strend,
1033 other->min_offset - prog->minlen, strbeg);
1035 if (other_ix) {/* i.e. if (other-is-float) */
1036 /* last is the latest point where the floating substr could
1037 * start, *given* any constraints from the earlier fixed
1038 * match. This constraint is that the floating string starts
1039 * <= float_max_offset chars from the regex origin (rx_origin).
1040 * If this value is less than last1, use it instead.
1042 assert(rx_origin <= last1);
1044 /* this condition handles the offset==infinity case, and
1045 * is a short-cut otherwise. Although it's comparing a
1046 * byte offset to a char length, it does so in a safe way,
1047 * since 1 char always occupies 1 or more bytes,
1048 * so if a string range is (last1 - rx_origin) bytes,
1049 * it will be less than or equal to (last1 - rx_origin)
1050 * chars; meaning it errs towards doing the accurate HOP3
1051 * rather than just using last1 as a short-cut */
1052 (last1 - rx_origin) < other->max_offset
1054 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1057 assert(strpos + start_shift <= check_at);
1058 last = HOP4c(check_at, other->min_offset - start_shift,
1062 s = HOP3c(rx_origin, other->min_offset, strend);
1063 if (s < other_last) /* These positions already checked */
1066 must = utf8_target ? other->utf8_substr : other->substr;
1067 assert(SvPOK(must));
1070 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1072 multiline ? FBMrf_MULTILINE : 0
1075 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1076 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1077 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1078 s ? "Found" : "Contradicts",
1079 other_ix ? "floating" : "anchored",
1080 quoted, RE_SV_TAIL(must));
1085 /* last1 is latest possible substr location. If we didn't
1086 * find it before there, we never will */
1087 if (last >= last1) {
1088 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1089 ", giving up...\n"));
1093 /* try to find the check substr again at a later
1094 * position. Maybe next time we'll find the "other" substr
1096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1097 ", trying %s at offset %ld...\n",
1098 (other_ix ? "floating" : "anchored"),
1099 (long)(HOP3c(check_at, 1, strend) - strpos)));
1101 other_last = HOP3c(last, 1, strend) /* highest failure */;
1103 other_ix /* i.e. if other-is-float */
1104 ? HOP3c(rx_origin, 1, strend)
1105 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1110 (long)(s - strpos)));
1112 if (other_ix) { /* if (other-is-float) */
1113 /* other_last is set to s, not s+1, since its possible for
1114 * a floating substr to fail first time, then succeed
1115 * second time at the same floating position; e.g.:
1116 * "-AB--AABZ" =~ /\wAB\d*Z/
1117 * The first time round, anchored and float match at
1118 * "-(AB)--AAB(Z)" then fail on the initial \w character
1119 * class. Second time round, they match at "-AB--A(AB)(Z)".
1124 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1125 other_last = HOP3c(s, 1, strend);
1130 DEBUG_OPTIMISE_MORE_r(
1131 PerlIO_printf(Perl_debug_log,
1132 " Check-only match: offset min:%"IVdf" max:%"IVdf
1133 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1134 " strend-strpos:%"IVdf"\n",
1135 (IV)prog->check_offset_min,
1136 (IV)prog->check_offset_max,
1137 (IV)(check_at-strpos),
1138 (IV)(rx_origin-strpos),
1139 (IV)(rx_origin-check_at),
1145 postprocess_substr_matches:
1147 /* handle the extra constraint of /^.../m if present */
1149 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1152 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1153 " looking for /^/m anchor"));
1155 /* we have failed the constraint of a \n before rx_origin.
1156 * Find the next \n, if any, even if it's beyond the current
1157 * anchored and/or floating substrings. Whether we should be
1158 * scanning ahead for the next \n or the next substr is debatable.
1159 * On the one hand you'd expect rare substrings to appear less
1160 * often than \n's. On the other hand, searching for \n means
1161 * we're effectively flipping been check_substr and "\n" on each
1162 * iteration as the current "rarest" string candidate, which
1163 * means for example that we'll quickly reject the whole string if
1164 * hasn't got a \n, rather than trying every substr position
1168 s = HOP3c(strend, - prog->minlen, strpos);
1169 if (s <= rx_origin ||
1170 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1172 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1173 " Did not find /%s^%s/m...\n",
1174 PL_colors[0], PL_colors[1]));
1178 /* earliest possible origin is 1 char after the \n.
1179 * (since *rx_origin == '\n', it's safe to ++ here rather than
1180 * HOP(rx_origin, 1)) */
1183 if (prog->substrs->check_ix == 0 /* check is anchored */
1184 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1186 /* Position contradicts check-string; either because
1187 * check was anchored (and thus has no wiggle room),
1188 * or check was float and rx_origin is above the float range */
1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1191 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1195 /* if we get here, the check substr must have been float,
1196 * is in range, and we may or may not have had an anchored
1197 * "other" substr which still contradicts */
1198 assert(prog->substrs->check_ix); /* check is float */
1200 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1201 /* whoops, the anchored "other" substr exists, so we still
1202 * contradict. On the other hand, the float "check" substr
1203 * didn't contradict, so just retry the anchored "other"
1205 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1206 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1207 PL_colors[0], PL_colors[1],
1208 (long)(rx_origin - strpos),
1209 (long)(rx_origin - strpos + prog->anchored_offset)));
1210 goto do_other_substr;
1213 /* success: we don't contradict the found floating substring
1214 * (and there's no anchored substr). */
1215 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1216 " Found /%s^%s/m at offset %ld...\n",
1217 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1221 " (multiline anchor test skipped)\n"));
1227 /* if we have a starting character class, then test that extra constraint.
1228 * (trie stclasses are too expensive to use here, we are better off to
1229 * leave it to regmatch itself) */
1231 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1232 const U8* const str = (U8*)STRING(progi->regstclass);
1234 /* XXX this value could be pre-computed */
1235 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1236 ? (reginfo->is_utf8_pat
1237 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1238 : STR_LEN(progi->regstclass))
1242 /* latest pos that a matching float substr constrains rx start to */
1243 char *rx_max_float = NULL;
1245 /* if the current rx_origin is anchored, either by satisfying an
1246 * anchored substring constraint, or a /^.../m constraint, then we
1247 * can reject the current origin if the start class isn't found
1248 * at the current position. If we have a float-only match, then
1249 * rx_origin is constrained to a range; so look for the start class
1250 * in that range. if neither, then look for the start class in the
1251 * whole rest of the string */
1253 /* XXX DAPM it's not clear what the minlen test is for, and why
1254 * it's not used in the floating case. Nothing in the test suite
1255 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1256 * Here are some old comments, which may or may not be correct:
1258 * minlen == 0 is possible if regstclass is \b or \B,
1259 * and the fixed substr is ''$.
1260 * Since minlen is already taken into account, rx_origin+1 is
1261 * before strend; accidentally, minlen >= 1 guaranties no false
1262 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1263 * 0) below assumes that regstclass does not come from lookahead...
1264 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1265 * This leaves EXACTF-ish only, which are dealt with in
1269 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1270 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1271 else if (prog->float_substr || prog->float_utf8) {
1272 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1273 endpos= HOP3c(rx_max_float, cl_l, strend);
1278 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1279 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1280 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1281 (IV)start_shift, (IV)(check_at - strbeg),
1282 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1284 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1287 if (endpos == strend) {
1288 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1289 " Could not match STCLASS...\n") );
1292 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1293 " This position contradicts STCLASS...\n") );
1294 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1295 && !(prog->intflags & PREGf_IMPLICIT))
1298 /* Contradict one of substrings */
1299 if (prog->anchored_substr || prog->anchored_utf8) {
1300 if (prog->substrs->check_ix == 1) { /* check is float */
1301 /* Have both, check_string is floating */
1302 assert(rx_origin + start_shift <= check_at);
1303 if (rx_origin + start_shift != check_at) {
1304 /* not at latest position float substr could match:
1305 * Recheck anchored substring, but not floating.
1306 * The condition above is in bytes rather than
1307 * chars for efficiency. It's conservative, in
1308 * that it errs on the side of doing 'goto
1309 * do_other_substr', where a more accurate
1310 * char-based calculation will be done */
1311 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1312 " Looking for anchored substr starting at offset %ld...\n",
1313 (long)(other_last - strpos)) );
1314 goto do_other_substr;
1322 /* In the presence of ml_anch, we might be able to
1323 * find another \n without breaking the current float
1326 /* strictly speaking this should be HOP3c(..., 1, ...),
1327 * but since we goto a block of code that's going to
1328 * search for the next \n if any, its safe here */
1330 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1331 " Looking for /%s^%s/m starting at offset %ld...\n",
1332 PL_colors[0], PL_colors[1],
1333 (long)(rx_origin - strpos)) );
1334 goto postprocess_substr_matches;
1337 /* strictly speaking this can never be true; but might
1338 * be if we ever allow intuit without substrings */
1339 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1342 rx_origin = rx_max_float;
1345 /* at this point, any matching substrings have been
1346 * contradicted. Start again... */
1348 rx_origin = HOP3c(rx_origin, 1, strend);
1350 /* uses bytes rather than char calculations for efficiency.
1351 * It's conservative: it errs on the side of doing 'goto restart',
1352 * where there is code that does a proper char-based test */
1353 if (rx_origin + start_shift + end_shift > strend) {
1354 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1355 " Could not match STCLASS...\n") );
1358 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1359 " Looking for %s substr starting at offset %ld...\n",
1360 (prog->substrs->check_ix ? "floating" : "anchored"),
1361 (long)(rx_origin + start_shift - strpos)) );
1367 if (rx_origin != s) {
1368 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1369 " By STCLASS: moving %ld --> %ld\n",
1370 (long)(rx_origin - strpos), (long)(s - strpos))
1374 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1375 " Does not contradict STCLASS...\n");
1380 /* Decide whether using the substrings helped */
1382 if (rx_origin != strpos) {
1383 /* Fixed substring is found far enough so that the match
1384 cannot start at strpos. */
1386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1387 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1390 /* The found rx_origin position does not prohibit matching at
1391 * strpos, so calling intuit didn't gain us anything. Decrement
1392 * the BmUSEFUL() count on the check substring, and if we reach
1394 if (!(prog->intflags & PREGf_NAUGHTY)
1396 prog->check_utf8 /* Could be deleted already */
1397 && --BmUSEFUL(prog->check_utf8) < 0
1398 && (prog->check_utf8 == prog->float_utf8)
1400 prog->check_substr /* Could be deleted already */
1401 && --BmUSEFUL(prog->check_substr) < 0
1402 && (prog->check_substr == prog->float_substr)
1405 /* If flags & SOMETHING - do not do it many times on the same match */
1406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1407 /* XXX Does the destruction order has to change with utf8_target? */
1408 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1409 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1410 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1411 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1412 check = NULL; /* abort */
1413 /* XXXX This is a remnant of the old implementation. It
1414 looks wasteful, since now INTUIT can use many
1415 other heuristics. */
1416 prog->extflags &= ~RXf_USE_INTUIT;
1420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1421 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1422 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1426 fail_finish: /* Substring not found */
1427 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1428 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1431 PL_colors[4], PL_colors[5]));
1436 #define DECL_TRIE_TYPE(scan) \
1437 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1438 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1439 trie_type = ((scan->flags == EXACT) \
1440 ? (utf8_target ? trie_utf8 : trie_plain) \
1441 : (scan->flags == EXACTFA) \
1442 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1443 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1445 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1448 U8 flags = FOLD_FLAGS_FULL; \
1449 switch (trie_type) { \
1450 case trie_utf8_exactfa_fold: \
1451 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1453 case trie_utf8_fold: \
1454 if ( foldlen>0 ) { \
1455 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1460 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1461 len = UTF8SKIP(uc); \
1462 skiplen = UNISKIP( uvc ); \
1463 foldlen -= skiplen; \
1464 uscan = foldbuf + skiplen; \
1467 case trie_latin_utf8_exactfa_fold: \
1468 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1470 case trie_latin_utf8_fold: \
1471 if ( foldlen>0 ) { \
1472 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1478 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1479 skiplen = UNISKIP( uvc ); \
1480 foldlen -= skiplen; \
1481 uscan = foldbuf + skiplen; \
1485 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1492 charid = trie->charmap[ uvc ]; \
1496 if (widecharmap) { \
1497 SV** const svpp = hv_fetch(widecharmap, \
1498 (char*)&uvc, sizeof(UV), 0); \
1500 charid = (U16)SvIV(*svpp); \
1505 #define REXEC_FBC_EXACTISH_SCAN(COND) \
1509 && (ln == 1 || folder(s, pat_string, ln)) \
1510 && (reginfo->intuit || regtry(reginfo, &s)) )\
1516 #define REXEC_FBC_UTF8_SCAN(CODE) \
1518 while (s < strend) { \
1524 #define REXEC_FBC_SCAN(CODE) \
1526 while (s < strend) { \
1532 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1533 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1535 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1544 #define REXEC_FBC_CLASS_SCAN(COND) \
1545 REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1547 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1556 /* This is the macro to use when we want to see if something that looks like it
1557 * could match, actually does, and if so exits the loop */
1558 #define REXEC_FBC_TRYIT \
1559 if ((reginfo->intuit || regtry(reginfo, &s))) \
1562 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \
1563 if (utf8_target) { \
1564 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
1567 REXEC_FBC_CLASS_SCAN(COND); \
1570 #define DUMP_EXEC_POS(li,s,doutf8) \
1571 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1574 /* The three macros below are slightly different versions of the same logic.
1576 * The first is for /a and /aa when the target string is UTF-8. This can only
1577 * match ascii, but it must advance based on UTF-8. The other two handle the
1578 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1579 * for the boundary (or non-boundary) between a word and non-word character.
1580 * The utf8 and non-utf8 cases have the same logic, but the details must be
1581 * different. Find the "wordness" of the character just prior to this one, and
1582 * compare it with the wordness of this one. If they differ, we have a
1583 * boundary. At the beginning of the string, pretend that the previous
1584 * character was a new-line.
1586 * All these macros uncleanly have side-effects with each other and outside
1587 * variables. So far it's been too much trouble to clean-up
1589 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1590 * a word character or not.
1591 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1593 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1595 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1596 * are looking for a boundary or for a non-boundary. If we are looking for a
1597 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1598 * see if this tentative match actually works, and if so, to quit the loop
1599 * here. And vice-versa if we are looking for a non-boundary.
1601 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1602 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1603 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1604 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1605 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1606 * complement. But in that branch we complement tmp, meaning that at the
1607 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1608 * which means at the top of the loop in the next iteration, it is
1609 * TEST_NON_UTF8(s-1) */
1610 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1611 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1612 tmp = TEST_NON_UTF8(tmp); \
1613 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1614 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1616 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1623 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1624 * TEST_UTF8 is a macro that for the same input code points returns identically
1625 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1626 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1627 if (s == reginfo->strbeg) { \
1630 else { /* Back-up to the start of the previous character */ \
1631 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1632 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1633 0, UTF8_ALLOW_DEFAULT); \
1635 tmp = TEST_UV(tmp); \
1636 LOAD_UTF8_CHARCLASS_ALNUM(); \
1637 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1638 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1647 /* The only difference between the BOUND and NBOUND cases is that
1648 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1649 * NBOUND. This is accomplished by passing it as either the if or else clause,
1650 * with the other one being empty (PLACEHOLDER is defined as empty).
1652 * The TEST_FOO parameters are for operating on different forms of input, but
1653 * all should be ones that return identically for the same underlying code
1655 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1657 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1658 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1660 #define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1662 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1663 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1665 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1667 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1668 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1670 #define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1672 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1673 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1675 /* Like the above two macros. UTF8_CODE is the complete code for handling
1676 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1678 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1679 if (utf8_target) { \
1682 else { /* Not utf8 */ \
1683 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1684 tmp = TEST_NON_UTF8(tmp); \
1685 REXEC_FBC_SCAN( /* advances s while s < strend */ \
1686 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1695 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1699 /* We know what class REx starts with. Try to find this position... */
1700 /* if reginfo->intuit, its a dryrun */
1701 /* annoyingly all the vars in this routine have different names from their counterparts
1702 in regmatch. /grrr */
1704 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1705 const char *strend, regmatch_info *reginfo)
1708 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1709 char *pat_string; /* The pattern's exactish string */
1710 char *pat_end; /* ptr to end char of pat_string */
1711 re_fold_t folder; /* Function for computing non-utf8 folds */
1712 const U8 *fold_array; /* array for folding ords < 256 */
1718 I32 tmp = 1; /* Scratch variable? */
1719 const bool utf8_target = reginfo->is_utf8_target;
1720 UV utf8_fold_flags = 0;
1721 const bool is_utf8_pat = reginfo->is_utf8_pat;
1722 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1723 with a result inverts that result, as 0^1 =
1725 _char_class_number classnum;
1727 RXi_GET_DECL(prog,progi);
1729 PERL_ARGS_ASSERT_FIND_BYCLASS;
1731 /* We know what class it must start with. */
1735 REXEC_FBC_UTF8_CLASS_SCAN(
1736 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1739 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1744 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1751 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1752 assert(! is_utf8_pat);
1755 if (is_utf8_pat || utf8_target) {
1756 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1757 goto do_exactf_utf8;
1759 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1760 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1761 goto do_exactf_non_utf8; /* isn't dealt with by these */
1763 case EXACTF: /* This node only generated for non-utf8 patterns */
1764 assert(! is_utf8_pat);
1766 utf8_fold_flags = 0;
1767 goto do_exactf_utf8;
1769 fold_array = PL_fold;
1771 goto do_exactf_non_utf8;
1774 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1775 utf8_fold_flags = FOLDEQ_LOCALE;
1776 goto do_exactf_utf8;
1778 fold_array = PL_fold_locale;
1779 folder = foldEQ_locale;
1780 goto do_exactf_non_utf8;
1784 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1786 goto do_exactf_utf8;
1789 if (is_utf8_pat || utf8_target) {
1790 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1791 goto do_exactf_utf8;
1794 /* Any 'ss' in the pattern should have been replaced by regcomp,
1795 * so we don't have to worry here about this single special case
1796 * in the Latin1 range */
1797 fold_array = PL_fold_latin1;
1798 folder = foldEQ_latin1;
1802 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1803 are no glitches with fold-length differences
1804 between the target string and pattern */
1806 /* The idea in the non-utf8 EXACTF* cases is to first find the
1807 * first character of the EXACTF* node and then, if necessary,
1808 * case-insensitively compare the full text of the node. c1 is the
1809 * first character. c2 is its fold. This logic will not work for
1810 * Unicode semantics and the german sharp ss, which hence should
1811 * not be compiled into a node that gets here. */
1812 pat_string = STRING(c);
1813 ln = STR_LEN(c); /* length to match in octets/bytes */
1815 /* We know that we have to match at least 'ln' bytes (which is the
1816 * same as characters, since not utf8). If we have to match 3
1817 * characters, and there are only 2 availabe, we know without
1818 * trying that it will fail; so don't start a match past the
1819 * required minimum number from the far end */
1820 e = HOP3c(strend, -((SSize_t)ln), s);
1822 if (reginfo->intuit && e < s) {
1823 e = s; /* Due to minlen logic of intuit() */
1827 c2 = fold_array[c1];
1828 if (c1 == c2) { /* If char and fold are the same */
1829 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1832 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1840 /* If one of the operands is in utf8, we can't use the simpler folding
1841 * above, due to the fact that many different characters can have the
1842 * same fold, or portion of a fold, or different- length fold */
1843 pat_string = STRING(c);
1844 ln = STR_LEN(c); /* length to match in octets/bytes */
1845 pat_end = pat_string + ln;
1846 lnc = is_utf8_pat /* length to match in characters */
1847 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1850 /* We have 'lnc' characters to match in the pattern, but because of
1851 * multi-character folding, each character in the target can match
1852 * up to 3 characters (Unicode guarantees it will never exceed
1853 * this) if it is utf8-encoded; and up to 2 if not (based on the
1854 * fact that the Latin 1 folds are already determined, and the
1855 * only multi-char fold in that range is the sharp-s folding to
1856 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1857 * string character. Adjust lnc accordingly, rounding up, so that
1858 * if we need to match at least 4+1/3 chars, that really is 5. */
1859 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1860 lnc = (lnc + expansion - 1) / expansion;
1862 /* As in the non-UTF8 case, if we have to match 3 characters, and
1863 * only 2 are left, it's guaranteed to fail, so don't start a
1864 * match that would require us to go beyond the end of the string
1866 e = HOP3c(strend, -((SSize_t)lnc), s);
1868 if (reginfo->intuit && e < s) {
1869 e = s; /* Due to minlen logic of intuit() */
1872 /* XXX Note that we could recalculate e to stop the loop earlier,
1873 * as the worst case expansion above will rarely be met, and as we
1874 * go along we would usually find that e moves further to the left.
1875 * This would happen only after we reached the point in the loop
1876 * where if there were no expansion we should fail. Unclear if
1877 * worth the expense */
1880 char *my_strend= (char *)strend;
1881 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1882 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1883 && (reginfo->intuit || regtry(reginfo, &s)) )
1887 s += (utf8_target) ? UTF8SKIP(s) : 1;
1893 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1896 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1899 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1902 FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1905 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1908 FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1911 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1914 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1917 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1918 is_LNBREAK_latin1_safe(s, strend)
1922 /* The argument to all the POSIX node types is the class number to pass to
1923 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1930 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1931 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1946 /* The complement of something that matches only ASCII matches all
1947 * non-ASCII, plus everything in ASCII that isn't in the class. */
1948 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
1949 || ! _generic_isCC_A(*s, FLAGS(c)));
1958 /* Don't need to worry about utf8, as it can match only a single
1959 * byte invariant character. */
1960 REXEC_FBC_CLASS_SCAN(
1961 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1969 if (! utf8_target) {
1970 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1976 classnum = (_char_class_number) FLAGS(c);
1977 if (classnum < _FIRST_NON_SWASH_CC) {
1978 while (s < strend) {
1980 /* We avoid loading in the swash as long as possible, but
1981 * should we have to, we jump to a separate loop. This
1982 * extra 'if' statement is what keeps this code from being
1983 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1984 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1985 goto found_above_latin1;
1987 if ((UTF8_IS_INVARIANT(*s)
1988 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1990 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1991 && to_complement ^ cBOOL(
1992 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1996 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2008 else switch (classnum) { /* These classes are implemented as
2010 case _CC_ENUM_SPACE: /* XXX would require separate code if we
2011 revert the change of \v matching this */
2014 case _CC_ENUM_PSXSPC:
2015 REXEC_FBC_UTF8_CLASS_SCAN(
2016 to_complement ^ cBOOL(isSPACE_utf8(s)));
2019 case _CC_ENUM_BLANK:
2020 REXEC_FBC_UTF8_CLASS_SCAN(
2021 to_complement ^ cBOOL(isBLANK_utf8(s)));
2024 case _CC_ENUM_XDIGIT:
2025 REXEC_FBC_UTF8_CLASS_SCAN(
2026 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2029 case _CC_ENUM_VERTSPACE:
2030 REXEC_FBC_UTF8_CLASS_SCAN(
2031 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2034 case _CC_ENUM_CNTRL:
2035 REXEC_FBC_UTF8_CLASS_SCAN(
2036 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2040 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2041 assert(0); /* NOTREACHED */
2046 found_above_latin1: /* Here we have to load a swash to get the result
2047 for the current code point */
2048 if (! PL_utf8_swash_ptrs[classnum]) {
2049 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2050 PL_utf8_swash_ptrs[classnum] =
2051 _core_swash_init("utf8",
2054 PL_XPosix_ptrs[classnum], &flags);
2057 /* This is a copy of the loop above for swash classes, though using the
2058 * FBC macro instead of being expanded out. Since we've loaded the
2059 * swash, we don't have to check for that each time through the loop */
2060 REXEC_FBC_UTF8_CLASS_SCAN(
2061 to_complement ^ cBOOL(_generic_utf8(
2064 swash_fetch(PL_utf8_swash_ptrs[classnum],
2072 /* what trie are we using right now */
2073 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2074 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2075 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2077 const char *last_start = strend - trie->minlen;
2079 const char *real_start = s;
2081 STRLEN maxlen = trie->maxlen;
2083 U8 **points; /* map of where we were in the input string
2084 when reading a given char. For ASCII this
2085 is unnecessary overhead as the relationship
2086 is always 1:1, but for Unicode, especially
2087 case folded Unicode this is not true. */
2088 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2092 GET_RE_DEBUG_FLAGS_DECL;
2094 /* We can't just allocate points here. We need to wrap it in
2095 * an SV so it gets freed properly if there is a croak while
2096 * running the match */
2099 sv_points=newSV(maxlen * sizeof(U8 *));
2100 SvCUR_set(sv_points,
2101 maxlen * sizeof(U8 *));
2102 SvPOK_on(sv_points);
2103 sv_2mortal(sv_points);
2104 points=(U8**)SvPV_nolen(sv_points );
2105 if ( trie_type != trie_utf8_fold
2106 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2109 bitmap=(U8*)trie->bitmap;
2111 bitmap=(U8*)ANYOF_BITMAP(c);
2113 /* this is the Aho-Corasick algorithm modified a touch
2114 to include special handling for long "unknown char" sequences.
2115 The basic idea being that we use AC as long as we are dealing
2116 with a possible matching char, when we encounter an unknown char
2117 (and we have not encountered an accepting state) we scan forward
2118 until we find a legal starting char.
2119 AC matching is basically that of trie matching, except that when
2120 we encounter a failing transition, we fall back to the current
2121 states "fail state", and try the current char again, a process
2122 we repeat until we reach the root state, state 1, or a legal
2123 transition. If we fail on the root state then we can either
2124 terminate if we have reached an accepting state previously, or
2125 restart the entire process from the beginning if we have not.
2128 while (s <= last_start) {
2129 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2137 U8 *uscan = (U8*)NULL;
2138 U8 *leftmost = NULL;
2140 U32 accepted_word= 0;
2144 while ( state && uc <= (U8*)strend ) {
2146 U32 word = aho->states[ state ].wordnum;
2150 DEBUG_TRIE_EXECUTE_r(
2151 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2152 dump_exec_pos( (char *)uc, c, strend, real_start,
2153 (char *)uc, utf8_target );
2154 PerlIO_printf( Perl_debug_log,
2155 " Scanning for legal start char...\n");
2159 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2163 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2169 if (uc >(U8*)last_start) break;
2173 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2174 if (!leftmost || lpos < leftmost) {
2175 DEBUG_r(accepted_word=word);
2181 points[pointpos++ % maxlen]= uc;
2182 if (foldlen || uc < (U8*)strend) {
2183 REXEC_TRIE_READ_CHAR(trie_type, trie,
2185 uscan, len, uvc, charid, foldlen,
2187 DEBUG_TRIE_EXECUTE_r({
2188 dump_exec_pos( (char *)uc, c, strend,
2189 real_start, s, utf8_target);
2190 PerlIO_printf(Perl_debug_log,
2191 " Charid:%3u CP:%4"UVxf" ",
2203 word = aho->states[ state ].wordnum;
2205 base = aho->states[ state ].trans.base;
2207 DEBUG_TRIE_EXECUTE_r({
2209 dump_exec_pos( (char *)uc, c, strend, real_start,
2211 PerlIO_printf( Perl_debug_log,
2212 "%sState: %4"UVxf", word=%"UVxf,
2213 failed ? " Fail transition to " : "",
2214 (UV)state, (UV)word);
2220 ( ((offset = base + charid
2221 - 1 - trie->uniquecharcount)) >= 0)
2222 && ((U32)offset < trie->lasttrans)
2223 && trie->trans[offset].check == state
2224 && (tmp=trie->trans[offset].next))
2226 DEBUG_TRIE_EXECUTE_r(
2227 PerlIO_printf( Perl_debug_log," - legal\n"));
2232 DEBUG_TRIE_EXECUTE_r(
2233 PerlIO_printf( Perl_debug_log," - fail\n"));
2235 state = aho->fail[state];
2239 /* we must be accepting here */
2240 DEBUG_TRIE_EXECUTE_r(
2241 PerlIO_printf( Perl_debug_log," - accepting\n"));
2250 if (!state) state = 1;
2253 if ( aho->states[ state ].wordnum ) {
2254 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2255 if (!leftmost || lpos < leftmost) {
2256 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2261 s = (char*)leftmost;
2262 DEBUG_TRIE_EXECUTE_r({
2264 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2265 (UV)accepted_word, (IV)(s - real_start)
2268 if (reginfo->intuit || regtry(reginfo, &s)) {
2274 DEBUG_TRIE_EXECUTE_r({
2275 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2278 DEBUG_TRIE_EXECUTE_r(
2279 PerlIO_printf( Perl_debug_log,"No match.\n"));
2288 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2295 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2296 * flags have same meanings as with regexec_flags() */
2299 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2306 struct regexp *const prog = ReANY(rx);
2308 if (flags & REXEC_COPY_STR) {
2312 PerlIO_printf(Perl_debug_log,
2313 "Copy on write: regexp capture, type %d\n",
2316 /* Create a new COW SV to share the match string and store
2317 * in saved_copy, unless the current COW SV in saved_copy
2318 * is valid and suitable for our purpose */
2319 if (( prog->saved_copy
2320 && SvIsCOW(prog->saved_copy)
2321 && SvPOKp(prog->saved_copy)
2324 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2326 /* just reuse saved_copy SV */
2327 if (RXp_MATCH_COPIED(prog)) {
2328 Safefree(prog->subbeg);
2329 RXp_MATCH_COPIED_off(prog);
2333 /* create new COW SV to share string */
2334 RX_MATCH_COPY_FREE(rx);
2335 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2337 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2338 assert (SvPOKp(prog->saved_copy));
2339 prog->sublen = strend - strbeg;
2340 prog->suboffset = 0;
2341 prog->subcoffset = 0;
2346 SSize_t max = strend - strbeg;
2349 if ( (flags & REXEC_COPY_SKIP_POST)
2350 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2351 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2352 ) { /* don't copy $' part of string */
2355 /* calculate the right-most part of the string covered
2356 * by a capture. Due to look-ahead, this may be to
2357 * the right of $&, so we have to scan all captures */
2358 while (n <= prog->lastparen) {
2359 if (prog->offs[n].end > max)
2360 max = prog->offs[n].end;
2364 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2365 ? prog->offs[0].start
2367 assert(max >= 0 && max <= strend - strbeg);
2370 if ( (flags & REXEC_COPY_SKIP_PRE)
2371 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2372 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2373 ) { /* don't copy $` part of string */
2376 /* calculate the left-most part of the string covered
2377 * by a capture. Due to look-behind, this may be to
2378 * the left of $&, so we have to scan all captures */
2379 while (min && n <= prog->lastparen) {
2380 if ( prog->offs[n].start != -1
2381 && prog->offs[n].start < min)
2383 min = prog->offs[n].start;
2387 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2388 && min > prog->offs[0].end
2390 min = prog->offs[0].end;
2394 assert(min >= 0 && min <= max && min <= strend - strbeg);
2397 if (RX_MATCH_COPIED(rx)) {
2398 if (sublen > prog->sublen)
2400 (char*)saferealloc(prog->subbeg, sublen+1);
2403 prog->subbeg = (char*)safemalloc(sublen+1);
2404 Copy(strbeg + min, prog->subbeg, sublen, char);
2405 prog->subbeg[sublen] = '\0';
2406 prog->suboffset = min;
2407 prog->sublen = sublen;
2408 RX_MATCH_COPIED_on(rx);
2410 prog->subcoffset = prog->suboffset;
2411 if (prog->suboffset && utf8_target) {
2412 /* Convert byte offset to chars.
2413 * XXX ideally should only compute this if @-/@+
2414 * has been seen, a la PL_sawampersand ??? */
2416 /* If there's a direct correspondence between the
2417 * string which we're matching and the original SV,
2418 * then we can use the utf8 len cache associated with
2419 * the SV. In particular, it means that under //g,
2420 * sv_pos_b2u() will use the previously cached
2421 * position to speed up working out the new length of
2422 * subcoffset, rather than counting from the start of
2423 * the string each time. This stops
2424 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2425 * from going quadratic */
2426 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2427 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2428 SV_GMAGIC|SV_CONST_RETURN);
2430 prog->subcoffset = utf8_length((U8*)strbeg,
2431 (U8*)(strbeg+prog->suboffset));
2435 RX_MATCH_COPY_FREE(rx);
2436 prog->subbeg = strbeg;
2437 prog->suboffset = 0;
2438 prog->subcoffset = 0;
2439 prog->sublen = strend - strbeg;
2447 - regexec_flags - match a regexp against a string
2450 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2451 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2452 /* stringarg: the point in the string at which to begin matching */
2453 /* strend: pointer to null at end of string */
2454 /* strbeg: real beginning of string */
2455 /* minend: end of match must be >= minend bytes after stringarg. */
2456 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2457 * itself is accessed via the pointers above */
2458 /* data: May be used for some additional optimizations.
2459 Currently unused. */
2460 /* flags: For optimizations. See REXEC_* in regexp.h */
2463 struct regexp *const prog = ReANY(rx);
2467 SSize_t minlen; /* must match at least this many chars */
2468 SSize_t dontbother = 0; /* how many characters not to try at end */
2469 const bool utf8_target = cBOOL(DO_UTF8(sv));
2471 RXi_GET_DECL(prog,progi);
2472 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2473 regmatch_info *const reginfo = ®info_buf;
2474 regexp_paren_pair *swap = NULL;
2476 GET_RE_DEBUG_FLAGS_DECL;
2478 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2479 PERL_UNUSED_ARG(data);
2481 /* Be paranoid... */
2482 if (prog == NULL || stringarg == NULL) {
2483 Perl_croak(aTHX_ "NULL regexp parameter");
2487 debug_start_match(rx, utf8_target, stringarg, strend,
2491 startpos = stringarg;
2493 if (prog->intflags & PREGf_GPOS_SEEN) {
2496 /* set reginfo->ganch, the position where \G can match */
2499 (flags & REXEC_IGNOREPOS)
2500 ? stringarg /* use start pos rather than pos() */
2501 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2502 /* Defined pos(): */
2503 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2504 : strbeg; /* pos() not defined; use start of string */
2506 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2507 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2509 /* in the presence of \G, we may need to start looking earlier in
2510 * the string than the suggested start point of stringarg:
2511 * if prog->gofs is set, then that's a known, fixed minimum
2514 * /ab|c\G/: gofs = 1
2515 * or if the minimum offset isn't known, then we have to go back
2516 * to the start of the string, e.g. /w+\G/
2519 if (prog->intflags & PREGf_ANCH_GPOS) {
2520 startpos = reginfo->ganch - prog->gofs;
2522 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2524 DEBUG_r(PerlIO_printf(Perl_debug_log,
2525 "fail: ganch-gofs before earliest possible start\n"));
2529 else if (prog->gofs) {
2530 if (startpos - prog->gofs < strbeg)
2533 startpos -= prog->gofs;
2535 else if (prog->intflags & PREGf_GPOS_FLOAT)
2539 minlen = prog->minlen;
2540 if ((startpos + minlen) > strend || startpos < strbeg) {
2541 DEBUG_r(PerlIO_printf(Perl_debug_log,
2542 "Regex match can't succeed, so not even tried\n"));
2546 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2547 * which will call destuctors to reset PL_regmatch_state, free higher
2548 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2549 * regmatch_info_aux_eval */
2551 oldsave = PL_savestack_ix;
2555 if ((prog->extflags & RXf_USE_INTUIT)
2556 && !(flags & REXEC_CHECKED))
2558 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2563 if (prog->extflags & RXf_CHECK_ALL) {
2564 /* we can match based purely on the result of INTUIT.
2565 * Set up captures etc just for $& and $-[0]
2566 * (an intuit-only match wont have $1,$2,..) */
2567 assert(!prog->nparens);
2569 /* s/// doesn't like it if $& is earlier than where we asked it to
2570 * start searching (which can happen on something like /.\G/) */
2571 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2574 /* this should only be possible under \G */
2575 assert(prog->intflags & PREGf_GPOS_SEEN);
2576 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2577 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2581 /* match via INTUIT shouldn't have any captures.
2582 * Let @-, @+, $^N know */
2583 prog->lastparen = prog->lastcloseparen = 0;
2584 RX_MATCH_UTF8_set(rx, utf8_target);
2585 prog->offs[0].start = s - strbeg;
2586 prog->offs[0].end = utf8_target
2587 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2588 : s - strbeg + prog->minlenret;
2589 if ( !(flags & REXEC_NOT_FIRST) )
2590 S_reg_set_capture_string(aTHX_ rx,
2592 sv, flags, utf8_target);
2598 multiline = prog->extflags & RXf_PMf_MULTILINE;
2600 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2602 "String too short [regexec_flags]...\n"));
2606 /* Check validity of program. */
2607 if (UCHARAT(progi->program) != REG_MAGIC) {
2608 Perl_croak(aTHX_ "corrupted regexp program");
2611 RX_MATCH_TAINTED_off(rx);
2613 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2614 reginfo->intuit = 0;
2615 reginfo->is_utf8_target = cBOOL(utf8_target);
2616 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2617 reginfo->warned = FALSE;
2618 reginfo->strbeg = strbeg;
2620 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2621 reginfo->strend = strend;
2622 /* see how far we have to get to not match where we matched before */
2623 reginfo->till = stringarg + minend;
2625 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2626 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2627 S_cleanup_regmatch_info_aux has executed (registered by
2628 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2629 magic belonging to this SV.
2630 Not newSVsv, either, as it does not COW.
2632 assert(!IS_PADGV(sv));
2633 reginfo->sv = newSV(0);
2634 SvSetSV_nosteal(reginfo->sv, sv);
2635 SAVEFREESV(reginfo->sv);
2638 /* reserve next 2 or 3 slots in PL_regmatch_state:
2639 * slot N+0: may currently be in use: skip it
2640 * slot N+1: use for regmatch_info_aux struct
2641 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2642 * slot N+3: ready for use by regmatch()
2646 regmatch_state *old_regmatch_state;
2647 regmatch_slab *old_regmatch_slab;
2648 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2650 /* on first ever match, allocate first slab */
2651 if (!PL_regmatch_slab) {
2652 Newx(PL_regmatch_slab, 1, regmatch_slab);
2653 PL_regmatch_slab->prev = NULL;
2654 PL_regmatch_slab->next = NULL;
2655 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2658 old_regmatch_state = PL_regmatch_state;
2659 old_regmatch_slab = PL_regmatch_slab;
2661 for (i=0; i <= max; i++) {
2663 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2665 reginfo->info_aux_eval =
2666 reginfo->info_aux->info_aux_eval =
2667 &(PL_regmatch_state->u.info_aux_eval);
2669 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2670 PL_regmatch_state = S_push_slab(aTHX);
2673 /* note initial PL_regmatch_state position; at end of match we'll
2674 * pop back to there and free any higher slabs */
2676 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2677 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2678 reginfo->info_aux->poscache = NULL;
2680 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2682 if ((prog->extflags & RXf_EVAL_SEEN))
2683 S_setup_eval_state(aTHX_ reginfo);
2685 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2688 /* If there is a "must appear" string, look for it. */
2690 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2691 /* We have to be careful. If the previous successful match
2692 was from this regex we don't want a subsequent partially
2693 successful match to clobber the old results.
2694 So when we detect this possibility we add a swap buffer
2695 to the re, and switch the buffer each match. If we fail,
2696 we switch it back; otherwise we leave it swapped.
2699 /* do we need a save destructor here for eval dies? */
2700 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2701 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2702 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2709 /* Simplest case: anchored match need be tried only once. */
2710 /* [unless only anchor is BOL and multiline is set] */
2711 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2712 if (s == startpos && regtry(reginfo, &s))
2714 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2719 dontbother = minlen - 1;
2720 end = HOP3c(strend, -dontbother, strbeg) - 1;
2721 /* for multiline we only have to try after newlines */
2722 if (prog->check_substr || prog->check_utf8) {
2723 /* because of the goto we can not easily reuse the macros for bifurcating the
2724 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2727 goto after_try_utf8;
2729 if (regtry(reginfo, &s)) {
2736 if (prog->extflags & RXf_USE_INTUIT) {
2737 s = re_intuit_start(rx, sv, strbeg,
2738 s + UTF8SKIP(s), strend, flags, NULL);
2747 } /* end search for check string in unicode */
2749 if (s == startpos) {
2750 goto after_try_latin;
2753 if (regtry(reginfo, &s)) {
2760 if (prog->extflags & RXf_USE_INTUIT) {
2761 s = re_intuit_start(rx, sv, strbeg,
2762 s + 1, strend, flags, NULL);
2771 } /* end search for check string in latin*/
2772 } /* end search for check string */
2773 else { /* search for newline */
2775 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2778 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2779 while (s <= end) { /* note it could be possible to match at the end of the string */
2780 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2781 if (regtry(reginfo, &s))
2785 } /* end search for newline */
2786 } /* end anchored/multiline check string search */
2788 } else if (prog->intflags & PREGf_ANCH_GPOS)
2790 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2791 assert(prog->intflags & PREGf_GPOS_SEEN);
2792 /* For anchored \G, the only position it can match from is
2793 * (ganch-gofs); we already set startpos to this above; if intuit
2794 * moved us on from there, we can't possibly succeed */
2795 assert(startpos == reginfo->ganch - prog->gofs);
2796 if (s == startpos && regtry(reginfo, &s))
2801 /* Messy cases: unanchored match. */
2802 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2803 /* we have /x+whatever/ */
2804 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2810 if (! prog->anchored_utf8) {
2811 to_utf8_substr(prog);
2813 ch = SvPVX_const(prog->anchored_utf8)[0];
2816 DEBUG_EXECUTE_r( did_match = 1 );
2817 if (regtry(reginfo, &s)) goto got_it;
2819 while (s < strend && *s == ch)
2826 if (! prog->anchored_substr) {
2827 if (! to_byte_substr(prog)) {
2828 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2831 ch = SvPVX_const(prog->anchored_substr)[0];
2834 DEBUG_EXECUTE_r( did_match = 1 );
2835 if (regtry(reginfo, &s)) goto got_it;
2837 while (s < strend && *s == ch)
2842 DEBUG_EXECUTE_r(if (!did_match)
2843 PerlIO_printf(Perl_debug_log,
2844 "Did not find anchored character...\n")
2847 else if (prog->anchored_substr != NULL
2848 || prog->anchored_utf8 != NULL
2849 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2850 && prog->float_max_offset < strend - s)) {
2855 char *last1; /* Last position checked before */
2859 if (prog->anchored_substr || prog->anchored_utf8) {
2861 if (! prog->anchored_utf8) {
2862 to_utf8_substr(prog);
2864 must = prog->anchored_utf8;
2867 if (! prog->anchored_substr) {
2868 if (! to_byte_substr(prog)) {
2869 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2872 must = prog->anchored_substr;
2874 back_max = back_min = prog->anchored_offset;
2877 if (! prog->float_utf8) {
2878 to_utf8_substr(prog);
2880 must = prog->float_utf8;
2883 if (! prog->float_substr) {
2884 if (! to_byte_substr(prog)) {
2885 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2888 must = prog->float_substr;
2890 back_max = prog->float_max_offset;
2891 back_min = prog->float_min_offset;
2897 last = HOP3c(strend, /* Cannot start after this */
2898 -(SSize_t)(CHR_SVLEN(must)
2899 - (SvTAIL(must) != 0) + back_min), strbeg);
2901 if (s > reginfo->strbeg)
2902 last1 = HOPc(s, -1);
2904 last1 = s - 1; /* bogus */
2906 /* XXXX check_substr already used to find "s", can optimize if
2907 check_substr==must. */
2909 strend = HOPc(strend, -dontbother);
2910 while ( (s <= last) &&
2911 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
2912 (unsigned char*)strend, must,
2913 multiline ? FBMrf_MULTILINE : 0)) ) {
2914 DEBUG_EXECUTE_r( did_match = 1 );
2915 if (HOPc(s, -back_max) > last1) {
2916 last1 = HOPc(s, -back_min);
2917 s = HOPc(s, -back_max);
2920 char * const t = (last1 >= reginfo->strbeg)
2921 ? HOPc(last1, 1) : last1 + 1;
2923 last1 = HOPc(s, -back_min);
2927 while (s <= last1) {
2928 if (regtry(reginfo, &s))
2931 s++; /* to break out of outer loop */
2938 while (s <= last1) {
2939 if (regtry(reginfo, &s))
2945 DEBUG_EXECUTE_r(if (!did_match) {
2946 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2947 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2948 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2949 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2950 ? "anchored" : "floating"),
2951 quoted, RE_SV_TAIL(must));
2955 else if ( (c = progi->regstclass) ) {
2957 const OPCODE op = OP(progi->regstclass);
2958 /* don't bother with what can't match */
2959 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2960 strend = HOPc(strend, -(minlen - 1));
2963 SV * const prop = sv_newmortal();
2964 regprop(prog, prop, c, reginfo);
2966 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2968 PerlIO_printf(Perl_debug_log,
2969 "Matching stclass %.*s against %s (%d bytes)\n",
2970 (int)SvCUR(prop), SvPVX_const(prop),
2971 quoted, (int)(strend - s));
2974 if (find_byclass(prog, c, s, strend, reginfo))
2976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2980 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2988 if (! prog->float_utf8) {
2989 to_utf8_substr(prog);
2991 float_real = prog->float_utf8;
2994 if (! prog->float_substr) {
2995 if (! to_byte_substr(prog)) {
2996 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2999 float_real = prog->float_substr;
3002 little = SvPV_const(float_real, len);
3003 if (SvTAIL(float_real)) {
3004 /* This means that float_real contains an artificial \n on
3005 * the end due to the presence of something like this:
3006 * /foo$/ where we can match both "foo" and "foo\n" at the
3007 * end of the string. So we have to compare the end of the
3008 * string first against the float_real without the \n and
3009 * then against the full float_real with the string. We
3010 * have to watch out for cases where the string might be
3011 * smaller than the float_real or the float_real without
3013 char *checkpos= strend - len;
3015 PerlIO_printf(Perl_debug_log,
3016 "%sChecking for float_real.%s\n",
3017 PL_colors[4], PL_colors[5]));
3018 if (checkpos + 1 < strbeg) {
3019 /* can't match, even if we remove the trailing \n
3020 * string is too short to match */
3022 PerlIO_printf(Perl_debug_log,
3023 "%sString shorter than required trailing substring, cannot match.%s\n",
3024 PL_colors[4], PL_colors[5]));
3026 } else if (memEQ(checkpos + 1, little, len - 1)) {
3027 /* can match, the end of the string matches without the
3029 last = checkpos + 1;
3030 } else if (checkpos < strbeg) {
3031 /* cant match, string is too short when the "\n" is
3034 PerlIO_printf(Perl_debug_log,
3035 "%sString does not contain required trailing substring, cannot match.%s\n",
3036 PL_colors[4], PL_colors[5]));
3038 } else if (!multiline) {
3039 /* non multiline match, so compare with the "\n" at the
3040 * end of the string */
3041 if (memEQ(checkpos, little, len)) {
3045 PerlIO_printf(Perl_debug_log,
3046 "%sString does not contain required trailing substring, cannot match.%s\n",
3047 PL_colors[4], PL_colors[5]));
3051 /* multiline match, so we have to search for a place
3052 * where the full string is located */
3058 last = rninstr(s, strend, little, little + len);
3060 last = strend; /* matching "$" */
3063 /* at one point this block contained a comment which was
3064 * probably incorrect, which said that this was a "should not
3065 * happen" case. Even if it was true when it was written I am
3066 * pretty sure it is not anymore, so I have removed the comment
3067 * and replaced it with this one. Yves */
3069 PerlIO_printf(Perl_debug_log,
3070 "String does not contain required substring, cannot match.\n"
3074 dontbother = strend - last + prog->float_min_offset;
3076 if (minlen && (dontbother < minlen))
3077 dontbother = minlen - 1;
3078 strend -= dontbother; /* this one's always in bytes! */
3079 /* We don't know much -- general case. */
3082 if (regtry(reginfo, &s))
3091 if (regtry(reginfo, &s))
3093 } while (s++ < strend);
3101 /* s/// doesn't like it if $& is earlier than where we asked it to
3102 * start searching (which can happen on something like /.\G/) */
3103 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3104 && (prog->offs[0].start < stringarg - strbeg))
3106 /* this should only be possible under \G */
3107 assert(prog->intflags & PREGf_GPOS_SEEN);
3108 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3109 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3115 PerlIO_printf(Perl_debug_log,
3116 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3123 /* clean up; this will trigger destructors that will free all slabs
3124 * above the current one, and cleanup the regmatch_info_aux
3125 * and regmatch_info_aux_eval sructs */
3127 LEAVE_SCOPE(oldsave);
3129 if (RXp_PAREN_NAMES(prog))
3130 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3132 RX_MATCH_UTF8_set(rx, utf8_target);
3134 /* make sure $`, $&, $', and $digit will work later */
3135 if ( !(flags & REXEC_NOT_FIRST) )
3136 S_reg_set_capture_string(aTHX_ rx,
3137 strbeg, reginfo->strend,
3138 sv, flags, utf8_target);
3143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3144 PL_colors[4], PL_colors[5]));
3146 /* clean up; this will trigger destructors that will free all slabs
3147 * above the current one, and cleanup the regmatch_info_aux
3148 * and regmatch_info_aux_eval sructs */
3150 LEAVE_SCOPE(oldsave);
3153 /* we failed :-( roll it back */
3154 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3155 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3160 Safefree(prog->offs);
3167 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3168 * Do inc before dec, in case old and new rex are the same */
3169 #define SET_reg_curpm(Re2) \
3170 if (reginfo->info_aux_eval) { \
3171 (void)ReREFCNT_inc(Re2); \
3172 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3173 PM_SETRE((PL_reg_curpm), (Re2)); \
3178 - regtry - try match at specific point
3180 STATIC I32 /* 0 failure, 1 success */
3181 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3184 REGEXP *const rx = reginfo->prog;
3185 regexp *const prog = ReANY(rx);
3187 RXi_GET_DECL(prog,progi);
3188 GET_RE_DEBUG_FLAGS_DECL;
3190 PERL_ARGS_ASSERT_REGTRY;
3192 reginfo->cutpoint=NULL;
3194 prog->offs[0].start = *startposp - reginfo->strbeg;
3195 prog->lastparen = 0;
3196 prog->lastcloseparen = 0;
3198 /* XXXX What this code is doing here?!!! There should be no need
3199 to do this again and again, prog->lastparen should take care of
3202 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3203 * Actually, the code in regcppop() (which Ilya may be meaning by
3204 * prog->lastparen), is not needed at all by the test suite
3205 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3206 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3207 * Meanwhile, this code *is* needed for the
3208 * above-mentioned test suite tests to succeed. The common theme
3209 * on those tests seems to be returning null fields from matches.
3210 * --jhi updated by dapm */
3212 if (prog->nparens) {
3213 regexp_paren_pair *pp = prog->offs;
3215 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3223 result = regmatch(reginfo, *startposp, progi->program + 1);
3225 prog->offs[0].end = result;
3228 if (reginfo->cutpoint)
3229 *startposp= reginfo->cutpoint;
3230 REGCP_UNWIND(lastcp);
3235 #define sayYES goto yes
3236 #define sayNO goto no
3237 #define sayNO_SILENT goto no_silent
3239 /* we dont use STMT_START/END here because it leads to
3240 "unreachable code" warnings, which are bogus, but distracting. */
3241 #define CACHEsayNO \
3242 if (ST.cache_mask) \
3243 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3246 /* this is used to determine how far from the left messages like
3247 'failed...' are printed. It should be set such that messages
3248 are inline with the regop output that created them.
3250 #define REPORT_CODE_OFF 32
3253 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3254 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3255 #define CHRTEST_NOT_A_CP_1 -999
3256 #define CHRTEST_NOT_A_CP_2 -998
3258 /* grab a new slab and return the first slot in it */
3260 STATIC regmatch_state *
3263 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3266 regmatch_slab *s = PL_regmatch_slab->next;
3268 Newx(s, 1, regmatch_slab);
3269 s->prev = PL_regmatch_slab;
3271 PL_regmatch_slab->next = s;
3273 PL_regmatch_slab = s;
3274 return SLAB_FIRST(s);
3278 /* push a new state then goto it */
3280 #define PUSH_STATE_GOTO(state, node, input) \
3281 pushinput = input; \
3283 st->resume_state = state; \
3286 /* push a new state with success backtracking, then goto it */
3288 #define PUSH_YES_STATE_GOTO(state, node, input) \
3289 pushinput = input; \
3291 st->resume_state = state; \
3292 goto push_yes_state;
3299 regmatch() - main matching routine
3301 This is basically one big switch statement in a loop. We execute an op,
3302 set 'next' to point the next op, and continue. If we come to a point which
3303 we may need to backtrack to on failure such as (A|B|C), we push a
3304 backtrack state onto the backtrack stack. On failure, we pop the top
3305 state, and re-enter the loop at the state indicated. If there are no more
3306 states to pop, we return failure.
3308 Sometimes we also need to backtrack on success; for example /A+/, where
3309 after successfully matching one A, we need to go back and try to
3310 match another one; similarly for lookahead assertions: if the assertion
3311 completes successfully, we backtrack to the state just before the assertion
3312 and then carry on. In these cases, the pushed state is marked as
3313 'backtrack on success too'. This marking is in fact done by a chain of
3314 pointers, each pointing to the previous 'yes' state. On success, we pop to
3315 the nearest yes state, discarding any intermediate failure-only states.
3316 Sometimes a yes state is pushed just to force some cleanup code to be
3317 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3318 it to free the inner regex.
3320 Note that failure backtracking rewinds the cursor position, while
3321 success backtracking leaves it alone.
3323 A pattern is complete when the END op is executed, while a subpattern
3324 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3325 ops trigger the "pop to last yes state if any, otherwise return true"
3328 A common convention in this function is to use A and B to refer to the two
3329 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3330 the subpattern to be matched possibly multiple times, while B is the entire
3331 rest of the pattern. Variable and state names reflect this convention.
3333 The states in the main switch are the union of ops and failure/success of
3334 substates associated with with that op. For example, IFMATCH is the op
3335 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3336 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3337 successfully matched A and IFMATCH_A_fail is a state saying that we have
3338 just failed to match A. Resume states always come in pairs. The backtrack
3339 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3340 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3341 on success or failure.
3343 The struct that holds a backtracking state is actually a big union, with
3344 one variant for each major type of op. The variable st points to the
3345 top-most backtrack struct. To make the code clearer, within each
3346 block of code we #define ST to alias the relevant union.
3348 Here's a concrete example of a (vastly oversimplified) IFMATCH
3354 #define ST st->u.ifmatch
3356 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3357 ST.foo = ...; // some state we wish to save
3359 // push a yes backtrack state with a resume value of
3360 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3362 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3365 case IFMATCH_A: // we have successfully executed A; now continue with B
3367 bar = ST.foo; // do something with the preserved value
3370 case IFMATCH_A_fail: // A failed, so the assertion failed
3371 ...; // do some housekeeping, then ...
3372 sayNO; // propagate the failure
3379 For any old-timers reading this who are familiar with the old recursive
3380 approach, the code above is equivalent to:
3382 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3391 ...; // do some housekeeping, then ...
3392 sayNO; // propagate the failure
3395 The topmost backtrack state, pointed to by st, is usually free. If you
3396 want to claim it, populate any ST.foo fields in it with values you wish to
3397 save, then do one of
3399 PUSH_STATE_GOTO(resume_state, node, newinput);
3400 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3402 which sets that backtrack state's resume value to 'resume_state', pushes a
3403 new free entry to the top of the backtrack stack, then goes to 'node'.
3404 On backtracking, the free slot is popped, and the saved state becomes the
3405 new free state. An ST.foo field in this new top state can be temporarily
3406 accessed to retrieve values, but once the main loop is re-entered, it
3407 becomes available for reuse.
3409 Note that the depth of the backtrack stack constantly increases during the
3410 left-to-right execution of the pattern, rather than going up and down with
3411 the pattern nesting. For example the stack is at its maximum at Z at the
3412 end of the pattern, rather than at X in the following:
3414 /(((X)+)+)+....(Y)+....Z/
3416 The only exceptions to this are lookahead/behind assertions and the cut,
3417 (?>A), which pop all the backtrack states associated with A before
3420 Backtrack state structs are allocated in slabs of about 4K in size.
3421 PL_regmatch_state and st always point to the currently active state,
3422 and PL_regmatch_slab points to the slab currently containing
3423 PL_regmatch_state. The first time regmatch() is called, the first slab is
3424 allocated, and is never freed until interpreter destruction. When the slab
3425 is full, a new one is allocated and chained to the end. At exit from
3426 regmatch(), slabs allocated since entry are freed.
3431 #define DEBUG_STATE_pp(pp) \
3433 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3434 PerlIO_printf(Perl_debug_log, \
3435 " %*s"pp" %s%s%s%s%s\n", \
3437 PL_reg_name[st->resume_state], \
3438 ((st==yes_state||st==mark_state) ? "[" : ""), \
3439 ((st==yes_state) ? "Y" : ""), \
3440 ((st==mark_state) ? "M" : ""), \
3441 ((st==yes_state||st==mark_state) ? "]" : "") \
3446 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3451 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3452 const char *start, const char *end, const char *blurb)
3454 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3456 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3461 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3462 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3464 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3465 start, end - start, 60);
3467 PerlIO_printf(Perl_debug_log,
3468 "%s%s REx%s %s against %s\n",
3469 PL_colors[4], blurb, PL_colors[5], s0, s1);
3471 if (utf8_target||utf8_pat)
3472 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3473 utf8_pat ? "pattern" : "",
3474 utf8_pat && utf8_target ? " and " : "",
3475 utf8_target ? "string" : ""
3481 S_dump_exec_pos(pTHX_ const char *locinput,
3482 const regnode *scan,
3483 const char *loc_regeol,
3484 const char *loc_bostr,
3485 const char *loc_reg_starttry,
3486 const bool utf8_target)
3488 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3489 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3490 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3491 /* The part of the string before starttry has one color
3492 (pref0_len chars), between starttry and current
3493 position another one (pref_len - pref0_len chars),
3494 after the current position the third one.
3495 We assume that pref0_len <= pref_len, otherwise we
3496 decrease pref0_len. */
3497 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3498 ? (5 + taill) - l : locinput - loc_bostr;
3501 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3503 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3505 pref0_len = pref_len - (locinput - loc_reg_starttry);
3506 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3507 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3508 ? (5 + taill) - pref_len : loc_regeol - locinput);
3509 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3513 if (pref0_len > pref_len)
3514 pref0_len = pref_len;
3516 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3518 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3519 (locinput - pref_len),pref0_len, 60, 4, 5);
3521 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3522 (locinput - pref_len + pref0_len),
3523 pref_len - pref0_len, 60, 2, 3);
3525 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3526 locinput, loc_regeol - locinput, 10, 0, 1);
3528 const STRLEN tlen=len0+len1+len2;
3529 PerlIO_printf(Perl_debug_log,
3530 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3531 (IV)(locinput - loc_bostr),
3534 (docolor ? "" : "> <"),
3536 (int)(tlen > 19 ? 0 : 19 - tlen),
3543 /* reg_check_named_buff_matched()
3544 * Checks to see if a named buffer has matched. The data array of
3545 * buffer numbers corresponding to the buffer is expected to reside
3546 * in the regexp->data->data array in the slot stored in the ARG() of
3547 * node involved. Note that this routine doesn't actually care about the
3548 * name, that information is not preserved from compilation to execution.
3549 * Returns the index of the leftmost defined buffer with the given name
3550 * or 0 if non of the buffers matched.
3553 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3556 RXi_GET_DECL(rex,rexi);
3557 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3558 I32 *nums=(I32*)SvPVX(sv_dat);
3560 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3562 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3563 if ((I32)rex->lastparen >= nums[n] &&
3564 rex->offs[nums[n]].end != -1)
3574 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3575 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3577 /* This function determines if there are one or two characters that match
3578 * the first character of the passed-in EXACTish node <text_node>, and if
3579 * so, returns them in the passed-in pointers.
3581 * If it determines that no possible character in the target string can
3582 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3583 * the first character in <text_node> requires UTF-8 to represent, and the
3584 * target string isn't in UTF-8.)
3586 * If there are more than two characters that could match the beginning of
3587 * <text_node>, or if more context is required to determine a match or not,
3588 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3590 * The motiviation behind this function is to allow the caller to set up
3591 * tight loops for matching. If <text_node> is of type EXACT, there is
3592 * only one possible character that can match its first character, and so
3593 * the situation is quite simple. But things get much more complicated if
3594 * folding is involved. It may be that the first character of an EXACTFish
3595 * node doesn't participate in any possible fold, e.g., punctuation, so it
3596 * can be matched only by itself. The vast majority of characters that are
3597 * in folds match just two things, their lower and upper-case equivalents.
3598 * But not all are like that; some have multiple possible matches, or match
3599 * sequences of more than one character. This function sorts all that out.
3601 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3602 * loop of trying to match A*, we know we can't exit where the thing
3603 * following it isn't a B. And something can't be a B unless it is the
3604 * beginning of B. By putting a quick test for that beginning in a tight
3605 * loop, we can rule out things that can't possibly be B without having to
3606 * break out of the loop, thus avoiding work. Similarly, if A is a single
3607 * character, we can make a tight loop matching A*, using the outputs of
3610 * If the target string to match isn't in UTF-8, and there aren't
3611 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3612 * the one or two possible octets (which are characters in this situation)
3613 * that can match. In all cases, if there is only one character that can
3614 * match, *<c1p> and *<c2p> will be identical.
3616 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3617 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3618 * can match the beginning of <text_node>. They should be declared with at
3619 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3620 * undefined what these contain.) If one or both of the buffers are
3621 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3622 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3623 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3624 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3625 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3627 const bool utf8_target = reginfo->is_utf8_target;
3629 UV c1 = CHRTEST_NOT_A_CP_1;
3630 UV c2 = CHRTEST_NOT_A_CP_2;
3631 bool use_chrtest_void = FALSE;
3632 const bool is_utf8_pat = reginfo->is_utf8_pat;
3634 /* Used when we have both utf8 input and utf8 output, to avoid converting
3635 * to/from code points */
3636 bool utf8_has_been_setup = FALSE;
3640 U8 *pat = (U8*)STRING(text_node);
3641 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3643 if (OP(text_node) == EXACT) {
3645 /* In an exact node, only one thing can be matched, that first
3646 * character. If both the pat and the target are UTF-8, we can just
3647 * copy the input to the output, avoiding finding the code point of
3652 else if (utf8_target) {
3653 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3654 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3655 utf8_has_been_setup = TRUE;
3658 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3661 else { /* an EXACTFish node */
3662 U8 *pat_end = pat + STR_LEN(text_node);
3664 /* An EXACTFL node has at least some characters unfolded, because what
3665 * they match is not known until now. So, now is the time to fold
3666 * the first few of them, as many as are needed to determine 'c1' and
3667 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3668 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3669 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3670 * need to fold as many characters as a single character can fold to,
3671 * so that later we can check if the first ones are such a multi-char
3672 * fold. But, in such a pattern only locale-problematic characters
3673 * aren't folded, so we can skip this completely if the first character
3674 * in the node isn't one of the tricky ones */
3675 if (OP(text_node) == EXACTFL) {
3677 if (! is_utf8_pat) {
3678 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3680 folded[0] = folded[1] = 's';
3682 pat_end = folded + 2;
3685 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3690 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3692 *(d++) = (U8) toFOLD_LC(*s);
3697 _to_utf8_fold_flags(s,
3700 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3711 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3712 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3714 /* Multi-character folds require more context to sort out. Also
3715 * PL_utf8_foldclosures used below doesn't handle them, so have to
3716 * be handled outside this routine */
3717 use_chrtest_void = TRUE;
3719 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3720 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3722 /* Load the folds hash, if not already done */
3724 if (! PL_utf8_foldclosures) {
3725 _load_PL_utf8_foldclosures();
3728 /* The fold closures data structure is a hash with the keys
3729 * being the UTF-8 of every character that is folded to, like
3730 * 'k', and the values each an array of all code points that
3731 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
3732 * Multi-character folds are not included */
3733 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3738 /* Not found in the hash, therefore there are no folds
3739 * containing it, so there is only a single character that
3743 else { /* Does participate in folds */
3744 AV* list = (AV*) *listp;
3745 if (av_tindex(list) != 1) {
3747 /* If there aren't exactly two folds to this, it is
3748 * outside the scope of this function */
3749 use_chrtest_void = TRUE;
3751 else { /* There are two. Get them */
3752 SV** c_p = av_fetch(list, 0, FALSE);
3754 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3758 c_p = av_fetch(list, 1, FALSE);
3760 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3764 /* Folds that cross the 255/256 boundary are forbidden
3765 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3766 * one is ASCIII. Since the pattern character is above
3767 * 255, and its only other match is below 256, the only
3768 * legal match will be to itself. We have thrown away
3769 * the original, so have to compute which is the one
3771 if ((c1 < 256) != (c2 < 256)) {
3772 if ((OP(text_node) == EXACTFL
3773 && ! IN_UTF8_CTYPE_LOCALE)
3774 || ((OP(text_node) == EXACTFA
3775 || OP(text_node) == EXACTFA_NO_TRIE)
3776 && (isASCII(c1) || isASCII(c2))))
3789 else /* Here, c1 is <= 255 */
3791 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3792 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3793 && ((OP(text_node) != EXACTFA
3794 && OP(text_node) != EXACTFA_NO_TRIE)
3797 /* Here, there could be something above Latin1 in the target
3798 * which folds to this character in the pattern. All such
3799 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3800 * than two characters involved in their folds, so are outside
3801 * the scope of this function */
3802 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3803 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3806 use_chrtest_void = TRUE;
3809 else { /* Here nothing above Latin1 can fold to the pattern
3811 switch (OP(text_node)) {
3813 case EXACTFL: /* /l rules */
3814 c2 = PL_fold_locale[c1];
3817 case EXACTF: /* This node only generated for non-utf8
3819 assert(! is_utf8_pat);
3820 if (! utf8_target) { /* /d rules */
3825 /* /u rules for all these. This happens to work for
3826 * EXACTFA as nothing in Latin1 folds to ASCII */
3827 case EXACTFA_NO_TRIE: /* This node only generated for
3828 non-utf8 patterns */
3829 assert(! is_utf8_pat);
3834 c2 = PL_fold_latin1[c1];
3838 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3839 assert(0); /* NOTREACHED */
3845 /* Here have figured things out. Set up the returns */
3846 if (use_chrtest_void) {
3847 *c2p = *c1p = CHRTEST_VOID;
3849 else if (utf8_target) {
3850 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3851 uvchr_to_utf8(c1_utf8, c1);
3852 uvchr_to_utf8(c2_utf8, c2);
3855 /* Invariants are stored in both the utf8 and byte outputs; Use
3856 * negative numbers otherwise for the byte ones. Make sure that the
3857 * byte ones are the same iff the utf8 ones are the same */
3858 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3859 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3862 ? CHRTEST_NOT_A_CP_1
3863 : CHRTEST_NOT_A_CP_2;
3865 else if (c1 > 255) {
3866 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3871 *c1p = *c2p = c2; /* c2 is the only representable value */
3873 else { /* c1 is representable; see about c2 */
3875 *c2p = (c2 < 256) ? c2 : c1;
3881 /* returns -1 on failure, $+[0] on success */
3883 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3885 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3889 const bool utf8_target = reginfo->is_utf8_target;
3890 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3891 REGEXP *rex_sv = reginfo->prog;
3892 regexp *rex = ReANY(rex_sv);
3893 RXi_GET_DECL(rex,rexi);
3894 /* the current state. This is a cached copy of PL_regmatch_state */
3896 /* cache heavy used fields of st in registers */
3899 U32 n = 0; /* general value; init to avoid compiler warning */
3900 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
3901 char *locinput = startpos;
3902 char *pushinput; /* where to continue after a PUSH */
3903 I32 nextchr; /* is always set to UCHARAT(locinput) */
3905 bool result = 0; /* return value of S_regmatch */
3906 int depth = 0; /* depth of backtrack stack */
3907 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3908 const U32 max_nochange_depth =
3909 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3910 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3911 regmatch_state *yes_state = NULL; /* state to pop to on success of
3913 /* mark_state piggy backs on the yes_state logic so that when we unwind
3914 the stack on success we can update the mark_state as we go */
3915 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3916 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3917 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3919 bool no_final = 0; /* prevent failure from backtracking? */
3920 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3921 char *startpoint = locinput;
3922 SV *popmark = NULL; /* are we looking for a mark? */
3923 SV *sv_commit = NULL; /* last mark name seen in failure */
3924 SV *sv_yes_mark = NULL; /* last mark name we have seen
3925 during a successful match */
3926 U32 lastopen = 0; /* last open we saw */
3927 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3928 SV* const oreplsv =&n