5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
102 /* Valid for non-utf8 strings: avoids the reginclass
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
105 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106 : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
115 #define HOPc(pos,off) \
116 (char *)(reginfo->is_utf8_target \
117 ? reghop3((U8*)pos, off, \
118 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
120 #define HOPBACKc(pos, off) \
121 (char*)(reginfo->is_utf8_target \
122 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123 : (pos - off >= reginfo->strbeg) \
127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132 #define NEXTCHR_IS_EOS (nextchr < 0)
134 #define SET_nextchr \
135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
137 #define SET_locinput(p) \
142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
151 /* If in debug mode, we test that a known character properly matches */
153 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
159 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
170 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
177 COMBINING_GRAVE_ACCENT_UTF8); \
180 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
183 /* for use after a quantifier and before an EXACT-like node -- japhy */
184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
193 #define JUMPABLE(rn) ( \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
214 /* ... so we use this as its faster. */
215 #define IS_TEXT(rn) ( OP(rn)==EXACT )
216 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
217 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
226 #define FIND_NEXT_IMPT(rn) STMT_START { \
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
230 rn = NEXTOPER(NEXTOPER(rn)); \
231 else if (type == PLUS) \
233 else if (type == IFMATCH) \
234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235 else rn += NEXT_OFF(rn); \
239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245 #define SBASE 0xAC00 /* Start of block */
246 #define SCount 11172 /* Length of block */
249 #define SLAB_FIRST(s) (&(s)->states[0])
250 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
254 static void S_clear_backtrack_stack(pTHX_ void *p);
255 static regmatch_state * S_push_slab(pTHX);
257 #define REGCP_PAREN_ELEMS 3
258 #define REGCP_OTHER_ELEMS 3
259 #define REGCP_FRAME_ELEMS 1
260 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
261 * are needed for the regexp context stack bookkeeping. */
264 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
267 const int retval = PL_savestack_ix;
268 const int paren_elems_to_push =
269 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
270 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
271 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
273 GET_RE_DEBUG_FLAGS_DECL;
275 PERL_ARGS_ASSERT_REGCPPUSH;
277 if (paren_elems_to_push < 0)
278 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
279 paren_elems_to_push);
281 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
282 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
283 " out of range (%lu-%ld)",
285 (unsigned long)maxopenparen,
288 SSGROW(total_elems + REGCP_FRAME_ELEMS);
291 if ((int)maxopenparen > (int)parenfloor)
292 PerlIO_printf(Perl_debug_log,
293 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
298 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
299 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
300 SSPUSHINT(rex->offs[p].end);
301 SSPUSHINT(rex->offs[p].start);
302 SSPUSHINT(rex->offs[p].start_tmp);
303 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
304 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
306 (IV)rex->offs[p].start,
307 (IV)rex->offs[p].start_tmp,
311 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
312 SSPUSHINT(maxopenparen);
313 SSPUSHINT(rex->lastparen);
314 SSPUSHINT(rex->lastcloseparen);
315 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
320 /* These are needed since we do not localize EVAL nodes: */
321 #define REGCP_SET(cp) \
323 PerlIO_printf(Perl_debug_log, \
324 " Setting an EVAL scope, savestack=%"IVdf"\n", \
325 (IV)PL_savestack_ix)); \
328 #define REGCP_UNWIND(cp) \
330 if (cp != PL_savestack_ix) \
331 PerlIO_printf(Perl_debug_log, \
332 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
333 (IV)(cp), (IV)PL_savestack_ix)); \
336 #define UNWIND_PAREN(lp, lcp) \
337 for (n = rex->lastparen; n > lp; n--) \
338 rex->offs[n].end = -1; \
339 rex->lastparen = n; \
340 rex->lastcloseparen = lcp;
344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
349 GET_RE_DEBUG_FLAGS_DECL;
351 PERL_ARGS_ASSERT_REGCPPOP;
353 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
355 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
356 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
357 rex->lastcloseparen = SSPOPINT;
358 rex->lastparen = SSPOPINT;
359 *maxopenparen_p = SSPOPINT;
361 i -= REGCP_OTHER_ELEMS;
362 /* Now restore the parentheses context. */
364 if (i || rex->lastparen + 1 <= rex->nparens)
365 PerlIO_printf(Perl_debug_log,
366 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
371 paren = *maxopenparen_p;
372 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
374 rex->offs[paren].start_tmp = SSPOPINT;
375 rex->offs[paren].start = SSPOPINT;
377 if (paren <= rex->lastparen)
378 rex->offs[paren].end = tmps;
379 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
380 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
382 (IV)rex->offs[paren].start,
383 (IV)rex->offs[paren].start_tmp,
384 (IV)rex->offs[paren].end,
385 (paren > rex->lastparen ? "(skipped)" : ""));
390 /* It would seem that the similar code in regtry()
391 * already takes care of this, and in fact it is in
392 * a better location to since this code can #if 0-ed out
393 * but the code in regtry() is needed or otherwise tests
394 * requiring null fields (pat.t#187 and split.t#{13,14}
395 * (as of patchlevel 7877) will fail. Then again,
396 * this code seems to be necessary or otherwise
397 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
398 * --jhi updated by dapm */
399 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
400 if (i > *maxopenparen_p)
401 rex->offs[i].start = -1;
402 rex->offs[i].end = -1;
403 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
404 " \\%"UVuf": %s ..-1 undeffing\n",
406 (i > *maxopenparen_p) ? "-1" : " "
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
418 I32 tmpix = PL_savestack_ix;
419 PL_savestack_ix = ix;
420 regcppop(rex, maxopenparen_p);
421 PL_savestack_ix = tmpix;
424 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
427 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
429 /* Returns a boolean as to whether or not 'character' is a member of the
430 * Posix character class given by 'classnum' that should be equivalent to a
431 * value in the typedef '_char_class_number'.
433 * Ideally this could be replaced by a just an array of function pointers
434 * to the C library functions that implement the macros this calls.
435 * However, to compile, the precise function signatures are required, and
436 * these may vary from platform to to platform. To avoid having to figure
437 * out what those all are on each platform, I (khw) am using this method,
438 * which adds an extra layer of function call overhead (unless the C
439 * optimizer strips it away). But we don't particularly care about
440 * performance with locales anyway. */
442 switch ((_char_class_number) classnum) {
443 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
444 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
445 case _CC_ENUM_ASCII: return isASCII_LC(character);
446 case _CC_ENUM_BLANK: return isBLANK_LC(character);
447 case _CC_ENUM_CASED: return isLOWER_LC(character)
448 || isUPPER_LC(character);
449 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
450 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
451 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
452 case _CC_ENUM_LOWER: return isLOWER_LC(character);
453 case _CC_ENUM_PRINT: return isPRINT_LC(character);
454 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
455 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
456 case _CC_ENUM_SPACE: return isSPACE_LC(character);
457 case _CC_ENUM_UPPER: return isUPPER_LC(character);
458 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
459 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
460 default: /* VERTSPACE should never occur in locales */
461 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
464 assert(0); /* NOTREACHED */
469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
471 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
472 * 'character' is a member of the Posix character class given by 'classnum'
473 * that should be equivalent to a value in the typedef
474 * '_char_class_number'.
476 * This just calls isFOO_lc on the code point for the character if it is in
477 * the range 0-255. Outside that range, all characters avoid Unicode
478 * rules, ignoring any locale. So use the Unicode function if this class
479 * requires a swash, and use the Unicode macro otherwise. */
481 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
483 if (UTF8_IS_INVARIANT(*character)) {
484 return isFOO_lc(classnum, *character);
486 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
487 return isFOO_lc(classnum,
488 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
491 if (classnum < _FIRST_NON_SWASH_CC) {
493 /* Initialize the swash unless done already */
494 if (! PL_utf8_swash_ptrs[classnum]) {
495 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
496 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
497 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
500 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
502 TRUE /* is UTF */ ));
505 switch ((_char_class_number) classnum) {
507 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
509 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
510 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
511 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
512 default: return 0; /* Things like CNTRL are always
516 assert(0); /* NOTREACHED */
521 * pregexec and friends
524 #ifndef PERL_IN_XSUB_RE
526 - pregexec - match a regexp against a string
529 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
530 char *strbeg, I32 minend, SV *screamer, U32 nosave)
531 /* stringarg: the point in the string at which to begin matching */
532 /* strend: pointer to null at end of string */
533 /* strbeg: real beginning of string */
534 /* minend: end of match must be >= minend bytes after stringarg. */
535 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
536 * itself is accessed via the pointers above */
537 /* nosave: For optimizations. */
539 PERL_ARGS_ASSERT_PREGEXEC;
542 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
543 nosave ? 0 : REXEC_COPY_STR);
548 * Need to implement the following flags for reg_anch:
550 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
552 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
553 * INTUIT_AUTORITATIVE_ML
554 * INTUIT_ONCE_NOML - Intuit can match in one location only.
557 * Another flag for this function: SECOND_TIME (so that float substrs
558 * with giant delta may be not rechecked).
561 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
563 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
564 Otherwise, only SvCUR(sv) is used to get strbeg. */
566 /* XXXX We assume that strpos is strbeg unless sv. */
568 /* XXXX Some places assume that there is a fixed substring.
569 An update may be needed if optimizer marks as "INTUITable"
570 RExen without fixed substrings. Similarly, it is assumed that
571 lengths of all the strings are no more than minlen, thus they
572 cannot come from lookahead.
573 (Or minlen should take into account lookahead.)
574 NOTE: Some of this comment is not correct. minlen does now take account
575 of lookahead/behind. Further research is required. -- demerphq
579 /* A failure to find a constant substring means that there is no need to make
580 an expensive call to REx engine, thus we celebrate a failure. Similarly,
581 finding a substring too deep into the string means that fewer calls to
582 regtry() should be needed.
584 REx compiler's optimizer found 4 possible hints:
585 a) Anchored substring;
587 c) Whether we are anchored (beginning-of-line or \G);
588 d) First node (of those at offset 0) which may distinguish positions;
589 We use a)b)d) and multiline-part of c), and try to find a position in the
590 string which does not contradict any of them.
593 /* Most of decisions we do here should have been done at compile time.
594 The nodes of the REx which we used for the search should have been
595 deleted from the finite automaton. */
598 * rx: the regex to match against
599 * sv: the SV being matched: only used for utf8 flag; the string
600 * itself is accessed via the pointers below. Note that on
601 * something like an overloaded SV, SvPOK(sv) may be false
602 * and the string pointers may point to something unrelated to
604 * strbeg: real beginning of string
605 * strpos: the point in the string at which to begin matching
606 * strend: pointer to the byte following the last char of the string
607 * flags currently unused; set to 0
608 * data: currently unused; set to NULL
612 Perl_re_intuit_start(pTHX_
615 const char * const strbeg,
619 re_scream_pos_data *data)
622 struct regexp *const prog = ReANY(rx);
624 /* Should be nonnegative! */
629 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
631 char *other_last = NULL; /* other substr checked before this */
632 char *check_at = NULL; /* check substr found at this pos */
633 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
634 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
635 RXi_GET_DECL(prog,progi);
636 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
637 regmatch_info *const reginfo = ®info_buf;
639 const char * const i_strpos = strpos;
641 GET_RE_DEBUG_FLAGS_DECL;
643 PERL_ARGS_ASSERT_RE_INTUIT_START;
644 PERL_UNUSED_ARG(flags);
645 PERL_UNUSED_ARG(data);
647 reginfo->is_utf8_target = cBOOL(utf8_target);
649 /* CHR_DIST() would be more correct here but it makes things slow. */
650 if (prog->minlen > strend - strpos) {
651 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
652 "String too short... [re_intuit_start]\n"));
656 reginfo->info_aux = NULL;
657 reginfo->strbeg = strbeg;
658 reginfo->strend = strend;
659 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
661 /* not actually used within intuit, but zero for safety anyway */
662 reginfo->poscache_maxiter = 0;
665 if (!prog->check_utf8 && prog->check_substr)
666 to_utf8_substr(prog);
667 check = prog->check_utf8;
669 if (!prog->check_substr && prog->check_utf8) {
670 if (! to_byte_substr(prog)) {
671 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
674 check = prog->check_substr;
676 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
677 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
678 || ( (prog->extflags & RXf_ANCH_BOL)
679 && !multiline ) ); /* Check after \n? */
682 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
683 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
684 && (strpos != strbeg)) {
685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
688 if (prog->check_offset_min == prog->check_offset_max
689 && !(prog->extflags & RXf_CANY_SEEN)
690 && ! multiline) /* /m can cause \n's to match that aren't
691 accounted for in the string max length.
692 See [perl #115242] */
694 /* Substring at constant offset from beg-of-str... */
697 s = HOP3c(strpos, prog->check_offset_min, strend);
700 slen = SvCUR(check); /* >= 1 */
702 if ( strend - s > slen || strend - s < slen - 1
703 || (strend - s == slen && strend[-1] != '\n')) {
704 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
707 /* Now should match s[0..slen-2] */
709 if (slen && (*SvPVX_const(check) != *s
711 && memNE(SvPVX_const(check), s, slen)))) {
713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
717 else if (*SvPVX_const(check) != *s
718 || ((slen = SvCUR(check)) > 1
719 && memNE(SvPVX_const(check), s, slen)))
722 goto success_at_start;
725 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
727 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
728 end_shift = prog->check_end_shift;
731 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
732 - (SvTAIL(check) != 0);
733 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
735 if (end_shift < eshift)
739 else { /* Can match at random position */
742 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
743 end_shift = prog->check_end_shift;
745 /* end shift should be non negative here */
748 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
750 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
751 (IV)end_shift, RX_PRECOMP(prog));
755 /* Find a possible match in the region s..strend by looking for
756 the "check" substring in the region corrected by start/end_shift. */
759 I32 srch_start_shift = start_shift;
760 I32 srch_end_shift = end_shift;
763 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
764 srch_end_shift -= ((strbeg - s) - srch_start_shift);
765 srch_start_shift = strbeg - s;
767 DEBUG_OPTIMISE_MORE_r({
768 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
769 (IV)prog->check_offset_min,
770 (IV)srch_start_shift,
772 (IV)prog->check_end_shift);
775 if (prog->extflags & RXf_CANY_SEEN) {
776 start_point= (U8*)(s + srch_start_shift);
777 end_point= (U8*)(strend - srch_end_shift);
779 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
780 end_point= HOP3(strend, -srch_end_shift, strbeg);
782 DEBUG_OPTIMISE_MORE_r({
783 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
784 (int)(end_point - start_point),
785 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
789 s = fbm_instr( start_point, end_point,
790 check, multiline ? FBMrf_MULTILINE : 0);
792 /* Update the count-of-usability, remove useless subpatterns,
796 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
797 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
798 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
799 (s ? "Found" : "Did not find"),
800 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
801 ? "anchored" : "floating"),
804 (s ? " at offset " : "...\n") );
809 /* Finish the diagnostic message */
810 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
812 /* XXX dmq: first branch is for positive lookbehind...
813 Our check string is offset from the beginning of the pattern.
814 So we need to do any stclass tests offset forward from that
823 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
824 Start with the other substr.
825 XXXX no SCREAM optimization yet - and a very coarse implementation
826 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
827 *always* match. Probably should be marked during compile...
828 Probably it is right to do no SCREAM here...
831 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
832 : (prog->float_substr && prog->anchored_substr))
834 /* Take into account the "other" substring. */
835 /* XXXX May be hopelessly wrong for UTF... */
838 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
841 char * const last = HOP3c(s, -start_shift, strbeg);
843 char * const saved_s = s;
846 t = s - prog->check_offset_max;
847 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
849 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
854 t = HOP3c(t, prog->anchored_offset, strend);
855 if (t < other_last) /* These positions already checked */
857 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
860 /* XXXX It is not documented what units *_offsets are in.
861 We assume bytes, but this is clearly wrong.
862 Meaning this code needs to be carefully reviewed for errors.
866 /* On end-of-str: see comment below. */
867 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
868 if (must == &PL_sv_undef) {
870 DEBUG_r(must = prog->anchored_utf8); /* for debug */
875 HOP3(HOP3(last1, prog->anchored_offset, strend)
876 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
878 multiline ? FBMrf_MULTILINE : 0
881 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
882 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
883 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
884 (s ? "Found" : "Contradicts"),
885 quoted, RE_SV_TAIL(must));
890 if (last1 >= last2) {
891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
892 ", giving up...\n"));
895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
896 ", trying floating at offset %ld...\n",
897 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
898 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
899 s = HOP3c(last, 1, strend);
903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
904 (long)(s - i_strpos)));
905 t = HOP3c(s, -prog->anchored_offset, strbeg);
906 other_last = HOP3c(s, 1, strend);
914 else { /* Take into account the floating substring. */
916 char * const saved_s = s;
919 t = HOP3c(s, -start_shift, strbeg);
921 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
922 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
923 last = HOP3c(t, prog->float_max_offset, strend);
924 s = HOP3c(t, prog->float_min_offset, strend);
927 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
928 must = utf8_target ? prog->float_utf8 : prog->float_substr;
929 /* fbm_instr() takes into account exact value of end-of-str
930 if the check is SvTAIL(ed). Since false positives are OK,
931 and end-of-str is not later than strend we are OK. */
932 if (must == &PL_sv_undef) {
934 DEBUG_r(must = prog->float_utf8); /* for debug message */
937 s = fbm_instr((unsigned char*)s,
938 (unsigned char*)last + SvCUR(must)
940 must, multiline ? FBMrf_MULTILINE : 0);
942 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
943 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
944 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
945 (s ? "Found" : "Contradicts"),
946 quoted, RE_SV_TAIL(must));
950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
951 ", giving up...\n"));
954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
955 ", trying anchored starting at offset %ld...\n",
956 (long)(saved_s + 1 - i_strpos)));
958 s = HOP3c(t, 1, strend);
962 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
963 (long)(s - i_strpos)));
964 other_last = s; /* Fix this later. --Hugo */
974 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
976 DEBUG_OPTIMISE_MORE_r(
977 PerlIO_printf(Perl_debug_log,
978 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
979 (IV)prog->check_offset_min,
980 (IV)prog->check_offset_max,
988 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
990 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
993 /* Fixed substring is found far enough so that the match
994 cannot start at strpos. */
996 if (ml_anch && t[-1] != '\n') {
997 /* Eventually fbm_*() should handle this, but often
998 anchored_offset is not 0, so this check will not be wasted. */
999 /* XXXX In the code below we prefer to look for "^" even in
1000 presence of anchored substrings. And we search even
1001 beyond the found float position. These pessimizations
1002 are historical artefacts only. */
1004 while (t < strend - prog->minlen) {
1006 if (t < check_at - prog->check_offset_min) {
1007 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1008 /* Since we moved from the found position,
1009 we definitely contradict the found anchored
1010 substr. Due to the above check we do not
1011 contradict "check" substr.
1012 Thus we can arrive here only if check substr
1013 is float. Redo checking for "other"=="fixed".
1016 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1017 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1018 goto do_other_anchored;
1020 /* We don't contradict the found floating substring. */
1021 /* XXXX Why not check for STCLASS? */
1023 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1024 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1027 /* Position contradicts check-string */
1028 /* XXXX probably better to look for check-string
1029 than for "\n", so one should lower the limit for t? */
1030 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1031 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1032 other_last = strpos = s = t + 1;
1037 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1038 PL_colors[0], PL_colors[1]));
1042 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1043 PL_colors[0], PL_colors[1]));
1047 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1050 /* The found string does not prohibit matching at strpos,
1051 - no optimization of calling REx engine can be performed,
1052 unless it was an MBOL and we are not after MBOL,
1053 or a future STCLASS check will fail this. */
1055 /* Even in this situation we may use MBOL flag if strpos is offset
1056 wrt the start of the string. */
1057 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1058 /* May be due to an implicit anchor of m{.*foo} */
1059 && !(prog->intflags & PREGf_IMPLICIT))
1064 DEBUG_EXECUTE_r( if (ml_anch)
1065 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1066 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1069 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1071 prog->check_utf8 /* Could be deleted already */
1072 && --BmUSEFUL(prog->check_utf8) < 0
1073 && (prog->check_utf8 == prog->float_utf8)
1075 prog->check_substr /* Could be deleted already */
1076 && --BmUSEFUL(prog->check_substr) < 0
1077 && (prog->check_substr == prog->float_substr)
1080 /* If flags & SOMETHING - do not do it many times on the same match */
1081 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1082 /* XXX Does the destruction order has to change with utf8_target? */
1083 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1084 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1085 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1086 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1087 check = NULL; /* abort */
1089 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1090 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1091 if (prog->intflags & PREGf_IMPLICIT)
1092 prog->extflags &= ~RXf_ANCH_MBOL;
1093 /* XXXX This is a remnant of the old implementation. It
1094 looks wasteful, since now INTUIT can use many
1095 other heuristics. */
1096 prog->extflags &= ~RXf_USE_INTUIT;
1097 /* XXXX What other flags might need to be cleared in this branch? */
1103 /* Last resort... */
1104 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1105 /* trie stclasses are too expensive to use here, we are better off to
1106 leave it to regmatch itself */
1107 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1108 /* minlen == 0 is possible if regstclass is \b or \B,
1109 and the fixed substr is ''$.
1110 Since minlen is already taken into account, s+1 is before strend;
1111 accidentally, minlen >= 1 guaranties no false positives at s + 1
1112 even for \b or \B. But (minlen? 1 : 0) below assumes that
1113 regstclass does not come from lookahead... */
1114 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1115 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1116 const U8* const str = (U8*)STRING(progi->regstclass);
1117 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1118 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1121 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1122 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1123 else if (prog->float_substr || prog->float_utf8)
1124 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1128 if (checked_upto < s)
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1134 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1140 const char *what = NULL;
1142 if (endpos == strend) {
1143 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1144 "Could not match STCLASS...\n") );
1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1148 "This position contradicts STCLASS...\n") );
1149 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1151 checked_upto = HOPBACKc(endpos, start_shift);
1152 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1153 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1154 /* Contradict one of substrings */
1155 if (prog->anchored_substr || prog->anchored_utf8) {
1156 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1157 DEBUG_EXECUTE_r( what = "anchored" );
1159 s = HOP3c(t, 1, strend);
1160 if (s + start_shift + end_shift > strend) {
1161 /* XXXX Should be taken into account earlier? */
1162 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1163 "Could not match STCLASS...\n") );
1168 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1169 "Looking for %s substr starting at offset %ld...\n",
1170 what, (long)(s + start_shift - i_strpos)) );
1173 /* Have both, check_string is floating */
1174 if (t + start_shift >= check_at) /* Contradicts floating=check */
1175 goto retry_floating_check;
1176 /* Recheck anchored substring, but not floating... */
1180 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1181 "Looking for anchored substr starting at offset %ld...\n",
1182 (long)(other_last - i_strpos)) );
1183 goto do_other_anchored;
1185 /* Another way we could have checked stclass at the
1186 current position only: */
1191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1192 "Looking for /%s^%s/m starting at offset %ld...\n",
1193 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1196 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1198 /* Check is floating substring. */
1199 retry_floating_check:
1200 t = check_at - start_shift;
1201 DEBUG_EXECUTE_r( what = "floating" );
1202 goto hop_and_restart;
1205 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1206 "By STCLASS: moving %ld --> %ld\n",
1207 (long)(t - i_strpos), (long)(s - i_strpos))
1211 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1212 "Does not contradict STCLASS...\n");
1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1218 PL_colors[4], (check ? "Guessed" : "Giving up"),
1219 PL_colors[5], (long)(s - i_strpos)) );
1222 fail_finish: /* Substring not found */
1223 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1224 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1226 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1227 PL_colors[4], PL_colors[5]));
1231 #define DECL_TRIE_TYPE(scan) \
1232 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1233 trie_type = ((scan->flags == EXACT) \
1234 ? (utf8_target ? trie_utf8 : trie_plain) \
1235 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1237 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1240 switch (trie_type) { \
1241 case trie_utf8_fold: \
1242 if ( foldlen>0 ) { \
1243 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1248 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1249 len = UTF8SKIP(uc); \
1250 skiplen = UNISKIP( uvc ); \
1251 foldlen -= skiplen; \
1252 uscan = foldbuf + skiplen; \
1255 case trie_latin_utf8_fold: \
1256 if ( foldlen>0 ) { \
1257 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1263 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
1264 skiplen = UNISKIP( uvc ); \
1265 foldlen -= skiplen; \
1266 uscan = foldbuf + skiplen; \
1270 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1277 charid = trie->charmap[ uvc ]; \
1281 if (widecharmap) { \
1282 SV** const svpp = hv_fetch(widecharmap, \
1283 (char*)&uvc, sizeof(UV), 0); \
1285 charid = (U16)SvIV(*svpp); \
1290 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1294 && (ln == 1 || folder(s, pat_string, ln)) \
1295 && (reginfo->intuit || regtry(reginfo, &s)) )\
1301 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1303 while (s < strend) { \
1309 #define REXEC_FBC_SCAN(CoDe) \
1311 while (s < strend) { \
1317 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1318 REXEC_FBC_UTF8_SCAN( \
1320 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1329 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1332 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1341 #define REXEC_FBC_TRYIT \
1342 if ((reginfo->intuit || regtry(reginfo, &s))) \
1345 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1346 if (utf8_target) { \
1347 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1350 REXEC_FBC_CLASS_SCAN(CoNd); \
1353 #define DUMP_EXEC_POS(li,s,doutf8) \
1354 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1355 (PL_reg_starttry),doutf8)
1358 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1360 tmp = TEST_NON_UTF8(tmp); \
1361 REXEC_FBC_UTF8_SCAN( \
1362 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1371 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1372 if (s == reginfo->strbeg) { \
1376 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1377 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1380 LOAD_UTF8_CHARCLASS_ALNUM(); \
1381 REXEC_FBC_UTF8_SCAN( \
1382 if (tmp == ! (TeSt2_UtF8)) { \
1391 /* The only difference between the BOUND and NBOUND cases is that
1392 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1393 * NBOUND. This is accomplished by passing it in either the if or else clause,
1394 * with the other one being empty */
1395 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1396 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1398 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1399 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1401 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1402 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1404 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1405 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1408 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1409 * be passed in completely with the variable name being tested, which isn't
1410 * such a clean interface, but this is easier to read than it was before. We
1411 * are looking for the boundary (or non-boundary between a word and non-word
1412 * character. The utf8 and non-utf8 cases have the same logic, but the details
1413 * must be different. Find the "wordness" of the character just prior to this
1414 * one, and compare it with the wordness of this one. If they differ, we have
1415 * a boundary. At the beginning of the string, pretend that the previous
1416 * character was a new-line */
1417 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1418 if (utf8_target) { \
1421 else { /* Not utf8 */ \
1422 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1423 tmp = TEST_NON_UTF8(tmp); \
1425 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1434 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1437 /* We know what class REx starts with. Try to find this position... */
1438 /* if reginfo->intuit, its a dryrun */
1439 /* annoyingly all the vars in this routine have different names from their counterparts
1440 in regmatch. /grrr */
1443 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1444 const char *strend, regmatch_info *reginfo)
1447 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1448 char *pat_string; /* The pattern's exactish string */
1449 char *pat_end; /* ptr to end char of pat_string */
1450 re_fold_t folder; /* Function for computing non-utf8 folds */
1451 const U8 *fold_array; /* array for folding ords < 256 */
1457 I32 tmp = 1; /* Scratch variable? */
1458 const bool utf8_target = reginfo->is_utf8_target;
1459 UV utf8_fold_flags = 0;
1460 const bool is_utf8_pat = reginfo->is_utf8_pat;
1461 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1462 with a result inverts that result, as 0^1 =
1464 _char_class_number classnum;
1466 RXi_GET_DECL(prog,progi);
1468 PERL_ARGS_ASSERT_FIND_BYCLASS;
1470 /* We know what class it must start with. */
1473 case ANYOF_SYNTHETIC:
1474 case ANYOF_WARN_SUPER:
1476 REXEC_FBC_UTF8_CLASS_SCAN(
1477 reginclass(prog, c, (U8*)s, utf8_target));
1480 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1485 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1493 if (is_utf8_pat || utf8_target) {
1494 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1495 goto do_exactf_utf8;
1497 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1498 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1499 goto do_exactf_non_utf8; /* isn't dealt with by these */
1504 /* regcomp.c already folded this if pattern is in UTF-8 */
1505 utf8_fold_flags = 0;
1506 goto do_exactf_utf8;
1508 fold_array = PL_fold;
1510 goto do_exactf_non_utf8;
1513 if (is_utf8_pat || utf8_target) {
1514 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1515 goto do_exactf_utf8;
1517 fold_array = PL_fold_locale;
1518 folder = foldEQ_locale;
1519 goto do_exactf_non_utf8;
1523 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1525 goto do_exactf_utf8;
1527 case EXACTFU_TRICKYFOLD:
1529 if (is_utf8_pat || utf8_target) {
1530 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1531 goto do_exactf_utf8;
1534 /* Any 'ss' in the pattern should have been replaced by regcomp,
1535 * so we don't have to worry here about this single special case
1536 * in the Latin1 range */
1537 fold_array = PL_fold_latin1;
1538 folder = foldEQ_latin1;
1542 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1543 are no glitches with fold-length differences
1544 between the target string and pattern */
1546 /* The idea in the non-utf8 EXACTF* cases is to first find the
1547 * first character of the EXACTF* node and then, if necessary,
1548 * case-insensitively compare the full text of the node. c1 is the
1549 * first character. c2 is its fold. This logic will not work for
1550 * Unicode semantics and the german sharp ss, which hence should
1551 * not be compiled into a node that gets here. */
1552 pat_string = STRING(c);
1553 ln = STR_LEN(c); /* length to match in octets/bytes */
1555 /* We know that we have to match at least 'ln' bytes (which is the
1556 * same as characters, since not utf8). If we have to match 3
1557 * characters, and there are only 2 availabe, we know without
1558 * trying that it will fail; so don't start a match past the
1559 * required minimum number from the far end */
1560 e = HOP3c(strend, -((I32)ln), s);
1562 if (reginfo->intuit && e < s) {
1563 e = s; /* Due to minlen logic of intuit() */
1567 c2 = fold_array[c1];
1568 if (c1 == c2) { /* If char and fold are the same */
1569 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1572 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1580 /* If one of the operands is in utf8, we can't use the simpler folding
1581 * above, due to the fact that many different characters can have the
1582 * same fold, or portion of a fold, or different- length fold */
1583 pat_string = STRING(c);
1584 ln = STR_LEN(c); /* length to match in octets/bytes */
1585 pat_end = pat_string + ln;
1586 lnc = is_utf8_pat /* length to match in characters */
1587 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1590 /* We have 'lnc' characters to match in the pattern, but because of
1591 * multi-character folding, each character in the target can match
1592 * up to 3 characters (Unicode guarantees it will never exceed
1593 * this) if it is utf8-encoded; and up to 2 if not (based on the
1594 * fact that the Latin 1 folds are already determined, and the
1595 * only multi-char fold in that range is the sharp-s folding to
1596 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1597 * string character. Adjust lnc accordingly, rounding up, so that
1598 * if we need to match at least 4+1/3 chars, that really is 5. */
1599 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1600 lnc = (lnc + expansion - 1) / expansion;
1602 /* As in the non-UTF8 case, if we have to match 3 characters, and
1603 * only 2 are left, it's guaranteed to fail, so don't start a
1604 * match that would require us to go beyond the end of the string
1606 e = HOP3c(strend, -((I32)lnc), s);
1608 if (reginfo->intuit && e < s) {
1609 e = s; /* Due to minlen logic of intuit() */
1612 /* XXX Note that we could recalculate e to stop the loop earlier,
1613 * as the worst case expansion above will rarely be met, and as we
1614 * go along we would usually find that e moves further to the left.
1615 * This would happen only after we reached the point in the loop
1616 * where if there were no expansion we should fail. Unclear if
1617 * worth the expense */
1620 char *my_strend= (char *)strend;
1621 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1622 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1623 && (reginfo->intuit || regtry(reginfo, &s)) )
1627 s += (utf8_target) ? UTF8SKIP(s) : 1;
1632 RXp_MATCH_TAINTED_on(prog);
1633 FBC_BOUND(isWORDCHAR_LC,
1634 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1635 isWORDCHAR_LC_utf8((U8*)s));
1638 RXp_MATCH_TAINTED_on(prog);
1639 FBC_NBOUND(isWORDCHAR_LC,
1640 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1641 isWORDCHAR_LC_utf8((U8*)s));
1644 FBC_BOUND(isWORDCHAR,
1645 isWORDCHAR_uni(tmp),
1646 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1649 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1651 isWORDCHAR_A((U8*)s));
1654 FBC_NBOUND(isWORDCHAR,
1655 isWORDCHAR_uni(tmp),
1656 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1659 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1661 isWORDCHAR_A((U8*)s));
1664 FBC_BOUND(isWORDCHAR_L1,
1665 isWORDCHAR_uni(tmp),
1666 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1669 FBC_NBOUND(isWORDCHAR_L1,
1670 isWORDCHAR_uni(tmp),
1671 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1674 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1675 is_LNBREAK_latin1_safe(s, strend)
1679 /* The argument to all the POSIX node types is the class number to pass to
1680 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1687 RXp_MATCH_TAINTED_on(prog);
1688 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1689 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1704 /* The complement of something that matches only ASCII matches all
1705 * UTF-8 variant code points, plus everything in ASCII that isn't
1707 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1708 || ! _generic_isCC_A(*s, FLAGS(c)));
1717 /* Don't need to worry about utf8, as it can match only a single
1718 * byte invariant character. */
1719 REXEC_FBC_CLASS_SCAN(
1720 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1728 if (! utf8_target) {
1729 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1735 classnum = (_char_class_number) FLAGS(c);
1736 if (classnum < _FIRST_NON_SWASH_CC) {
1737 while (s < strend) {
1739 /* We avoid loading in the swash as long as possible, but
1740 * should we have to, we jump to a separate loop. This
1741 * extra 'if' statement is what keeps this code from being
1742 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1743 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1744 goto found_above_latin1;
1746 if ((UTF8_IS_INVARIANT(*s)
1747 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1749 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1750 && to_complement ^ cBOOL(
1751 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1754 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1766 else switch (classnum) { /* These classes are implemented as
1768 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1769 revert the change of \v matching this */
1772 case _CC_ENUM_PSXSPC:
1773 REXEC_FBC_UTF8_CLASS_SCAN(
1774 to_complement ^ cBOOL(isSPACE_utf8(s)));
1777 case _CC_ENUM_BLANK:
1778 REXEC_FBC_UTF8_CLASS_SCAN(
1779 to_complement ^ cBOOL(isBLANK_utf8(s)));
1782 case _CC_ENUM_XDIGIT:
1783 REXEC_FBC_UTF8_CLASS_SCAN(
1784 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1787 case _CC_ENUM_VERTSPACE:
1788 REXEC_FBC_UTF8_CLASS_SCAN(
1789 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1792 case _CC_ENUM_CNTRL:
1793 REXEC_FBC_UTF8_CLASS_SCAN(
1794 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1798 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1799 assert(0); /* NOTREACHED */
1804 found_above_latin1: /* Here we have to load a swash to get the result
1805 for the current code point */
1806 if (! PL_utf8_swash_ptrs[classnum]) {
1807 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1808 PL_utf8_swash_ptrs[classnum] =
1809 _core_swash_init("utf8", swash_property_names[classnum],
1810 &PL_sv_undef, 1, 0, NULL, &flags);
1813 /* This is a copy of the loop above for swash classes, though using the
1814 * FBC macro instead of being expanded out. Since we've loaded the
1815 * swash, we don't have to check for that each time through the loop */
1816 REXEC_FBC_UTF8_CLASS_SCAN(
1817 to_complement ^ cBOOL(_generic_utf8(
1820 swash_fetch(PL_utf8_swash_ptrs[classnum],
1828 /* what trie are we using right now */
1829 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1830 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1831 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1833 const char *last_start = strend - trie->minlen;
1835 const char *real_start = s;
1837 STRLEN maxlen = trie->maxlen;
1839 U8 **points; /* map of where we were in the input string
1840 when reading a given char. For ASCII this
1841 is unnecessary overhead as the relationship
1842 is always 1:1, but for Unicode, especially
1843 case folded Unicode this is not true. */
1844 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1848 GET_RE_DEBUG_FLAGS_DECL;
1850 /* We can't just allocate points here. We need to wrap it in
1851 * an SV so it gets freed properly if there is a croak while
1852 * running the match */
1855 sv_points=newSV(maxlen * sizeof(U8 *));
1856 SvCUR_set(sv_points,
1857 maxlen * sizeof(U8 *));
1858 SvPOK_on(sv_points);
1859 sv_2mortal(sv_points);
1860 points=(U8**)SvPV_nolen(sv_points );
1861 if ( trie_type != trie_utf8_fold
1862 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1865 bitmap=(U8*)trie->bitmap;
1867 bitmap=(U8*)ANYOF_BITMAP(c);
1869 /* this is the Aho-Corasick algorithm modified a touch
1870 to include special handling for long "unknown char" sequences.
1871 The basic idea being that we use AC as long as we are dealing
1872 with a possible matching char, when we encounter an unknown char
1873 (and we have not encountered an accepting state) we scan forward
1874 until we find a legal starting char.
1875 AC matching is basically that of trie matching, except that when
1876 we encounter a failing transition, we fall back to the current
1877 states "fail state", and try the current char again, a process
1878 we repeat until we reach the root state, state 1, or a legal
1879 transition. If we fail on the root state then we can either
1880 terminate if we have reached an accepting state previously, or
1881 restart the entire process from the beginning if we have not.
1884 while (s <= last_start) {
1885 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1893 U8 *uscan = (U8*)NULL;
1894 U8 *leftmost = NULL;
1896 U32 accepted_word= 0;
1900 while ( state && uc <= (U8*)strend ) {
1902 U32 word = aho->states[ state ].wordnum;
1906 DEBUG_TRIE_EXECUTE_r(
1907 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1908 dump_exec_pos( (char *)uc, c, strend, real_start,
1909 (char *)uc, utf8_target );
1910 PerlIO_printf( Perl_debug_log,
1911 " Scanning for legal start char...\n");
1915 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1925 if (uc >(U8*)last_start) break;
1929 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1930 if (!leftmost || lpos < leftmost) {
1931 DEBUG_r(accepted_word=word);
1937 points[pointpos++ % maxlen]= uc;
1938 if (foldlen || uc < (U8*)strend) {
1939 REXEC_TRIE_READ_CHAR(trie_type, trie,
1941 uscan, len, uvc, charid, foldlen,
1943 DEBUG_TRIE_EXECUTE_r({
1944 dump_exec_pos( (char *)uc, c, strend,
1945 real_start, s, utf8_target);
1946 PerlIO_printf(Perl_debug_log,
1947 " Charid:%3u CP:%4"UVxf" ",
1959 word = aho->states[ state ].wordnum;
1961 base = aho->states[ state ].trans.base;
1963 DEBUG_TRIE_EXECUTE_r({
1965 dump_exec_pos( (char *)uc, c, strend, real_start,
1967 PerlIO_printf( Perl_debug_log,
1968 "%sState: %4"UVxf", word=%"UVxf,
1969 failed ? " Fail transition to " : "",
1970 (UV)state, (UV)word);
1976 ( ((offset = base + charid
1977 - 1 - trie->uniquecharcount)) >= 0)
1978 && ((U32)offset < trie->lasttrans)
1979 && trie->trans[offset].check == state
1980 && (tmp=trie->trans[offset].next))
1982 DEBUG_TRIE_EXECUTE_r(
1983 PerlIO_printf( Perl_debug_log," - legal\n"));
1988 DEBUG_TRIE_EXECUTE_r(
1989 PerlIO_printf( Perl_debug_log," - fail\n"));
1991 state = aho->fail[state];
1995 /* we must be accepting here */
1996 DEBUG_TRIE_EXECUTE_r(
1997 PerlIO_printf( Perl_debug_log," - accepting\n"));
2006 if (!state) state = 1;
2009 if ( aho->states[ state ].wordnum ) {
2010 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2011 if (!leftmost || lpos < leftmost) {
2012 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2017 s = (char*)leftmost;
2018 DEBUG_TRIE_EXECUTE_r({
2020 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2021 (UV)accepted_word, (IV)(s - real_start)
2024 if (reginfo->intuit || regtry(reginfo, &s)) {
2030 DEBUG_TRIE_EXECUTE_r({
2031 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2034 DEBUG_TRIE_EXECUTE_r(
2035 PerlIO_printf( Perl_debug_log,"No match.\n"));
2044 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2054 - regexec_flags - match a regexp against a string
2057 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2058 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2059 /* stringarg: the point in the string at which to begin matching */
2060 /* strend: pointer to null at end of string */
2061 /* strbeg: real beginning of string */
2062 /* minend: end of match must be >= minend bytes after stringarg. */
2063 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2064 * itself is accessed via the pointers above */
2065 /* data: May be used for some additional optimizations.
2066 Currently its only used, with a U32 cast, for transmitting
2067 the ganch offset when doing a /g match. This will change */
2068 /* nosave: For optimizations. */
2072 struct regexp *const prog = ReANY(rx);
2075 char *startpos = stringarg;
2076 I32 minlen; /* must match at least this many chars */
2077 I32 dontbother = 0; /* how many characters not to try at end */
2078 I32 end_shift = 0; /* Same for the end. */ /* CC */
2079 I32 scream_pos = -1; /* Internal iterator of scream. */
2080 char *scream_olds = NULL;
2081 const bool utf8_target = cBOOL(DO_UTF8(sv));
2083 RXi_GET_DECL(prog,progi);
2084 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2085 regmatch_info *const reginfo = ®info_buf;
2086 regexp_paren_pair *swap = NULL;
2088 GET_RE_DEBUG_FLAGS_DECL;
2090 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2091 PERL_UNUSED_ARG(data);
2093 /* Be paranoid... */
2094 if (prog == NULL || startpos == NULL) {
2095 Perl_croak(aTHX_ "NULL regexp parameter");
2099 multiline = prog->extflags & RXf_PMf_MULTILINE;
2101 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2102 reginfo->intuit = 0;
2103 reginfo->is_utf8_target = cBOOL(utf8_target);
2105 /* on first ever match, allocate first slab */
2106 if (!PL_regmatch_slab) {
2107 Newx(PL_regmatch_slab, 1, regmatch_slab);
2108 PL_regmatch_slab->prev = NULL;
2109 PL_regmatch_slab->next = NULL;
2110 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2113 /* note current PL_regmatch_state position; at end of match we'll
2114 * pop back to there and free any higher slabs */
2115 oldsave = PL_savestack_ix;
2118 debug_start_match(rx, utf8_target, startpos, strend,
2122 minlen = prog->minlen;
2124 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2125 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2126 "String too short [regexec_flags]...\n"));
2131 /* Check validity of program. */
2132 if (UCHARAT(progi->program) != REG_MAGIC) {
2133 Perl_croak(aTHX_ "corrupted regexp program");
2136 RX_MATCH_TAINTED_off(rx);
2138 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2139 reginfo->warned = FALSE;
2140 reginfo->strbeg = strbeg;
2142 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2144 /* Mark end of string for $ (and such) */
2145 reginfo->strend = strend;
2147 /* see how far we have to get to not match where we matched before */
2148 reginfo->till = startpos+minend;
2150 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2151 SAVEVPTR(PL_regmatch_slab);
2152 SAVEVPTR(PL_regmatch_state);
2154 /* grab next slot in regmatch_state stack to store regmatch_info_aux
2157 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2158 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2159 PL_regmatch_state = S_push_slab(aTHX);
2161 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2164 if ((prog->extflags & RXf_EVAL_SEEN)) {
2165 /* grab next slot in regmatch_state stack to store
2166 * regmatch_info_aux_eval struct */
2168 reginfo->info_aux_eval =
2169 reginfo->info_aux->info_aux_eval =
2170 &(PL_regmatch_state->u.info_aux_eval);
2172 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2173 PL_regmatch_state = S_push_slab(aTHX);
2175 S_setup_eval_state(aTHX_ reginfo);
2178 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2181 /* If there is a "must appear" string, look for it. */
2184 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2186 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2187 reginfo->ganch = startpos + prog->gofs;
2188 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189 "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2190 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2192 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2193 && mg->mg_len >= 0) {
2194 reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2195 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2196 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2198 if (prog->extflags & RXf_ANCH_GPOS) {
2199 if (s > reginfo->ganch)
2201 s = reginfo->ganch - prog->gofs;
2202 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2203 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2209 reginfo->ganch = strbeg + PTR2UV(data);
2210 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2211 "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2213 } else { /* pos() not defined */
2214 reginfo->ganch = strbeg;
2215 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2216 "GPOS: reginfo->ganch = strbeg\n"));
2219 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2220 /* We have to be careful. If the previous successful match
2221 was from this regex we don't want a subsequent partially
2222 successful match to clobber the old results.
2223 So when we detect this possibility we add a swap buffer
2224 to the re, and switch the buffer each match. If we fail,
2225 we switch it back; otherwise we leave it swapped.
2228 /* do we need a save destructor here for eval dies? */
2229 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2230 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2231 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2237 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2238 re_scream_pos_data d;
2240 d.scream_olds = &scream_olds;
2241 d.scream_pos = &scream_pos;
2242 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2244 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2245 goto phooey; /* not present */
2251 /* Simplest case: anchored match need be tried only once. */
2252 /* [unless only anchor is BOL and multiline is set] */
2253 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2254 if (s == startpos && regtry(reginfo, &startpos))
2256 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2257 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2262 dontbother = minlen - 1;
2263 end = HOP3c(strend, -dontbother, strbeg) - 1;
2264 /* for multiline we only have to try after newlines */
2265 if (prog->check_substr || prog->check_utf8) {
2266 /* because of the goto we can not easily reuse the macros for bifurcating the
2267 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2270 goto after_try_utf8;
2272 if (regtry(reginfo, &s)) {
2279 if (prog->extflags & RXf_USE_INTUIT) {
2280 s = re_intuit_start(rx, sv, strbeg,
2281 s + UTF8SKIP(s), strend, flags, NULL);
2290 } /* end search for check string in unicode */
2292 if (s == startpos) {
2293 goto after_try_latin;
2296 if (regtry(reginfo, &s)) {
2303 if (prog->extflags & RXf_USE_INTUIT) {
2304 s = re_intuit_start(rx, sv, strbeg,
2305 s + 1, strend, flags, NULL);
2314 } /* end search for check string in latin*/
2315 } /* end search for check string */
2316 else { /* search for newline */
2318 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2321 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2322 while (s <= end) { /* note it could be possible to match at the end of the string */
2323 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2324 if (regtry(reginfo, &s))
2328 } /* end search for newline */
2329 } /* end anchored/multiline check string search */
2331 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2333 /* the warning about reginfo->ganch being used without initialization
2334 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2335 and we only enter this block when the same bit is set. */
2336 char *tmp_s = reginfo->ganch - prog->gofs;
2338 if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2343 /* Messy cases: unanchored match. */
2344 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2345 /* we have /x+whatever/ */
2346 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2352 if (! prog->anchored_utf8) {
2353 to_utf8_substr(prog);
2355 ch = SvPVX_const(prog->anchored_utf8)[0];
2358 DEBUG_EXECUTE_r( did_match = 1 );
2359 if (regtry(reginfo, &s)) goto got_it;
2361 while (s < strend && *s == ch)
2368 if (! prog->anchored_substr) {
2369 if (! to_byte_substr(prog)) {
2370 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2373 ch = SvPVX_const(prog->anchored_substr)[0];
2376 DEBUG_EXECUTE_r( did_match = 1 );
2377 if (regtry(reginfo, &s)) goto got_it;
2379 while (s < strend && *s == ch)
2384 DEBUG_EXECUTE_r(if (!did_match)
2385 PerlIO_printf(Perl_debug_log,
2386 "Did not find anchored character...\n")
2389 else if (prog->anchored_substr != NULL
2390 || prog->anchored_utf8 != NULL
2391 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2392 && prog->float_max_offset < strend - s)) {
2397 char *last1; /* Last position checked before */
2401 if (prog->anchored_substr || prog->anchored_utf8) {
2403 if (! prog->anchored_utf8) {
2404 to_utf8_substr(prog);
2406 must = prog->anchored_utf8;
2409 if (! prog->anchored_substr) {
2410 if (! to_byte_substr(prog)) {
2411 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2414 must = prog->anchored_substr;
2416 back_max = back_min = prog->anchored_offset;
2419 if (! prog->float_utf8) {
2420 to_utf8_substr(prog);
2422 must = prog->float_utf8;
2425 if (! prog->float_substr) {
2426 if (! to_byte_substr(prog)) {
2427 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2430 must = prog->float_substr;
2432 back_max = prog->float_max_offset;
2433 back_min = prog->float_min_offset;
2439 last = HOP3c(strend, /* Cannot start after this */
2440 -(I32)(CHR_SVLEN(must)
2441 - (SvTAIL(must) != 0) + back_min), strbeg);
2443 if (s > reginfo->strbeg)
2444 last1 = HOPc(s, -1);
2446 last1 = s - 1; /* bogus */
2448 /* XXXX check_substr already used to find "s", can optimize if
2449 check_substr==must. */
2451 dontbother = end_shift;
2452 strend = HOPc(strend, -dontbother);
2453 while ( (s <= last) &&
2454 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2455 (unsigned char*)strend, must,
2456 multiline ? FBMrf_MULTILINE : 0)) ) {
2457 DEBUG_EXECUTE_r( did_match = 1 );
2458 if (HOPc(s, -back_max) > last1) {
2459 last1 = HOPc(s, -back_min);
2460 s = HOPc(s, -back_max);
2463 char * const t = (last1 >= reginfo->strbeg)
2464 ? HOPc(last1, 1) : last1 + 1;
2466 last1 = HOPc(s, -back_min);
2470 while (s <= last1) {
2471 if (regtry(reginfo, &s))
2474 s++; /* to break out of outer loop */
2481 while (s <= last1) {
2482 if (regtry(reginfo, &s))
2488 DEBUG_EXECUTE_r(if (!did_match) {
2489 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2490 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2491 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2492 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2493 ? "anchored" : "floating"),
2494 quoted, RE_SV_TAIL(must));
2498 else if ( (c = progi->regstclass) ) {
2500 const OPCODE op = OP(progi->regstclass);
2501 /* don't bother with what can't match */
2502 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2503 strend = HOPc(strend, -(minlen - 1));
2506 SV * const prop = sv_newmortal();
2507 regprop(prog, prop, c);
2509 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2511 PerlIO_printf(Perl_debug_log,
2512 "Matching stclass %.*s against %s (%d bytes)\n",
2513 (int)SvCUR(prop), SvPVX_const(prop),
2514 quoted, (int)(strend - s));
2517 if (find_byclass(prog, c, s, strend, reginfo))
2519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2523 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2531 if (! prog->float_utf8) {
2532 to_utf8_substr(prog);
2534 float_real = prog->float_utf8;
2537 if (! prog->float_substr) {
2538 if (! to_byte_substr(prog)) {
2539 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2542 float_real = prog->float_substr;
2545 little = SvPV_const(float_real, len);
2546 if (SvTAIL(float_real)) {
2547 /* This means that float_real contains an artificial \n on
2548 * the end due to the presence of something like this:
2549 * /foo$/ where we can match both "foo" and "foo\n" at the
2550 * end of the string. So we have to compare the end of the
2551 * string first against the float_real without the \n and
2552 * then against the full float_real with the string. We
2553 * have to watch out for cases where the string might be
2554 * smaller than the float_real or the float_real without
2556 char *checkpos= strend - len;
2558 PerlIO_printf(Perl_debug_log,
2559 "%sChecking for float_real.%s\n",
2560 PL_colors[4], PL_colors[5]));
2561 if (checkpos + 1 < strbeg) {
2562 /* can't match, even if we remove the trailing \n
2563 * string is too short to match */
2565 PerlIO_printf(Perl_debug_log,
2566 "%sString shorter than required trailing substring, cannot match.%s\n",
2567 PL_colors[4], PL_colors[5]));
2569 } else if (memEQ(checkpos + 1, little, len - 1)) {
2570 /* can match, the end of the string matches without the
2572 last = checkpos + 1;
2573 } else if (checkpos < strbeg) {
2574 /* cant match, string is too short when the "\n" is
2577 PerlIO_printf(Perl_debug_log,
2578 "%sString does not contain required trailing substring, cannot match.%s\n",
2579 PL_colors[4], PL_colors[5]));
2581 } else if (!multiline) {
2582 /* non multiline match, so compare with the "\n" at the
2583 * end of the string */
2584 if (memEQ(checkpos, little, len)) {
2588 PerlIO_printf(Perl_debug_log,
2589 "%sString does not contain required trailing substring, cannot match.%s\n",
2590 PL_colors[4], PL_colors[5]));
2594 /* multiline match, so we have to search for a place
2595 * where the full string is located */
2601 last = rninstr(s, strend, little, little + len);
2603 last = strend; /* matching "$" */
2606 /* at one point this block contained a comment which was
2607 * probably incorrect, which said that this was a "should not
2608 * happen" case. Even if it was true when it was written I am
2609 * pretty sure it is not anymore, so I have removed the comment
2610 * and replaced it with this one. Yves */
2612 PerlIO_printf(Perl_debug_log,
2613 "String does not contain required substring, cannot match.\n"
2617 dontbother = strend - last + prog->float_min_offset;
2619 if (minlen && (dontbother < minlen))
2620 dontbother = minlen - 1;
2621 strend -= dontbother; /* this one's always in bytes! */
2622 /* We don't know much -- general case. */
2625 if (regtry(reginfo, &s))
2634 if (regtry(reginfo, &s))
2636 } while (s++ < strend);
2646 PerlIO_printf(Perl_debug_log,
2647 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2654 /* clean up; this will trigger destructors that will free all slabs
2655 * above the current one, and cleanup the regmatch_info_aux
2656 * and regmatch_info_aux_eval sructs */
2658 LEAVE_SCOPE(oldsave);
2660 if (RXp_PAREN_NAMES(prog))
2661 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2663 RX_MATCH_UTF8_set(rx, utf8_target);
2665 /* make sure $`, $&, $', and $digit will work later */
2666 if ( !(flags & REXEC_NOT_FIRST) ) {
2667 if (flags & REXEC_COPY_STR) {
2671 PerlIO_printf(Perl_debug_log,
2672 "Copy on write: regexp capture, type %d\n",
2675 RX_MATCH_COPY_FREE(rx);
2676 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2677 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2678 assert (SvPOKp(prog->saved_copy));
2679 prog->sublen = reginfo->strend - strbeg;
2680 prog->suboffset = 0;
2681 prog->subcoffset = 0;
2686 I32 max = reginfo->strend - strbeg;
2689 if ( (flags & REXEC_COPY_SKIP_POST)
2690 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2692 ) { /* don't copy $' part of string */
2695 /* calculate the right-most part of the string covered
2696 * by a capture. Due to look-ahead, this may be to
2697 * the right of $&, so we have to scan all captures */
2698 while (n <= prog->lastparen) {
2699 if (prog->offs[n].end > max)
2700 max = prog->offs[n].end;
2704 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2705 ? prog->offs[0].start
2707 assert(max >= 0 && max <= reginfo->strend - strbeg);
2710 if ( (flags & REXEC_COPY_SKIP_PRE)
2711 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2712 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2713 ) { /* don't copy $` part of string */
2716 /* calculate the left-most part of the string covered
2717 * by a capture. Due to look-behind, this may be to
2718 * the left of $&, so we have to scan all captures */
2719 while (min && n <= prog->lastparen) {
2720 if ( prog->offs[n].start != -1
2721 && prog->offs[n].start < min)
2723 min = prog->offs[n].start;
2727 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2728 && min > prog->offs[0].end
2730 min = prog->offs[0].end;
2734 assert(min >= 0 && min <= max
2735 && min <= reginfo->strend - strbeg);
2738 if (RX_MATCH_COPIED(rx)) {
2739 if (sublen > prog->sublen)
2741 (char*)saferealloc(prog->subbeg, sublen+1);
2744 prog->subbeg = (char*)safemalloc(sublen+1);
2745 Copy(strbeg + min, prog->subbeg, sublen, char);
2746 prog->subbeg[sublen] = '\0';
2747 prog->suboffset = min;
2748 prog->sublen = sublen;
2749 RX_MATCH_COPIED_on(rx);
2751 prog->subcoffset = prog->suboffset;
2752 if (prog->suboffset && utf8_target) {
2753 /* Convert byte offset to chars.
2754 * XXX ideally should only compute this if @-/@+
2755 * has been seen, a la PL_sawampersand ??? */
2757 /* If there's a direct correspondence between the
2758 * string which we're matching and the original SV,
2759 * then we can use the utf8 len cache associated with
2760 * the SV. In particular, it means that under //g,
2761 * sv_pos_b2u() will use the previously cached
2762 * position to speed up working out the new length of
2763 * subcoffset, rather than counting from the start of
2764 * the string each time. This stops
2765 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2766 * from going quadratic */
2767 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2768 sv_pos_b2u(sv, &(prog->subcoffset));
2770 prog->subcoffset = utf8_length((U8*)strbeg,
2771 (U8*)(strbeg+prog->suboffset));
2775 RX_MATCH_COPY_FREE(rx);
2776 prog->subbeg = strbeg;
2777 prog->suboffset = 0;
2778 prog->subcoffset = 0;
2779 /* use reginfo->strend, as strend may have been modified */
2780 prog->sublen = reginfo->strend - strbeg;
2787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2788 PL_colors[4], PL_colors[5]));
2790 /* clean up; this will trigger destructors that will free all slabs
2791 * above the current one, and cleanup the regmatch_info_aux
2792 * and regmatch_info_aux_eval sructs */
2794 LEAVE_SCOPE(oldsave);
2797 /* we failed :-( roll it back */
2798 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2799 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2804 Safefree(prog->offs);
2811 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2812 * Do inc before dec, in case old and new rex are the same */
2813 #define SET_reg_curpm(Re2) \
2814 if (reginfo->info_aux_eval) { \
2815 (void)ReREFCNT_inc(Re2); \
2816 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2817 PM_SETRE((PL_reg_curpm), (Re2)); \
2822 - regtry - try match at specific point
2824 STATIC I32 /* 0 failure, 1 success */
2825 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2829 REGEXP *const rx = reginfo->prog;
2830 regexp *const prog = ReANY(rx);
2832 RXi_GET_DECL(prog,progi);
2833 GET_RE_DEBUG_FLAGS_DECL;
2835 PERL_ARGS_ASSERT_REGTRY;
2837 reginfo->cutpoint=NULL;
2840 PL_reg_starttry = *startposp;
2842 prog->offs[0].start = *startposp - reginfo->strbeg;
2843 prog->lastparen = 0;
2844 prog->lastcloseparen = 0;
2846 /* XXXX What this code is doing here?!!! There should be no need
2847 to do this again and again, prog->lastparen should take care of
2850 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2851 * Actually, the code in regcppop() (which Ilya may be meaning by
2852 * prog->lastparen), is not needed at all by the test suite
2853 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2854 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2855 * Meanwhile, this code *is* needed for the
2856 * above-mentioned test suite tests to succeed. The common theme
2857 * on those tests seems to be returning null fields from matches.
2858 * --jhi updated by dapm */
2860 if (prog->nparens) {
2861 regexp_paren_pair *pp = prog->offs;
2863 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2871 result = regmatch(reginfo, *startposp, progi->program + 1);
2873 prog->offs[0].end = result;
2876 if (reginfo->cutpoint)
2877 *startposp= reginfo->cutpoint;
2878 REGCP_UNWIND(lastcp);
2883 #define sayYES goto yes
2884 #define sayNO goto no
2885 #define sayNO_SILENT goto no_silent
2887 /* we dont use STMT_START/END here because it leads to
2888 "unreachable code" warnings, which are bogus, but distracting. */
2889 #define CACHEsayNO \
2890 if (ST.cache_mask) \
2891 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2894 /* this is used to determine how far from the left messages like
2895 'failed...' are printed. It should be set such that messages
2896 are inline with the regop output that created them.
2898 #define REPORT_CODE_OFF 32
2901 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2902 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2903 #define CHRTEST_NOT_A_CP_1 -999
2904 #define CHRTEST_NOT_A_CP_2 -998
2906 /* grab a new slab and return the first slot in it */
2908 STATIC regmatch_state *
2911 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2914 regmatch_slab *s = PL_regmatch_slab->next;
2916 Newx(s, 1, regmatch_slab);
2917 s->prev = PL_regmatch_slab;
2919 PL_regmatch_slab->next = s;
2921 PL_regmatch_slab = s;
2922 return SLAB_FIRST(s);
2926 /* push a new state then goto it */
2928 #define PUSH_STATE_GOTO(state, node, input) \
2929 pushinput = input; \
2931 st->resume_state = state; \
2934 /* push a new state with success backtracking, then goto it */
2936 #define PUSH_YES_STATE_GOTO(state, node, input) \
2937 pushinput = input; \
2939 st->resume_state = state; \
2940 goto push_yes_state;
2947 regmatch() - main matching routine
2949 This is basically one big switch statement in a loop. We execute an op,
2950 set 'next' to point the next op, and continue. If we come to a point which
2951 we may need to backtrack to on failure such as (A|B|C), we push a
2952 backtrack state onto the backtrack stack. On failure, we pop the top
2953 state, and re-enter the loop at the state indicated. If there are no more
2954 states to pop, we return failure.
2956 Sometimes we also need to backtrack on success; for example /A+/, where
2957 after successfully matching one A, we need to go back and try to
2958 match another one; similarly for lookahead assertions: if the assertion
2959 completes successfully, we backtrack to the state just before the assertion
2960 and then carry on. In these cases, the pushed state is marked as
2961 'backtrack on success too'. This marking is in fact done by a chain of
2962 pointers, each pointing to the previous 'yes' state. On success, we pop to
2963 the nearest yes state, discarding any intermediate failure-only states.
2964 Sometimes a yes state is pushed just to force some cleanup code to be
2965 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2966 it to free the inner regex.
2968 Note that failure backtracking rewinds the cursor position, while
2969 success backtracking leaves it alone.
2971 A pattern is complete when the END op is executed, while a subpattern
2972 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2973 ops trigger the "pop to last yes state if any, otherwise return true"
2976 A common convention in this function is to use A and B to refer to the two
2977 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2978 the subpattern to be matched possibly multiple times, while B is the entire
2979 rest of the pattern. Variable and state names reflect this convention.
2981 The states in the main switch are the union of ops and failure/success of
2982 substates associated with with that op. For example, IFMATCH is the op
2983 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2984 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2985 successfully matched A and IFMATCH_A_fail is a state saying that we have
2986 just failed to match A. Resume states always come in pairs. The backtrack
2987 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2988 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2989 on success or failure.
2991 The struct that holds a backtracking state is actually a big union, with
2992 one variant for each major type of op. The variable st points to the
2993 top-most backtrack struct. To make the code clearer, within each
2994 block of code we #define ST to alias the relevant union.
2996 Here's a concrete example of a (vastly oversimplified) IFMATCH
3002 #define ST st->u.ifmatch
3004 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3005 ST.foo = ...; // some state we wish to save
3007 // push a yes backtrack state with a resume value of
3008 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3010 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3013 case IFMATCH_A: // we have successfully executed A; now continue with B
3015 bar = ST.foo; // do something with the preserved value
3018 case IFMATCH_A_fail: // A failed, so the assertion failed
3019 ...; // do some housekeeping, then ...
3020 sayNO; // propagate the failure
3027 For any old-timers reading this who are familiar with the old recursive
3028 approach, the code above is equivalent to:
3030 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3039 ...; // do some housekeeping, then ...
3040 sayNO; // propagate the failure
3043 The topmost backtrack state, pointed to by st, is usually free. If you
3044 want to claim it, populate any ST.foo fields in it with values you wish to
3045 save, then do one of
3047 PUSH_STATE_GOTO(resume_state, node, newinput);
3048 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3050 which sets that backtrack state's resume value to 'resume_state', pushes a
3051 new free entry to the top of the backtrack stack, then goes to 'node'.
3052 On backtracking, the free slot is popped, and the saved state becomes the
3053 new free state. An ST.foo field in this new top state can be temporarily
3054 accessed to retrieve values, but once the main loop is re-entered, it
3055 becomes available for reuse.
3057 Note that the depth of the backtrack stack constantly increases during the
3058 left-to-right execution of the pattern, rather than going up and down with
3059 the pattern nesting. For example the stack is at its maximum at Z at the
3060 end of the pattern, rather than at X in the following:
3062 /(((X)+)+)+....(Y)+....Z/
3064 The only exceptions to this are lookahead/behind assertions and the cut,
3065 (?>A), which pop all the backtrack states associated with A before
3068 Backtrack state structs are allocated in slabs of about 4K in size.
3069 PL_regmatch_state and st always point to the currently active state,
3070 and PL_regmatch_slab points to the slab currently containing
3071 PL_regmatch_state. The first time regmatch() is called, the first slab is
3072 allocated, and is never freed until interpreter destruction. When the slab
3073 is full, a new one is allocated and chained to the end. At exit from
3074 regmatch(), slabs allocated since entry are freed.
3079 #define DEBUG_STATE_pp(pp) \
3081 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3082 PerlIO_printf(Perl_debug_log, \
3083 " %*s"pp" %s%s%s%s%s\n", \
3085 PL_reg_name[st->resume_state], \
3086 ((st==yes_state||st==mark_state) ? "[" : ""), \
3087 ((st==yes_state) ? "Y" : ""), \
3088 ((st==mark_state) ? "M" : ""), \
3089 ((st==yes_state||st==mark_state) ? "]" : "") \
3094 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3099 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3100 const char *start, const char *end, const char *blurb)
3102 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3104 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3109 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3110 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3112 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3113 start, end - start, 60);
3115 PerlIO_printf(Perl_debug_log,
3116 "%s%s REx%s %s against %s\n",
3117 PL_colors[4], blurb, PL_colors[5], s0, s1);
3119 if (utf8_target||utf8_pat)
3120 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3121 utf8_pat ? "pattern" : "",
3122 utf8_pat && utf8_target ? " and " : "",
3123 utf8_target ? "string" : ""
3129 S_dump_exec_pos(pTHX_ const char *locinput,
3130 const regnode *scan,
3131 const char *loc_regeol,
3132 const char *loc_bostr,
3133 const char *loc_reg_starttry,
3134 const bool utf8_target)
3136 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3137 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3138 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3139 /* The part of the string before starttry has one color
3140 (pref0_len chars), between starttry and current
3141 position another one (pref_len - pref0_len chars),
3142 after the current position the third one.
3143 We assume that pref0_len <= pref_len, otherwise we
3144 decrease pref0_len. */
3145 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3146 ? (5 + taill) - l : locinput - loc_bostr;
3149 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3151 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3153 pref0_len = pref_len - (locinput - loc_reg_starttry);
3154 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3155 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3156 ? (5 + taill) - pref_len : loc_regeol - locinput);
3157 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3161 if (pref0_len > pref_len)
3162 pref0_len = pref_len;
3164 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3166 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3167 (locinput - pref_len),pref0_len, 60, 4, 5);
3169 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3170 (locinput - pref_len + pref0_len),
3171 pref_len - pref0_len, 60, 2, 3);
3173 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3174 locinput, loc_regeol - locinput, 10, 0, 1);
3176 const STRLEN tlen=len0+len1+len2;
3177 PerlIO_printf(Perl_debug_log,
3178 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3179 (IV)(locinput - loc_bostr),
3182 (docolor ? "" : "> <"),
3184 (int)(tlen > 19 ? 0 : 19 - tlen),
3191 /* reg_check_named_buff_matched()
3192 * Checks to see if a named buffer has matched. The data array of
3193 * buffer numbers corresponding to the buffer is expected to reside
3194 * in the regexp->data->data array in the slot stored in the ARG() of
3195 * node involved. Note that this routine doesn't actually care about the
3196 * name, that information is not preserved from compilation to execution.
3197 * Returns the index of the leftmost defined buffer with the given name
3198 * or 0 if non of the buffers matched.
3201 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3204 RXi_GET_DECL(rex,rexi);
3205 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3206 I32 *nums=(I32*)SvPVX(sv_dat);
3208 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3210 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3211 if ((I32)rex->lastparen >= nums[n] &&
3212 rex->offs[nums[n]].end != -1)
3221 /* free all slabs above current one - called during LEAVE_SCOPE */
3224 S_clear_backtrack_stack(pTHX_ void *p)
3226 regmatch_slab *s = PL_regmatch_slab->next;
3231 PL_regmatch_slab->next = NULL;
3233 regmatch_slab * const osl = s;
3241 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3242 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3244 /* This function determines if there are one or two characters that match
3245 * the first character of the passed-in EXACTish node <text_node>, and if
3246 * so, returns them in the passed-in pointers.
3248 * If it determines that no possible character in the target string can
3249 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3250 * the first character in <text_node> requires UTF-8 to represent, and the
3251 * target string isn't in UTF-8.)
3253 * If there are more than two characters that could match the beginning of
3254 * <text_node>, or if more context is required to determine a match or not,
3255 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3257 * The motiviation behind this function is to allow the caller to set up
3258 * tight loops for matching. If <text_node> is of type EXACT, there is
3259 * only one possible character that can match its first character, and so
3260 * the situation is quite simple. But things get much more complicated if
3261 * folding is involved. It may be that the first character of an EXACTFish
3262 * node doesn't participate in any possible fold, e.g., punctuation, so it
3263 * can be matched only by itself. The vast majority of characters that are
3264 * in folds match just two things, their lower and upper-case equivalents.
3265 * But not all are like that; some have multiple possible matches, or match
3266 * sequences of more than one character. This function sorts all that out.
3268 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3269 * loop of trying to match A*, we know we can't exit where the thing
3270 * following it isn't a B. And something can't be a B unless it is the
3271 * beginning of B. By putting a quick test for that beginning in a tight
3272 * loop, we can rule out things that can't possibly be B without having to
3273 * break out of the loop, thus avoiding work. Similarly, if A is a single
3274 * character, we can make a tight loop matching A*, using the outputs of
3277 * If the target string to match isn't in UTF-8, and there aren't
3278 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3279 * the one or two possible octets (which are characters in this situation)
3280 * that can match. In all cases, if there is only one character that can
3281 * match, *<c1p> and *<c2p> will be identical.
3283 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3284 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3285 * can match the beginning of <text_node>. They should be declared with at
3286 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3287 * undefined what these contain.) If one or both of the buffers are
3288 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3289 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3290 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3291 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3292 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3294 const bool utf8_target = reginfo->is_utf8_target;
3296 UV c1 = CHRTEST_NOT_A_CP_1;
3297 UV c2 = CHRTEST_NOT_A_CP_2;
3298 bool use_chrtest_void = FALSE;
3299 const bool is_utf8_pat = reginfo->is_utf8_pat;
3301 /* Used when we have both utf8 input and utf8 output, to avoid converting
3302 * to/from code points */
3303 bool utf8_has_been_setup = FALSE;
3307 U8 *pat = (U8*)STRING(text_node);
3309 if (OP(text_node) == EXACT) {
3311 /* In an exact node, only one thing can be matched, that first
3312 * character. If both the pat and the target are UTF-8, we can just
3313 * copy the input to the output, avoiding finding the code point of
3318 else if (utf8_target) {
3319 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3320 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3321 utf8_has_been_setup = TRUE;
3324 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3327 else /* an EXACTFish node */
3329 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3330 pat + STR_LEN(text_node)))
3332 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3333 pat + STR_LEN(text_node))))
3335 /* Multi-character folds require more context to sort out. Also
3336 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3337 * handled outside this routine */
3338 use_chrtest_void = TRUE;
3340 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3341 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3343 /* Load the folds hash, if not already done */
3345 if (! PL_utf8_foldclosures) {
3346 if (! PL_utf8_tofold) {
3347 U8 dummy[UTF8_MAXBYTES+1];
3349 /* Force loading this by folding an above-Latin1 char */
3350 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3351 assert(PL_utf8_tofold); /* Verify that worked */
3353 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3356 /* The fold closures data structure is a hash with the keys being
3357 * the UTF-8 of every character that is folded to, like 'k', and
3358 * the values each an array of all code points that fold to its
3359 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3361 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3366 /* Not found in the hash, therefore there are no folds
3367 * containing it, so there is only a single character that
3371 else { /* Does participate in folds */
3372 AV* list = (AV*) *listp;
3373 if (av_len(list) != 1) {
3375 /* If there aren't exactly two folds to this, it is outside
3376 * the scope of this function */
3377 use_chrtest_void = TRUE;
3379 else { /* There are two. Get them */
3380 SV** c_p = av_fetch(list, 0, FALSE);
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3386 c_p = av_fetch(list, 1, FALSE);
3388 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3392 /* Folds that cross the 255/256 boundary are forbidden if
3393 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3394 * pattern character is above 256, and its only other match
3395 * is below 256, the only legal match will be to itself.
3396 * We have thrown away the original, so have to compute
3397 * which is the one above 255 */
3398 if ((c1 < 256) != (c2 < 256)) {
3399 if (OP(text_node) == EXACTFL
3400 || (OP(text_node) == EXACTFA
3401 && (isASCII(c1) || isASCII(c2))))
3414 else /* Here, c1 is < 255 */
3416 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3417 && OP(text_node) != EXACTFL
3418 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3420 /* Here, there could be something above Latin1 in the target which
3421 * folds to this character in the pattern. All such cases except
3422 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3423 * involved in their folds, so are outside the scope of this
3425 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3426 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3429 use_chrtest_void = TRUE;
3432 else { /* Here nothing above Latin1 can fold to the pattern character */
3433 switch (OP(text_node)) {
3435 case EXACTFL: /* /l rules */
3436 c2 = PL_fold_locale[c1];
3440 if (! utf8_target) { /* /d rules */
3445 /* /u rules for all these. This happens to work for
3446 * EXACTFA as nothing in Latin1 folds to ASCII */
3448 case EXACTFU_TRICKYFOLD:
3451 c2 = PL_fold_latin1[c1];
3455 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3456 assert(0); /* NOTREACHED */
3461 /* Here have figured things out. Set up the returns */
3462 if (use_chrtest_void) {
3463 *c2p = *c1p = CHRTEST_VOID;
3465 else if (utf8_target) {
3466 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3467 uvchr_to_utf8(c1_utf8, c1);
3468 uvchr_to_utf8(c2_utf8, c2);
3471 /* Invariants are stored in both the utf8 and byte outputs; Use
3472 * negative numbers otherwise for the byte ones. Make sure that the
3473 * byte ones are the same iff the utf8 ones are the same */
3474 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3475 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3478 ? CHRTEST_NOT_A_CP_1
3479 : CHRTEST_NOT_A_CP_2;
3481 else if (c1 > 255) {
3482 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3487 *c1p = *c2p = c2; /* c2 is the only representable value */
3489 else { /* c1 is representable; see about c2 */
3491 *c2p = (c2 < 256) ? c2 : c1;
3497 /* returns -1 on failure, $+[0] on success */
3499 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3501 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3505 const bool utf8_target = reginfo->is_utf8_target;
3506 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3507 REGEXP *rex_sv = reginfo->prog;
3508 regexp *rex = ReANY(rex_sv);
3509 RXi_GET_DECL(rex,rexi);
3510 /* the current state. This is a cached copy of PL_regmatch_state */
3512 /* cache heavy used fields of st in registers */
3515 U32 n = 0; /* general value; init to avoid compiler warning */
3516 I32 ln = 0; /* len or last; init to avoid compiler warning */
3517 char *locinput = startpos;
3518 char *pushinput; /* where to continue after a PUSH */
3519 I32 nextchr; /* is always set to UCHARAT(locinput) */
3521 bool result = 0; /* return value of S_regmatch */
3522 int depth = 0; /* depth of backtrack stack */
3523 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3524 const U32 max_nochange_depth =
3525 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3526 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3527 regmatch_state *yes_state = NULL; /* state to pop to on success of
3529 /* mark_state piggy backs on the yes_state logic so that when we unwind
3530 the stack on success we can update the mark_state as we go */
3531 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3532 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3533 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3535 bool no_final = 0; /* prevent failure from backtracking? */
3536 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3537 char *startpoint = locinput;
3538 SV *popmark = NULL; /* are we looking for a mark? */
3539 SV *sv_commit = NULL; /* last mark name seen in failure */
3540 SV *sv_yes_mark = NULL; /* last mark name we have seen
3541 during a successful match */
3542 U32 lastopen = 0; /* last open we saw */
3543 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3544 SV* const oreplsv = GvSV(PL_replgv);
3545 /* these three flags are set by various ops to signal information to
3546 * the very next op. They have a useful lifetime of exactly one loop
3547 * iteration, and are not preserved or restored by state pushes/pops
3549 bool sw = 0; /* the condition value in (?(cond)a|b) */
3550 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3551 int logical = 0; /* the following EVAL is:
3555 or the following IFMATCH/UNLESSM is:
3556 false: plain (?=foo)
3557 true: used as a condition: (?(?=foo))
3559 PAD* last_pad = NULL;
3561 I32 gimme = G_SCALAR;
3562 CV *caller_cv = NULL; /* who called us */
3563 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3564 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3565 U32 maxopenparen = 0; /* max '(' index seen so far */
3566 int to_complement; /* Invert the result? */
3567 _char_class_number classnum;
3568 bool is_utf8_pat = reginfo->is_utf8_pat;
3571 GET_RE_DEBUG_FLAGS_DECL;
3574 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3575 multicall_oldcatch = 0;
3576 multicall_cv = NULL;
3578 PERL_UNUSED_VAR(multicall_cop);
3579 PERL_UNUSED_VAR(newsp);
3582 PERL_ARGS_ASSERT_REGMATCH;
3584 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3585 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3588 /* grab next free state slot */
3589 st = ++PL_regmatch_state;
3590 if (st > SLAB_LAST(PL_regmatch_slab))
3591 st = PL_regmatch_state = S_push_slab(aTHX);
3593 /* Note that nextchr is a byte even in UTF */
3596 while (scan != NULL) {
3599 SV * const prop = sv_newmortal();
3600 regnode *rnext=regnext(scan);
3601 DUMP_EXEC_POS( locinput, scan, utf8_target );
3602 regprop(rex, prop, scan);
3604 PerlIO_printf(Perl_debug_log,
3605 "%3"IVdf":%*s%s(%"IVdf")\n",
3606 (IV)(scan - rexi->program), depth*2, "",
3608 (PL_regkind[OP(scan)] == END || !rnext) ?
3609 0 : (IV)(rnext - rexi->program));
3612 next = scan + NEXT_OFF(scan);
3615 state_num = OP(scan);
3621 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3623 switch (state_num) {
3624 case BOL: /* /^../ */
3625 if (locinput == reginfo->strbeg)
3629 case MBOL: /* /^../m */
3630 if (locinput == reginfo->strbeg ||
3631 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3637 case SBOL: /* /^../s */
3638 if (locinput == reginfo->strbeg)
3643 if (locinput == reginfo->ganch)
3647 case KEEPS: /* \K */
3648 /* update the startpoint */
3649 st->u.keeper.val = rex->offs[0].start;
3650 rex->offs[0].start = locinput - reginfo->strbeg;
3651 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3652 assert(0); /*NOTREACHED*/
3653 case KEEPS_next_fail:
3654 /* rollback the start point change */
3655 rex->offs[0].start = st->u.keeper.val;
3657 assert(0); /*NOTREACHED*/
3659 case EOL: /* /..$/ */
3662 case MEOL: /* /..$/m */
3663 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3667 case SEOL: /* /..$/s */
3669 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3671 if (reginfo->strend - locinput > 1)
3676 if (!NEXTCHR_IS_EOS)
3680 case SANY: /* /./s */
3683 goto increment_locinput;
3691 case REG_ANY: /* /./ */
3692 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3694 goto increment_locinput;
3698 #define ST st->u.trie
3699 case TRIEC: /* (ab|cd) with known charclass */
3700 /* In this case the charclass data is available inline so
3701 we can fail fast without a lot of extra overhead.
3703 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3705 PerlIO_printf(Perl_debug_log,
3706 "%*s %sfailed to match trie start class...%s\n",
3707 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3710 assert(0); /* NOTREACHED */
3713 case TRIE: /* (ab|cd) */
3714 /* the basic plan of execution of the trie is:
3715 * At the beginning, run though all the states, and
3716 * find the longest-matching word. Also remember the position
3717 * of the shortest matching word. For example, this pattern:
3720 * when matched against the string "abcde", will generate
3721 * accept states for all words except 3, with the longest
3722 * matching word being 4, and the shortest being 2 (with
3723 * the position being after char 1 of the string).
3725 * Then for each matching word, in word order (i.e. 1,2,4,5),
3726 * we run the remainder of the pattern; on each try setting
3727 * the current position to the character following the word,
3728 * returning to try the next word on failure.
3730 * We avoid having to build a list of words at runtime by
3731 * using a compile-time structure, wordinfo[].prev, which
3732 * gives, for each word, the previous accepting word (if any).
3733 * In the case above it would contain the mappings 1->2, 2->0,
3734 * 3->0, 4->5, 5->1. We can use this table to generate, from
3735 * the longest word (4 above), a list of all words, by
3736 * following the list of prev pointers; this gives us the
3737 * unordered list 4,5,1,2. Then given the current word we have
3738 * just tried, we can go through the list and find the
3739 * next-biggest word to try (so if we just failed on word 2,
3740 * the next in the list is 4).
3742 * Since at runtime we don't record the matching position in
3743 * the string for each word, we have to work that out for
3744 * each word we're about to process. The wordinfo table holds
3745 * the character length of each word; given that we recorded
3746 * at the start: the position of the shortest word and its
3747 * length in chars, we just need to move the pointer the
3748 * difference between the two char lengths. Depending on
3749 * Unicode status and folding, that's cheap or expensive.
3751 * This algorithm is optimised for the case where are only a
3752 * small number of accept states, i.e. 0,1, or maybe 2.
3753 * With lots of accepts states, and having to try all of them,
3754 * it becomes quadratic on number of accept states to find all
3759 /* what type of TRIE am I? (utf8 makes this contextual) */
3760 DECL_TRIE_TYPE(scan);
3762 /* what trie are we using right now */
3763 reg_trie_data * const trie
3764 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3765 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3766 U32 state = trie->startstate;
3769 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3771 if (trie->states[ state ].wordnum) {
3773 PerlIO_printf(Perl_debug_log,
3774 "%*s %smatched empty string...%s\n",
3775 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3781 PerlIO_printf(Perl_debug_log,
3782 "%*s %sfailed to match trie start class...%s\n",
3783 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3790 U8 *uc = ( U8* )locinput;
3794 U8 *uscan = (U8*)NULL;
3795 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3796 U32 charcount = 0; /* how many input chars we have matched */
3797 U32 accepted = 0; /* have we seen any accepting states? */
3799 ST.jump = trie->jump;
3802 ST.longfold = FALSE; /* char longer if folded => it's harder */
3805 /* fully traverse the TRIE; note the position of the
3806 shortest accept state and the wordnum of the longest
3809 while ( state && uc <= (U8*)(reginfo->strend) ) {
3810 U32 base = trie->states[ state ].trans.base;
3814 wordnum = trie->states[ state ].wordnum;
3816 if (wordnum) { /* it's an accept state */
3819 /* record first match position */
3821 ST.firstpos = (U8*)locinput;
3826 ST.firstchars = charcount;
3829 if (!ST.nextword || wordnum < ST.nextword)
3830 ST.nextword = wordnum;
3831 ST.topword = wordnum;
3834 DEBUG_TRIE_EXECUTE_r({
3835 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3836 PerlIO_printf( Perl_debug_log,
3837 "%*s %sState: %4"UVxf" Accepted: %c ",
3838 2+depth * 2, "", PL_colors[4],
3839 (UV)state, (accepted ? 'Y' : 'N'));
3842 /* read a char and goto next state */
3843 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3845 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3846 uscan, len, uvc, charid, foldlen,
3853 base + charid - 1 - trie->uniquecharcount)) >= 0)
3855 && ((U32)offset < trie->lasttrans)
3856 && trie->trans[offset].check == state)
3858 state = trie->trans[offset].next;
3869 DEBUG_TRIE_EXECUTE_r(
3870 PerlIO_printf( Perl_debug_log,
3871 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3872 charid, uvc, (UV)state, PL_colors[5] );
3878 /* calculate total number of accept states */
3883 w = trie->wordinfo[w].prev;
3886 ST.accepted = accepted;
3890 PerlIO_printf( Perl_debug_log,
3891 "%*s %sgot %"IVdf" possible matches%s\n",
3892 REPORT_CODE_OFF + depth * 2, "",
3893 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3895 goto trie_first_try; /* jump into the fail handler */
3897 assert(0); /* NOTREACHED */
3899 case TRIE_next_fail: /* we failed - try next alternative */
3903 REGCP_UNWIND(ST.cp);
3904 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3906 if (!--ST.accepted) {
3908 PerlIO_printf( Perl_debug_log,
3909 "%*s %sTRIE failed...%s\n",
3910 REPORT_CODE_OFF+depth*2, "",
3917 /* Find next-highest word to process. Note that this code
3918 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3921 U16 const nextword = ST.nextword;
3922 reg_trie_wordinfo * const wordinfo
3923 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3924 for (word=ST.topword; word; word=wordinfo[word].prev) {
3925 if (word > nextword && (!min || word < min))
3938 ST.lastparen = rex->lastparen;
3939 ST.lastcloseparen = rex->lastcloseparen;
3943 /* find start char of end of current word */
3945 U32 chars; /* how many chars to skip */
3946 reg_trie_data * const trie
3947 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3949 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3951 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3956 /* the hard option - fold each char in turn and find
3957 * its folded length (which may be different */
3958 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3966 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3974 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3979 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3995 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3996 ? ST.jump[ST.nextword]
4000 PerlIO_printf( Perl_debug_log,
4001 "%*s %sTRIE matched word #%d, continuing%s\n",
4002 REPORT_CODE_OFF+depth*2, "",
4009 if (ST.accepted > 1 || has_cutgroup) {
4010 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4011 assert(0); /* NOTREACHED */
4013 /* only one choice left - just continue */
4015 AV *const trie_words
4016 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4017 SV ** const tmp = av_fetch( trie_words,
4019 SV *sv= tmp ? sv_newmortal() : NULL;
4021 PerlIO_printf( Perl_debug_log,
4022 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4023 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4025 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4026 PL_colors[0], PL_colors[1],
4027 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4029 : "not compiled under -Dr",
4033 locinput = (char*)uc;
4034 continue; /* execute rest of RE */
4035 assert(0); /* NOTREACHED */
4039 case EXACT: { /* /abc/ */
4040 char *s = STRING(scan);
4042 if (utf8_target != is_utf8_pat) {
4043 /* The target and the pattern have differing utf8ness. */
4045 const char * const e = s + ln;
4048 /* The target is utf8, the pattern is not utf8.
4049 * Above-Latin1 code points can't match the pattern;
4050 * invariants match exactly, and the other Latin1 ones need
4051 * to be downgraded to a single byte in order to do the
4052 * comparison. (If we could be confident that the target
4053 * is not malformed, this could be refactored to have fewer
4054 * tests by just assuming that if the first bytes match, it
4055 * is an invariant, but there are tests in the test suite
4056 * dealing with (??{...}) which violate this) */
4058 if (l >= reginfo->strend
4059 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4063 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4070 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4079 /* The target is not utf8, the pattern is utf8. */
4081 if (l >= reginfo->strend
4082 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4086 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4093 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4104 /* The target and the pattern have the same utf8ness. */
4105 /* Inline the first character, for speed. */
4106 if (reginfo->strend - locinput < ln
4107 || UCHARAT(s) != nextchr
4108 || (ln > 1 && memNE(s, locinput, ln)))
4117 case EXACTFL: { /* /abc/il */
4119 const U8 * fold_array;
4121 U32 fold_utf8_flags;
4123 RX_MATCH_TAINTED_on(reginfo->prog);
4124 folder = foldEQ_locale;
4125 fold_array = PL_fold_locale;
4126 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4129 case EXACTFU_SS: /* /\x{df}/iu */
4130 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4131 case EXACTFU: /* /abc/iu */
4132 folder = foldEQ_latin1;
4133 fold_array = PL_fold_latin1;
4134 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4137 case EXACTFA: /* /abc/iaa */
4138 folder = foldEQ_latin1;
4139 fold_array = PL_fold_latin1;
4140 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4143 case EXACTF: /* /abc/i */
4145 fold_array = PL_fold;
4146 fold_utf8_flags = 0;
4152 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4153 /* Either target or the pattern are utf8, or has the issue where
4154 * the fold lengths may differ. */
4155 const char * const l = locinput;
4156 char *e = reginfo->strend;
4158 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4159 l, &e, 0, utf8_target, fold_utf8_flags))
4167 /* Neither the target nor the pattern are utf8 */
4168 if (UCHARAT(s) != nextchr
4170 && UCHARAT(s) != fold_array[nextchr])
4174 if (reginfo->strend - locinput < ln)
4176 if (ln > 1 && ! folder(s, locinput, ln))
4182 /* XXX Could improve efficiency by separating these all out using a
4183 * macro or in-line function. At that point regcomp.c would no longer
4184 * have to set the FLAGS fields of these */
4185 case BOUNDL: /* /\b/l */
4186 case NBOUNDL: /* /\B/l */
4187 RX_MATCH_TAINTED_on(reginfo->prog);
4189 case BOUND: /* /\b/ */
4190 case BOUNDU: /* /\b/u */
4191 case BOUNDA: /* /\b/a */
4192 case NBOUND: /* /\B/ */
4193 case NBOUNDU: /* /\B/u */
4194 case NBOUNDA: /* /\B/a */
4195 /* was last char in word? */
4197 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4198 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4200 if (locinput == reginfo->strbeg)
4203 const U8 * const r =
4204 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4206 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4208 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4209 ln = isWORDCHAR_uni(ln);
4213 LOAD_UTF8_CHARCLASS_ALNUM();
4214 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4219 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4220 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4225 /* Here the string isn't utf8, or is utf8 and only ascii
4226 * characters are to match \w. In the latter case looking at
4227 * the byte just prior to the current one may be just the final
4228 * byte of a multi-byte character. This is ok. There are two
4230 * 1) it is a single byte character, and then the test is doing
4231 * just what it's supposed to.
4232 * 2) it is a multi-byte character, in which case the final
4233 * byte is never mistakable for ASCII, and so the test
4234 * will say it is not a word character, which is the
4235 * correct answer. */
4236 ln = (locinput != reginfo->strbeg) ?
4237 UCHARAT(locinput - 1) : '\n';
4238 switch (FLAGS(scan)) {
4239 case REGEX_UNICODE_CHARSET:
4240 ln = isWORDCHAR_L1(ln);
4241 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4243 case REGEX_LOCALE_CHARSET:
4244 ln = isWORDCHAR_LC(ln);
4245 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4247 case REGEX_DEPENDS_CHARSET:
4248 ln = isWORDCHAR(ln);
4249 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4251 case REGEX_ASCII_RESTRICTED_CHARSET:
4252 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4253 ln = isWORDCHAR_A(ln);
4254 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4257 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4261 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4263 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4267 case ANYOF: /* /[abc]/ */
4268 case ANYOF_WARN_SUPER:
4272 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4274 locinput += UTF8SKIP(locinput);
4277 if (!REGINCLASS(rex, scan, (U8*)locinput))
4283 /* The argument (FLAGS) to all the POSIX node types is the class number
4286 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4290 case POSIXL: /* \w or [:punct:] etc. under /l */
4294 /* The locale hasn't influenced the outcome before this, so defer
4295 * tainting until now */
4296 RX_MATCH_TAINTED_on(reginfo->prog);
4298 /* Use isFOO_lc() for characters within Latin1. (Note that
4299 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4300 * wouldn't be invariant) */
4301 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4302 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4306 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4307 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4308 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4309 *(locinput + 1))))))
4314 else { /* Here, must be an above Latin-1 code point */
4315 goto utf8_posix_not_eos;
4318 /* Here, must be utf8 */
4319 locinput += UTF8SKIP(locinput);
4322 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4326 case POSIXD: /* \w or [:punct:] etc. under /d */
4332 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4334 if (NEXTCHR_IS_EOS) {
4338 /* All UTF-8 variants match */
4339 if (! UTF8_IS_INVARIANT(nextchr)) {
4340 goto increment_locinput;
4346 case POSIXA: /* \w or [:punct:] etc. under /a */
4349 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4350 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4351 * character is a single byte */
4354 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4360 /* Here we are either not in utf8, or we matched a utf8-invariant,
4361 * so the next char is the next byte */
4365 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4369 case POSIXU: /* \w or [:punct:] etc. under /u */
4371 if (NEXTCHR_IS_EOS) {
4376 /* Use _generic_isCC() for characters within Latin1. (Note that
4377 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4378 * wouldn't be invariant) */
4379 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4380 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4387 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4388 if (! (to_complement
4389 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4397 else { /* Handle above Latin-1 code points */
4398 classnum = (_char_class_number) FLAGS(scan);
4399 if (classnum < _FIRST_NON_SWASH_CC) {
4401 /* Here, uses a swash to find such code points. Load if if
4402 * not done already */
4403 if (! PL_utf8_swash_ptrs[classnum]) {
4404 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4405 PL_utf8_swash_ptrs[classnum]
4406 = _core_swash_init("utf8",
4407 swash_property_names[classnum],
4408 &PL_sv_undef, 1, 0, NULL, &flags);
4410 if (! (to_complement
4411 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4412 (U8 *) locinput, TRUE))))
4417 else { /* Here, uses macros to find above Latin-1 code points */
4419 case _CC_ENUM_SPACE: /* XXX would require separate
4420 code if we revert the change
4421 of \v matching this */
4422 case _CC_ENUM_PSXSPC:
4423 if (! (to_complement
4424 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4429 case _CC_ENUM_BLANK:
4430 if (! (to_complement
4431 ^ cBOOL(is_HORIZWS_high(locinput))))
4436 case _CC_ENUM_XDIGIT:
4437 if (! (to_complement
4438 ^ cBOOL(is_XDIGIT_high(locinput))))
4443 case _CC_ENUM_VERTSPACE:
4444 if (! (to_complement
4445 ^ cBOOL(is_VERTWS_high(locinput))))
4450 default: /* The rest, e.g. [:cntrl:], can't match
4452 if (! to_complement) {
4458 locinput += UTF8SKIP(locinput);
4462 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4463 a Unicode extended Grapheme Cluster */
4464 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4465 extended Grapheme Cluster is:
4468 | Prepend* Begin Extend*
4471 Begin is: ( Special_Begin | ! Control )
4472 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4473 Extend is: ( Grapheme_Extend | Spacing_Mark )
4474 Control is: [ GCB_Control | CR | LF ]
4475 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4477 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4480 Begin is ( Regular_Begin + Special Begin )
4482 It turns out that 98.4% of all Unicode code points match
4483 Regular_Begin. Doing it this way eliminates a table match in
4484 the previous implementation for almost all Unicode code points.
4486 There is a subtlety with Prepend* which showed up in testing.
4487 Note that the Begin, and only the Begin is required in:
4488 | Prepend* Begin Extend*
4489 Also, Begin contains '! Control'. A Prepend must be a
4490 '! Control', which means it must also be a Begin. What it
4491 comes down to is that if we match Prepend* and then find no
4492 suitable Begin afterwards, that if we backtrack the last
4493 Prepend, that one will be a suitable Begin.
4498 if (! utf8_target) {
4500 /* Match either CR LF or '.', as all the other possibilities
4502 locinput++; /* Match the . or CR */
4503 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4505 && locinput < reginfo->strend
4506 && UCHARAT(locinput) == '\n')
4513 /* Utf8: See if is ( CR LF ); already know that locinput <
4514 * reginfo->strend, so locinput+1 is in bounds */
4515 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4516 && UCHARAT(locinput + 1) == '\n')
4523 /* In case have to backtrack to beginning, then match '.' */
4524 char *starting = locinput;
4526 /* In case have to backtrack the last prepend */
4527 char *previous_prepend = NULL;
4529 LOAD_UTF8_CHARCLASS_GCB();
4531 /* Match (prepend)* */
4532 while (locinput < reginfo->strend
4533 && (len = is_GCB_Prepend_utf8(locinput)))
4535 previous_prepend = locinput;
4539 /* As noted above, if we matched a prepend character, but
4540 * the next thing won't match, back off the last prepend we
4541 * matched, as it is guaranteed to match the begin */
4542 if (previous_prepend
4543 && (locinput >= reginfo->strend
4544 || (! swash_fetch(PL_utf8_X_regular_begin,
4545 (U8*)locinput, utf8_target)
4546 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4549 locinput = previous_prepend;
4552 /* Note that here we know reginfo->strend > locinput, as we
4553 * tested that upon input to this switch case, and if we
4554 * moved locinput forward, we tested the result just above
4555 * and it either passed, or we backed off so that it will
4557 if (swash_fetch(PL_utf8_X_regular_begin,
4558 (U8*)locinput, utf8_target)) {
4559 locinput += UTF8SKIP(locinput);
4561 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4563 /* Here did not match the required 'Begin' in the
4564 * second term. So just match the very first
4565 * character, the '.' of the final term of the regex */
4566 locinput = starting + UTF8SKIP(starting);
4570 /* Here is a special begin. It can be composed of
4571 * several individual characters. One possibility is
4573 if ((len = is_GCB_RI_utf8(locinput))) {
4575 while (locinput < reginfo->strend
4576 && (len = is_GCB_RI_utf8(locinput)))
4580 } else if ((len = is_GCB_T_utf8(locinput))) {
4581 /* Another possibility is T+ */
4583 while (locinput < reginfo->strend
4584 && (len = is_GCB_T_utf8(locinput)))
4590 /* Here, neither RI+ nor T+; must be some other
4591 * Hangul. That means it is one of the others: L,
4592 * LV, LVT or V, and matches:
4593 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4596 while (locinput < reginfo->strend
4597 && (len = is_GCB_L_utf8(locinput)))
4602 /* Here, have exhausted L*. If the next character
4603 * is not an LV, LVT nor V, it means we had to have
4604 * at least one L, so matches L+ in the original
4605 * equation, we have a complete hangul syllable.
4608 if (locinput < reginfo->strend
4609 && is_GCB_LV_LVT_V_utf8(locinput))
4611 /* Otherwise keep going. Must be LV, LVT or V.
4612 * See if LVT, by first ruling out V, then LV */
4613 if (! is_GCB_V_utf8(locinput)
4614 /* All but every TCount one is LV */
4615 && (valid_utf8_to_uvchr((U8 *) locinput,
4620 locinput += UTF8SKIP(locinput);
4623 /* Must be V or LV. Take it, then match
4625 locinput += UTF8SKIP(locinput);
4626 while (locinput < reginfo->strend
4627 && (len = is_GCB_V_utf8(locinput)))
4633 /* And any of LV, LVT, or V can be followed
4635 while (locinput < reginfo->strend
4636 && (len = is_GCB_T_utf8(locinput)))
4644 /* Match any extender */
4645 while (locinput < reginfo->strend
4646 && swash_fetch(PL_utf8_X_extend,
4647 (U8*)locinput, utf8_target))
4649 locinput += UTF8SKIP(locinput);
4653 if (locinput > reginfo->strend) sayNO;
4657 case NREFFL: /* /\g{name}/il */
4658 { /* The capture buffer cases. The ones beginning with N for the
4659 named buffers just convert to the equivalent numbered and
4660 pretend they were called as the corresponding numbered buffer
4662 /* don't initialize these in the declaration, it makes C++
4667 const U8 *fold_array;
4670 RX_MATCH_TAINTED_on(reginfo->prog);
4671 folder = foldEQ_locale;
4672 fold_array = PL_fold_locale;
4674 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4677 case NREFFA: /* /\g{name}/iaa */
4678 folder = foldEQ_latin1;
4679 fold_array = PL_fold_latin1;
4681 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4684 case NREFFU: /* /\g{name}/iu */
4685 folder = foldEQ_latin1;
4686 fold_array = PL_fold_latin1;
4688 utf8_fold_flags = 0;
4691 case NREFF: /* /\g{name}/i */
4693 fold_array = PL_fold;
4695 utf8_fold_flags = 0;
4698 case NREF: /* /\g{name}/ */
4702 utf8_fold_flags = 0;
4705 /* For the named back references, find the corresponding buffer
4707 n = reg_check_named_buff_matched(rex,scan);
4712 goto do_nref_ref_common;
4714 case REFFL: /* /\1/il */
4715 RX_MATCH_TAINTED_on(reginfo->prog);
4716 folder = foldEQ_locale;
4717 fold_array = PL_fold_locale;
4718 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4721 case REFFA: /* /\1/iaa */
4722 folder = foldEQ_latin1;
4723 fold_array = PL_fold_latin1;
4724 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4727 case REFFU: /* /\1/iu */
4728 folder = foldEQ_latin1;
4729 fold_array = PL_fold_latin1;
4730 utf8_fold_flags = 0;
4733 case REFF: /* /\1/i */
4735 fold_array = PL_fold;
4736 utf8_fold_flags = 0;
4739 case REF: /* /\1/ */
4742 utf8_fold_flags = 0;
4746 n = ARG(scan); /* which paren pair */
4749 ln = rex->offs[n].start;
4750 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4751 if (rex->lastparen < n || ln == -1)
4752 sayNO; /* Do not match unless seen CLOSEn. */
4753 if (ln == rex->offs[n].end)
4756 s = reginfo->strbeg + ln;
4757 if (type != REF /* REF can do byte comparison */
4758 && (utf8_target || type == REFFU))
4759 { /* XXX handle REFFL better */
4760 char * limit = reginfo->strend;
4762 /* This call case insensitively compares the entire buffer
4763 * at s, with the current input starting at locinput, but
4764 * not going off the end given by reginfo->strend, and
4765 * returns in <limit> upon success, how much of the
4766 * current input was matched */
4767 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4768 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4776 /* Not utf8: Inline the first character, for speed. */
4777 if (!NEXTCHR_IS_EOS &&
4778 UCHARAT(s) != nextchr &&
4780 UCHARAT(s) != fold_array[nextchr]))
4782 ln = rex->offs[n].end - ln;
4783 if (locinput + ln > reginfo->strend)
4785 if (ln > 1 && (type == REF
4786 ? memNE(s, locinput, ln)
4787 : ! folder(s, locinput, ln)))
4793 case NOTHING: /* null op; e.g. the 'nothing' following
4794 * the '*' in m{(a+|b)*}' */
4796 case TAIL: /* placeholder while compiling (A|B|C) */
4799 case BACK: /* ??? doesn't appear to be used ??? */
4803 #define ST st->u.eval
4808 regexp_internal *rei;
4809 regnode *startpoint;
4811 case GOSTART: /* (?R) */
4812 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4813 if (cur_eval && cur_eval->locinput==locinput) {
4814 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4815 Perl_croak(aTHX_ "Infinite recursion in regex");
4816 if ( ++nochange_depth > max_nochange_depth )
4818 "Pattern subroutine nesting without pos change"
4819 " exceeded limit in regex");
4826 if (OP(scan)==GOSUB) {
4827 startpoint = scan + ARG2L(scan);
4828 ST.close_paren = ARG(scan);
4830 startpoint = rei->program+1;
4833 goto eval_recurse_doit;
4834 assert(0); /* NOTREACHED */
4836 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4837 if (cur_eval && cur_eval->locinput==locinput) {
4838 if ( ++nochange_depth > max_nochange_depth )
4839 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4844 /* execute the code in the {...} */
4848 OP * const oop = PL_op;
4849 COP * const ocurcop = PL_curcop;
4851 struct re_save_state saved_state;
4854 /* save *all* paren positions */
4855 regcppush(rex, 0, maxopenparen);
4856 REGCP_SET(runops_cp);
4858 /* To not corrupt the existing regex state while executing the
4859 * eval we would normally put it on the save stack, like with
4860 * save_re_context. However, re-evals have a weird scoping so we
4861 * can't just add ENTER/LEAVE here. With that, things like
4863 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4865 * would break, as they expect the localisation to be unwound
4866 * only when the re-engine backtracks through the bit that
4869 * What we do instead is just saving the state in a local c
4872 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4875 caller_cv = find_runcv(NULL);
4879 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4881 (REGEXP*)(rexi->data->data[n])
4884 nop = (OP*)rexi->data->data[n+1];
4886 else if (rexi->data->what[n] == 'l') { /* literal code */
4888 nop = (OP*)rexi->data->data[n];
4889 assert(CvDEPTH(newcv));
4892 /* literal with own CV */
4893 assert(rexi->data->what[n] == 'L');
4894 newcv = rex->qr_anoncv;
4895 nop = (OP*)rexi->data->data[n];
4898 /* normally if we're about to execute code from the same
4899 * CV that we used previously, we just use the existing
4900 * CX stack entry. However, its possible that in the
4901 * meantime we may have backtracked, popped from the save
4902 * stack, and undone the SAVECOMPPAD(s) associated with
4903 * PUSH_MULTICALL; in which case PL_comppad no longer
4904 * points to newcv's pad. */
4905 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4907 U8 flags = (CXp_SUB_RE |
4908 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4909 if (last_pushed_cv) {
4910 CHANGE_MULTICALL_FLAGS(newcv, flags);
4913 PUSH_MULTICALL_FLAGS(newcv, flags);
4915 last_pushed_cv = newcv;
4918 /* these assignments are just to silence compiler
4920 multicall_cop = NULL;
4923 last_pad = PL_comppad;
4925 /* the initial nextstate you would normally execute
4926 * at the start of an eval (which would cause error
4927 * messages to come from the eval), may be optimised
4928 * away from the execution path in the regex code blocks;
4929 * so manually set PL_curcop to it initially */
4931 OP *o = cUNOPx(nop)->op_first;
4932 assert(o->op_type == OP_NULL);
4933 if (o->op_targ == OP_SCOPE) {
4934 o = cUNOPo->op_first;
4937 assert(o->op_targ == OP_LEAVE);
4938 o = cUNOPo->op_first;
4939 assert(o->op_type == OP_ENTER);
4943 if (o->op_type != OP_STUB) {
4944 assert( o->op_type == OP_NEXTSTATE
4945 || o->op_type == OP_DBSTATE
4946 || (o->op_type == OP_NULL
4947 && ( o->op_targ == OP_NEXTSTATE
4948 || o->op_targ == OP_DBSTATE
4952 PL_curcop = (COP*)o;
4957 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4958 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4960 rex->offs[0].end = locinput - reginfo->strbeg;
4961 if (reginfo->info_aux_eval->pos_magic)
4962 reginfo->info_aux_eval->pos_magic->mg_len
4963 = locinput - reginfo->strbeg;
4966 SV *sv_mrk = get_sv("REGMARK", 1);
4967 sv_setsv(sv_mrk, sv_yes_mark);
4970 /* we don't use MULTICALL here as we want to call the
4971 * first op of the block of interest, rather than the
4972 * first op of the sub */
4973 before = (IV)(SP-PL_stack_base);
4975 CALLRUNOPS(aTHX); /* Scalar context. */
4977 if ((IV)(SP-PL_stack_base) == before)
4978 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4984 /* before restoring everything, evaluate the returned
4985 * value, so that 'uninit' warnings don't use the wrong
4986 * PL_op or pad. Also need to process any magic vars
4987 * (e.g. $1) *before* parentheses are restored */
4992 if (logical == 0) /* (?{})/ */
4993 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4994 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4995 sw = cBOOL(SvTRUE(ret));
4998 else { /* /(??{}) */
4999 /* if its overloaded, let the regex compiler handle
5000 * it; otherwise extract regex, or stringify */
5001 if (!SvAMAGIC(ret)) {
5005 if (SvTYPE(sv) == SVt_REGEXP)
5006 re_sv = (REGEXP*) sv;
5007 else if (SvSMAGICAL(sv)) {
5008 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5010 re_sv = (REGEXP *) mg->mg_obj;
5013 /* force any magic, undef warnings here */
5015 ret = sv_mortalcopy(ret);
5016 (void) SvPV_force_nolen(ret);
5022 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5024 /* *** Note that at this point we don't restore
5025 * PL_comppad, (or pop the CxSUB) on the assumption it may
5026 * be used again soon. This is safe as long as nothing
5027 * in the regexp code uses the pad ! */
5029 PL_curcop = ocurcop;
5030 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5036 /* only /(??{})/ from now on */
5039 /* extract RE object from returned value; compiling if
5043 re_sv = reg_temp_copy(NULL, re_sv);
5048 if (SvUTF8(ret) && IN_BYTES) {
5049 /* In use 'bytes': make a copy of the octet
5050 * sequence, but without the flag on */
5052 const char *const p = SvPV(ret, len);
5053 ret = newSVpvn_flags(p, len, SVs_TEMP);
5055 if (rex->intflags & PREGf_USE_RE_EVAL)
5056 pm_flags |= PMf_USE_RE_EVAL;
5058 /* if we got here, it should be an engine which
5059 * supports compiling code blocks and stuff */
5060 assert(rex->engine && rex->engine->op_comp);
5061 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5062 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5063 rex->engine, NULL, NULL,
5064 /* copy /msix etc to inner pattern */
5069 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5071 /* This isn't a first class regexp. Instead, it's
5072 caching a regexp onto an existing, Perl visible
5074 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5076 /* safe to do now that any $1 etc has been
5077 * interpolated into the new pattern string and
5079 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5084 RXp_MATCH_COPIED_off(re);
5085 re->subbeg = rex->subbeg;
5086 re->sublen = rex->sublen;
5087 re->suboffset = rex->suboffset;
5088 re->subcoffset = rex->subcoffset;
5091 debug_start_match(re_sv, utf8_target, locinput,
5092 reginfo->strend, "Matching embedded");
5094 startpoint = rei->program + 1;
5095 ST.close_paren = 0; /* only used for GOSUB */
5097 eval_recurse_doit: /* Share code with GOSUB below this line */
5098 /* run the pattern returned from (??{...}) */
5100 /* Save *all* the positions. */
5101 ST.cp = regcppush(rex, 0, maxopenparen);
5102 REGCP_SET(ST.lastcp);
5105 re->lastcloseparen = 0;
5109 /* XXXX This is too dramatic a measure... */
5110 reginfo->poscache_maxiter = 0;
5112 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5114 ST.prev_rex = rex_sv;
5115 ST.prev_curlyx = cur_curlyx;
5117 SET_reg_curpm(rex_sv);
5122 ST.prev_eval = cur_eval;
5124 /* now continue from first node in postoned RE */
5125 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5126 assert(0); /* NOTREACHED */
5129 case EVAL_AB: /* cleanup after a successful (??{A})B */
5130 /* note: this is called twice; first after popping B, then A */
5131 rex_sv = ST.prev_rex;
5132 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5133 SET_reg_curpm(rex_sv);
5134 rex = ReANY(rex_sv);
5135 rexi = RXi_GET(rex);
5137 cur_eval = ST.prev_eval;
5138 cur_curlyx = ST.prev_curlyx;
5140 /* XXXX This is too dramatic a measure... */
5141 reginfo->poscache_maxiter = 0;
5142 if ( nochange_depth )
5147 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5148 /* note: this is called twice; first after popping B, then A */
5149 rex_sv = ST.prev_rex;
5150 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5151 SET_reg_curpm(rex_sv);
5152 rex = ReANY(rex_sv);
5153 rexi = RXi_GET(rex);
5155 REGCP_UNWIND(ST.lastcp);
5156 regcppop(rex, &maxopenparen);
5157 cur_eval = ST.prev_eval;
5158 cur_curlyx = ST.prev_curlyx;
5159 /* XXXX This is too dramatic a measure... */
5160 reginfo->poscache_maxiter = 0;
5161 if ( nochange_depth )
5167 n = ARG(scan); /* which paren pair */
5168 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5169 if (n > maxopenparen)
5171 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5172 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5176 (IV)rex->offs[n].start_tmp,
5182 /* XXX really need to log other places start/end are set too */
5183 #define CLOSE_CAPTURE \
5184 rex->offs[n].start = rex->offs[n].start_tmp; \
5185 rex->offs[n].end = locinput - reginfo->strbeg; \
5186 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5187 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5189 PTR2UV(rex->offs), \
5191 (IV)rex->offs[n].start, \
5192 (IV)rex->offs[n].end \
5196 n = ARG(scan); /* which paren pair */
5198 if (n > rex->lastparen)
5200 rex->lastcloseparen = n;
5201 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5206 case ACCEPT: /* (*ACCEPT) */
5210 cursor && OP(cursor)!=END;
5211 cursor=regnext(cursor))
5213 if ( OP(cursor)==CLOSE ){
5215 if ( n <= lastopen ) {
5217 if (n > rex->lastparen)
5219 rex->lastcloseparen = n;
5220 if ( n == ARG(scan) || (cur_eval &&
5221 cur_eval->u.eval.close_paren == n))
5230 case GROUPP: /* (?(1)) */
5231 n = ARG(scan); /* which paren pair */
5232 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5235 case NGROUPP: /* (?(<name>)) */
5236 /* reg_check_named_buff_matched returns 0 for no match */
5237 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5240 case INSUBP: /* (?(R)) */
5242 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5245 case DEFINEP: /* (?(DEFINE)) */
5249 case IFTHEN: /* (?(cond)A|B) */
5250 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5252 next = NEXTOPER(NEXTOPER(scan));
5254 next = scan + ARG(scan);
5255 if (OP(next) == IFTHEN) /* Fake one. */
5256 next = NEXTOPER(NEXTOPER(next));
5260 case LOGICAL: /* modifier for EVAL and IFMATCH */
5261 logical = scan->flags;
5264 /*******************************************************************
5266 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5267 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5268 STAR/PLUS/CURLY/CURLYN are used instead.)
5270 A*B is compiled as <CURLYX><A><WHILEM><B>
5272 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5273 state, which contains the current count, initialised to -1. It also sets
5274 cur_curlyx to point to this state, with any previous value saved in the
5277 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5278 since the pattern may possibly match zero times (i.e. it's a while {} loop
5279 rather than a do {} while loop).
5281 Each entry to WHILEM represents a successful match of A. The count in the
5282 CURLYX block is incremented, another WHILEM state is pushed, and execution
5283 passes to A or B depending on greediness and the current count.
5285 For example, if matching against the string a1a2a3b (where the aN are
5286 substrings that match /A/), then the match progresses as follows: (the
5287 pushed states are interspersed with the bits of strings matched so far):
5290 <CURLYX cnt=0><WHILEM>
5291 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5292 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5293 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5294 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5296 (Contrast this with something like CURLYM, which maintains only a single
5300 a1 <CURLYM cnt=1> a2
5301 a1 a2 <CURLYM cnt=2> a3
5302 a1 a2 a3 <CURLYM cnt=3> b
5305 Each WHILEM state block marks a point to backtrack to upon partial failure
5306 of A or B, and also contains some minor state data related to that
5307 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5308 overall state, such as the count, and pointers to the A and B ops.
5310 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5311 must always point to the *current* CURLYX block, the rules are:
5313 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5314 and set cur_curlyx to point the new block.
5316 When popping the CURLYX block after a successful or unsuccessful match,
5317 restore the previous cur_curlyx.
5319 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5320 to the outer one saved in the CURLYX block.
5322 When popping the WHILEM block after a successful or unsuccessful B match,
5323 restore the previous cur_curlyx.
5325 Here's an example for the pattern (AI* BI)*BO
5326 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5329 curlyx backtrack stack
5330 ------ ---------------
5332 CO <CO prev=NULL> <WO>
5333 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5334 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5335 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5337 At this point the pattern succeeds, and we work back down the stack to
5338 clean up, restoring as we go:
5340 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5341 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5342 CO <CO prev=NULL> <WO>
5345 *******************************************************************/
5347 #define ST st->u.curlyx
5349 case CURLYX: /* start of /A*B/ (for complex A) */
5351 /* No need to save/restore up to this paren */
5352 I32 parenfloor = scan->flags;
5354 assert(next); /* keep Coverity happy */
5355 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5358 /* XXXX Probably it is better to teach regpush to support
5359 parenfloor > maxopenparen ... */
5360 if (parenfloor > (I32)rex->lastparen)
5361 parenfloor = rex->lastparen; /* Pessimization... */
5363 ST.prev_curlyx= cur_curlyx;
5365 ST.cp = PL_savestack_ix;
5367 /* these fields contain the state of the current curly.
5368 * they are accessed by subsequent WHILEMs */
5369 ST.parenfloor = parenfloor;
5374 ST.count = -1; /* this will be updated by WHILEM */
5375 ST.lastloc = NULL; /* this will be updated by WHILEM */
5377 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5378 assert(0); /* NOTREACHED */
5381 case CURLYX_end: /* just finished matching all of A*B */
5382 cur_curlyx = ST.prev_curlyx;
5384 assert(0); /* NOTREACHED */
5386 case CURLYX_end_fail: /* just failed to match all of A*B */
5388 cur_curlyx = ST.prev_curlyx;
5390 assert(0); /* NOTREACHED */
5394 #define ST st->u.whilem
5396 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5398 /* see the discussion above about CURLYX/WHILEM */
5400 int min = ARG1(cur_curlyx->u.curlyx.me);
5401 int max = ARG2(cur_curlyx->u.curlyx.me);
5402 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5404 assert(cur_curlyx); /* keep Coverity happy */
5405 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5406 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5407 ST.cache_offset = 0;
5411 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5412 "%*s whilem: matched %ld out of %d..%d\n",
5413 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5416 /* First just match a string of min A's. */
5419 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5421 cur_curlyx->u.curlyx.lastloc = locinput;
5422 REGCP_SET(ST.lastcp);
5424 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5425 assert(0); /* NOTREACHED */
5428 /* If degenerate A matches "", assume A done. */
5430 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5431 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5432 "%*s whilem: empty match detected, trying continuation...\n",
5433 REPORT_CODE_OFF+depth*2, "")
5435 goto do_whilem_B_max;
5438 /* super-linear cache processing */
5442 if (!reginfo->poscache_maxiter) {
5443 /* start the countdown: Postpone detection until we
5444 * know the match is not *that* much linear. */
5445 reginfo->poscache_maxiter
5446 = (reginfo->strend - reginfo->strbeg + 1)
5448 /* possible overflow for long strings and many CURLYX's */
5449 if (reginfo->poscache_maxiter < 0)
5450 reginfo->poscache_maxiter = I32_MAX;
5451 reginfo->poscache_iter = reginfo->poscache_maxiter;
5454 if (reginfo->poscache_iter-- == 0) {
5455 /* initialise cache */
5456 const I32 size = (reginfo->poscache_maxiter + 7)/8;
5457 if (PL_reg_poscache) {
5458 if ((I32)PL_reg_poscache_size < size) {
5459 Renew(PL_reg_poscache, size, char);
5460 PL_reg_poscache_size = size;
5462 Zero(PL_reg_poscache, size, char);
5465 PL_reg_poscache_size = size;
5466 Newxz(PL_reg_poscache, size, char);
5468 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470 PL_colors[4], PL_colors[5])
5474 if (reginfo->poscache_iter < 0) {
5475 /* have we already failed at this position? */
5477 offset = (scan->flags & 0xf) - 1
5478 + (locinput - reginfo->strbeg)
5480 mask = 1 << (offset % 8);
5482 if (PL_reg_poscache[offset] & mask) {
5483 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5484 "%*s whilem: (cache) already tried at this position...\n",
5485 REPORT_CODE_OFF+depth*2, "")
5487 sayNO; /* cache records failure */
5489 ST.cache_offset = offset;
5490 ST.cache_mask = mask;
5494 /* Prefer B over A for minimal matching. */
5496 if (cur_curlyx->u.curlyx.minmod) {
5497 ST.save_curlyx = cur_curlyx;
5498 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5499 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5501 REGCP_SET(ST.lastcp);
5502 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5504 assert(0); /* NOTREACHED */
5507 /* Prefer A over B for maximal matching. */
5509 if (n < max) { /* More greed allowed? */
5510 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5512 cur_curlyx->u.curlyx.lastloc = locinput;
5513 REGCP_SET(ST.lastcp);
5514 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5515 assert(0); /* NOTREACHED */
5517 goto do_whilem_B_max;
5519 assert(0); /* NOTREACHED */
5521 case WHILEM_B_min: /* just matched B in a minimal match */
5522 case WHILEM_B_max: /* just matched B in a maximal match */
5523 cur_curlyx = ST.save_curlyx;
5525 assert(0); /* NOTREACHED */
5527 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5528 cur_curlyx = ST.save_curlyx;
5529 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5530 cur_curlyx->u.curlyx.count--;
5532 assert(0); /* NOTREACHED */
5534 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5536 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5537 REGCP_UNWIND(ST.lastcp);
5538 regcppop(rex, &maxopenparen);
5539 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5540 cur_curlyx->u.curlyx.count--;
5542 assert(0); /* NOTREACHED */
5544 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5545 REGCP_UNWIND(ST.lastcp);
5546 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5547 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5548 "%*s whilem: failed, trying continuation...\n",
5549 REPORT_CODE_OFF+depth*2, "")
5552 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5553 && ckWARN(WARN_REGEXP)
5554 && !reginfo->warned)
5556 reginfo->warned = TRUE;
5557 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5558 "Complex regular subexpression recursion limit (%d) "
5564 ST.save_curlyx = cur_curlyx;
5565 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5566 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5568 assert(0); /* NOTREACHED */
5570 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5571 cur_curlyx = ST.save_curlyx;
5572 REGCP_UNWIND(ST.lastcp);
5573 regcppop(rex, &maxopenparen);
5575 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5576 /* Maximum greed exceeded */
5577 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5578 && ckWARN(WARN_REGEXP)
5579 && !reginfo->warned)
5581 reginfo->warned = TRUE;
5582 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5583 "Complex regular subexpression recursion "
5584 "limit (%d) exceeded",
5587 cur_curlyx->u.curlyx.count--;
5591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5592 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5594 /* Try grabbing another A and see if it helps. */
5595 cur_curlyx->u.curlyx.lastloc = locinput;
5596 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5598 REGCP_SET(ST.lastcp);
5599 PUSH_STATE_GOTO(WHILEM_A_min,
5600 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5602 assert(0); /* NOTREACHED */
5605 #define ST st->u.branch
5607 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5608 next = scan + ARG(scan);
5611 scan = NEXTOPER(scan);
5614 case BRANCH: /* /(...|A|...)/ */
5615 scan = NEXTOPER(scan); /* scan now points to inner node */
5616 ST.lastparen = rex->lastparen;
5617 ST.lastcloseparen = rex->lastcloseparen;
5618 ST.next_branch = next;
5621 /* Now go into the branch */
5623 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5625 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5627 assert(0); /* NOTREACHED */
5629 case CUTGROUP: /* /(*THEN)/ */
5630 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5631 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5632 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5633 assert(0); /* NOTREACHED */
5635 case CUTGROUP_next_fail:
5638 if (st->u.mark.mark_name)
5639 sv_commit = st->u.mark.mark_name;
5641 assert(0); /* NOTREACHED */
5645 assert(0); /* NOTREACHED */
5647 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5652 REGCP_UNWIND(ST.cp);
5653 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5654 scan = ST.next_branch;
5655 /* no more branches? */
5656 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5658 PerlIO_printf( Perl_debug_log,
5659 "%*s %sBRANCH failed...%s\n",
5660 REPORT_CODE_OFF+depth*2, "",
5666 continue; /* execute next BRANCH[J] op */
5667 assert(0); /* NOTREACHED */
5669 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5674 #define ST st->u.curlym
5676 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5678 /* This is an optimisation of CURLYX that enables us to push
5679 * only a single backtracking state, no matter how many matches
5680 * there are in {m,n}. It relies on the pattern being constant
5681 * length, with no parens to influence future backrefs
5685 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5687 ST.lastparen = rex->lastparen;
5688 ST.lastcloseparen = rex->lastcloseparen;
5690 /* if paren positive, emulate an OPEN/CLOSE around A */
5692 U32 paren = ST.me->flags;
5693 if (paren > maxopenparen)
5694 maxopenparen = paren;
5695 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5703 ST.c1 = CHRTEST_UNINIT;
5706 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5709 curlym_do_A: /* execute the A in /A{m,n}B/ */
5710 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5711 assert(0); /* NOTREACHED */
5713 case CURLYM_A: /* we've just matched an A */
5715 /* after first match, determine A's length: u.curlym.alen */
5716 if (ST.count == 1) {
5717 if (reginfo->is_utf8_target) {
5718 char *s = st->locinput;
5719 while (s < locinput) {
5725 ST.alen = locinput - st->locinput;
5728 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5731 PerlIO_printf(Perl_debug_log,
5732 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5733 (int)(REPORT_CODE_OFF+(depth*2)), "",
5734 (IV) ST.count, (IV)ST.alen)
5737 if (cur_eval && cur_eval->u.eval.close_paren &&
5738 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5742 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5743 if ( max == REG_INFTY || ST.count < max )
5744 goto curlym_do_A; /* try to match another A */
5746 goto curlym_do_B; /* try to match B */
5748 case CURLYM_A_fail: /* just failed to match an A */
5749 REGCP_UNWIND(ST.cp);
5751 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5752 || (cur_eval && cur_eval->u.eval.close_paren &&
5753 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5756 curlym_do_B: /* execute the B in /A{m,n}B/ */
5757 if (ST.c1 == CHRTEST_UNINIT) {
5758 /* calculate c1 and c2 for possible match of 1st char
5759 * following curly */
5760 ST.c1 = ST.c2 = CHRTEST_VOID;
5761 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5762 regnode *text_node = ST.B;
5763 if (! HAS_TEXT(text_node))
5764 FIND_NEXT_IMPT(text_node);
5767 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5769 But the former is redundant in light of the latter.
5771 if this changes back then the macro for
5772 IS_TEXT and friends need to change.
5774 if (PL_regkind[OP(text_node)] == EXACT) {
5775 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5776 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5786 PerlIO_printf(Perl_debug_log,
5787 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5788 (int)(REPORT_CODE_OFF+(depth*2)),
5791 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5792 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5793 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5794 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5796 /* simulate B failing */
5798 PerlIO_printf(Perl_debug_log,
5799 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5800 (int)(REPORT_CODE_OFF+(depth*2)),"",
5801 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5802 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5803 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5805 state_num = CURLYM_B_fail;
5806 goto reenter_switch;
5809 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5810 /* simulate B failing */
5812 PerlIO_printf(Perl_debug_log,
5813 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5814 (int)(REPORT_CODE_OFF+(depth*2)),"",
5815 (int) nextchr, ST.c1, ST.c2)
5817 state_num = CURLYM_B_fail;
5818 goto reenter_switch;
5823 /* emulate CLOSE: mark current A as captured */
5824 I32 paren = ST.me->flags;
5826 rex->offs[paren].start
5827 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5828 rex->offs[paren].end = locinput - reginfo->strbeg;
5829 if ((U32)paren > rex->lastparen)
5830 rex->lastparen = paren;
5831 rex->lastcloseparen = paren;
5834 rex->offs[paren].end = -1;
5835 if (cur_eval && cur_eval->u.eval.close_paren &&
5836 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5845 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5846 assert(0); /* NOTREACHED */
5848 case CURLYM_B_fail: /* just failed to match a B */
5849 REGCP_UNWIND(ST.cp);
5850 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5852 I32 max = ARG2(ST.me);
5853 if (max != REG_INFTY && ST.count == max)
5855 goto curlym_do_A; /* try to match a further A */
5857 /* backtrack one A */
5858 if (ST.count == ARG1(ST.me) /* min */)
5861 SET_locinput(HOPc(locinput, -ST.alen));
5862 goto curlym_do_B; /* try to match B */
5865 #define ST st->u.curly
5867 #define CURLY_SETPAREN(paren, success) \
5870 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5871 rex->offs[paren].end = locinput - reginfo->strbeg; \
5872 if (paren > rex->lastparen) \
5873 rex->lastparen = paren; \
5874 rex->lastcloseparen = paren; \
5877 rex->offs[paren].end = -1; \
5878 rex->lastparen = ST.lastparen; \
5879 rex->lastcloseparen = ST.lastcloseparen; \
5883 case STAR: /* /A*B/ where A is width 1 char */
5887 scan = NEXTOPER(scan);
5890 case PLUS: /* /A+B/ where A is width 1 char */
5894 scan = NEXTOPER(scan);
5897 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5898 ST.paren = scan->flags; /* Which paren to set */
5899 ST.lastparen = rex->lastparen;
5900 ST.lastcloseparen = rex->lastcloseparen;
5901 if (ST.paren > maxopenparen)
5902 maxopenparen = ST.paren;
5903 ST.min = ARG1(scan); /* min to match */
5904 ST.max = ARG2(scan); /* max to match */
5905 if (cur_eval && cur_eval->u.eval.close_paren &&
5906 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5910 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5913 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5915 ST.min = ARG1(scan); /* min to match */
5916 ST.max = ARG2(scan); /* max to match */
5917 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5920 * Lookahead to avoid useless match attempts
5921 * when we know what character comes next.
5923 * Used to only do .*x and .*?x, but now it allows
5924 * for )'s, ('s and (?{ ... })'s to be in the way
5925 * of the quantifier and the EXACT-like node. -- japhy
5928 assert(ST.min <= ST.max);
5929 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5930 ST.c1 = ST.c2 = CHRTEST_VOID;
5933 regnode *text_node = next;
5935 if (! HAS_TEXT(text_node))
5936 FIND_NEXT_IMPT(text_node);
5938 if (! HAS_TEXT(text_node))
5939 ST.c1 = ST.c2 = CHRTEST_VOID;
5941 if ( PL_regkind[OP(text_node)] != EXACT ) {
5942 ST.c1 = ST.c2 = CHRTEST_VOID;
5946 /* Currently we only get here when
5948 PL_rekind[OP(text_node)] == EXACT
5950 if this changes back then the macro for IS_TEXT and
5951 friends need to change. */
5952 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5953 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5965 char *li = locinput;
5968 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
5974 if (ST.c1 == CHRTEST_VOID)
5975 goto curly_try_B_min;
5977 ST.oldloc = locinput;
5979 /* set ST.maxpos to the furthest point along the
5980 * string that could possibly match */
5981 if (ST.max == REG_INFTY) {
5982 ST.maxpos = reginfo->strend - 1;
5984 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5987 else if (utf8_target) {
5988 int m = ST.max - ST.min;
5989 for (ST.maxpos = locinput;
5990 m >0 && ST.maxpos < reginfo->strend; m--)
5991 ST.maxpos += UTF8SKIP(ST.maxpos);
5994 ST.maxpos = locinput + ST.max - ST.min;
5995 if (ST.maxpos >= reginfo->strend)
5996 ST.maxpos = reginfo->strend - 1;
5998 goto curly_try_B_min_known;
6002 /* avoid taking address of locinput, so it can remain
6004 char *li = locinput;
6005 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6006 if (ST.count < ST.min)
6009 if ((ST.count > ST.min)
6010 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6012 /* A{m,n} must come at the end of the string, there's
6013 * no point in backing off ... */
6015 /* ...except that $ and \Z can match before *and* after
6016 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6017 We may back off by one in this case. */
6018 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6022 goto curly_try_B_max;
6024 assert(0); /* NOTREACHED */
6027 case CURLY_B_min_known_fail:
6028 /* failed to find B in a non-greedy match where c1,c2 valid */
6030 REGCP_UNWIND(ST.cp);
6032 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6034 /* Couldn't or didn't -- move forward. */
6035 ST.oldloc = locinput;
6037 locinput += UTF8SKIP(locinput);
6041 curly_try_B_min_known:
6042 /* find the next place where 'B' could work, then call B */
6046 n = (ST.oldloc == locinput) ? 0 : 1;
6047 if (ST.c1 == ST.c2) {
6048 /* set n to utf8_distance(oldloc, locinput) */
6049 while (locinput <= ST.maxpos
6050 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6052 locinput += UTF8SKIP(locinput);
6057 /* set n to utf8_distance(oldloc, locinput) */
6058 while (locinput <= ST.maxpos
6059 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6060 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6062 locinput += UTF8SKIP(locinput);
6067 else { /* Not utf8_target */
6068 if (ST.c1 == ST.c2) {
6069 while (locinput <= ST.maxpos &&
6070 UCHARAT(locinput) != ST.c1)
6074 while (locinput <= ST.maxpos
6075 && UCHARAT(locinput) != ST.c1
6076 && UCHARAT(locinput) != ST.c2)
6079 n = locinput - ST.oldloc;
6081 if (locinput > ST.maxpos)
6084 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6085 * at b; check that everything between oldloc and
6086 * locinput matches */
6087 char *li = ST.oldloc;
6089 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6091 assert(n == REG_INFTY || locinput == li);
6093 CURLY_SETPAREN(ST.paren, ST.count);
6094 if (cur_eval && cur_eval->u.eval.close_paren &&
6095 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6098 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6100 assert(0); /* NOTREACHED */
6103 case CURLY_B_min_fail:
6104 /* failed to find B in a non-greedy match where c1,c2 invalid */
6106 REGCP_UNWIND(ST.cp);
6108 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6110 /* failed -- move forward one */
6112 char *li = locinput;
6113 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6120 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6121 ST.count > 0)) /* count overflow ? */
6124 CURLY_SETPAREN(ST.paren, ST.count);
6125 if (cur_eval && cur_eval->u.eval.close_paren &&
6126 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6129 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6133 assert(0); /* NOTREACHED */
6137 /* a successful greedy match: now try to match B */
6138 if (cur_eval && cur_eval->u.eval.close_paren &&
6139 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6143 bool could_match = locinput < reginfo->strend;
6145 /* If it could work, try it. */
6146 if (ST.c1 != CHRTEST_VOID && could_match) {
6147 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6149 could_match = memEQ(locinput,
6154 UTF8SKIP(locinput));
6157 could_match = UCHARAT(locinput) == ST.c1
6158 || UCHARAT(locinput) == ST.c2;
6161 if (ST.c1 == CHRTEST_VOID || could_match) {
6162 CURLY_SETPAREN(ST.paren, ST.count);
6163 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6164 assert(0); /* NOTREACHED */
6169 case CURLY_B_max_fail:
6170 /* failed to find B in a greedy match */
6172 REGCP_UNWIND(ST.cp);
6174 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6177 if (--ST.count < ST.min)
6179 locinput = HOPc(locinput, -1);
6180 goto curly_try_B_max;
6184 case END: /* last op of main pattern */
6187 /* we've just finished A in /(??{A})B/; now continue with B */
6189 st->u.eval.prev_rex = rex_sv; /* inner */
6191 /* Save *all* the positions. */
6192 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6193 rex_sv = cur_eval->u.eval.prev_rex;
6194 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6195 SET_reg_curpm(rex_sv);
6196 rex = ReANY(rex_sv);
6197 rexi = RXi_GET(rex);
6198 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6200 REGCP_SET(st->u.eval.lastcp);
6202 /* Restore parens of the outer rex without popping the
6204 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6207 st->u.eval.prev_eval = cur_eval;
6208 cur_eval = cur_eval->u.eval.prev_eval;
6210 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6211 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6212 if ( nochange_depth )
6215 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6216 locinput); /* match B */
6219 if (locinput < reginfo->till) {
6220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6221 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6223 (long)(locinput - PL_reg_starttry),
6224 (long)(reginfo->till - PL_reg_starttry),
6227 sayNO_SILENT; /* Cannot match: too short. */
6229 sayYES; /* Success! */
6231 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6233 PerlIO_printf(Perl_debug_log,
6234 "%*s %ssubpattern success...%s\n",
6235 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6236 sayYES; /* Success! */
6239 #define ST st->u.ifmatch
6244 case SUSPEND: /* (?>A) */
6246 newstart = locinput;
6249 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6251 goto ifmatch_trivial_fail_test;
6253 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6255 ifmatch_trivial_fail_test:
6257 char * const s = HOPBACKc(locinput, scan->flags);
6262 sw = 1 - cBOOL(ST.wanted);
6266 next = scan + ARG(scan);
6274 newstart = locinput;
6278 ST.logical = logical;
6279 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6281 /* execute body of (?...A) */
6282 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6283 assert(0); /* NOTREACHED */
6286 case IFMATCH_A_fail: /* body of (?...A) failed */
6287 ST.wanted = !ST.wanted;
6290 case IFMATCH_A: /* body of (?...A) succeeded */
6292 sw = cBOOL(ST.wanted);
6294 else if (!ST.wanted)
6297 if (OP(ST.me) != SUSPEND) {
6298 /* restore old position except for (?>...) */
6299 locinput = st->locinput;
6301 scan = ST.me + ARG(ST.me);
6304 continue; /* execute B */
6308 case LONGJMP: /* alternative with many branches compiles to
6309 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6310 next = scan + ARG(scan);
6315 case COMMIT: /* (*COMMIT) */
6316 reginfo->cutpoint = reginfo->strend;
6319 case PRUNE: /* (*PRUNE) */
6321 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6322 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6323 assert(0); /* NOTREACHED */
6325 case COMMIT_next_fail:
6329 case OPFAIL: /* (*FAIL) */
6331 assert(0); /* NOTREACHED */
6333 #define ST st->u.mark
6334 case MARKPOINT: /* (*MARK:foo) */
6335 ST.prev_mark = mark_state;
6336 ST.mark_name = sv_commit = sv_yes_mark
6337 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6339 ST.mark_loc = locinput;
6340 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6341 assert(0); /* NOTREACHED */
6343 case MARKPOINT_next:
6344 mark_state = ST.prev_mark;
6346 assert(0); /* NOTREACHED */
6348 case MARKPOINT_next_fail:
6349 if (popmark && sv_eq(ST.mark_name,popmark))
6351 if (ST.mark_loc > startpoint)
6352 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6353 popmark = NULL; /* we found our mark */
6354 sv_commit = ST.mark_name;
6357 PerlIO_printf(Perl_debug_log,
6358 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6359 REPORT_CODE_OFF+depth*2, "",
6360 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6363 mark_state = ST.prev_mark;
6364 sv_yes_mark = mark_state ?
6365 mark_state->u.mark.mark_name : NULL;
6367 assert(0); /* NOTREACHED */
6369 case SKIP: /* (*SKIP) */
6371 /* (*SKIP) : if we fail we cut here*/
6372 ST.mark_name = NULL;
6373 ST.mark_loc = locinput;
6374 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6376 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6377 otherwise do nothing. Meaning we need to scan
6379 regmatch_state *cur = mark_state;
6380 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6383 if ( sv_eq( cur->u.mark.mark_name,
6386 ST.mark_name = find;
6387 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6389 cur = cur->u.mark.prev_mark;
6392 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6395 case SKIP_next_fail:
6397 /* (*CUT:NAME) - Set up to search for the name as we
6398 collapse the stack*/
6399 popmark = ST.mark_name;
6401 /* (*CUT) - No name, we cut here.*/
6402 if (ST.mark_loc > startpoint)
6403 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6404 /* but we set sv_commit to latest mark_name if there
6405 is one so they can test to see how things lead to this
6408 sv_commit=mark_state->u.mark.mark_name;
6412 assert(0); /* NOTREACHED */
6415 case LNBREAK: /* \R */
6416 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6423 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6424 PTR2UV(scan), OP(scan));
6425 Perl_croak(aTHX_ "regexp memory corruption");
6427 /* this is a point to jump to in order to increment
6428 * locinput by one character */
6430 assert(!NEXTCHR_IS_EOS);
6432 locinput += PL_utf8skip[nextchr];
6433 /* locinput is allowed to go 1 char off the end, but not 2+ */
6434 if (locinput > reginfo->strend)
6443 /* switch break jumps here */
6444 scan = next; /* prepare to execute the next op and ... */
6445 continue; /* ... jump back to the top, reusing st */
6446 assert(0); /* NOTREACHED */
6449 /* push a state that backtracks on success */
6450 st->u.yes.prev_yes_state = yes_state;
6454 /* push a new regex state, then continue at scan */
6456 regmatch_state *newst;
6459 regmatch_state *cur = st;
6460 regmatch_state *curyes = yes_state;
6462 regmatch_slab *slab = PL_regmatch_slab;
6463 for (;curd > -1;cur--,curd--) {
6464 if (cur < SLAB_FIRST(slab)) {
6466 cur = SLAB_LAST(slab);
6468 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6469 REPORT_CODE_OFF + 2 + depth * 2,"",
6470 curd, PL_reg_name[cur->resume_state],
6471 (curyes == cur) ? "yes" : ""
6474 curyes = cur->u.yes.prev_yes_state;
6477 DEBUG_STATE_pp("push")
6480 st->locinput = locinput;
6482 if (newst > SLAB_LAST(PL_regmatch_slab))
6483 newst = S_push_slab(aTHX);
6484 PL_regmatch_state = newst;
6486 locinput = pushinput;
6489 assert(0); /* NOTREACHED */
6494 * We get here only if there's trouble -- normally "case END" is
6495 * the terminating point.
6497 Perl_croak(aTHX_ "corrupted regexp pointers");
6503 /* we have successfully completed a subexpression, but we must now
6504 * pop to the state marked by yes_state and continue from there */
6505 assert(st != yes_state);
6507 while (st != yes_state) {
6509 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6510 PL_regmatch_slab = PL_regmatch_slab->prev;
6511 st = SLAB_LAST(PL_regmatch_slab);
6515 DEBUG_STATE_pp("pop (no final)");
6517 DEBUG_STATE_pp("pop (yes)");
6523 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6524 || yes_state > SLAB_LAST(PL_regmatch_slab))
6526 /* not in this slab, pop slab */
6527 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6528 PL_regmatch_slab = PL_regmatch_slab->prev;
6529 st = SLAB_LAST(PL_regmatch_slab);
6531 depth -= (st - yes_state);
6534 yes_state = st->u.yes.prev_yes_state;
6535 PL_regmatch_state = st;
6538 locinput= st->locinput;
6539 state_num = st->resume_state + no_final;
6540 goto reenter_switch;
6543 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6544 PL_colors[4], PL_colors[5]));
6546 if (reginfo->info_aux_eval) {
6547 /* each successfully executed (?{...}) block does the equivalent of
6548 * local $^R = do {...}
6549 * When popping the save stack, all these locals would be undone;
6550 * bypass this by setting the outermost saved $^R to the latest
6552 if (oreplsv != GvSV(PL_replgv))
6553 sv_setsv(oreplsv, GvSV(PL_replgv));
6560 PerlIO_printf(Perl_debug_log,
6561 "%*s %sfailed...%s\n",
6562 REPORT_CODE_OFF+depth*2, "",
6563 PL_colors[4], PL_colors[5])
6575 /* there's a previous state to backtrack to */
6577 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6578 PL_regmatch_slab = PL_regmatch_slab->prev;
6579 st = SLAB_LAST(PL_regmatch_slab);
6581 PL_regmatch_state = st;
6582 locinput= st->locinput;
6584 DEBUG_STATE_pp("pop");
6586 if (yes_state == st)
6587 yes_state = st->u.yes.prev_yes_state;
6589 state_num = st->resume_state + 1; /* failure = success + 1 */
6590 goto reenter_switch;
6595 if (rex->intflags & PREGf_VERBARG_SEEN) {
6596 SV *sv_err = get_sv("REGERROR", 1);
6597 SV *sv_mrk = get_sv("REGMARK", 1);
6599 sv_commit = &PL_sv_no;
6601 sv_yes_mark = &PL_sv_yes;
6604 sv_commit = &PL_sv_yes;
6605 sv_yes_mark = &PL_sv_no;
6607 sv_setsv(sv_err, sv_commit);
6608 sv_setsv(sv_mrk, sv_yes_mark);
6612 if (last_pushed_cv) {
6615 PERL_UNUSED_VAR(SP);
6618 assert(!result || locinput - reginfo->strbeg >= 0);
6619 return result ? locinput - reginfo->strbeg : -1;
6623 - regrepeat - repeatedly match something simple, report how many
6625 * What 'simple' means is a node which can be the operand of a quantifier like
6628 * startposp - pointer a pointer to the start position. This is updated
6629 * to point to the byte following the highest successful
6631 * p - the regnode to be repeatedly matched against.
6632 * reginfo - struct holding match state, such as strend
6633 * max - maximum number of things to match.
6634 * depth - (for debugging) backtracking depth.
6637 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6638 regmatch_info *const reginfo, I32 max, int depth)
6641 char *scan; /* Pointer to current position in target string */
6643 char *loceol = reginfo->strend; /* local version */
6644 I32 hardcount = 0; /* How many matches so far */
6645 bool utf8_target = reginfo->is_utf8_target;
6646 int to_complement = 0; /* Invert the result? */
6648 _char_class_number classnum;
6650 PERL_UNUSED_ARG(depth);
6653 PERL_ARGS_ASSERT_REGREPEAT;
6656 if (max == REG_INFTY)
6658 else if (! utf8_target && scan + max < loceol)
6659 loceol = scan + max;
6661 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6662 * to the maximum of how far we should go in it (leaving it set to the real
6663 * end, if the maximum permissible would take us beyond that). This allows
6664 * us to make the loop exit condition that we haven't gone past <loceol> to
6665 * also mean that we haven't exceeded the max permissible count, saving a
6666 * test each time through the loop. But it assumes that the OP matches a
6667 * single byte, which is true for most of the OPs below when applied to a
6668 * non-UTF-8 target. Those relatively few OPs that don't have this
6669 * characteristic will have to compensate.
6671 * There is no adjustment for UTF-8 targets, as the number of bytes per
6672 * character varies. OPs will have to test both that the count is less
6673 * than the max permissible (using <hardcount> to keep track), and that we
6674 * are still within the bounds of the string (using <loceol>. A few OPs
6675 * match a single byte no matter what the encoding. They can omit the max
6676 * test if, for the UTF-8 case, they do the adjustment that was skipped
6679 * Thus, the code above sets things up for the common case; and exceptional
6680 * cases need extra work; the common case is to make sure <scan> doesn't
6681 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6682 * count doesn't exceed the maximum permissible */
6687 while (scan < loceol && hardcount < max && *scan != '\n') {
6688 scan += UTF8SKIP(scan);
6692 while (scan < loceol && *scan != '\n')
6698 while (scan < loceol && hardcount < max) {
6699 scan += UTF8SKIP(scan);
6706 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6707 if (utf8_target && scan + max < loceol) {
6709 /* <loceol> hadn't been adjusted in the UTF-8 case */
6717 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6721 /* Can use a simple loop if the pattern char to match on is invariant
6722 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6723 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6724 * true iff it doesn't matter if the argument is in UTF-8 or not */
6725 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6726 if (utf8_target && scan + max < loceol) {
6727 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6728 * since here, to match at all, 1 char == 1 byte */
6729 loceol = scan + max;
6731 while (scan < loceol && UCHARAT(scan) == c) {
6735 else if (reginfo->is_utf8_pat) {
6737 STRLEN scan_char_len;
6739 /* When both target and pattern are UTF-8, we have to do
6741 while (hardcount < max
6743 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6744 && memEQ(scan, STRING(p), scan_char_len))
6746 scan += scan_char_len;
6750 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6752 /* Target isn't utf8; convert the character in the UTF-8
6753 * pattern to non-UTF8, and do a simple loop */
6754 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6755 while (scan < loceol && UCHARAT(scan) == c) {
6758 } /* else pattern char is above Latin1, can't possibly match the
6763 /* Here, the string must be utf8; pattern isn't, and <c> is
6764 * different in utf8 than not, so can't compare them directly.
6765 * Outside the loop, find the two utf8 bytes that represent c, and
6766 * then look for those in sequence in the utf8 string */
6767 U8 high = UTF8_TWO_BYTE_HI(c);
6768 U8 low = UTF8_TWO_BYTE_LO(c);
6770 while (hardcount < max
6771 && scan + 1 < loceol
6772 && UCHARAT(scan) == high
6773 && UCHARAT(scan + 1) == low)
6782 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6786 RXp_MATCH_TAINTED_on(prog);
6787 utf8_flags = FOLDEQ_UTF8_LOCALE;
6795 case EXACTFU_TRICKYFOLD:
6797 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6801 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6803 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6805 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6808 if (c1 == CHRTEST_VOID) {
6809 /* Use full Unicode fold matching */
6810 char *tmpeol = reginfo->strend;
6811 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6812 while (hardcount < max
6813 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6814 STRING(p), NULL, pat_len,
6815 reginfo->is_utf8_pat, utf8_flags))
6818 tmpeol = reginfo->strend;
6822 else if (utf8_target) {
6824 while (scan < loceol
6826 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6828 scan += UTF8SKIP(scan);
6833 while (scan < loceol
6835 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6836 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6838 scan += UTF8SKIP(scan);
6843 else if (c1 == c2) {
6844 while (scan < loceol && UCHARAT(scan) == c1) {
6849 while (scan < loceol &&
6850 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6859 case ANYOF_WARN_SUPER:
6861 while (hardcount < max
6863 && reginclass(prog, p, (U8*)scan, utf8_target))
6865 scan += UTF8SKIP(scan);
6869 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6874 /* The argument (FLAGS) to all the POSIX node types is the class number */
6881 RXp_MATCH_TAINTED_on(prog);
6882 if (! utf8_target) {
6883 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6889 while (hardcount < max && scan < loceol
6890 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6893 scan += UTF8SKIP(scan);
6906 if (utf8_target && scan + max < loceol) {
6908 /* We didn't adjust <loceol> at the beginning of this routine
6909 * because is UTF-8, but it is actually ok to do so, since here, to
6910 * match, 1 char == 1 byte. */
6911 loceol = scan + max;
6913 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6926 if (! utf8_target) {
6927 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6933 /* The complement of something that matches only ASCII matches all
6934 * UTF-8 variant code points, plus everything in ASCII that isn't
6936 while (hardcount < max && scan < loceol
6937 && (! UTF8_IS_INVARIANT(*scan)
6938 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6940 scan += UTF8SKIP(scan);
6951 if (! utf8_target) {
6952 while (scan < loceol && to_complement
6953 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6960 classnum = (_char_class_number) FLAGS(p);
6961 if (classnum < _FIRST_NON_SWASH_CC) {
6963 /* Here, a swash is needed for above-Latin1 code points.
6964 * Process as many Latin1 code points using the built-in rules.
6965 * Go to another loop to finish processing upon encountering
6966 * the first Latin1 code point. We could do that in this loop
6967 * as well, but the other way saves having to test if the swash
6968 * has been loaded every time through the loop: extra space to
6970 while (hardcount < max && scan < loceol) {
6971 if (UTF8_IS_INVARIANT(*scan)) {
6972 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6979 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6980 if (! (to_complement
6981 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6990 goto found_above_latin1;
6997 /* For these character classes, the knowledge of how to handle
6998 * every code point is compiled in to Perl via a macro. This
6999 * code is written for making the loops as tight as possible.
7000 * It could be refactored to save space instead */
7002 case _CC_ENUM_SPACE: /* XXX would require separate code
7003 if we revert the change of \v
7006 case _CC_ENUM_PSXSPC:
7007 while (hardcount < max
7009 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7011 scan += UTF8SKIP(scan);
7015 case _CC_ENUM_BLANK:
7016 while (hardcount < max
7018 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7020 scan += UTF8SKIP(scan);
7024 case _CC_ENUM_XDIGIT:
7025 while (hardcount < max
7027 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7029 scan += UTF8SKIP(scan);
7033 case _CC_ENUM_VERTSPACE:
7034 while (hardcount < max
7036 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7038 scan += UTF8SKIP(scan);
7042 case _CC_ENUM_CNTRL:
7043 while (hardcount < max
7045 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7047 scan += UTF8SKIP(scan);
7052 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7058 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7060 /* Load the swash if not already present */
7061 if (! PL_utf8_swash_ptrs[classnum]) {
7062 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7063 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7064 "utf8", swash_property_names[classnum],
7065 &PL_sv_undef, 1, 0, NULL, &flags);
7068 while (hardcount < max && scan < loceol
7069 && to_complement ^ cBOOL(_generic_utf8(
7072 swash_fetch(PL_utf8_swash_ptrs[classnum],
7076 scan += UTF8SKIP(scan);
7083 while (hardcount < max && scan < loceol &&
7084 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7089 /* LNBREAK can match one or two latin chars, which is ok, but we
7090 * have to use hardcount in this situation, and throw away the
7091 * adjustment to <loceol> done before the switch statement */
7092 loceol = reginfo->strend;
7093 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7114 /* These are all 0 width, so match right here or not at all. */
7118 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7119 assert(0); /* NOTREACHED */
7126 c = scan - *startposp;
7130 GET_RE_DEBUG_FLAGS_DECL;
7132 SV * const prop = sv_newmortal();
7133 regprop(prog, prop, p);
7134 PerlIO_printf(Perl_debug_log,
7135 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7136 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7144 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7146 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7147 create a copy so that changes the caller makes won't change the shared one.
7148 If <altsvp> is non-null, will return NULL in it, for back-compat.
7151 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7153 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7159 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7164 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7166 /* Returns the swash for the input 'node' in the regex 'prog'.
7167 * If <doinit> is true, will attempt to create the swash if not already
7169 * If <listsvp> is non-null, will return the swash initialization string in
7171 * Tied intimately to how regcomp.c sets up the data structure */
7178 RXi_GET_DECL(prog,progi);
7179 const struct reg_data * const data = prog ? progi->data : NULL;
7181 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7183 assert(ANYOF_NONBITMAP(node));
7185 if (data && data->count) {
7186 const U32 n = ARG(node);
7188 if (data->what[n] == 's') {
7189 SV * const rv = MUTABLE_SV(data->data[n]);
7190 AV * const av = MUTABLE_AV(SvRV(rv));
7191 SV **const ary = AvARRAY(av);
7192 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7194 si = *ary; /* ary[0] = the string to initialize the swash with */
7196 /* Elements 2 and 3 are either both present or both absent. [2] is
7197 * any inversion list generated at compile time; [3] indicates if
7198 * that inversion list has any user-defined properties in it. */
7199 if (av_len(av) >= 2) {
7202 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7209 /* Element [1] is reserved for the set-up swash. If already there,
7210 * return it; if not, create it and store it there */
7211 if (SvROK(ary[1])) {
7214 else if (si && doinit) {
7216 sw = _core_swash_init("utf8", /* the utf8 package */
7220 0, /* not from tr/// */
7223 (void)av_store(av, 1, sw);
7229 SV* matches_string = newSVpvn("", 0);
7231 /* Use the swash, if any, which has to have incorporated into it all
7233 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7234 && (si && si != &PL_sv_undef))
7237 /* If no swash, use the input initialization string, if available */
7238 sv_catsv(matches_string, si);
7241 /* Add the inversion list to whatever we have. This may have come from
7242 * the swash, or from an input parameter */
7244 sv_catsv(matches_string, _invlist_contents(invlist));
7246 *listsvp = matches_string;
7253 - reginclass - determine if a character falls into a character class
7255 n is the ANYOF regnode
7256 p is the target string
7257 utf8_target tells whether p is in UTF-8.
7259 Returns true if matched; false otherwise.
7261 Note that this can be a synthetic start class, a combination of various
7262 nodes, so things you think might be mutually exclusive, such as locale,
7263 aren't. It can match both locale and non-locale
7268 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7271 const char flags = ANYOF_FLAGS(n);
7275 PERL_ARGS_ASSERT_REGINCLASS;
7277 /* If c is not already the code point, get it. Note that
7278 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7279 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7281 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7282 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7283 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7284 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7285 * UTF8_ALLOW_FFFF */
7286 if (c_len == (STRLEN)-1)
7287 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7290 /* If this character is potentially in the bitmap, check it */
7292 if (ANYOF_BITMAP_TEST(n, c))
7294 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7300 else if (flags & ANYOF_LOCALE) {
7301 RXp_MATCH_TAINTED_on(prog);
7303 if ((flags & ANYOF_LOC_FOLD)
7304 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7308 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7310 /* The data structure is arranged so bits 0, 2, 4, ... are set
7311 * if the class includes the Posix character class given by
7312 * bit/2; and 1, 3, 5, ... are set if the class includes the
7313 * complemented Posix class given by int(bit/2). So we loop
7314 * through the bits, each time changing whether we complement
7315 * the result or not. Suppose for the sake of illustration
7316 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7317 * is set, it means there is a match for this ANYOF node if the
7318 * character is in the class given by the expression (0 / 2 = 0
7319 * = \w). If it is in that class, isFOO_lc() will return 1,
7320 * and since 'to_complement' is 0, the result will stay TRUE,
7321 * and we exit the loop. Suppose instead that bit 0 is 0, but
7322 * bit 1 is 1. That means there is a match if the character
7323 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7324 * but will on bit 1. On the second iteration 'to_complement'
7325 * will be 1, so the exclusive or will reverse things, so we
7326 * are testing for \W. On the third iteration, 'to_complement'
7327 * will be 0, and we would be testing for \s; the fourth
7328 * iteration would test for \S, etc.
7330 * Note that this code assumes that all the classes are closed
7331 * under folding. For example, if a character matches \w, then
7332 * its fold does too; and vice versa. This should be true for
7333 * any well-behaved locale for all the currently defined Posix
7334 * classes, except for :lower: and :upper:, which are handled
7335 * by the pseudo-class :cased: which matches if either of the
7336 * other two does. To get rid of this assumption, an outer
7337 * loop could be used below to iterate over both the source
7338 * character, and its fold (if different) */
7341 int to_complement = 0;
7342 while (count < ANYOF_MAX) {
7343 if (ANYOF_CLASS_TEST(n, count)
7344 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7356 /* If the bitmap didn't (or couldn't) match, and something outside the
7357 * bitmap could match, try that. Locale nodes specify completely the
7358 * behavior of code points in the bit map (otherwise, a utf8 target would
7359 * cause them to be treated as Unicode and not locale), except in
7360 * the very unlikely event when this node is a synthetic start class, which
7361 * could be a combination of locale and non-locale nodes. So allow locale
7362 * to match for the synthetic start class, which will give a false
7363 * positive that will be resolved when the match is done again as not part
7364 * of the synthetic start class */
7366 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7367 match = TRUE; /* Everything above 255 matches */
7369 else if (ANYOF_NONBITMAP(n)
7370 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7373 || (! (flags & ANYOF_LOCALE))
7374 || OP(n) == ANYOF_SYNTHETIC))))
7376 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7381 } else { /* Convert to utf8 */
7383 utf8_p = bytes_to_utf8(p, &len);
7386 if (swash_fetch(sw, utf8_p, TRUE)) {
7390 /* If we allocated a string above, free it */
7391 if (! utf8_target) Safefree(utf8_p);
7395 if (UNICODE_IS_SUPER(c)
7396 && OP(n) == ANYOF_WARN_SUPER
7397 && ckWARN_d(WARN_NON_UNICODE))
7399 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7400 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7404 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7405 return cBOOL(flags & ANYOF_INVERT) ^ match;
7409 S_reghop3(U8 *s, I32 off, const U8* lim)
7411 /* return the position 'off' UTF-8 characters away from 's', forward if
7412 * 'off' >= 0, backwards if negative. But don't go outside of position
7413 * 'lim', which better be < s if off < 0 */
7417 PERL_ARGS_ASSERT_REGHOP3;
7420 while (off-- && s < lim) {
7421 /* XXX could check well-formedness here */
7426 while (off++ && s > lim) {
7428 if (UTF8_IS_CONTINUED(*s)) {
7429 while (s > lim && UTF8_IS_CONTINUATION(*s))
7432 /* XXX could check well-formedness here */
7439 /* there are a bunch of places where we use two reghop3's that should
7440 be replaced with this routine. but since thats not done yet
7441 we ifdef it out - dmq
7444 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7448 PERL_ARGS_ASSERT_REGHOP4;
7451 while (off-- && s < rlim) {
7452 /* XXX could check well-formedness here */
7457 while (off++ && s > llim) {
7459 if (UTF8_IS_CONTINUED(*s)) {
7460 while (s > llim && UTF8_IS_CONTINUATION(*s))
7463 /* XXX could check well-formedness here */
7471 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7475 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7478 while (off-- && s < lim) {
7479 /* XXX could check well-formedness here */
7486 while (off++ && s > lim) {
7488 if (UTF8_IS_CONTINUED(*s)) {
7489 while (s > lim && UTF8_IS_CONTINUATION(*s))
7492 /* XXX could check well-formedness here */
7501 /* when executing a regex that may have (?{}), extra stuff needs setting
7502 up that will be visible to the called code, even before the current
7503 match has finished. In particular:
7505 * $_ is localised to the SV currently being matched;
7506 * pos($_) is created if necessary, ready to be updated on each call-out
7508 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7509 isn't set until the current pattern is successfully finished), so that
7510 $1 etc of the match-so-far can be seen;
7511 * save the old values of subbeg etc of the current regex, and set then
7512 to the current string (again, this is normally only done at the end
7517 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7520 regexp *const rex = ReANY(reginfo->prog);
7521 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7523 eval_state->rex = rex;
7526 /* Make $_ available to executed code. */
7527 if (reginfo->sv != DEFSV) {
7529 DEFSV_set(reginfo->sv);
7532 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
7533 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
7534 /* prepare for quick setting of pos */
7535 #ifdef PERL_OLD_COPY_ON_WRITE
7536 if (SvIsCOW(reginfo->sv))
7537 sv_force_normal_flags(reginfo->sv, 0);
7539 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
7540 &PL_vtbl_mglob, NULL, 0);
7543 eval_state->pos_magic = mg;
7544 eval_state->pos = mg->mg_len;
7547 eval_state->pos_magic = NULL;
7549 if (!PL_reg_curpm) {
7550 Newxz(PL_reg_curpm, 1, PMOP);
7553 SV* const repointer = &PL_sv_undef;
7554 /* this regexp is also owned by the new PL_reg_curpm, which
7555 will try to free it. */
7556 av_push(PL_regex_padav, repointer);
7557 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7558 PL_regex_pad = AvARRAY(PL_regex_padav);
7562 SET_reg_curpm(reginfo->prog);
7563 eval_state->curpm = PL_curpm;
7564 PL_curpm = PL_reg_curpm;
7565 if (RXp_MATCH_COPIED(rex)) {
7566 /* Here is a serious problem: we cannot rewrite subbeg,
7567 since it may be needed if this match fails. Thus
7568 $` inside (?{}) could fail... */
7569 eval_state->subbeg = rex->subbeg;
7570 eval_state->sublen = rex->sublen;
7571 eval_state->suboffset = rex->suboffset;
7572 eval_state->subcoffset = rex->subcoffset;
7574 eval_state->saved_copy = rex->saved_copy;
7576 RXp_MATCH_COPIED_off(rex);
7579 eval_state->subbeg = NULL;
7580 rex->subbeg = (char *)reginfo->strbeg;
7582 rex->subcoffset = 0;
7583 rex->sublen = reginfo->strend - reginfo->strbeg;
7587 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7590 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7593 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7594 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
7599 /* undo the effects of S_setup_eval_state() */
7601 if (eval_state->subbeg) {
7602 regexp * const rex = eval_state->rex;
7603 rex->subbeg = eval_state->subbeg;
7604 rex->sublen = eval_state->sublen;
7605 rex->suboffset = eval_state->suboffset;
7606 rex->subcoffset = eval_state->subcoffset;
7608 rex->saved_copy = eval_state->saved_copy;
7610 RXp_MATCH_COPIED_on(rex);
7612 if (eval_state->pos_magic)
7613 eval_state->pos_magic->mg_len = eval_state->pos;
7615 PL_curpm = eval_state->curpm;
7620 S_to_utf8_substr(pTHX_ regexp *prog)
7622 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7623 * on the converted value */
7627 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7630 if (prog->substrs->data[i].substr
7631 && !prog->substrs->data[i].utf8_substr) {
7632 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7633 prog->substrs->data[i].utf8_substr = sv;
7634 sv_utf8_upgrade(sv);
7635 if (SvVALID(prog->substrs->data[i].substr)) {
7636 if (SvTAIL(prog->substrs->data[i].substr)) {
7637 /* Trim the trailing \n that fbm_compile added last
7639 SvCUR_set(sv, SvCUR(sv) - 1);
7640 /* Whilst this makes the SV technically "invalid" (as its
7641 buffer is no longer followed by "\0") when fbm_compile()
7642 adds the "\n" back, a "\0" is restored. */
7643 fbm_compile(sv, FBMcf_TAIL);
7647 if (prog->substrs->data[i].substr == prog->check_substr)
7648 prog->check_utf8 = sv;
7654 S_to_byte_substr(pTHX_ regexp *prog)
7656 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7657 * on the converted value; returns FALSE if can't be converted. */
7662 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7665 if (prog->substrs->data[i].utf8_substr
7666 && !prog->substrs->data[i].substr) {
7667 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7668 if (! sv_utf8_downgrade(sv, TRUE)) {
7671 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7672 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7673 /* Trim the trailing \n that fbm_compile added last
7675 SvCUR_set(sv, SvCUR(sv) - 1);
7676 fbm_compile(sv, FBMcf_TAIL);
7680 prog->substrs->data[i].substr = sv;
7681 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7682 prog->check_substr = sv;
7691 * c-indentation-style: bsd
7693 * indent-tabs-mode: nil
7696 * ex: set ts=8 sts=4 sw=4 et: