5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
87 #ifdef PERL_IN_XSUB_RE
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
102 /* Valid for non-utf8 strings: avoids the reginclass
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
105 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106 : ANYOF_BITMAP_TEST(p,*(c)))
112 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
115 #define HOPc(pos,off) \
116 (char *)(reginfo->is_utf8_target \
117 ? reghop3((U8*)pos, off, \
118 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
120 #define HOPBACKc(pos, off) \
121 (char*)(reginfo->is_utf8_target \
122 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123 : (pos - off >= reginfo->strbeg) \
127 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132 #define NEXTCHR_IS_EOS (nextchr < 0)
134 #define SET_nextchr \
135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
137 #define SET_locinput(p) \
142 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
151 /* If in debug mode, we test that a known character properly matches */
153 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
159 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
165 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
170 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
177 COMBINING_GRAVE_ACCENT_UTF8); \
180 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
183 /* for use after a quantifier and before an EXACT-like node -- japhy */
184 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
193 #define JUMPABLE(rn) ( \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
202 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
204 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
207 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
214 /* ... so we use this as its faster. */
215 #define IS_TEXT(rn) ( OP(rn)==EXACT )
216 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
217 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
226 #define FIND_NEXT_IMPT(rn) STMT_START { \
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
230 rn = NEXTOPER(NEXTOPER(rn)); \
231 else if (type == PLUS) \
233 else if (type == IFMATCH) \
234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235 else rn += NEXT_OFF(rn); \
239 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245 #define SBASE 0xAC00 /* Start of block */
246 #define SCount 11172 /* Length of block */
249 #define SLAB_FIRST(s) (&(s)->states[0])
250 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
252 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
253 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
254 static regmatch_state * S_push_slab(pTHX);
256 #define REGCP_PAREN_ELEMS 3
257 #define REGCP_OTHER_ELEMS 3
258 #define REGCP_FRAME_ELEMS 1
259 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
260 * are needed for the regexp context stack bookkeeping. */
263 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
266 const int retval = PL_savestack_ix;
267 const int paren_elems_to_push =
268 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
269 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
270 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
272 GET_RE_DEBUG_FLAGS_DECL;
274 PERL_ARGS_ASSERT_REGCPPUSH;
276 if (paren_elems_to_push < 0)
277 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
278 paren_elems_to_push);
280 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
281 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
282 " out of range (%lu-%ld)",
284 (unsigned long)maxopenparen,
287 SSGROW(total_elems + REGCP_FRAME_ELEMS);
290 if ((int)maxopenparen > (int)parenfloor)
291 PerlIO_printf(Perl_debug_log,
292 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
297 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
298 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
299 SSPUSHINT(rex->offs[p].end);
300 SSPUSHINT(rex->offs[p].start);
301 SSPUSHINT(rex->offs[p].start_tmp);
302 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
303 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
305 (IV)rex->offs[p].start,
306 (IV)rex->offs[p].start_tmp,
310 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
311 SSPUSHINT(maxopenparen);
312 SSPUSHINT(rex->lastparen);
313 SSPUSHINT(rex->lastcloseparen);
314 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
319 /* These are needed since we do not localize EVAL nodes: */
320 #define REGCP_SET(cp) \
322 PerlIO_printf(Perl_debug_log, \
323 " Setting an EVAL scope, savestack=%"IVdf"\n", \
324 (IV)PL_savestack_ix)); \
327 #define REGCP_UNWIND(cp) \
329 if (cp != PL_savestack_ix) \
330 PerlIO_printf(Perl_debug_log, \
331 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
332 (IV)(cp), (IV)PL_savestack_ix)); \
335 #define UNWIND_PAREN(lp, lcp) \
336 for (n = rex->lastparen; n > lp; n--) \
337 rex->offs[n].end = -1; \
338 rex->lastparen = n; \
339 rex->lastcloseparen = lcp;
343 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
348 GET_RE_DEBUG_FLAGS_DECL;
350 PERL_ARGS_ASSERT_REGCPPOP;
352 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
354 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
355 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
356 rex->lastcloseparen = SSPOPINT;
357 rex->lastparen = SSPOPINT;
358 *maxopenparen_p = SSPOPINT;
360 i -= REGCP_OTHER_ELEMS;
361 /* Now restore the parentheses context. */
363 if (i || rex->lastparen + 1 <= rex->nparens)
364 PerlIO_printf(Perl_debug_log,
365 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
370 paren = *maxopenparen_p;
371 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
373 rex->offs[paren].start_tmp = SSPOPINT;
374 rex->offs[paren].start = SSPOPINT;
376 if (paren <= rex->lastparen)
377 rex->offs[paren].end = tmps;
378 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
379 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
403 " \\%"UVuf": %s ..-1 undeffing\n",
405 (i > *maxopenparen_p) ? "-1" : " "
411 /* restore the parens and associated vars at savestack position ix,
412 * but without popping the stack */
415 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
417 I32 tmpix = PL_savestack_ix;
418 PL_savestack_ix = ix;
419 regcppop(rex, maxopenparen_p);
420 PL_savestack_ix = tmpix;
423 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
426 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
428 /* Returns a boolean as to whether or not 'character' is a member of the
429 * Posix character class given by 'classnum' that should be equivalent to a
430 * value in the typedef '_char_class_number'.
432 * Ideally this could be replaced by a just an array of function pointers
433 * to the C library functions that implement the macros this calls.
434 * However, to compile, the precise function signatures are required, and
435 * these may vary from platform to to platform. To avoid having to figure
436 * out what those all are on each platform, I (khw) am using this method,
437 * which adds an extra layer of function call overhead (unless the C
438 * optimizer strips it away). But we don't particularly care about
439 * performance with locales anyway. */
441 switch ((_char_class_number) classnum) {
442 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
443 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
444 case _CC_ENUM_ASCII: return isASCII_LC(character);
445 case _CC_ENUM_BLANK: return isBLANK_LC(character);
446 case _CC_ENUM_CASED: return isLOWER_LC(character)
447 || isUPPER_LC(character);
448 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
449 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
450 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
451 case _CC_ENUM_LOWER: return isLOWER_LC(character);
452 case _CC_ENUM_PRINT: return isPRINT_LC(character);
453 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
454 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
455 case _CC_ENUM_SPACE: return isSPACE_LC(character);
456 case _CC_ENUM_UPPER: return isUPPER_LC(character);
457 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
458 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
459 default: /* VERTSPACE should never occur in locales */
460 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
463 assert(0); /* NOTREACHED */
468 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
470 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
471 * 'character' is a member of the Posix character class given by 'classnum'
472 * that should be equivalent to a value in the typedef
473 * '_char_class_number'.
475 * This just calls isFOO_lc on the code point for the character if it is in
476 * the range 0-255. Outside that range, all characters avoid Unicode
477 * rules, ignoring any locale. So use the Unicode function if this class
478 * requires a swash, and use the Unicode macro otherwise. */
480 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
482 if (UTF8_IS_INVARIANT(*character)) {
483 return isFOO_lc(classnum, *character);
485 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
486 return isFOO_lc(classnum,
487 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
490 if (classnum < _FIRST_NON_SWASH_CC) {
492 /* Initialize the swash unless done already */
493 if (! PL_utf8_swash_ptrs[classnum]) {
494 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
495 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
496 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
499 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
501 TRUE /* is UTF */ ));
504 switch ((_char_class_number) classnum) {
506 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
508 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
509 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
510 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
511 default: return 0; /* Things like CNTRL are always
515 assert(0); /* NOTREACHED */
520 * pregexec and friends
523 #ifndef PERL_IN_XSUB_RE
525 - pregexec - match a regexp against a string
528 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
529 char *strbeg, I32 minend, SV *screamer, U32 nosave)
530 /* stringarg: the point in the string at which to begin matching */
531 /* strend: pointer to null at end of string */
532 /* strbeg: real beginning of string */
533 /* minend: end of match must be >= minend bytes after stringarg. */
534 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
535 * itself is accessed via the pointers above */
536 /* nosave: For optimizations. */
538 PERL_ARGS_ASSERT_PREGEXEC;
541 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
542 nosave ? 0 : REXEC_COPY_STR);
547 * Need to implement the following flags for reg_anch:
549 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
551 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
552 * INTUIT_AUTORITATIVE_ML
553 * INTUIT_ONCE_NOML - Intuit can match in one location only.
556 * Another flag for this function: SECOND_TIME (so that float substrs
557 * with giant delta may be not rechecked).
560 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
562 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
563 Otherwise, only SvCUR(sv) is used to get strbeg. */
565 /* XXXX We assume that strpos is strbeg unless sv. */
567 /* XXXX Some places assume that there is a fixed substring.
568 An update may be needed if optimizer marks as "INTUITable"
569 RExen without fixed substrings. Similarly, it is assumed that
570 lengths of all the strings are no more than minlen, thus they
571 cannot come from lookahead.
572 (Or minlen should take into account lookahead.)
573 NOTE: Some of this comment is not correct. minlen does now take account
574 of lookahead/behind. Further research is required. -- demerphq
578 /* A failure to find a constant substring means that there is no need to make
579 an expensive call to REx engine, thus we celebrate a failure. Similarly,
580 finding a substring too deep into the string means that fewer calls to
581 regtry() should be needed.
583 REx compiler's optimizer found 4 possible hints:
584 a) Anchored substring;
586 c) Whether we are anchored (beginning-of-line or \G);
587 d) First node (of those at offset 0) which may distinguish positions;
588 We use a)b)d) and multiline-part of c), and try to find a position in the
589 string which does not contradict any of them.
592 /* Most of decisions we do here should have been done at compile time.
593 The nodes of the REx which we used for the search should have been
594 deleted from the finite automaton. */
597 * rx: the regex to match against
598 * sv: the SV being matched: only used for utf8 flag; the string
599 * itself is accessed via the pointers below. Note that on
600 * something like an overloaded SV, SvPOK(sv) may be false
601 * and the string pointers may point to something unrelated to
603 * strbeg: real beginning of string
604 * strpos: the point in the string at which to begin matching
605 * strend: pointer to the byte following the last char of the string
606 * flags currently unused; set to 0
607 * data: currently unused; set to NULL
611 Perl_re_intuit_start(pTHX_
614 const char * const strbeg,
618 re_scream_pos_data *data)
621 struct regexp *const prog = ReANY(rx);
623 /* Should be nonnegative! */
628 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
630 char *other_last = NULL; /* other substr checked before this */
631 char *check_at = NULL; /* check substr found at this pos */
632 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
633 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
634 RXi_GET_DECL(prog,progi);
635 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
636 regmatch_info *const reginfo = ®info_buf;
638 const char * const i_strpos = strpos;
640 GET_RE_DEBUG_FLAGS_DECL;
642 PERL_ARGS_ASSERT_RE_INTUIT_START;
643 PERL_UNUSED_ARG(flags);
644 PERL_UNUSED_ARG(data);
646 /* CHR_DIST() would be more correct here but it makes things slow. */
647 if (prog->minlen > strend - strpos) {
648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
649 "String too short... [re_intuit_start]\n"));
653 reginfo->is_utf8_target = cBOOL(utf8_target);
654 reginfo->info_aux = NULL;
655 reginfo->strbeg = strbeg;
656 reginfo->strend = strend;
657 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
659 /* not actually used within intuit, but zero for safety anyway */
660 reginfo->poscache_maxiter = 0;
663 if (!prog->check_utf8 && prog->check_substr)
664 to_utf8_substr(prog);
665 check = prog->check_utf8;
667 if (!prog->check_substr && prog->check_utf8) {
668 if (! to_byte_substr(prog)) {
669 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
672 check = prog->check_substr;
674 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
675 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
676 || ( (prog->extflags & RXf_ANCH_BOL)
677 && !multiline ) ); /* Check after \n? */
680 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
681 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
682 && (strpos != strbeg)) {
683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
686 if (prog->check_offset_min == prog->check_offset_max
687 && !(prog->extflags & RXf_CANY_SEEN)
688 && ! multiline) /* /m can cause \n's to match that aren't
689 accounted for in the string max length.
690 See [perl #115242] */
692 /* Substring at constant offset from beg-of-str... */
695 s = HOP3c(strpos, prog->check_offset_min, strend);
698 slen = SvCUR(check); /* >= 1 */
700 if ( strend - s > slen || strend - s < slen - 1
701 || (strend - s == slen && strend[-1] != '\n')) {
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
705 /* Now should match s[0..slen-2] */
707 if (slen && (*SvPVX_const(check) != *s
709 && memNE(SvPVX_const(check), s, slen)))) {
711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
715 else if (*SvPVX_const(check) != *s
716 || ((slen = SvCUR(check)) > 1
717 && memNE(SvPVX_const(check), s, slen)))
720 goto success_at_start;
723 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
725 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
726 end_shift = prog->check_end_shift;
729 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
730 - (SvTAIL(check) != 0);
731 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
733 if (end_shift < eshift)
737 else { /* Can match at random position */
740 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
741 end_shift = prog->check_end_shift;
743 /* end shift should be non negative here */
746 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
748 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
749 (IV)end_shift, RX_PRECOMP(prog));
753 /* Find a possible match in the region s..strend by looking for
754 the "check" substring in the region corrected by start/end_shift. */
757 I32 srch_start_shift = start_shift;
758 I32 srch_end_shift = end_shift;
761 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
762 srch_end_shift -= ((strbeg - s) - srch_start_shift);
763 srch_start_shift = strbeg - s;
765 DEBUG_OPTIMISE_MORE_r({
766 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
767 (IV)prog->check_offset_min,
768 (IV)srch_start_shift,
770 (IV)prog->check_end_shift);
773 if (prog->extflags & RXf_CANY_SEEN) {
774 start_point= (U8*)(s + srch_start_shift);
775 end_point= (U8*)(strend - srch_end_shift);
777 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
778 end_point= HOP3(strend, -srch_end_shift, strbeg);
780 DEBUG_OPTIMISE_MORE_r({
781 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
782 (int)(end_point - start_point),
783 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
787 s = fbm_instr( start_point, end_point,
788 check, multiline ? FBMrf_MULTILINE : 0);
790 /* Update the count-of-usability, remove useless subpatterns,
794 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
795 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
796 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
797 (s ? "Found" : "Did not find"),
798 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
799 ? "anchored" : "floating"),
802 (s ? " at offset " : "...\n") );
807 /* Finish the diagnostic message */
808 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
810 /* XXX dmq: first branch is for positive lookbehind...
811 Our check string is offset from the beginning of the pattern.
812 So we need to do any stclass tests offset forward from that
821 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
822 Start with the other substr.
823 XXXX no SCREAM optimization yet - and a very coarse implementation
824 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
825 *always* match. Probably should be marked during compile...
826 Probably it is right to do no SCREAM here...
829 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
830 : (prog->float_substr && prog->anchored_substr))
832 /* Take into account the "other" substring. */
833 /* XXXX May be hopelessly wrong for UTF... */
836 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
839 char * const last = HOP3c(s, -start_shift, strbeg);
841 char * const saved_s = s;
844 t = s - prog->check_offset_max;
845 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
847 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
852 t = HOP3c(t, prog->anchored_offset, strend);
853 if (t < other_last) /* These positions already checked */
855 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
858 /* XXXX It is not documented what units *_offsets are in.
859 We assume bytes, but this is clearly wrong.
860 Meaning this code needs to be carefully reviewed for errors.
864 /* On end-of-str: see comment below. */
865 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
866 if (must == &PL_sv_undef) {
868 DEBUG_r(must = prog->anchored_utf8); /* for debug */
873 HOP3(HOP3(last1, prog->anchored_offset, strend)
874 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
876 multiline ? FBMrf_MULTILINE : 0
879 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
880 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
881 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
882 (s ? "Found" : "Contradicts"),
883 quoted, RE_SV_TAIL(must));
888 if (last1 >= last2) {
889 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
890 ", giving up...\n"));
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
894 ", trying floating at offset %ld...\n",
895 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
896 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
897 s = HOP3c(last, 1, strend);
901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
902 (long)(s - i_strpos)));
903 t = HOP3c(s, -prog->anchored_offset, strbeg);
904 other_last = HOP3c(s, 1, strend);
912 else { /* Take into account the floating substring. */
914 char * const saved_s = s;
917 t = HOP3c(s, -start_shift, strbeg);
919 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
920 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
921 last = HOP3c(t, prog->float_max_offset, strend);
922 s = HOP3c(t, prog->float_min_offset, strend);
925 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
926 must = utf8_target ? prog->float_utf8 : prog->float_substr;
927 /* fbm_instr() takes into account exact value of end-of-str
928 if the check is SvTAIL(ed). Since false positives are OK,
929 and end-of-str is not later than strend we are OK. */
930 if (must == &PL_sv_undef) {
932 DEBUG_r(must = prog->float_utf8); /* for debug message */
935 s = fbm_instr((unsigned char*)s,
936 (unsigned char*)last + SvCUR(must)
938 must, multiline ? FBMrf_MULTILINE : 0);
940 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
941 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
942 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
943 (s ? "Found" : "Contradicts"),
944 quoted, RE_SV_TAIL(must));
948 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
949 ", giving up...\n"));
952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
953 ", trying anchored starting at offset %ld...\n",
954 (long)(saved_s + 1 - i_strpos)));
956 s = HOP3c(t, 1, strend);
960 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
961 (long)(s - i_strpos)));
962 other_last = s; /* Fix this later. --Hugo */
972 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
974 DEBUG_OPTIMISE_MORE_r(
975 PerlIO_printf(Perl_debug_log,
976 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
977 (IV)prog->check_offset_min,
978 (IV)prog->check_offset_max,
986 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
988 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
991 /* Fixed substring is found far enough so that the match
992 cannot start at strpos. */
994 if (ml_anch && t[-1] != '\n') {
995 /* Eventually fbm_*() should handle this, but often
996 anchored_offset is not 0, so this check will not be wasted. */
997 /* XXXX In the code below we prefer to look for "^" even in
998 presence of anchored substrings. And we search even
999 beyond the found float position. These pessimizations
1000 are historical artefacts only. */
1002 while (t < strend - prog->minlen) {
1004 if (t < check_at - prog->check_offset_min) {
1005 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1006 /* Since we moved from the found position,
1007 we definitely contradict the found anchored
1008 substr. Due to the above check we do not
1009 contradict "check" substr.
1010 Thus we can arrive here only if check substr
1011 is float. Redo checking for "other"=="fixed".
1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1015 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1016 goto do_other_anchored;
1018 /* We don't contradict the found floating substring. */
1019 /* XXXX Why not check for STCLASS? */
1021 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1022 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1025 /* Position contradicts check-string */
1026 /* XXXX probably better to look for check-string
1027 than for "\n", so one should lower the limit for t? */
1028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1029 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1030 other_last = strpos = s = t + 1;
1035 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1036 PL_colors[0], PL_colors[1]));
1040 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1041 PL_colors[0], PL_colors[1]));
1045 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1048 /* The found string does not prohibit matching at strpos,
1049 - no optimization of calling REx engine can be performed,
1050 unless it was an MBOL and we are not after MBOL,
1051 or a future STCLASS check will fail this. */
1053 /* Even in this situation we may use MBOL flag if strpos is offset
1054 wrt the start of the string. */
1055 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1056 /* May be due to an implicit anchor of m{.*foo} */
1057 && !(prog->intflags & PREGf_IMPLICIT))
1062 DEBUG_EXECUTE_r( if (ml_anch)
1063 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1064 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1067 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1069 prog->check_utf8 /* Could be deleted already */
1070 && --BmUSEFUL(prog->check_utf8) < 0
1071 && (prog->check_utf8 == prog->float_utf8)
1073 prog->check_substr /* Could be deleted already */
1074 && --BmUSEFUL(prog->check_substr) < 0
1075 && (prog->check_substr == prog->float_substr)
1078 /* If flags & SOMETHING - do not do it many times on the same match */
1079 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1080 /* XXX Does the destruction order has to change with utf8_target? */
1081 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1082 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1083 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1084 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1085 check = NULL; /* abort */
1087 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1088 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1089 if (prog->intflags & PREGf_IMPLICIT)
1090 prog->extflags &= ~RXf_ANCH_MBOL;
1091 /* XXXX This is a remnant of the old implementation. It
1092 looks wasteful, since now INTUIT can use many
1093 other heuristics. */
1094 prog->extflags &= ~RXf_USE_INTUIT;
1095 /* XXXX What other flags might need to be cleared in this branch? */
1101 /* Last resort... */
1102 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1103 /* trie stclasses are too expensive to use here, we are better off to
1104 leave it to regmatch itself */
1105 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1106 /* minlen == 0 is possible if regstclass is \b or \B,
1107 and the fixed substr is ''$.
1108 Since minlen is already taken into account, s+1 is before strend;
1109 accidentally, minlen >= 1 guaranties no false positives at s + 1
1110 even for \b or \B. But (minlen? 1 : 0) below assumes that
1111 regstclass does not come from lookahead... */
1112 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1113 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1114 const U8* const str = (U8*)STRING(progi->regstclass);
1115 /* XXX this value could be pre-computed */
1116 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1117 ? (reginfo->is_utf8_pat
1118 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1119 : STR_LEN(progi->regstclass))
1122 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1123 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1124 else if (prog->float_substr || prog->float_utf8)
1125 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1129 if (checked_upto < s)
1131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1132 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1135 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1141 const char *what = NULL;
1143 if (endpos == strend) {
1144 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1145 "Could not match STCLASS...\n") );
1148 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1149 "This position contradicts STCLASS...\n") );
1150 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1152 checked_upto = HOPBACKc(endpos, start_shift);
1153 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1154 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1155 /* Contradict one of substrings */
1156 if (prog->anchored_substr || prog->anchored_utf8) {
1157 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1158 DEBUG_EXECUTE_r( what = "anchored" );
1160 s = HOP3c(t, 1, strend);
1161 if (s + start_shift + end_shift > strend) {
1162 /* XXXX Should be taken into account earlier? */
1163 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1164 "Could not match STCLASS...\n") );
1169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1170 "Looking for %s substr starting at offset %ld...\n",
1171 what, (long)(s + start_shift - i_strpos)) );
1174 /* Have both, check_string is floating */
1175 if (t + start_shift >= check_at) /* Contradicts floating=check */
1176 goto retry_floating_check;
1177 /* Recheck anchored substring, but not floating... */
1181 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1182 "Looking for anchored substr starting at offset %ld...\n",
1183 (long)(other_last - i_strpos)) );
1184 goto do_other_anchored;
1186 /* Another way we could have checked stclass at the
1187 current position only: */
1192 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1193 "Looking for /%s^%s/m starting at offset %ld...\n",
1194 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1197 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1199 /* Check is floating substring. */
1200 retry_floating_check:
1201 t = check_at - start_shift;
1202 DEBUG_EXECUTE_r( what = "floating" );
1203 goto hop_and_restart;
1206 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1207 "By STCLASS: moving %ld --> %ld\n",
1208 (long)(t - i_strpos), (long)(s - i_strpos))
1212 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1213 "Does not contradict STCLASS...\n");
1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1219 PL_colors[4], (check ? "Guessed" : "Giving up"),
1220 PL_colors[5], (long)(s - i_strpos)) );
1223 fail_finish: /* Substring not found */
1224 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1225 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1227 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1228 PL_colors[4], PL_colors[5]));
1232 #define DECL_TRIE_TYPE(scan) \
1233 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1234 trie_type = ((scan->flags == EXACT) \
1235 ? (utf8_target ? trie_utf8 : trie_plain) \
1236 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1238 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1241 switch (trie_type) { \
1242 case trie_utf8_fold: \
1243 if ( foldlen>0 ) { \
1244 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1250 len = UTF8SKIP(uc); \
1251 skiplen = UNISKIP( uvc ); \
1252 foldlen -= skiplen; \
1253 uscan = foldbuf + skiplen; \
1256 case trie_latin_utf8_fold: \
1257 if ( foldlen>0 ) { \
1258 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1264 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
1265 skiplen = UNISKIP( uvc ); \
1266 foldlen -= skiplen; \
1267 uscan = foldbuf + skiplen; \
1271 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1278 charid = trie->charmap[ uvc ]; \
1282 if (widecharmap) { \
1283 SV** const svpp = hv_fetch(widecharmap, \
1284 (char*)&uvc, sizeof(UV), 0); \
1286 charid = (U16)SvIV(*svpp); \
1291 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1295 && (ln == 1 || folder(s, pat_string, ln)) \
1296 && (reginfo->intuit || regtry(reginfo, &s)) )\
1302 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1304 while (s < strend) { \
1310 #define REXEC_FBC_SCAN(CoDe) \
1312 while (s < strend) { \
1318 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1319 REXEC_FBC_UTF8_SCAN( \
1321 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1330 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1333 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1342 #define REXEC_FBC_TRYIT \
1343 if ((reginfo->intuit || regtry(reginfo, &s))) \
1346 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1347 if (utf8_target) { \
1348 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1351 REXEC_FBC_CLASS_SCAN(CoNd); \
1354 #define DUMP_EXEC_POS(li,s,doutf8) \
1355 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1359 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1360 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1361 tmp = TEST_NON_UTF8(tmp); \
1362 REXEC_FBC_UTF8_SCAN( \
1363 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1372 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1373 if (s == reginfo->strbeg) { \
1377 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1378 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1381 LOAD_UTF8_CHARCLASS_ALNUM(); \
1382 REXEC_FBC_UTF8_SCAN( \
1383 if (tmp == ! (TeSt2_UtF8)) { \
1392 /* The only difference between the BOUND and NBOUND cases is that
1393 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1394 * NBOUND. This is accomplished by passing it in either the if or else clause,
1395 * with the other one being empty */
1396 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1399 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1402 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1403 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1405 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1406 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1409 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1410 * be passed in completely with the variable name being tested, which isn't
1411 * such a clean interface, but this is easier to read than it was before. We
1412 * are looking for the boundary (or non-boundary between a word and non-word
1413 * character. The utf8 and non-utf8 cases have the same logic, but the details
1414 * must be different. Find the "wordness" of the character just prior to this
1415 * one, and compare it with the wordness of this one. If they differ, we have
1416 * a boundary. At the beginning of the string, pretend that the previous
1417 * character was a new-line */
1418 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1419 if (utf8_target) { \
1422 else { /* Not utf8 */ \
1423 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1424 tmp = TEST_NON_UTF8(tmp); \
1426 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1435 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1438 /* We know what class REx starts with. Try to find this position... */
1439 /* if reginfo->intuit, its a dryrun */
1440 /* annoyingly all the vars in this routine have different names from their counterparts
1441 in regmatch. /grrr */
1444 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1445 const char *strend, regmatch_info *reginfo)
1448 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1449 char *pat_string; /* The pattern's exactish string */
1450 char *pat_end; /* ptr to end char of pat_string */
1451 re_fold_t folder; /* Function for computing non-utf8 folds */
1452 const U8 *fold_array; /* array for folding ords < 256 */
1458 I32 tmp = 1; /* Scratch variable? */
1459 const bool utf8_target = reginfo->is_utf8_target;
1460 UV utf8_fold_flags = 0;
1461 const bool is_utf8_pat = reginfo->is_utf8_pat;
1462 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1463 with a result inverts that result, as 0^1 =
1465 _char_class_number classnum;
1467 RXi_GET_DECL(prog,progi);
1469 PERL_ARGS_ASSERT_FIND_BYCLASS;
1471 /* We know what class it must start with. */
1474 case ANYOF_SYNTHETIC:
1475 case ANYOF_WARN_SUPER:
1477 REXEC_FBC_UTF8_CLASS_SCAN(
1478 reginclass(prog, c, (U8*)s, utf8_target));
1481 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1486 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1494 if (is_utf8_pat || utf8_target) {
1495 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1496 goto do_exactf_utf8;
1498 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1499 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1500 goto do_exactf_non_utf8; /* isn't dealt with by these */
1505 /* regcomp.c already folded this if pattern is in UTF-8 */
1506 utf8_fold_flags = 0;
1507 goto do_exactf_utf8;
1509 fold_array = PL_fold;
1511 goto do_exactf_non_utf8;
1514 if (is_utf8_pat || utf8_target) {
1515 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1516 goto do_exactf_utf8;
1518 fold_array = PL_fold_locale;
1519 folder = foldEQ_locale;
1520 goto do_exactf_non_utf8;
1524 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1526 goto do_exactf_utf8;
1528 case EXACTFU_TRICKYFOLD:
1530 if (is_utf8_pat || utf8_target) {
1531 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1532 goto do_exactf_utf8;
1535 /* Any 'ss' in the pattern should have been replaced by regcomp,
1536 * so we don't have to worry here about this single special case
1537 * in the Latin1 range */
1538 fold_array = PL_fold_latin1;
1539 folder = foldEQ_latin1;
1543 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1544 are no glitches with fold-length differences
1545 between the target string and pattern */
1547 /* The idea in the non-utf8 EXACTF* cases is to first find the
1548 * first character of the EXACTF* node and then, if necessary,
1549 * case-insensitively compare the full text of the node. c1 is the
1550 * first character. c2 is its fold. This logic will not work for
1551 * Unicode semantics and the german sharp ss, which hence should
1552 * not be compiled into a node that gets here. */
1553 pat_string = STRING(c);
1554 ln = STR_LEN(c); /* length to match in octets/bytes */
1556 /* We know that we have to match at least 'ln' bytes (which is the
1557 * same as characters, since not utf8). If we have to match 3
1558 * characters, and there are only 2 availabe, we know without
1559 * trying that it will fail; so don't start a match past the
1560 * required minimum number from the far end */
1561 e = HOP3c(strend, -((I32)ln), s);
1563 if (reginfo->intuit && e < s) {
1564 e = s; /* Due to minlen logic of intuit() */
1568 c2 = fold_array[c1];
1569 if (c1 == c2) { /* If char and fold are the same */
1570 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1573 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1581 /* If one of the operands is in utf8, we can't use the simpler folding
1582 * above, due to the fact that many different characters can have the
1583 * same fold, or portion of a fold, or different- length fold */
1584 pat_string = STRING(c);
1585 ln = STR_LEN(c); /* length to match in octets/bytes */
1586 pat_end = pat_string + ln;
1587 lnc = is_utf8_pat /* length to match in characters */
1588 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1591 /* We have 'lnc' characters to match in the pattern, but because of
1592 * multi-character folding, each character in the target can match
1593 * up to 3 characters (Unicode guarantees it will never exceed
1594 * this) if it is utf8-encoded; and up to 2 if not (based on the
1595 * fact that the Latin 1 folds are already determined, and the
1596 * only multi-char fold in that range is the sharp-s folding to
1597 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1598 * string character. Adjust lnc accordingly, rounding up, so that
1599 * if we need to match at least 4+1/3 chars, that really is 5. */
1600 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1601 lnc = (lnc + expansion - 1) / expansion;
1603 /* As in the non-UTF8 case, if we have to match 3 characters, and
1604 * only 2 are left, it's guaranteed to fail, so don't start a
1605 * match that would require us to go beyond the end of the string
1607 e = HOP3c(strend, -((I32)lnc), s);
1609 if (reginfo->intuit && e < s) {
1610 e = s; /* Due to minlen logic of intuit() */
1613 /* XXX Note that we could recalculate e to stop the loop earlier,
1614 * as the worst case expansion above will rarely be met, and as we
1615 * go along we would usually find that e moves further to the left.
1616 * This would happen only after we reached the point in the loop
1617 * where if there were no expansion we should fail. Unclear if
1618 * worth the expense */
1621 char *my_strend= (char *)strend;
1622 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1623 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1624 && (reginfo->intuit || regtry(reginfo, &s)) )
1628 s += (utf8_target) ? UTF8SKIP(s) : 1;
1633 RXp_MATCH_TAINTED_on(prog);
1634 FBC_BOUND(isWORDCHAR_LC,
1635 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1636 isWORDCHAR_LC_utf8((U8*)s));
1639 RXp_MATCH_TAINTED_on(prog);
1640 FBC_NBOUND(isWORDCHAR_LC,
1641 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1642 isWORDCHAR_LC_utf8((U8*)s));
1645 FBC_BOUND(isWORDCHAR,
1646 isWORDCHAR_uni(tmp),
1647 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1650 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1652 isWORDCHAR_A((U8*)s));
1655 FBC_NBOUND(isWORDCHAR,
1656 isWORDCHAR_uni(tmp),
1657 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1660 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1662 isWORDCHAR_A((U8*)s));
1665 FBC_BOUND(isWORDCHAR_L1,
1666 isWORDCHAR_uni(tmp),
1667 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1670 FBC_NBOUND(isWORDCHAR_L1,
1671 isWORDCHAR_uni(tmp),
1672 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1675 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1676 is_LNBREAK_latin1_safe(s, strend)
1680 /* The argument to all the POSIX node types is the class number to pass to
1681 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1688 RXp_MATCH_TAINTED_on(prog);
1689 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1690 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1705 /* The complement of something that matches only ASCII matches all
1706 * UTF-8 variant code points, plus everything in ASCII that isn't
1708 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1709 || ! _generic_isCC_A(*s, FLAGS(c)));
1718 /* Don't need to worry about utf8, as it can match only a single
1719 * byte invariant character. */
1720 REXEC_FBC_CLASS_SCAN(
1721 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1729 if (! utf8_target) {
1730 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1736 classnum = (_char_class_number) FLAGS(c);
1737 if (classnum < _FIRST_NON_SWASH_CC) {
1738 while (s < strend) {
1740 /* We avoid loading in the swash as long as possible, but
1741 * should we have to, we jump to a separate loop. This
1742 * extra 'if' statement is what keeps this code from being
1743 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1744 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1745 goto found_above_latin1;
1747 if ((UTF8_IS_INVARIANT(*s)
1748 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1750 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1751 && to_complement ^ cBOOL(
1752 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1755 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1767 else switch (classnum) { /* These classes are implemented as
1769 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1770 revert the change of \v matching this */
1773 case _CC_ENUM_PSXSPC:
1774 REXEC_FBC_UTF8_CLASS_SCAN(
1775 to_complement ^ cBOOL(isSPACE_utf8(s)));
1778 case _CC_ENUM_BLANK:
1779 REXEC_FBC_UTF8_CLASS_SCAN(
1780 to_complement ^ cBOOL(isBLANK_utf8(s)));
1783 case _CC_ENUM_XDIGIT:
1784 REXEC_FBC_UTF8_CLASS_SCAN(
1785 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1788 case _CC_ENUM_VERTSPACE:
1789 REXEC_FBC_UTF8_CLASS_SCAN(
1790 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1793 case _CC_ENUM_CNTRL:
1794 REXEC_FBC_UTF8_CLASS_SCAN(
1795 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1799 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1800 assert(0); /* NOTREACHED */
1805 found_above_latin1: /* Here we have to load a swash to get the result
1806 for the current code point */
1807 if (! PL_utf8_swash_ptrs[classnum]) {
1808 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1809 PL_utf8_swash_ptrs[classnum] =
1810 _core_swash_init("utf8", swash_property_names[classnum],
1811 &PL_sv_undef, 1, 0, NULL, &flags);
1814 /* This is a copy of the loop above for swash classes, though using the
1815 * FBC macro instead of being expanded out. Since we've loaded the
1816 * swash, we don't have to check for that each time through the loop */
1817 REXEC_FBC_UTF8_CLASS_SCAN(
1818 to_complement ^ cBOOL(_generic_utf8(
1821 swash_fetch(PL_utf8_swash_ptrs[classnum],
1829 /* what trie are we using right now */
1830 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1831 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1832 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1834 const char *last_start = strend - trie->minlen;
1836 const char *real_start = s;
1838 STRLEN maxlen = trie->maxlen;
1840 U8 **points; /* map of where we were in the input string
1841 when reading a given char. For ASCII this
1842 is unnecessary overhead as the relationship
1843 is always 1:1, but for Unicode, especially
1844 case folded Unicode this is not true. */
1845 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1849 GET_RE_DEBUG_FLAGS_DECL;
1851 /* We can't just allocate points here. We need to wrap it in
1852 * an SV so it gets freed properly if there is a croak while
1853 * running the match */
1856 sv_points=newSV(maxlen * sizeof(U8 *));
1857 SvCUR_set(sv_points,
1858 maxlen * sizeof(U8 *));
1859 SvPOK_on(sv_points);
1860 sv_2mortal(sv_points);
1861 points=(U8**)SvPV_nolen(sv_points );
1862 if ( trie_type != trie_utf8_fold
1863 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1866 bitmap=(U8*)trie->bitmap;
1868 bitmap=(U8*)ANYOF_BITMAP(c);
1870 /* this is the Aho-Corasick algorithm modified a touch
1871 to include special handling for long "unknown char" sequences.
1872 The basic idea being that we use AC as long as we are dealing
1873 with a possible matching char, when we encounter an unknown char
1874 (and we have not encountered an accepting state) we scan forward
1875 until we find a legal starting char.
1876 AC matching is basically that of trie matching, except that when
1877 we encounter a failing transition, we fall back to the current
1878 states "fail state", and try the current char again, a process
1879 we repeat until we reach the root state, state 1, or a legal
1880 transition. If we fail on the root state then we can either
1881 terminate if we have reached an accepting state previously, or
1882 restart the entire process from the beginning if we have not.
1885 while (s <= last_start) {
1886 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1894 U8 *uscan = (U8*)NULL;
1895 U8 *leftmost = NULL;
1897 U32 accepted_word= 0;
1901 while ( state && uc <= (U8*)strend ) {
1903 U32 word = aho->states[ state ].wordnum;
1907 DEBUG_TRIE_EXECUTE_r(
1908 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1909 dump_exec_pos( (char *)uc, c, strend, real_start,
1910 (char *)uc, utf8_target );
1911 PerlIO_printf( Perl_debug_log,
1912 " Scanning for legal start char...\n");
1916 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1920 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926 if (uc >(U8*)last_start) break;
1930 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1931 if (!leftmost || lpos < leftmost) {
1932 DEBUG_r(accepted_word=word);
1938 points[pointpos++ % maxlen]= uc;
1939 if (foldlen || uc < (U8*)strend) {
1940 REXEC_TRIE_READ_CHAR(trie_type, trie,
1942 uscan, len, uvc, charid, foldlen,
1944 DEBUG_TRIE_EXECUTE_r({
1945 dump_exec_pos( (char *)uc, c, strend,
1946 real_start, s, utf8_target);
1947 PerlIO_printf(Perl_debug_log,
1948 " Charid:%3u CP:%4"UVxf" ",
1960 word = aho->states[ state ].wordnum;
1962 base = aho->states[ state ].trans.base;
1964 DEBUG_TRIE_EXECUTE_r({
1966 dump_exec_pos( (char *)uc, c, strend, real_start,
1968 PerlIO_printf( Perl_debug_log,
1969 "%sState: %4"UVxf", word=%"UVxf,
1970 failed ? " Fail transition to " : "",
1971 (UV)state, (UV)word);
1977 ( ((offset = base + charid
1978 - 1 - trie->uniquecharcount)) >= 0)
1979 && ((U32)offset < trie->lasttrans)
1980 && trie->trans[offset].check == state
1981 && (tmp=trie->trans[offset].next))
1983 DEBUG_TRIE_EXECUTE_r(
1984 PerlIO_printf( Perl_debug_log," - legal\n"));
1989 DEBUG_TRIE_EXECUTE_r(
1990 PerlIO_printf( Perl_debug_log," - fail\n"));
1992 state = aho->fail[state];
1996 /* we must be accepting here */
1997 DEBUG_TRIE_EXECUTE_r(
1998 PerlIO_printf( Perl_debug_log," - accepting\n"));
2007 if (!state) state = 1;
2010 if ( aho->states[ state ].wordnum ) {
2011 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2012 if (!leftmost || lpos < leftmost) {
2013 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2018 s = (char*)leftmost;
2019 DEBUG_TRIE_EXECUTE_r({
2021 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2022 (UV)accepted_word, (IV)(s - real_start)
2025 if (reginfo->intuit || regtry(reginfo, &s)) {
2031 DEBUG_TRIE_EXECUTE_r({
2032 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2035 DEBUG_TRIE_EXECUTE_r(
2036 PerlIO_printf( Perl_debug_log,"No match.\n"));
2045 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2053 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2054 * flags have same meanings as with regexec_flags() */
2057 Perl_reg_set_capture_string(pTHX_ REGEXP * const rx,
2064 struct regexp *const prog = ReANY(rx);
2066 PERL_ARGS_ASSERT_REG_SET_CAPTURE_STRING;
2068 if (flags & REXEC_COPY_STR) {
2072 PerlIO_printf(Perl_debug_log,
2073 "Copy on write: regexp capture, type %d\n",
2076 RX_MATCH_COPY_FREE(rx);
2077 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2078 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2079 assert (SvPOKp(prog->saved_copy));
2080 prog->sublen = strend - strbeg;
2081 prog->suboffset = 0;
2082 prog->subcoffset = 0;
2087 I32 max = strend - strbeg;
2090 if ( (flags & REXEC_COPY_SKIP_POST)
2091 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2092 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2093 ) { /* don't copy $' part of string */
2096 /* calculate the right-most part of the string covered
2097 * by a capture. Due to look-ahead, this may be to
2098 * the right of $&, so we have to scan all captures */
2099 while (n <= prog->lastparen) {
2100 if (prog->offs[n].end > max)
2101 max = prog->offs[n].end;
2105 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2106 ? prog->offs[0].start
2108 assert(max >= 0 && max <= strend - strbeg);
2111 if ( (flags & REXEC_COPY_SKIP_PRE)
2112 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2113 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2114 ) { /* don't copy $` part of string */
2117 /* calculate the left-most part of the string covered
2118 * by a capture. Due to look-behind, this may be to
2119 * the left of $&, so we have to scan all captures */
2120 while (min && n <= prog->lastparen) {
2121 if ( prog->offs[n].start != -1
2122 && prog->offs[n].start < min)
2124 min = prog->offs[n].start;
2128 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2129 && min > prog->offs[0].end
2131 min = prog->offs[0].end;
2135 assert(min >= 0 && min <= max && min <= strend - strbeg);
2138 if (RX_MATCH_COPIED(rx)) {
2139 if (sublen > prog->sublen)
2141 (char*)saferealloc(prog->subbeg, sublen+1);
2144 prog->subbeg = (char*)safemalloc(sublen+1);
2145 Copy(strbeg + min, prog->subbeg, sublen, char);
2146 prog->subbeg[sublen] = '\0';
2147 prog->suboffset = min;
2148 prog->sublen = sublen;
2149 RX_MATCH_COPIED_on(rx);
2151 prog->subcoffset = prog->suboffset;
2152 if (prog->suboffset && utf8_target) {
2153 /* Convert byte offset to chars.
2154 * XXX ideally should only compute this if @-/@+
2155 * has been seen, a la PL_sawampersand ??? */
2157 /* If there's a direct correspondence between the
2158 * string which we're matching and the original SV,
2159 * then we can use the utf8 len cache associated with
2160 * the SV. In particular, it means that under //g,
2161 * sv_pos_b2u() will use the previously cached
2162 * position to speed up working out the new length of
2163 * subcoffset, rather than counting from the start of
2164 * the string each time. This stops
2165 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2166 * from going quadratic */
2167 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2168 sv_pos_b2u(sv, &(prog->subcoffset));
2170 prog->subcoffset = utf8_length((U8*)strbeg,
2171 (U8*)(strbeg+prog->suboffset));
2175 RX_MATCH_COPY_FREE(rx);
2176 prog->subbeg = strbeg;
2177 prog->suboffset = 0;
2178 prog->subcoffset = 0;
2179 prog->sublen = strend - strbeg;
2187 - regexec_flags - match a regexp against a string
2190 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2191 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2192 /* stringarg: the point in the string at which to begin matching */
2193 /* strend: pointer to null at end of string */
2194 /* strbeg: real beginning of string */
2195 /* minend: end of match must be >= minend bytes after stringarg. */
2196 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2197 * itself is accessed via the pointers above */
2198 /* data: May be used for some additional optimizations.
2199 Currently its only used, with a U32 cast, for transmitting
2200 the ganch offset when doing a /g match. This will change */
2201 /* nosave: For optimizations. */
2205 struct regexp *const prog = ReANY(rx);
2208 char *startpos = stringarg;
2209 I32 minlen; /* must match at least this many chars */
2210 I32 dontbother = 0; /* how many characters not to try at end */
2211 I32 end_shift = 0; /* Same for the end. */ /* CC */
2212 I32 scream_pos = -1; /* Internal iterator of scream. */
2213 char *scream_olds = NULL;
2214 const bool utf8_target = cBOOL(DO_UTF8(sv));
2216 RXi_GET_DECL(prog,progi);
2217 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2218 regmatch_info *const reginfo = ®info_buf;
2219 regexp_paren_pair *swap = NULL;
2221 GET_RE_DEBUG_FLAGS_DECL;
2223 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2224 PERL_UNUSED_ARG(data);
2226 /* Be paranoid... */
2227 if (prog == NULL || startpos == NULL) {
2228 Perl_croak(aTHX_ "NULL regexp parameter");
2233 debug_start_match(rx, utf8_target, startpos, strend,
2237 if ((RX_EXTFLAGS(rx) & RXf_USE_INTUIT)
2238 && !(flags & REXEC_CHECKED))
2240 stringarg = re_intuit_start(rx, sv, strbeg, stringarg, strend,
2245 if (RX_EXTFLAGS(rx) & RXf_CHECK_ALL) {
2246 /* we can match based purely on the result of INTUIT.
2247 * Set up captures etc just for $& and $-[0]
2248 * (an intuit-only match wont have $1,$2,..) */
2249 assert(!prog->nparens);
2250 /* match via INTUIT shouldn't have any captures.
2251 * Let @-, @+, $^N know */
2252 prog->lastparen = prog->lastcloseparen = 0;
2253 RX_MATCH_UTF8_set(rx, utf8_target);
2254 if ( !(flags & REXEC_NOT_FIRST) )
2255 Perl_reg_set_capture_string(aTHX_ rx,
2257 sv, flags, utf8_target);
2259 prog->offs[0].start = stringarg - strbeg;
2260 prog->offs[0].end = utf8_target
2261 ? (char*)utf8_hop((U8*)stringarg, prog->minlenret) - strbeg
2262 : stringarg - strbeg + prog->minlenret;
2268 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2269 * which will call destuctors to reset PL_regmatch_state, free higher
2270 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2271 * regmatch_info_aux_eval */
2273 oldsave = PL_savestack_ix;
2275 multiline = prog->extflags & RXf_PMf_MULTILINE;
2276 minlen = prog->minlen;
2278 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2279 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2280 "String too short [regexec_flags]...\n"));
2284 /* Check validity of program. */
2285 if (UCHARAT(progi->program) != REG_MAGIC) {
2286 Perl_croak(aTHX_ "corrupted regexp program");
2289 RX_MATCH_TAINTED_off(rx);
2291 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2292 reginfo->intuit = 0;
2293 reginfo->is_utf8_target = cBOOL(utf8_target);
2294 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2295 reginfo->warned = FALSE;
2296 reginfo->strbeg = strbeg;
2298 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2299 reginfo->strend = strend;
2300 /* see how far we have to get to not match where we matched before */
2301 reginfo->till = startpos+minend;
2303 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2304 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2305 S_cleanup_regmatch_info_aux has executed (registered by
2306 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2307 magic belonging to this SV.
2308 Not newSVsv, either, as it does not COW.
2310 reginfo->sv = newSV(0);
2311 sv_setsv(reginfo->sv, sv);
2312 SAVEFREESV(reginfo->sv);
2315 /* reserve next 2 or 3 slots in PL_regmatch_state:
2316 * slot N+0: may currently be in use: skip it
2317 * slot N+1: use for regmatch_info_aux struct
2318 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2319 * slot N+3: ready for use by regmatch()
2323 regmatch_state *old_regmatch_state;
2324 regmatch_slab *old_regmatch_slab;
2325 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2327 /* on first ever match, allocate first slab */
2328 if (!PL_regmatch_slab) {
2329 Newx(PL_regmatch_slab, 1, regmatch_slab);
2330 PL_regmatch_slab->prev = NULL;
2331 PL_regmatch_slab->next = NULL;
2332 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2335 old_regmatch_state = PL_regmatch_state;
2336 old_regmatch_slab = PL_regmatch_slab;
2338 for (i=0; i <= max; i++) {
2340 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2342 reginfo->info_aux_eval =
2343 reginfo->info_aux->info_aux_eval =
2344 &(PL_regmatch_state->u.info_aux_eval);
2346 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2347 PL_regmatch_state = S_push_slab(aTHX);
2350 /* note initial PL_regmatch_state position; at end of match we'll
2351 * pop back to there and free any higher slabs */
2353 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2354 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2355 reginfo->info_aux->poscache = NULL;
2357 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2359 if ((prog->extflags & RXf_EVAL_SEEN))
2360 S_setup_eval_state(aTHX_ reginfo);
2362 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2365 /* If there is a "must appear" string, look for it. */
2368 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2370 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2371 reginfo->ganch = startpos + prog->gofs;
2372 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2373 "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2374 } else if (sv && (mg = mg_find_mglob(sv))
2375 && mg->mg_len >= 0) {
2376 reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2377 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2378 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2380 if (prog->extflags & RXf_ANCH_GPOS) {
2381 if (s > reginfo->ganch)
2383 s = reginfo->ganch - prog->gofs;
2384 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2385 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2391 reginfo->ganch = strbeg + PTR2UV(data);
2392 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2393 "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2395 } else { /* pos() not defined */
2396 reginfo->ganch = strbeg;
2397 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2398 "GPOS: reginfo->ganch = strbeg\n"));
2401 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2402 /* We have to be careful. If the previous successful match
2403 was from this regex we don't want a subsequent partially
2404 successful match to clobber the old results.
2405 So when we detect this possibility we add a swap buffer
2406 to the re, and switch the buffer each match. If we fail,
2407 we switch it back; otherwise we leave it swapped.
2410 /* do we need a save destructor here for eval dies? */
2411 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2412 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2413 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2419 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2420 re_scream_pos_data d;
2422 d.scream_olds = &scream_olds;
2423 d.scream_pos = &scream_pos;
2424 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2427 goto phooey; /* not present */
2433 /* Simplest case: anchored match need be tried only once. */
2434 /* [unless only anchor is BOL and multiline is set] */
2435 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2436 if (s == startpos && regtry(reginfo, &startpos))
2438 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2439 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2444 dontbother = minlen - 1;
2445 end = HOP3c(strend, -dontbother, strbeg) - 1;
2446 /* for multiline we only have to try after newlines */
2447 if (prog->check_substr || prog->check_utf8) {
2448 /* because of the goto we can not easily reuse the macros for bifurcating the
2449 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2452 goto after_try_utf8;
2454 if (regtry(reginfo, &s)) {
2461 if (prog->extflags & RXf_USE_INTUIT) {
2462 s = re_intuit_start(rx, sv, strbeg,
2463 s + UTF8SKIP(s), strend, flags, NULL);
2472 } /* end search for check string in unicode */
2474 if (s == startpos) {
2475 goto after_try_latin;
2478 if (regtry(reginfo, &s)) {
2485 if (prog->extflags & RXf_USE_INTUIT) {
2486 s = re_intuit_start(rx, sv, strbeg,
2487 s + 1, strend, flags, NULL);
2496 } /* end search for check string in latin*/
2497 } /* end search for check string */
2498 else { /* search for newline */
2500 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2503 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2504 while (s <= end) { /* note it could be possible to match at the end of the string */
2505 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2506 if (regtry(reginfo, &s))
2510 } /* end search for newline */
2511 } /* end anchored/multiline check string search */
2513 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2515 /* the warning about reginfo->ganch being used without initialization
2516 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2517 and we only enter this block when the same bit is set. */
2518 char *tmp_s = reginfo->ganch - prog->gofs;
2520 if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2525 /* Messy cases: unanchored match. */
2526 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2527 /* we have /x+whatever/ */
2528 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2534 if (! prog->anchored_utf8) {
2535 to_utf8_substr(prog);
2537 ch = SvPVX_const(prog->anchored_utf8)[0];
2540 DEBUG_EXECUTE_r( did_match = 1 );
2541 if (regtry(reginfo, &s)) goto got_it;
2543 while (s < strend && *s == ch)
2550 if (! prog->anchored_substr) {
2551 if (! to_byte_substr(prog)) {
2552 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2555 ch = SvPVX_const(prog->anchored_substr)[0];
2558 DEBUG_EXECUTE_r( did_match = 1 );
2559 if (regtry(reginfo, &s)) goto got_it;
2561 while (s < strend && *s == ch)
2566 DEBUG_EXECUTE_r(if (!did_match)
2567 PerlIO_printf(Perl_debug_log,
2568 "Did not find anchored character...\n")
2571 else if (prog->anchored_substr != NULL
2572 || prog->anchored_utf8 != NULL
2573 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2574 && prog->float_max_offset < strend - s)) {
2579 char *last1; /* Last position checked before */
2583 if (prog->anchored_substr || prog->anchored_utf8) {
2585 if (! prog->anchored_utf8) {
2586 to_utf8_substr(prog);
2588 must = prog->anchored_utf8;
2591 if (! prog->anchored_substr) {
2592 if (! to_byte_substr(prog)) {
2593 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2596 must = prog->anchored_substr;
2598 back_max = back_min = prog->anchored_offset;
2601 if (! prog->float_utf8) {
2602 to_utf8_substr(prog);
2604 must = prog->float_utf8;
2607 if (! prog->float_substr) {
2608 if (! to_byte_substr(prog)) {
2609 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2612 must = prog->float_substr;
2614 back_max = prog->float_max_offset;
2615 back_min = prog->float_min_offset;
2621 last = HOP3c(strend, /* Cannot start after this */
2622 -(I32)(CHR_SVLEN(must)
2623 - (SvTAIL(must) != 0) + back_min), strbeg);
2625 if (s > reginfo->strbeg)
2626 last1 = HOPc(s, -1);
2628 last1 = s - 1; /* bogus */
2630 /* XXXX check_substr already used to find "s", can optimize if
2631 check_substr==must. */
2633 dontbother = end_shift;
2634 strend = HOPc(strend, -dontbother);
2635 while ( (s <= last) &&
2636 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2637 (unsigned char*)strend, must,
2638 multiline ? FBMrf_MULTILINE : 0)) ) {
2639 DEBUG_EXECUTE_r( did_match = 1 );
2640 if (HOPc(s, -back_max) > last1) {
2641 last1 = HOPc(s, -back_min);
2642 s = HOPc(s, -back_max);
2645 char * const t = (last1 >= reginfo->strbeg)
2646 ? HOPc(last1, 1) : last1 + 1;
2648 last1 = HOPc(s, -back_min);
2652 while (s <= last1) {
2653 if (regtry(reginfo, &s))
2656 s++; /* to break out of outer loop */
2663 while (s <= last1) {
2664 if (regtry(reginfo, &s))
2670 DEBUG_EXECUTE_r(if (!did_match) {
2671 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2672 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2673 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2674 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2675 ? "anchored" : "floating"),
2676 quoted, RE_SV_TAIL(must));
2680 else if ( (c = progi->regstclass) ) {
2682 const OPCODE op = OP(progi->regstclass);
2683 /* don't bother with what can't match */
2684 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2685 strend = HOPc(strend, -(minlen - 1));
2688 SV * const prop = sv_newmortal();
2689 regprop(prog, prop, c);
2691 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2693 PerlIO_printf(Perl_debug_log,
2694 "Matching stclass %.*s against %s (%d bytes)\n",
2695 (int)SvCUR(prop), SvPVX_const(prop),
2696 quoted, (int)(strend - s));
2699 if (find_byclass(prog, c, s, strend, reginfo))
2701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2705 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2713 if (! prog->float_utf8) {
2714 to_utf8_substr(prog);
2716 float_real = prog->float_utf8;
2719 if (! prog->float_substr) {
2720 if (! to_byte_substr(prog)) {
2721 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2724 float_real = prog->float_substr;
2727 little = SvPV_const(float_real, len);
2728 if (SvTAIL(float_real)) {
2729 /* This means that float_real contains an artificial \n on
2730 * the end due to the presence of something like this:
2731 * /foo$/ where we can match both "foo" and "foo\n" at the
2732 * end of the string. So we have to compare the end of the
2733 * string first against the float_real without the \n and
2734 * then against the full float_real with the string. We
2735 * have to watch out for cases where the string might be
2736 * smaller than the float_real or the float_real without
2738 char *checkpos= strend - len;
2740 PerlIO_printf(Perl_debug_log,
2741 "%sChecking for float_real.%s\n",
2742 PL_colors[4], PL_colors[5]));
2743 if (checkpos + 1 < strbeg) {
2744 /* can't match, even if we remove the trailing \n
2745 * string is too short to match */
2747 PerlIO_printf(Perl_debug_log,
2748 "%sString shorter than required trailing substring, cannot match.%s\n",
2749 PL_colors[4], PL_colors[5]));
2751 } else if (memEQ(checkpos + 1, little, len - 1)) {
2752 /* can match, the end of the string matches without the
2754 last = checkpos + 1;
2755 } else if (checkpos < strbeg) {
2756 /* cant match, string is too short when the "\n" is
2759 PerlIO_printf(Perl_debug_log,
2760 "%sString does not contain required trailing substring, cannot match.%s\n",
2761 PL_colors[4], PL_colors[5]));
2763 } else if (!multiline) {
2764 /* non multiline match, so compare with the "\n" at the
2765 * end of the string */
2766 if (memEQ(checkpos, little, len)) {
2770 PerlIO_printf(Perl_debug_log,
2771 "%sString does not contain required trailing substring, cannot match.%s\n",
2772 PL_colors[4], PL_colors[5]));
2776 /* multiline match, so we have to search for a place
2777 * where the full string is located */
2783 last = rninstr(s, strend, little, little + len);
2785 last = strend; /* matching "$" */
2788 /* at one point this block contained a comment which was
2789 * probably incorrect, which said that this was a "should not
2790 * happen" case. Even if it was true when it was written I am
2791 * pretty sure it is not anymore, so I have removed the comment
2792 * and replaced it with this one. Yves */
2794 PerlIO_printf(Perl_debug_log,
2795 "String does not contain required substring, cannot match.\n"
2799 dontbother = strend - last + prog->float_min_offset;
2801 if (minlen && (dontbother < minlen))
2802 dontbother = minlen - 1;
2803 strend -= dontbother; /* this one's always in bytes! */
2804 /* We don't know much -- general case. */
2807 if (regtry(reginfo, &s))
2816 if (regtry(reginfo, &s))
2818 } while (s++ < strend);
2828 PerlIO_printf(Perl_debug_log,
2829 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2836 /* clean up; this will trigger destructors that will free all slabs
2837 * above the current one, and cleanup the regmatch_info_aux
2838 * and regmatch_info_aux_eval sructs */
2840 LEAVE_SCOPE(oldsave);
2842 if (RXp_PAREN_NAMES(prog))
2843 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2845 RX_MATCH_UTF8_set(rx, utf8_target);
2847 /* make sure $`, $&, $', and $digit will work later */
2848 if ( !(flags & REXEC_NOT_FIRST) )
2849 Perl_reg_set_capture_string(aTHX_ rx,
2850 strbeg, reginfo->strend,
2851 sv, flags, utf8_target);
2856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2857 PL_colors[4], PL_colors[5]));
2859 /* clean up; this will trigger destructors that will free all slabs
2860 * above the current one, and cleanup the regmatch_info_aux
2861 * and regmatch_info_aux_eval sructs */
2863 LEAVE_SCOPE(oldsave);
2866 /* we failed :-( roll it back */
2867 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2868 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2873 Safefree(prog->offs);
2880 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2881 * Do inc before dec, in case old and new rex are the same */
2882 #define SET_reg_curpm(Re2) \
2883 if (reginfo->info_aux_eval) { \
2884 (void)ReREFCNT_inc(Re2); \
2885 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2886 PM_SETRE((PL_reg_curpm), (Re2)); \
2891 - regtry - try match at specific point
2893 STATIC I32 /* 0 failure, 1 success */
2894 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2898 REGEXP *const rx = reginfo->prog;
2899 regexp *const prog = ReANY(rx);
2901 RXi_GET_DECL(prog,progi);
2902 GET_RE_DEBUG_FLAGS_DECL;
2904 PERL_ARGS_ASSERT_REGTRY;
2906 reginfo->cutpoint=NULL;
2908 prog->offs[0].start = *startposp - reginfo->strbeg;
2909 prog->lastparen = 0;
2910 prog->lastcloseparen = 0;
2912 /* XXXX What this code is doing here?!!! There should be no need
2913 to do this again and again, prog->lastparen should take care of
2916 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2917 * Actually, the code in regcppop() (which Ilya may be meaning by
2918 * prog->lastparen), is not needed at all by the test suite
2919 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2920 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2921 * Meanwhile, this code *is* needed for the
2922 * above-mentioned test suite tests to succeed. The common theme
2923 * on those tests seems to be returning null fields from matches.
2924 * --jhi updated by dapm */
2926 if (prog->nparens) {
2927 regexp_paren_pair *pp = prog->offs;
2929 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2937 result = regmatch(reginfo, *startposp, progi->program + 1);
2939 prog->offs[0].end = result;
2942 if (reginfo->cutpoint)
2943 *startposp= reginfo->cutpoint;
2944 REGCP_UNWIND(lastcp);
2949 #define sayYES goto yes
2950 #define sayNO goto no
2951 #define sayNO_SILENT goto no_silent
2953 /* we dont use STMT_START/END here because it leads to
2954 "unreachable code" warnings, which are bogus, but distracting. */
2955 #define CACHEsayNO \
2956 if (ST.cache_mask) \
2957 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2960 /* this is used to determine how far from the left messages like
2961 'failed...' are printed. It should be set such that messages
2962 are inline with the regop output that created them.
2964 #define REPORT_CODE_OFF 32
2967 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2968 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2969 #define CHRTEST_NOT_A_CP_1 -999
2970 #define CHRTEST_NOT_A_CP_2 -998
2972 /* grab a new slab and return the first slot in it */
2974 STATIC regmatch_state *
2977 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2980 regmatch_slab *s = PL_regmatch_slab->next;
2982 Newx(s, 1, regmatch_slab);
2983 s->prev = PL_regmatch_slab;
2985 PL_regmatch_slab->next = s;
2987 PL_regmatch_slab = s;
2988 return SLAB_FIRST(s);
2992 /* push a new state then goto it */
2994 #define PUSH_STATE_GOTO(state, node, input) \
2995 pushinput = input; \
2997 st->resume_state = state; \
3000 /* push a new state with success backtracking, then goto it */
3002 #define PUSH_YES_STATE_GOTO(state, node, input) \
3003 pushinput = input; \
3005 st->resume_state = state; \
3006 goto push_yes_state;
3013 regmatch() - main matching routine
3015 This is basically one big switch statement in a loop. We execute an op,
3016 set 'next' to point the next op, and continue. If we come to a point which
3017 we may need to backtrack to on failure such as (A|B|C), we push a
3018 backtrack state onto the backtrack stack. On failure, we pop the top
3019 state, and re-enter the loop at the state indicated. If there are no more
3020 states to pop, we return failure.
3022 Sometimes we also need to backtrack on success; for example /A+/, where
3023 after successfully matching one A, we need to go back and try to
3024 match another one; similarly for lookahead assertions: if the assertion
3025 completes successfully, we backtrack to the state just before the assertion
3026 and then carry on. In these cases, the pushed state is marked as
3027 'backtrack on success too'. This marking is in fact done by a chain of
3028 pointers, each pointing to the previous 'yes' state. On success, we pop to
3029 the nearest yes state, discarding any intermediate failure-only states.
3030 Sometimes a yes state is pushed just to force some cleanup code to be
3031 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3032 it to free the inner regex.
3034 Note that failure backtracking rewinds the cursor position, while
3035 success backtracking leaves it alone.
3037 A pattern is complete when the END op is executed, while a subpattern
3038 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3039 ops trigger the "pop to last yes state if any, otherwise return true"
3042 A common convention in this function is to use A and B to refer to the two
3043 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3044 the subpattern to be matched possibly multiple times, while B is the entire
3045 rest of the pattern. Variable and state names reflect this convention.
3047 The states in the main switch are the union of ops and failure/success of
3048 substates associated with with that op. For example, IFMATCH is the op
3049 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3050 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3051 successfully matched A and IFMATCH_A_fail is a state saying that we have
3052 just failed to match A. Resume states always come in pairs. The backtrack
3053 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3054 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3055 on success or failure.
3057 The struct that holds a backtracking state is actually a big union, with
3058 one variant for each major type of op. The variable st points to the
3059 top-most backtrack struct. To make the code clearer, within each
3060 block of code we #define ST to alias the relevant union.
3062 Here's a concrete example of a (vastly oversimplified) IFMATCH
3068 #define ST st->u.ifmatch
3070 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3071 ST.foo = ...; // some state we wish to save
3073 // push a yes backtrack state with a resume value of
3074 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3076 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3079 case IFMATCH_A: // we have successfully executed A; now continue with B
3081 bar = ST.foo; // do something with the preserved value
3084 case IFMATCH_A_fail: // A failed, so the assertion failed
3085 ...; // do some housekeeping, then ...
3086 sayNO; // propagate the failure
3093 For any old-timers reading this who are familiar with the old recursive
3094 approach, the code above is equivalent to:
3096 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3105 ...; // do some housekeeping, then ...
3106 sayNO; // propagate the failure
3109 The topmost backtrack state, pointed to by st, is usually free. If you
3110 want to claim it, populate any ST.foo fields in it with values you wish to
3111 save, then do one of
3113 PUSH_STATE_GOTO(resume_state, node, newinput);
3114 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3116 which sets that backtrack state's resume value to 'resume_state', pushes a
3117 new free entry to the top of the backtrack stack, then goes to 'node'.
3118 On backtracking, the free slot is popped, and the saved state becomes the
3119 new free state. An ST.foo field in this new top state can be temporarily
3120 accessed to retrieve values, but once the main loop is re-entered, it
3121 becomes available for reuse.
3123 Note that the depth of the backtrack stack constantly increases during the
3124 left-to-right execution of the pattern, rather than going up and down with
3125 the pattern nesting. For example the stack is at its maximum at Z at the
3126 end of the pattern, rather than at X in the following:
3128 /(((X)+)+)+....(Y)+....Z/
3130 The only exceptions to this are lookahead/behind assertions and the cut,
3131 (?>A), which pop all the backtrack states associated with A before
3134 Backtrack state structs are allocated in slabs of about 4K in size.
3135 PL_regmatch_state and st always point to the currently active state,
3136 and PL_regmatch_slab points to the slab currently containing
3137 PL_regmatch_state. The first time regmatch() is called, the first slab is
3138 allocated, and is never freed until interpreter destruction. When the slab
3139 is full, a new one is allocated and chained to the end. At exit from
3140 regmatch(), slabs allocated since entry are freed.
3145 #define DEBUG_STATE_pp(pp) \
3147 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3148 PerlIO_printf(Perl_debug_log, \
3149 " %*s"pp" %s%s%s%s%s\n", \
3151 PL_reg_name[st->resume_state], \
3152 ((st==yes_state||st==mark_state) ? "[" : ""), \
3153 ((st==yes_state) ? "Y" : ""), \
3154 ((st==mark_state) ? "M" : ""), \
3155 ((st==yes_state||st==mark_state) ? "]" : "") \
3160 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3165 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3166 const char *start, const char *end, const char *blurb)
3168 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3170 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3175 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3176 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3178 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3179 start, end - start, 60);
3181 PerlIO_printf(Perl_debug_log,
3182 "%s%s REx%s %s against %s\n",
3183 PL_colors[4], blurb, PL_colors[5], s0, s1);
3185 if (utf8_target||utf8_pat)
3186 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3187 utf8_pat ? "pattern" : "",
3188 utf8_pat && utf8_target ? " and " : "",
3189 utf8_target ? "string" : ""
3195 S_dump_exec_pos(pTHX_ const char *locinput,
3196 const regnode *scan,
3197 const char *loc_regeol,
3198 const char *loc_bostr,
3199 const char *loc_reg_starttry,
3200 const bool utf8_target)
3202 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3203 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3204 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3205 /* The part of the string before starttry has one color
3206 (pref0_len chars), between starttry and current
3207 position another one (pref_len - pref0_len chars),
3208 after the current position the third one.
3209 We assume that pref0_len <= pref_len, otherwise we
3210 decrease pref0_len. */
3211 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3212 ? (5 + taill) - l : locinput - loc_bostr;
3215 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3217 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3219 pref0_len = pref_len - (locinput - loc_reg_starttry);
3220 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3221 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3222 ? (5 + taill) - pref_len : loc_regeol - locinput);
3223 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3227 if (pref0_len > pref_len)
3228 pref0_len = pref_len;
3230 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3232 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3233 (locinput - pref_len),pref0_len, 60, 4, 5);
3235 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3236 (locinput - pref_len + pref0_len),
3237 pref_len - pref0_len, 60, 2, 3);
3239 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3240 locinput, loc_regeol - locinput, 10, 0, 1);
3242 const STRLEN tlen=len0+len1+len2;
3243 PerlIO_printf(Perl_debug_log,
3244 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3245 (IV)(locinput - loc_bostr),
3248 (docolor ? "" : "> <"),
3250 (int)(tlen > 19 ? 0 : 19 - tlen),
3257 /* reg_check_named_buff_matched()
3258 * Checks to see if a named buffer has matched. The data array of
3259 * buffer numbers corresponding to the buffer is expected to reside
3260 * in the regexp->data->data array in the slot stored in the ARG() of
3261 * node involved. Note that this routine doesn't actually care about the
3262 * name, that information is not preserved from compilation to execution.
3263 * Returns the index of the leftmost defined buffer with the given name
3264 * or 0 if non of the buffers matched.
3267 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3270 RXi_GET_DECL(rex,rexi);
3271 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3272 I32 *nums=(I32*)SvPVX(sv_dat);
3274 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3276 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3277 if ((I32)rex->lastparen >= nums[n] &&
3278 rex->offs[nums[n]].end != -1)
3288 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3289 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3291 /* This function determines if there are one or two characters that match
3292 * the first character of the passed-in EXACTish node <text_node>, and if
3293 * so, returns them in the passed-in pointers.
3295 * If it determines that no possible character in the target string can
3296 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3297 * the first character in <text_node> requires UTF-8 to represent, and the
3298 * target string isn't in UTF-8.)
3300 * If there are more than two characters that could match the beginning of
3301 * <text_node>, or if more context is required to determine a match or not,
3302 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3304 * The motiviation behind this function is to allow the caller to set up
3305 * tight loops for matching. If <text_node> is of type EXACT, there is
3306 * only one possible character that can match its first character, and so
3307 * the situation is quite simple. But things get much more complicated if
3308 * folding is involved. It may be that the first character of an EXACTFish
3309 * node doesn't participate in any possible fold, e.g., punctuation, so it
3310 * can be matched only by itself. The vast majority of characters that are
3311 * in folds match just two things, their lower and upper-case equivalents.
3312 * But not all are like that; some have multiple possible matches, or match
3313 * sequences of more than one character. This function sorts all that out.
3315 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3316 * loop of trying to match A*, we know we can't exit where the thing
3317 * following it isn't a B. And something can't be a B unless it is the
3318 * beginning of B. By putting a quick test for that beginning in a tight
3319 * loop, we can rule out things that can't possibly be B without having to
3320 * break out of the loop, thus avoiding work. Similarly, if A is a single
3321 * character, we can make a tight loop matching A*, using the outputs of
3324 * If the target string to match isn't in UTF-8, and there aren't
3325 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3326 * the one or two possible octets (which are characters in this situation)
3327 * that can match. In all cases, if there is only one character that can
3328 * match, *<c1p> and *<c2p> will be identical.
3330 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3331 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3332 * can match the beginning of <text_node>. They should be declared with at
3333 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3334 * undefined what these contain.) If one or both of the buffers are
3335 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3336 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3337 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3338 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3339 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3341 const bool utf8_target = reginfo->is_utf8_target;
3343 UV c1 = CHRTEST_NOT_A_CP_1;
3344 UV c2 = CHRTEST_NOT_A_CP_2;
3345 bool use_chrtest_void = FALSE;
3346 const bool is_utf8_pat = reginfo->is_utf8_pat;
3348 /* Used when we have both utf8 input and utf8 output, to avoid converting
3349 * to/from code points */
3350 bool utf8_has_been_setup = FALSE;
3354 U8 *pat = (U8*)STRING(text_node);
3356 if (OP(text_node) == EXACT) {
3358 /* In an exact node, only one thing can be matched, that first
3359 * character. If both the pat and the target are UTF-8, we can just
3360 * copy the input to the output, avoiding finding the code point of
3365 else if (utf8_target) {
3366 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3367 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3368 utf8_has_been_setup = TRUE;
3371 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3374 else /* an EXACTFish node */
3376 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3377 pat + STR_LEN(text_node)))
3379 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3380 pat + STR_LEN(text_node))))
3382 /* Multi-character folds require more context to sort out. Also
3383 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3384 * handled outside this routine */
3385 use_chrtest_void = TRUE;
3387 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3388 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3390 /* Load the folds hash, if not already done */
3392 if (! PL_utf8_foldclosures) {
3393 if (! PL_utf8_tofold) {
3394 U8 dummy[UTF8_MAXBYTES+1];
3396 /* Force loading this by folding an above-Latin1 char */
3397 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3398 assert(PL_utf8_tofold); /* Verify that worked */
3400 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3403 /* The fold closures data structure is a hash with the keys being
3404 * the UTF-8 of every character that is folded to, like 'k', and
3405 * the values each an array of all code points that fold to its
3406 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3408 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3413 /* Not found in the hash, therefore there are no folds
3414 * containing it, so there is only a single character that
3418 else { /* Does participate in folds */
3419 AV* list = (AV*) *listp;
3420 if (av_len(list) != 1) {
3422 /* If there aren't exactly two folds to this, it is outside
3423 * the scope of this function */
3424 use_chrtest_void = TRUE;
3426 else { /* There are two. Get them */
3427 SV** c_p = av_fetch(list, 0, FALSE);
3429 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3433 c_p = av_fetch(list, 1, FALSE);
3435 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3439 /* Folds that cross the 255/256 boundary are forbidden if
3440 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3441 * pattern character is above 256, and its only other match
3442 * is below 256, the only legal match will be to itself.
3443 * We have thrown away the original, so have to compute
3444 * which is the one above 255 */
3445 if ((c1 < 256) != (c2 < 256)) {
3446 if (OP(text_node) == EXACTFL
3447 || (OP(text_node) == EXACTFA
3448 && (isASCII(c1) || isASCII(c2))))
3461 else /* Here, c1 is < 255 */
3463 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3464 && OP(text_node) != EXACTFL
3465 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3467 /* Here, there could be something above Latin1 in the target which
3468 * folds to this character in the pattern. All such cases except
3469 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3470 * involved in their folds, so are outside the scope of this
3472 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3473 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3476 use_chrtest_void = TRUE;
3479 else { /* Here nothing above Latin1 can fold to the pattern character */
3480 switch (OP(text_node)) {
3482 case EXACTFL: /* /l rules */
3483 c2 = PL_fold_locale[c1];
3487 if (! utf8_target) { /* /d rules */
3492 /* /u rules for all these. This happens to work for
3493 * EXACTFA as nothing in Latin1 folds to ASCII */
3495 case EXACTFU_TRICKYFOLD:
3498 c2 = PL_fold_latin1[c1];
3502 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3503 assert(0); /* NOTREACHED */
3508 /* Here have figured things out. Set up the returns */
3509 if (use_chrtest_void) {
3510 *c2p = *c1p = CHRTEST_VOID;
3512 else if (utf8_target) {
3513 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3514 uvchr_to_utf8(c1_utf8, c1);
3515 uvchr_to_utf8(c2_utf8, c2);
3518 /* Invariants are stored in both the utf8 and byte outputs; Use
3519 * negative numbers otherwise for the byte ones. Make sure that the
3520 * byte ones are the same iff the utf8 ones are the same */
3521 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3522 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3525 ? CHRTEST_NOT_A_CP_1
3526 : CHRTEST_NOT_A_CP_2;
3528 else if (c1 > 255) {
3529 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3534 *c1p = *c2p = c2; /* c2 is the only representable value */
3536 else { /* c1 is representable; see about c2 */
3538 *c2p = (c2 < 256) ? c2 : c1;
3544 /* returns -1 on failure, $+[0] on success */
3546 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3548 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3552 const bool utf8_target = reginfo->is_utf8_target;
3553 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3554 REGEXP *rex_sv = reginfo->prog;
3555 regexp *rex = ReANY(rex_sv);
3556 RXi_GET_DECL(rex,rexi);
3557 /* the current state. This is a cached copy of PL_regmatch_state */
3559 /* cache heavy used fields of st in registers */
3562 U32 n = 0; /* general value; init to avoid compiler warning */
3563 I32 ln = 0; /* len or last; init to avoid compiler warning */
3564 char *locinput = startpos;
3565 char *pushinput; /* where to continue after a PUSH */
3566 I32 nextchr; /* is always set to UCHARAT(locinput) */
3568 bool result = 0; /* return value of S_regmatch */
3569 int depth = 0; /* depth of backtrack stack */
3570 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3571 const U32 max_nochange_depth =
3572 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3573 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3574 regmatch_state *yes_state = NULL; /* state to pop to on success of
3576 /* mark_state piggy backs on the yes_state logic so that when we unwind
3577 the stack on success we can update the mark_state as we go */
3578 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3579 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3580 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3582 bool no_final = 0; /* prevent failure from backtracking? */
3583 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3584 char *startpoint = locinput;
3585 SV *popmark = NULL; /* are we looking for a mark? */
3586 SV *sv_commit = NULL; /* last mark name seen in failure */
3587 SV *sv_yes_mark = NULL; /* last mark name we have seen
3588 during a successful match */
3589 U32 lastopen = 0; /* last open we saw */
3590 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3591 SV* const oreplsv = GvSV(PL_replgv);
3592 /* these three flags are set by various ops to signal information to
3593 * the very next op. They have a useful lifetime of exactly one loop
3594 * iteration, and are not preserved or restored by state pushes/pops
3596 bool sw = 0; /* the condition value in (?(cond)a|b) */
3597 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3598 int logical = 0; /* the following EVAL is:
3602 or the following IFMATCH/UNLESSM is:
3603 false: plain (?=foo)
3604 true: used as a condition: (?(?=foo))
3606 PAD* last_pad = NULL;
3608 I32 gimme = G_SCALAR;
3609 CV *caller_cv = NULL; /* who called us */
3610 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3611 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3612 U32 maxopenparen = 0; /* max '(' index seen so far */
3613 int to_complement; /* Invert the result? */
3614 _char_class_number classnum;
3615 bool is_utf8_pat = reginfo->is_utf8_pat;
3618 GET_RE_DEBUG_FLAGS_DECL;
3621 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3622 multicall_oldcatch = 0;
3623 multicall_cv = NULL;
3625 PERL_UNUSED_VAR(multicall_cop);
3626 PERL_UNUSED_VAR(newsp);
3629 PERL_ARGS_ASSERT_REGMATCH;
3631 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3632 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3635 st = PL_regmatch_state;
3637 /* Note that nextchr is a byte even in UTF */
3640 while (scan != NULL) {
3643 SV * const prop = sv_newmortal();
3644 regnode *rnext=regnext(scan);
3645 DUMP_EXEC_POS( locinput, scan, utf8_target );
3646 regprop(rex, prop, scan);
3648 PerlIO_printf(Perl_debug_log,
3649 "%3"IVdf":%*s%s(%"IVdf")\n",
3650 (IV)(scan - rexi->program), depth*2, "",
3652 (PL_regkind[OP(scan)] == END || !rnext) ?
3653 0 : (IV)(rnext - rexi->program));
3656 next = scan + NEXT_OFF(scan);
3659 state_num = OP(scan);
3665 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3667 switch (state_num) {
3668 case BOL: /* /^../ */
3669 if (locinput == reginfo->strbeg)
3673 case MBOL: /* /^../m */
3674 if (locinput == reginfo->strbeg ||
3675 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3681 case SBOL: /* /^../s */
3682 if (locinput == reginfo->strbeg)
3687 if (locinput == reginfo->ganch)
3691 case KEEPS: /* \K */
3692 /* update the startpoint */
3693 st->u.keeper.val = rex->offs[0].start;
3694 rex->offs[0].start = locinput - reginfo->strbeg;
3695 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3696 assert(0); /*NOTREACHED*/
3697 case KEEPS_next_fail:
3698 /* rollback the start point change */
3699 rex->offs[0].start = st->u.keeper.val;
3701 assert(0); /*NOTREACHED*/
3703 case EOL: /* /..$/ */
3706 case MEOL: /* /..$/m */
3707 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3711 case SEOL: /* /..$/s */
3713 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3715 if (reginfo->strend - locinput > 1)
3720 if (!NEXTCHR_IS_EOS)
3724 case SANY: /* /./s */
3727 goto increment_locinput;
3735 case REG_ANY: /* /./ */
3736 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3738 goto increment_locinput;
3742 #define ST st->u.trie
3743 case TRIEC: /* (ab|cd) with known charclass */
3744 /* In this case the charclass data is available inline so
3745 we can fail fast without a lot of extra overhead.
3747 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3749 PerlIO_printf(Perl_debug_log,
3750 "%*s %sfailed to match trie start class...%s\n",
3751 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3754 assert(0); /* NOTREACHED */
3757 case TRIE: /* (ab|cd) */
3758 /* the basic plan of execution of the trie is:
3759 * At the beginning, run though all the states, and
3760 * find the longest-matching word. Also remember the position
3761 * of the shortest matching word. For example, this pattern:
3764 * when matched against the string "abcde", will generate
3765 * accept states for all words except 3, with the longest
3766 * matching word being 4, and the shortest being 2 (with
3767 * the position being after char 1 of the string).
3769 * Then for each matching word, in word order (i.e. 1,2,4,5),
3770 * we run the remainder of the pattern; on each try setting
3771 * the current position to the character following the word,
3772 * returning to try the next word on failure.
3774 * We avoid having to build a list of words at runtime by
3775 * using a compile-time structure, wordinfo[].prev, which
3776 * gives, for each word, the previous accepting word (if any).
3777 * In the case above it would contain the mappings 1->2, 2->0,
3778 * 3->0, 4->5, 5->1. We can use this table to generate, from
3779 * the longest word (4 above), a list of all words, by
3780 * following the list of prev pointers; this gives us the
3781 * unordered list 4,5,1,2. Then given the current word we have
3782 * just tried, we can go through the list and find the
3783 * next-biggest word to try (so if we just failed on word 2,
3784 * the next in the list is 4).
3786 * Since at runtime we don't record the matching position in
3787 * the string for each word, we have to work that out for
3788 * each word we're about to process. The wordinfo table holds
3789 * the character length of each word; given that we recorded
3790 * at the start: the position of the shortest word and its
3791 * length in chars, we just need to move the pointer the
3792 * difference between the two char lengths. Depending on
3793 * Unicode status and folding, that's cheap or expensive.
3795 * This algorithm is optimised for the case where are only a
3796 * small number of accept states, i.e. 0,1, or maybe 2.
3797 * With lots of accepts states, and having to try all of them,
3798 * it becomes quadratic on number of accept states to find all
3803 /* what type of TRIE am I? (utf8 makes this contextual) */
3804 DECL_TRIE_TYPE(scan);
3806 /* what trie are we using right now */
3807 reg_trie_data * const trie
3808 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3809 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3810 U32 state = trie->startstate;
3813 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3815 if (trie->states[ state ].wordnum) {
3817 PerlIO_printf(Perl_debug_log,
3818 "%*s %smatched empty string...%s\n",
3819 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3825 PerlIO_printf(Perl_debug_log,
3826 "%*s %sfailed to match trie start class...%s\n",
3827 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3834 U8 *uc = ( U8* )locinput;
3838 U8 *uscan = (U8*)NULL;
3839 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3840 U32 charcount = 0; /* how many input chars we have matched */
3841 U32 accepted = 0; /* have we seen any accepting states? */
3843 ST.jump = trie->jump;
3846 ST.longfold = FALSE; /* char longer if folded => it's harder */
3849 /* fully traverse the TRIE; note the position of the
3850 shortest accept state and the wordnum of the longest
3853 while ( state && uc <= (U8*)(reginfo->strend) ) {
3854 U32 base = trie->states[ state ].trans.base;
3858 wordnum = trie->states[ state ].wordnum;
3860 if (wordnum) { /* it's an accept state */
3863 /* record first match position */
3865 ST.firstpos = (U8*)locinput;
3870 ST.firstchars = charcount;
3873 if (!ST.nextword || wordnum < ST.nextword)
3874 ST.nextword = wordnum;
3875 ST.topword = wordnum;
3878 DEBUG_TRIE_EXECUTE_r({
3879 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3880 PerlIO_printf( Perl_debug_log,
3881 "%*s %sState: %4"UVxf" Accepted: %c ",
3882 2+depth * 2, "", PL_colors[4],
3883 (UV)state, (accepted ? 'Y' : 'N'));
3886 /* read a char and goto next state */
3887 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3889 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3890 uscan, len, uvc, charid, foldlen,
3897 base + charid - 1 - trie->uniquecharcount)) >= 0)
3899 && ((U32)offset < trie->lasttrans)
3900 && trie->trans[offset].check == state)
3902 state = trie->trans[offset].next;
3913 DEBUG_TRIE_EXECUTE_r(
3914 PerlIO_printf( Perl_debug_log,
3915 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3916 charid, uvc, (UV)state, PL_colors[5] );
3922 /* calculate total number of accept states */
3927 w = trie->wordinfo[w].prev;
3930 ST.accepted = accepted;
3934 PerlIO_printf( Perl_debug_log,
3935 "%*s %sgot %"IVdf" possible matches%s\n",
3936 REPORT_CODE_OFF + depth * 2, "",
3937 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3939 goto trie_first_try; /* jump into the fail handler */
3941 assert(0); /* NOTREACHED */
3943 case TRIE_next_fail: /* we failed - try next alternative */
3947 REGCP_UNWIND(ST.cp);
3948 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3950 if (!--ST.accepted) {
3952 PerlIO_printf( Perl_debug_log,
3953 "%*s %sTRIE failed...%s\n",
3954 REPORT_CODE_OFF+depth*2, "",
3961 /* Find next-highest word to process. Note that this code
3962 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3965 U16 const nextword = ST.nextword;
3966 reg_trie_wordinfo * const wordinfo
3967 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3968 for (word=ST.topword; word; word=wordinfo[word].prev) {
3969 if (word > nextword && (!min || word < min))
3982 ST.lastparen = rex->lastparen;
3983 ST.lastcloseparen = rex->lastcloseparen;
3987 /* find start char of end of current word */
3989 U32 chars; /* how many chars to skip */
3990 reg_trie_data * const trie
3991 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3993 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3995 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4000 /* the hard option - fold each char in turn and find
4001 * its folded length (which may be different */
4002 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4010 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4018 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4023 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4039 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4040 ? ST.jump[ST.nextword]
4044 PerlIO_printf( Perl_debug_log,
4045 "%*s %sTRIE matched word #%d, continuing%s\n",
4046 REPORT_CODE_OFF+depth*2, "",
4053 if (ST.accepted > 1 || has_cutgroup) {
4054 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4055 assert(0); /* NOTREACHED */
4057 /* only one choice left - just continue */
4059 AV *const trie_words
4060 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4061 SV ** const tmp = av_fetch( trie_words,
4063 SV *sv= tmp ? sv_newmortal() : NULL;
4065 PerlIO_printf( Perl_debug_log,
4066 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4067 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4069 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4070 PL_colors[0], PL_colors[1],
4071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4073 : "not compiled under -Dr",
4077 locinput = (char*)uc;
4078 continue; /* execute rest of RE */
4079 assert(0); /* NOTREACHED */
4083 case EXACT: { /* /abc/ */
4084 char *s = STRING(scan);
4086 if (utf8_target != is_utf8_pat) {
4087 /* The target and the pattern have differing utf8ness. */
4089 const char * const e = s + ln;
4092 /* The target is utf8, the pattern is not utf8.
4093 * Above-Latin1 code points can't match the pattern;
4094 * invariants match exactly, and the other Latin1 ones need
4095 * to be downgraded to a single byte in order to do the
4096 * comparison. (If we could be confident that the target
4097 * is not malformed, this could be refactored to have fewer
4098 * tests by just assuming that if the first bytes match, it
4099 * is an invariant, but there are tests in the test suite
4100 * dealing with (??{...}) which violate this) */
4102 if (l >= reginfo->strend
4103 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4107 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4114 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4123 /* The target is not utf8, the pattern is utf8. */
4125 if (l >= reginfo->strend
4126 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4130 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4137 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4148 /* The target and the pattern have the same utf8ness. */
4149 /* Inline the first character, for speed. */
4150 if (reginfo->strend - locinput < ln
4151 || UCHARAT(s) != nextchr
4152 || (ln > 1 && memNE(s, locinput, ln)))
4161 case EXACTFL: { /* /abc/il */
4163 const U8 * fold_array;
4165 U32 fold_utf8_flags;
4167 RX_MATCH_TAINTED_on(reginfo->prog);
4168 folder = foldEQ_locale;
4169 fold_array = PL_fold_locale;
4170 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4173 case EXACTFU_SS: /* /\x{df}/iu */
4174 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4175 case EXACTFU: /* /abc/iu */
4176 folder = foldEQ_latin1;
4177 fold_array = PL_fold_latin1;
4178 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4181 case EXACTFA: /* /abc/iaa */
4182 folder = foldEQ_latin1;
4183 fold_array = PL_fold_latin1;
4184 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4187 case EXACTF: /* /abc/i */
4189 fold_array = PL_fold;
4190 fold_utf8_flags = 0;
4196 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4197 /* Either target or the pattern are utf8, or has the issue where
4198 * the fold lengths may differ. */
4199 const char * const l = locinput;
4200 char *e = reginfo->strend;
4202 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4203 l, &e, 0, utf8_target, fold_utf8_flags))
4211 /* Neither the target nor the pattern are utf8 */
4212 if (UCHARAT(s) != nextchr
4214 && UCHARAT(s) != fold_array[nextchr])
4218 if (reginfo->strend - locinput < ln)
4220 if (ln > 1 && ! folder(s, locinput, ln))
4226 /* XXX Could improve efficiency by separating these all out using a
4227 * macro or in-line function. At that point regcomp.c would no longer
4228 * have to set the FLAGS fields of these */
4229 case BOUNDL: /* /\b/l */
4230 case NBOUNDL: /* /\B/l */
4231 RX_MATCH_TAINTED_on(reginfo->prog);
4233 case BOUND: /* /\b/ */
4234 case BOUNDU: /* /\b/u */
4235 case BOUNDA: /* /\b/a */
4236 case NBOUND: /* /\B/ */
4237 case NBOUNDU: /* /\B/u */
4238 case NBOUNDA: /* /\B/a */
4239 /* was last char in word? */
4241 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4242 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4244 if (locinput == reginfo->strbeg)
4247 const U8 * const r =
4248 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4250 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4252 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4253 ln = isWORDCHAR_uni(ln);
4257 LOAD_UTF8_CHARCLASS_ALNUM();
4258 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4263 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4264 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4269 /* Here the string isn't utf8, or is utf8 and only ascii
4270 * characters are to match \w. In the latter case looking at
4271 * the byte just prior to the current one may be just the final
4272 * byte of a multi-byte character. This is ok. There are two
4274 * 1) it is a single byte character, and then the test is doing
4275 * just what it's supposed to.
4276 * 2) it is a multi-byte character, in which case the final
4277 * byte is never mistakable for ASCII, and so the test
4278 * will say it is not a word character, which is the
4279 * correct answer. */
4280 ln = (locinput != reginfo->strbeg) ?
4281 UCHARAT(locinput - 1) : '\n';
4282 switch (FLAGS(scan)) {
4283 case REGEX_UNICODE_CHARSET:
4284 ln = isWORDCHAR_L1(ln);
4285 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4287 case REGEX_LOCALE_CHARSET:
4288 ln = isWORDCHAR_LC(ln);
4289 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4291 case REGEX_DEPENDS_CHARSET:
4292 ln = isWORDCHAR(ln);
4293 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4295 case REGEX_ASCII_RESTRICTED_CHARSET:
4296 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4297 ln = isWORDCHAR_A(ln);
4298 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4301 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4305 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4307 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4311 case ANYOF: /* /[abc]/ */
4312 case ANYOF_WARN_SUPER:
4316 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4318 locinput += UTF8SKIP(locinput);
4321 if (!REGINCLASS(rex, scan, (U8*)locinput))
4327 /* The argument (FLAGS) to all the POSIX node types is the class number
4330 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4334 case POSIXL: /* \w or [:punct:] etc. under /l */
4338 /* The locale hasn't influenced the outcome before this, so defer
4339 * tainting until now */
4340 RX_MATCH_TAINTED_on(reginfo->prog);
4342 /* Use isFOO_lc() for characters within Latin1. (Note that
4343 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4344 * wouldn't be invariant) */
4345 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4346 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4350 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4351 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4352 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4353 *(locinput + 1))))))
4358 else { /* Here, must be an above Latin-1 code point */
4359 goto utf8_posix_not_eos;
4362 /* Here, must be utf8 */
4363 locinput += UTF8SKIP(locinput);
4366 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4370 case POSIXD: /* \w or [:punct:] etc. under /d */
4376 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4378 if (NEXTCHR_IS_EOS) {
4382 /* All UTF-8 variants match */
4383 if (! UTF8_IS_INVARIANT(nextchr)) {
4384 goto increment_locinput;
4390 case POSIXA: /* \w or [:punct:] etc. under /a */
4393 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4394 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4395 * character is a single byte */
4398 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4404 /* Here we are either not in utf8, or we matched a utf8-invariant,
4405 * so the next char is the next byte */
4409 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4413 case POSIXU: /* \w or [:punct:] etc. under /u */
4415 if (NEXTCHR_IS_EOS) {
4420 /* Use _generic_isCC() for characters within Latin1. (Note that
4421 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4422 * wouldn't be invariant) */
4423 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4424 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4431 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4432 if (! (to_complement
4433 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4441 else { /* Handle above Latin-1 code points */
4442 classnum = (_char_class_number) FLAGS(scan);
4443 if (classnum < _FIRST_NON_SWASH_CC) {
4445 /* Here, uses a swash to find such code points. Load if if
4446 * not done already */
4447 if (! PL_utf8_swash_ptrs[classnum]) {
4448 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4449 PL_utf8_swash_ptrs[classnum]
4450 = _core_swash_init("utf8",
4451 swash_property_names[classnum],
4452 &PL_sv_undef, 1, 0, NULL, &flags);
4454 if (! (to_complement
4455 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4456 (U8 *) locinput, TRUE))))
4461 else { /* Here, uses macros to find above Latin-1 code points */
4463 case _CC_ENUM_SPACE: /* XXX would require separate
4464 code if we revert the change
4465 of \v matching this */
4466 case _CC_ENUM_PSXSPC:
4467 if (! (to_complement
4468 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4473 case _CC_ENUM_BLANK:
4474 if (! (to_complement
4475 ^ cBOOL(is_HORIZWS_high(locinput))))
4480 case _CC_ENUM_XDIGIT:
4481 if (! (to_complement
4482 ^ cBOOL(is_XDIGIT_high(locinput))))
4487 case _CC_ENUM_VERTSPACE:
4488 if (! (to_complement
4489 ^ cBOOL(is_VERTWS_high(locinput))))
4494 default: /* The rest, e.g. [:cntrl:], can't match
4496 if (! to_complement) {
4502 locinput += UTF8SKIP(locinput);
4506 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4507 a Unicode extended Grapheme Cluster */
4508 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4509 extended Grapheme Cluster is:
4512 | Prepend* Begin Extend*
4515 Begin is: ( Special_Begin | ! Control )
4516 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4517 Extend is: ( Grapheme_Extend | Spacing_Mark )
4518 Control is: [ GCB_Control | CR | LF ]
4519 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4521 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4524 Begin is ( Regular_Begin + Special Begin )
4526 It turns out that 98.4% of all Unicode code points match
4527 Regular_Begin. Doing it this way eliminates a table match in
4528 the previous implementation for almost all Unicode code points.
4530 There is a subtlety with Prepend* which showed up in testing.
4531 Note that the Begin, and only the Begin is required in:
4532 | Prepend* Begin Extend*
4533 Also, Begin contains '! Control'. A Prepend must be a
4534 '! Control', which means it must also be a Begin. What it
4535 comes down to is that if we match Prepend* and then find no
4536 suitable Begin afterwards, that if we backtrack the last
4537 Prepend, that one will be a suitable Begin.
4542 if (! utf8_target) {
4544 /* Match either CR LF or '.', as all the other possibilities
4546 locinput++; /* Match the . or CR */
4547 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4549 && locinput < reginfo->strend
4550 && UCHARAT(locinput) == '\n')
4557 /* Utf8: See if is ( CR LF ); already know that locinput <
4558 * reginfo->strend, so locinput+1 is in bounds */
4559 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4560 && UCHARAT(locinput + 1) == '\n')
4567 /* In case have to backtrack to beginning, then match '.' */
4568 char *starting = locinput;
4570 /* In case have to backtrack the last prepend */
4571 char *previous_prepend = NULL;
4573 LOAD_UTF8_CHARCLASS_GCB();
4575 /* Match (prepend)* */
4576 while (locinput < reginfo->strend
4577 && (len = is_GCB_Prepend_utf8(locinput)))
4579 previous_prepend = locinput;
4583 /* As noted above, if we matched a prepend character, but
4584 * the next thing won't match, back off the last prepend we
4585 * matched, as it is guaranteed to match the begin */
4586 if (previous_prepend
4587 && (locinput >= reginfo->strend
4588 || (! swash_fetch(PL_utf8_X_regular_begin,
4589 (U8*)locinput, utf8_target)
4590 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4593 locinput = previous_prepend;
4596 /* Note that here we know reginfo->strend > locinput, as we
4597 * tested that upon input to this switch case, and if we
4598 * moved locinput forward, we tested the result just above
4599 * and it either passed, or we backed off so that it will
4601 if (swash_fetch(PL_utf8_X_regular_begin,
4602 (U8*)locinput, utf8_target)) {
4603 locinput += UTF8SKIP(locinput);
4605 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4607 /* Here did not match the required 'Begin' in the
4608 * second term. So just match the very first
4609 * character, the '.' of the final term of the regex */
4610 locinput = starting + UTF8SKIP(starting);
4614 /* Here is a special begin. It can be composed of
4615 * several individual characters. One possibility is
4617 if ((len = is_GCB_RI_utf8(locinput))) {
4619 while (locinput < reginfo->strend
4620 && (len = is_GCB_RI_utf8(locinput)))
4624 } else if ((len = is_GCB_T_utf8(locinput))) {
4625 /* Another possibility is T+ */
4627 while (locinput < reginfo->strend
4628 && (len = is_GCB_T_utf8(locinput)))
4634 /* Here, neither RI+ nor T+; must be some other
4635 * Hangul. That means it is one of the others: L,
4636 * LV, LVT or V, and matches:
4637 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4640 while (locinput < reginfo->strend
4641 && (len = is_GCB_L_utf8(locinput)))
4646 /* Here, have exhausted L*. If the next character
4647 * is not an LV, LVT nor V, it means we had to have
4648 * at least one L, so matches L+ in the original
4649 * equation, we have a complete hangul syllable.
4652 if (locinput < reginfo->strend
4653 && is_GCB_LV_LVT_V_utf8(locinput))
4655 /* Otherwise keep going. Must be LV, LVT or V.
4656 * See if LVT, by first ruling out V, then LV */
4657 if (! is_GCB_V_utf8(locinput)
4658 /* All but every TCount one is LV */
4659 && (valid_utf8_to_uvchr((U8 *) locinput,
4664 locinput += UTF8SKIP(locinput);
4667 /* Must be V or LV. Take it, then match
4669 locinput += UTF8SKIP(locinput);
4670 while (locinput < reginfo->strend
4671 && (len = is_GCB_V_utf8(locinput)))
4677 /* And any of LV, LVT, or V can be followed
4679 while (locinput < reginfo->strend
4680 && (len = is_GCB_T_utf8(locinput)))
4688 /* Match any extender */
4689 while (locinput < reginfo->strend
4690 && swash_fetch(PL_utf8_X_extend,
4691 (U8*)locinput, utf8_target))
4693 locinput += UTF8SKIP(locinput);
4697 if (locinput > reginfo->strend) sayNO;
4701 case NREFFL: /* /\g{name}/il */
4702 { /* The capture buffer cases. The ones beginning with N for the
4703 named buffers just convert to the equivalent numbered and
4704 pretend they were called as the corresponding numbered buffer
4706 /* don't initialize these in the declaration, it makes C++
4711 const U8 *fold_array;
4714 RX_MATCH_TAINTED_on(reginfo->prog);
4715 folder = foldEQ_locale;
4716 fold_array = PL_fold_locale;
4718 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4721 case NREFFA: /* /\g{name}/iaa */
4722 folder = foldEQ_latin1;
4723 fold_array = PL_fold_latin1;
4725 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4728 case NREFFU: /* /\g{name}/iu */
4729 folder = foldEQ_latin1;
4730 fold_array = PL_fold_latin1;
4732 utf8_fold_flags = 0;
4735 case NREFF: /* /\g{name}/i */
4737 fold_array = PL_fold;
4739 utf8_fold_flags = 0;
4742 case NREF: /* /\g{name}/ */
4746 utf8_fold_flags = 0;
4749 /* For the named back references, find the corresponding buffer
4751 n = reg_check_named_buff_matched(rex,scan);
4756 goto do_nref_ref_common;
4758 case REFFL: /* /\1/il */
4759 RX_MATCH_TAINTED_on(reginfo->prog);
4760 folder = foldEQ_locale;
4761 fold_array = PL_fold_locale;
4762 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4765 case REFFA: /* /\1/iaa */
4766 folder = foldEQ_latin1;
4767 fold_array = PL_fold_latin1;
4768 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4771 case REFFU: /* /\1/iu */
4772 folder = foldEQ_latin1;
4773 fold_array = PL_fold_latin1;
4774 utf8_fold_flags = 0;
4777 case REFF: /* /\1/i */
4779 fold_array = PL_fold;
4780 utf8_fold_flags = 0;
4783 case REF: /* /\1/ */
4786 utf8_fold_flags = 0;
4790 n = ARG(scan); /* which paren pair */
4793 ln = rex->offs[n].start;
4794 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4795 if (rex->lastparen < n || ln == -1)
4796 sayNO; /* Do not match unless seen CLOSEn. */
4797 if (ln == rex->offs[n].end)
4800 s = reginfo->strbeg + ln;
4801 if (type != REF /* REF can do byte comparison */
4802 && (utf8_target || type == REFFU))
4803 { /* XXX handle REFFL better */
4804 char * limit = reginfo->strend;
4806 /* This call case insensitively compares the entire buffer
4807 * at s, with the current input starting at locinput, but
4808 * not going off the end given by reginfo->strend, and
4809 * returns in <limit> upon success, how much of the
4810 * current input was matched */
4811 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4812 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4820 /* Not utf8: Inline the first character, for speed. */
4821 if (!NEXTCHR_IS_EOS &&
4822 UCHARAT(s) != nextchr &&
4824 UCHARAT(s) != fold_array[nextchr]))
4826 ln = rex->offs[n].end - ln;
4827 if (locinput + ln > reginfo->strend)
4829 if (ln > 1 && (type == REF
4830 ? memNE(s, locinput, ln)
4831 : ! folder(s, locinput, ln)))
4837 case NOTHING: /* null op; e.g. the 'nothing' following
4838 * the '*' in m{(a+|b)*}' */
4840 case TAIL: /* placeholder while compiling (A|B|C) */
4843 case BACK: /* ??? doesn't appear to be used ??? */
4847 #define ST st->u.eval
4852 regexp_internal *rei;
4853 regnode *startpoint;
4855 case GOSTART: /* (?R) */
4856 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4857 if (cur_eval && cur_eval->locinput==locinput) {
4858 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4859 Perl_croak(aTHX_ "Infinite recursion in regex");
4860 if ( ++nochange_depth > max_nochange_depth )
4862 "Pattern subroutine nesting without pos change"
4863 " exceeded limit in regex");
4870 if (OP(scan)==GOSUB) {
4871 startpoint = scan + ARG2L(scan);
4872 ST.close_paren = ARG(scan);
4874 startpoint = rei->program+1;
4877 goto eval_recurse_doit;
4878 assert(0); /* NOTREACHED */
4880 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4881 if (cur_eval && cur_eval->locinput==locinput) {
4882 if ( ++nochange_depth > max_nochange_depth )
4883 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4888 /* execute the code in the {...} */
4892 OP * const oop = PL_op;
4893 COP * const ocurcop = PL_curcop;
4897 /* save *all* paren positions */
4898 regcppush(rex, 0, maxopenparen);
4899 REGCP_SET(runops_cp);
4902 caller_cv = find_runcv(NULL);
4906 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4908 (REGEXP*)(rexi->data->data[n])
4911 nop = (OP*)rexi->data->data[n+1];
4913 else if (rexi->data->what[n] == 'l') { /* literal code */
4915 nop = (OP*)rexi->data->data[n];
4916 assert(CvDEPTH(newcv));
4919 /* literal with own CV */
4920 assert(rexi->data->what[n] == 'L');
4921 newcv = rex->qr_anoncv;
4922 nop = (OP*)rexi->data->data[n];
4925 /* normally if we're about to execute code from the same
4926 * CV that we used previously, we just use the existing
4927 * CX stack entry. However, its possible that in the
4928 * meantime we may have backtracked, popped from the save
4929 * stack, and undone the SAVECOMPPAD(s) associated with
4930 * PUSH_MULTICALL; in which case PL_comppad no longer
4931 * points to newcv's pad. */
4932 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4934 U8 flags = (CXp_SUB_RE |
4935 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4936 if (last_pushed_cv) {
4937 CHANGE_MULTICALL_FLAGS(newcv, flags);
4940 PUSH_MULTICALL_FLAGS(newcv, flags);
4942 last_pushed_cv = newcv;
4945 /* these assignments are just to silence compiler
4947 multicall_cop = NULL;
4950 last_pad = PL_comppad;
4952 /* the initial nextstate you would normally execute
4953 * at the start of an eval (which would cause error
4954 * messages to come from the eval), may be optimised
4955 * away from the execution path in the regex code blocks;
4956 * so manually set PL_curcop to it initially */
4958 OP *o = cUNOPx(nop)->op_first;
4959 assert(o->op_type == OP_NULL);
4960 if (o->op_targ == OP_SCOPE) {
4961 o = cUNOPo->op_first;
4964 assert(o->op_targ == OP_LEAVE);
4965 o = cUNOPo->op_first;
4966 assert(o->op_type == OP_ENTER);
4970 if (o->op_type != OP_STUB) {
4971 assert( o->op_type == OP_NEXTSTATE
4972 || o->op_type == OP_DBSTATE
4973 || (o->op_type == OP_NULL
4974 && ( o->op_targ == OP_NEXTSTATE
4975 || o->op_targ == OP_DBSTATE
4979 PL_curcop = (COP*)o;
4984 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4985 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4987 rex->offs[0].end = locinput - reginfo->strbeg;
4988 if (reginfo->info_aux_eval->pos_magic)
4989 reginfo->info_aux_eval->pos_magic->mg_len
4990 = locinput - reginfo->strbeg;
4993 SV *sv_mrk = get_sv("REGMARK", 1);
4994 sv_setsv(sv_mrk, sv_yes_mark);
4997 /* we don't use MULTICALL here as we want to call the
4998 * first op of the block of interest, rather than the
4999 * first op of the sub */
5000 before = (IV)(SP-PL_stack_base);
5002 CALLRUNOPS(aTHX); /* Scalar context. */
5004 if ((IV)(SP-PL_stack_base) == before)
5005 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
5011 /* before restoring everything, evaluate the returned
5012 * value, so that 'uninit' warnings don't use the wrong
5013 * PL_op or pad. Also need to process any magic vars
5014 * (e.g. $1) *before* parentheses are restored */
5019 if (logical == 0) /* (?{})/ */
5020 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5021 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5022 sw = cBOOL(SvTRUE(ret));
5025 else { /* /(??{}) */
5026 /* if its overloaded, let the regex compiler handle
5027 * it; otherwise extract regex, or stringify */
5028 if (!SvAMAGIC(ret)) {
5032 if (SvTYPE(sv) == SVt_REGEXP)
5033 re_sv = (REGEXP*) sv;
5034 else if (SvSMAGICAL(sv)) {
5035 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5037 re_sv = (REGEXP *) mg->mg_obj;
5040 /* force any magic, undef warnings here */
5042 ret = sv_mortalcopy(ret);
5043 (void) SvPV_force_nolen(ret);
5049 /* *** Note that at this point we don't restore
5050 * PL_comppad, (or pop the CxSUB) on the assumption it may
5051 * be used again soon. This is safe as long as nothing
5052 * in the regexp code uses the pad ! */
5054 PL_curcop = ocurcop;
5055 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5056 PL_curpm = PL_reg_curpm;
5062 /* only /(??{})/ from now on */
5065 /* extract RE object from returned value; compiling if
5069 re_sv = reg_temp_copy(NULL, re_sv);
5074 if (SvUTF8(ret) && IN_BYTES) {
5075 /* In use 'bytes': make a copy of the octet
5076 * sequence, but without the flag on */
5078 const char *const p = SvPV(ret, len);
5079 ret = newSVpvn_flags(p, len, SVs_TEMP);
5081 if (rex->intflags & PREGf_USE_RE_EVAL)
5082 pm_flags |= PMf_USE_RE_EVAL;
5084 /* if we got here, it should be an engine which
5085 * supports compiling code blocks and stuff */
5086 assert(rex->engine && rex->engine->op_comp);
5087 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5088 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5089 rex->engine, NULL, NULL,
5090 /* copy /msix etc to inner pattern */
5095 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5097 /* This isn't a first class regexp. Instead, it's
5098 caching a regexp onto an existing, Perl visible
5100 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5102 /* safe to do now that any $1 etc has been
5103 * interpolated into the new pattern string and
5105 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5110 RXp_MATCH_COPIED_off(re);
5111 re->subbeg = rex->subbeg;
5112 re->sublen = rex->sublen;
5113 re->suboffset = rex->suboffset;
5114 re->subcoffset = rex->subcoffset;
5117 debug_start_match(re_sv, utf8_target, locinput,
5118 reginfo->strend, "Matching embedded");
5120 startpoint = rei->program + 1;
5121 ST.close_paren = 0; /* only used for GOSUB */
5123 eval_recurse_doit: /* Share code with GOSUB below this line */
5124 /* run the pattern returned from (??{...}) */
5126 /* Save *all* the positions. */
5127 ST.cp = regcppush(rex, 0, maxopenparen);
5128 REGCP_SET(ST.lastcp);
5131 re->lastcloseparen = 0;
5135 /* invalidate the S-L poscache. We're now executing a
5136 * different set of WHILEM ops (and their associated
5137 * indexes) against the same string, so the bits in the
5138 * cache are meaningless. Setting maxiter to zero forces
5139 * the cache to be invalidated and zeroed before reuse.
5140 * XXX This is too dramatic a measure. Ideally we should
5141 * save the old cache and restore when running the outer
5143 reginfo->poscache_maxiter = 0;
5145 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5147 ST.prev_rex = rex_sv;
5148 ST.prev_curlyx = cur_curlyx;
5150 SET_reg_curpm(rex_sv);
5155 ST.prev_eval = cur_eval;
5157 /* now continue from first node in postoned RE */
5158 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5159 assert(0); /* NOTREACHED */
5162 case EVAL_AB: /* cleanup after a successful (??{A})B */
5163 /* note: this is called twice; first after popping B, then A */
5164 rex_sv = ST.prev_rex;
5165 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5166 SET_reg_curpm(rex_sv);
5167 rex = ReANY(rex_sv);
5168 rexi = RXi_GET(rex);
5170 cur_eval = ST.prev_eval;
5171 cur_curlyx = ST.prev_curlyx;
5173 /* Invalidate cache. See "invalidate" comment above. */
5174 reginfo->poscache_maxiter = 0;
5175 if ( nochange_depth )
5180 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5181 /* note: this is called twice; first after popping B, then A */
5182 rex_sv = ST.prev_rex;
5183 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5184 SET_reg_curpm(rex_sv);
5185 rex = ReANY(rex_sv);
5186 rexi = RXi_GET(rex);
5188 REGCP_UNWIND(ST.lastcp);
5189 regcppop(rex, &maxopenparen);
5190 cur_eval = ST.prev_eval;
5191 cur_curlyx = ST.prev_curlyx;
5192 /* Invalidate cache. See "invalidate" comment above. */
5193 reginfo->poscache_maxiter = 0;
5194 if ( nochange_depth )
5200 n = ARG(scan); /* which paren pair */
5201 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5202 if (n > maxopenparen)
5204 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5205 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5209 (IV)rex->offs[n].start_tmp,
5215 /* XXX really need to log other places start/end are set too */
5216 #define CLOSE_CAPTURE \
5217 rex->offs[n].start = rex->offs[n].start_tmp; \
5218 rex->offs[n].end = locinput - reginfo->strbeg; \
5219 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5220 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5222 PTR2UV(rex->offs), \
5224 (IV)rex->offs[n].start, \
5225 (IV)rex->offs[n].end \
5229 n = ARG(scan); /* which paren pair */
5231 if (n > rex->lastparen)
5233 rex->lastcloseparen = n;
5234 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5239 case ACCEPT: /* (*ACCEPT) */
5243 cursor && OP(cursor)!=END;
5244 cursor=regnext(cursor))
5246 if ( OP(cursor)==CLOSE ){
5248 if ( n <= lastopen ) {
5250 if (n > rex->lastparen)
5252 rex->lastcloseparen = n;
5253 if ( n == ARG(scan) || (cur_eval &&
5254 cur_eval->u.eval.close_paren == n))
5263 case GROUPP: /* (?(1)) */
5264 n = ARG(scan); /* which paren pair */
5265 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5268 case NGROUPP: /* (?(<name>)) */
5269 /* reg_check_named_buff_matched returns 0 for no match */
5270 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5273 case INSUBP: /* (?(R)) */
5275 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5278 case DEFINEP: /* (?(DEFINE)) */
5282 case IFTHEN: /* (?(cond)A|B) */
5283 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5285 next = NEXTOPER(NEXTOPER(scan));
5287 next = scan + ARG(scan);
5288 if (OP(next) == IFTHEN) /* Fake one. */
5289 next = NEXTOPER(NEXTOPER(next));
5293 case LOGICAL: /* modifier for EVAL and IFMATCH */
5294 logical = scan->flags;
5297 /*******************************************************************
5299 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5300 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5301 STAR/PLUS/CURLY/CURLYN are used instead.)
5303 A*B is compiled as <CURLYX><A><WHILEM><B>
5305 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5306 state, which contains the current count, initialised to -1. It also sets
5307 cur_curlyx to point to this state, with any previous value saved in the
5310 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5311 since the pattern may possibly match zero times (i.e. it's a while {} loop
5312 rather than a do {} while loop).
5314 Each entry to WHILEM represents a successful match of A. The count in the
5315 CURLYX block is incremented, another WHILEM state is pushed, and execution
5316 passes to A or B depending on greediness and the current count.
5318 For example, if matching against the string a1a2a3b (where the aN are
5319 substrings that match /A/), then the match progresses as follows: (the
5320 pushed states are interspersed with the bits of strings matched so far):
5323 <CURLYX cnt=0><WHILEM>
5324 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5325 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5326 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5327 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5329 (Contrast this with something like CURLYM, which maintains only a single
5333 a1 <CURLYM cnt=1> a2
5334 a1 a2 <CURLYM cnt=2> a3
5335 a1 a2 a3 <CURLYM cnt=3> b
5338 Each WHILEM state block marks a point to backtrack to upon partial failure
5339 of A or B, and also contains some minor state data related to that
5340 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5341 overall state, such as the count, and pointers to the A and B ops.
5343 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5344 must always point to the *current* CURLYX block, the rules are:
5346 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5347 and set cur_curlyx to point the new block.
5349 When popping the CURLYX block after a successful or unsuccessful match,
5350 restore the previous cur_curlyx.
5352 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5353 to the outer one saved in the CURLYX block.
5355 When popping the WHILEM block after a successful or unsuccessful B match,
5356 restore the previous cur_curlyx.
5358 Here's an example for the pattern (AI* BI)*BO
5359 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5362 curlyx backtrack stack
5363 ------ ---------------
5365 CO <CO prev=NULL> <WO>
5366 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5367 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5368 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5370 At this point the pattern succeeds, and we work back down the stack to
5371 clean up, restoring as we go:
5373 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5374 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5375 CO <CO prev=NULL> <WO>
5378 *******************************************************************/
5380 #define ST st->u.curlyx
5382 case CURLYX: /* start of /A*B/ (for complex A) */
5384 /* No need to save/restore up to this paren */
5385 I32 parenfloor = scan->flags;
5387 assert(next); /* keep Coverity happy */
5388 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5391 /* XXXX Probably it is better to teach regpush to support
5392 parenfloor > maxopenparen ... */
5393 if (parenfloor > (I32)rex->lastparen)
5394 parenfloor = rex->lastparen; /* Pessimization... */
5396 ST.prev_curlyx= cur_curlyx;
5398 ST.cp = PL_savestack_ix;
5400 /* these fields contain the state of the current curly.
5401 * they are accessed by subsequent WHILEMs */
5402 ST.parenfloor = parenfloor;
5407 ST.count = -1; /* this will be updated by WHILEM */
5408 ST.lastloc = NULL; /* this will be updated by WHILEM */
5410 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5411 assert(0); /* NOTREACHED */
5414 case CURLYX_end: /* just finished matching all of A*B */
5415 cur_curlyx = ST.prev_curlyx;
5417 assert(0); /* NOTREACHED */
5419 case CURLYX_end_fail: /* just failed to match all of A*B */
5421 cur_curlyx = ST.prev_curlyx;
5423 assert(0); /* NOTREACHED */
5427 #define ST st->u.whilem
5429 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5431 /* see the discussion above about CURLYX/WHILEM */
5433 int min = ARG1(cur_curlyx->u.curlyx.me);
5434 int max = ARG2(cur_curlyx->u.curlyx.me);
5435 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5437 assert(cur_curlyx); /* keep Coverity happy */
5438 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5439 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5440 ST.cache_offset = 0;
5444 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5445 "%*s whilem: matched %ld out of %d..%d\n",
5446 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5449 /* First just match a string of min A's. */
5452 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5454 cur_curlyx->u.curlyx.lastloc = locinput;
5455 REGCP_SET(ST.lastcp);
5457 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5458 assert(0); /* NOTREACHED */
5461 /* If degenerate A matches "", assume A done. */
5463 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5464 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5465 "%*s whilem: empty match detected, trying continuation...\n",
5466 REPORT_CODE_OFF+depth*2, "")
5468 goto do_whilem_B_max;
5471 /* super-linear cache processing.
5473 * The idea here is that for certain types of CURLYX/WHILEM -
5474 * principally those whose upper bound is infinity (and
5475 * excluding regexes that have things like \1 and other very
5476 * non-regular expresssiony things), then if a pattern like
5477 * /....A*.../ fails and we backtrack to the WHILEM, then we
5478 * make a note that this particular WHILEM op was at string
5479 * position 47 (say) when the rest of pattern failed. Then, if
5480 * we ever find ourselves back at that WHILEM, and at string
5481 * position 47 again, we can just fail immediately rather than
5482 * running the rest of the pattern again.
5484 * This is very handy when patterns start to go
5485 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5486 * with a combinatorial explosion of backtracking.
5488 * The cache is implemented as a bit array, with one bit per
5489 * string byte position per WHILEM op (up to 16) - so its
5490 * between 0.25 and 2x the string size.
5492 * To avoid allocating a poscache buffer every time, we do an
5493 * initially countdown; only after we have executed a WHILEM
5494 * op (string-length x #WHILEMs) times do we allocate the
5497 * The top 4 bits of scan->flags byte say how many different
5498 * relevant CURLLYX/WHILEM op pairs there are, while the
5499 * bottom 4-bits is the identifying index number of this
5505 if (!reginfo->poscache_maxiter) {
5506 /* start the countdown: Postpone detection until we
5507 * know the match is not *that* much linear. */
5508 reginfo->poscache_maxiter
5509 = (reginfo->strend - reginfo->strbeg + 1)
5511 /* possible overflow for long strings and many CURLYX's */
5512 if (reginfo->poscache_maxiter < 0)
5513 reginfo->poscache_maxiter = I32_MAX;
5514 reginfo->poscache_iter = reginfo->poscache_maxiter;
5517 if (reginfo->poscache_iter-- == 0) {
5518 /* initialise cache */
5519 const I32 size = (reginfo->poscache_maxiter + 7)/8;
5520 regmatch_info_aux *const aux = reginfo->info_aux;
5521 if (aux->poscache) {
5522 if ((I32)reginfo->poscache_size < size) {
5523 Renew(aux->poscache, size, char);
5524 reginfo->poscache_size = size;
5526 Zero(aux->poscache, size, char);
5529 reginfo->poscache_size = size;
5530 Newxz(aux->poscache, size, char);
5532 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5533 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5534 PL_colors[4], PL_colors[5])
5538 if (reginfo->poscache_iter < 0) {
5539 /* have we already failed at this position? */
5542 reginfo->poscache_iter = -1; /* stop eventual underflow */
5543 offset = (scan->flags & 0xf) - 1
5544 + (locinput - reginfo->strbeg)
5546 mask = 1 << (offset % 8);
5548 if (reginfo->info_aux->poscache[offset] & mask) {
5549 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5550 "%*s whilem: (cache) already tried at this position...\n",
5551 REPORT_CODE_OFF+depth*2, "")
5553 sayNO; /* cache records failure */
5555 ST.cache_offset = offset;
5556 ST.cache_mask = mask;
5560 /* Prefer B over A for minimal matching. */
5562 if (cur_curlyx->u.curlyx.minmod) {
5563 ST.save_curlyx = cur_curlyx;
5564 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5565 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5567 REGCP_SET(ST.lastcp);
5568 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5570 assert(0); /* NOTREACHED */
5573 /* Prefer A over B for maximal matching. */
5575 if (n < max) { /* More greed allowed? */
5576 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5578 cur_curlyx->u.curlyx.lastloc = locinput;
5579 REGCP_SET(ST.lastcp);
5580 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5581 assert(0); /* NOTREACHED */
5583 goto do_whilem_B_max;
5585 assert(0); /* NOTREACHED */
5587 case WHILEM_B_min: /* just matched B in a minimal match */
5588 case WHILEM_B_max: /* just matched B in a maximal match */
5589 cur_curlyx = ST.save_curlyx;
5591 assert(0); /* NOTREACHED */
5593 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5594 cur_curlyx = ST.save_curlyx;
5595 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5596 cur_curlyx->u.curlyx.count--;
5598 assert(0); /* NOTREACHED */
5600 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5602 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5603 REGCP_UNWIND(ST.lastcp);
5604 regcppop(rex, &maxopenparen);
5605 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5606 cur_curlyx->u.curlyx.count--;
5608 assert(0); /* NOTREACHED */
5610 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5611 REGCP_UNWIND(ST.lastcp);
5612 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5614 "%*s whilem: failed, trying continuation...\n",
5615 REPORT_CODE_OFF+depth*2, "")
5618 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5619 && ckWARN(WARN_REGEXP)
5620 && !reginfo->warned)
5622 reginfo->warned = TRUE;
5623 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5624 "Complex regular subexpression recursion limit (%d) "
5630 ST.save_curlyx = cur_curlyx;
5631 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5632 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5634 assert(0); /* NOTREACHED */
5636 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5637 cur_curlyx = ST.save_curlyx;
5638 REGCP_UNWIND(ST.lastcp);
5639 regcppop(rex, &maxopenparen);
5641 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5642 /* Maximum greed exceeded */
5643 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5644 && ckWARN(WARN_REGEXP)
5645 && !reginfo->warned)
5647 reginfo->warned = TRUE;
5648 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5649 "Complex regular subexpression recursion "
5650 "limit (%d) exceeded",
5653 cur_curlyx->u.curlyx.count--;
5657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5658 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5660 /* Try grabbing another A and see if it helps. */
5661 cur_curlyx->u.curlyx.lastloc = locinput;
5662 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5664 REGCP_SET(ST.lastcp);
5665 PUSH_STATE_GOTO(WHILEM_A_min,
5666 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5668 assert(0); /* NOTREACHED */
5671 #define ST st->u.branch
5673 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5674 next = scan + ARG(scan);
5677 scan = NEXTOPER(scan);
5680 case BRANCH: /* /(...|A|...)/ */
5681 scan = NEXTOPER(scan); /* scan now points to inner node */
5682 ST.lastparen = rex->lastparen;
5683 ST.lastcloseparen = rex->lastcloseparen;
5684 ST.next_branch = next;
5687 /* Now go into the branch */
5689 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5691 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5693 assert(0); /* NOTREACHED */
5695 case CUTGROUP: /* /(*THEN)/ */
5696 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5697 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5698 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5699 assert(0); /* NOTREACHED */
5701 case CUTGROUP_next_fail:
5704 if (st->u.mark.mark_name)
5705 sv_commit = st->u.mark.mark_name;
5707 assert(0); /* NOTREACHED */
5711 assert(0); /* NOTREACHED */
5713 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5718 REGCP_UNWIND(ST.cp);
5719 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5720 scan = ST.next_branch;
5721 /* no more branches? */
5722 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5724 PerlIO_printf( Perl_debug_log,
5725 "%*s %sBRANCH failed...%s\n",
5726 REPORT_CODE_OFF+depth*2, "",
5732 continue; /* execute next BRANCH[J] op */
5733 assert(0); /* NOTREACHED */
5735 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5740 #define ST st->u.curlym
5742 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5744 /* This is an optimisation of CURLYX that enables us to push
5745 * only a single backtracking state, no matter how many matches
5746 * there are in {m,n}. It relies on the pattern being constant
5747 * length, with no parens to influence future backrefs
5751 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5753 ST.lastparen = rex->lastparen;
5754 ST.lastcloseparen = rex->lastcloseparen;
5756 /* if paren positive, emulate an OPEN/CLOSE around A */
5758 U32 paren = ST.me->flags;
5759 if (paren > maxopenparen)
5760 maxopenparen = paren;
5761 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5769 ST.c1 = CHRTEST_UNINIT;
5772 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5775 curlym_do_A: /* execute the A in /A{m,n}B/ */
5776 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5777 assert(0); /* NOTREACHED */
5779 case CURLYM_A: /* we've just matched an A */
5781 /* after first match, determine A's length: u.curlym.alen */
5782 if (ST.count == 1) {
5783 if (reginfo->is_utf8_target) {
5784 char *s = st->locinput;
5785 while (s < locinput) {
5791 ST.alen = locinput - st->locinput;
5794 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5797 PerlIO_printf(Perl_debug_log,
5798 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5799 (int)(REPORT_CODE_OFF+(depth*2)), "",
5800 (IV) ST.count, (IV)ST.alen)
5803 if (cur_eval && cur_eval->u.eval.close_paren &&
5804 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5808 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5809 if ( max == REG_INFTY || ST.count < max )
5810 goto curlym_do_A; /* try to match another A */
5812 goto curlym_do_B; /* try to match B */
5814 case CURLYM_A_fail: /* just failed to match an A */
5815 REGCP_UNWIND(ST.cp);
5817 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5818 || (cur_eval && cur_eval->u.eval.close_paren &&
5819 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5822 curlym_do_B: /* execute the B in /A{m,n}B/ */
5823 if (ST.c1 == CHRTEST_UNINIT) {
5824 /* calculate c1 and c2 for possible match of 1st char
5825 * following curly */
5826 ST.c1 = ST.c2 = CHRTEST_VOID;
5827 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5828 regnode *text_node = ST.B;
5829 if (! HAS_TEXT(text_node))
5830 FIND_NEXT_IMPT(text_node);
5833 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5835 But the former is redundant in light of the latter.
5837 if this changes back then the macro for
5838 IS_TEXT and friends need to change.
5840 if (PL_regkind[OP(text_node)] == EXACT) {
5841 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5842 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5852 PerlIO_printf(Perl_debug_log,
5853 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5854 (int)(REPORT_CODE_OFF+(depth*2)),
5857 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5858 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5859 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5860 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5862 /* simulate B failing */
5864 PerlIO_printf(Perl_debug_log,
5865 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5866 (int)(REPORT_CODE_OFF+(depth*2)),"",
5867 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5868 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5869 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5871 state_num = CURLYM_B_fail;
5872 goto reenter_switch;
5875 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5876 /* simulate B failing */
5878 PerlIO_printf(Perl_debug_log,
5879 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5880 (int)(REPORT_CODE_OFF+(depth*2)),"",
5881 (int) nextchr, ST.c1, ST.c2)
5883 state_num = CURLYM_B_fail;
5884 goto reenter_switch;
5889 /* emulate CLOSE: mark current A as captured */
5890 I32 paren = ST.me->flags;
5892 rex->offs[paren].start
5893 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5894 rex->offs[paren].end = locinput - reginfo->strbeg;
5895 if ((U32)paren > rex->lastparen)
5896 rex->lastparen = paren;
5897 rex->lastcloseparen = paren;
5900 rex->offs[paren].end = -1;
5901 if (cur_eval && cur_eval->u.eval.close_paren &&
5902 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5911 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5912 assert(0); /* NOTREACHED */
5914 case CURLYM_B_fail: /* just failed to match a B */
5915 REGCP_UNWIND(ST.cp);
5916 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5918 I32 max = ARG2(ST.me);
5919 if (max != REG_INFTY && ST.count == max)
5921 goto curlym_do_A; /* try to match a further A */
5923 /* backtrack one A */
5924 if (ST.count == ARG1(ST.me) /* min */)
5927 SET_locinput(HOPc(locinput, -ST.alen));
5928 goto curlym_do_B; /* try to match B */
5931 #define ST st->u.curly
5933 #define CURLY_SETPAREN(paren, success) \
5936 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5937 rex->offs[paren].end = locinput - reginfo->strbeg; \
5938 if (paren > rex->lastparen) \
5939 rex->lastparen = paren; \
5940 rex->lastcloseparen = paren; \
5943 rex->offs[paren].end = -1; \
5944 rex->lastparen = ST.lastparen; \
5945 rex->lastcloseparen = ST.lastcloseparen; \
5949 case STAR: /* /A*B/ where A is width 1 char */
5953 scan = NEXTOPER(scan);
5956 case PLUS: /* /A+B/ where A is width 1 char */
5960 scan = NEXTOPER(scan);
5963 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5964 ST.paren = scan->flags; /* Which paren to set */
5965 ST.lastparen = rex->lastparen;
5966 ST.lastcloseparen = rex->lastcloseparen;
5967 if (ST.paren > maxopenparen)
5968 maxopenparen = ST.paren;
5969 ST.min = ARG1(scan); /* min to match */
5970 ST.max = ARG2(scan); /* max to match */
5971 if (cur_eval && cur_eval->u.eval.close_paren &&
5972 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5976 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5979 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5981 ST.min = ARG1(scan); /* min to match */
5982 ST.max = ARG2(scan); /* max to match */
5983 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5986 * Lookahead to avoid useless match attempts
5987 * when we know what character comes next.
5989 * Used to only do .*x and .*?x, but now it allows
5990 * for )'s, ('s and (?{ ... })'s to be in the way
5991 * of the quantifier and the EXACT-like node. -- japhy
5994 assert(ST.min <= ST.max);
5995 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5996 ST.c1 = ST.c2 = CHRTEST_VOID;
5999 regnode *text_node = next;
6001 if (! HAS_TEXT(text_node))
6002 FIND_NEXT_IMPT(text_node);
6004 if (! HAS_TEXT(text_node))
6005 ST.c1 = ST.c2 = CHRTEST_VOID;
6007 if ( PL_regkind[OP(text_node)] != EXACT ) {
6008 ST.c1 = ST.c2 = CHRTEST_VOID;
6012 /* Currently we only get here when
6014 PL_rekind[OP(text_node)] == EXACT
6016 if this changes back then the macro for IS_TEXT and
6017 friends need to change. */
6018 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6019 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6031 char *li = locinput;
6034 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6040 if (ST.c1 == CHRTEST_VOID)
6041 goto curly_try_B_min;
6043 ST.oldloc = locinput;
6045 /* set ST.maxpos to the furthest point along the
6046 * string that could possibly match */
6047 if (ST.max == REG_INFTY) {
6048 ST.maxpos = reginfo->strend - 1;
6050 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6053 else if (utf8_target) {
6054 int m = ST.max - ST.min;
6055 for (ST.maxpos = locinput;
6056 m >0 && ST.maxpos < reginfo->strend; m--)
6057 ST.maxpos += UTF8SKIP(ST.maxpos);
6060 ST.maxpos = locinput + ST.max - ST.min;
6061 if (ST.maxpos >= reginfo->strend)
6062 ST.maxpos = reginfo->strend - 1;
6064 goto curly_try_B_min_known;
6068 /* avoid taking address of locinput, so it can remain
6070 char *li = locinput;
6071 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6072 if (ST.count < ST.min)
6075 if ((ST.count > ST.min)
6076 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6078 /* A{m,n} must come at the end of the string, there's
6079 * no point in backing off ... */
6081 /* ...except that $ and \Z can match before *and* after
6082 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6083 We may back off by one in this case. */
6084 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6088 goto curly_try_B_max;
6090 assert(0); /* NOTREACHED */
6093 case CURLY_B_min_known_fail:
6094 /* failed to find B in a non-greedy match where c1,c2 valid */
6096 REGCP_UNWIND(ST.cp);
6098 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6100 /* Couldn't or didn't -- move forward. */
6101 ST.oldloc = locinput;
6103 locinput += UTF8SKIP(locinput);
6107 curly_try_B_min_known:
6108 /* find the next place where 'B' could work, then call B */
6112 n = (ST.oldloc == locinput) ? 0 : 1;
6113 if (ST.c1 == ST.c2) {
6114 /* set n to utf8_distance(oldloc, locinput) */
6115 while (locinput <= ST.maxpos
6116 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6118 locinput += UTF8SKIP(locinput);
6123 /* set n to utf8_distance(oldloc, locinput) */
6124 while (locinput <= ST.maxpos
6125 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6126 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6128 locinput += UTF8SKIP(locinput);
6133 else { /* Not utf8_target */
6134 if (ST.c1 == ST.c2) {
6135 while (locinput <= ST.maxpos &&
6136 UCHARAT(locinput) != ST.c1)
6140 while (locinput <= ST.maxpos
6141 && UCHARAT(locinput) != ST.c1
6142 && UCHARAT(locinput) != ST.c2)
6145 n = locinput - ST.oldloc;
6147 if (locinput > ST.maxpos)
6150 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6151 * at b; check that everything between oldloc and
6152 * locinput matches */
6153 char *li = ST.oldloc;
6155 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6157 assert(n == REG_INFTY || locinput == li);
6159 CURLY_SETPAREN(ST.paren, ST.count);
6160 if (cur_eval && cur_eval->u.eval.close_paren &&
6161 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6164 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6166 assert(0); /* NOTREACHED */
6169 case CURLY_B_min_fail:
6170 /* failed to find B in a non-greedy match where c1,c2 invalid */
6172 REGCP_UNWIND(ST.cp);
6174 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6176 /* failed -- move forward one */
6178 char *li = locinput;
6179 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6186 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6187 ST.count > 0)) /* count overflow ? */
6190 CURLY_SETPAREN(ST.paren, ST.count);
6191 if (cur_eval && cur_eval->u.eval.close_paren &&
6192 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6195 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6199 assert(0); /* NOTREACHED */
6203 /* a successful greedy match: now try to match B */
6204 if (cur_eval && cur_eval->u.eval.close_paren &&
6205 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6209 bool could_match = locinput < reginfo->strend;
6211 /* If it could work, try it. */
6212 if (ST.c1 != CHRTEST_VOID && could_match) {
6213 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6215 could_match = memEQ(locinput,
6220 UTF8SKIP(locinput));
6223 could_match = UCHARAT(locinput) == ST.c1
6224 || UCHARAT(locinput) == ST.c2;
6227 if (ST.c1 == CHRTEST_VOID || could_match) {
6228 CURLY_SETPAREN(ST.paren, ST.count);
6229 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6230 assert(0); /* NOTREACHED */
6235 case CURLY_B_max_fail:
6236 /* failed to find B in a greedy match */
6238 REGCP_UNWIND(ST.cp);
6240 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6243 if (--ST.count < ST.min)
6245 locinput = HOPc(locinput, -1);
6246 goto curly_try_B_max;
6250 case END: /* last op of main pattern */
6253 /* we've just finished A in /(??{A})B/; now continue with B */
6255 st->u.eval.prev_rex = rex_sv; /* inner */
6257 /* Save *all* the positions. */
6258 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6259 rex_sv = cur_eval->u.eval.prev_rex;
6260 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6261 SET_reg_curpm(rex_sv);
6262 rex = ReANY(rex_sv);
6263 rexi = RXi_GET(rex);
6264 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6266 REGCP_SET(st->u.eval.lastcp);
6268 /* Restore parens of the outer rex without popping the
6270 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6273 st->u.eval.prev_eval = cur_eval;
6274 cur_eval = cur_eval->u.eval.prev_eval;
6276 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6277 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6278 if ( nochange_depth )
6281 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6282 locinput); /* match B */
6285 if (locinput < reginfo->till) {
6286 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6287 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6289 (long)(locinput - startpos),
6290 (long)(reginfo->till - startpos),
6293 sayNO_SILENT; /* Cannot match: too short. */
6295 sayYES; /* Success! */
6297 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6299 PerlIO_printf(Perl_debug_log,
6300 "%*s %ssubpattern success...%s\n",
6301 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6302 sayYES; /* Success! */
6305 #define ST st->u.ifmatch
6310 case SUSPEND: /* (?>A) */
6312 newstart = locinput;
6315 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6317 goto ifmatch_trivial_fail_test;
6319 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6321 ifmatch_trivial_fail_test:
6323 char * const s = HOPBACKc(locinput, scan->flags);
6328 sw = 1 - cBOOL(ST.wanted);
6332 next = scan + ARG(scan);
6340 newstart = locinput;
6344 ST.logical = logical;
6345 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6347 /* execute body of (?...A) */
6348 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6349 assert(0); /* NOTREACHED */
6352 case IFMATCH_A_fail: /* body of (?...A) failed */
6353 ST.wanted = !ST.wanted;
6356 case IFMATCH_A: /* body of (?...A) succeeded */
6358 sw = cBOOL(ST.wanted);
6360 else if (!ST.wanted)
6363 if (OP(ST.me) != SUSPEND) {
6364 /* restore old position except for (?>...) */
6365 locinput = st->locinput;
6367 scan = ST.me + ARG(ST.me);
6370 continue; /* execute B */
6374 case LONGJMP: /* alternative with many branches compiles to
6375 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6376 next = scan + ARG(scan);
6381 case COMMIT: /* (*COMMIT) */
6382 reginfo->cutpoint = reginfo->strend;
6385 case PRUNE: /* (*PRUNE) */
6387 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6388 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6389 assert(0); /* NOTREACHED */
6391 case COMMIT_next_fail:
6395 case OPFAIL: /* (*FAIL) */
6397 assert(0); /* NOTREACHED */
6399 #define ST st->u.mark
6400 case MARKPOINT: /* (*MARK:foo) */
6401 ST.prev_mark = mark_state;
6402 ST.mark_name = sv_commit = sv_yes_mark
6403 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6405 ST.mark_loc = locinput;
6406 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6407 assert(0); /* NOTREACHED */
6409 case MARKPOINT_next:
6410 mark_state = ST.prev_mark;
6412 assert(0); /* NOTREACHED */
6414 case MARKPOINT_next_fail:
6415 if (popmark && sv_eq(ST.mark_name,popmark))
6417 if (ST.mark_loc > startpoint)
6418 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6419 popmark = NULL; /* we found our mark */
6420 sv_commit = ST.mark_name;
6423 PerlIO_printf(Perl_debug_log,
6424 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6425 REPORT_CODE_OFF+depth*2, "",
6426 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6429 mark_state = ST.prev_mark;
6430 sv_yes_mark = mark_state ?
6431 mark_state->u.mark.mark_name : NULL;
6433 assert(0); /* NOTREACHED */
6435 case SKIP: /* (*SKIP) */
6437 /* (*SKIP) : if we fail we cut here*/
6438 ST.mark_name = NULL;
6439 ST.mark_loc = locinput;
6440 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6442 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6443 otherwise do nothing. Meaning we need to scan
6445 regmatch_state *cur = mark_state;
6446 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6449 if ( sv_eq( cur->u.mark.mark_name,
6452 ST.mark_name = find;
6453 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6455 cur = cur->u.mark.prev_mark;
6458 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6461 case SKIP_next_fail:
6463 /* (*CUT:NAME) - Set up to search for the name as we
6464 collapse the stack*/
6465 popmark = ST.mark_name;
6467 /* (*CUT) - No name, we cut here.*/
6468 if (ST.mark_loc > startpoint)
6469 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6470 /* but we set sv_commit to latest mark_name if there
6471 is one so they can test to see how things lead to this
6474 sv_commit=mark_state->u.mark.mark_name;
6478 assert(0); /* NOTREACHED */
6481 case LNBREAK: /* \R */
6482 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6489 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6490 PTR2UV(scan), OP(scan));
6491 Perl_croak(aTHX_ "regexp memory corruption");
6493 /* this is a point to jump to in order to increment
6494 * locinput by one character */
6496 assert(!NEXTCHR_IS_EOS);
6498 locinput += PL_utf8skip[nextchr];
6499 /* locinput is allowed to go 1 char off the end, but not 2+ */
6500 if (locinput > reginfo->strend)
6509 /* switch break jumps here */
6510 scan = next; /* prepare to execute the next op and ... */
6511 continue; /* ... jump back to the top, reusing st */
6512 assert(0); /* NOTREACHED */
6515 /* push a state that backtracks on success */
6516 st->u.yes.prev_yes_state = yes_state;
6520 /* push a new regex state, then continue at scan */
6522 regmatch_state *newst;
6525 regmatch_state *cur = st;
6526 regmatch_state *curyes = yes_state;
6528 regmatch_slab *slab = PL_regmatch_slab;
6529 for (;curd > -1;cur--,curd--) {
6530 if (cur < SLAB_FIRST(slab)) {
6532 cur = SLAB_LAST(slab);
6534 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6535 REPORT_CODE_OFF + 2 + depth * 2,"",
6536 curd, PL_reg_name[cur->resume_state],
6537 (curyes == cur) ? "yes" : ""
6540 curyes = cur->u.yes.prev_yes_state;
6543 DEBUG_STATE_pp("push")
6546 st->locinput = locinput;
6548 if (newst > SLAB_LAST(PL_regmatch_slab))
6549 newst = S_push_slab(aTHX);
6550 PL_regmatch_state = newst;
6552 locinput = pushinput;
6555 assert(0); /* NOTREACHED */
6560 * We get here only if there's trouble -- normally "case END" is
6561 * the terminating point.
6563 Perl_croak(aTHX_ "corrupted regexp pointers");
6569 /* we have successfully completed a subexpression, but we must now
6570 * pop to the state marked by yes_state and continue from there */
6571 assert(st != yes_state);
6573 while (st != yes_state) {
6575 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6576 PL_regmatch_slab = PL_regmatch_slab->prev;
6577 st = SLAB_LAST(PL_regmatch_slab);
6581 DEBUG_STATE_pp("pop (no final)");
6583 DEBUG_STATE_pp("pop (yes)");
6589 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6590 || yes_state > SLAB_LAST(PL_regmatch_slab))
6592 /* not in this slab, pop slab */
6593 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6594 PL_regmatch_slab = PL_regmatch_slab->prev;
6595 st = SLAB_LAST(PL_regmatch_slab);
6597 depth -= (st - yes_state);
6600 yes_state = st->u.yes.prev_yes_state;
6601 PL_regmatch_state = st;
6604 locinput= st->locinput;
6605 state_num = st->resume_state + no_final;
6606 goto reenter_switch;
6609 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6610 PL_colors[4], PL_colors[5]));
6612 if (reginfo->info_aux_eval) {
6613 /* each successfully executed (?{...}) block does the equivalent of
6614 * local $^R = do {...}
6615 * When popping the save stack, all these locals would be undone;
6616 * bypass this by setting the outermost saved $^R to the latest
6618 if (oreplsv != GvSV(PL_replgv))
6619 sv_setsv(oreplsv, GvSV(PL_replgv));
6626 PerlIO_printf(Perl_debug_log,
6627 "%*s %sfailed...%s\n",
6628 REPORT_CODE_OFF+depth*2, "",
6629 PL_colors[4], PL_colors[5])
6641 /* there's a previous state to backtrack to */
6643 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6644 PL_regmatch_slab = PL_regmatch_slab->prev;
6645 st = SLAB_LAST(PL_regmatch_slab);
6647 PL_regmatch_state = st;
6648 locinput= st->locinput;
6650 DEBUG_STATE_pp("pop");
6652 if (yes_state == st)
6653 yes_state = st->u.yes.prev_yes_state;
6655 state_num = st->resume_state + 1; /* failure = success + 1 */
6656 goto reenter_switch;
6661 if (rex->intflags & PREGf_VERBARG_SEEN) {
6662 SV *sv_err = get_sv("REGERROR", 1);
6663 SV *sv_mrk = get_sv("REGMARK", 1);
6665 sv_commit = &PL_sv_no;
6667 sv_yes_mark = &PL_sv_yes;
6670 sv_commit = &PL_sv_yes;
6671 sv_yes_mark = &PL_sv_no;
6673 sv_setsv(sv_err, sv_commit);
6674 sv_setsv(sv_mrk, sv_yes_mark);
6678 if (last_pushed_cv) {
6681 PERL_UNUSED_VAR(SP);
6684 assert(!result || locinput - reginfo->strbeg >= 0);
6685 return result ? locinput - reginfo->strbeg : -1;
6689 - regrepeat - repeatedly match something simple, report how many
6691 * What 'simple' means is a node which can be the operand of a quantifier like
6694 * startposp - pointer a pointer to the start position. This is updated
6695 * to point to the byte following the highest successful
6697 * p - the regnode to be repeatedly matched against.
6698 * reginfo - struct holding match state, such as strend
6699 * max - maximum number of things to match.
6700 * depth - (for debugging) backtracking depth.
6703 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6704 regmatch_info *const reginfo, I32 max, int depth)
6707 char *scan; /* Pointer to current position in target string */
6709 char *loceol = reginfo->strend; /* local version */
6710 I32 hardcount = 0; /* How many matches so far */
6711 bool utf8_target = reginfo->is_utf8_target;
6712 int to_complement = 0; /* Invert the result? */
6714 _char_class_number classnum;
6716 PERL_UNUSED_ARG(depth);
6719 PERL_ARGS_ASSERT_REGREPEAT;
6722 if (max == REG_INFTY)
6724 else if (! utf8_target && loceol - scan > max)
6725 loceol = scan + max;
6727 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6728 * to the maximum of how far we should go in it (leaving it set to the real
6729 * end, if the maximum permissible would take us beyond that). This allows
6730 * us to make the loop exit condition that we haven't gone past <loceol> to
6731 * also mean that we haven't exceeded the max permissible count, saving a
6732 * test each time through the loop. But it assumes that the OP matches a
6733 * single byte, which is true for most of the OPs below when applied to a
6734 * non-UTF-8 target. Those relatively few OPs that don't have this
6735 * characteristic will have to compensate.
6737 * There is no adjustment for UTF-8 targets, as the number of bytes per
6738 * character varies. OPs will have to test both that the count is less
6739 * than the max permissible (using <hardcount> to keep track), and that we
6740 * are still within the bounds of the string (using <loceol>. A few OPs
6741 * match a single byte no matter what the encoding. They can omit the max
6742 * test if, for the UTF-8 case, they do the adjustment that was skipped
6745 * Thus, the code above sets things up for the common case; and exceptional
6746 * cases need extra work; the common case is to make sure <scan> doesn't
6747 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6748 * count doesn't exceed the maximum permissible */
6753 while (scan < loceol && hardcount < max && *scan != '\n') {
6754 scan += UTF8SKIP(scan);
6758 while (scan < loceol && *scan != '\n')
6764 while (scan < loceol && hardcount < max) {
6765 scan += UTF8SKIP(scan);
6772 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6773 if (utf8_target && loceol - scan > max) {
6775 /* <loceol> hadn't been adjusted in the UTF-8 case */
6783 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6787 /* Can use a simple loop if the pattern char to match on is invariant
6788 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6789 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6790 * true iff it doesn't matter if the argument is in UTF-8 or not */
6791 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6792 if (utf8_target && loceol - scan > max) {
6793 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6794 * since here, to match at all, 1 char == 1 byte */
6795 loceol = scan + max;
6797 while (scan < loceol && UCHARAT(scan) == c) {
6801 else if (reginfo->is_utf8_pat) {
6803 STRLEN scan_char_len;
6805 /* When both target and pattern are UTF-8, we have to do
6807 while (hardcount < max
6809 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6810 && memEQ(scan, STRING(p), scan_char_len))
6812 scan += scan_char_len;
6816 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6818 /* Target isn't utf8; convert the character in the UTF-8
6819 * pattern to non-UTF8, and do a simple loop */
6820 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6821 while (scan < loceol && UCHARAT(scan) == c) {
6824 } /* else pattern char is above Latin1, can't possibly match the
6829 /* Here, the string must be utf8; pattern isn't, and <c> is
6830 * different in utf8 than not, so can't compare them directly.
6831 * Outside the loop, find the two utf8 bytes that represent c, and
6832 * then look for those in sequence in the utf8 string */
6833 U8 high = UTF8_TWO_BYTE_HI(c);
6834 U8 low = UTF8_TWO_BYTE_LO(c);
6836 while (hardcount < max
6837 && scan + 1 < loceol
6838 && UCHARAT(scan) == high
6839 && UCHARAT(scan + 1) == low)
6848 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6852 RXp_MATCH_TAINTED_on(prog);
6853 utf8_flags = FOLDEQ_UTF8_LOCALE;
6861 case EXACTFU_TRICKYFOLD:
6863 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6867 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6869 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6871 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6874 if (c1 == CHRTEST_VOID) {
6875 /* Use full Unicode fold matching */
6876 char *tmpeol = reginfo->strend;
6877 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6878 while (hardcount < max
6879 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6880 STRING(p), NULL, pat_len,
6881 reginfo->is_utf8_pat, utf8_flags))
6884 tmpeol = reginfo->strend;
6888 else if (utf8_target) {
6890 while (scan < loceol
6892 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6894 scan += UTF8SKIP(scan);
6899 while (scan < loceol
6901 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6902 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6904 scan += UTF8SKIP(scan);
6909 else if (c1 == c2) {
6910 while (scan < loceol && UCHARAT(scan) == c1) {
6915 while (scan < loceol &&
6916 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6925 case ANYOF_WARN_SUPER:
6927 while (hardcount < max
6929 && reginclass(prog, p, (U8*)scan, utf8_target))
6931 scan += UTF8SKIP(scan);
6935 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6940 /* The argument (FLAGS) to all the POSIX node types is the class number */
6947 RXp_MATCH_TAINTED_on(prog);
6948 if (! utf8_target) {
6949 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6955 while (hardcount < max && scan < loceol
6956 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6959 scan += UTF8SKIP(scan);
6972 if (utf8_target && loceol - scan > max) {
6974 /* We didn't adjust <loceol> at the beginning of this routine
6975 * because is UTF-8, but it is actually ok to do so, since here, to
6976 * match, 1 char == 1 byte. */
6977 loceol = scan + max;
6979 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6992 if (! utf8_target) {
6993 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6999 /* The complement of something that matches only ASCII matches all
7000 * UTF-8 variant code points, plus everything in ASCII that isn't
7002 while (hardcount < max && scan < loceol
7003 && (! UTF8_IS_INVARIANT(*scan)
7004 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7006 scan += UTF8SKIP(scan);
7017 if (! utf8_target) {
7018 while (scan < loceol && to_complement
7019 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7026 classnum = (_char_class_number) FLAGS(p);
7027 if (classnum < _FIRST_NON_SWASH_CC) {
7029 /* Here, a swash is needed for above-Latin1 code points.
7030 * Process as many Latin1 code points using the built-in rules.
7031 * Go to another loop to finish processing upon encountering
7032 * the first Latin1 code point. We could do that in this loop
7033 * as well, but the other way saves having to test if the swash
7034 * has been loaded every time through the loop: extra space to
7036 while (hardcount < max && scan < loceol) {
7037 if (UTF8_IS_INVARIANT(*scan)) {
7038 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7045 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7046 if (! (to_complement
7047 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
7056 goto found_above_latin1;
7063 /* For these character classes, the knowledge of how to handle
7064 * every code point is compiled in to Perl via a macro. This
7065 * code is written for making the loops as tight as possible.
7066 * It could be refactored to save space instead */
7068 case _CC_ENUM_SPACE: /* XXX would require separate code
7069 if we revert the change of \v
7072 case _CC_ENUM_PSXSPC:
7073 while (hardcount < max
7075 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7077 scan += UTF8SKIP(scan);
7081 case _CC_ENUM_BLANK:
7082 while (hardcount < max
7084 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7086 scan += UTF8SKIP(scan);
7090 case _CC_ENUM_XDIGIT:
7091 while (hardcount < max
7093 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7095 scan += UTF8SKIP(scan);
7099 case _CC_ENUM_VERTSPACE:
7100 while (hardcount < max
7102 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7104 scan += UTF8SKIP(scan);
7108 case _CC_ENUM_CNTRL:
7109 while (hardcount < max
7111 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7113 scan += UTF8SKIP(scan);
7118 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7124 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7126 /* Load the swash if not already present */
7127 if (! PL_utf8_swash_ptrs[classnum]) {
7128 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7129 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7130 "utf8", swash_property_names[classnum],
7131 &PL_sv_undef, 1, 0, NULL, &flags);
7134 while (hardcount < max && scan < loceol
7135 && to_complement ^ cBOOL(_generic_utf8(
7138 swash_fetch(PL_utf8_swash_ptrs[classnum],
7142 scan += UTF8SKIP(scan);
7149 while (hardcount < max && scan < loceol &&
7150 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7155 /* LNBREAK can match one or two latin chars, which is ok, but we
7156 * have to use hardcount in this situation, and throw away the
7157 * adjustment to <loceol> done before the switch statement */
7158 loceol = reginfo->strend;
7159 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7180 /* These are all 0 width, so match right here or not at all. */
7184 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7185 assert(0); /* NOTREACHED */
7192 c = scan - *startposp;
7196 GET_RE_DEBUG_FLAGS_DECL;
7198 SV * const prop = sv_newmortal();
7199 regprop(prog, prop, p);
7200 PerlIO_printf(Perl_debug_log,
7201 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7202 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7210 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7212 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7213 create a copy so that changes the caller makes won't change the shared one.
7214 If <altsvp> is non-null, will return NULL in it, for back-compat.
7217 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7219 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7225 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7230 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7232 /* Returns the swash for the input 'node' in the regex 'prog'.
7233 * If <doinit> is true, will attempt to create the swash if not already
7235 * If <listsvp> is non-null, will return the swash initialization string in
7237 * Tied intimately to how regcomp.c sets up the data structure */
7244 RXi_GET_DECL(prog,progi);
7245 const struct reg_data * const data = prog ? progi->data : NULL;
7247 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7249 assert(ANYOF_NONBITMAP(node));
7251 if (data && data->count) {
7252 const U32 n = ARG(node);
7254 if (data->what[n] == 's') {
7255 SV * const rv = MUTABLE_SV(data->data[n]);
7256 AV * const av = MUTABLE_AV(SvRV(rv));
7257 SV **const ary = AvARRAY(av);
7258 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7260 si = *ary; /* ary[0] = the string to initialize the swash with */
7262 /* Elements 2 and 3 are either both present or both absent. [2] is
7263 * any inversion list generated at compile time; [3] indicates if
7264 * that inversion list has any user-defined properties in it. */
7265 if (av_len(av) >= 2) {
7268 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7275 /* Element [1] is reserved for the set-up swash. If already there,
7276 * return it; if not, create it and store it there */
7277 if (SvROK(ary[1])) {
7280 else if (si && doinit) {
7282 sw = _core_swash_init("utf8", /* the utf8 package */
7286 0, /* not from tr/// */
7289 (void)av_store(av, 1, sw);
7295 SV* matches_string = newSVpvn("", 0);
7297 /* Use the swash, if any, which has to have incorporated into it all
7299 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7300 && (si && si != &PL_sv_undef))
7303 /* If no swash, use the input initialization string, if available */
7304 sv_catsv(matches_string, si);
7307 /* Add the inversion list to whatever we have. This may have come from
7308 * the swash, or from an input parameter */
7310 sv_catsv(matches_string, _invlist_contents(invlist));
7312 *listsvp = matches_string;
7319 - reginclass - determine if a character falls into a character class
7321 n is the ANYOF regnode
7322 p is the target string
7323 utf8_target tells whether p is in UTF-8.
7325 Returns true if matched; false otherwise.
7327 Note that this can be a synthetic start class, a combination of various
7328 nodes, so things you think might be mutually exclusive, such as locale,
7329 aren't. It can match both locale and non-locale
7334 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7337 const char flags = ANYOF_FLAGS(n);
7341 PERL_ARGS_ASSERT_REGINCLASS;
7343 /* If c is not already the code point, get it. Note that
7344 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7345 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7347 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7348 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7349 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7350 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7351 * UTF8_ALLOW_FFFF */
7352 if (c_len == (STRLEN)-1)
7353 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7356 /* If this character is potentially in the bitmap, check it */
7358 if (ANYOF_BITMAP_TEST(n, c))
7360 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7366 else if (flags & ANYOF_LOCALE) {
7367 RXp_MATCH_TAINTED_on(prog);
7369 if ((flags & ANYOF_LOC_FOLD)
7370 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7374 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7376 /* The data structure is arranged so bits 0, 2, 4, ... are set
7377 * if the class includes the Posix character class given by
7378 * bit/2; and 1, 3, 5, ... are set if the class includes the
7379 * complemented Posix class given by int(bit/2). So we loop
7380 * through the bits, each time changing whether we complement
7381 * the result or not. Suppose for the sake of illustration
7382 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7383 * is set, it means there is a match for this ANYOF node if the
7384 * character is in the class given by the expression (0 / 2 = 0
7385 * = \w). If it is in that class, isFOO_lc() will return 1,
7386 * and since 'to_complement' is 0, the result will stay TRUE,
7387 * and we exit the loop. Suppose instead that bit 0 is 0, but
7388 * bit 1 is 1. That means there is a match if the character
7389 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7390 * but will on bit 1. On the second iteration 'to_complement'
7391 * will be 1, so the exclusive or will reverse things, so we
7392 * are testing for \W. On the third iteration, 'to_complement'
7393 * will be 0, and we would be testing for \s; the fourth
7394 * iteration would test for \S, etc.
7396 * Note that this code assumes that all the classes are closed
7397 * under folding. For example, if a character matches \w, then
7398 * its fold does too; and vice versa. This should be true for
7399 * any well-behaved locale for all the currently defined Posix
7400 * classes, except for :lower: and :upper:, which are handled
7401 * by the pseudo-class :cased: which matches if either of the
7402 * other two does. To get rid of this assumption, an outer
7403 * loop could be used below to iterate over both the source
7404 * character, and its fold (if different) */
7407 int to_complement = 0;
7408 while (count < ANYOF_MAX) {
7409 if (ANYOF_CLASS_TEST(n, count)
7410 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7422 /* If the bitmap didn't (or couldn't) match, and something outside the
7423 * bitmap could match, try that. Locale nodes specify completely the
7424 * behavior of code points in the bit map (otherwise, a utf8 target would
7425 * cause them to be treated as Unicode and not locale), except in
7426 * the very unlikely event when this node is a synthetic start class, which
7427 * could be a combination of locale and non-locale nodes. So allow locale
7428 * to match for the synthetic start class, which will give a false
7429 * positive that will be resolved when the match is done again as not part
7430 * of the synthetic start class */
7432 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7433 match = TRUE; /* Everything above 255 matches */
7435 else if (ANYOF_NONBITMAP(n)
7436 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7439 || (! (flags & ANYOF_LOCALE))
7440 || OP(n) == ANYOF_SYNTHETIC))))
7442 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7447 } else { /* Convert to utf8 */
7449 utf8_p = bytes_to_utf8(p, &len);
7452 if (swash_fetch(sw, utf8_p, TRUE)) {
7456 /* If we allocated a string above, free it */
7457 if (! utf8_target) Safefree(utf8_p);
7461 if (UNICODE_IS_SUPER(c)
7462 && OP(n) == ANYOF_WARN_SUPER
7463 && ckWARN_d(WARN_NON_UNICODE))
7465 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7466 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7470 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7471 return cBOOL(flags & ANYOF_INVERT) ^ match;
7475 S_reghop3(U8 *s, I32 off, const U8* lim)
7477 /* return the position 'off' UTF-8 characters away from 's', forward if
7478 * 'off' >= 0, backwards if negative. But don't go outside of position
7479 * 'lim', which better be < s if off < 0 */
7483 PERL_ARGS_ASSERT_REGHOP3;
7486 while (off-- && s < lim) {
7487 /* XXX could check well-formedness here */
7492 while (off++ && s > lim) {
7494 if (UTF8_IS_CONTINUED(*s)) {
7495 while (s > lim && UTF8_IS_CONTINUATION(*s))
7498 /* XXX could check well-formedness here */
7505 /* there are a bunch of places where we use two reghop3's that should
7506 be replaced with this routine. but since thats not done yet
7507 we ifdef it out - dmq
7510 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7514 PERL_ARGS_ASSERT_REGHOP4;
7517 while (off-- && s < rlim) {
7518 /* XXX could check well-formedness here */
7523 while (off++ && s > llim) {
7525 if (UTF8_IS_CONTINUED(*s)) {
7526 while (s > llim && UTF8_IS_CONTINUATION(*s))
7529 /* XXX could check well-formedness here */
7537 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7541 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7544 while (off-- && s < lim) {
7545 /* XXX could check well-formedness here */
7552 while (off++ && s > lim) {
7554 if (UTF8_IS_CONTINUED(*s)) {
7555 while (s > lim && UTF8_IS_CONTINUATION(*s))
7558 /* XXX could check well-formedness here */
7567 /* when executing a regex that may have (?{}), extra stuff needs setting
7568 up that will be visible to the called code, even before the current
7569 match has finished. In particular:
7571 * $_ is localised to the SV currently being matched;
7572 * pos($_) is created if necessary, ready to be updated on each call-out
7574 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7575 isn't set until the current pattern is successfully finished), so that
7576 $1 etc of the match-so-far can be seen;
7577 * save the old values of subbeg etc of the current regex, and set then
7578 to the current string (again, this is normally only done at the end
7583 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7586 regexp *const rex = ReANY(reginfo->prog);
7587 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7589 eval_state->rex = rex;
7592 /* Make $_ available to executed code. */
7593 if (reginfo->sv != DEFSV) {
7595 DEFSV_set(reginfo->sv);
7598 if (!(mg = mg_find_mglob(reginfo->sv))) {
7599 /* prepare for quick setting of pos */
7600 mg = sv_magicext_mglob(reginfo->sv);
7603 eval_state->pos_magic = mg;
7604 eval_state->pos = mg->mg_len;
7607 eval_state->pos_magic = NULL;
7609 if (!PL_reg_curpm) {
7610 /* PL_reg_curpm is a fake PMOP that we can attach the current
7611 * regex to and point PL_curpm at, so that $1 et al are visible
7612 * within a /(?{})/. It's just allocated once per interpreter the
7613 * first time its needed */
7614 Newxz(PL_reg_curpm, 1, PMOP);
7617 SV* const repointer = &PL_sv_undef;
7618 /* this regexp is also owned by the new PL_reg_curpm, which
7619 will try to free it. */
7620 av_push(PL_regex_padav, repointer);
7621 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7622 PL_regex_pad = AvARRAY(PL_regex_padav);
7626 SET_reg_curpm(reginfo->prog);
7627 eval_state->curpm = PL_curpm;
7628 PL_curpm = PL_reg_curpm;
7629 if (RXp_MATCH_COPIED(rex)) {
7630 /* Here is a serious problem: we cannot rewrite subbeg,
7631 since it may be needed if this match fails. Thus
7632 $` inside (?{}) could fail... */
7633 eval_state->subbeg = rex->subbeg;
7634 eval_state->sublen = rex->sublen;
7635 eval_state->suboffset = rex->suboffset;
7636 eval_state->subcoffset = rex->subcoffset;
7638 eval_state->saved_copy = rex->saved_copy;
7640 RXp_MATCH_COPIED_off(rex);
7643 eval_state->subbeg = NULL;
7644 rex->subbeg = (char *)reginfo->strbeg;
7646 rex->subcoffset = 0;
7647 rex->sublen = reginfo->strend - reginfo->strbeg;
7651 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7654 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7657 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7658 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
7661 Safefree(aux->poscache);
7665 /* undo the effects of S_setup_eval_state() */
7667 if (eval_state->subbeg) {
7668 regexp * const rex = eval_state->rex;
7669 rex->subbeg = eval_state->subbeg;
7670 rex->sublen = eval_state->sublen;
7671 rex->suboffset = eval_state->suboffset;
7672 rex->subcoffset = eval_state->subcoffset;
7674 rex->saved_copy = eval_state->saved_copy;
7676 RXp_MATCH_COPIED_on(rex);
7678 if (eval_state->pos_magic)
7679 eval_state->pos_magic->mg_len = eval_state->pos;
7681 PL_curpm = eval_state->curpm;
7684 PL_regmatch_state = aux->old_regmatch_state;
7685 PL_regmatch_slab = aux->old_regmatch_slab;
7687 /* free all slabs above current one - this must be the last action
7688 * of this function, as aux and eval_state are allocated within
7689 * slabs and may be freed here */
7691 s = PL_regmatch_slab->next;
7693 PL_regmatch_slab->next = NULL;
7695 regmatch_slab * const osl = s;
7704 S_to_utf8_substr(pTHX_ regexp *prog)
7706 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7707 * on the converted value */
7711 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7714 if (prog->substrs->data[i].substr
7715 && !prog->substrs->data[i].utf8_substr) {
7716 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7717 prog->substrs->data[i].utf8_substr = sv;
7718 sv_utf8_upgrade(sv);
7719 if (SvVALID(prog->substrs->data[i].substr)) {
7720 if (SvTAIL(prog->substrs->data[i].substr)) {
7721 /* Trim the trailing \n that fbm_compile added last
7723 SvCUR_set(sv, SvCUR(sv) - 1);
7724 /* Whilst this makes the SV technically "invalid" (as its
7725 buffer is no longer followed by "\0") when fbm_compile()
7726 adds the "\n" back, a "\0" is restored. */
7727 fbm_compile(sv, FBMcf_TAIL);
7731 if (prog->substrs->data[i].substr == prog->check_substr)
7732 prog->check_utf8 = sv;
7738 S_to_byte_substr(pTHX_ regexp *prog)
7740 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7741 * on the converted value; returns FALSE if can't be converted. */
7746 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7749 if (prog->substrs->data[i].utf8_substr
7750 && !prog->substrs->data[i].substr) {
7751 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7752 if (! sv_utf8_downgrade(sv, TRUE)) {
7755 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7756 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7757 /* Trim the trailing \n that fbm_compile added last
7759 SvCUR_set(sv, SvCUR(sv) - 1);
7760 fbm_compile(sv, FBMcf_TAIL);
7764 prog->substrs->data[i].substr = sv;
7765 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7766 prog->check_substr = sv;
7775 * c-indentation-style: bsd
7777 * indent-tabs-mode: nil
7780 * ex: set ts=8 sts=4 sw=4 et: