5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
89 static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
93 /* At least one required character in the target string is expressible only in
95 static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
107 #define STATIC static
110 /* Valid only if 'c', the character being looke-up, is an invariant under
111 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
112 * everything matchable is straight forward in the bitmap */
113 #define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
114 ? reginclass(prog,p,c,c+1,u) \
115 : ANYOF_BITMAP_TEST(p,*(c)))
121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
124 #define HOPc(pos,off) \
125 (char *)(reginfo->is_utf8_target \
126 ? reghop3((U8*)pos, off, \
127 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
130 #define HOPBACKc(pos, off) \
131 (char*)(reginfo->is_utf8_target \
132 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
133 : (pos - off >= reginfo->strbeg) \
137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 /* lim must be +ve. Returns NULL on overshoot */
141 #define HOPMAYBE3(pos,off,lim) \
142 (reginfo->is_utf8_target \
143 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
144 : ((U8*)pos + off <= lim) \
148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
149 * off must be >=0; args should be vars rather than expressions */
150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
151 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
152 : (U8*)((pos + off) > lim ? lim : (pos + off)))
154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
155 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
160 #define NEXTCHR_IS_EOS (nextchr < 0)
162 #define SET_nextchr \
163 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
165 #define SET_locinput(p) \
170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
172 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
173 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
174 1, 0, invlist, &flags); \
179 /* If in debug mode, we test that a known character properly matches */
181 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
184 utf8_char_in_property) \
185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
186 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
188 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 utf8_char_in_property) \
192 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
196 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
198 PL_XPosix_ptrs[_CC_WORDCHAR], \
199 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
201 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
204 /* for use after a quantifier and before an EXACT-like node -- japhy */
205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
207 * NOTE that *nothing* that affects backtracking should be in here, specifically
208 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
209 * node that is in between two EXACT like nodes when ascertaining what the required
210 * "follow" character is. This should probably be moved to regex compile time
211 * although it may be done at run time beause of the REF possibility - more
212 * investigation required. -- demerphq
214 #define JUMPABLE(rn) ( \
216 (OP(rn) == CLOSE && \
217 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
219 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
220 OP(rn) == PLUS || OP(rn) == MINMOD || \
222 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
224 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
226 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
229 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
230 we don't need this definition. XXX These are now out-of-sync*/
231 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
232 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
233 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
236 /* ... so we use this as its faster. */
237 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
238 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
239 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
240 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245 Search for mandatory following text node; for lookahead, the text must
246 follow but for lookbehind (rn->flags != 0) we skip to the next step.
248 #define FIND_NEXT_IMPT(rn) STMT_START { \
249 while (JUMPABLE(rn)) { \
250 const OPCODE type = OP(rn); \
251 if (type == SUSPEND || PL_regkind[type] == CURLY) \
252 rn = NEXTOPER(NEXTOPER(rn)); \
253 else if (type == PLUS) \
255 else if (type == IFMATCH) \
256 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
257 else rn += NEXT_OFF(rn); \
261 #define SLAB_FIRST(s) (&(s)->states[0])
262 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
264 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
265 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
266 static regmatch_state * S_push_slab(pTHX);
268 #define REGCP_PAREN_ELEMS 3
269 #define REGCP_OTHER_ELEMS 3
270 #define REGCP_FRAME_ELEMS 1
271 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
272 * are needed for the regexp context stack bookkeeping. */
275 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
277 const int retval = PL_savestack_ix;
278 const int paren_elems_to_push =
279 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
280 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
281 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
283 GET_RE_DEBUG_FLAGS_DECL;
285 PERL_ARGS_ASSERT_REGCPPUSH;
287 if (paren_elems_to_push < 0)
288 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
289 (int)paren_elems_to_push, (int)maxopenparen,
290 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
292 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
293 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
294 " out of range (%lu-%ld)",
296 (unsigned long)maxopenparen,
299 SSGROW(total_elems + REGCP_FRAME_ELEMS);
302 if ((int)maxopenparen > (int)parenfloor)
303 Perl_re_exec_indentf( aTHX_
304 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
310 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
311 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
312 SSPUSHIV(rex->offs[p].end);
313 SSPUSHIV(rex->offs[p].start);
314 SSPUSHINT(rex->offs[p].start_tmp);
315 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
316 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
319 (IV)rex->offs[p].start,
320 (IV)rex->offs[p].start_tmp,
324 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
325 SSPUSHINT(maxopenparen);
326 SSPUSHINT(rex->lastparen);
327 SSPUSHINT(rex->lastcloseparen);
328 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
333 /* These are needed since we do not localize EVAL nodes: */
334 #define REGCP_SET(cp) \
336 Perl_re_exec_indentf( aTHX_ \
337 "Setting an EVAL scope, savestack=%"IVdf",\n", \
338 depth, (IV)PL_savestack_ix \
343 #define REGCP_UNWIND(cp) \
345 if (cp != PL_savestack_ix) \
346 Perl_re_exec_indentf( aTHX_ \
347 "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
348 depth, (IV)(cp), (IV)PL_savestack_ix \
353 #define UNWIND_PAREN(lp, lcp) \
354 for (n = rex->lastparen; n > lp; n--) \
355 rex->offs[n].end = -1; \
356 rex->lastparen = n; \
357 rex->lastcloseparen = lcp;
361 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
365 GET_RE_DEBUG_FLAGS_DECL;
367 PERL_ARGS_ASSERT_REGCPPOP;
369 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
371 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
372 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
373 rex->lastcloseparen = SSPOPINT;
374 rex->lastparen = SSPOPINT;
375 *maxopenparen_p = SSPOPINT;
377 i -= REGCP_OTHER_ELEMS;
378 /* Now restore the parentheses context. */
380 if (i || rex->lastparen + 1 <= rex->nparens)
381 Perl_re_exec_indentf( aTHX_
382 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
388 paren = *maxopenparen_p;
389 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
391 rex->offs[paren].start_tmp = SSPOPINT;
392 rex->offs[paren].start = SSPOPIV;
394 if (paren <= rex->lastparen)
395 rex->offs[paren].end = tmps;
396 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
397 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
400 (IV)rex->offs[paren].start,
401 (IV)rex->offs[paren].start_tmp,
402 (IV)rex->offs[paren].end,
403 (paren > rex->lastparen ? "(skipped)" : ""));
408 /* It would seem that the similar code in regtry()
409 * already takes care of this, and in fact it is in
410 * a better location to since this code can #if 0-ed out
411 * but the code in regtry() is needed or otherwise tests
412 * requiring null fields (pat.t#187 and split.t#{13,14}
413 * (as of patchlevel 7877) will fail. Then again,
414 * this code seems to be necessary or otherwise
415 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
416 * --jhi updated by dapm */
417 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
418 if (i > *maxopenparen_p)
419 rex->offs[i].start = -1;
420 rex->offs[i].end = -1;
421 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
422 " \\%"UVuf": %s ..-1 undeffing\n",
425 (i > *maxopenparen_p) ? "-1" : " "
431 /* restore the parens and associated vars at savestack position ix,
432 * but without popping the stack */
435 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
437 I32 tmpix = PL_savestack_ix;
438 PERL_ARGS_ASSERT_REGCP_RESTORE;
440 PL_savestack_ix = ix;
441 regcppop(rex, maxopenparen_p);
442 PL_savestack_ix = tmpix;
445 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
448 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
450 /* Returns a boolean as to whether or not 'character' is a member of the
451 * Posix character class given by 'classnum' that should be equivalent to a
452 * value in the typedef '_char_class_number'.
454 * Ideally this could be replaced by a just an array of function pointers
455 * to the C library functions that implement the macros this calls.
456 * However, to compile, the precise function signatures are required, and
457 * these may vary from platform to to platform. To avoid having to figure
458 * out what those all are on each platform, I (khw) am using this method,
459 * which adds an extra layer of function call overhead (unless the C
460 * optimizer strips it away). But we don't particularly care about
461 * performance with locales anyway. */
463 switch ((_char_class_number) classnum) {
464 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
465 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
466 case _CC_ENUM_ASCII: return isASCII_LC(character);
467 case _CC_ENUM_BLANK: return isBLANK_LC(character);
468 case _CC_ENUM_CASED: return isLOWER_LC(character)
469 || isUPPER_LC(character);
470 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
471 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
472 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
473 case _CC_ENUM_LOWER: return isLOWER_LC(character);
474 case _CC_ENUM_PRINT: return isPRINT_LC(character);
475 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
476 case _CC_ENUM_SPACE: return isSPACE_LC(character);
477 case _CC_ENUM_UPPER: return isUPPER_LC(character);
478 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
479 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
480 default: /* VERTSPACE should never occur in locales */
481 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
484 NOT_REACHED; /* NOTREACHED */
489 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
491 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
492 * 'character' is a member of the Posix character class given by 'classnum'
493 * that should be equivalent to a value in the typedef
494 * '_char_class_number'.
496 * This just calls isFOO_lc on the code point for the character if it is in
497 * the range 0-255. Outside that range, all characters use Unicode
498 * rules, ignoring any locale. So use the Unicode function if this class
499 * requires a swash, and use the Unicode macro otherwise. */
501 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
503 if (UTF8_IS_INVARIANT(*character)) {
504 return isFOO_lc(classnum, *character);
506 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
507 return isFOO_lc(classnum,
508 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
511 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
513 if (classnum < _FIRST_NON_SWASH_CC) {
515 /* Initialize the swash unless done already */
516 if (! PL_utf8_swash_ptrs[classnum]) {
517 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
518 PL_utf8_swash_ptrs[classnum] =
519 _core_swash_init("utf8",
522 PL_XPosix_ptrs[classnum], &flags);
525 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
527 TRUE /* is UTF */ ));
530 switch ((_char_class_number) classnum) {
531 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
532 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
533 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
534 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
538 return FALSE; /* Things like CNTRL are always below 256 */
542 * pregexec and friends
545 #ifndef PERL_IN_XSUB_RE
547 - pregexec - match a regexp against a string
550 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
551 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
552 /* stringarg: the point in the string at which to begin matching */
553 /* strend: pointer to null at end of string */
554 /* strbeg: real beginning of string */
555 /* minend: end of match must be >= minend bytes after stringarg. */
556 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
557 * itself is accessed via the pointers above */
558 /* nosave: For optimizations. */
560 PERL_ARGS_ASSERT_PREGEXEC;
563 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
564 nosave ? 0 : REXEC_COPY_STR);
570 /* re_intuit_start():
572 * Based on some optimiser hints, try to find the earliest position in the
573 * string where the regex could match.
575 * rx: the regex to match against
576 * sv: the SV being matched: only used for utf8 flag; the string
577 * itself is accessed via the pointers below. Note that on
578 * something like an overloaded SV, SvPOK(sv) may be false
579 * and the string pointers may point to something unrelated to
581 * strbeg: real beginning of string
582 * strpos: the point in the string at which to begin matching
583 * strend: pointer to the byte following the last char of the string
584 * flags currently unused; set to 0
585 * data: currently unused; set to NULL
587 * The basic idea of re_intuit_start() is to use some known information
588 * about the pattern, namely:
590 * a) the longest known anchored substring (i.e. one that's at a
591 * constant offset from the beginning of the pattern; but not
592 * necessarily at a fixed offset from the beginning of the
594 * b) the longest floating substring (i.e. one that's not at a constant
595 * offset from the beginning of the pattern);
596 * c) Whether the pattern is anchored to the string; either
597 * an absolute anchor: /^../, or anchored to \n: /^.../m,
598 * or anchored to pos(): /\G/;
599 * d) A start class: a real or synthetic character class which
600 * represents which characters are legal at the start of the pattern;
602 * to either quickly reject the match, or to find the earliest position
603 * within the string at which the pattern might match, thus avoiding
604 * running the full NFA engine at those earlier locations, only to
605 * eventually fail and retry further along.
607 * Returns NULL if the pattern can't match, or returns the address within
608 * the string which is the earliest place the match could occur.
610 * The longest of the anchored and floating substrings is called 'check'
611 * and is checked first. The other is called 'other' and is checked
612 * second. The 'other' substring may not be present. For example,
614 * /(abc|xyz)ABC\d{0,3}DEFG/
618 * check substr (float) = "DEFG", offset 6..9 chars
619 * other substr (anchored) = "ABC", offset 3..3 chars
622 * Be aware that during the course of this function, sometimes 'anchored'
623 * refers to a substring being anchored relative to the start of the
624 * pattern, and sometimes to the pattern itself being anchored relative to
625 * the string. For example:
627 * /\dabc/: "abc" is anchored to the pattern;
628 * /^\dabc/: "abc" is anchored to the pattern and the string;
629 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
630 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
631 * but the pattern is anchored to the string.
635 Perl_re_intuit_start(pTHX_
638 const char * const strbeg,
642 re_scream_pos_data *data)
644 struct regexp *const prog = ReANY(rx);
645 SSize_t start_shift = prog->check_offset_min;
646 /* Should be nonnegative! */
647 SSize_t end_shift = 0;
648 /* current lowest pos in string where the regex can start matching */
649 char *rx_origin = strpos;
651 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
652 U8 other_ix = 1 - prog->substrs->check_ix;
654 char *other_last = strpos;/* latest pos 'other' substr already checked to */
655 char *check_at = NULL; /* check substr found at this pos */
656 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
657 RXi_GET_DECL(prog,progi);
658 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
659 regmatch_info *const reginfo = ®info_buf;
660 GET_RE_DEBUG_FLAGS_DECL;
662 PERL_ARGS_ASSERT_RE_INTUIT_START;
663 PERL_UNUSED_ARG(flags);
664 PERL_UNUSED_ARG(data);
666 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
667 "Intuit: trying to determine minimum start position...\n"));
669 /* for now, assume that all substr offsets are positive. If at some point
670 * in the future someone wants to do clever things with lookbehind and
671 * -ve offsets, they'll need to fix up any code in this function
672 * which uses these offsets. See the thread beginning
673 * <20140113145929.GF27210@iabyn.com>
675 assert(prog->substrs->data[0].min_offset >= 0);
676 assert(prog->substrs->data[0].max_offset >= 0);
677 assert(prog->substrs->data[1].min_offset >= 0);
678 assert(prog->substrs->data[1].max_offset >= 0);
679 assert(prog->substrs->data[2].min_offset >= 0);
680 assert(prog->substrs->data[2].max_offset >= 0);
682 /* for now, assume that if both present, that the floating substring
683 * doesn't start before the anchored substring.
684 * If you break this assumption (e.g. doing better optimisations
685 * with lookahead/behind), then you'll need to audit the code in this
686 * function carefully first
689 ! ( (prog->anchored_utf8 || prog->anchored_substr)
690 && (prog->float_utf8 || prog->float_substr))
691 || (prog->float_min_offset >= prog->anchored_offset));
693 /* byte rather than char calculation for efficiency. It fails
694 * to quickly reject some cases that can't match, but will reject
695 * them later after doing full char arithmetic */
696 if (prog->minlen > strend - strpos) {
697 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
698 " String too short...\n"));
702 RX_MATCH_UTF8_set(rx,utf8_target);
703 reginfo->is_utf8_target = cBOOL(utf8_target);
704 reginfo->info_aux = NULL;
705 reginfo->strbeg = strbeg;
706 reginfo->strend = strend;
707 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
709 /* not actually used within intuit, but zero for safety anyway */
710 reginfo->poscache_maxiter = 0;
713 if ((!prog->anchored_utf8 && prog->anchored_substr)
714 || (!prog->float_utf8 && prog->float_substr))
715 to_utf8_substr(prog);
716 check = prog->check_utf8;
718 if (!prog->check_substr && prog->check_utf8) {
719 if (! to_byte_substr(prog)) {
720 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
723 check = prog->check_substr;
726 /* dump the various substring data */
727 DEBUG_OPTIMISE_MORE_r({
729 for (i=0; i<=2; i++) {
730 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
731 : prog->substrs->data[i].substr);
735 Perl_re_printf( aTHX_
736 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
737 " useful=%"IVdf" utf8=%d [%s]\n",
739 (IV)prog->substrs->data[i].min_offset,
740 (IV)prog->substrs->data[i].max_offset,
741 (IV)prog->substrs->data[i].end_shift,
748 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
750 /* ml_anch: check after \n?
752 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
753 * with /.*.../, these flags will have been added by the
755 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
756 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
758 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
759 && !(prog->intflags & PREGf_IMPLICIT);
761 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
762 /* we are only allowed to match at BOS or \G */
764 /* trivially reject if there's a BOS anchor and we're not at BOS.
766 * Note that we don't try to do a similar quick reject for
767 * \G, since generally the caller will have calculated strpos
768 * based on pos() and gofs, so the string is already correctly
769 * anchored by definition; and handling the exceptions would
770 * be too fiddly (e.g. REXEC_IGNOREPOS).
772 if ( strpos != strbeg
773 && (prog->intflags & PREGf_ANCH_SBOL))
775 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
776 " Not at start...\n"));
780 /* in the presence of an anchor, the anchored (relative to the
781 * start of the regex) substr must also be anchored relative
782 * to strpos. So quickly reject if substr isn't found there.
783 * This works for \G too, because the caller will already have
784 * subtracted gofs from pos, and gofs is the offset from the
785 * \G to the start of the regex. For example, in /.abc\Gdef/,
786 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
787 * caller will have set strpos=pos()-4; we look for the substr
788 * at position pos()-4+1, which lines up with the "a" */
790 if (prog->check_offset_min == prog->check_offset_max) {
791 /* Substring at constant offset from beg-of-str... */
792 SSize_t slen = SvCUR(check);
793 char *s = HOP3c(strpos, prog->check_offset_min, strend);
795 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
796 " Looking for check substr at fixed offset %"IVdf"...\n",
797 (IV)prog->check_offset_min));
800 /* In this case, the regex is anchored at the end too.
801 * Unless it's a multiline match, the lengths must match
802 * exactly, give or take a \n. NB: slen >= 1 since
803 * the last char of check is \n */
805 && ( strend - s > slen
806 || strend - s < slen - 1
807 || (strend - s == slen && strend[-1] != '\n')))
809 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
810 " String too long...\n"));
813 /* Now should match s[0..slen-2] */
816 if (slen && (strend - s < slen
817 || *SvPVX_const(check) != *s
818 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
820 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
821 " String not equal...\n"));
826 goto success_at_start;
831 end_shift = prog->check_end_shift;
833 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
835 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
836 (IV)end_shift, RX_PRECOMP(prog));
841 /* This is the (re)entry point of the main loop in this function.
842 * The goal of this loop is to:
843 * 1) find the "check" substring in the region rx_origin..strend
844 * (adjusted by start_shift / end_shift). If not found, reject
846 * 2) If it exists, look for the "other" substr too if defined; for
847 * example, if the check substr maps to the anchored substr, then
848 * check the floating substr, and vice-versa. If not found, go
849 * back to (1) with rx_origin suitably incremented.
850 * 3) If we find an rx_origin position that doesn't contradict
851 * either of the substrings, then check the possible additional
852 * constraints on rx_origin of /^.../m or a known start class.
853 * If these fail, then depending on which constraints fail, jump
854 * back to here, or to various other re-entry points further along
855 * that skip some of the first steps.
856 * 4) If we pass all those tests, update the BmUSEFUL() count on the
857 * substring. If the start position was determined to be at the
858 * beginning of the string - so, not rejected, but not optimised,
859 * since we have to run regmatch from position 0 - decrement the
860 * BmUSEFUL() count. Otherwise increment it.
864 /* first, look for the 'check' substring */
870 DEBUG_OPTIMISE_MORE_r({
871 Perl_re_printf( aTHX_
872 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
873 " Start shift: %"IVdf" End shift %"IVdf
874 " Real end Shift: %"IVdf"\n",
875 (IV)(rx_origin - strbeg),
876 (IV)prog->check_offset_min,
879 (IV)prog->check_end_shift);
882 end_point = HOP3(strend, -end_shift, strbeg);
883 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
888 /* If the regex is absolutely anchored to either the start of the
889 * string (SBOL) or to pos() (ANCH_GPOS), then
890 * check_offset_max represents an upper bound on the string where
891 * the substr could start. For the ANCH_GPOS case, we assume that
892 * the caller of intuit will have already set strpos to
893 * pos()-gofs, so in this case strpos + offset_max will still be
894 * an upper bound on the substr.
897 && prog->intflags & PREGf_ANCH
898 && prog->check_offset_max != SSize_t_MAX)
900 SSize_t len = SvCUR(check) - !!SvTAIL(check);
901 const char * const anchor =
902 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
904 /* do a bytes rather than chars comparison. It's conservative;
905 * so it skips doing the HOP if the result can't possibly end
906 * up earlier than the old value of end_point.
908 if ((char*)end_point - anchor > prog->check_offset_max) {
909 end_point = HOP3lim((U8*)anchor,
910 prog->check_offset_max,
916 check_at = fbm_instr( start_point, end_point,
917 check, multiline ? FBMrf_MULTILINE : 0);
919 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
920 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
921 (IV)((char*)start_point - strbeg),
922 (IV)((char*)end_point - strbeg),
923 (IV)(check_at ? check_at - strbeg : -1)
926 /* Update the count-of-usability, remove useless subpatterns,
930 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
931 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
932 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
933 (check_at ? "Found" : "Did not find"),
934 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
935 ? "anchored" : "floating"),
938 (check_at ? " at offset " : "...\n") );
943 /* set rx_origin to the minimum position where the regex could start
944 * matching, given the constraint of the just-matched check substring.
945 * But don't set it lower than previously.
948 if (check_at - rx_origin > prog->check_offset_max)
949 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
950 /* Finish the diagnostic message */
951 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
952 "%ld (rx_origin now %"IVdf")...\n",
953 (long)(check_at - strbeg),
954 (IV)(rx_origin - strbeg)
959 /* now look for the 'other' substring if defined */
961 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
962 : prog->substrs->data[other_ix].substr)
964 /* Take into account the "other" substring. */
968 struct reg_substr_datum *other;
971 other = &prog->substrs->data[other_ix];
973 /* if "other" is anchored:
974 * we've previously found a floating substr starting at check_at.
975 * This means that the regex origin must lie somewhere
976 * between min (rx_origin): HOP3(check_at, -check_offset_max)
977 * and max: HOP3(check_at, -check_offset_min)
978 * (except that min will be >= strpos)
979 * So the fixed substr must lie somewhere between
980 * HOP3(min, anchored_offset)
981 * HOP3(max, anchored_offset) + SvCUR(substr)
984 /* if "other" is floating
985 * Calculate last1, the absolute latest point where the
986 * floating substr could start in the string, ignoring any
987 * constraints from the earlier fixed match. It is calculated
990 * strend - prog->minlen (in chars) is the absolute latest
991 * position within the string where the origin of the regex
992 * could appear. The latest start point for the floating
993 * substr is float_min_offset(*) on from the start of the
994 * regex. last1 simply combines thee two offsets.
996 * (*) You might think the latest start point should be
997 * float_max_offset from the regex origin, and technically
998 * you'd be correct. However, consider
1000 * Here, float min, max are 3,5 and minlen is 7.
1001 * This can match either
1005 * In the first case, the regex matches minlen chars; in the
1006 * second, minlen+1, in the third, minlen+2.
1007 * In the first case, the floating offset is 3 (which equals
1008 * float_min), in the second, 4, and in the third, 5 (which
1009 * equals float_max). In all cases, the floating string bcd
1010 * can never start more than 4 chars from the end of the
1011 * string, which equals minlen - float_min. As the substring
1012 * starts to match more than float_min from the start of the
1013 * regex, it makes the regex match more than minlen chars,
1014 * and the two cancel each other out. So we can always use
1015 * float_min - minlen, rather than float_max - minlen for the
1016 * latest position in the string.
1018 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1019 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1022 assert(prog->minlen >= other->min_offset);
1023 last1 = HOP3c(strend,
1024 other->min_offset - prog->minlen, strbeg);
1026 if (other_ix) {/* i.e. if (other-is-float) */
1027 /* last is the latest point where the floating substr could
1028 * start, *given* any constraints from the earlier fixed
1029 * match. This constraint is that the floating string starts
1030 * <= float_max_offset chars from the regex origin (rx_origin).
1031 * If this value is less than last1, use it instead.
1033 assert(rx_origin <= last1);
1035 /* this condition handles the offset==infinity case, and
1036 * is a short-cut otherwise. Although it's comparing a
1037 * byte offset to a char length, it does so in a safe way,
1038 * since 1 char always occupies 1 or more bytes,
1039 * so if a string range is (last1 - rx_origin) bytes,
1040 * it will be less than or equal to (last1 - rx_origin)
1041 * chars; meaning it errs towards doing the accurate HOP3
1042 * rather than just using last1 as a short-cut */
1043 (last1 - rx_origin) < other->max_offset
1045 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1048 assert(strpos + start_shift <= check_at);
1049 last = HOP4c(check_at, other->min_offset - start_shift,
1053 s = HOP3c(rx_origin, other->min_offset, strend);
1054 if (s < other_last) /* These positions already checked */
1057 must = utf8_target ? other->utf8_substr : other->substr;
1058 assert(SvPOK(must));
1061 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1067 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1068 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1069 (IV)(from - strbeg),
1075 (unsigned char*)from,
1078 multiline ? FBMrf_MULTILINE : 0
1080 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1081 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1082 (IV)(from - strbeg),
1084 (IV)(s ? s - strbeg : -1)
1090 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1091 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1092 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1093 s ? "Found" : "Contradicts",
1094 other_ix ? "floating" : "anchored",
1095 quoted, RE_SV_TAIL(must));
1100 /* last1 is latest possible substr location. If we didn't
1101 * find it before there, we never will */
1102 if (last >= last1) {
1103 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1104 "; giving up...\n"));
1108 /* try to find the check substr again at a later
1109 * position. Maybe next time we'll find the "other" substr
1111 other_last = HOP3c(last, 1, strend) /* highest failure */;
1113 other_ix /* i.e. if other-is-float */
1114 ? HOP3c(rx_origin, 1, strend)
1115 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1116 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1117 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1118 (other_ix ? "floating" : "anchored"),
1119 (long)(HOP3c(check_at, 1, strend) - strbeg),
1120 (IV)(rx_origin - strbeg)
1125 if (other_ix) { /* if (other-is-float) */
1126 /* other_last is set to s, not s+1, since its possible for
1127 * a floating substr to fail first time, then succeed
1128 * second time at the same floating position; e.g.:
1129 * "-AB--AABZ" =~ /\wAB\d*Z/
1130 * The first time round, anchored and float match at
1131 * "-(AB)--AAB(Z)" then fail on the initial \w character
1132 * class. Second time round, they match at "-AB--A(AB)(Z)".
1137 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1138 other_last = HOP3c(s, 1, strend);
1140 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1141 " at offset %ld (rx_origin now %"IVdf")...\n",
1143 (IV)(rx_origin - strbeg)
1149 DEBUG_OPTIMISE_MORE_r(
1150 Perl_re_printf( aTHX_
1151 " Check-only match: offset min:%"IVdf" max:%"IVdf
1152 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1153 " strend:%"IVdf"\n",
1154 (IV)prog->check_offset_min,
1155 (IV)prog->check_offset_max,
1156 (IV)(check_at-strbeg),
1157 (IV)(rx_origin-strbeg),
1158 (IV)(rx_origin-check_at),
1164 postprocess_substr_matches:
1166 /* handle the extra constraint of /^.../m if present */
1168 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1171 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1172 " looking for /^/m anchor"));
1174 /* we have failed the constraint of a \n before rx_origin.
1175 * Find the next \n, if any, even if it's beyond the current
1176 * anchored and/or floating substrings. Whether we should be
1177 * scanning ahead for the next \n or the next substr is debatable.
1178 * On the one hand you'd expect rare substrings to appear less
1179 * often than \n's. On the other hand, searching for \n means
1180 * we're effectively flipping between check_substr and "\n" on each
1181 * iteration as the current "rarest" string candidate, which
1182 * means for example that we'll quickly reject the whole string if
1183 * hasn't got a \n, rather than trying every substr position
1187 s = HOP3c(strend, - prog->minlen, strpos);
1188 if (s <= rx_origin ||
1189 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1191 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1192 " Did not find /%s^%s/m...\n",
1193 PL_colors[0], PL_colors[1]));
1197 /* earliest possible origin is 1 char after the \n.
1198 * (since *rx_origin == '\n', it's safe to ++ here rather than
1199 * HOP(rx_origin, 1)) */
1202 if (prog->substrs->check_ix == 0 /* check is anchored */
1203 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1205 /* Position contradicts check-string; either because
1206 * check was anchored (and thus has no wiggle room),
1207 * or check was float and rx_origin is above the float range */
1208 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1209 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1210 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1214 /* if we get here, the check substr must have been float,
1215 * is in range, and we may or may not have had an anchored
1216 * "other" substr which still contradicts */
1217 assert(prog->substrs->check_ix); /* check is float */
1219 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1220 /* whoops, the anchored "other" substr exists, so we still
1221 * contradict. On the other hand, the float "check" substr
1222 * didn't contradict, so just retry the anchored "other"
1224 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1225 " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1226 PL_colors[0], PL_colors[1],
1227 (IV)(rx_origin - strbeg + prog->anchored_offset),
1228 (IV)(rx_origin - strbeg)
1230 goto do_other_substr;
1233 /* success: we don't contradict the found floating substring
1234 * (and there's no anchored substr). */
1235 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1236 " Found /%s^%s/m with rx_origin %ld...\n",
1237 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1240 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1241 " (multiline anchor test skipped)\n"));
1247 /* if we have a starting character class, then test that extra constraint.
1248 * (trie stclasses are too expensive to use here, we are better off to
1249 * leave it to regmatch itself) */
1251 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1252 const U8* const str = (U8*)STRING(progi->regstclass);
1254 /* XXX this value could be pre-computed */
1255 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1256 ? (reginfo->is_utf8_pat
1257 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1258 : STR_LEN(progi->regstclass))
1262 /* latest pos that a matching float substr constrains rx start to */
1263 char *rx_max_float = NULL;
1265 /* if the current rx_origin is anchored, either by satisfying an
1266 * anchored substring constraint, or a /^.../m constraint, then we
1267 * can reject the current origin if the start class isn't found
1268 * at the current position. If we have a float-only match, then
1269 * rx_origin is constrained to a range; so look for the start class
1270 * in that range. if neither, then look for the start class in the
1271 * whole rest of the string */
1273 /* XXX DAPM it's not clear what the minlen test is for, and why
1274 * it's not used in the floating case. Nothing in the test suite
1275 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1276 * Here are some old comments, which may or may not be correct:
1278 * minlen == 0 is possible if regstclass is \b or \B,
1279 * and the fixed substr is ''$.
1280 * Since minlen is already taken into account, rx_origin+1 is
1281 * before strend; accidentally, minlen >= 1 guaranties no false
1282 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1283 * 0) below assumes that regstclass does not come from lookahead...
1284 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1285 * This leaves EXACTF-ish only, which are dealt with in
1289 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1290 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1291 else if (prog->float_substr || prog->float_utf8) {
1292 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1293 endpos= HOP3c(rx_max_float, cl_l, strend);
1298 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1299 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1300 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1301 (IV)start_shift, (IV)(check_at - strbeg),
1302 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1304 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1307 if (endpos == strend) {
1308 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1309 " Could not match STCLASS...\n") );
1312 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1313 " This position contradicts STCLASS...\n") );
1314 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1315 && !(prog->intflags & PREGf_IMPLICIT))
1318 /* Contradict one of substrings */
1319 if (prog->anchored_substr || prog->anchored_utf8) {
1320 if (prog->substrs->check_ix == 1) { /* check is float */
1321 /* Have both, check_string is floating */
1322 assert(rx_origin + start_shift <= check_at);
1323 if (rx_origin + start_shift != check_at) {
1324 /* not at latest position float substr could match:
1325 * Recheck anchored substring, but not floating.
1326 * The condition above is in bytes rather than
1327 * chars for efficiency. It's conservative, in
1328 * that it errs on the side of doing 'goto
1329 * do_other_substr'. In this case, at worst,
1330 * an extra anchored search may get done, but in
1331 * practice the extra fbm_instr() is likely to
1332 * get skipped anyway. */
1333 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1334 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1335 (long)(other_last - strbeg),
1336 (IV)(rx_origin - strbeg)
1338 goto do_other_substr;
1346 /* In the presence of ml_anch, we might be able to
1347 * find another \n without breaking the current float
1350 /* strictly speaking this should be HOP3c(..., 1, ...),
1351 * but since we goto a block of code that's going to
1352 * search for the next \n if any, its safe here */
1354 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1355 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1356 PL_colors[0], PL_colors[1],
1357 (long)(rx_origin - strbeg)) );
1358 goto postprocess_substr_matches;
1361 /* strictly speaking this can never be true; but might
1362 * be if we ever allow intuit without substrings */
1363 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1366 rx_origin = rx_max_float;
1369 /* at this point, any matching substrings have been
1370 * contradicted. Start again... */
1372 rx_origin = HOP3c(rx_origin, 1, strend);
1374 /* uses bytes rather than char calculations for efficiency.
1375 * It's conservative: it errs on the side of doing 'goto restart',
1376 * where there is code that does a proper char-based test */
1377 if (rx_origin + start_shift + end_shift > strend) {
1378 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1379 " Could not match STCLASS...\n") );
1382 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1383 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1384 (prog->substrs->check_ix ? "floating" : "anchored"),
1385 (long)(rx_origin + start_shift - strbeg),
1386 (IV)(rx_origin - strbeg)
1393 if (rx_origin != s) {
1394 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1395 " By STCLASS: moving %ld --> %ld\n",
1396 (long)(rx_origin - strbeg), (long)(s - strbeg))
1400 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1401 " Does not contradict STCLASS...\n");
1406 /* Decide whether using the substrings helped */
1408 if (rx_origin != strpos) {
1409 /* Fixed substring is found far enough so that the match
1410 cannot start at strpos. */
1412 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1413 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1416 /* The found rx_origin position does not prohibit matching at
1417 * strpos, so calling intuit didn't gain us anything. Decrement
1418 * the BmUSEFUL() count on the check substring, and if we reach
1420 if (!(prog->intflags & PREGf_NAUGHTY)
1422 prog->check_utf8 /* Could be deleted already */
1423 && --BmUSEFUL(prog->check_utf8) < 0
1424 && (prog->check_utf8 == prog->float_utf8)
1426 prog->check_substr /* Could be deleted already */
1427 && --BmUSEFUL(prog->check_substr) < 0
1428 && (prog->check_substr == prog->float_substr)
1431 /* If flags & SOMETHING - do not do it many times on the same match */
1432 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1433 /* XXX Does the destruction order has to change with utf8_target? */
1434 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1435 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1436 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1437 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1438 check = NULL; /* abort */
1439 /* XXXX This is a remnant of the old implementation. It
1440 looks wasteful, since now INTUIT can use many
1441 other heuristics. */
1442 prog->extflags &= ~RXf_USE_INTUIT;
1446 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1447 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1448 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1452 fail_finish: /* Substring not found */
1453 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1454 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1456 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1457 PL_colors[4], PL_colors[5]));
1462 #define DECL_TRIE_TYPE(scan) \
1463 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1464 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1465 trie_utf8l, trie_flu8 } \
1466 trie_type = ((scan->flags == EXACT) \
1467 ? (utf8_target ? trie_utf8 : trie_plain) \
1468 : (scan->flags == EXACTL) \
1469 ? (utf8_target ? trie_utf8l : trie_plain) \
1470 : (scan->flags == EXACTFA) \
1472 ? trie_utf8_exactfa_fold \
1473 : trie_latin_utf8_exactfa_fold) \
1474 : (scan->flags == EXACTFLU8 \
1478 : trie_latin_utf8_fold)))
1480 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1483 U8 flags = FOLD_FLAGS_FULL; \
1484 switch (trie_type) { \
1486 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1487 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1488 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1490 goto do_trie_utf8_fold; \
1491 case trie_utf8_exactfa_fold: \
1492 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1494 case trie_utf8_fold: \
1495 do_trie_utf8_fold: \
1496 if ( foldlen>0 ) { \
1497 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1502 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1503 len = UTF8SKIP(uc); \
1504 skiplen = UVCHR_SKIP( uvc ); \
1505 foldlen -= skiplen; \
1506 uscan = foldbuf + skiplen; \
1509 case trie_latin_utf8_exactfa_fold: \
1510 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1512 case trie_latin_utf8_fold: \
1513 if ( foldlen>0 ) { \
1514 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1520 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1521 skiplen = UVCHR_SKIP( uvc ); \
1522 foldlen -= skiplen; \
1523 uscan = foldbuf + skiplen; \
1527 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1528 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1529 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1533 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1540 charid = trie->charmap[ uvc ]; \
1544 if (widecharmap) { \
1545 SV** const svpp = hv_fetch(widecharmap, \
1546 (char*)&uvc, sizeof(UV), 0); \
1548 charid = (U16)SvIV(*svpp); \
1553 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1554 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1555 startpos, doutf8, depth)
1557 #define REXEC_FBC_EXACTISH_SCAN(COND) \
1561 && (ln == 1 || folder(s, pat_string, ln)) \
1562 && (reginfo->intuit || regtry(reginfo, &s)) )\
1568 #define REXEC_FBC_UTF8_SCAN(CODE) \
1570 while (s < strend) { \
1576 #define REXEC_FBC_SCAN(CODE) \
1578 while (s < strend) { \
1584 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1585 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1587 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1596 #define REXEC_FBC_CLASS_SCAN(COND) \
1597 REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1599 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1608 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \
1609 if (utf8_target) { \
1610 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
1613 REXEC_FBC_CLASS_SCAN(COND); \
1616 /* The three macros below are slightly different versions of the same logic.
1618 * The first is for /a and /aa when the target string is UTF-8. This can only
1619 * match ascii, but it must advance based on UTF-8. The other two handle the
1620 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1621 * for the boundary (or non-boundary) between a word and non-word character.
1622 * The utf8 and non-utf8 cases have the same logic, but the details must be
1623 * different. Find the "wordness" of the character just prior to this one, and
1624 * compare it with the wordness of this one. If they differ, we have a
1625 * boundary. At the beginning of the string, pretend that the previous
1626 * character was a new-line.
1628 * All these macros uncleanly have side-effects with each other and outside
1629 * variables. So far it's been too much trouble to clean-up
1631 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1632 * a word character or not.
1633 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1635 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1637 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1638 * are looking for a boundary or for a non-boundary. If we are looking for a
1639 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1640 * see if this tentative match actually works, and if so, to quit the loop
1641 * here. And vice-versa if we are looking for a non-boundary.
1643 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1644 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1645 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1646 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1647 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1648 * complement. But in that branch we complement tmp, meaning that at the
1649 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1650 * which means at the top of the loop in the next iteration, it is
1651 * TEST_NON_UTF8(s-1) */
1652 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1653 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1654 tmp = TEST_NON_UTF8(tmp); \
1655 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1656 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1658 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1665 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1666 * TEST_UTF8 is a macro that for the same input code points returns identically
1667 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1668 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1669 if (s == reginfo->strbeg) { \
1672 else { /* Back-up to the start of the previous character */ \
1673 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1674 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1675 0, UTF8_ALLOW_DEFAULT); \
1677 tmp = TEST_UV(tmp); \
1678 LOAD_UTF8_CHARCLASS_ALNUM(); \
1679 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1680 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1689 /* Like the above two macros. UTF8_CODE is the complete code for handling
1690 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1692 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1693 if (utf8_target) { \
1696 else { /* Not utf8 */ \
1697 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1698 tmp = TEST_NON_UTF8(tmp); \
1699 REXEC_FBC_SCAN( /* advances s while s < strend */ \
1700 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1709 /* Here, things have been set up by the previous code so that tmp is the \
1710 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1711 * utf8ness of the target). We also have to check if this matches against \
1712 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1713 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1715 if (tmp == ! TEST_NON_UTF8('\n')) { \
1722 /* This is the macro to use when we want to see if something that looks like it
1723 * could match, actually does, and if so exits the loop */
1724 #define REXEC_FBC_TRYIT \
1725 if ((reginfo->intuit || regtry(reginfo, &s))) \
1728 /* The only difference between the BOUND and NBOUND cases is that
1729 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1730 * NBOUND. This is accomplished by passing it as either the if or else clause,
1731 * with the other one being empty (PLACEHOLDER is defined as empty).
1733 * The TEST_FOO parameters are for operating on different forms of input, but
1734 * all should be ones that return identically for the same underlying code
1736 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1738 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1739 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1741 #define FBC_BOUND_A(TEST_NON_UTF8) \
1743 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1744 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1746 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1748 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1749 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1751 #define FBC_NBOUND_A(TEST_NON_UTF8) \
1753 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1754 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1758 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1759 IV cp_out = Perl__invlist_search(invlist, cp_in);
1760 assert(cp_out >= 0);
1763 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1764 invmap[S_get_break_val_cp_checked(invlist, cp)]
1766 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1767 invmap[_invlist_search(invlist, cp)]
1770 /* Takes a pointer to an inversion list, a pointer to its corresponding
1771 * inversion map, and a code point, and returns the code point's value
1772 * according to the two arrays. It assumes that all code points have a value.
1773 * This is used as the base macro for macros for particular properties */
1774 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1775 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1777 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1778 * of a code point, returning the value for the first code point in the string.
1779 * And it takes the particular macro name that finds the desired value given a
1780 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1781 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1782 (__ASSERT_(pos < strend) \
1783 /* Note assumes is valid UTF-8 */ \
1784 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1786 /* Returns the GCB value for the input code point */
1787 #define getGCB_VAL_CP(cp) \
1788 _generic_GET_BREAK_VAL_CP( \
1793 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1794 * bounded by pos and strend */
1795 #define getGCB_VAL_UTF8(pos, strend) \
1796 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1798 /* Returns the LB value for the input code point */
1799 #define getLB_VAL_CP(cp) \
1800 _generic_GET_BREAK_VAL_CP( \
1805 /* Returns the LB value for the first code point in the UTF-8 encoded string
1806 * bounded by pos and strend */
1807 #define getLB_VAL_UTF8(pos, strend) \
1808 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1811 /* Returns the SB value for the input code point */
1812 #define getSB_VAL_CP(cp) \
1813 _generic_GET_BREAK_VAL_CP( \
1818 /* Returns the SB value for the first code point in the UTF-8 encoded string
1819 * bounded by pos and strend */
1820 #define getSB_VAL_UTF8(pos, strend) \
1821 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1823 /* Returns the WB value for the input code point */
1824 #define getWB_VAL_CP(cp) \
1825 _generic_GET_BREAK_VAL_CP( \
1830 /* Returns the WB value for the first code point in the UTF-8 encoded string
1831 * bounded by pos and strend */
1832 #define getWB_VAL_UTF8(pos, strend) \
1833 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1835 /* We know what class REx starts with. Try to find this position... */
1836 /* if reginfo->intuit, its a dryrun */
1837 /* annoyingly all the vars in this routine have different names from their counterparts
1838 in regmatch. /grrr */
1840 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1841 const char *strend, regmatch_info *reginfo)
1844 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1845 char *pat_string; /* The pattern's exactish string */
1846 char *pat_end; /* ptr to end char of pat_string */
1847 re_fold_t folder; /* Function for computing non-utf8 folds */
1848 const U8 *fold_array; /* array for folding ords < 256 */
1854 I32 tmp = 1; /* Scratch variable? */
1855 const bool utf8_target = reginfo->is_utf8_target;
1856 UV utf8_fold_flags = 0;
1857 const bool is_utf8_pat = reginfo->is_utf8_pat;
1858 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1859 with a result inverts that result, as 0^1 =
1861 _char_class_number classnum;
1863 RXi_GET_DECL(prog,progi);
1865 PERL_ARGS_ASSERT_FIND_BYCLASS;
1867 /* We know what class it must start with. */
1870 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1872 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1873 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1880 REXEC_FBC_UTF8_CLASS_SCAN(
1881 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1884 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
1888 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1889 assert(! is_utf8_pat);
1892 if (is_utf8_pat || utf8_target) {
1893 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1894 goto do_exactf_utf8;
1896 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1897 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1898 goto do_exactf_non_utf8; /* isn't dealt with by these */
1900 case EXACTF: /* This node only generated for non-utf8 patterns */
1901 assert(! is_utf8_pat);
1903 utf8_fold_flags = 0;
1904 goto do_exactf_utf8;
1906 fold_array = PL_fold;
1908 goto do_exactf_non_utf8;
1911 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1912 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1913 utf8_fold_flags = FOLDEQ_LOCALE;
1914 goto do_exactf_utf8;
1916 fold_array = PL_fold_locale;
1917 folder = foldEQ_locale;
1918 goto do_exactf_non_utf8;
1922 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1924 goto do_exactf_utf8;
1927 if (! utf8_target) { /* All code points in this node require
1928 UTF-8 to express. */
1931 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1932 | FOLDEQ_S2_FOLDS_SANE;
1933 goto do_exactf_utf8;
1936 if (is_utf8_pat || utf8_target) {
1937 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1938 goto do_exactf_utf8;
1941 /* Any 'ss' in the pattern should have been replaced by regcomp,
1942 * so we don't have to worry here about this single special case
1943 * in the Latin1 range */
1944 fold_array = PL_fold_latin1;
1945 folder = foldEQ_latin1;
1949 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1950 are no glitches with fold-length differences
1951 between the target string and pattern */
1953 /* The idea in the non-utf8 EXACTF* cases is to first find the
1954 * first character of the EXACTF* node and then, if necessary,
1955 * case-insensitively compare the full text of the node. c1 is the
1956 * first character. c2 is its fold. This logic will not work for
1957 * Unicode semantics and the german sharp ss, which hence should
1958 * not be compiled into a node that gets here. */
1959 pat_string = STRING(c);
1960 ln = STR_LEN(c); /* length to match in octets/bytes */
1962 /* We know that we have to match at least 'ln' bytes (which is the
1963 * same as characters, since not utf8). If we have to match 3
1964 * characters, and there are only 2 availabe, we know without
1965 * trying that it will fail; so don't start a match past the
1966 * required minimum number from the far end */
1967 e = HOP3c(strend, -((SSize_t)ln), s);
1969 if (reginfo->intuit && e < s) {
1970 e = s; /* Due to minlen logic of intuit() */
1974 c2 = fold_array[c1];
1975 if (c1 == c2) { /* If char and fold are the same */
1976 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1979 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1987 /* If one of the operands is in utf8, we can't use the simpler folding
1988 * above, due to the fact that many different characters can have the
1989 * same fold, or portion of a fold, or different- length fold */
1990 pat_string = STRING(c);
1991 ln = STR_LEN(c); /* length to match in octets/bytes */
1992 pat_end = pat_string + ln;
1993 lnc = is_utf8_pat /* length to match in characters */
1994 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1997 /* We have 'lnc' characters to match in the pattern, but because of
1998 * multi-character folding, each character in the target can match
1999 * up to 3 characters (Unicode guarantees it will never exceed
2000 * this) if it is utf8-encoded; and up to 2 if not (based on the
2001 * fact that the Latin 1 folds are already determined, and the
2002 * only multi-char fold in that range is the sharp-s folding to
2003 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2004 * string character. Adjust lnc accordingly, rounding up, so that
2005 * if we need to match at least 4+1/3 chars, that really is 5. */
2006 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2007 lnc = (lnc + expansion - 1) / expansion;
2009 /* As in the non-UTF8 case, if we have to match 3 characters, and
2010 * only 2 are left, it's guaranteed to fail, so don't start a
2011 * match that would require us to go beyond the end of the string
2013 e = HOP3c(strend, -((SSize_t)lnc), s);
2015 if (reginfo->intuit && e < s) {
2016 e = s; /* Due to minlen logic of intuit() */
2019 /* XXX Note that we could recalculate e to stop the loop earlier,
2020 * as the worst case expansion above will rarely be met, and as we
2021 * go along we would usually find that e moves further to the left.
2022 * This would happen only after we reached the point in the loop
2023 * where if there were no expansion we should fail. Unclear if
2024 * worth the expense */
2027 char *my_strend= (char *)strend;
2028 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2029 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2030 && (reginfo->intuit || regtry(reginfo, &s)) )
2034 s += (utf8_target) ? UTF8SKIP(s) : 1;
2040 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2041 if (FLAGS(c) != TRADITIONAL_BOUND) {
2042 if (! IN_UTF8_CTYPE_LOCALE) {
2043 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2044 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2049 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2053 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2054 if (FLAGS(c) != TRADITIONAL_BOUND) {
2055 if (! IN_UTF8_CTYPE_LOCALE) {
2056 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2057 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2062 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2065 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2067 assert(FLAGS(c) == TRADITIONAL_BOUND);
2069 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2072 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2074 assert(FLAGS(c) == TRADITIONAL_BOUND);
2076 FBC_BOUND_A(isWORDCHAR_A);
2079 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2081 assert(FLAGS(c) == TRADITIONAL_BOUND);
2083 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2086 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2088 assert(FLAGS(c) == TRADITIONAL_BOUND);
2090 FBC_NBOUND_A(isWORDCHAR_A);
2094 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2095 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2106 switch((bound_type) FLAGS(c)) {
2107 case TRADITIONAL_BOUND:
2108 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2111 if (s == reginfo->strbeg) {
2112 if (reginfo->intuit || regtry(reginfo, &s))
2117 /* Didn't match. Try at the next position (if there is one) */
2118 s += (utf8_target) ? UTF8SKIP(s) : 1;
2119 if (UNLIKELY(s >= reginfo->strend)) {
2125 GCB_enum before = getGCB_VAL_UTF8(
2127 (U8*)(reginfo->strbeg)),
2128 (U8*) reginfo->strend);
2129 while (s < strend) {
2130 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2131 (U8*) reginfo->strend);
2132 if ( (to_complement ^ isGCB(before,
2134 (U8*) reginfo->strbeg,
2137 && (reginfo->intuit || regtry(reginfo, &s)))
2145 else { /* Not utf8. Everything is a GCB except between CR and
2147 while (s < strend) {
2148 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2149 || UCHARAT(s) != '\n'))
2150 && (reginfo->intuit || regtry(reginfo, &s)))
2158 /* And, since this is a bound, it can match after the final
2159 * character in the string */
2160 if ((reginfo->intuit || regtry(reginfo, &s))) {
2166 if (s == reginfo->strbeg) {
2167 if (reginfo->intuit || regtry(reginfo, &s)) {
2170 s += (utf8_target) ? UTF8SKIP(s) : 1;
2171 if (UNLIKELY(s >= reginfo->strend)) {
2177 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2179 (U8*)(reginfo->strbeg)),
2180 (U8*) reginfo->strend);
2181 while (s < strend) {
2182 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2183 if (to_complement ^ isLB(before,
2185 (U8*) reginfo->strbeg,
2187 (U8*) reginfo->strend,
2189 && (reginfo->intuit || regtry(reginfo, &s)))
2197 else { /* Not utf8. */
2198 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2199 while (s < strend) {
2200 LB_enum after = getLB_VAL_CP((U8) *s);
2201 if (to_complement ^ isLB(before,
2203 (U8*) reginfo->strbeg,
2205 (U8*) reginfo->strend,
2207 && (reginfo->intuit || regtry(reginfo, &s)))
2216 if (reginfo->intuit || regtry(reginfo, &s)) {
2223 if (s == reginfo->strbeg) {
2224 if (reginfo->intuit || regtry(reginfo, &s)) {
2227 s += (utf8_target) ? UTF8SKIP(s) : 1;
2228 if (UNLIKELY(s >= reginfo->strend)) {
2234 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2236 (U8*)(reginfo->strbeg)),
2237 (U8*) reginfo->strend);
2238 while (s < strend) {
2239 SB_enum after = getSB_VAL_UTF8((U8*) s,
2240 (U8*) reginfo->strend);
2241 if ((to_complement ^ isSB(before,
2243 (U8*) reginfo->strbeg,
2245 (U8*) reginfo->strend,
2247 && (reginfo->intuit || regtry(reginfo, &s)))
2255 else { /* Not utf8. */
2256 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2257 while (s < strend) {
2258 SB_enum after = getSB_VAL_CP((U8) *s);
2259 if ((to_complement ^ isSB(before,
2261 (U8*) reginfo->strbeg,
2263 (U8*) reginfo->strend,
2265 && (reginfo->intuit || regtry(reginfo, &s)))
2274 /* Here are at the final position in the target string. The SB
2275 * value is always true here, so matches, depending on other
2277 if (reginfo->intuit || regtry(reginfo, &s)) {
2284 if (s == reginfo->strbeg) {
2285 if (reginfo->intuit || regtry(reginfo, &s)) {
2288 s += (utf8_target) ? UTF8SKIP(s) : 1;
2289 if (UNLIKELY(s >= reginfo->strend)) {
2295 /* We are at a boundary between char_sub_0 and char_sub_1.
2296 * We also keep track of the value for char_sub_-1 as we
2297 * loop through the line. Context may be needed to make a
2298 * determination, and if so, this can save having to
2300 WB_enum previous = WB_UNKNOWN;
2301 WB_enum before = getWB_VAL_UTF8(
2304 (U8*)(reginfo->strbeg)),
2305 (U8*) reginfo->strend);
2306 while (s < strend) {
2307 WB_enum after = getWB_VAL_UTF8((U8*) s,
2308 (U8*) reginfo->strend);
2309 if ((to_complement ^ isWB(previous,
2312 (U8*) reginfo->strbeg,
2314 (U8*) reginfo->strend,
2316 && (reginfo->intuit || regtry(reginfo, &s)))
2325 else { /* Not utf8. */
2326 WB_enum previous = WB_UNKNOWN;
2327 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2328 while (s < strend) {
2329 WB_enum after = getWB_VAL_CP((U8) *s);
2330 if ((to_complement ^ isWB(previous,
2333 (U8*) reginfo->strbeg,
2335 (U8*) reginfo->strend,
2337 && (reginfo->intuit || regtry(reginfo, &s)))
2347 if (reginfo->intuit || regtry(reginfo, &s)) {
2354 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2355 is_LNBREAK_latin1_safe(s, strend)
2359 /* The argument to all the POSIX node types is the class number to pass to
2360 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2367 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2368 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2369 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2384 /* The complement of something that matches only ASCII matches all
2385 * non-ASCII, plus everything in ASCII that isn't in the class. */
2386 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2387 || ! _generic_isCC_A(*s, FLAGS(c)));
2396 /* Don't need to worry about utf8, as it can match only a single
2397 * byte invariant character. */
2398 REXEC_FBC_CLASS_SCAN(
2399 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2407 if (! utf8_target) {
2408 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2414 classnum = (_char_class_number) FLAGS(c);
2415 if (classnum < _FIRST_NON_SWASH_CC) {
2416 while (s < strend) {
2418 /* We avoid loading in the swash as long as possible, but
2419 * should we have to, we jump to a separate loop. This
2420 * extra 'if' statement is what keeps this code from being
2421 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2422 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2423 goto found_above_latin1;
2425 if ((UTF8_IS_INVARIANT(*s)
2426 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2428 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2429 && to_complement ^ cBOOL(
2430 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2434 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2446 else switch (classnum) { /* These classes are implemented as
2448 case _CC_ENUM_SPACE:
2449 REXEC_FBC_UTF8_CLASS_SCAN(
2450 to_complement ^ cBOOL(isSPACE_utf8(s)));
2453 case _CC_ENUM_BLANK:
2454 REXEC_FBC_UTF8_CLASS_SCAN(
2455 to_complement ^ cBOOL(isBLANK_utf8(s)));
2458 case _CC_ENUM_XDIGIT:
2459 REXEC_FBC_UTF8_CLASS_SCAN(
2460 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2463 case _CC_ENUM_VERTSPACE:
2464 REXEC_FBC_UTF8_CLASS_SCAN(
2465 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2468 case _CC_ENUM_CNTRL:
2469 REXEC_FBC_UTF8_CLASS_SCAN(
2470 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2474 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2475 NOT_REACHED; /* NOTREACHED */
2480 found_above_latin1: /* Here we have to load a swash to get the result
2481 for the current code point */
2482 if (! PL_utf8_swash_ptrs[classnum]) {
2483 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2484 PL_utf8_swash_ptrs[classnum] =
2485 _core_swash_init("utf8",
2488 PL_XPosix_ptrs[classnum], &flags);
2491 /* This is a copy of the loop above for swash classes, though using the
2492 * FBC macro instead of being expanded out. Since we've loaded the
2493 * swash, we don't have to check for that each time through the loop */
2494 REXEC_FBC_UTF8_CLASS_SCAN(
2495 to_complement ^ cBOOL(_generic_utf8(
2498 swash_fetch(PL_utf8_swash_ptrs[classnum],
2506 /* what trie are we using right now */
2507 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2508 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2509 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2511 const char *last_start = strend - trie->minlen;
2513 const char *real_start = s;
2515 STRLEN maxlen = trie->maxlen;
2517 U8 **points; /* map of where we were in the input string
2518 when reading a given char. For ASCII this
2519 is unnecessary overhead as the relationship
2520 is always 1:1, but for Unicode, especially
2521 case folded Unicode this is not true. */
2522 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2526 GET_RE_DEBUG_FLAGS_DECL;
2528 /* We can't just allocate points here. We need to wrap it in
2529 * an SV so it gets freed properly if there is a croak while
2530 * running the match */
2533 sv_points=newSV(maxlen * sizeof(U8 *));
2534 SvCUR_set(sv_points,
2535 maxlen * sizeof(U8 *));
2536 SvPOK_on(sv_points);
2537 sv_2mortal(sv_points);
2538 points=(U8**)SvPV_nolen(sv_points );
2539 if ( trie_type != trie_utf8_fold
2540 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2543 bitmap=(U8*)trie->bitmap;
2545 bitmap=(U8*)ANYOF_BITMAP(c);
2547 /* this is the Aho-Corasick algorithm modified a touch
2548 to include special handling for long "unknown char" sequences.
2549 The basic idea being that we use AC as long as we are dealing
2550 with a possible matching char, when we encounter an unknown char
2551 (and we have not encountered an accepting state) we scan forward
2552 until we find a legal starting char.
2553 AC matching is basically that of trie matching, except that when
2554 we encounter a failing transition, we fall back to the current
2555 states "fail state", and try the current char again, a process
2556 we repeat until we reach the root state, state 1, or a legal
2557 transition. If we fail on the root state then we can either
2558 terminate if we have reached an accepting state previously, or
2559 restart the entire process from the beginning if we have not.
2562 while (s <= last_start) {
2563 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2571 U8 *uscan = (U8*)NULL;
2572 U8 *leftmost = NULL;
2574 U32 accepted_word= 0;
2578 while ( state && uc <= (U8*)strend ) {
2580 U32 word = aho->states[ state ].wordnum;
2584 DEBUG_TRIE_EXECUTE_r(
2585 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2586 dump_exec_pos( (char *)uc, c, strend, real_start,
2587 (char *)uc, utf8_target, 0 );
2588 Perl_re_printf( aTHX_
2589 " Scanning for legal start char...\n");
2593 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2597 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2603 if (uc >(U8*)last_start) break;
2607 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2608 if (!leftmost || lpos < leftmost) {
2609 DEBUG_r(accepted_word=word);
2615 points[pointpos++ % maxlen]= uc;
2616 if (foldlen || uc < (U8*)strend) {
2617 REXEC_TRIE_READ_CHAR(trie_type, trie,
2619 uscan, len, uvc, charid, foldlen,
2621 DEBUG_TRIE_EXECUTE_r({
2622 dump_exec_pos( (char *)uc, c, strend,
2623 real_start, s, utf8_target, 0);
2624 Perl_re_printf( aTHX_
2625 " Charid:%3u CP:%4"UVxf" ",
2637 word = aho->states[ state ].wordnum;
2639 base = aho->states[ state ].trans.base;
2641 DEBUG_TRIE_EXECUTE_r({
2643 dump_exec_pos( (char *)uc, c, strend, real_start,
2644 s, utf8_target, 0 );
2645 Perl_re_printf( aTHX_
2646 "%sState: %4"UVxf", word=%"UVxf,
2647 failed ? " Fail transition to " : "",
2648 (UV)state, (UV)word);
2654 ( ((offset = base + charid
2655 - 1 - trie->uniquecharcount)) >= 0)
2656 && ((U32)offset < trie->lasttrans)
2657 && trie->trans[offset].check == state
2658 && (tmp=trie->trans[offset].next))
2660 DEBUG_TRIE_EXECUTE_r(
2661 Perl_re_printf( aTHX_ " - legal\n"));
2666 DEBUG_TRIE_EXECUTE_r(
2667 Perl_re_printf( aTHX_ " - fail\n"));
2669 state = aho->fail[state];
2673 /* we must be accepting here */
2674 DEBUG_TRIE_EXECUTE_r(
2675 Perl_re_printf( aTHX_ " - accepting\n"));
2684 if (!state) state = 1;
2687 if ( aho->states[ state ].wordnum ) {
2688 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2689 if (!leftmost || lpos < leftmost) {
2690 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2695 s = (char*)leftmost;
2696 DEBUG_TRIE_EXECUTE_r({
2697 Perl_re_printf( aTHX_ "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2698 (UV)accepted_word, (IV)(s - real_start)
2701 if (reginfo->intuit || regtry(reginfo, &s)) {
2707 DEBUG_TRIE_EXECUTE_r({
2708 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2711 DEBUG_TRIE_EXECUTE_r(
2712 Perl_re_printf( aTHX_ "No match.\n"));
2721 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2728 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2729 * flags have same meanings as with regexec_flags() */
2732 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2739 struct regexp *const prog = ReANY(rx);
2741 if (flags & REXEC_COPY_STR) {
2744 DEBUG_C(Perl_re_printf( aTHX_
2745 "Copy on write: regexp capture, type %d\n",
2747 /* Create a new COW SV to share the match string and store
2748 * in saved_copy, unless the current COW SV in saved_copy
2749 * is valid and suitable for our purpose */
2750 if (( prog->saved_copy
2751 && SvIsCOW(prog->saved_copy)
2752 && SvPOKp(prog->saved_copy)
2755 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2757 /* just reuse saved_copy SV */
2758 if (RXp_MATCH_COPIED(prog)) {
2759 Safefree(prog->subbeg);
2760 RXp_MATCH_COPIED_off(prog);
2764 /* create new COW SV to share string */
2765 RX_MATCH_COPY_FREE(rx);
2766 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2768 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2769 assert (SvPOKp(prog->saved_copy));
2770 prog->sublen = strend - strbeg;
2771 prog->suboffset = 0;
2772 prog->subcoffset = 0;
2777 SSize_t max = strend - strbeg;
2780 if ( (flags & REXEC_COPY_SKIP_POST)
2781 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2782 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2783 ) { /* don't copy $' part of string */
2786 /* calculate the right-most part of the string covered
2787 * by a capture. Due to lookahead, this may be to
2788 * the right of $&, so we have to scan all captures */
2789 while (n <= prog->lastparen) {
2790 if (prog->offs[n].end > max)
2791 max = prog->offs[n].end;
2795 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2796 ? prog->offs[0].start
2798 assert(max >= 0 && max <= strend - strbeg);
2801 if ( (flags & REXEC_COPY_SKIP_PRE)
2802 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2803 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2804 ) { /* don't copy $` part of string */
2807 /* calculate the left-most part of the string covered
2808 * by a capture. Due to lookbehind, this may be to
2809 * the left of $&, so we have to scan all captures */
2810 while (min && n <= prog->lastparen) {
2811 if ( prog->offs[n].start != -1
2812 && prog->offs[n].start < min)
2814 min = prog->offs[n].start;
2818 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2819 && min > prog->offs[0].end
2821 min = prog->offs[0].end;
2825 assert(min >= 0 && min <= max && min <= strend - strbeg);
2828 if (RX_MATCH_COPIED(rx)) {
2829 if (sublen > prog->sublen)
2831 (char*)saferealloc(prog->subbeg, sublen+1);
2834 prog->subbeg = (char*)safemalloc(sublen+1);
2835 Copy(strbeg + min, prog->subbeg, sublen, char);
2836 prog->subbeg[sublen] = '\0';
2837 prog->suboffset = min;
2838 prog->sublen = sublen;
2839 RX_MATCH_COPIED_on(rx);
2841 prog->subcoffset = prog->suboffset;
2842 if (prog->suboffset && utf8_target) {
2843 /* Convert byte offset to chars.
2844 * XXX ideally should only compute this if @-/@+
2845 * has been seen, a la PL_sawampersand ??? */
2847 /* If there's a direct correspondence between the
2848 * string which we're matching and the original SV,
2849 * then we can use the utf8 len cache associated with
2850 * the SV. In particular, it means that under //g,
2851 * sv_pos_b2u() will use the previously cached
2852 * position to speed up working out the new length of
2853 * subcoffset, rather than counting from the start of
2854 * the string each time. This stops
2855 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2856 * from going quadratic */
2857 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2858 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2859 SV_GMAGIC|SV_CONST_RETURN);
2861 prog->subcoffset = utf8_length((U8*)strbeg,
2862 (U8*)(strbeg+prog->suboffset));
2866 RX_MATCH_COPY_FREE(rx);
2867 prog->subbeg = strbeg;
2868 prog->suboffset = 0;
2869 prog->subcoffset = 0;
2870 prog->sublen = strend - strbeg;
2878 - regexec_flags - match a regexp against a string
2881 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2882 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2883 /* stringarg: the point in the string at which to begin matching */
2884 /* strend: pointer to null at end of string */
2885 /* strbeg: real beginning of string */
2886 /* minend: end of match must be >= minend bytes after stringarg. */
2887 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2888 * itself is accessed via the pointers above */
2889 /* data: May be used for some additional optimizations.
2890 Currently unused. */
2891 /* flags: For optimizations. See REXEC_* in regexp.h */
2894 struct regexp *const prog = ReANY(rx);
2898 SSize_t minlen; /* must match at least this many chars */
2899 SSize_t dontbother = 0; /* how many characters not to try at end */
2900 const bool utf8_target = cBOOL(DO_UTF8(sv));
2902 RXi_GET_DECL(prog,progi);
2903 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2904 regmatch_info *const reginfo = ®info_buf;
2905 regexp_paren_pair *swap = NULL;
2907 GET_RE_DEBUG_FLAGS_DECL;
2909 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2910 PERL_UNUSED_ARG(data);
2912 /* Be paranoid... */
2914 Perl_croak(aTHX_ "NULL regexp parameter");
2918 debug_start_match(rx, utf8_target, stringarg, strend,
2922 startpos = stringarg;
2924 /* set these early as they may be used by the HOP macros below */
2925 reginfo->strbeg = strbeg;
2926 reginfo->strend = strend;
2927 reginfo->is_utf8_target = cBOOL(utf8_target);
2929 if (prog->intflags & PREGf_GPOS_SEEN) {
2932 /* set reginfo->ganch, the position where \G can match */
2935 (flags & REXEC_IGNOREPOS)
2936 ? stringarg /* use start pos rather than pos() */
2937 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2938 /* Defined pos(): */
2939 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2940 : strbeg; /* pos() not defined; use start of string */
2942 DEBUG_GPOS_r(Perl_re_printf( aTHX_
2943 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2945 /* in the presence of \G, we may need to start looking earlier in
2946 * the string than the suggested start point of stringarg:
2947 * if prog->gofs is set, then that's a known, fixed minimum
2950 * /ab|c\G/: gofs = 1
2951 * or if the minimum offset isn't known, then we have to go back
2952 * to the start of the string, e.g. /w+\G/
2955 if (prog->intflags & PREGf_ANCH_GPOS) {
2957 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2959 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2961 DEBUG_r(Perl_re_printf( aTHX_
2962 "fail: ganch-gofs before earliest possible start\n"));
2967 startpos = reginfo->ganch;
2969 else if (prog->gofs) {
2970 startpos = HOPBACKc(startpos, prog->gofs);
2974 else if (prog->intflags & PREGf_GPOS_FLOAT)
2978 minlen = prog->minlen;
2979 if ((startpos + minlen) > strend || startpos < strbeg) {
2980 DEBUG_r(Perl_re_printf( aTHX_
2981 "Regex match can't succeed, so not even tried\n"));
2985 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2986 * which will call destuctors to reset PL_regmatch_state, free higher
2987 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2988 * regmatch_info_aux_eval */
2990 oldsave = PL_savestack_ix;
2994 if ((prog->extflags & RXf_USE_INTUIT)
2995 && !(flags & REXEC_CHECKED))
2997 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3002 if (prog->extflags & RXf_CHECK_ALL) {
3003 /* we can match based purely on the result of INTUIT.
3004 * Set up captures etc just for $& and $-[0]
3005 * (an intuit-only match wont have $1,$2,..) */
3006 assert(!prog->nparens);
3008 /* s/// doesn't like it if $& is earlier than where we asked it to
3009 * start searching (which can happen on something like /.\G/) */
3010 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3013 /* this should only be possible under \G */
3014 assert(prog->intflags & PREGf_GPOS_SEEN);
3015 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3016 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3020 /* match via INTUIT shouldn't have any captures.
3021 * Let @-, @+, $^N know */
3022 prog->lastparen = prog->lastcloseparen = 0;
3023 RX_MATCH_UTF8_set(rx, utf8_target);
3024 prog->offs[0].start = s - strbeg;
3025 prog->offs[0].end = utf8_target
3026 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3027 : s - strbeg + prog->minlenret;
3028 if ( !(flags & REXEC_NOT_FIRST) )
3029 S_reg_set_capture_string(aTHX_ rx,
3031 sv, flags, utf8_target);
3037 multiline = prog->extflags & RXf_PMf_MULTILINE;
3039 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3040 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3041 "String too short [regexec_flags]...\n"));
3045 /* Check validity of program. */
3046 if (UCHARAT(progi->program) != REG_MAGIC) {
3047 Perl_croak(aTHX_ "corrupted regexp program");
3050 RX_MATCH_TAINTED_off(rx);
3051 RX_MATCH_UTF8_set(rx, utf8_target);
3053 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3054 reginfo->intuit = 0;
3055 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3056 reginfo->warned = FALSE;
3058 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3059 /* see how far we have to get to not match where we matched before */
3060 reginfo->till = stringarg + minend;
3062 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3063 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3064 S_cleanup_regmatch_info_aux has executed (registered by
3065 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3066 magic belonging to this SV.
3067 Not newSVsv, either, as it does not COW.
3069 reginfo->sv = newSV(0);
3070 SvSetSV_nosteal(reginfo->sv, sv);
3071 SAVEFREESV(reginfo->sv);
3074 /* reserve next 2 or 3 slots in PL_regmatch_state:
3075 * slot N+0: may currently be in use: skip it
3076 * slot N+1: use for regmatch_info_aux struct
3077 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3078 * slot N+3: ready for use by regmatch()
3082 regmatch_state *old_regmatch_state;
3083 regmatch_slab *old_regmatch_slab;
3084 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3086 /* on first ever match, allocate first slab */
3087 if (!PL_regmatch_slab) {
3088 Newx(PL_regmatch_slab, 1, regmatch_slab);
3089 PL_regmatch_slab->prev = NULL;
3090 PL_regmatch_slab->next = NULL;
3091 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3094 old_regmatch_state = PL_regmatch_state;
3095 old_regmatch_slab = PL_regmatch_slab;
3097 for (i=0; i <= max; i++) {
3099 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3101 reginfo->info_aux_eval =
3102 reginfo->info_aux->info_aux_eval =
3103 &(PL_regmatch_state->u.info_aux_eval);
3105 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3106 PL_regmatch_state = S_push_slab(aTHX);
3109 /* note initial PL_regmatch_state position; at end of match we'll
3110 * pop back to there and free any higher slabs */
3112 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3113 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3114 reginfo->info_aux->poscache = NULL;
3116 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3118 if ((prog->extflags & RXf_EVAL_SEEN))
3119 S_setup_eval_state(aTHX_ reginfo);
3121 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3124 /* If there is a "must appear" string, look for it. */
3126 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3127 /* We have to be careful. If the previous successful match
3128 was from this regex we don't want a subsequent partially
3129 successful match to clobber the old results.
3130 So when we detect this possibility we add a swap buffer
3131 to the re, and switch the buffer each match. If we fail,
3132 we switch it back; otherwise we leave it swapped.
3135 /* do we need a save destructor here for eval dies? */
3136 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3137 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3138 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3146 if (prog->recurse_locinput)
3147 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3149 /* Simplest case: anchored match need be tried only once, or with
3150 * MBOL, only at the beginning of each line.
3152 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3153 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3154 * match at the start of the string then it won't match anywhere else
3155 * either; while with /.*.../, if it doesn't match at the beginning,
3156 * the earliest it could match is at the start of the next line */
3158 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3161 if (regtry(reginfo, &s))
3164 if (!(prog->intflags & PREGf_ANCH_MBOL))
3167 /* didn't match at start, try at other newline positions */
3170 dontbother = minlen - 1;
3171 end = HOP3c(strend, -dontbother, strbeg) - 1;
3173 /* skip to next newline */
3175 while (s <= end) { /* note it could be possible to match at the end of the string */
3176 /* NB: newlines are the same in unicode as they are in latin */
3179 if (prog->check_substr || prog->check_utf8) {
3180 /* note that with PREGf_IMPLICIT, intuit can only fail
3181 * or return the start position, so it's of limited utility.
3182 * Nevertheless, I made the decision that the potential for
3183 * quick fail was still worth it - DAPM */
3184 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3188 if (regtry(reginfo, &s))
3192 } /* end anchored search */
3194 if (prog->intflags & PREGf_ANCH_GPOS)
3196 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3197 assert(prog->intflags & PREGf_GPOS_SEEN);
3198 /* For anchored \G, the only position it can match from is
3199 * (ganch-gofs); we already set startpos to this above; if intuit
3200 * moved us on from there, we can't possibly succeed */
3201 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3202 if (s == startpos && regtry(reginfo, &s))
3207 /* Messy cases: unanchored match. */
3208 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3209 /* we have /x+whatever/ */
3210 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3216 if (! prog->anchored_utf8) {
3217 to_utf8_substr(prog);
3219 ch = SvPVX_const(prog->anchored_utf8)[0];
3222 DEBUG_EXECUTE_r( did_match = 1 );
3223 if (regtry(reginfo, &s)) goto got_it;
3225 while (s < strend && *s == ch)
3232 if (! prog->anchored_substr) {
3233 if (! to_byte_substr(prog)) {
3234 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3237 ch = SvPVX_const(prog->anchored_substr)[0];
3240 DEBUG_EXECUTE_r( did_match = 1 );
3241 if (regtry(reginfo, &s)) goto got_it;
3243 while (s < strend && *s == ch)
3248 DEBUG_EXECUTE_r(if (!did_match)
3249 Perl_re_printf( aTHX_
3250 "Did not find anchored character...\n")
3253 else if (prog->anchored_substr != NULL
3254 || prog->anchored_utf8 != NULL
3255 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3256 && prog->float_max_offset < strend - s)) {
3261 char *last1; /* Last position checked before */
3265 if (prog->anchored_substr || prog->anchored_utf8) {
3267 if (! prog->anchored_utf8) {
3268 to_utf8_substr(prog);
3270 must = prog->anchored_utf8;
3273 if (! prog->anchored_substr) {
3274 if (! to_byte_substr(prog)) {
3275 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3278 must = prog->anchored_substr;
3280 back_max = back_min = prog->anchored_offset;
3283 if (! prog->float_utf8) {
3284 to_utf8_substr(prog);
3286 must = prog->float_utf8;
3289 if (! prog->float_substr) {
3290 if (! to_byte_substr(prog)) {
3291 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3294 must = prog->float_substr;
3296 back_max = prog->float_max_offset;
3297 back_min = prog->float_min_offset;
3303 last = HOP3c(strend, /* Cannot start after this */
3304 -(SSize_t)(CHR_SVLEN(must)
3305 - (SvTAIL(must) != 0) + back_min), strbeg);
3307 if (s > reginfo->strbeg)
3308 last1 = HOPc(s, -1);
3310 last1 = s - 1; /* bogus */
3312 /* XXXX check_substr already used to find "s", can optimize if
3313 check_substr==must. */
3315 strend = HOPc(strend, -dontbother);
3316 while ( (s <= last) &&
3317 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3318 (unsigned char*)strend, must,
3319 multiline ? FBMrf_MULTILINE : 0)) ) {
3320 DEBUG_EXECUTE_r( did_match = 1 );
3321 if (HOPc(s, -back_max) > last1) {
3322 last1 = HOPc(s, -back_min);
3323 s = HOPc(s, -back_max);
3326 char * const t = (last1 >= reginfo->strbeg)
3327 ? HOPc(last1, 1) : last1 + 1;
3329 last1 = HOPc(s, -back_min);
3333 while (s <= last1) {
3334 if (regtry(reginfo, &s))
3337 s++; /* to break out of outer loop */
3344 while (s <= last1) {
3345 if (regtry(reginfo, &s))
3351 DEBUG_EXECUTE_r(if (!did_match) {
3352 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3353 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3354 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
3355 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3356 ? "anchored" : "floating"),
3357 quoted, RE_SV_TAIL(must));
3361 else if ( (c = progi->regstclass) ) {
3363 const OPCODE op = OP(progi->regstclass);
3364 /* don't bother with what can't match */
3365 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3366 strend = HOPc(strend, -(minlen - 1));
3369 SV * const prop = sv_newmortal();
3370 regprop(prog, prop, c, reginfo, NULL);
3372 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3374 Perl_re_printf( aTHX_
3375 "Matching stclass %.*s against %s (%d bytes)\n",
3376 (int)SvCUR(prop), SvPVX_const(prop),
3377 quoted, (int)(strend - s));
3380 if (find_byclass(prog, c, s, strend, reginfo))
3382 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
3386 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3394 if (! prog->float_utf8) {
3395 to_utf8_substr(prog);
3397 float_real = prog->float_utf8;
3400 if (! prog->float_substr) {
3401 if (! to_byte_substr(prog)) {
3402 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3405 float_real = prog->float_substr;
3408 little = SvPV_const(float_real, len);
3409 if (SvTAIL(float_real)) {
3410 /* This means that float_real contains an artificial \n on
3411 * the end due to the presence of something like this:
3412 * /foo$/ where we can match both "foo" and "foo\n" at the
3413 * end of the string. So we have to compare the end of the
3414 * string first against the float_real without the \n and
3415 * then against the full float_real with the string. We
3416 * have to watch out for cases where the string might be
3417 * smaller than the float_real or the float_real without
3419 char *checkpos= strend - len;
3421 Perl_re_printf( aTHX_
3422 "%sChecking for float_real.%s\n",
3423 PL_colors[4], PL_colors[5]));
3424 if (checkpos + 1 < strbeg) {
3425 /* can't match, even if we remove the trailing \n
3426 * string is too short to match */
3428 Perl_re_printf( aTHX_
3429 "%sString shorter than required trailing substring, cannot match.%s\n",
3430 PL_colors[4], PL_colors[5]));
3432 } else if (memEQ(checkpos + 1, little, len - 1)) {
3433 /* can match, the end of the string matches without the
3435 last = checkpos + 1;
3436 } else if (checkpos < strbeg) {
3437 /* cant match, string is too short when the "\n" is
3440 Perl_re_printf( aTHX_
3441 "%sString does not contain required trailing substring, cannot match.%s\n",
3442 PL_colors[4], PL_colors[5]));
3444 } else if (!multiline) {
3445 /* non multiline match, so compare with the "\n" at the
3446 * end of the string */
3447 if (memEQ(checkpos, little, len)) {
3451 Perl_re_printf( aTHX_
3452 "%sString does not contain required trailing substring, cannot match.%s\n",
3453 PL_colors[4], PL_colors[5]));
3457 /* multiline match, so we have to search for a place
3458 * where the full string is located */
3464 last = rninstr(s, strend, little, little + len);
3466 last = strend; /* matching "$" */
3469 /* at one point this block contained a comment which was
3470 * probably incorrect, which said that this was a "should not
3471 * happen" case. Even if it was true when it was written I am
3472 * pretty sure it is not anymore, so I have removed the comment
3473 * and replaced it with this one. Yves */
3475 Perl_re_printf( aTHX_
3476 "%sString does not contain required substring, cannot match.%s\n",
3477 PL_colors[4], PL_colors[5]
3481 dontbother = strend - last + prog->float_min_offset;
3483 if (minlen && (dontbother < minlen))
3484 dontbother = minlen - 1;
3485 strend -= dontbother; /* this one's always in bytes! */
3486 /* We don't know much -- general case. */
3489 if (regtry(reginfo, &s))
3498 if (regtry(reginfo, &s))
3500 } while (s++ < strend);
3508 /* s/// doesn't like it if $& is earlier than where we asked it to
3509 * start searching (which can happen on something like /.\G/) */
3510 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3511 && (prog->offs[0].start < stringarg - strbeg))
3513 /* this should only be possible under \G */
3514 assert(prog->intflags & PREGf_GPOS_SEEN);
3515 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3516 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3522 Perl_re_exec_indentf( aTHX_
3523 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3531 /* clean up; this will trigger destructors that will free all slabs
3532 * above the current one, and cleanup the regmatch_info_aux
3533 * and regmatch_info_aux_eval sructs */
3535 LEAVE_SCOPE(oldsave);
3537 if (RXp_PAREN_NAMES(prog))
3538 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3540 /* make sure $`, $&, $', and $digit will work later */
3541 if ( !(flags & REXEC_NOT_FIRST) )
3542 S_reg_set_capture_string(aTHX_ rx,
3543 strbeg, reginfo->strend,
3544 sv, flags, utf8_target);
3549 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
3550 PL_colors[4], PL_colors[5]));
3552 /* clean up; this will trigger destructors that will free all slabs
3553 * above the current one, and cleanup the regmatch_info_aux
3554 * and regmatch_info_aux_eval sructs */
3556 LEAVE_SCOPE(oldsave);
3559 /* we failed :-( roll it back */
3560 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3561 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3567 Safefree(prog->offs);
3574 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3575 * Do inc before dec, in case old and new rex are the same */
3576 #define SET_reg_curpm(Re2) \
3577 if (reginfo->info_aux_eval) { \
3578 (void)ReREFCNT_inc(Re2); \
3579 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3580 PM_SETRE((PL_reg_curpm), (Re2)); \
3585 - regtry - try match at specific point
3587 STATIC bool /* 0 failure, 1 success */
3588 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3591 REGEXP *const rx = reginfo->prog;
3592 regexp *const prog = ReANY(rx);
3595 U32 depth = 0; /* used by REGCP_SET */
3597 RXi_GET_DECL(prog,progi);
3598 GET_RE_DEBUG_FLAGS_DECL;
3600 PERL_ARGS_ASSERT_REGTRY;
3602 reginfo->cutpoint=NULL;
3604 prog->offs[0].start = *startposp - reginfo->strbeg;
3605 prog->lastparen = 0;
3606 prog->lastcloseparen = 0;
3608 /* XXXX What this code is doing here?!!! There should be no need
3609 to do this again and again, prog->lastparen should take care of
3612 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3613 * Actually, the code in regcppop() (which Ilya may be meaning by
3614 * prog->lastparen), is not needed at all by the test suite
3615 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3616 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3617 * Meanwhile, this code *is* needed for the
3618 * above-mentioned test suite tests to succeed. The common theme
3619 * on those tests seems to be returning null fields from matches.
3620 * --jhi updated by dapm */
3622 /* After encountering a variant of the issue mentioned above I think
3623 * the point Ilya was making is that if we properly unwind whenever
3624 * we set lastparen to a smaller value then we should not need to do
3625 * this every time, only when needed. So if we have tests that fail if
3626 * we remove this, then it suggests somewhere else we are improperly
3627 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3628 * places it is called, and related regcp() routines. - Yves */
3630 if (prog->nparens) {
3631 regexp_paren_pair *pp = prog->offs;
3633 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3641 result = regmatch(reginfo, *startposp, progi->program + 1);
3643 prog->offs[0].end = result;
3646 if (reginfo->cutpoint)
3647 *startposp= reginfo->cutpoint;
3648 REGCP_UNWIND(lastcp);
3653 #define sayYES goto yes
3654 #define sayNO goto no
3655 #define sayNO_SILENT goto no_silent
3657 /* we dont use STMT_START/END here because it leads to
3658 "unreachable code" warnings, which are bogus, but distracting. */
3659 #define CACHEsayNO \
3660 if (ST.cache_mask) \
3661 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3664 /* this is used to determine how far from the left messages like
3665 'failed...' are printed in regexec.c. It should be set such that
3666 messages are inline with the regop output that created them.
3668 #define REPORT_CODE_OFF 29
3669 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3672 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3676 PerlIO *f= Perl_debug_log;
3677 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3678 va_start(ap, depth);
3679 PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3680 result = PerlIO_vprintf(f, fmt, ap);
3684 #endif /* DEBUGGING */
3687 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3688 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3689 #define CHRTEST_NOT_A_CP_1 -999
3690 #define CHRTEST_NOT_A_CP_2 -998
3692 /* grab a new slab and return the first slot in it */
3694 STATIC regmatch_state *
3697 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3700 regmatch_slab *s = PL_regmatch_slab->next;
3702 Newx(s, 1, regmatch_slab);
3703 s->prev = PL_regmatch_slab;
3705 PL_regmatch_slab->next = s;
3707 PL_regmatch_slab = s;
3708 return SLAB_FIRST(s);
3712 /* push a new state then goto it */
3714 #define PUSH_STATE_GOTO(state, node, input) \
3715 pushinput = input; \
3717 st->resume_state = state; \
3720 /* push a new state with success backtracking, then goto it */
3722 #define PUSH_YES_STATE_GOTO(state, node, input) \
3723 pushinput = input; \
3725 st->resume_state = state; \
3726 goto push_yes_state;
3733 regmatch() - main matching routine
3735 This is basically one big switch statement in a loop. We execute an op,
3736 set 'next' to point the next op, and continue. If we come to a point which
3737 we may need to backtrack to on failure such as (A|B|C), we push a
3738 backtrack state onto the backtrack stack. On failure, we pop the top
3739 state, and re-enter the loop at the state indicated. If there are no more
3740 states to pop, we return failure.
3742 Sometimes we also need to backtrack on success; for example /A+/, where
3743 after successfully matching one A, we need to go back and try to
3744 match another one; similarly for lookahead assertions: if the assertion
3745 completes successfully, we backtrack to the state just before the assertion
3746 and then carry on. In these cases, the pushed state is marked as
3747 'backtrack on success too'. This marking is in fact done by a chain of
3748 pointers, each pointing to the previous 'yes' state. On success, we pop to
3749 the nearest yes state, discarding any intermediate failure-only states.
3750 Sometimes a yes state is pushed just to force some cleanup code to be
3751 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3752 it to free the inner regex.
3754 Note that failure backtracking rewinds the cursor position, while
3755 success backtracking leaves it alone.
3757 A pattern is complete when the END op is executed, while a subpattern
3758 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3759 ops trigger the "pop to last yes state if any, otherwise return true"
3762 A common convention in this function is to use A and B to refer to the two
3763 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3764 the subpattern to be matched possibly multiple times, while B is the entire
3765 rest of the pattern. Variable and state names reflect this convention.
3767 The states in the main switch are the union of ops and failure/success of
3768 substates associated with with that op. For example, IFMATCH is the op
3769 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3770 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3771 successfully matched A and IFMATCH_A_fail is a state saying that we have
3772 just failed to match A. Resume states always come in pairs. The backtrack
3773 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3774 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3775 on success or failure.
3777 The struct that holds a backtracking state is actually a big union, with
3778 one variant for each major type of op. The variable st points to the
3779 top-most backtrack struct. To make the code clearer, within each
3780 block of code we #define ST to alias the relevant union.
3782 Here's a concrete example of a (vastly oversimplified) IFMATCH
3788 #define ST st->u.ifmatch
3790 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3791 ST.foo = ...; // some state we wish to save
3793 // push a yes backtrack state with a resume value of
3794 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3796 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3799 case IFMATCH_A: // we have successfully executed A; now continue with B
3801 bar = ST.foo; // do something with the preserved value
3804 case IFMATCH_A_fail: // A failed, so the assertion failed
3805 ...; // do some housekeeping, then ...
3806 sayNO; // propagate the failure
3813 For any old-timers reading this who are familiar with the old recursive
3814 approach, the code above is equivalent to:
3816 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3825 ...; // do some housekeeping, then ...
3826 sayNO; // propagate the failure
3829 The topmost backtrack state, pointed to by st, is usually free. If you
3830 want to claim it, populate any ST.foo fields in it with values you wish to
3831 save, then do one of
3833 PUSH_STATE_GOTO(resume_state, node, newinput);
3834 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3836 which sets that backtrack state's resume value to 'resume_state', pushes a
3837 new free entry to the top of the backtrack stack, then goes to 'node'.
3838 On backtracking, the free slot is popped, and the saved state becomes the
3839 new free state. An ST.foo field in this new top state can be temporarily
3840 accessed to retrieve values, but once the main loop is re-entered, it
3841 becomes available for reuse.
3843 Note that the depth of the backtrack stack constantly increases during the
3844 left-to-right execution of the pattern, rather than going up and down with
3845 the pattern nesting. For example the stack is at its maximum at Z at the
3846 end of the pattern, rather than at X in the following:
3848 /(((X)+)+)+....(Y)+....Z/
3850 The only exceptions to this are lookahead/behind assertions and the cut,
3851 (?>A), which pop all the backtrack states associated with A before
3854 Backtrack state structs are allocated in slabs of about 4K in size.
3855 PL_regmatch_state and st always point to the currently active state,
3856 and PL_regmatch_slab points to the slab currently containing
3857 PL_regmatch_state. The first time regmatch() is called, the first slab is
3858 allocated, and is never freed until interpreter destruction. When the slab
3859 is full, a new one is allocated and chained to the end. At exit from
3860 regmatch(), slabs allocated since entry are freed.
3865 #define DEBUG_STATE_pp(pp) \
3867 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
3868 Perl_re_printf( aTHX_ \
3869 "%*s" pp " %s%s%s%s%s\n", \
3870 INDENT_CHARS(depth), "", \
3871 PL_reg_name[st->resume_state], \
3872 ((st==yes_state||st==mark_state) ? "[" : ""), \
3873 ((st==yes_state) ? "Y" : ""), \
3874 ((st==mark_state) ? "M" : ""), \
3875 ((st==yes_state||st==mark_state) ? "]" : "") \
3880 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3885 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3886 const char *start, const char *end, const char *blurb)
3888 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3890 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3895 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3896 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3898 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3899 start, end - start, 60);
3901 Perl_re_printf( aTHX_
3902 "%s%s REx%s %s against %s\n",
3903 PL_colors[4], blurb, PL_colors[5], s0, s1);
3905 if (utf8_target||utf8_pat)
3906 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
3907 utf8_pat ? "pattern" : "",
3908 utf8_pat && utf8_target ? " and " : "",
3909 utf8_target ? "string" : ""
3915 S_dump_exec_pos(pTHX_ const char *locinput,
3916 const regnode *scan,
3917 const char *loc_regeol,
3918 const char *loc_bostr,
3919 const char *loc_reg_starttry,
3920 const bool utf8_target,
3924 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3925 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3926 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3927 /* The part of the string before starttry has one color
3928 (pref0_len chars), between starttry and current
3929 position another one (pref_len - pref0_len chars),
3930 after the current position the third one.
3931 We assume that pref0_len <= pref_len, otherwise we
3932 decrease pref0_len. */
3933 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3934 ? (5 + taill) - l : locinput - loc_bostr;
3937 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3939 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3941 pref0_len = pref_len - (locinput - loc_reg_starttry);
3942 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3943 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3944 ? (5 + taill) - pref_len : loc_regeol - locinput);
3945 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3949 if (pref0_len > pref_len)
3950 pref0_len = pref_len;
3952 const int is_uni = utf8_target ? 1 : 0;
3954 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3955 (locinput - pref_len),pref0_len, 60, 4, 5);
3957 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3958 (locinput - pref_len + pref0_len),
3959 pref_len - pref0_len, 60, 2, 3);
3961 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3962 locinput, loc_regeol - locinput, 10, 0, 1);
3964 const STRLEN tlen=len0+len1+len2;
3965 Perl_re_printf( aTHX_
3966 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
3967 (IV)(locinput - loc_bostr),
3970 (docolor ? "" : "> <"),
3972 (int)(tlen > 19 ? 0 : 19 - tlen),
3980 /* reg_check_named_buff_matched()
3981 * Checks to see if a named buffer has matched. The data array of
3982 * buffer numbers corresponding to the buffer is expected to reside
3983 * in the regexp->data->data array in the slot stored in the ARG() of
3984 * node involved. Note that this routine doesn't actually care about the
3985 * name, that information is not preserved from compilation to execution.
3986 * Returns the index of the leftmost defined buffer with the given name
3987 * or 0 if non of the buffers matched.
3990 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3993 RXi_GET_DECL(rex,rexi);
3994 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3995 I32 *nums=(I32*)SvPVX(sv_dat);
3997 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3999 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4000 if ((I32)rex->lastparen >= nums[n] &&
4001 rex->offs[nums[n]].end != -1)
4011 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4012 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4014 /* This function determines if there are one or two characters that match
4015 * the first character of the passed-in EXACTish node <text_node>, and if
4016 * so, returns them in the passed-in pointers.
4018 * If it determines that no possible character in the target string can
4019 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
4020 * the first character in <text_node> requires UTF-8 to represent, and the
4021 * target string isn't in UTF-8.)
4023 * If there are more than two characters that could match the beginning of
4024 * <text_node>, or if more context is required to determine a match or not,
4025 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4027 * The motiviation behind this function is to allow the caller to set up
4028 * tight loops for matching. If <text_node> is of type EXACT, there is
4029 * only one possible character that can match its first character, and so
4030 * the situation is quite simple. But things get much more complicated if
4031 * folding is involved. It may be that the first character of an EXACTFish
4032 * node doesn't participate in any possible fold, e.g., punctuation, so it
4033 * can be matched only by itself. The vast majority of characters that are
4034 * in folds match just two things, their lower and upper-case equivalents.
4035 * But not all are like that; some have multiple possible matches, or match
4036 * sequences of more than one character. This function sorts all that out.
4038 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
4039 * loop of trying to match A*, we know we can't exit where the thing
4040 * following it isn't a B. And something can't be a B unless it is the
4041 * beginning of B. By putting a quick test for that beginning in a tight
4042 * loop, we can rule out things that can't possibly be B without having to
4043 * break out of the loop, thus avoiding work. Similarly, if A is a single
4044 * character, we can make a tight loop matching A*, using the outputs of
4047 * If the target string to match isn't in UTF-8, and there aren't
4048 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4049 * the one or two possible octets (which are characters in this situation)
4050 * that can match. In all cases, if there is only one character that can
4051 * match, *<c1p> and *<c2p> will be identical.
4053 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4054 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4055 * can match the beginning of <text_node>. They should be declared with at
4056 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
4057 * undefined what these contain.) If one or both of the buffers are
4058 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4059 * corresponding invariant. If variant, the corresponding *<c1p> and/or
4060 * *<c2p> will be set to a negative number(s) that shouldn't match any code
4061 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
4062 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4064 const bool utf8_target = reginfo->is_utf8_target;
4066 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4067 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4068 bool use_chrtest_void = FALSE;
4069 const bool is_utf8_pat = reginfo->is_utf8_pat;
4071 /* Used when we have both utf8 input and utf8 output, to avoid converting
4072 * to/from code points */
4073 bool utf8_has_been_setup = FALSE;
4077 U8 *pat = (U8*)STRING(text_node);
4078 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4080 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4082 /* In an exact node, only one thing can be matched, that first
4083 * character. If both the pat and the target are UTF-8, we can just
4084 * copy the input to the output, avoiding finding the code point of
4089 else if (utf8_target) {
4090 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4091 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4092 utf8_has_been_setup = TRUE;
4095 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4098 else { /* an EXACTFish node */
4099 U8 *pat_end = pat + STR_LEN(text_node);
4101 /* An EXACTFL node has at least some characters unfolded, because what
4102 * they match is not known until now. So, now is the time to fold
4103 * the first few of them, as many as are needed to determine 'c1' and
4104 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
4105 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4106 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
4107 * need to fold as many characters as a single character can fold to,
4108 * so that later we can check if the first ones are such a multi-char
4109 * fold. But, in such a pattern only locale-problematic characters
4110 * aren't folded, so we can skip this completely if the first character
4111 * in the node isn't one of the tricky ones */
4112 if (OP(text_node) == EXACTFL) {
4114 if (! is_utf8_pat) {
4115 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4117 folded[0] = folded[1] = 's';
4119 pat_end = folded + 2;
4122 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4127 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4129 *(d++) = (U8) toFOLD_LC(*s);
4134 _to_utf8_fold_flags(s,
4137 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4148 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4149 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4151 /* Multi-character folds require more context to sort out. Also
4152 * PL_utf8_foldclosures used below doesn't handle them, so have to
4153 * be handled outside this routine */
4154 use_chrtest_void = TRUE;
4156 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4157 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4159 /* Load the folds hash, if not already done */
4161 if (! PL_utf8_foldclosures) {
4162 _load_PL_utf8_foldclosures();
4165 /* The fold closures data structure is a hash with the keys
4166 * being the UTF-8 of every character that is folded to, like
4167 * 'k', and the values each an array of all code points that
4168 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
4169 * Multi-character folds are not included */
4170 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4175 /* Not found in the hash, therefore there are no folds
4176 * containing it, so there is only a single character that
4180 else { /* Does participate in folds */
4181 AV* list = (AV*) *listp;
4182 if (av_tindex_nomg(list) != 1) {
4184 /* If there aren't exactly two folds to this, it is
4185 * outside the scope of this function */
4186 use_chrtest_void = TRUE;
4188 else { /* There are two. Get them */
4189 SV** c_p = av_fetch(list, 0, FALSE);
4191 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4195 c_p = av_fetch(list, 1, FALSE);
4197 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4201 /* Folds that cross the 255/256 boundary are forbidden
4202 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4203 * one is ASCIII. Since the pattern character is above
4204 * 255, and its only other match is below 256, the only
4205 * legal match will be to itself. We have thrown away
4206 * the original, so have to compute which is the one
4208 if ((c1 < 256) != (c2 < 256)) {
4209 if ((OP(text_node) == EXACTFL
4210 && ! IN_UTF8_CTYPE_LOCALE)
4211 || ((OP(text_node) == EXACTFA
4212 || OP(text_node) == EXACTFA_NO_TRIE)
4213 && (isASCII(c1) || isASCII(c2))))
4226 else /* Here, c1 is <= 255 */
4228 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4229 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4230 && ((OP(text_node) != EXACTFA
4231 && OP(text_node) != EXACTFA_NO_TRIE)
4234 /* Here, there could be something above Latin1 in the target
4235 * which folds to this character in the pattern. All such
4236 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4237 * than two characters involved in their folds, so are outside
4238 * the scope of this function */
4239 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4240 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4243 use_chrtest_void = TRUE;
4246 else { /* Here nothing above Latin1 can fold to the pattern
4248 switch (OP(text_node)) {
4250 case EXACTFL: /* /l rules */
4251 c2 = PL_fold_locale[c1];
4254 case EXACTF: /* This node only generated for non-utf8
4256 assert(! is_utf8_pat);
4257 if (! utf8_target) { /* /d rules */
4262 /* /u rules for all these. This happens to work for
4263 * EXACTFA as nothing in Latin1 folds to ASCII */
4264 case EXACTFA_NO_TRIE: /* This node only generated for
4265 non-utf8 patterns */
4266 assert(! is_utf8_pat);
4271 c2 = PL_fold_latin1[c1];
4275 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4276 NOT_REACHED; /* NOTREACHED */
4282 /* Here have figured things out. Set up the returns */
4283 if (use_chrtest_void) {
4284 *c2p = *c1p = CHRTEST_VOID;
4286 else if (utf8_target) {
4287 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4288 uvchr_to_utf8(c1_utf8, c1);
4289 uvchr_to_utf8(c2_utf8, c2);
4292 /* Invariants are stored in both the utf8 and byte outputs; Use
4293 * negative numbers otherwise for the byte ones. Make sure that the
4294 * byte ones are the same iff the utf8 ones are the same */
4295 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4296 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4299 ? CHRTEST_NOT_A_CP_1
4300 : CHRTEST_NOT_A_CP_2;
4302 else if (c1 > 255) {
4303 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4308 *c1p = *c2p = c2; /* c2 is the only representable value */
4310 else { /* c1 is representable; see about c2 */
4312 *c2p = (c2 < 256) ? c2 : c1;
4319 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4321 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4322 * between the inputs. See http://www.unicode.org/reports/tr29/. */
4324 PERL_ARGS_ASSERT_ISGCB;
4326 switch (GCB_table[before][after]) {
4333 case GCB_RI_then_RI:
4336 U8 * temp_pos = (U8 *) curpos;
4338 /* Do not break within emoji flag sequences. That is, do not
4339 * break between regional indicator (RI) symbols if there is an
4340 * odd number of RI characters before the break point.
4341 * GB12 ^ (RI RI)* RI × RI
4342 * GB13 [^RI] (RI RI)* RI × RI */
4344 while (backup_one_GCB(strbeg,
4346 utf8_target) == GCB_Regional_Indicator)
4351 return RI_count % 2 != 1;
4354 case GCB_EX_then_EM:
4356 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
4358 U8 * temp_pos = (U8 *) curpos;
4362 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4364 while (prev == GCB_Extend);
4366 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4374 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4375 before, after, GCB_table[before][after]);
4382 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4386 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4388 if (*curpos < strbeg) {
4393 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4394 U8 * prev_prev_char_pos;
4396 if (! prev_char_pos) {
4400 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4401 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4402 *curpos = prev_char_pos;
4403 prev_char_pos = prev_prev_char_pos;
4406 *curpos = (U8 *) strbeg;
4411 if (*curpos - 2 < strbeg) {
4412 *curpos = (U8 *) strbeg;
4416 gcb = getGCB_VAL_CP(*(*curpos - 1));
4422 /* Combining marks attach to most classes that precede them, but this defines
4423 * the exceptions (from TR14) */
4424 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
4425 || prev == LB_Mandatory_Break \
4426 || prev == LB_Carriage_Return \
4427 || prev == LB_Line_Feed \
4428 || prev == LB_Next_Line \
4429 || prev == LB_Space \
4430 || prev == LB_ZWSpace))
4433 S_isLB(pTHX_ LB_enum before,
4435 const U8 * const strbeg,
4436 const U8 * const curpos,
4437 const U8 * const strend,
4438 const bool utf8_target)
4440 U8 * temp_pos = (U8 *) curpos;
4441 LB_enum prev = before;
4443 /* Is the boundary between 'before' and 'after' line-breakable?
4444 * Most of this is just a table lookup of a generated table from Unicode
4445 * rules. But some rules require context to decide, and so have to be
4446 * implemented in code */
4448 PERL_ARGS_ASSERT_ISLB;
4450 /* Rule numbers in the comments below are as of Unicode 9.0 */
4454 switch (LB_table[before][after]) {
4459 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4462 case LB_SP_foo + LB_BREAKABLE:
4463 case LB_SP_foo + LB_NOBREAK:
4464 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4466 /* When we have something following a SP, we have to look at the
4467 * context in order to know what to do.
4469 * SP SP should not reach here because LB7: Do not break before
4470 * spaces. (For two spaces in a row there is nothing that
4471 * overrides that) */
4472 assert(after != LB_Space);
4474 /* Here we have a space followed by a non-space. Mostly this is a
4475 * case of LB18: "Break after spaces". But there are complications
4476 * as the handling of spaces is somewhat tricky. They are in a
4477 * number of rules, which have to be applied in priority order, but
4478 * something earlier in the string can cause a rule to be skipped
4479 * and a lower priority rule invoked. A prime example is LB7 which
4480 * says don't break before a space. But rule LB8 (lower priority)
4481 * says that the first break opportunity after a ZW is after any
4482 * span of spaces immediately after it. If a ZW comes before a SP
4483 * in the input, rule LB8 applies, and not LB7. Other such rules
4484 * involve combining marks which are rules 9 and 10, but they may
4485 * override higher priority rules if they come earlier in the
4486 * string. Since we're doing random access into the middle of the
4487 * string, we have to look for rules that should get applied based
4488 * on both string position and priority. Combining marks do not
4489 * attach to either ZW nor SP, so we don't have to consider them
4492 * To check for LB8, we have to find the first non-space character
4493 * before this span of spaces */
4495 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4497 while (prev == LB_Space);
4499 /* LB8 Break before any character following a zero-width space,
4500 * even if one or more spaces intervene.
4502 * So if we have a ZW just before this span, and to get here this
4503 * is the final space in the span. */
4504 if (prev == LB_ZWSpace) {
4508 /* Here, not ZW SP+. There are several rules that have higher
4509 * priority than LB18 and can be resolved now, as they don't depend
4510 * on anything earlier in the string (except ZW, which we have
4511 * already handled). One of these rules is LB11 Do not break
4512 * before Word joiner, but we have specially encoded that in the
4513 * lookup table so it is caught by the single test below which
4514 * catches the other ones. */
4515 if (LB_table[LB_Space][after] - LB_SP_foo
4516 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4521 /* If we get here, we have to XXX consider combining marks. */
4522 if (prev == LB_Combining_Mark) {
4524 /* What happens with these depends on the character they
4527 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4529 while (prev == LB_Combining_Mark);
4531 /* Most times these attach to and inherit the characteristics
4532 * of that character, but not always, and when not, they are to
4533 * be treated as AL by rule LB10. */
4534 if (! LB_CM_ATTACHES_TO(prev)) {
4535 prev = LB_Alphabetic;
4539 /* Here, we have the character preceding the span of spaces all set
4540 * up. We follow LB18: "Break after spaces" unless the table shows
4541 * that is overriden */
4542 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4546 /* We don't know how to treat the CM except by looking at the first
4547 * non-CM character preceding it. ZWJ is treated as CM */
4549 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4551 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4553 /* Here, 'prev' is that first earlier non-CM character. If the CM
4554 * attatches to it, then it inherits the behavior of 'prev'. If it
4555 * doesn't attach, it is to be treated as an AL */
4556 if (! LB_CM_ATTACHES_TO(prev)) {
4557 prev = LB_Alphabetic;
4562 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4563 case LB_HY_or_BA_then_foo + LB_NOBREAK:
4565 /* LB21a Don't break after Hebrew + Hyphen.
4566 * HL (HY | BA) × */
4568 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4569 == LB_Hebrew_Letter)
4574 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4576 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4577 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4579 /* LB25a (PR | PO) × ( OP | HY )? NU */
4580 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4584 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4587 case LB_SY_or_IS_then_various + LB_BREAKABLE:
4588 case LB_SY_or_IS_then_various + LB_NOBREAK:
4590 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4592 LB_enum temp = prev;
4594 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4596 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4597 if (temp == LB_Numeric) {
4601 return LB_table[prev][after] - LB_SY_or_IS_then_various
4605 case LB_various_then_PO_or_PR + LB_BREAKABLE:
4606 case LB_various_then_PO_or_PR + LB_NOBREAK:
4608 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4610 LB_enum temp = prev;
4611 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4613 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4615 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4616 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4618 if (temp == LB_Numeric) {
4621 return LB_various_then_PO_or_PR;
4624 case LB_RI_then_RI + LB_NOBREAK:
4625 case LB_RI_then_RI + LB_BREAKABLE:
4629 /* LB30a Break between two regional indicator symbols if and
4630 * only if there are an even number of regional indicators
4631 * preceding the position of the break.
4633 * sot (RI RI)* RI × RI
4634 * [^RI] (RI RI)* RI × RI */
4636 while (backup_one_LB(strbeg,
4638 utf8_target) == LB_Regional_Indicator)
4643 return RI_count % 2 == 0;
4651 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4652 before, after, LB_table[before][after]);
4659 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4663 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4665 if (*curpos >= strend) {
4670 *curpos += UTF8SKIP(*curpos);
4671 if (*curpos >= strend) {
4674 lb = getLB_VAL_UTF8(*curpos, strend);
4678 if (*curpos >= strend) {
4681 lb = getLB_VAL_CP(**curpos);
4688 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4692 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4694 if (*curpos < strbeg) {
4699 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4700 U8 * prev_prev_char_pos;
4702 if (! prev_char_pos) {
4706 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4707 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4708 *curpos = prev_char_pos;
4709 prev_char_pos = prev_prev_char_pos;
4712 *curpos = (U8 *) strbeg;
4717 if (*curpos - 2 < strbeg) {
4718 *curpos = (U8 *) strbeg;
4722 lb = getLB_VAL_CP(*(*curpos - 1));
4729 S_isSB(pTHX_ SB_enum before,
4731 const U8 * const strbeg,
4732 const U8 * const curpos,
4733 const U8 * const strend,
4734 const bool utf8_target)
4736 /* returns a boolean indicating if there is a Sentence Boundary Break
4737 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4739 U8 * lpos = (U8 *) curpos;
4740 bool has_para_sep = FALSE;
4741 bool has_sp = FALSE;
4743 PERL_ARGS_ASSERT_ISSB;
4745 /* Break at the start and end of text.
4748 But unstated in Unicode is don't break if the text is empty */
4749 if (before == SB_EDGE || after == SB_EDGE) {
4750 return before != after;
4753 /* SB 3: Do not break within CRLF. */
4754 if (before == SB_CR && after == SB_LF) {
4758 /* Break after paragraph separators. CR and LF are considered
4759 * so because Unicode views text as like word processing text where there
4760 * are no newlines except between paragraphs, and the word processor takes
4761 * care of wrapping without there being hard line-breaks in the text *./
4762 SB4. Sep | CR | LF ÷ */
4763 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4767 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4768 * (See Section 6.2, Replacing Ignore Rules.)
4769 SB5. X (Extend | Format)* → X */
4770 if (after == SB_Extend || after == SB_Format) {
4772 /* Implied is that the these characters attach to everything
4773 * immediately prior to them except for those separator-type
4774 * characters. And the rules earlier have already handled the case
4775 * when one of those immediately precedes the extend char */
4779 if (before == SB_Extend || before == SB_Format) {
4780 U8 * temp_pos = lpos;
4781 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4782 if ( backup != SB_EDGE
4791 /* Here, both 'before' and 'backup' are these types; implied is that we
4792 * don't break between them */
4793 if (backup == SB_Extend || backup == SB_Format) {
4798 /* Do not break after ambiguous terminators like period, if they are
4799 * immediately followed by a number or lowercase letter, if they are
4800 * between uppercase letters, if the first following letter (optionally
4801 * after certain punctuation) is lowercase, or if they are followed by
4802 * "continuation" punctuation such as comma, colon, or semicolon. For
4803 * example, a period may be an abbreviation or numeric period, and thus may
4804 * not mark the end of a sentence.
4806 * SB6. ATerm × Numeric */
4807 if (before == SB_ATerm && after == SB_Numeric) {
4811 /* SB7. (Upper | Lower) ATerm × Upper */
4812 if (before == SB_ATerm && after == SB_Upper) {
4813 U8 * temp_pos = lpos;
4814 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4815 if (backup == SB_Upper || backup == SB_Lower) {
4820 /* The remaining rules that aren't the final one, all require an STerm or
4821 * an ATerm after having backed up over some Close* Sp*, and in one case an
4822 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4823 * So do that backup now, setting flags if either Sp or a paragraph
4824 * separator are found */
4826 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4827 has_para_sep = TRUE;
4828 before = backup_one_SB(strbeg, &lpos, utf8_target);
4831 if (before == SB_Sp) {
4834 before = backup_one_SB(strbeg, &lpos, utf8_target);
4836 while (before == SB_Sp);
4839 while (before == SB_Close) {
4840 before = backup_one_SB(strbeg, &lpos, utf8_target);
4843 /* The next few rules apply only when the backed-up-to is an ATerm, and in
4844 * most cases an STerm */
4845 if (before == SB_STerm || before == SB_ATerm) {
4847 /* So, here the lhs matches
4848 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4849 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4850 * The rules that apply here are:
4852 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
4853 | LF | STerm | ATerm) )* Lower
4854 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
4855 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
4856 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
4857 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
4860 /* And all but SB11 forbid having seen a paragraph separator */
4861 if (! has_para_sep) {
4862 if (before == SB_ATerm) { /* SB8 */
4863 U8 * rpos = (U8 *) curpos;
4864 SB_enum later = after;
4866 while ( later != SB_OLetter
4867 && later != SB_Upper
4868 && later != SB_Lower
4872 && later != SB_STerm
4873 && later != SB_ATerm
4874 && later != SB_EDGE)
4876 later = advance_one_SB(&rpos, strend, utf8_target);
4878 if (later == SB_Lower) {
4883 if ( after == SB_SContinue /* SB8a */
4884 || after == SB_STerm
4885 || after == SB_ATerm)
4890 if (! has_sp) { /* SB9 applies only if there was no Sp* */
4891 if ( after == SB_Close
4901 /* SB10. This and SB9 could probably be combined some way, but khw
4902 * has decided to follow the Unicode rule book precisely for
4903 * simplified maintenance */
4917 /* Otherwise, do not break.
4924 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4928 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4930 if (*curpos >= strend) {
4936 *curpos += UTF8SKIP(*curpos);
4937 if (*curpos >= strend) {
4940 sb = getSB_VAL_UTF8(*curpos, strend);
4941 } while (sb == SB_Extend || sb == SB_Format);
4946 if (*curpos >= strend) {
4949 sb = getSB_VAL_CP(**curpos);
4950 } while (sb == SB_Extend || sb == SB_Format);
4957 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4961 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4963 if (*curpos < strbeg) {
4968 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4969 if (! prev_char_pos) {
4973 /* Back up over Extend and Format. curpos is always just to the right
4974 * of the characater whose value we are getting */
4976 U8 * prev_prev_char_pos;
4977 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4980 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4981 *curpos = prev_char_pos;
4982 prev_char_pos = prev_prev_char_pos;
4985 *curpos = (U8 *) strbeg;
4988 } while (sb == SB_Extend || sb == SB_Format);
4992 if (*curpos - 2 < strbeg) {
4993 *curpos = (U8 *) strbeg;
4997 sb = getSB_VAL_CP(*(*curpos - 1));
4998 } while (sb == SB_Extend || sb == SB_Format);
5005 S_isWB(pTHX_ WB_enum previous,
5008 const U8 * const strbeg,
5009 const U8 * const curpos,
5010 const U8 * const strend,
5011 const bool utf8_target)
5013 /* Return a boolean as to if the boundary between 'before' and 'after' is
5014 * a Unicode word break, using their published algorithm, but tailored for
5015 * Perl by treating spans of white space as one unit. Context may be
5016 * needed to make this determination. If the value for the character
5017 * before 'before' is known, it is passed as 'previous'; otherwise that
5018 * should be set to WB_UNKNOWN. The other input parameters give the
5019 * boundaries and current position in the matching of the string. That
5020 * is, 'curpos' marks the position where the character whose wb value is
5021 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5023 U8 * before_pos = (U8 *) curpos;
5024 U8 * after_pos = (U8 *) curpos;
5025 WB_enum prev = before;
5028 PERL_ARGS_ASSERT_ISWB;
5030 /* Rule numbers in the comments below are as of Unicode 9.0 */
5034 switch (WB_table[before][after]) {
5041 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5042 next = advance_one_WB(&after_pos, strend, utf8_target,
5043 FALSE /* Don't skip Extend nor Format */ );
5044 /* A space immediately preceeding an Extend or Format is attached
5045 * to by them, and hence gets separated from previous spaces.
5046 * Otherwise don't break between horizontal white space */
5047 return next == WB_Extend || next == WB_Format;
5049 /* WB4 Ignore Format and Extend characters, except when they appear at
5050 * the beginning of a region of text. This code currently isn't
5051 * general purpose, but it works as the rules are currently and likely
5052 * to be laid out. The reason it works is that when 'they appear at
5053 * the beginning of a region of text', the rule is to break before
5054 * them, just like any other character. Therefore, the default rule
5055 * applies and we don't have to look in more depth. Should this ever
5056 * change, we would have to have 2 'case' statements, like in the rules
5057 * below, and backup a single character (not spacing over the extend
5058 * ones) and then see if that is one of the region-end characters and
5060 case WB_Ex_or_FO_or_ZWJ_then_foo:
5061 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5064 case WB_DQ_then_HL + WB_BREAKABLE:
5065 case WB_DQ_then_HL + WB_NOBREAK:
5067 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5069 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5070 == WB_Hebrew_Letter)
5075 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5077 case WB_HL_then_DQ + WB_BREAKABLE:
5078 case WB_HL_then_DQ + WB_NOBREAK:
5080 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5082 if (advance_one_WB(&after_pos, strend, utf8_target,
5083 TRUE /* Do skip Extend and Format */ )
5084 == WB_Hebrew_Letter)
5089 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5091 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5092 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5094 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5095 * | Single_Quote) (ALetter | Hebrew_Letter) */
5097 next = advance_one_WB(&after_pos, strend, utf8_target,
5098 TRUE /* Do skip Extend and Format */ );
5100 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5105 return WB_table[before][after]
5106 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5108 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5109 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5111 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5112 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5114 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5115 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5120 return WB_table[before][after]
5121 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5123 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5124 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5126 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5129 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5135 return WB_table[before][after]
5136 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5138 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5139 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5141 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5143 if (advance_one_WB(&after_pos, strend, utf8_target,
5144 TRUE /* Do skip Extend and Format */ )
5150 return WB_table[before][after]
5151 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5153 case WB_RI_then_RI + WB_NOBREAK:
5154 case WB_RI_then_RI + WB_BREAKABLE:
5158 /* Do not break within emoji flag sequences. That is, do not
5159 * break between regional indicator (RI) symbols if there is an
5160 * odd number of RI characters before the potential break
5163 * WB15 ^ (RI RI)* RI × RI
5164 * WB16 [^RI] (RI RI)* RI × RI */
5166 while (backup_one_WB(&previous,
5169 utf8_target) == WB_Regional_Indicator)
5174 return RI_count % 2 != 1;
5182 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5183 before, after, WB_table[before][after]);
5190 S_advance_one_WB(pTHX_ U8 ** curpos,
5191 const U8 * const strend,
5192 const bool utf8_target,
5193 const bool skip_Extend_Format)
5197 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5199 if (*curpos >= strend) {
5205 /* Advance over Extend and Format */
5207 *curpos += UTF8SKIP(*curpos);
5208 if (*curpos >= strend) {
5211 wb = getWB_VAL_UTF8(*curpos, strend);
5212 } while ( skip_Extend_Format
5213 && (wb == WB_Extend || wb == WB_Format));
5218 if (*curpos >= strend) {
5221 wb = getWB_VAL_CP(**curpos);
5222 } while ( skip_Extend_Format
5223 && (wb == WB_Extend || wb == WB_Format));
5230 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5234 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5236 /* If we know what the previous character's break value is, don't have
5238 if (*previous != WB_UNKNOWN) {
5241 /* But we need to move backwards by one */
5243 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5245 *previous = WB_EDGE;
5246 *curpos = (U8 *) strbeg;
5249 *previous = WB_UNKNOWN;
5254 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5257 /* And we always back up over these three types */
5258 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5263 if (*curpos < strbeg) {
5268 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5269 if (! prev_char_pos) {
5273 /* Back up over Extend and Format. curpos is always just to the right
5274 * of the characater whose value we are getting */
5276 U8 * prev_prev_char_pos;
5277 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5281 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5282 *curpos = prev_char_pos;
5283 prev_char_pos = prev_prev_char_pos;
5286 *curpos = (U8 *) strbeg;
5289 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5293 if (*curpos - 2 < strbeg) {
5294 *curpos = (U8 *) strbeg;
5298 wb = getWB_VAL_CP(*(*curpos - 1));
5299 } while (wb == WB_Extend || wb == WB_Format);
5305 #define EVAL_CLOSE_PAREN_IS(st,expr) \
5308 ( ( st )->u.eval.close_paren ) && \
5309 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5312 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
5315 ( ( st )->u.eval.close_paren ) && \
5317 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5321 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5322 (st)->u.eval.close_paren = ( (expr) + 1 )
5324 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5325 (st)->u.eval.close_paren = 0
5327 /* returns -1 on failure, $+[0] on success */
5329 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5332 #if PERL_VERSION < 9 && !defined(PERL_CORE)
5336 const bool utf8_target = reginfo->is_utf8_target;
5337 const U32 uniflags = UTF8_ALLOW_DEFAULT;
5338 REGEXP *rex_sv = reginfo->prog;
5339 regexp *rex = ReANY(rex_sv);
5340 RXi_GET_DECL(rex,rexi);
5341 /* the current state. This is a cached copy of PL_regmatch_state */
5343 /* cache heavy used fields of st in registers */
5346 U32 n = 0; /* general value; init to avoid compiler warning */
5347 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
5348 char *locinput = startpos;
5349 char *pushinput; /* where to continue after a PUSH */
5350 I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
5352 bool result = 0; /* return value of S_regmatch */
5353 U32 depth = 0; /* depth of backtrack stack */
5354 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5355 const U32 max_nochange_depth =
5356 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5357 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5358 regmatch_state *yes_state = NULL; /* state to pop to on success of
5360 /* mark_state piggy backs on the yes_state logic so that when we unwind
5361 the stack on success we can update the mark_state as we go */
5362 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5363 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5364 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
5366 bool no_final = 0; /* prevent failure from backtracking? */
5367 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
5368 char *startpoint = locinput;
5369 SV *popmark = NULL; /* are we looking for a mark? */
5370 SV *sv_commit = NULL; /* last mark name seen in failure */
5371 SV *sv_yes_mark = NULL; /* last mark name we have seen
5372 during a successful match */
5373 U32 lastopen = 0; /* last open we saw */
5374 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
5375 SV* const oreplsv = GvSVn(PL_replgv);
5376 /* these three flags are set by various ops to signal information to
5377 * the very next op. They have a useful lifetime of exactly one loop
5378 * iteration, and are not preserved or restored by state pushes/pops
5380 bool sw = 0; /* the condition value in (?(cond)a|b) */
5381 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
5382 int logical = 0; /* the following EVAL is:
5386 or the following IFMATCH/UNLESSM is:
5387 false: plain (?=foo)
5388 true: used as a condition: (?(?=foo))
5390 PAD* last_pad = NULL;
5392 U8 gimme = G_SCALAR;
5393 CV *caller_cv = NULL; /* who called us */
5394 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
5395 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
5396 U32 maxopenparen = 0; /* max '(' index seen so far */
5397 int to_complement; /* Invert the result? */
5398 _char_class_number classnum;
5399 bool is_utf8_pat = reginfo->is_utf8_pat;
5402 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5403 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5404 # define SOLARIS_BAD_OPTIMIZER
5405 const U32 *pl_charclass_dup = PL_charclass;
5406 # define PL_charclass pl_charclass_dup
5410 GET_RE_DEBUG_FLAGS_DECL;
5413 /* protect against undef(*^R) */
5414 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5416 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5417 multicall_oldcatch = 0;
5418 PERL_UNUSED_VAR(multicall_cop);
5420 PERL_ARGS_ASSERT_REGMATCH;
5422 st = PL_regmatch_state;
5424 /* Note that nextchr is a byte even in UTF */
5428 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5429 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5430 Perl_re_printf( aTHX_ "regmatch start\n" );
5433 while (scan != NULL) {
5436 next = scan + NEXT_OFF(scan);
5439 state_num = OP(scan);
5443 if (state_num <= REGNODE_MAX) {
5444 SV * const prop = sv_newmortal();
5445 regnode *rnext = regnext(scan);
5447 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5448 regprop(rex, prop, scan, reginfo, NULL);
5449 Perl_re_printf( aTHX_
5450 "%*s%"IVdf":%s(%"IVdf")\n",
5451 INDENT_CHARS(depth), "",
5452 (IV)(scan - rexi->program),
5454 (PL_regkind[OP(scan)] == END || !rnext) ?
5455 0 : (IV)(rnext - rexi->program));
5462 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5464 switch (state_num) {
5465 case SBOL: /* /^../ and /\A../ */
5466 if (locinput == reginfo->strbeg)
5470 case MBOL: /* /^../m */
5471 if (locinput == reginfo->strbeg ||
5472 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5479 if (locinput == reginfo->ganch)
5483 case KEEPS: /* \K */
5484 /* update the startpoint */
5485 st->u.keeper.val = rex->offs[0].start;
5486 rex->offs[0].start = locinput - reginfo->strbeg;
5487 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5488 NOT_REACHED; /* NOTREACHED */
5490 case KEEPS_next_fail:
5491 /* rollback the start point change */
5492 rex->offs[0].start = st->u.keeper.val;
5494 NOT_REACHED; /* NOTREACHED */
5496 case MEOL: /* /..$/m */
5497 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5501 case SEOL: /* /..$/ */
5502 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5504 if (reginfo->strend - locinput > 1)
5509 if (!NEXTCHR_IS_EOS)
5513 case SANY: /* /./s */
5516 goto increment_locinput;
5518 case REG_ANY: /* /./ */
5519 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5521 goto increment_locinput;
5525 #define ST st->u.trie
5526 case TRIEC: /* (ab|cd) with known charclass */
5527 /* In this case the charclass data is available inline so
5528 we can fail fast without a lot of extra overhead.
5530 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5532 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
5533 depth, PL_colors[4], PL_colors[5])
5536 NOT_REACHED; /* NOTREACHED */
5539 case TRIE: /* (ab|cd) */
5540 /* the basic plan of execution of the trie is:
5541 * At the beginning, run though all the states, and
5542 * find the longest-matching word. Also remember the position
5543 * of the shortest matching word. For example, this pattern:
5546 * when matched against the string "abcde", will generate
5547 * accept states for all words except 3, with the longest
5548 * matching word being 4, and the shortest being 2 (with
5549 * the position being after char 1 of the string).
5551 * Then for each matching word, in word order (i.e. 1,2,4,5),
5552 * we run the remainder of the pattern; on each try setting
5553 * the current position to the character following the word,
5554 * returning to try the next word on failure.
5556 * We avoid having to build a list of words at runtime by
5557 * using a compile-time structure, wordinfo[].prev, which
5558 * gives, for each word, the previous accepting word (if any).
5559 * In the case above it would contain the mappings 1->2, 2->0,
5560 * 3->0, 4->5, 5->1. We can use this table to generate, from
5561 * the longest word (4 above), a list of all words, by
5562 * following the list of prev pointers; this gives us the
5563 * unordered list 4,5,1,2. Then given the current word we have
5564 * just tried, we can go through the list and find the
5565 * next-biggest word to try (so if we just failed on word 2,
5566 * the next in the list is 4).
5568 * Since at runtime we don't record the matching position in
5569 * the string for each word, we have to work that out for
5570 * each word we're about to process. The wordinfo table holds
5571 * the character length of each word; given that we recorded
5572 * at the start: the position of the shortest word and its
5573 * length in chars, we just need to move the pointer the
5574 * difference between the two char lengths. Depending on
5575 * Unicode status and folding, that's cheap or expensive.
5577 * This algorithm is optimised for the case where are only a
5578 * small number of accept states, i.e. 0,1, or maybe 2.
5579 * With lots of accepts states, and having to try all of them,
5580 * it becomes quadratic on number of accept states to find all
5585 /* what type of TRIE am I? (utf8 makes this contextual) */
5586 DECL_TRIE_TYPE(scan);
5588 /* what trie are we using right now */
5589 reg_trie_data * const trie
5590 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5591 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5592 U32 state = trie->startstate;
5594 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5595 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5597 && UTF8_IS_ABOVE_LATIN1(nextchr)
5598 && scan->flags == EXACTL)
5600 /* We only output for EXACTL, as we let the folder
5601 * output this message for EXACTFLU8 to avoid
5603 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5608 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5610 if (trie->states[ state ].wordnum) {
5612 Perl_re_exec_indentf( aTHX_ "%smatched empty string...%s\n",
5613 depth, PL_colors[4], PL_colors[5])
5619 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
5620 depth, PL_colors[4], PL_colors[5])
5627 U8 *uc = ( U8* )locinput;
5631 U8 *uscan = (U8*)NULL;
5632 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5633 U32 charcount = 0; /* how many input chars we have matched */
5634 U32 accepted = 0; /* have we seen any accepting states? */
5636 ST.jump = trie->jump;
5639 ST.longfold = FALSE; /* char longer if folded => it's harder */
5642 /* fully traverse the TRIE; note the position of the
5643 shortest accept state and the wordnum of the longest
5646 while ( state && uc <= (U8*)(reginfo->strend) ) {
5647 U32 base = trie->states[ state ].trans.base;
5651 wordnum = trie->states[ state ].wordnum;
5653 if (wordnum) { /* it's an accept state */
5656 /* record first match position */
5658 ST.firstpos = (U8*)locinput;
5663 ST.firstchars = charcount;
5666 if (!ST.nextword || wordnum < ST.nextword)
5667 ST.nextword = wordnum;
5668 ST.topword = wordnum;
5671 DEBUG_TRIE_EXECUTE_r({
5672 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5674 PerlIO_printf( Perl_debug_log,
5675 "%*s%sState: %4"UVxf" Accepted: %c ",
5676 INDENT_CHARS(depth), "", PL_colors[4],
5677 (UV)state, (accepted ? 'Y' : 'N'));
5680 /* read a char and goto next state */
5681 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5683 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5684 uscan, len, uvc, charid, foldlen,
5691 base + charid - 1 - trie->uniquecharcount)) >= 0)
5693 && ((U32)offset < trie->lasttrans)
5694 && trie->trans[offset].check == state)
5696 state = trie->trans[offset].next;
5707 DEBUG_TRIE_EXECUTE_r(
5708 Perl_re_printf( aTHX_
5709 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5710 charid, uvc, (UV)state, PL_colors[5] );
5716 /* calculate total number of accept states */
5721 w = trie->wordinfo[w].prev;
5724 ST.accepted = accepted;
5728 Perl_re_exec_indentf( aTHX_ "%sgot %"IVdf" possible matches%s\n",
5730 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5732 goto trie_first_try; /* jump into the fail handler */
5734 NOT_REACHED; /* NOTREACHED */
5736 case TRIE_next_fail: /* we failed - try next alternative */
5740 REGCP_UNWIND(ST.cp);
5741 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5743 if (!--ST.accepted) {
5745 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
5753 /* Find next-highest word to process. Note that this code
5754 * is O(N^2) per trie run (O(N) per branch), so keep tight */
5757 U16 const nextword = ST.nextword;
5758 reg_trie_wordinfo * const wordinfo
5759 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5760 for (word=ST.topword; word; word=wordinfo[word].prev) {
5761 if (word > nextword && (!min || word < min))
5774 ST.lastparen = rex->lastparen;
5775 ST.lastcloseparen = rex->lastcloseparen;
5779 /* find start char of end of current word */
5781 U32 chars; /* how many chars to skip */
5782 reg_trie_data * const trie
5783 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5785 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5787 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5792 /* the hard option - fold each char in turn and find
5793 * its folded length (which may be different */
5794 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5802 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5810 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5815 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5831 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5832 ? ST.jump[ST.nextword]
5836 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
5844 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
5845 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5846 NOT_REACHED; /* NOTREACHED */
5848 /* only one choice left - just continue */
5850 AV *const trie_words
5851 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5852 SV ** const tmp = trie_words
5853 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5854 SV *sv= tmp ? sv_newmortal() : NULL;
5856 Perl_re_exec_indentf( aTHX_ "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5857 depth, PL_colors[4],
5859 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5860 PL_colors[0], PL_colors[1],
5861 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5863 : "not compiled under -Dr",
5867 locinput = (char*)uc;
5868 continue; /* execute rest of RE */
5873 case EXACTL: /* /abc/l */
5874 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5876 /* Complete checking would involve going through every character
5877 * matched by the string to see if any is above latin1. But the
5878 * comparision otherwise might very well be a fast assembly
5879 * language routine, and I (khw) don't think slowing things down
5880 * just to check for this warning is worth it. So this just checks
5881 * the first character */
5882 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5883 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5886 case EXACT: { /* /abc/ */
5887 char *s = STRING(scan);
5889 if (utf8_target != is_utf8_pat) {
5890 /* The target and the pattern have differing utf8ness. */
5892 const char * const e = s + ln;
5895 /* The target is utf8, the pattern is not utf8.
5896 * Above-Latin1 code points can't match the pattern;
5897 * invariants match exactly, and the other Latin1 ones need
5898 * to be downgraded to a single byte in order to do the
5899 * comparison. (If we could be confident that the target
5900 * is not malformed, this could be refactored to have fewer
5901 * tests by just assuming that if the first bytes match, it
5902 * is an invariant, but there are tests in the test suite
5903 * dealing with (??{...}) which violate this) */
5905 if (l >= reginfo->strend
5906 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5910 if (UTF8_IS_INVARIANT(*(U8*)l)) {
5917 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5927 /* The target is not utf8, the pattern is utf8. */
5929 if (l >= reginfo->strend
5930 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5934 if (UTF8_IS_INVARIANT(*(U8*)s)) {
5941 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5953 /* The target and the pattern have the same utf8ness. */
5954 /* Inline the first character, for speed. */
5955 if (reginfo->strend - locinput < ln
5956 || UCHARAT(s) != nextchr
5957 || (ln > 1 && memNE(s, locinput, ln)))
5966 case EXACTFL: { /* /abc/il */
5968 const U8 * fold_array;
5970 U32 fold_utf8_flags;
5972 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5973 folder = foldEQ_locale;
5974 fold_array = PL_fold_locale;
5975 fold_utf8_flags = FOLDEQ_LOCALE;
5978 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
5979 is effectively /u; hence to match, target
5981 if (! utf8_target) {
5984 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5985 | FOLDEQ_S1_FOLDS_SANE;
5986 folder = foldEQ_latin1;
5987 fold_array = PL_fold_latin1;
5990 case EXACTFU_SS: /* /\x{df}/iu */
5991 case EXACTFU: /* /abc/iu */
5992 folder = foldEQ_latin1;
5993 fold_array = PL_fold_latin1;
5994 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5997 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
5999 assert(! is_utf8_pat);
6001 case EXACTFA: /* /abc/iaa */
6002 folder = foldEQ_latin1;
6003 fold_array = PL_fold_latin1;
6004 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6007 case EXACTF: /* /abc/i This node only generated for
6008 non-utf8 patterns */
6009 assert(! is_utf8_pat);
6011 fold_array = PL_fold;
6012 fold_utf8_flags = 0;
6020 || state_num == EXACTFU_SS
6021 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6023 /* Either target or the pattern are utf8, or has the issue where
6024 * the fold lengths may differ. */
6025 const char * const l = locinput;
6026 char *e = reginfo->strend;
6028 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
6029 l, &e, 0, utf8_target, fold_utf8_flags))
6037 /* Neither the target nor the pattern are utf8 */
6038 if (UCHARAT(s) != nextchr
6040 && UCHARAT(s) != fold_array[nextchr])
6044 if (reginfo->strend - locinput < ln)
6046 if (ln > 1 && ! folder(s, locinput, ln))
6052 case NBOUNDL: /* /\B/l */
6056 case BOUNDL: /* /\b/l */
6059 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6061 if (FLAGS(scan) != TRADITIONAL_BOUND) {
6062 if (! IN_UTF8_CTYPE_LOCALE) {
6063 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6064 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6070 if (locinput == reginfo->strbeg)
6071 b1 = isWORDCHAR_LC('\n');
6073 b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
6074 (U8*)(reginfo->strbeg)));
6076 b2 = (NEXTCHR_IS_EOS)
6077 ? isWORDCHAR_LC('\n')
6078 : isWORDCHAR_LC_utf8((U8*)locinput);
6080 else { /* Here the string isn't utf8 */
6081 b1 = (locinput == reginfo->strbeg)
6082 ? isWORDCHAR_LC('\n')
6083 : isWORDCHAR_LC(UCHARAT(locinput - 1));
6084 b2 = (NEXTCHR_IS_EOS)
6085 ? isWORDCHAR_LC('\n')
6086 : isWORDCHAR_LC(nextchr);
6088 if (to_complement ^ (b1 == b2)) {
6094 case NBOUND: /* /\B/ */
6098 case BOUND: /* /\b/ */
6102 goto bound_ascii_match_only;
6104 case NBOUNDA: /* /\B/a */
6108 case BOUNDA: /* /\b/a */
6112 bound_ascii_match_only:
6113 /* Here the string isn't utf8, or is utf8 and only ascii characters
6114 * are to match \w. In the latter case looking at the byte just
6115 * prior to the current one may be just the final byte of a
6116 * multi-byte character. This is ok. There are two cases:
6117 * 1) it is a single byte character, and then the test is doing
6118 * just what it's supposed to.
6119 * 2) it is a multi-byte character, in which case the final byte is
6120 * never mistakable for ASCII, and so the test will say it is
6121 * not a word character, which is the correct answer. */
6122 b1 = (locinput == reginfo->strbeg)
6123 ? isWORDCHAR_A('\n')
6124 : isWORDCHAR_A(UCHARAT(locinput - 1));
6125 b2 = (NEXTCHR_IS_EOS)
6126 ? isWORDCHAR_A('\n')
6127 : isWORDCHAR_A(nextchr);
6128 if (to_complement ^ (b1 == b2)) {
6134 case NBOUNDU: /* /\B/u */
6138 case BOUNDU: /* /\b/u */
6141 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6144 else if (utf8_target) {
6146 switch((bound_type) FLAGS(scan)) {
6147 case TRADITIONAL_BOUND:
6150 b1 = (locinput == reginfo->strbeg)
6151 ? 0 /* isWORDCHAR_L1('\n') */
6152 : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
6153 (U8*)(reginfo->strbeg)));
6154 b2 = (NEXTCHR_IS_EOS)
6155 ? 0 /* isWORDCHAR_L1('\n') */
6156 : isWORDCHAR_utf8((U8*)locinput);
6157 match = cBOOL(b1 != b2);
6161 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6162 match = TRUE; /* GCB always matches at begin and
6166 /* Find the gcb values of previous and current
6167 * chars, then see if is a break point */
6168 match = isGCB(getGCB_VAL_UTF8(
6169 reghop3((U8*)locinput,
6171 (U8*)(reginfo->strbeg)),
6172 (U8*) reginfo->strend),
6173 getGCB_VAL_UTF8((U8*) locinput,
6174 (U8*) reginfo->strend),
6175 (U8*) reginfo->strbeg,
6182 if (locinput == reginfo->strbeg) {
6185 else if (NEXTCHR_IS_EOS) {
6189 match = isLB(getLB_VAL_UTF8(
6190 reghop3((U8*)locinput,
6192 (U8*)(reginfo->strbeg)),
6193 (U8*) reginfo->strend),
6194 getLB_VAL_UTF8((U8*) locinput,
6195 (U8*) reginfo->strend),
6196 (U8*) reginfo->strbeg,
6198 (U8*) reginfo->strend,
6203 case SB_BOUND: /* Always matches at begin and end */
6204 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6208 match = isSB(getSB_VAL_UTF8(
6209 reghop3((U8*)locinput,
6211 (U8*)(reginfo->strbeg)),
6212 (U8*) reginfo->strend),
6213 getSB_VAL_UTF8((U8*) locinput,
6214 (U8*) reginfo->strend),
6215 (U8*) reginfo->strbeg,
6217 (U8*) reginfo->strend,
6223 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6227 match = isWB(WB_UNKNOWN,
6229 reghop3((U8*)locinput,
6231 (U8*)(reginfo->strbeg)),
6232 (U8*) reginfo->strend),
6233 getWB_VAL_UTF8((U8*) locinput,
6234 (U8*) reginfo->strend),
6235 (U8*) reginfo->strbeg,
6237 (U8*) reginfo->strend,
6243 else { /* Not utf8 target */
6244 switch((bound_type) FLAGS(scan)) {
6245 case TRADITIONAL_BOUND:
6248 b1 = (locinput == reginfo->strbeg)
6249 ? 0 /* isWORDCHAR_L1('\n') */
6250 : isWORDCHAR_L1(UCHARAT(locinput - 1));
6251 b2 = (NEXTCHR_IS_EOS)
6252 ? 0 /* isWORDCHAR_L1('\n') */
6253 : isWORDCHAR_L1(nextchr);
6254 match = cBOOL(b1 != b2);
6259 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6260 match = TRUE; /* GCB always matches at begin and
6263 else { /* Only CR-LF combo isn't a GCB in 0-255
6265 match = UCHARAT(locinput - 1) != '\r'
6266 || UCHARAT(locinput) != '\n';
6271 if (locinput == reginfo->strbeg) {
6274 else if (NEXTCHR_IS_EOS) {
6278 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6279 getLB_VAL_CP(UCHARAT(locinput)),
6280 (U8*) reginfo->strbeg,
6282 (U8*) reginfo->strend,
6287 case SB_BOUND: /* Always matches at begin and end */
6288 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6292 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6293 getSB_VAL_CP(UCHARAT(locinput)),
6294 (U8*) reginfo->strbeg,
6296 (U8*) reginfo->strend,
6302 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6306 match = isWB(WB_UNKNOWN,
6307 getWB_VAL_CP(UCHARAT(locinput -1)),
6308 getWB_VAL_CP(UCHARAT(locinput)),
6309 (U8*) reginfo->strbeg,
6311 (U8*) reginfo->strend,
6318 if (to_complement ^ ! match) {
6323 case ANYOFL: /* /[abc]/l */
6324 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6326 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6328 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6331 case ANYOFD: /* /[abc]/d */
6332 case ANYOF: /* /[abc]/ */
6335 if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6336 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6339 locinput += UTF8SKIP(locinput);
6342 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6348 /* The argument (FLAGS) to all the POSIX node types is the class number
6351 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
6355 case POSIXL: /* \w or [:punct:] etc. under /l */
6356 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6360 /* Use isFOO_lc() for characters within Latin1. (Note that
6361 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6362 * wouldn't be invariant) */
6363 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6364 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6372 if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
6373 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6374 goto utf8_posix_above_latin1;
6377 /* Here is a UTF-8 variant code point below 256 and the target is
6379 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6380 EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6381 *(locinput + 1))))))
6386 goto increment_locinput;
6388 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
6392 case POSIXD: /* \w or [:punct:] etc. under /d */
6398 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
6400 if (NEXTCHR_IS_EOS) {
6404 /* All UTF-8 variants match */
6405 if (! UTF8_IS_INVARIANT(nextchr)) {
6406 goto increment_locinput;
6412 case POSIXA: /* \w or [:punct:] etc. under /a */
6415 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6416 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6417 * character is a single byte */
6419 if (NEXTCHR_IS_EOS) {
6425 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6431 /* Here we are either not in utf8, or we matched a utf8-invariant,
6432 * so the next char is the next byte */
6436 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
6440 case POSIXU: /* \w or [:punct:] etc. under /u */
6442 if (NEXTCHR_IS_EOS) {
6446 /* Use _generic_isCC() for characters within Latin1. (Note that
6447 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6448 * wouldn't be invariant) */
6449 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6450 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6457 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
6458 if (! (to_complement
6459 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6467 else { /* Handle above Latin-1 code points */
6468 utf8_posix_above_latin1:
6469 classnum = (_char_class_number) FLAGS(scan);
6470 if (classnum < _FIRST_NON_SWASH_CC) {
6472 /* Here, uses a swash to find such code points. Load if if
6473 * not done already */
6474 if (! PL_utf8_swash_ptrs[classnum]) {
6475 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6476 PL_utf8_swash_ptrs[classnum]
6477 = _core_swash_init("utf8",
6480 PL_XPosix_ptrs[classnum], &flags);
6482 if (! (to_complement
6483 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6484 (U8 *) locinput, TRUE))))
6489 else { /* Here, uses macros to find above Latin-1 code points */
6491 case _CC_ENUM_SPACE:
6492 if (! (to_complement
6493 ^ cBOOL(is_XPERLSPACE_high(locinput))))
6498 case _CC_ENUM_BLANK:
6499 if (! (to_complement
6500 ^ cBOOL(is_HORIZWS_high(locinput))))
6505 case _CC_ENUM_XDIGIT:
6506 if (! (to_complement
6507 ^ cBOOL(is_XDIGIT_high(locinput))))
6512 case _CC_ENUM_VERTSPACE:
6513 if (! (to_complement
6514 ^ cBOOL(is_VERTWS_high(locinput))))
6519 default: /* The rest, e.g. [:cntrl:], can't match
6521 if (! to_complement) {
6527 locinput += UTF8SKIP(locinput);
6531 case CLUMP: /* Match \X: logical Unicode character. This is defined as
6532 a Unicode extended Grapheme Cluster */
6535 if (! utf8_target) {
6537 /* Match either CR LF or '.', as all the other possibilities
6539 locinput++; /* Match the . or CR */
6540 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6542 && locinput < reginfo->strend
6543 && UCHARAT(locinput) == '\n')
6550 /* Get the gcb type for the current character */
6551 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6552 (U8*) reginfo->strend);
6554 /* Then scan through the input until we get to the first
6555 * character whose type is supposed to be a gcb with the
6556 * current character. (There is always a break at the
6558 locinput += UTF8SKIP(locinput);
6559 while (locinput < reginfo->strend) {
6560 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6561 (U8*) reginfo->strend);
6562 if (isGCB(prev_gcb, cur_gcb,
6563 (U8*) reginfo->strbeg, (U8*) locinput,
6570 locinput += UTF8SKIP(locinput);
6577 case NREFFL: /* /\g{name}/il */
6578 { /* The capture buffer cases. The ones beginning with N for the
6579 named buffers just convert to the equivalent numbered and
6580 pretend they were called as the corresponding numbered buffer
6582 /* don't initialize these in the declaration, it makes C++
6587 const U8 *fold_array;
6590 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6591 folder = foldEQ_locale;
6592 fold_array = PL_fold_locale;
6594 utf8_fold_flags = FOLDEQ_LOCALE;
6597 case NREFFA: /* /\g{name}/iaa */
6598 folder = foldEQ_latin1;
6599 fold_array = PL_fold_latin1;
6601 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6604 case NREFFU: /* /\g{name}/iu */
6605 folder = foldEQ_latin1;
6606 fold_array = PL_fold_latin1;
6608 utf8_fold_flags = 0;
6611 case NREFF: /* /\g{name}/i */
6613 fold_array = PL_fold;
6615 utf8_fold_flags = 0;
6618 case NREF: /* /\g{name}/ */
6622 utf8_fold_flags = 0;
6625 /* For the named back references, find the corresponding buffer
6627 n = reg_check_named_buff_matched(rex,scan);
6632 goto do_nref_ref_common;
6634 case REFFL: /* /\1/il */
6635 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6636 folder = foldEQ_locale;
6637 fold_array = PL_fold_locale;
6638 utf8_fold_flags = FOLDEQ_LOCALE;
6641 case REFFA: /* /\1/iaa */
6642 folder = foldEQ_latin1;
6643 fold_array = PL_fold_latin1;
6644 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6647 case REFFU: /* /\1/iu */
6648 folder = foldEQ_latin1;
6649 fold_array = PL_fold_latin1;
6650 utf8_fold_flags = 0;
6653 case REFF: /* /\1/i */
6655 fold_array = PL_fold;
6656 utf8_fold_flags = 0;
6659 case REF: /* /\1/ */
6662 utf8_fold_flags = 0;
6666 n = ARG(scan); /* which paren pair */
6669 ln = rex->offs[n].start;
6670 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6671 if (rex->lastparen < n || ln == -1)
6672 sayNO; /* Do not match unless seen CLOSEn. */
6673 if (ln == rex->offs[n].end)
6676 s = reginfo->strbeg + ln;
6677 if (type != REF /* REF can do byte comparison */
6678 && (utf8_target || type == REFFU || type == REFFL))
6680 char * limit = reginfo->strend;
6682 /* This call case insensitively compares the entire buffer
6683 * at s, with the current input starting at locinput, but
6684 * not going off the end given by reginfo->strend, and
6685 * returns in <limit> upon success, how much of the
6686 * current input was matched */
6687 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6688 locinput, &limit, 0, utf8_target, utf8_fold_flags))
6696 /* Not utf8: Inline the first character, for speed. */
6697 if (!NEXTCHR_IS_EOS &&
6698 UCHARAT(s) != nextchr &&
6700 UCHARAT(s) != fold_array[nextchr]))
6702 ln = rex->offs[n].end - ln;
6703 if (locinput + ln > reginfo->strend)
6705 if (ln > 1 && (type == REF
6706 ? memNE(s, locinput, ln)
6707 : ! folder(s, locinput, ln)))
6713 case NOTHING: /* null op; e.g. the 'nothing' following
6714 * the '*' in m{(a+|b)*}' */
6716 case TAIL: /* placeholder while compiling (A|B|C) */
6720 #define ST st->u.eval
6721 #define CUR_EVAL cur_eval->u.eval
6727 regexp_internal *rei;
6728 regnode *startpoint;
6731 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
6732 arg= (U32)ARG(scan);
6733 if (cur_eval && cur_eval->locinput == locinput) {
6734 if ( ++nochange_depth > max_nochange_depth )
6736 "Pattern subroutine nesting without pos change"
6737 " exceeded limit in regex");
6744 startpoint = scan + ARG2L(scan);
6745 EVAL_CLOSE_PAREN_SET( st, arg );
6746 /* Detect infinite recursion
6748 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6749 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6750 * So we track the position in the string we are at each time
6751 * we recurse and if we try to enter the same routine twice from
6752 * the same position we throw an error.
6754 if ( rex->recurse_locinput[arg] == locinput ) {
6755 /* FIXME: we should show the regop that is failing as part
6756 * of the error message. */
6757 Perl_croak(aTHX_ "Infinite recursion in regex");
6759 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6760 rex->recurse_locinput[arg]= locinput;
6763 GET_RE_DEBUG_FLAGS_DECL;
6765 Perl_re_exec_indentf( aTHX_
6766 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6767 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6773 /* Save all the positions seen so far. */
6774 ST.cp = regcppush(rex, 0, maxopenparen);
6775 REGCP_SET(ST.lastcp);
6777 /* and then jump to the code we share with EVAL */
6778 goto eval_recurse_doit;
6781 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
6782 if (cur_eval && cur_eval->locinput==locinput) {
6783 if ( ++nochange_depth > max_nochange_depth )
6784 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6789 /* execute the code in the {...} */
6793 OP * const oop = PL_op;
6794 COP * const ocurcop = PL_curcop;
6798 /* save *all* paren positions */
6799 regcppush(rex, 0, maxopenparen);
6800 REGCP_SET(runops_cp);
6803 caller_cv = find_runcv(NULL);
6807 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6809 (REGEXP*)(rexi->data->data[n])
6811 nop = (OP*)rexi->data->data[n+1];
6813 else if (rexi->data->what[n] == 'l') { /* literal code */
6815 nop = (OP*)rexi->data->data[n];
6816 assert(CvDEPTH(newcv));
6819 /* literal with own CV */
6820 assert(rexi->data->what[n] == 'L');
6821 newcv = rex->qr_anoncv;
6822 nop = (OP*)rexi->data->data[n];
6825 /* normally if we're about to execute code from the same
6826 * CV that we used previously, we just use the existing
6827 * CX stack entry. However, its possible that in the
6828 * meantime we may have backtracked, popped from the save
6829 * stack, and undone the SAVECOMPPAD(s) associated with
6830 * PUSH_MULTICALL; in which case PL_comppad no longer
6831 * points to newcv's pad. */
6832 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6834 U8 flags = (CXp_SUB_RE |
6835 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6836 if (last_pushed_cv) {
6837 /* PUSH/POP_MULTICALL save and restore the
6838 * caller's PL_comppad; if we call multiple subs
6839 * using the same CX block, we have to save and
6840 * unwind the varying PL_comppad's ourselves,
6841 * especially restoring the right PL_comppad on
6842 * backtrack - so save it on the save stack */
6844 CHANGE_MULTICALL_FLAGS(newcv, flags);
6847 PUSH_MULTICALL_FLAGS(newcv, flags);
6849 last_pushed_cv = newcv;
6852 /* these assignments are just to silence compiler
6854 multicall_cop = NULL;
6856 last_pad = PL_comppad;
6858 /* the initial nextstate you would normally execute
6859 * at the start of an eval (which would cause error
6860 * messages to come from the eval), may be optimised
6861 * away from the execution path in the regex code blocks;
6862 * so manually set PL_curcop to it initially */
6864 OP *o = cUNOPx(nop)->op_first;
6865 assert(o->op_type == OP_NULL);
6866 if (o->op_targ == OP_SCOPE) {
6867 o = cUNOPo->op_first;
6870 assert(o->op_targ == OP_LEAVE);
6871 o = cUNOPo->op_first;
6872 assert(o->op_type == OP_ENTER);
6876 if (o->op_type != OP_STUB) {
6877 assert( o->op_type == OP_NEXTSTATE
6878 || o->op_type == OP_DBSTATE
6879 || (o->op_type == OP_NULL
6880 && ( o->op_targ == OP_NEXTSTATE
6881 || o->op_targ == OP_DBSTATE
6885 PL_curcop = (COP*)o;
6890 DEBUG_STATE_r( Perl_re_printf( aTHX_
6891 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6893 rex->offs[0].end = locinput - reginfo->strbeg;
6894 if (reginfo->info_aux_eval->pos_magic)
6895 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6896 reginfo->sv, reginfo->strbeg,
6897 locinput - reginfo->strbeg);
6900 SV *sv_mrk = get_sv("REGMARK", 1);
6901 sv_setsv(sv_mrk, sv_yes_mark);
6904 /* we don't use MULTICALL here as we want to call the
6905 * first op of the block of interest, rather than the
6906 * first op of the sub. Also, we don't want to free
6907 * the savestack frame */
6908 before = (IV)(SP-PL_stack_base);
6910 CALLRUNOPS(aTHX); /* Scalar context. */
6912 if ((IV)(SP-PL_stack_base) == before)
6913 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
6919 /* before restoring everything, evaluate the returned
6920 * value, so that 'uninit' warnings don't use the wrong
6921 * PL_op or pad. Also need to process any magic vars
6922 * (e.g. $1) *before* parentheses are restored */
6927 if (logical == 0) /* (?{})/ */
6928 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6929 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
6930 sw = cBOOL(SvTRUE(ret));
6933 else { /* /(??{}) */
6934 /* if its overloaded, let the regex compiler handle
6935 * it; otherwise extract regex, or stringify */
6936 if (SvGMAGICAL(ret))
6937 ret = sv_mortalcopy(ret);
6938 if (!SvAMAGIC(ret)) {
6942 if (SvTYPE(sv) == SVt_REGEXP)
6943 re_sv = (REGEXP*) sv;
6944 else if (SvSMAGICAL(ret)) {
6945 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6947 re_sv = (REGEXP *) mg->mg_obj;
6950 /* force any undef warnings here */
6951 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6952 ret = sv_mortalcopy(ret);
6953 (void) SvPV_force_nolen(ret);
6959 /* *** Note that at this point we don't restore
6960 * PL_comppad, (or pop the CxSUB) on the assumption it may
6961 * be used again soon. This is safe as long as nothing
6962 * in the regexp code uses the pad ! */
6964 PL_curcop = ocurcop;
6965 regcp_restore(rex, runops_cp, &maxopenparen);
6966 PL_curpm_under = PL_curpm;
6967 PL_curpm = PL_reg_curpm;
6973 /* only /(??{})/ from now on */
6976 /* extract RE object from returned value; compiling if
6980 re_sv = reg_temp_copy(NULL, re_sv);
6985 if (SvUTF8(ret) && IN_BYTES) {
6986 /* In use 'bytes': make a copy of the octet
6987 * sequence, but without the flag on */
6989 const char *const p = SvPV(ret, len);
6990 ret = newSVpvn_flags(p, len, SVs_TEMP);
6992 if (rex->intflags & PREGf_USE_RE_EVAL)
6993 pm_flags |= PMf_USE_RE_EVAL;
6995 /* if we got here, it should be an engine which
6996 * supports compiling code blocks and stuff */
6997 assert(rex->engine && rex->engine->op_comp);
6998 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6999 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7000 rex->engine, NULL, NULL,
7001 /* copy /msixn etc to inner pattern */
7006 & (SVs_TEMP | SVs_GMG | SVf_ROK))
7007 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7008 /* This isn't a first class regexp. Instead, it's
7009 caching a regexp onto an existing, Perl visible
7011 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7017 RXp_MATCH_COPIED_off(re);
7018 re->subbeg = rex->subbeg;
7019 re->sublen = rex->sublen;
7020 re->suboffset = rex->suboffset;
7021 re->subcoffset = rex->subcoffset;
7023 re->lastcloseparen = 0;
7026 debug_start_match(re_sv, utf8_target, locinput,
7027 reginfo->strend, "Matching embedded");
7029 startpoint = rei->program + 1;
7030 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7031 * close_paren only for GOSUB */
7032 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7033 /* Save all the seen positions so far. */
7034 ST.cp = regcppush(rex, 0, maxopenparen);
7035 REGCP_SET(ST.lastcp);
7036 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7038 /* run the pattern returned from (??{...}) */
7040 eval_recurse_doit: /* Share code with GOSUB below this line
7041 * At this point we expect the stack context to be
7042 * set up correctly */
7044 /* invalidate the S-L poscache. We're now executing a
7045 * different set of WHILEM ops (and their associated
7046 * indexes) against the same string, so the bits in the
7047 * cache are meaningless. Setting maxiter to zero forces
7048 * the cache to be invalidated and zeroed before reuse.
7049 * XXX This is too dramatic a measure. Ideally we should
7050 * save the old cache and restore when running the outer
7052 reginfo->poscache_maxiter = 0;
7054 /* the new regexp might have a different is_utf8_pat than we do */
7055 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7057 ST.prev_rex = rex_sv;
7058 ST.prev_curlyx = cur_curlyx;
7060 SET_reg_curpm(rex_sv);
7065 ST.prev_eval = cur_eval;
7067 /* now continue from first node in postoned RE */
7068 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
7069 NOT_REACHED; /* NOTREACHED */
7072 case EVAL_AB: /* cleanup after a successful (??{A})B */
7073 /* note: this is called twice; first after popping B, then A */
7075 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
7076 depth, cur_eval, ST.prev_eval);
7079 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7080 if ( cur_eval && CUR_EVAL.close_paren ) {\
7082 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7084 CUR_EVAL.close_paren - 1,\
7088 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7091 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7093 rex_sv = ST.prev_rex;
7094 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7095 SET_reg_curpm(rex_sv);
7096 rex = ReANY(rex_sv);
7097 rexi = RXi_GET(rex);
7099 /* preserve $^R across LEAVE's. See Bug 121070. */
7100 SV *save_sv= GvSV(PL_replgv);
7101 SvREFCNT_inc(save_sv);
7102 regcpblow(ST.cp); /* LEAVE in disguise */
7103 sv_setsv(GvSV(PL_replgv), save_sv);
7104 SvREFCNT_dec(save_sv);
7106 cur_eval = ST.prev_eval;
7107 cur_curlyx = ST.prev_curlyx;
7109 /* Invalidate cache. See "invalidate" comment above. */
7110 reginfo->poscache_maxiter = 0;
7111 if ( nochange_depth )
7114 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7118 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7119 /* note: this is called twice; first after popping B, then A */
7121 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7122 depth, cur_eval, ST.prev_eval);
7125 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7127 rex_sv = ST.prev_rex;
7128 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7129 SET_reg_curpm(rex_sv);
7130 rex = ReANY(rex_sv);
7131 rexi = RXi_GET(rex);
7133 REGCP_UNWIND(ST.lastcp);
7134 regcppop(rex, &maxopenparen);
7135 cur_eval = ST.prev_eval;
7136 cur_curlyx = ST.prev_curlyx;
7138 /* Invalidate cache. See "invalidate" comment above. */
7139 reginfo->poscache_maxiter = 0;
7140 if ( nochange_depth )
7143 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7148 n = ARG(scan); /* which paren pair */
7149 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7150 if (n > maxopenparen)
7152 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7153 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
7158 (IV)rex->offs[n].start_tmp,
7164 /* XXX really need to log other places start/end are set too */
7165 #define CLOSE_CAPTURE \
7166 rex->offs[n].start = rex->offs[n].start_tmp; \
7167 rex->offs[n].end = locinput - reginfo->strbeg; \
7168 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
7169 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
7172 PTR2UV(rex->offs), \
7174 (IV)rex->offs[n].start, \
7175 (IV)rex->offs[n].end \
7179 n = ARG(scan); /* which paren pair */
7181 if (n > rex->lastparen)
7183 rex->lastcloseparen = n;
7184 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7189 case ACCEPT: /* (*ACCEPT) */
7191 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7195 cursor && OP(cursor)!=END;
7196 cursor=regnext(cursor))
7198 if ( OP(cursor)==CLOSE ){
7200 if ( n <= lastopen ) {
7202 if (n > rex->lastparen)
7204 rex->lastcloseparen = n;
7205 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7214 case GROUPP: /* (?(1)) */
7215 n = ARG(scan); /* which paren pair */
7216 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7219 case NGROUPP: /* (?(<name>)) */
7220 /* reg_check_named_buff_matched returns 0 for no match */
7221 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7224 case INSUBP: /* (?(R)) */
7226 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7227 * of SCAN is already set up as matches a eval.close_paren */
7228 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7231 case DEFINEP: /* (?(DEFINE)) */
7235 case IFTHEN: /* (?(cond)A|B) */
7236 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7238 next = NEXTOPER(NEXTOPER(scan));
7240 next = scan + ARG(scan);
7241 if (OP(next) == IFTHEN) /* Fake one. */
7242 next = NEXTOPER(NEXTOPER(next));
7246 case LOGICAL: /* modifier for EVAL and IFMATCH */
7247 logical = scan->flags;
7250 /*******************************************************************
7252 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7253 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7254 STAR/PLUS/CURLY/CURLYN are used instead.)
7256 A*B is compiled as <CURLYX><A><WHILEM><B>
7258 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7259 state, which contains the current count, initialised to -1. It also sets
7260 cur_curlyx to point to this state, with any previous value saved in the
7263 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7264 since the pattern may possibly match zero times (i.e. it's a while {} loop
7265 rather than a do {} while loop).
7267 Each entry to WHILEM represents a successful match of A. The count in the
7268 CURLYX block is incremented, another WHILEM state is pushed, and execution
7269 passes to A or B depending on greediness and the current count.
7271 For example, if matching against the string a1a2a3b (where the aN are
7272 substrings that match /A/), then the match progresses as follows: (the
7273 pushed states are interspersed with the bits of strings matched so far):
7276 <CURLYX cnt=0><WHILEM>
7277 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7278 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7279 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7280 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7282 (Contrast this with something like CURLYM, which maintains only a single
7286 a1 <CURLYM cnt=1> a2
7287 a1 a2 <CURLYM cnt=2> a3
7288 a1 a2 a3 <CURLYM cnt=3> b
7291 Each WHILEM state block marks a point to backtrack to upon partial failure
7292 of A or B, and also contains some minor state data related to that
7293 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
7294 overall state, such as the count, and pointers to the A and B ops.
7296 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7297 must always point to the *current* CURLYX block, the rules are:
7299 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7300 and set cur_curlyx to point the new block.
7302 When popping the CURLYX block after a successful or unsuccessful match,
7303 restore the previous cur_curlyx.
7305 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7306 to the outer one saved in the CURLYX block.
7308 When popping the WHILEM block after a successful or unsuccessful B match,
7309 restore the previous cur_curlyx.
7311 Here's an example for the pattern (AI* BI)*BO
7312 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7315 curlyx backtrack stack
7316 ------ ---------------
7318 CO <CO prev=NULL> <WO>
7319 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7320 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7321 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7323 At this point the pattern succeeds, and we work back down the stack to
7324 clean up, restoring as we go:
7326 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7327 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7328 CO <CO prev=NULL> <WO>
7331 *******************************************************************/
7333 #define ST st->u.curlyx
7335 case CURLYX: /* start of /A*B/ (for complex A) */
7337 /* No need to save/restore up to this paren */
7338 I32 parenfloor = scan->flags;
7340 assert(next); /* keep Coverity happy */
7341 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7344 /* XXXX Probably it is better to teach regpush to support
7345 parenfloor > maxopenparen ... */
7346 if (parenfloor > (I32)rex->lastparen)
7347 parenfloor = rex->lastparen; /* Pessimization... */
7349 ST.prev_curlyx= cur_curlyx;
7351 ST.cp = PL_savestack_ix;
7353 /* these fields contain the state of the current curly.
7354 * they are accessed by subsequent WHILEMs */
7355 ST.parenfloor = parenfloor;
7360 ST.count = -1; /* this will be updated by WHILEM */
7361 ST.lastloc = NULL; /* this will be updated by WHILEM */
7363 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7364 NOT_REACHED; /* NOTREACHED */
7367 case CURLYX_end: /* just finished matching all of A*B */
7368 cur_curlyx = ST.prev_curlyx;
7370 NOT_REACHED; /* NOTREACHED */
7372 case CURLYX_end_fail: /* just failed to match all of A*B */
7374 cur_curlyx = ST.prev_curlyx;
7376 NOT_REACHED; /* NOTREACHED */
7380 #define ST st->u.whilem
7382 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
7384 /* see the discussion above about CURLYX/WHILEM */
7389 assert(cur_curlyx); /* keep Coverity happy */
7391 min = ARG1(cur_curlyx->u.curlyx.me);
7392 max = ARG2(cur_curlyx->u.curlyx.me);
7393 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7394 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7395 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7396 ST.cache_offset = 0;
7400 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: matched %ld out of %d..%d\n",
7401 depth, (long)n, min, max)
7404 /* First just match a string of min A's. */
7407 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7408 cur_curlyx->u.curlyx.lastloc = locinput;
7409 REGCP_SET(ST.lastcp);
7411 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7412 NOT_REACHED; /* NOTREACHED */
7415 /* If degenerate A matches "", assume A done. */
7417 if (locinput == cur_curlyx->u.curlyx.lastloc) {
7418 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: empty match detected, trying continuation...\n",
7421 goto do_whilem_B_max;
7424 /* super-linear cache processing.
7426 * The idea here is that for certain types of CURLYX/WHILEM -
7427 * principally those whose upper bound is infinity (and
7428 * excluding regexes that have things like \1 and other very
7429 * non-regular expresssiony things), then if a pattern like
7430 * /....A*.../ fails and we backtrack to the WHILEM, then we
7431 * make a note that this particular WHILEM op was at string
7432 * position 47 (say) when the rest of pattern failed. Then, if
7433 * we ever find ourselves back at that WHILEM, and at string
7434 * position 47 again, we can just fail immediately rather than
7435 * running the rest of the pattern again.
7437 * This is very handy when patterns start to go
7438 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7439 * with a combinatorial explosion of backtracking.
7441 * The cache is implemented as a bit array, with one bit per
7442 * string byte position per WHILEM op (up to 16) - so its
7443 * between 0.25 and 2x the string size.
7445 * To avoid allocating a poscache buffer every time, we do an
7446 * initially countdown; only after we have executed a WHILEM
7447 * op (string-length x #WHILEMs) times do we allocate the
7450 * The top 4 bits of scan->flags byte say how many different
7451 * relevant CURLLYX/WHILEM op pairs there are, while the
7452 * bottom 4-bits is the identifying index number of this
7458 if (!reginfo->poscache_maxiter) {
7459 /* start the countdown: Postpone detection until we
7460 * know the match is not *that* much linear. */
7461 reginfo->poscache_maxiter
7462 = (reginfo->strend - reginfo->strbeg + 1)
7464 /* possible overflow for long strings and many CURLYX's */
7465 if (reginfo->poscache_maxiter < 0)
7466 reginfo->poscache_maxiter = I32_MAX;
7467 reginfo->poscache_iter = reginfo->poscache_maxiter;
7470 if (reginfo->poscache_iter-- == 0) {
7471 /* initialise cache */
7472 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7473 regmatch_info_aux *const aux = reginfo->info_aux;
7474 if (aux->poscache) {
7475 if ((SSize_t)reginfo->poscache_size < size) {
7476 Renew(aux->poscache, size, char);
7477 reginfo->poscache_size = size;
7479 Zero(aux->poscache, size, char);
7482 reginfo->poscache_size = size;
7483 Newxz(aux->poscache, size, char);
7485 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7486 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7487 PL_colors[4], PL_colors[5])
7491 if (reginfo->poscache_iter < 0) {
7492 /* have we already failed at this position? */
7493 SSize_t offset, mask;
7495 reginfo->poscache_iter = -1; /* stop eventual underflow */
7496 offset = (scan->flags & 0xf) - 1
7497 + (locinput - reginfo->strbeg)
7499 mask = 1 << (offset % 8);
7501 if (reginfo->info_aux->poscache[offset] & mask) {
7502 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: (cache) already tried at this position...\n",
7505 sayNO; /* cache records failure */
7507 ST.cache_offset = offset;
7508 ST.cache_mask = mask;
7512 /* Prefer B over A for minimal matching. */
7514 if (cur_curlyx->u.curlyx.minmod) {
7515 ST.save_curlyx = cur_curlyx;
7516 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7517 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
7519 REGCP_SET(ST.lastcp);
7520 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7522 NOT_REACHED; /* NOTREACHED */
7525 /* Prefer A over B for maximal matching. */
7527 if (n < max) { /* More greed allowed? */
7528 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7530 cur_curlyx->u.curlyx.lastloc = locinput;
7531 REGCP_SET(ST.lastcp);
7532 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7533 NOT_REACHED; /* NOTREACHED */
7535 goto do_whilem_B_max;
7537 NOT_REACHED; /* NOTREACHED */
7539 case WHILEM_B_min: /* just matched B in a minimal match */
7540 case WHILEM_B_max: /* just matched B in a maximal match */
7541 cur_curlyx = ST.save_curlyx;
7543 NOT_REACHED; /* NOTREACHED */
7545 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7546 cur_curlyx = ST.save_curlyx;
7547 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7548 cur_curlyx->u.curlyx.count--;
7550 NOT_REACHED; /* NOTREACHED */
7552 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7554 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7555 REGCP_UNWIND(ST.lastcp);
7556 regcppop(rex, &maxopenparen);
7557 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7558 cur_curlyx->u.curlyx.count--;
7560 NOT_REACHED; /* NOTREACHED */
7562 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7563 REGCP_UNWIND(ST.lastcp);
7564 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7565 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "whilem: failed, trying continuation...\n",
7569 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7570 && ckWARN(WARN_REGEXP)
7571 && !reginfo->warned)
7573 reginfo->warned = TRUE;
7574 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7575 "Complex regular subexpression recursion limit (%d) "
7581 ST.save_curlyx = cur_curlyx;
7582 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7583 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7585 NOT_REACHED; /* NOTREACHED */
7587 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7588 cur_curlyx = ST.save_curlyx;
7589 REGCP_UNWIND(ST.lastcp);
7590 regcppop(rex, &maxopenparen);
7592 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7593 /* Maximum greed exceeded */
7594 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7595 && ckWARN(WARN_REGEXP)
7596 && !reginfo->warned)
7598 reginfo->warned = TRUE;
7599 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7600 "Complex regular subexpression recursion "
7601 "limit (%d) exceeded",
7604 cur_curlyx->u.curlyx.count--;
7608 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "trying longer...\n", depth)
7610 /* Try grabbing another A and see if it helps. */
7611 cur_curlyx->u.curlyx.lastloc = locinput;
7612 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7614 REGCP_SET(ST.lastcp);
7615 PUSH_STATE_GOTO(WHILEM_A_min,
7616 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7618 NOT_REACHED; /* NOTREACHED */
7621 #define ST st->u.branch
7623 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
7624 next = scan + ARG(scan);
7627 scan = NEXTOPER(scan);
7630 case BRANCH: /* /(...|A|...)/ */
7631 scan = NEXTOPER(scan); /* scan now points to inner node */
7632 ST.lastparen = rex->lastparen;
7633 ST.lastcloseparen = rex->lastcloseparen;
7634 ST.next_branch = next;
7637 /* Now go into the branch */
7639 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7641 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7643 NOT_REACHED; /* NOTREACHED */
7645 case CUTGROUP: /* /(*THEN)/ */
7646 sv_yes_mark = st->u.mark.mark_name = scan->flags
7647 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7649 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7650 NOT_REACHED; /* NOTREACHED */
7652 case CUTGROUP_next_fail:
7655 if (st->u.mark.mark_name)
7656 sv_commit = st->u.mark.mark_name;
7658 NOT_REACHED; /* NOTREACHED */
7662 NOT_REACHED; /* NOTREACHED */
7664 case BRANCH_next_fail: /* that branch failed; try the next, if any */
7669 REGCP_UNWIND(ST.cp);
7670 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7671 scan = ST.next_branch;
7672 /* no more branches? */
7673 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7675 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
7682 continue; /* execute next BRANCH[J] op */
7685 case MINMOD: /* next op will be non-greedy, e.g. A*? */
7690 #define ST st->u.curlym
7692 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7694 /* This is an optimisation of CURLYX that enables us to push
7695 * only a single backtracking state, no matter how many matches
7696 * there are in {m,n}. It relies on the pattern being constant
7697 * length, with no parens to influence future backrefs
7701 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7703 ST.lastparen = rex->lastparen;
7704 ST.lastcloseparen = rex->lastcloseparen;
7706 /* if paren positive, emulate an OPEN/CLOSE around A */
7708 U32 paren = ST.me->flags;
7709 if (paren > maxopenparen)
7710 maxopenparen = paren;
7711 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7719 ST.c1 = CHRTEST_UNINIT;
7722 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7725 curlym_do_A: /* execute the A in /A{m,n}B/ */
7726 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7727 NOT_REACHED; /* NOTREACHED */
7729 case CURLYM_A: /* we've just matched an A */
7731 /* after first match, determine A's length: u.curlym.alen */
7732 if (ST.count == 1) {
7733 if (reginfo->is_utf8_target) {
7734 char *s = st->locinput;
7735 while (s < locinput) {
7741 ST.alen = locinput - st->locinput;
7744 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7747 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7748 depth, (IV) ST.count, (IV)ST.alen)
7751 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7755 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7756 if ( max == REG_INFTY || ST.count < max )
7757 goto curlym_do_A; /* try to match another A */
7759 goto curlym_do_B; /* try to match B */
7761 case CURLYM_A_fail: /* just failed to match an A */
7762 REGCP_UNWIND(ST.cp);
7765 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7766 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7769 curlym_do_B: /* execute the B in /A{m,n}B/ */
7770 if (ST.c1 == CHRTEST_UNINIT) {
7771 /* calculate c1 and c2 for possible match of 1st char
7772 * following curly */
7773 ST.c1 = ST.c2 = CHRTEST_VOID;
7775 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7776 regnode *text_node = ST.B;
7777 if (! HAS_TEXT(text_node))
7778 FIND_NEXT_IMPT(text_node);
7781 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7783 But the former is redundant in light of the latter.
7785 if this changes back then the macro for
7786 IS_TEXT and friends need to change.
7788 if (PL_regkind[OP(text_node)] == EXACT) {
7789 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7790 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7800 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%"IVdf"...\n",
7801 depth, (IV)ST.count)
7803 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7804 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7805 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7806 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7808 /* simulate B failing */
7810 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7812 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7813 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7814 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7816 state_num = CURLYM_B_fail;
7817 goto reenter_switch;
7820 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7821 /* simulate B failing */
7823 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7825 (int) nextchr, ST.c1, ST.c2)
7827 state_num = CURLYM_B_fail;
7828 goto reenter_switch;
7833 /* emulate CLOSE: mark current A as captured */
7834 I32 paren = ST.me->flags;
7836 rex->offs[paren].start
7837 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7838 rex->offs[paren].end = locinput - reginfo->strbeg;
7839 if ((U32)paren > rex->lastparen)
7840 rex->lastparen = paren;
7841 rex->lastcloseparen = paren;
7844 rex->offs[paren].end = -1;
7846 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7855 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7856 NOT_REACHED; /* NOTREACHED */
7858 case CURLYM_B_fail: /* just failed to match a B */
7859 REGCP_UNWIND(ST.cp);
7860 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7862 I32 max = ARG2(ST.me);
7863 if (max != REG_INFTY && ST.count == max)
7865 goto curlym_do_A; /* try to match a further A */
7867 /* backtrack one A */
7868 if (ST.count == ARG1(ST.me) /* min */)
7871 SET_locinput(HOPc(locinput, -ST.alen));
7872 goto curlym_do_B; /* try to match B */
7875 #define ST st->u.curly
7877 #define CURLY_SETPAREN(paren, success) \
7880 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7881 rex->offs[paren].end = locinput - reginfo->strbeg; \
7882 if (paren > rex->lastparen) \
7883 rex->lastparen = paren; \
7884 rex->lastcloseparen = paren; \
7887 rex->offs[paren].end = -1; \
7888 rex->lastparen = ST.lastparen; \
7889 rex->lastcloseparen = ST.lastcloseparen; \
7893 case STAR: /* /A*B/ where A is width 1 char */
7897 scan = NEXTOPER(scan);
7900 case PLUS: /* /A+B/ where A is width 1 char */
7904 scan = NEXTOPER(scan);
7907 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
7908 ST.paren = scan->flags; /* Which paren to set */
7909 ST.lastparen = rex->lastparen;
7910 ST.lastcloseparen = rex->lastcloseparen;
7911 if (ST.paren > maxopenparen)
7912 maxopenparen = ST.paren;
7913 ST.min = ARG1(scan); /* min to match */
7914 ST.max = ARG2(scan); /* max to match */
7915 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7920 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7923 case CURLY: /* /A{m,n}B/ where A is width 1 char */
7925 ST.min = ARG1(scan); /* min to match */
7926 ST.max = ARG2(scan); /* max to match */
7927 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7930 * Lookahead to avoid useless match attempts
7931 * when we know what character comes next.
7933 * Used to only do .*x and .*?x, but now it allows
7934 * for )'s, ('s and (?{ ... })'s to be in the way
7935 * of the quantifier and the EXACT-like node. -- japhy
7938 assert(ST.min <= ST.max);
7939 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7940 ST.c1 = ST.c2 = CHRTEST_VOID;
7943 regnode *text_node = next;
7945 if (! HAS_TEXT(text_node))
7946 FIND_NEXT_IMPT(text_node);
7948 if (! HAS_TEXT(text_node))
7949 ST.c1 = ST.c2 = CHRTEST_VOID;
7951 if ( PL_regkind[OP(text_node)] != EXACT ) {
7952 ST.c1 = ST.c2 = CHRTEST_VOID;
7956 /* Currently we only get here when
7958 PL_rekind[OP(text_node)] == EXACT
7960 if this changes back then the macro for IS_TEXT and
7961 friends need to change. */
7962 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7963 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7975 char *li = locinput;
7978 regrepeat(rex, &li, ST.A, reginfo, ST.min)
7984 if (ST.c1 == CHRTEST_VOID)
7985 goto curly_try_B_min;
7987 ST.oldloc = locinput;
7989 /* set ST.maxpos to the furthest point along the
7990 * string that could possibly match */
7991 if (ST.max == REG_INFTY) {
7992 ST.maxpos = reginfo->strend - 1;
7994 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7997 else if (utf8_target) {
7998 int m = ST.max - ST.min;
7999 for (ST.maxpos = locinput;
8000 m >0 && ST.maxpos < reginfo->strend; m--)
8001 ST.maxpos += UTF8SKIP(ST.maxpos);
8004 ST.maxpos = locinput + ST.max - ST.min;
8005 if (ST.maxpos >= reginfo->strend)
8006 ST.maxpos = reginfo->strend - 1;
8008 goto curly_try_B_min_known;
8012 /* avoid taking address of locinput, so it can remain
8014 char *li = locinput;
8015 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8016 if (ST.count < ST.min)
8019 if ((ST.count > ST.min)
8020 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8022 /* A{m,n} must come at the end of the string, there's
8023 * no point in backing off ... */
8025 /* ...except that $ and \Z can match before *and* after
8026 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
8027 We may back off by one in this case. */
8028 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8032 goto curly_try_B_max;
8034 NOT_REACHED; /* NOTREACHED */
8036 case CURLY_B_min_known_fail:
8037 /* failed to find B in a non-greedy match where c1,c2 valid */
8039 REGCP_UNWIND(ST.cp);
8041 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8043 /* Couldn't or didn't -- move forward. */
8044 ST.oldloc = locinput;
8046 locinput += UTF8SKIP(locinput);
8050 curly_try_B_min_known:
8051 /* find the next place where 'B' could work, then call B */
8055 n = (ST.oldloc == locinput) ? 0 : 1;
8056 if (ST.c1 == ST.c2) {
8057 /* set n to utf8_distance(oldloc, locinput) */
8058 while (locinput <= ST.maxpos
8059 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8061 locinput += UTF8SKIP(locinput);
8066 /* set n to utf8_distance(oldloc, locinput) */
8067 while (locinput <= ST.maxpos
8068 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8069 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8071 locinput += UTF8SKIP(locinput);
8076 else { /* Not utf8_target */
8077 if (ST.c1 == ST.c2) {
8078 while (locinput <= ST.maxpos &&
8079 UCHARAT(locinput) != ST.c1)
8083 while (locinput <= ST.maxpos
8084 && UCHARAT(locinput) != ST.c1
8085 && UCHARAT(locinput) != ST.c2)
8088 n = locinput - ST.oldloc;
8090 if (locinput > ST.maxpos)
8093 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8094 * at b; check that everything between oldloc and
8095 * locinput matches */
8096 char *li = ST.oldloc;
8098 if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8100 assert(n == REG_INFTY || locinput == li);
8102 CURLY_SETPAREN(ST.paren, ST.count);
8103 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8105 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8107 NOT_REACHED; /* NOTREACHED */
8109 case CURLY_B_min_fail:
8110 /* failed to find B in a non-greedy match where c1,c2 invalid */
8112 REGCP_UNWIND(ST.cp);
8114 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8116 /* failed -- move forward one */
8118 char *li = locinput;
8119 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8126 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8127 ST.count > 0)) /* count overflow ? */
8130 CURLY_SETPAREN(ST.paren, ST.count);
8131 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8133 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8137 NOT_REACHED; /* NOTREACHED */
8140 /* a successful greedy match: now try to match B */
8141 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8144 bool could_match = locinput < reginfo->strend;
8146 /* If it could work, try it. */
8147 if (ST.c1 != CHRTEST_VOID && could_match) {
8148 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8150 could_match = memEQ(locinput,
8155 UTF8SKIP(locinput));
8158 could_match = UCHARAT(locinput) == ST.c1
8159 || UCHARAT(locinput) == ST.c2;
8162 if (ST.c1 == CHRTEST_VOID || could_match) {
8163 CURLY_SETPAREN(ST.paren, ST.count);
8164 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8165 NOT_REACHED; /* NOTREACHED */
8170 case CURLY_B_max_fail:
8171 /* failed to find B in a greedy match */
8173 REGCP_UNWIND(ST.cp);
8175 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8178 if (--ST.count < ST.min)
8180 locinput = HOPc(locinput, -1);
8181 goto curly_try_B_max;
8185 case END: /* last op of main pattern */
8188 /* we've just finished A in /(??{A})B/; now continue with B */
8189 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8190 st->u.eval.prev_rex = rex_sv; /* inner */
8192 /* Save *all* the positions. */
8193 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8194 rex_sv = CUR_EVAL.prev_rex;
8195 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8196 SET_reg_curpm(rex_sv);
8197 rex = ReANY(rex_sv);
8198 rexi = RXi_GET(rex);
8200 st->u.eval.prev_curlyx = cur_curlyx;
8201 cur_curlyx = CUR_EVAL.prev_curlyx;
8203 REGCP_SET(st->u.eval.lastcp);
8205 /* Restore parens of the outer rex without popping the
8207 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8209 st->u.eval.prev_eval = cur_eval;
8210 cur_eval = CUR_EVAL.prev_eval;
8212 Perl_re_exec_indentf( aTHX_ "EVAL trying tail ... (cur_eval=%p)\n",
8214 if ( nochange_depth )
8217 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8219 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
8220 locinput); /* match B */
8223 if (locinput < reginfo->till) {
8224 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8225 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8227 (long)(locinput - startpos),
8228 (long)(reginfo->till - startpos),
8231 sayNO_SILENT; /* Cannot match: too short. */
8233 sayYES; /* Success! */
8235 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8237 Perl_re_exec_indentf( aTHX_ "%ssubpattern success...%s\n",
8238 depth, PL_colors[4], PL_colors[5]));
8239 sayYES; /* Success! */
8242 #define ST st->u.ifmatch
8247 case SUSPEND: /* (?>A) */
8249 newstart = locinput;
8252 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
8254 goto ifmatch_trivial_fail_test;
8256 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
8258 ifmatch_trivial_fail_test:
8260 char * const s = HOPBACKc(locinput, scan->flags);
8265 sw = 1 - cBOOL(ST.wanted);
8269 next = scan + ARG(scan);
8277 newstart = locinput;
8281 ST.logical = logical;
8282 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8284 /* execute body of (?...A) */
8285 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8286 NOT_REACHED; /* NOTREACHED */
8289 case IFMATCH_A_fail: /* body of (?...A) failed */
8290 ST.wanted = !ST.wanted;
8293 case IFMATCH_A: /* body of (?...A) succeeded */
8295 sw = cBOOL(ST.wanted);
8297 else if (!ST.wanted)
8300 if (OP(ST.me) != SUSPEND) {
8301 /* restore old position except for (?>...) */
8302 locinput = st->locinput;
8304 scan = ST.me + ARG(ST.me);
8307 continue; /* execute B */
8311 case LONGJMP: /* alternative with many branches compiles to
8312 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8313 next = scan + ARG(scan);
8318 case COMMIT: /* (*COMMIT) */
8319 reginfo->cutpoint = reginfo->strend;
8322 case PRUNE: /* (*PRUNE) */
8324 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8325 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8326 NOT_REACHED; /* NOTREACHED */
8328 case COMMIT_next_fail:
8332 NOT_REACHED; /* NOTREACHED */
8334 case OPFAIL: /* (*FAIL) */
8336 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8338 /* deal with (?(?!)X|Y) properly,
8339 * make sure we trigger the no branch
8340 * of the trailing IFTHEN structure*/
8346 NOT_REACHED; /* NOTREACHED */
8348 #define ST st->u.mark
8349 case MARKPOINT: /* (*MARK:foo) */
8350 ST.prev_mark = mark_state;
8351 ST.mark_name = sv_commit = sv_yes_mark
8352 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8354 ST.mark_loc = locinput;
8355 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8356 NOT_REACHED; /* NOTREACHED */
8358 case MARKPOINT_next:
8359 mark_state = ST.prev_mark;
8361 NOT_REACHED; /* NOTREACHED */
8363 case MARKPOINT_next_fail:
8364 if (popmark && sv_eq(ST.mark_name,popmark))
8366 if (ST.mark_loc > startpoint)
8367 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8368 popmark = NULL; /* we found our mark */
8369 sv_commit = ST.mark_name;
8372 Perl_re_exec_indentf( aTHX_ "%ssetting cutpoint to mark:%"SVf"...%s\n",
8374 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8377 mark_state = ST.prev_mark;
8378 sv_yes_mark = mark_state ?
8379 mark_state->u.mark.mark_name : NULL;
8381 NOT_REACHED; /* NOTREACHED */
8383 case SKIP: /* (*SKIP) */
8385 /* (*SKIP) : if we fail we cut here*/
8386 ST.mark_name = NULL;
8387 ST.mark_loc = locinput;
8388 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8390 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8391 otherwise do nothing. Meaning we need to scan
8393 regmatch_state *cur = mark_state;
8394 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8397 if ( sv_eq( cur->u.mark.mark_name,
8400 ST.mark_name = find;
8401 PUSH_STATE_GOTO( SKIP_next, next, locinput);
8403 cur = cur->u.mark.prev_mark;
8406 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8409 case SKIP_next_fail:
8411 /* (*CUT:NAME) - Set up to search for the name as we
8412 collapse the stack*/
8413 popmark = ST.mark_name;
8415 /* (*CUT) - No name, we cut here.*/
8416 if (ST.mark_loc > startpoint)
8417 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8418 /* but we set sv_commit to latest mark_name if there
8419 is one so they can test to see how things lead to this
8422 sv_commit=mark_state->u.mark.mark_name;
8426 NOT_REACHED; /* NOTREACHED */
8429 case LNBREAK: /* \R */
8430 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8437 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
8438 PTR2UV(scan), OP(scan));
8439 Perl_croak(aTHX_ "regexp memory corruption");
8441 /* this is a point to jump to in order to increment
8442 * locinput by one character */
8444 assert(!NEXTCHR_IS_EOS);
8446 locinput += PL_utf8skip[nextchr];
8447 /* locinput is allowed to go 1 char off the end, but not 2+ */
8448 if (locinput > reginfo->strend)
8457 /* switch break jumps here */
8458 scan = next; /* prepare to execute the next op and ... */
8459 continue; /* ... jump back to the top, reusing st */
8463 /* push a state that backtracks on success */
8464 st->u.yes.prev_yes_state = yes_state;
8468 /* push a new regex state, then continue at scan */
8470 regmatch_state *newst;
8473 regmatch_state *cur = st;
8474 regmatch_state *curyes = yes_state;
8476 regmatch_slab *slab = PL_regmatch_slab;
8477 for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
8478 if (cur < SLAB_FIRST(slab)) {
8480 cur = SLAB_LAST(slab);
8482 Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
8484 curd, PL_reg_name[cur->resume_state],
8485 (curyes == cur) ? "yes" : ""
8488 curyes = cur->u.yes.prev_yes_state;
8491 DEBUG_STATE_pp("push")
8494 st->locinput = locinput;
8496 if (newst > SLAB_LAST(PL_regmatch_slab))
8497 newst = S_push_slab(aTHX);
8498 PL_regmatch_state = newst;
8500 locinput = pushinput;
8506 #ifdef SOLARIS_BAD_OPTIMIZER
8507 # undef PL_charclass
8511 * We get here only if there's trouble -- normally "case END" is
8512 * the terminating point.
8514 Perl_croak(aTHX_ "corrupted regexp pointers");
8515 NOT_REACHED; /* NOTREACHED */
8519 /* we have successfully completed a subexpression, but we must now
8520 * pop to the state marked by yes_state and continue from there */
8521 assert(st != yes_state);
8523 while (st != yes_state) {
8525 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8526 PL_regmatch_slab = PL_regmatch_slab->prev;
8527 st = SLAB_LAST(PL_regmatch_slab);
8531 DEBUG_STATE_pp("pop (no final)");
8533 DEBUG_STATE_pp("pop (yes)");
8539 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8540 || yes_state > SLAB_LAST(PL_regmatch_slab))
8542 /* not in this slab, pop slab */
8543 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8544 PL_regmatch_slab = PL_regmatch_slab->prev;
8545 st = SLAB_LAST(PL_regmatch_slab);
8547 depth -= (st - yes_state);
8550 yes_state = st->u.yes.prev_yes_state;
8551 PL_regmatch_state = st;
8554 locinput= st->locinput;
8555 state_num = st->resume_state + no_final;
8556 goto reenter_switch;
8559 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
8560 PL_colors[4], PL_colors[5]));
8562 if (reginfo->info_aux_eval) {
8563 /* each successfully executed (?{...}) block does the equivalent of
8564 * local $^R = do {...}
8565 * When popping the save stack, all these locals would be undone;
8566 * bypass this by setting the outermost saved $^R to the latest
8568 /* I dont know if this is needed or works properly now.
8569 * see code related to PL_replgv elsewhere in this file.
8572 if (oreplsv != GvSV(PL_replgv))
8573 sv_setsv(oreplsv, GvSV(PL_replgv));
8580 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
8582 PL_colors[4], PL_colors[5])
8594 /* there's a previous state to backtrack to */
8596 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8597 PL_regmatch_slab = PL_regmatch_slab->prev;
8598 st = SLAB_LAST(PL_regmatch_slab);
8600 PL_regmatch_state = st;
8601 locinput= st->locinput;
8603 DEBUG_STATE_pp("pop");
8605 if (yes_state == st)
8606 yes_state = st->u.yes.prev_yes_state;
8608 state_num = st->resume_state + 1; /* failure = success + 1 */
8610 goto reenter_switch;
8615 if (rex->intflags & PREGf_VERBARG_SEEN) {
8616 SV *sv_err = get_sv("REGERROR", 1);
8617 SV *sv_mrk = get_sv("REGMARK", 1);
8619 sv_commit = &PL_sv_no;
8621 sv_yes_mark = &PL_sv_yes;
8624 sv_commit = &PL_sv_yes;
8625 sv_yes_mark = &PL_sv_no;
8629 sv_setsv(sv_err, sv_commit);
8630 sv_setsv(sv_mrk, sv_yes_mark);
8634 if (last_pushed_cv) {
8637 PERL_UNUSED_VAR(SP);
8640 assert(!result || locinput - reginfo->strbeg >= 0);
8641 return result ? locinput - reginfo->strbeg : -1;
8645 - regrepeat - repeatedly match something simple, report how many
8647 * What 'simple' means is a node which can be the operand of a quantifier like
8650 * startposp - pointer a pointer to the start position. This is updated
8651 * to point to the byte following the highest successful
8653 * p - the regnode to be repeatedly matched against.
8654 * reginfo - struct holding match state, such as strend
8655 * max - maximum number of things to match.
8656 * depth - (for debugging) backtracking depth.
8659 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8660 regmatch_info *const reginfo, I32 max _pDEPTH)
8662 char *scan; /* Pointer to current position in target string */
8664 char *loceol = reginfo->strend; /* local version */
8665 I32 hardcount = 0; /* How many matches so far */
8666 bool utf8_target = reginfo->is_utf8_target;
8667 unsigned int to_complement = 0; /* Invert the result? */
8669 _char_class_number classnum;
8671 PERL_ARGS_ASSERT_REGREPEAT;
8674 if (max == REG_INFTY)
8676 else if (! utf8_target && loceol - scan > max)
8677 loceol = scan + max;
8679 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8680 * to the maximum of how far we should go in it (leaving it set to the real
8681 * end, if the maximum permissible would take us beyond that). This allows
8682 * us to make the loop exit condition that we haven't gone past <loceol> to
8683 * also mean that we haven't exceeded the max permissible count, saving a
8684 * test each time through the loop. But it assumes that the OP matches a
8685 * single byte, which is true for most of the OPs below when applied to a
8686 * non-UTF-8 target. Those relatively few OPs that don't have this
8687 * characteristic will have to compensate.
8689 * There is no adjustment for UTF-8 targets, as the number of bytes per
8690 * character varies. OPs will have to test both that the count is less
8691 * than the max permissible (using <hardcount> to keep track), and that we
8692 * are still within the bounds of the string (using <loceol>. A few OPs
8693 * match a single byte no matter what the encoding. They can omit the max
8694 * test if, for the UTF-8 case, they do the adjustment that was skipped
8697 * Thus, the code above sets things up for the common case; and exceptional
8698 * cases need extra work; the common case is to make sure <scan> doesn't
8699 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8700 * count doesn't exceed the maximum permissible */
8705 while (scan < loceol && hardcount < max && *scan != '\n') {
8706 scan += UTF8SKIP(scan);
8710 while (scan < loceol && *scan != '\n')
8716 while (scan < loceol && hardcount < max) {
8717 scan += UTF8SKIP(scan);
8725 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8726 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8727 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8731 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8735 /* Can use a simple loop if the pattern char to match on is invariant
8736 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
8737 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8738 * true iff it doesn't matter if the argument is in UTF-8 or not */
8739 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8740 if (utf8_target && loceol - scan > max) {
8741 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8742 * since here, to match at all, 1 char == 1 byte */
8743 loceol = scan + max;
8745 while (scan < loceol && UCHARAT(scan) == c) {
8749 else if (reginfo->is_utf8_pat) {
8751 STRLEN scan_char_len;
8753 /* When both target and pattern are UTF-8, we have to do
8755 while (hardcount < max
8757 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8758 && memEQ(scan, STRING(p), scan_char_len))
8760 scan += scan_char_len;
8764 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8766 /* Target isn't utf8; convert the character in the UTF-8
8767 * pattern to non-UTF8, and do a simple loop */
8768 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8769 while (scan < loceol && UCHARAT(scan) == c) {
8772 } /* else pattern char is above Latin1, can't possibly match the
8777 /* Here, the string must be utf8; pattern isn't, and <c> is
8778 * different in utf8 than not, so can't compare them directly.
8779 * Outside the loop, find the two utf8 bytes that represent c, and
8780 * then look for those in sequence in the utf8 string */
8781 U8 high = UTF8_TWO_BYTE_HI(c);
8782 U8 low = UTF8_TWO_BYTE_LO(c);
8784 while (hardcount < max
8785 && scan + 1 < loceol
8786 && UCHARAT(scan) == high
8787 && UCHARAT(scan + 1) == low)
8795 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
8796 assert(! reginfo->is_utf8_pat);
8799 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8803 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8804 utf8_flags = FOLDEQ_LOCALE;
8807 case EXACTF: /* This node only generated for non-utf8 patterns */
8808 assert(! reginfo->is_utf8_pat);
8813 if (! utf8_target) {
8816 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8817 | FOLDEQ_S2_FOLDS_SANE;
8822 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8826 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8828 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8830 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8833 if (c1 == CHRTEST_VOID) {
8834 /* Use full Unicode fold matching */
8835 char *tmpeol = reginfo->strend;
8836 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8837 while (hardcount < max
8838 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8839 STRING(p), NULL, pat_len,
8840 reginfo->is_utf8_pat, utf8_flags))
8843 tmpeol = reginfo->strend;
8847 else if (utf8_target) {
8849 while (scan < loceol
8851 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8853 scan += UTF8SKIP(scan);
8858 while (scan < loceol
8860 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8861 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8863 scan += UTF8SKIP(scan);
8868 else if (c1 == c2) {
8869 while (scan < loceol && UCHARAT(scan) == c1) {
8874 while (scan < loceol &&
8875 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8884 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8886 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8887 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8893 while (hardcount < max
8895 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8897 scan += UTF8SKIP(scan);
8901 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
8906 /* The argument (FLAGS) to all the POSIX node types is the class number */
8913 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8914 if (! utf8_target) {
8915 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8921 while (hardcount < max && scan < loceol
8922 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8925 scan += UTF8SKIP(scan);
8938 if (utf8_target && loceol - scan > max) {
8940 /* We didn't adjust <loceol> at the beginning of this routine
8941 * because is UTF-8, but it is actually ok to do so, since here, to
8942 * match, 1 char == 1 byte. */
8943 loceol = scan + max;
8945 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8958 if (! utf8_target) {
8959 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8965 /* The complement of something that matches only ASCII matches all
8966 * non-ASCII, plus everything in ASCII that isn't in the class. */
8967 while (hardcount < max && scan < loceol
8968 && (! isASCII_utf8(scan)
8969 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8971 scan += UTF8SKIP(scan);
8982 if (! utf8_target) {
8983 while (scan < loceol && to_complement
8984 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8991 classnum = (_char_class_number) FLAGS(p);
8992 if (classnum < _FIRST_NON_SWASH_CC) {
8994 /* Here, a swash is needed for above-Latin1 code points.
8995 * Process as many Latin1 code points using the built-in rules.
8996 * Go to another loop to finish processing upon encountering
8997 * the first Latin1 code point. We could do that in this loop
8998 * as well, but the other way saves having to test if the swash
8999 * has been loaded every time through the loop: extra space to
9001 while (hardcount < max && scan < loceol) {
9002 if (UTF8_IS_INVARIANT(*scan)) {
9003 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9010 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9011 if (! (to_complement
9012 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9021 goto found_above_latin1;
9028 /* For these character classes, the knowledge of how to handle
9029 * every code point is compiled in to Perl via a macro. This
9030 * code is written for making the loops as tight as possible.
9031 * It could be refactored to save space instead */
9033 case _CC_ENUM_SPACE:
9034 while (hardcount < max
9036 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
9038 scan += UTF8SKIP(scan);
9042 case _CC_ENUM_BLANK:
9043 while (hardcount < max
9045 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
9047 scan += UTF8SKIP(scan);
9051 case _CC_ENUM_XDIGIT:
9052 while (hardcount < max
9054 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
9056 scan += UTF8SKIP(scan);
9060 case _CC_ENUM_VERTSPACE:
9061 while (hardcount < max
9063 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
9065 scan += UTF8SKIP(scan);
9069 case _CC_ENUM_CNTRL:
9070 while (hardcount < max
9072 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
9074 scan += UTF8SKIP(scan);
9079 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9085 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
9087 /* Load the swash if not already present */
9088 if (! PL_utf8_swash_ptrs[classnum]) {
9089 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9090 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9094 PL_XPosix_ptrs[classnum], &flags);
9097 while (hardcount < max && scan < loceol
9098 && to_complement ^ cBOOL(_generic_utf8(
9101 swash_fetch(PL_utf8_swash_ptrs[classnum],
9105 scan += UTF8SKIP(scan);
9112 while (hardcount < max && scan < loceol &&
9113 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9118 /* LNBREAK can match one or two latin chars, which is ok, but we
9119 * have to use hardcount in this situation, and throw away the
9120 * adjustment to <loceol> done before the switch statement */
9121 loceol = reginfo->strend;
9122 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9131 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9145 /* These are all 0 width, so match right here or not at all. */
9149 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9150 NOT_REACHED; /* NOTREACHED */
9157 c = scan - *startposp;
9161 GET_RE_DEBUG_FLAGS_DECL;
9163 SV * const prop = sv_newmortal();
9164 regprop(prog, prop, p, reginfo, NULL);
9165 Perl_re_exec_indentf( aTHX_ "%s can match %"IVdf" times out of %"IVdf"...\n",
9166 depth, SvPVX_const(prop),(IV)c,(IV)max);
9174 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9176 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
9177 create a copy so that changes the caller makes won't change the shared one.
9178 If <altsvp> is non-null, will return NULL in it, for back-compat.
9181 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9183 PERL_ARGS_ASSERT_REGCLASS_SWASH;
9189 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9192 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9195 - reginclass - determine if a character falls into a character class
9197 n is the ANYOF-type regnode
9198 p is the target string
9199 p_end points to one byte beyond the end of the target string
9200 utf8_target tells whether p is in UTF-8.
9202 Returns true if matched; false otherwise.
9204 Note that this can be a synthetic start class, a combination of various
9205 nodes, so things you think might be mutually exclusive, such as locale,
9206 aren't. It can match both locale and non-locale
9211 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9214 const char flags = ANYOF_FLAGS(n);
9218 PERL_ARGS_ASSERT_REGINCLASS;
9220 /* If c is not already the code point, get it. Note that
9221 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9222 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9224 c = utf8n_to_uvchr(p, p_end - p, &c_len,
9225 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
9226 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
9227 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
9228 * UTF8_ALLOW_FFFF */
9229 if (c_len == (STRLEN)-1)
9230 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
9231 if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9232 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9236 /* If this character is potentially in the bitmap, check it */
9237 if (c < NUM_ANYOF_CODE_POINTS) {
9238 if (ANYOF_BITMAP_TEST(n, c))
9241 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9248 else if (flags & ANYOF_LOCALE_FLAGS) {
9249 if ((flags & ANYOFL_FOLD)
9251 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9255 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9259 /* The data structure is arranged so bits 0, 2, 4, ... are set
9260 * if the class includes the Posix character class given by
9261 * bit/2; and 1, 3, 5, ... are set if the class includes the
9262 * complemented Posix class given by int(bit/2). So we loop
9263 * through the bits, each time changing whether we complement
9264 * the result or not. Suppose for the sake of illustration
9265 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
9266 * is set, it means there is a match for this ANYOF node if the
9267 * character is in the class given by the expression (0 / 2 = 0
9268 * = \w). If it is in that class, isFOO_lc() will return 1,
9269 * and since 'to_complement' is 0, the result will stay TRUE,
9270 * and we exit the loop. Suppose instead that bit 0 is 0, but
9271 * bit 1 is 1. That means there is a match if the character
9272 * matches \W. We won't bother to call isFOO_lc() on bit 0,
9273 * but will on bit 1. On the second iteration 'to_complement'
9274 * will be 1, so the exclusive or will reverse things, so we
9275 * are testing for \W. On the third iteration, 'to_complement'
9276 * will be 0, and we would be testing for \s; the fourth
9277 * iteration would test for \S, etc.
9279 * Note that this code assumes that all the classes are closed
9280 * under folding. For example, if a character matches \w, then
9281 * its fold does too; and vice versa. This should be true for
9282 * any well-behaved locale for all the currently defined Posix
9283 * classes, except for :lower: and :upper:, which are handled
9284 * by the pseudo-class :cased: which matches if either of the
9285 * other two does. To get rid of this assumption, an outer
9286 * loop could be used below to iterate over both the source
9287 * character, and its fold (if different) */
9290 int to_complement = 0;
9292 while (count < ANYOF_MAX) {
9293 if (ANYOF_POSIXL_TEST(n, count)
9294 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9307 /* If the bitmap didn't (or couldn't) match, and something outside the
9308 * bitmap could match, try that. */
9310 if (c >= NUM_ANYOF_CODE_POINTS
9311 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9313 match = TRUE; /* Everything above the bitmap matches */
9315 /* Here doesn't match everything above the bitmap. If there is
9316 * some information available beyond the bitmap, we may find a
9317 * match in it. If so, this is most likely because the code point
9318 * is outside the bitmap range. But rarely, it could be because of
9319 * some other reason. If so, various flags are set to indicate
9320 * this possibility. On ANYOFD nodes, there may be matches that
9321 * happen only when the target string is UTF-8; or for other node
9322 * types, because runtime lookup is needed, regardless of the
9323 * UTF-8ness of the target string. Finally, under /il, there may
9324 * be some matches only possible if the locale is a UTF-8 one. */
9325 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
9326 && ( c >= NUM_ANYOF_CODE_POINTS
9327 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9328 && ( UNLIKELY(OP(n) != ANYOFD)
9329 || (utf8_target && ! isASCII_uni(c)
9330 # if NUM_ANYOF_CODE_POINTS > 256
9334 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9335 && IN_UTF8_CTYPE_LOCALE)))
9337 SV* only_utf8_locale = NULL;
9338 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9339 &only_utf8_locale, NULL);
9345 } else { /* Convert to utf8 */
9346 utf8_p = utf8_buffer;
9347 append_utf8_from_native_byte(*p, &utf8_p);
9348 utf8_p = utf8_buffer;
9351 if (swash_fetch(sw, utf8_p, TRUE)) {
9355 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9356 match = _invlist_contains_cp(only_utf8_locale, c);
9360 if (UNICODE_IS_SUPER(c)
9362 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9364 && ckWARN_d(WARN_NON_UNICODE))
9366 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9367 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
9371 #if ANYOF_INVERT != 1
9372 /* Depending on compiler optimization cBOOL takes time, so if don't have to
9374 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9377 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9378 return (flags & ANYOF_INVERT) ^ match;
9382 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9384 /* return the position 'off' UTF-8 characters away from 's', forward if
9385 * 'off' >= 0, backwards if negative. But don't go outside of position
9386 * 'lim', which better be < s if off < 0 */
9388 PERL_ARGS_ASSERT_REGHOP3;
9391 while (off-- && s < lim) {
9392 /* XXX could check well-formedness here */
9397 while (off++ && s > lim) {
9399 if (UTF8_IS_CONTINUED(*s)) {
9400 while (s > lim && UTF8_IS_CONTINUATION(*s))
9402 if (! UTF8_IS_START(*s)) {
9403 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9406 /* XXX could check well-formedness here */
9413 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9415 PERL_ARGS_ASSERT_REGHOP4;
9418 while (off-- && s < rlim) {
9419 /* XXX could check well-formedness here */
9424 while (off++ && s > llim) {
9426 if (UTF8_IS_CONTINUED(*s)) {
9427 while (s > llim && UTF8_IS_CONTINUATION(*s))
9429 if (! UTF8_IS_START(*s)) {
9430 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9433 /* XXX could check well-formedness here */
9439 /* like reghop3, but returns NULL on overrun, rather than returning last
9443 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9445 PERL_ARGS_ASSERT_REGHOPMAYBE3;
9448 while (off-- && s < lim) {
9449 /* XXX could check well-formedness here */
9456 while (off++ && s > lim) {
9458 if (UTF8_IS_CONTINUED(*s)) {
9459 while (s > lim && UTF8_IS_CONTINUATION(*s))
9461 if (! UTF8_IS_START(*s)) {
9462 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9465 /* XXX could check well-formedness here */
9474 /* when executing a regex that may have (?{}), extra stuff needs setting
9475 up that will be visible to the called code, even before the current
9476 match has finished. In particular:
9478 * $_ is localised to the SV currently being matched;
9479 * pos($_) is created if necessary, ready to be updated on each call-out
9481 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9482 isn't set until the current pattern is successfully finished), so that
9483 $1 etc of the match-so-far can be seen;
9484 * save the old values of subbeg etc of the current regex, and set then
9485 to the current string (again, this is normally only done at the end
9490 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9493 regexp *const rex = ReANY(reginfo->prog);
9494 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9496 eval_state->rex = rex;
9499 /* Make $_ available to executed code. */
9500 if (reginfo->sv != DEFSV) {
9502 DEFSV_set(reginfo->sv);
9505 if (!(mg = mg_find_mglob(reginfo->sv))) {
9506 /* prepare for quick setting of pos */
9507 mg = sv_magicext_mglob(reginfo->sv);
9510 eval_state->pos_magic = mg;
9511 eval_state->pos = mg->mg_len;
9512 eval_state->pos_flags = mg->mg_flags;
9515 eval_state->pos_magic = NULL;
9517 if (!PL_reg_curpm) {
9518 /* PL_reg_curpm is a fake PMOP that we can attach the current
9519 * regex to and point PL_curpm at, so that $1 et al are visible
9520 * within a /(?{})/. It's just allocated once per interpreter the
9521 * first time its needed */
9522 Newxz(PL_reg_curpm, 1, PMOP);
9525 SV* const repointer = &PL_sv_undef;
9526 /* this regexp is also owned by the new PL_reg_curpm, which
9527 will try to free it. */
9528 av_push(PL_regex_padav, repointer);
9529 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9530 PL_regex_pad = AvARRAY(PL_regex_padav);
9534 SET_reg_curpm(reginfo->prog);
9535 eval_state->curpm = PL_curpm;
9536 PL_curpm_under = PL_curpm;
9537 PL_curpm = PL_reg_curpm;
9538 if (RXp_MATCH_COPIED(rex)) {
9539 /* Here is a serious problem: we cannot rewrite subbeg,
9540 since it may be needed if this match fails. Thus
9541 $` inside (?{}) could fail... */
9542 eval_state->subbeg = rex->subbeg;
9543 eval_state->sublen = rex->sublen;
9544 eval_state->suboffset = rex->suboffset;
9545 eval_state->subcoffset = rex->subcoffset;
9547 eval_state->saved_copy = rex->saved_copy;
9549 RXp_MATCH_COPIED_off(rex);
9552 eval_state->subbeg = NULL;
9553 rex->subbeg = (char *)reginfo->strbeg;
9555 rex->subcoffset = 0;
9556 rex->sublen = reginfo->strend - reginfo->strbeg;
9560 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9563 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9565 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9566 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
9569 Safefree(aux->poscache);
9573 /* undo the effects of S_setup_eval_state() */
9575 if (eval_state->subbeg) {
9576 regexp * const rex = eval_state->rex;
9577 rex->subbeg = eval_state->subbeg;
9578 rex->sublen = eval_state->sublen;
9579 rex->suboffset = eval_state->suboffset;
9580 rex->subcoffset = eval_state->subcoffset;
9582 rex->saved_copy = eval_state->saved_copy;
9584 RXp_MATCH_COPIED_on(rex);
9586 if (eval_state->pos_magic)
9588 eval_state->pos_magic->mg_len = eval_state->pos;
9589 eval_state->pos_magic->mg_flags =
9590 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9591 | (eval_state->pos_flags & MGf_BYTES);
9594 PL_curpm = eval_state->curpm;
9597 PL_regmatch_state = aux->old_regmatch_state;
9598 PL_regmatch_slab = aux->old_regmatch_slab;
9600 /* free all slabs above current one - this must be the last action
9601 * of this function, as aux and eval_state are allocated within
9602 * slabs and may be freed here */
9604 s = PL_regmatch_slab->next;
9606 PL_regmatch_slab->next = NULL;
9608 regmatch_slab * const osl = s;
9617 S_to_utf8_substr(pTHX_ regexp *prog)
9619 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9620 * on the converted value */
9624 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9627 if (prog->substrs->data[i].substr
9628 && !prog->substrs->data[i].utf8_substr) {
9629 SV* const sv = newSVsv(prog->substrs->data[i].substr);
9630 prog->substrs->data[i].utf8_substr = sv;
9631 sv_utf8_upgrade(sv);
9632 if (SvVALID(prog->substrs->data[i].substr)) {
9633 if (SvTAIL(prog->substrs->data[i].substr)) {
9634 /* Trim the trailing \n that fbm_compile added last
9636 SvCUR_set(sv, SvCUR(sv) - 1);
9637 /* Whilst this makes the SV technically "invalid" (as its
9638 buffer is no longer followed by "\0") when fbm_compile()
9639 adds the "\n" back, a "\0" is restored. */
9640 fbm_compile(sv, FBMcf_TAIL);
9644 if (prog->substrs->data[i].substr == prog->check_substr)
9645 prog->check_utf8 = sv;
9651 S_to_byte_substr(pTHX_ regexp *prog)
9653 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9654 * on the converted value; returns FALSE if can't be converted. */
9658 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9661 if (prog->substrs->data[i].utf8_substr
9662 && !prog->substrs->data[i].substr) {
9663 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9664 if (! sv_utf8_downgrade(sv, TRUE)) {
9667 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9668 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9669 /* Trim the trailing \n that fbm_compile added last
9671 SvCUR_set(sv, SvCUR(sv) - 1);
9672 fbm_compile(sv, FBMcf_TAIL);
9676 prog->substrs->data[i].substr = sv;
9677 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9678 prog->check_substr = sv;
9686 * ex: set ts=8 sts=4 sw=4 et: