5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
89 static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
93 /* At least one required character in the target string is expressible only in
95 static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
107 #define STATIC static
110 /* Valid only if 'c', the character being looke-up, is an invariant under
111 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
112 * everything matchable is straight forward in the bitmap */
113 #define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
114 ? reginclass(prog,p,c,c+1,u) \
115 : ANYOF_BITMAP_TEST(p,*(c)))
121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
124 #define HOPc(pos,off) \
125 (char *)(reginfo->is_utf8_target \
126 ? reghop3((U8*)pos, off, \
127 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
130 #define HOPBACKc(pos, off) \
131 (char*)(reginfo->is_utf8_target \
132 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
133 : (pos - off >= reginfo->strbeg) \
137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 /* lim must be +ve. Returns NULL on overshoot */
141 #define HOPMAYBE3(pos,off,lim) \
142 (reginfo->is_utf8_target \
143 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
144 : ((U8*)pos + off <= lim) \
148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
149 * off must be >=0; args should be vars rather than expressions */
150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
151 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
152 : (U8*)((pos + off) > lim ? lim : (pos + off)))
154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
155 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
160 #define NEXTCHR_IS_EOS (nextchr < 0)
162 #define SET_nextchr \
163 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
165 #define SET_locinput(p) \
170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
172 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
173 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
174 1, 0, invlist, &flags); \
179 /* If in debug mode, we test that a known character properly matches */
181 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
184 utf8_char_in_property) \
185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
186 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
188 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 utf8_char_in_property) \
192 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
196 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
198 PL_XPosix_ptrs[_CC_WORDCHAR], \
199 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
201 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
204 /* for use after a quantifier and before an EXACT-like node -- japhy */
205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
207 * NOTE that *nothing* that affects backtracking should be in here, specifically
208 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
209 * node that is in between two EXACT like nodes when ascertaining what the required
210 * "follow" character is. This should probably be moved to regex compile time
211 * although it may be done at run time beause of the REF possibility - more
212 * investigation required. -- demerphq
214 #define JUMPABLE(rn) ( \
216 (OP(rn) == CLOSE && \
217 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
219 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
220 OP(rn) == PLUS || OP(rn) == MINMOD || \
222 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
224 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
226 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
229 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
230 we don't need this definition. XXX These are now out-of-sync*/
231 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
232 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
233 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
236 /* ... so we use this as its faster. */
237 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
238 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
239 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
240 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245 Search for mandatory following text node; for lookahead, the text must
246 follow but for lookbehind (rn->flags != 0) we skip to the next step.
248 #define FIND_NEXT_IMPT(rn) STMT_START { \
249 while (JUMPABLE(rn)) { \
250 const OPCODE type = OP(rn); \
251 if (type == SUSPEND || PL_regkind[type] == CURLY) \
252 rn = NEXTOPER(NEXTOPER(rn)); \
253 else if (type == PLUS) \
255 else if (type == IFMATCH) \
256 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
257 else rn += NEXT_OFF(rn); \
261 #define SLAB_FIRST(s) (&(s)->states[0])
262 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
264 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
265 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
266 static regmatch_state * S_push_slab(pTHX);
268 #define REGCP_PAREN_ELEMS 3
269 #define REGCP_OTHER_ELEMS 3
270 #define REGCP_FRAME_ELEMS 1
271 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
272 * are needed for the regexp context stack bookkeeping. */
275 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
277 const int retval = PL_savestack_ix;
278 const int paren_elems_to_push =
279 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
280 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
281 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
283 GET_RE_DEBUG_FLAGS_DECL;
285 PERL_ARGS_ASSERT_REGCPPUSH;
287 if (paren_elems_to_push < 0)
288 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
289 (int)paren_elems_to_push, (int)maxopenparen,
290 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
292 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
293 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
294 " out of range (%lu-%ld)",
296 (unsigned long)maxopenparen,
299 SSGROW(total_elems + REGCP_FRAME_ELEMS);
302 if ((int)maxopenparen > (int)parenfloor)
303 Perl_re_printf( aTHX_
304 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
309 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
310 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
311 SSPUSHIV(rex->offs[p].end);
312 SSPUSHIV(rex->offs[p].start);
313 SSPUSHINT(rex->offs[p].start_tmp);
314 DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
315 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
317 (IV)rex->offs[p].start,
318 (IV)rex->offs[p].start_tmp,
322 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
323 SSPUSHINT(maxopenparen);
324 SSPUSHINT(rex->lastparen);
325 SSPUSHINT(rex->lastcloseparen);
326 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
331 /* These are needed since we do not localize EVAL nodes: */
332 #define REGCP_SET(cp) \
334 Perl_re_exec_indentf( aTHX_ \
335 "Setting an EVAL scope, savestack=%"IVdf",\n", \
336 depth, (IV)PL_savestack_ix \
341 #define REGCP_UNWIND(cp) \
343 if (cp != PL_savestack_ix) \
344 Perl_re_exec_indentf( aTHX_ \
345 "Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n",\
346 depth, (IV)(cp), (IV)PL_savestack_ix \
351 #define UNWIND_PAREN(lp, lcp) \
352 for (n = rex->lastparen; n > lp; n--) \
353 rex->offs[n].end = -1; \
354 rex->lastparen = n; \
355 rex->lastcloseparen = lcp;
359 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
363 GET_RE_DEBUG_FLAGS_DECL;
365 PERL_ARGS_ASSERT_REGCPPOP;
367 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
369 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
370 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
371 rex->lastcloseparen = SSPOPINT;
372 rex->lastparen = SSPOPINT;
373 *maxopenparen_p = SSPOPINT;
375 i -= REGCP_OTHER_ELEMS;
376 /* Now restore the parentheses context. */
378 if (i || rex->lastparen + 1 <= rex->nparens)
379 Perl_re_printf( aTHX_
380 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
385 paren = *maxopenparen_p;
386 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
388 rex->offs[paren].start_tmp = SSPOPINT;
389 rex->offs[paren].start = SSPOPIV;
391 if (paren <= rex->lastparen)
392 rex->offs[paren].end = tmps;
393 DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
394 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
396 (IV)rex->offs[paren].start,
397 (IV)rex->offs[paren].start_tmp,
398 (IV)rex->offs[paren].end,
399 (paren > rex->lastparen ? "(skipped)" : ""));
404 /* It would seem that the similar code in regtry()
405 * already takes care of this, and in fact it is in
406 * a better location to since this code can #if 0-ed out
407 * but the code in regtry() is needed or otherwise tests
408 * requiring null fields (pat.t#187 and split.t#{13,14}
409 * (as of patchlevel 7877) will fail. Then again,
410 * this code seems to be necessary or otherwise
411 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
412 * --jhi updated by dapm */
413 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
414 if (i > *maxopenparen_p)
415 rex->offs[i].start = -1;
416 rex->offs[i].end = -1;
417 DEBUG_BUFFERS_r( Perl_re_printf( aTHX_
418 " \\%"UVuf": %s ..-1 undeffing\n",
420 (i > *maxopenparen_p) ? "-1" : " "
426 /* restore the parens and associated vars at savestack position ix,
427 * but without popping the stack */
430 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
432 I32 tmpix = PL_savestack_ix;
433 PL_savestack_ix = ix;
434 regcppop(rex, maxopenparen_p);
435 PL_savestack_ix = tmpix;
438 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
441 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
443 /* Returns a boolean as to whether or not 'character' is a member of the
444 * Posix character class given by 'classnum' that should be equivalent to a
445 * value in the typedef '_char_class_number'.
447 * Ideally this could be replaced by a just an array of function pointers
448 * to the C library functions that implement the macros this calls.
449 * However, to compile, the precise function signatures are required, and
450 * these may vary from platform to to platform. To avoid having to figure
451 * out what those all are on each platform, I (khw) am using this method,
452 * which adds an extra layer of function call overhead (unless the C
453 * optimizer strips it away). But we don't particularly care about
454 * performance with locales anyway. */
456 switch ((_char_class_number) classnum) {
457 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
458 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
459 case _CC_ENUM_ASCII: return isASCII_LC(character);
460 case _CC_ENUM_BLANK: return isBLANK_LC(character);
461 case _CC_ENUM_CASED: return isLOWER_LC(character)
462 || isUPPER_LC(character);
463 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
464 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
465 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
466 case _CC_ENUM_LOWER: return isLOWER_LC(character);
467 case _CC_ENUM_PRINT: return isPRINT_LC(character);
468 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
469 case _CC_ENUM_SPACE: return isSPACE_LC(character);
470 case _CC_ENUM_UPPER: return isUPPER_LC(character);
471 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
472 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
473 default: /* VERTSPACE should never occur in locales */
474 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
477 NOT_REACHED; /* NOTREACHED */
482 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
484 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
485 * 'character' is a member of the Posix character class given by 'classnum'
486 * that should be equivalent to a value in the typedef
487 * '_char_class_number'.
489 * This just calls isFOO_lc on the code point for the character if it is in
490 * the range 0-255. Outside that range, all characters use Unicode
491 * rules, ignoring any locale. So use the Unicode function if this class
492 * requires a swash, and use the Unicode macro otherwise. */
494 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
496 if (UTF8_IS_INVARIANT(*character)) {
497 return isFOO_lc(classnum, *character);
499 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
500 return isFOO_lc(classnum,
501 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
504 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
506 if (classnum < _FIRST_NON_SWASH_CC) {
508 /* Initialize the swash unless done already */
509 if (! PL_utf8_swash_ptrs[classnum]) {
510 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
511 PL_utf8_swash_ptrs[classnum] =
512 _core_swash_init("utf8",
515 PL_XPosix_ptrs[classnum], &flags);
518 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
520 TRUE /* is UTF */ ));
523 switch ((_char_class_number) classnum) {
524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
531 return FALSE; /* Things like CNTRL are always below 256 */
535 * pregexec and friends
538 #ifndef PERL_IN_XSUB_RE
540 - pregexec - match a regexp against a string
543 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
544 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
545 /* stringarg: the point in the string at which to begin matching */
546 /* strend: pointer to null at end of string */
547 /* strbeg: real beginning of string */
548 /* minend: end of match must be >= minend bytes after stringarg. */
549 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
550 * itself is accessed via the pointers above */
551 /* nosave: For optimizations. */
553 PERL_ARGS_ASSERT_PREGEXEC;
556 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
557 nosave ? 0 : REXEC_COPY_STR);
563 /* re_intuit_start():
565 * Based on some optimiser hints, try to find the earliest position in the
566 * string where the regex could match.
568 * rx: the regex to match against
569 * sv: the SV being matched: only used for utf8 flag; the string
570 * itself is accessed via the pointers below. Note that on
571 * something like an overloaded SV, SvPOK(sv) may be false
572 * and the string pointers may point to something unrelated to
574 * strbeg: real beginning of string
575 * strpos: the point in the string at which to begin matching
576 * strend: pointer to the byte following the last char of the string
577 * flags currently unused; set to 0
578 * data: currently unused; set to NULL
580 * The basic idea of re_intuit_start() is to use some known information
581 * about the pattern, namely:
583 * a) the longest known anchored substring (i.e. one that's at a
584 * constant offset from the beginning of the pattern; but not
585 * necessarily at a fixed offset from the beginning of the
587 * b) the longest floating substring (i.e. one that's not at a constant
588 * offset from the beginning of the pattern);
589 * c) Whether the pattern is anchored to the string; either
590 * an absolute anchor: /^../, or anchored to \n: /^.../m,
591 * or anchored to pos(): /\G/;
592 * d) A start class: a real or synthetic character class which
593 * represents which characters are legal at the start of the pattern;
595 * to either quickly reject the match, or to find the earliest position
596 * within the string at which the pattern might match, thus avoiding
597 * running the full NFA engine at those earlier locations, only to
598 * eventually fail and retry further along.
600 * Returns NULL if the pattern can't match, or returns the address within
601 * the string which is the earliest place the match could occur.
603 * The longest of the anchored and floating substrings is called 'check'
604 * and is checked first. The other is called 'other' and is checked
605 * second. The 'other' substring may not be present. For example,
607 * /(abc|xyz)ABC\d{0,3}DEFG/
611 * check substr (float) = "DEFG", offset 6..9 chars
612 * other substr (anchored) = "ABC", offset 3..3 chars
615 * Be aware that during the course of this function, sometimes 'anchored'
616 * refers to a substring being anchored relative to the start of the
617 * pattern, and sometimes to the pattern itself being anchored relative to
618 * the string. For example:
620 * /\dabc/: "abc" is anchored to the pattern;
621 * /^\dabc/: "abc" is anchored to the pattern and the string;
622 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
623 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
624 * but the pattern is anchored to the string.
628 Perl_re_intuit_start(pTHX_
631 const char * const strbeg,
635 re_scream_pos_data *data)
637 struct regexp *const prog = ReANY(rx);
638 SSize_t start_shift = prog->check_offset_min;
639 /* Should be nonnegative! */
640 SSize_t end_shift = 0;
641 /* current lowest pos in string where the regex can start matching */
642 char *rx_origin = strpos;
644 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
645 U8 other_ix = 1 - prog->substrs->check_ix;
647 char *other_last = strpos;/* latest pos 'other' substr already checked to */
648 char *check_at = NULL; /* check substr found at this pos */
649 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
650 RXi_GET_DECL(prog,progi);
651 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
652 regmatch_info *const reginfo = ®info_buf;
653 GET_RE_DEBUG_FLAGS_DECL;
655 PERL_ARGS_ASSERT_RE_INTUIT_START;
656 PERL_UNUSED_ARG(flags);
657 PERL_UNUSED_ARG(data);
659 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
660 "Intuit: trying to determine minimum start position...\n"));
662 /* for now, assume that all substr offsets are positive. If at some point
663 * in the future someone wants to do clever things with lookbehind and
664 * -ve offsets, they'll need to fix up any code in this function
665 * which uses these offsets. See the thread beginning
666 * <20140113145929.GF27210@iabyn.com>
668 assert(prog->substrs->data[0].min_offset >= 0);
669 assert(prog->substrs->data[0].max_offset >= 0);
670 assert(prog->substrs->data[1].min_offset >= 0);
671 assert(prog->substrs->data[1].max_offset >= 0);
672 assert(prog->substrs->data[2].min_offset >= 0);
673 assert(prog->substrs->data[2].max_offset >= 0);
675 /* for now, assume that if both present, that the floating substring
676 * doesn't start before the anchored substring.
677 * If you break this assumption (e.g. doing better optimisations
678 * with lookahead/behind), then you'll need to audit the code in this
679 * function carefully first
682 ! ( (prog->anchored_utf8 || prog->anchored_substr)
683 && (prog->float_utf8 || prog->float_substr))
684 || (prog->float_min_offset >= prog->anchored_offset));
686 /* byte rather than char calculation for efficiency. It fails
687 * to quickly reject some cases that can't match, but will reject
688 * them later after doing full char arithmetic */
689 if (prog->minlen > strend - strpos) {
690 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
691 " String too short...\n"));
695 RX_MATCH_UTF8_set(rx,utf8_target);
696 reginfo->is_utf8_target = cBOOL(utf8_target);
697 reginfo->info_aux = NULL;
698 reginfo->strbeg = strbeg;
699 reginfo->strend = strend;
700 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
702 /* not actually used within intuit, but zero for safety anyway */
703 reginfo->poscache_maxiter = 0;
706 if ((!prog->anchored_utf8 && prog->anchored_substr)
707 || (!prog->float_utf8 && prog->float_substr))
708 to_utf8_substr(prog);
709 check = prog->check_utf8;
711 if (!prog->check_substr && prog->check_utf8) {
712 if (! to_byte_substr(prog)) {
713 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
716 check = prog->check_substr;
719 /* dump the various substring data */
720 DEBUG_OPTIMISE_MORE_r({
722 for (i=0; i<=2; i++) {
723 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
724 : prog->substrs->data[i].substr);
728 Perl_re_printf( aTHX_
729 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
730 " useful=%"IVdf" utf8=%d [%s]\n",
732 (IV)prog->substrs->data[i].min_offset,
733 (IV)prog->substrs->data[i].max_offset,
734 (IV)prog->substrs->data[i].end_shift,
741 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
743 /* ml_anch: check after \n?
745 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
746 * with /.*.../, these flags will have been added by the
748 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
749 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
751 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
752 && !(prog->intflags & PREGf_IMPLICIT);
754 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
755 /* we are only allowed to match at BOS or \G */
757 /* trivially reject if there's a BOS anchor and we're not at BOS.
759 * Note that we don't try to do a similar quick reject for
760 * \G, since generally the caller will have calculated strpos
761 * based on pos() and gofs, so the string is already correctly
762 * anchored by definition; and handling the exceptions would
763 * be too fiddly (e.g. REXEC_IGNOREPOS).
765 if ( strpos != strbeg
766 && (prog->intflags & PREGf_ANCH_SBOL))
768 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
769 " Not at start...\n"));
773 /* in the presence of an anchor, the anchored (relative to the
774 * start of the regex) substr must also be anchored relative
775 * to strpos. So quickly reject if substr isn't found there.
776 * This works for \G too, because the caller will already have
777 * subtracted gofs from pos, and gofs is the offset from the
778 * \G to the start of the regex. For example, in /.abc\Gdef/,
779 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
780 * caller will have set strpos=pos()-4; we look for the substr
781 * at position pos()-4+1, which lines up with the "a" */
783 if (prog->check_offset_min == prog->check_offset_max) {
784 /* Substring at constant offset from beg-of-str... */
785 SSize_t slen = SvCUR(check);
786 char *s = HOP3c(strpos, prog->check_offset_min, strend);
788 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
789 " Looking for check substr at fixed offset %"IVdf"...\n",
790 (IV)prog->check_offset_min));
793 /* In this case, the regex is anchored at the end too.
794 * Unless it's a multiline match, the lengths must match
795 * exactly, give or take a \n. NB: slen >= 1 since
796 * the last char of check is \n */
798 && ( strend - s > slen
799 || strend - s < slen - 1
800 || (strend - s == slen && strend[-1] != '\n')))
802 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
803 " String too long...\n"));
806 /* Now should match s[0..slen-2] */
809 if (slen && (*SvPVX_const(check) != *s
810 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
812 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
813 " String not equal...\n"));
818 goto success_at_start;
823 end_shift = prog->check_end_shift;
825 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
827 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
828 (IV)end_shift, RX_PRECOMP(prog));
833 /* This is the (re)entry point of the main loop in this function.
834 * The goal of this loop is to:
835 * 1) find the "check" substring in the region rx_origin..strend
836 * (adjusted by start_shift / end_shift). If not found, reject
838 * 2) If it exists, look for the "other" substr too if defined; for
839 * example, if the check substr maps to the anchored substr, then
840 * check the floating substr, and vice-versa. If not found, go
841 * back to (1) with rx_origin suitably incremented.
842 * 3) If we find an rx_origin position that doesn't contradict
843 * either of the substrings, then check the possible additional
844 * constraints on rx_origin of /^.../m or a known start class.
845 * If these fail, then depending on which constraints fail, jump
846 * back to here, or to various other re-entry points further along
847 * that skip some of the first steps.
848 * 4) If we pass all those tests, update the BmUSEFUL() count on the
849 * substring. If the start position was determined to be at the
850 * beginning of the string - so, not rejected, but not optimised,
851 * since we have to run regmatch from position 0 - decrement the
852 * BmUSEFUL() count. Otherwise increment it.
856 /* first, look for the 'check' substring */
862 DEBUG_OPTIMISE_MORE_r({
863 Perl_re_printf( aTHX_
864 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
865 " Start shift: %"IVdf" End shift %"IVdf
866 " Real end Shift: %"IVdf"\n",
867 (IV)(rx_origin - strbeg),
868 (IV)prog->check_offset_min,
871 (IV)prog->check_end_shift);
874 end_point = HOP3(strend, -end_shift, strbeg);
875 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
880 /* If the regex is absolutely anchored to either the start of the
881 * string (SBOL) or to pos() (ANCH_GPOS), then
882 * check_offset_max represents an upper bound on the string where
883 * the substr could start. For the ANCH_GPOS case, we assume that
884 * the caller of intuit will have already set strpos to
885 * pos()-gofs, so in this case strpos + offset_max will still be
886 * an upper bound on the substr.
889 && prog->intflags & PREGf_ANCH
890 && prog->check_offset_max != SSize_t_MAX)
892 SSize_t len = SvCUR(check) - !!SvTAIL(check);
893 const char * const anchor =
894 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
896 /* do a bytes rather than chars comparison. It's conservative;
897 * so it skips doing the HOP if the result can't possibly end
898 * up earlier than the old value of end_point.
900 if ((char*)end_point - anchor > prog->check_offset_max) {
901 end_point = HOP3lim((U8*)anchor,
902 prog->check_offset_max,
908 check_at = fbm_instr( start_point, end_point,
909 check, multiline ? FBMrf_MULTILINE : 0);
911 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
912 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
913 (IV)((char*)start_point - strbeg),
914 (IV)((char*)end_point - strbeg),
915 (IV)(check_at ? check_at - strbeg : -1)
918 /* Update the count-of-usability, remove useless subpatterns,
922 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
923 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
924 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
925 (check_at ? "Found" : "Did not find"),
926 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
927 ? "anchored" : "floating"),
930 (check_at ? " at offset " : "...\n") );
935 /* set rx_origin to the minimum position where the regex could start
936 * matching, given the constraint of the just-matched check substring.
937 * But don't set it lower than previously.
940 if (check_at - rx_origin > prog->check_offset_max)
941 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
942 /* Finish the diagnostic message */
943 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
944 "%ld (rx_origin now %"IVdf")...\n",
945 (long)(check_at - strbeg),
946 (IV)(rx_origin - strbeg)
951 /* now look for the 'other' substring if defined */
953 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
954 : prog->substrs->data[other_ix].substr)
956 /* Take into account the "other" substring. */
960 struct reg_substr_datum *other;
963 other = &prog->substrs->data[other_ix];
965 /* if "other" is anchored:
966 * we've previously found a floating substr starting at check_at.
967 * This means that the regex origin must lie somewhere
968 * between min (rx_origin): HOP3(check_at, -check_offset_max)
969 * and max: HOP3(check_at, -check_offset_min)
970 * (except that min will be >= strpos)
971 * So the fixed substr must lie somewhere between
972 * HOP3(min, anchored_offset)
973 * HOP3(max, anchored_offset) + SvCUR(substr)
976 /* if "other" is floating
977 * Calculate last1, the absolute latest point where the
978 * floating substr could start in the string, ignoring any
979 * constraints from the earlier fixed match. It is calculated
982 * strend - prog->minlen (in chars) is the absolute latest
983 * position within the string where the origin of the regex
984 * could appear. The latest start point for the floating
985 * substr is float_min_offset(*) on from the start of the
986 * regex. last1 simply combines thee two offsets.
988 * (*) You might think the latest start point should be
989 * float_max_offset from the regex origin, and technically
990 * you'd be correct. However, consider
992 * Here, float min, max are 3,5 and minlen is 7.
993 * This can match either
997 * In the first case, the regex matches minlen chars; in the
998 * second, minlen+1, in the third, minlen+2.
999 * In the first case, the floating offset is 3 (which equals
1000 * float_min), in the second, 4, and in the third, 5 (which
1001 * equals float_max). In all cases, the floating string bcd
1002 * can never start more than 4 chars from the end of the
1003 * string, which equals minlen - float_min. As the substring
1004 * starts to match more than float_min from the start of the
1005 * regex, it makes the regex match more than minlen chars,
1006 * and the two cancel each other out. So we can always use
1007 * float_min - minlen, rather than float_max - minlen for the
1008 * latest position in the string.
1010 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1011 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1014 assert(prog->minlen >= other->min_offset);
1015 last1 = HOP3c(strend,
1016 other->min_offset - prog->minlen, strbeg);
1018 if (other_ix) {/* i.e. if (other-is-float) */
1019 /* last is the latest point where the floating substr could
1020 * start, *given* any constraints from the earlier fixed
1021 * match. This constraint is that the floating string starts
1022 * <= float_max_offset chars from the regex origin (rx_origin).
1023 * If this value is less than last1, use it instead.
1025 assert(rx_origin <= last1);
1027 /* this condition handles the offset==infinity case, and
1028 * is a short-cut otherwise. Although it's comparing a
1029 * byte offset to a char length, it does so in a safe way,
1030 * since 1 char always occupies 1 or more bytes,
1031 * so if a string range is (last1 - rx_origin) bytes,
1032 * it will be less than or equal to (last1 - rx_origin)
1033 * chars; meaning it errs towards doing the accurate HOP3
1034 * rather than just using last1 as a short-cut */
1035 (last1 - rx_origin) < other->max_offset
1037 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1040 assert(strpos + start_shift <= check_at);
1041 last = HOP4c(check_at, other->min_offset - start_shift,
1045 s = HOP3c(rx_origin, other->min_offset, strend);
1046 if (s < other_last) /* These positions already checked */
1049 must = utf8_target ? other->utf8_substr : other->substr;
1050 assert(SvPOK(must));
1053 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1059 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1060 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1061 (IV)(from - strbeg),
1067 (unsigned char*)from,
1070 multiline ? FBMrf_MULTILINE : 0
1072 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1073 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1074 (IV)(from - strbeg),
1076 (IV)(s ? s - strbeg : -1)
1082 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1083 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1084 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1085 s ? "Found" : "Contradicts",
1086 other_ix ? "floating" : "anchored",
1087 quoted, RE_SV_TAIL(must));
1092 /* last1 is latest possible substr location. If we didn't
1093 * find it before there, we never will */
1094 if (last >= last1) {
1095 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1096 "; giving up...\n"));
1100 /* try to find the check substr again at a later
1101 * position. Maybe next time we'll find the "other" substr
1103 other_last = HOP3c(last, 1, strend) /* highest failure */;
1105 other_ix /* i.e. if other-is-float */
1106 ? HOP3c(rx_origin, 1, strend)
1107 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1108 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1109 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1110 (other_ix ? "floating" : "anchored"),
1111 (long)(HOP3c(check_at, 1, strend) - strbeg),
1112 (IV)(rx_origin - strbeg)
1117 if (other_ix) { /* if (other-is-float) */
1118 /* other_last is set to s, not s+1, since its possible for
1119 * a floating substr to fail first time, then succeed
1120 * second time at the same floating position; e.g.:
1121 * "-AB--AABZ" =~ /\wAB\d*Z/
1122 * The first time round, anchored and float match at
1123 * "-(AB)--AAB(Z)" then fail on the initial \w character
1124 * class. Second time round, they match at "-AB--A(AB)(Z)".
1129 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1130 other_last = HOP3c(s, 1, strend);
1132 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1133 " at offset %ld (rx_origin now %"IVdf")...\n",
1135 (IV)(rx_origin - strbeg)
1141 DEBUG_OPTIMISE_MORE_r(
1142 Perl_re_printf( aTHX_
1143 " Check-only match: offset min:%"IVdf" max:%"IVdf
1144 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1145 " strend:%"IVdf"\n",
1146 (IV)prog->check_offset_min,
1147 (IV)prog->check_offset_max,
1148 (IV)(check_at-strbeg),
1149 (IV)(rx_origin-strbeg),
1150 (IV)(rx_origin-check_at),
1156 postprocess_substr_matches:
1158 /* handle the extra constraint of /^.../m if present */
1160 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1163 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1164 " looking for /^/m anchor"));
1166 /* we have failed the constraint of a \n before rx_origin.
1167 * Find the next \n, if any, even if it's beyond the current
1168 * anchored and/or floating substrings. Whether we should be
1169 * scanning ahead for the next \n or the next substr is debatable.
1170 * On the one hand you'd expect rare substrings to appear less
1171 * often than \n's. On the other hand, searching for \n means
1172 * we're effectively flipping between check_substr and "\n" on each
1173 * iteration as the current "rarest" string candidate, which
1174 * means for example that we'll quickly reject the whole string if
1175 * hasn't got a \n, rather than trying every substr position
1179 s = HOP3c(strend, - prog->minlen, strpos);
1180 if (s <= rx_origin ||
1181 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1183 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1184 " Did not find /%s^%s/m...\n",
1185 PL_colors[0], PL_colors[1]));
1189 /* earliest possible origin is 1 char after the \n.
1190 * (since *rx_origin == '\n', it's safe to ++ here rather than
1191 * HOP(rx_origin, 1)) */
1194 if (prog->substrs->check_ix == 0 /* check is anchored */
1195 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1197 /* Position contradicts check-string; either because
1198 * check was anchored (and thus has no wiggle room),
1199 * or check was float and rx_origin is above the float range */
1200 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1201 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1202 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1206 /* if we get here, the check substr must have been float,
1207 * is in range, and we may or may not have had an anchored
1208 * "other" substr which still contradicts */
1209 assert(prog->substrs->check_ix); /* check is float */
1211 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1212 /* whoops, the anchored "other" substr exists, so we still
1213 * contradict. On the other hand, the float "check" substr
1214 * didn't contradict, so just retry the anchored "other"
1216 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1217 " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
1218 PL_colors[0], PL_colors[1],
1219 (IV)(rx_origin - strbeg + prog->anchored_offset),
1220 (IV)(rx_origin - strbeg)
1222 goto do_other_substr;
1225 /* success: we don't contradict the found floating substring
1226 * (and there's no anchored substr). */
1227 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1228 " Found /%s^%s/m with rx_origin %ld...\n",
1229 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1232 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1233 " (multiline anchor test skipped)\n"));
1239 /* if we have a starting character class, then test that extra constraint.
1240 * (trie stclasses are too expensive to use here, we are better off to
1241 * leave it to regmatch itself) */
1243 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1244 const U8* const str = (U8*)STRING(progi->regstclass);
1246 /* XXX this value could be pre-computed */
1247 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1248 ? (reginfo->is_utf8_pat
1249 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1250 : STR_LEN(progi->regstclass))
1254 /* latest pos that a matching float substr constrains rx start to */
1255 char *rx_max_float = NULL;
1257 /* if the current rx_origin is anchored, either by satisfying an
1258 * anchored substring constraint, or a /^.../m constraint, then we
1259 * can reject the current origin if the start class isn't found
1260 * at the current position. If we have a float-only match, then
1261 * rx_origin is constrained to a range; so look for the start class
1262 * in that range. if neither, then look for the start class in the
1263 * whole rest of the string */
1265 /* XXX DAPM it's not clear what the minlen test is for, and why
1266 * it's not used in the floating case. Nothing in the test suite
1267 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1268 * Here are some old comments, which may or may not be correct:
1270 * minlen == 0 is possible if regstclass is \b or \B,
1271 * and the fixed substr is ''$.
1272 * Since minlen is already taken into account, rx_origin+1 is
1273 * before strend; accidentally, minlen >= 1 guaranties no false
1274 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1275 * 0) below assumes that regstclass does not come from lookahead...
1276 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1277 * This leaves EXACTF-ish only, which are dealt with in
1281 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1282 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1283 else if (prog->float_substr || prog->float_utf8) {
1284 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1285 endpos= HOP3c(rx_max_float, cl_l, strend);
1290 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1291 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1292 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1293 (IV)start_shift, (IV)(check_at - strbeg),
1294 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1296 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1299 if (endpos == strend) {
1300 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1301 " Could not match STCLASS...\n") );
1304 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1305 " This position contradicts STCLASS...\n") );
1306 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1307 && !(prog->intflags & PREGf_IMPLICIT))
1310 /* Contradict one of substrings */
1311 if (prog->anchored_substr || prog->anchored_utf8) {
1312 if (prog->substrs->check_ix == 1) { /* check is float */
1313 /* Have both, check_string is floating */
1314 assert(rx_origin + start_shift <= check_at);
1315 if (rx_origin + start_shift != check_at) {
1316 /* not at latest position float substr could match:
1317 * Recheck anchored substring, but not floating.
1318 * The condition above is in bytes rather than
1319 * chars for efficiency. It's conservative, in
1320 * that it errs on the side of doing 'goto
1321 * do_other_substr'. In this case, at worst,
1322 * an extra anchored search may get done, but in
1323 * practice the extra fbm_instr() is likely to
1324 * get skipped anyway. */
1325 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1326 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1327 (long)(other_last - strbeg),
1328 (IV)(rx_origin - strbeg)
1330 goto do_other_substr;
1338 /* In the presence of ml_anch, we might be able to
1339 * find another \n without breaking the current float
1342 /* strictly speaking this should be HOP3c(..., 1, ...),
1343 * but since we goto a block of code that's going to
1344 * search for the next \n if any, its safe here */
1346 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1347 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1348 PL_colors[0], PL_colors[1],
1349 (long)(rx_origin - strbeg)) );
1350 goto postprocess_substr_matches;
1353 /* strictly speaking this can never be true; but might
1354 * be if we ever allow intuit without substrings */
1355 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1358 rx_origin = rx_max_float;
1361 /* at this point, any matching substrings have been
1362 * contradicted. Start again... */
1364 rx_origin = HOP3c(rx_origin, 1, strend);
1366 /* uses bytes rather than char calculations for efficiency.
1367 * It's conservative: it errs on the side of doing 'goto restart',
1368 * where there is code that does a proper char-based test */
1369 if (rx_origin + start_shift + end_shift > strend) {
1370 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1371 " Could not match STCLASS...\n") );
1374 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1375 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1376 (prog->substrs->check_ix ? "floating" : "anchored"),
1377 (long)(rx_origin + start_shift - strbeg),
1378 (IV)(rx_origin - strbeg)
1385 if (rx_origin != s) {
1386 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1387 " By STCLASS: moving %ld --> %ld\n",
1388 (long)(rx_origin - strbeg), (long)(s - strbeg))
1392 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1393 " Does not contradict STCLASS...\n");
1398 /* Decide whether using the substrings helped */
1400 if (rx_origin != strpos) {
1401 /* Fixed substring is found far enough so that the match
1402 cannot start at strpos. */
1404 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1405 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1408 /* The found rx_origin position does not prohibit matching at
1409 * strpos, so calling intuit didn't gain us anything. Decrement
1410 * the BmUSEFUL() count on the check substring, and if we reach
1412 if (!(prog->intflags & PREGf_NAUGHTY)
1414 prog->check_utf8 /* Could be deleted already */
1415 && --BmUSEFUL(prog->check_utf8) < 0
1416 && (prog->check_utf8 == prog->float_utf8)
1418 prog->check_substr /* Could be deleted already */
1419 && --BmUSEFUL(prog->check_substr) < 0
1420 && (prog->check_substr == prog->float_substr)
1423 /* If flags & SOMETHING - do not do it many times on the same match */
1424 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1425 /* XXX Does the destruction order has to change with utf8_target? */
1426 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1427 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1428 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1429 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1430 check = NULL; /* abort */
1431 /* XXXX This is a remnant of the old implementation. It
1432 looks wasteful, since now INTUIT can use many
1433 other heuristics. */
1434 prog->extflags &= ~RXf_USE_INTUIT;
1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1440 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1444 fail_finish: /* Substring not found */
1445 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1446 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1448 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1449 PL_colors[4], PL_colors[5]));
1454 #define DECL_TRIE_TYPE(scan) \
1455 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1456 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1457 trie_utf8l, trie_flu8 } \
1458 trie_type = ((scan->flags == EXACT) \
1459 ? (utf8_target ? trie_utf8 : trie_plain) \
1460 : (scan->flags == EXACTL) \
1461 ? (utf8_target ? trie_utf8l : trie_plain) \
1462 : (scan->flags == EXACTFA) \
1464 ? trie_utf8_exactfa_fold \
1465 : trie_latin_utf8_exactfa_fold) \
1466 : (scan->flags == EXACTFLU8 \
1470 : trie_latin_utf8_fold)))
1472 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1475 U8 flags = FOLD_FLAGS_FULL; \
1476 switch (trie_type) { \
1478 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1479 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1480 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1482 goto do_trie_utf8_fold; \
1483 case trie_utf8_exactfa_fold: \
1484 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1486 case trie_utf8_fold: \
1487 do_trie_utf8_fold: \
1488 if ( foldlen>0 ) { \
1489 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1494 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1495 len = UTF8SKIP(uc); \
1496 skiplen = UVCHR_SKIP( uvc ); \
1497 foldlen -= skiplen; \
1498 uscan = foldbuf + skiplen; \
1501 case trie_latin_utf8_exactfa_fold: \
1502 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1504 case trie_latin_utf8_fold: \
1505 if ( foldlen>0 ) { \
1506 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1512 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1513 skiplen = UVCHR_SKIP( uvc ); \
1514 foldlen -= skiplen; \
1515 uscan = foldbuf + skiplen; \
1519 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1520 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1525 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1532 charid = trie->charmap[ uvc ]; \
1536 if (widecharmap) { \
1537 SV** const svpp = hv_fetch(widecharmap, \
1538 (char*)&uvc, sizeof(UV), 0); \
1540 charid = (U16)SvIV(*svpp); \
1545 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1546 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1547 startpos, doutf8, depth)
1549 #define REXEC_FBC_EXACTISH_SCAN(COND) \
1553 && (ln == 1 || folder(s, pat_string, ln)) \
1554 && (reginfo->intuit || regtry(reginfo, &s)) )\
1560 #define REXEC_FBC_UTF8_SCAN(CODE) \
1562 while (s < strend) { \
1568 #define REXEC_FBC_SCAN(CODE) \
1570 while (s < strend) { \
1576 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1577 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1579 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1588 #define REXEC_FBC_CLASS_SCAN(COND) \
1589 REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1591 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1600 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \
1601 if (utf8_target) { \
1602 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
1605 REXEC_FBC_CLASS_SCAN(COND); \
1608 /* The three macros below are slightly different versions of the same logic.
1610 * The first is for /a and /aa when the target string is UTF-8. This can only
1611 * match ascii, but it must advance based on UTF-8. The other two handle the
1612 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1613 * for the boundary (or non-boundary) between a word and non-word character.
1614 * The utf8 and non-utf8 cases have the same logic, but the details must be
1615 * different. Find the "wordness" of the character just prior to this one, and
1616 * compare it with the wordness of this one. If they differ, we have a
1617 * boundary. At the beginning of the string, pretend that the previous
1618 * character was a new-line.
1620 * All these macros uncleanly have side-effects with each other and outside
1621 * variables. So far it's been too much trouble to clean-up
1623 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1624 * a word character or not.
1625 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1627 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1629 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1630 * are looking for a boundary or for a non-boundary. If we are looking for a
1631 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1632 * see if this tentative match actually works, and if so, to quit the loop
1633 * here. And vice-versa if we are looking for a non-boundary.
1635 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1636 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1637 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1638 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1639 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1640 * complement. But in that branch we complement tmp, meaning that at the
1641 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1642 * which means at the top of the loop in the next iteration, it is
1643 * TEST_NON_UTF8(s-1) */
1644 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1645 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1646 tmp = TEST_NON_UTF8(tmp); \
1647 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1648 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1650 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1657 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1658 * TEST_UTF8 is a macro that for the same input code points returns identically
1659 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1660 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1661 if (s == reginfo->strbeg) { \
1664 else { /* Back-up to the start of the previous character */ \
1665 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1666 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1667 0, UTF8_ALLOW_DEFAULT); \
1669 tmp = TEST_UV(tmp); \
1670 LOAD_UTF8_CHARCLASS_ALNUM(); \
1671 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1672 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1681 /* Like the above two macros. UTF8_CODE is the complete code for handling
1682 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1684 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1685 if (utf8_target) { \
1688 else { /* Not utf8 */ \
1689 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1690 tmp = TEST_NON_UTF8(tmp); \
1691 REXEC_FBC_SCAN( /* advances s while s < strend */ \
1692 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1701 /* Here, things have been set up by the previous code so that tmp is the \
1702 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1703 * utf8ness of the target). We also have to check if this matches against \
1704 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1705 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1707 if (tmp == ! TEST_NON_UTF8('\n')) { \
1714 /* This is the macro to use when we want to see if something that looks like it
1715 * could match, actually does, and if so exits the loop */
1716 #define REXEC_FBC_TRYIT \
1717 if ((reginfo->intuit || regtry(reginfo, &s))) \
1720 /* The only difference between the BOUND and NBOUND cases is that
1721 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1722 * NBOUND. This is accomplished by passing it as either the if or else clause,
1723 * with the other one being empty (PLACEHOLDER is defined as empty).
1725 * The TEST_FOO parameters are for operating on different forms of input, but
1726 * all should be ones that return identically for the same underlying code
1728 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1730 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1731 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1733 #define FBC_BOUND_A(TEST_NON_UTF8) \
1735 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1736 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1738 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1740 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1741 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1743 #define FBC_NBOUND_A(TEST_NON_UTF8) \
1745 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1746 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1750 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1751 IV cp_out = Perl__invlist_search(invlist, cp_in);
1752 assert(cp_out >= 0);
1755 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1756 invmap[S_get_break_val_cp_checked(invlist, cp)]
1758 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1759 invmap[_invlist_search(invlist, cp)]
1762 /* Takes a pointer to an inversion list, a pointer to its corresponding
1763 * inversion map, and a code point, and returns the code point's value
1764 * according to the two arrays. It assumes that all code points have a value.
1765 * This is used as the base macro for macros for particular properties */
1766 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1767 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
1769 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1770 * of a code point, returning the value for the first code point in the string.
1771 * And it takes the particular macro name that finds the desired value given a
1772 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1773 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1774 (__ASSERT_(pos < strend) \
1775 /* Note assumes is valid UTF-8 */ \
1776 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1778 /* Returns the GCB value for the input code point */
1779 #define getGCB_VAL_CP(cp) \
1780 _generic_GET_BREAK_VAL_CP( \
1785 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1786 * bounded by pos and strend */
1787 #define getGCB_VAL_UTF8(pos, strend) \
1788 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1790 /* Returns the LB value for the input code point */
1791 #define getLB_VAL_CP(cp) \
1792 _generic_GET_BREAK_VAL_CP( \
1797 /* Returns the LB value for the first code point in the UTF-8 encoded string
1798 * bounded by pos and strend */
1799 #define getLB_VAL_UTF8(pos, strend) \
1800 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1803 /* Returns the SB value for the input code point */
1804 #define getSB_VAL_CP(cp) \
1805 _generic_GET_BREAK_VAL_CP( \
1810 /* Returns the SB value for the first code point in the UTF-8 encoded string
1811 * bounded by pos and strend */
1812 #define getSB_VAL_UTF8(pos, strend) \
1813 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1815 /* Returns the WB value for the input code point */
1816 #define getWB_VAL_CP(cp) \
1817 _generic_GET_BREAK_VAL_CP( \
1822 /* Returns the WB value for the first code point in the UTF-8 encoded string
1823 * bounded by pos and strend */
1824 #define getWB_VAL_UTF8(pos, strend) \
1825 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1827 /* We know what class REx starts with. Try to find this position... */
1828 /* if reginfo->intuit, its a dryrun */
1829 /* annoyingly all the vars in this routine have different names from their counterparts
1830 in regmatch. /grrr */
1832 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1833 const char *strend, regmatch_info *reginfo)
1836 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1837 char *pat_string; /* The pattern's exactish string */
1838 char *pat_end; /* ptr to end char of pat_string */
1839 re_fold_t folder; /* Function for computing non-utf8 folds */
1840 const U8 *fold_array; /* array for folding ords < 256 */
1846 I32 tmp = 1; /* Scratch variable? */
1847 const bool utf8_target = reginfo->is_utf8_target;
1848 UV utf8_fold_flags = 0;
1849 const bool is_utf8_pat = reginfo->is_utf8_pat;
1850 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1851 with a result inverts that result, as 0^1 =
1853 _char_class_number classnum;
1855 RXi_GET_DECL(prog,progi);
1857 PERL_ARGS_ASSERT_FIND_BYCLASS;
1859 /* We know what class it must start with. */
1862 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1864 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
1865 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1872 REXEC_FBC_UTF8_CLASS_SCAN(
1873 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1876 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s, 0));
1880 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1881 assert(! is_utf8_pat);
1884 if (is_utf8_pat || utf8_target) {
1885 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1886 goto do_exactf_utf8;
1888 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1889 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1890 goto do_exactf_non_utf8; /* isn't dealt with by these */
1892 case EXACTF: /* This node only generated for non-utf8 patterns */
1893 assert(! is_utf8_pat);
1895 utf8_fold_flags = 0;
1896 goto do_exactf_utf8;
1898 fold_array = PL_fold;
1900 goto do_exactf_non_utf8;
1903 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1904 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1905 utf8_fold_flags = FOLDEQ_LOCALE;
1906 goto do_exactf_utf8;
1908 fold_array = PL_fold_locale;
1909 folder = foldEQ_locale;
1910 goto do_exactf_non_utf8;
1914 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1916 goto do_exactf_utf8;
1919 if (! utf8_target) { /* All code points in this node require
1920 UTF-8 to express. */
1923 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1924 | FOLDEQ_S2_FOLDS_SANE;
1925 goto do_exactf_utf8;
1928 if (is_utf8_pat || utf8_target) {
1929 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1930 goto do_exactf_utf8;
1933 /* Any 'ss' in the pattern should have been replaced by regcomp,
1934 * so we don't have to worry here about this single special case
1935 * in the Latin1 range */
1936 fold_array = PL_fold_latin1;
1937 folder = foldEQ_latin1;
1941 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1942 are no glitches with fold-length differences
1943 between the target string and pattern */
1945 /* The idea in the non-utf8 EXACTF* cases is to first find the
1946 * first character of the EXACTF* node and then, if necessary,
1947 * case-insensitively compare the full text of the node. c1 is the
1948 * first character. c2 is its fold. This logic will not work for
1949 * Unicode semantics and the german sharp ss, which hence should
1950 * not be compiled into a node that gets here. */
1951 pat_string = STRING(c);
1952 ln = STR_LEN(c); /* length to match in octets/bytes */
1954 /* We know that we have to match at least 'ln' bytes (which is the
1955 * same as characters, since not utf8). If we have to match 3
1956 * characters, and there are only 2 availabe, we know without
1957 * trying that it will fail; so don't start a match past the
1958 * required minimum number from the far end */
1959 e = HOP3c(strend, -((SSize_t)ln), s);
1961 if (reginfo->intuit && e < s) {
1962 e = s; /* Due to minlen logic of intuit() */
1966 c2 = fold_array[c1];
1967 if (c1 == c2) { /* If char and fold are the same */
1968 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1971 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1979 /* If one of the operands is in utf8, we can't use the simpler folding
1980 * above, due to the fact that many different characters can have the
1981 * same fold, or portion of a fold, or different- length fold */
1982 pat_string = STRING(c);
1983 ln = STR_LEN(c); /* length to match in octets/bytes */
1984 pat_end = pat_string + ln;
1985 lnc = is_utf8_pat /* length to match in characters */
1986 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1989 /* We have 'lnc' characters to match in the pattern, but because of
1990 * multi-character folding, each character in the target can match
1991 * up to 3 characters (Unicode guarantees it will never exceed
1992 * this) if it is utf8-encoded; and up to 2 if not (based on the
1993 * fact that the Latin 1 folds are already determined, and the
1994 * only multi-char fold in that range is the sharp-s folding to
1995 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1996 * string character. Adjust lnc accordingly, rounding up, so that
1997 * if we need to match at least 4+1/3 chars, that really is 5. */
1998 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1999 lnc = (lnc + expansion - 1) / expansion;
2001 /* As in the non-UTF8 case, if we have to match 3 characters, and
2002 * only 2 are left, it's guaranteed to fail, so don't start a
2003 * match that would require us to go beyond the end of the string
2005 e = HOP3c(strend, -((SSize_t)lnc), s);
2007 if (reginfo->intuit && e < s) {
2008 e = s; /* Due to minlen logic of intuit() */
2011 /* XXX Note that we could recalculate e to stop the loop earlier,
2012 * as the worst case expansion above will rarely be met, and as we
2013 * go along we would usually find that e moves further to the left.
2014 * This would happen only after we reached the point in the loop
2015 * where if there were no expansion we should fail. Unclear if
2016 * worth the expense */
2019 char *my_strend= (char *)strend;
2020 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2021 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2022 && (reginfo->intuit || regtry(reginfo, &s)) )
2026 s += (utf8_target) ? UTF8SKIP(s) : 1;
2032 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2033 if (FLAGS(c) != TRADITIONAL_BOUND) {
2034 if (! IN_UTF8_CTYPE_LOCALE) {
2035 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2036 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2041 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2045 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2046 if (FLAGS(c) != TRADITIONAL_BOUND) {
2047 if (! IN_UTF8_CTYPE_LOCALE) {
2048 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2049 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2054 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2057 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2059 assert(FLAGS(c) == TRADITIONAL_BOUND);
2061 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2064 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2066 assert(FLAGS(c) == TRADITIONAL_BOUND);
2068 FBC_BOUND_A(isWORDCHAR_A);
2071 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2073 assert(FLAGS(c) == TRADITIONAL_BOUND);
2075 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2078 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2080 assert(FLAGS(c) == TRADITIONAL_BOUND);
2082 FBC_NBOUND_A(isWORDCHAR_A);
2086 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2087 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2098 switch((bound_type) FLAGS(c)) {
2099 case TRADITIONAL_BOUND:
2100 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2103 if (s == reginfo->strbeg) {
2104 if (reginfo->intuit || regtry(reginfo, &s))
2109 /* Didn't match. Try at the next position (if there is one) */
2110 s += (utf8_target) ? UTF8SKIP(s) : 1;
2111 if (UNLIKELY(s >= reginfo->strend)) {
2117 GCB_enum before = getGCB_VAL_UTF8(
2119 (U8*)(reginfo->strbeg)),
2120 (U8*) reginfo->strend);
2121 while (s < strend) {
2122 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2123 (U8*) reginfo->strend);
2124 if ( (to_complement ^ isGCB(before,
2126 (U8*) reginfo->strbeg,
2129 && (reginfo->intuit || regtry(reginfo, &s)))
2137 else { /* Not utf8. Everything is a GCB except between CR and
2139 while (s < strend) {
2140 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2141 || UCHARAT(s) != '\n'))
2142 && (reginfo->intuit || regtry(reginfo, &s)))
2150 /* And, since this is a bound, it can match after the final
2151 * character in the string */
2152 if ((reginfo->intuit || regtry(reginfo, &s))) {
2158 if (s == reginfo->strbeg) {
2159 if (reginfo->intuit || regtry(reginfo, &s)) {
2162 s += (utf8_target) ? UTF8SKIP(s) : 1;
2163 if (UNLIKELY(s >= reginfo->strend)) {
2169 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2171 (U8*)(reginfo->strbeg)),
2172 (U8*) reginfo->strend);
2173 while (s < strend) {
2174 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2175 if (to_complement ^ isLB(before,
2177 (U8*) reginfo->strbeg,
2179 (U8*) reginfo->strend,
2181 && (reginfo->intuit || regtry(reginfo, &s)))
2189 else { /* Not utf8. */
2190 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2191 while (s < strend) {
2192 LB_enum after = getLB_VAL_CP((U8) *s);
2193 if (to_complement ^ isLB(before,
2195 (U8*) reginfo->strbeg,
2197 (U8*) reginfo->strend,
2199 && (reginfo->intuit || regtry(reginfo, &s)))
2208 if (reginfo->intuit || regtry(reginfo, &s)) {
2215 if (s == reginfo->strbeg) {
2216 if (reginfo->intuit || regtry(reginfo, &s)) {
2219 s += (utf8_target) ? UTF8SKIP(s) : 1;
2220 if (UNLIKELY(s >= reginfo->strend)) {
2226 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2228 (U8*)(reginfo->strbeg)),
2229 (U8*) reginfo->strend);
2230 while (s < strend) {
2231 SB_enum after = getSB_VAL_UTF8((U8*) s,
2232 (U8*) reginfo->strend);
2233 if ((to_complement ^ isSB(before,
2235 (U8*) reginfo->strbeg,
2237 (U8*) reginfo->strend,
2239 && (reginfo->intuit || regtry(reginfo, &s)))
2247 else { /* Not utf8. */
2248 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2249 while (s < strend) {
2250 SB_enum after = getSB_VAL_CP((U8) *s);
2251 if ((to_complement ^ isSB(before,
2253 (U8*) reginfo->strbeg,
2255 (U8*) reginfo->strend,
2257 && (reginfo->intuit || regtry(reginfo, &s)))
2266 /* Here are at the final position in the target string. The SB
2267 * value is always true here, so matches, depending on other
2269 if (reginfo->intuit || regtry(reginfo, &s)) {
2276 if (s == reginfo->strbeg) {
2277 if (reginfo->intuit || regtry(reginfo, &s)) {
2280 s += (utf8_target) ? UTF8SKIP(s) : 1;
2281 if (UNLIKELY(s >= reginfo->strend)) {
2287 /* We are at a boundary between char_sub_0 and char_sub_1.
2288 * We also keep track of the value for char_sub_-1 as we
2289 * loop through the line. Context may be needed to make a
2290 * determination, and if so, this can save having to
2292 WB_enum previous = WB_UNKNOWN;
2293 WB_enum before = getWB_VAL_UTF8(
2296 (U8*)(reginfo->strbeg)),
2297 (U8*) reginfo->strend);
2298 while (s < strend) {
2299 WB_enum after = getWB_VAL_UTF8((U8*) s,
2300 (U8*) reginfo->strend);
2301 if ((to_complement ^ isWB(previous,
2304 (U8*) reginfo->strbeg,
2306 (U8*) reginfo->strend,
2308 && (reginfo->intuit || regtry(reginfo, &s)))
2317 else { /* Not utf8. */
2318 WB_enum previous = WB_UNKNOWN;
2319 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2320 while (s < strend) {
2321 WB_enum after = getWB_VAL_CP((U8) *s);
2322 if ((to_complement ^ isWB(previous,
2325 (U8*) reginfo->strbeg,
2327 (U8*) reginfo->strend,
2329 && (reginfo->intuit || regtry(reginfo, &s)))
2339 if (reginfo->intuit || regtry(reginfo, &s)) {
2346 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2347 is_LNBREAK_latin1_safe(s, strend)
2351 /* The argument to all the POSIX node types is the class number to pass to
2352 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2359 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2360 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2361 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2376 /* The complement of something that matches only ASCII matches all
2377 * non-ASCII, plus everything in ASCII that isn't in the class. */
2378 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2379 || ! _generic_isCC_A(*s, FLAGS(c)));
2388 /* Don't need to worry about utf8, as it can match only a single
2389 * byte invariant character. */
2390 REXEC_FBC_CLASS_SCAN(
2391 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2399 if (! utf8_target) {
2400 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2406 classnum = (_char_class_number) FLAGS(c);
2407 if (classnum < _FIRST_NON_SWASH_CC) {
2408 while (s < strend) {
2410 /* We avoid loading in the swash as long as possible, but
2411 * should we have to, we jump to a separate loop. This
2412 * extra 'if' statement is what keeps this code from being
2413 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2414 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2415 goto found_above_latin1;
2417 if ((UTF8_IS_INVARIANT(*s)
2418 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2420 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2421 && to_complement ^ cBOOL(
2422 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
2426 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2438 else switch (classnum) { /* These classes are implemented as
2440 case _CC_ENUM_SPACE:
2441 REXEC_FBC_UTF8_CLASS_SCAN(
2442 to_complement ^ cBOOL(isSPACE_utf8(s)));
2445 case _CC_ENUM_BLANK:
2446 REXEC_FBC_UTF8_CLASS_SCAN(
2447 to_complement ^ cBOOL(isBLANK_utf8(s)));
2450 case _CC_ENUM_XDIGIT:
2451 REXEC_FBC_UTF8_CLASS_SCAN(
2452 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2455 case _CC_ENUM_VERTSPACE:
2456 REXEC_FBC_UTF8_CLASS_SCAN(
2457 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2460 case _CC_ENUM_CNTRL:
2461 REXEC_FBC_UTF8_CLASS_SCAN(
2462 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2466 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2467 NOT_REACHED; /* NOTREACHED */
2472 found_above_latin1: /* Here we have to load a swash to get the result
2473 for the current code point */
2474 if (! PL_utf8_swash_ptrs[classnum]) {
2475 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2476 PL_utf8_swash_ptrs[classnum] =
2477 _core_swash_init("utf8",
2480 PL_XPosix_ptrs[classnum], &flags);
2483 /* This is a copy of the loop above for swash classes, though using the
2484 * FBC macro instead of being expanded out. Since we've loaded the
2485 * swash, we don't have to check for that each time through the loop */
2486 REXEC_FBC_UTF8_CLASS_SCAN(
2487 to_complement ^ cBOOL(_generic_utf8(
2490 swash_fetch(PL_utf8_swash_ptrs[classnum],
2498 /* what trie are we using right now */
2499 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2500 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2501 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2503 const char *last_start = strend - trie->minlen;
2505 const char *real_start = s;
2507 STRLEN maxlen = trie->maxlen;
2509 U8 **points; /* map of where we were in the input string
2510 when reading a given char. For ASCII this
2511 is unnecessary overhead as the relationship
2512 is always 1:1, but for Unicode, especially
2513 case folded Unicode this is not true. */
2514 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2518 GET_RE_DEBUG_FLAGS_DECL;
2520 /* We can't just allocate points here. We need to wrap it in
2521 * an SV so it gets freed properly if there is a croak while
2522 * running the match */
2525 sv_points=newSV(maxlen * sizeof(U8 *));
2526 SvCUR_set(sv_points,
2527 maxlen * sizeof(U8 *));
2528 SvPOK_on(sv_points);
2529 sv_2mortal(sv_points);
2530 points=(U8**)SvPV_nolen(sv_points );
2531 if ( trie_type != trie_utf8_fold
2532 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2535 bitmap=(U8*)trie->bitmap;
2537 bitmap=(U8*)ANYOF_BITMAP(c);
2539 /* this is the Aho-Corasick algorithm modified a touch
2540 to include special handling for long "unknown char" sequences.
2541 The basic idea being that we use AC as long as we are dealing
2542 with a possible matching char, when we encounter an unknown char
2543 (and we have not encountered an accepting state) we scan forward
2544 until we find a legal starting char.
2545 AC matching is basically that of trie matching, except that when
2546 we encounter a failing transition, we fall back to the current
2547 states "fail state", and try the current char again, a process
2548 we repeat until we reach the root state, state 1, or a legal
2549 transition. If we fail on the root state then we can either
2550 terminate if we have reached an accepting state previously, or
2551 restart the entire process from the beginning if we have not.
2554 while (s <= last_start) {
2555 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2563 U8 *uscan = (U8*)NULL;
2564 U8 *leftmost = NULL;
2566 U32 accepted_word= 0;
2570 while ( state && uc <= (U8*)strend ) {
2572 U32 word = aho->states[ state ].wordnum;
2576 DEBUG_TRIE_EXECUTE_r(
2577 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2578 dump_exec_pos( (char *)uc, c, strend, real_start,
2579 (char *)uc, utf8_target, 0 );
2580 Perl_re_printf( aTHX_
2581 " Scanning for legal start char...\n");
2585 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2589 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2595 if (uc >(U8*)last_start) break;
2599 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2600 if (!leftmost || lpos < leftmost) {
2601 DEBUG_r(accepted_word=word);
2607 points[pointpos++ % maxlen]= uc;
2608 if (foldlen || uc < (U8*)strend) {
2609 REXEC_TRIE_READ_CHAR(trie_type, trie,
2611 uscan, len, uvc, charid, foldlen,
2613 DEBUG_TRIE_EXECUTE_r({
2614 dump_exec_pos( (char *)uc, c, strend,
2615 real_start, s, utf8_target, 0);
2616 Perl_re_printf( aTHX_
2617 " Charid:%3u CP:%4"UVxf" ",
2629 word = aho->states[ state ].wordnum;
2631 base = aho->states[ state ].trans.base;
2633 DEBUG_TRIE_EXECUTE_r({
2635 dump_exec_pos( (char *)uc, c, strend, real_start,
2636 s, utf8_target, 0 );
2637 Perl_re_printf( aTHX_
2638 "%sState: %4"UVxf", word=%"UVxf,
2639 failed ? " Fail transition to " : "",
2640 (UV)state, (UV)word);
2646 ( ((offset = base + charid
2647 - 1 - trie->uniquecharcount)) >= 0)
2648 && ((U32)offset < trie->lasttrans)
2649 && trie->trans[offset].check == state
2650 && (tmp=trie->trans[offset].next))
2652 DEBUG_TRIE_EXECUTE_r(
2653 Perl_re_printf( aTHX_ " - legal\n"));
2658 DEBUG_TRIE_EXECUTE_r(
2659 Perl_re_printf( aTHX_ " - fail\n"));
2661 state = aho->fail[state];
2665 /* we must be accepting here */
2666 DEBUG_TRIE_EXECUTE_r(
2667 Perl_re_printf( aTHX_ " - accepting\n"));
2676 if (!state) state = 1;
2679 if ( aho->states[ state ].wordnum ) {
2680 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2681 if (!leftmost || lpos < leftmost) {
2682 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2687 s = (char*)leftmost;
2688 DEBUG_TRIE_EXECUTE_r({
2689 Perl_re_printf( aTHX_ "Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2690 (UV)accepted_word, (IV)(s - real_start)
2693 if (reginfo->intuit || regtry(reginfo, &s)) {
2699 DEBUG_TRIE_EXECUTE_r({
2700 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
2703 DEBUG_TRIE_EXECUTE_r(
2704 Perl_re_printf( aTHX_ "No match.\n"));
2713 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2720 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2721 * flags have same meanings as with regexec_flags() */
2724 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2731 struct regexp *const prog = ReANY(rx);
2733 if (flags & REXEC_COPY_STR) {
2736 DEBUG_C(Perl_re_printf( aTHX_
2737 "Copy on write: regexp capture, type %d\n",
2739 /* Create a new COW SV to share the match string and store
2740 * in saved_copy, unless the current COW SV in saved_copy
2741 * is valid and suitable for our purpose */
2742 if (( prog->saved_copy
2743 && SvIsCOW(prog->saved_copy)
2744 && SvPOKp(prog->saved_copy)
2747 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2749 /* just reuse saved_copy SV */
2750 if (RXp_MATCH_COPIED(prog)) {
2751 Safefree(prog->subbeg);
2752 RXp_MATCH_COPIED_off(prog);
2756 /* create new COW SV to share string */
2757 RX_MATCH_COPY_FREE(rx);
2758 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2760 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2761 assert (SvPOKp(prog->saved_copy));
2762 prog->sublen = strend - strbeg;
2763 prog->suboffset = 0;
2764 prog->subcoffset = 0;
2769 SSize_t max = strend - strbeg;
2772 if ( (flags & REXEC_COPY_SKIP_POST)
2773 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2774 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2775 ) { /* don't copy $' part of string */
2778 /* calculate the right-most part of the string covered
2779 * by a capture. Due to lookahead, this may be to
2780 * the right of $&, so we have to scan all captures */
2781 while (n <= prog->lastparen) {
2782 if (prog->offs[n].end > max)
2783 max = prog->offs[n].end;
2787 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2788 ? prog->offs[0].start
2790 assert(max >= 0 && max <= strend - strbeg);
2793 if ( (flags & REXEC_COPY_SKIP_PRE)
2794 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2795 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2796 ) { /* don't copy $` part of string */
2799 /* calculate the left-most part of the string covered
2800 * by a capture. Due to lookbehind, this may be to
2801 * the left of $&, so we have to scan all captures */
2802 while (min && n <= prog->lastparen) {
2803 if ( prog->offs[n].start != -1
2804 && prog->offs[n].start < min)
2806 min = prog->offs[n].start;
2810 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2811 && min > prog->offs[0].end
2813 min = prog->offs[0].end;
2817 assert(min >= 0 && min <= max && min <= strend - strbeg);
2820 if (RX_MATCH_COPIED(rx)) {
2821 if (sublen > prog->sublen)
2823 (char*)saferealloc(prog->subbeg, sublen+1);
2826 prog->subbeg = (char*)safemalloc(sublen+1);
2827 Copy(strbeg + min, prog->subbeg, sublen, char);
2828 prog->subbeg[sublen] = '\0';
2829 prog->suboffset = min;
2830 prog->sublen = sublen;
2831 RX_MATCH_COPIED_on(rx);
2833 prog->subcoffset = prog->suboffset;
2834 if (prog->suboffset && utf8_target) {
2835 /* Convert byte offset to chars.
2836 * XXX ideally should only compute this if @-/@+
2837 * has been seen, a la PL_sawampersand ??? */
2839 /* If there's a direct correspondence between the
2840 * string which we're matching and the original SV,
2841 * then we can use the utf8 len cache associated with
2842 * the SV. In particular, it means that under //g,
2843 * sv_pos_b2u() will use the previously cached
2844 * position to speed up working out the new length of
2845 * subcoffset, rather than counting from the start of
2846 * the string each time. This stops
2847 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2848 * from going quadratic */
2849 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2850 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2851 SV_GMAGIC|SV_CONST_RETURN);
2853 prog->subcoffset = utf8_length((U8*)strbeg,
2854 (U8*)(strbeg+prog->suboffset));
2858 RX_MATCH_COPY_FREE(rx);
2859 prog->subbeg = strbeg;
2860 prog->suboffset = 0;
2861 prog->subcoffset = 0;
2862 prog->sublen = strend - strbeg;
2870 - regexec_flags - match a regexp against a string
2873 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2874 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2875 /* stringarg: the point in the string at which to begin matching */
2876 /* strend: pointer to null at end of string */
2877 /* strbeg: real beginning of string */
2878 /* minend: end of match must be >= minend bytes after stringarg. */
2879 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2880 * itself is accessed via the pointers above */
2881 /* data: May be used for some additional optimizations.
2882 Currently unused. */
2883 /* flags: For optimizations. See REXEC_* in regexp.h */
2886 struct regexp *const prog = ReANY(rx);
2890 SSize_t minlen; /* must match at least this many chars */
2891 SSize_t dontbother = 0; /* how many characters not to try at end */
2892 const bool utf8_target = cBOOL(DO_UTF8(sv));
2894 RXi_GET_DECL(prog,progi);
2895 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2896 regmatch_info *const reginfo = ®info_buf;
2897 regexp_paren_pair *swap = NULL;
2899 GET_RE_DEBUG_FLAGS_DECL;
2901 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2902 PERL_UNUSED_ARG(data);
2904 /* Be paranoid... */
2906 Perl_croak(aTHX_ "NULL regexp parameter");
2910 debug_start_match(rx, utf8_target, stringarg, strend,
2914 startpos = stringarg;
2916 /* set these early as they may be used by the HOP macros below */
2917 reginfo->strbeg = strbeg;
2918 reginfo->strend = strend;
2919 reginfo->is_utf8_target = cBOOL(utf8_target);
2921 if (prog->intflags & PREGf_GPOS_SEEN) {
2924 /* set reginfo->ganch, the position where \G can match */
2927 (flags & REXEC_IGNOREPOS)
2928 ? stringarg /* use start pos rather than pos() */
2929 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2930 /* Defined pos(): */
2931 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2932 : strbeg; /* pos() not defined; use start of string */
2934 DEBUG_GPOS_r(Perl_re_printf( aTHX_
2935 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2937 /* in the presence of \G, we may need to start looking earlier in
2938 * the string than the suggested start point of stringarg:
2939 * if prog->gofs is set, then that's a known, fixed minimum
2942 * /ab|c\G/: gofs = 1
2943 * or if the minimum offset isn't known, then we have to go back
2944 * to the start of the string, e.g. /w+\G/
2947 if (prog->intflags & PREGf_ANCH_GPOS) {
2949 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2951 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2953 DEBUG_r(Perl_re_printf( aTHX_
2954 "fail: ganch-gofs before earliest possible start\n"));
2959 startpos = reginfo->ganch;
2961 else if (prog->gofs) {
2962 startpos = HOPBACKc(startpos, prog->gofs);
2966 else if (prog->intflags & PREGf_GPOS_FLOAT)
2970 minlen = prog->minlen;
2971 if ((startpos + minlen) > strend || startpos < strbeg) {
2972 DEBUG_r(Perl_re_printf( aTHX_
2973 "Regex match can't succeed, so not even tried\n"));
2977 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2978 * which will call destuctors to reset PL_regmatch_state, free higher
2979 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2980 * regmatch_info_aux_eval */
2982 oldsave = PL_savestack_ix;
2986 if ((prog->extflags & RXf_USE_INTUIT)
2987 && !(flags & REXEC_CHECKED))
2989 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2994 if (prog->extflags & RXf_CHECK_ALL) {
2995 /* we can match based purely on the result of INTUIT.
2996 * Set up captures etc just for $& and $-[0]
2997 * (an intuit-only match wont have $1,$2,..) */
2998 assert(!prog->nparens);
3000 /* s/// doesn't like it if $& is earlier than where we asked it to
3001 * start searching (which can happen on something like /.\G/) */
3002 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3005 /* this should only be possible under \G */
3006 assert(prog->intflags & PREGf_GPOS_SEEN);
3007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3008 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3012 /* match via INTUIT shouldn't have any captures.
3013 * Let @-, @+, $^N know */
3014 prog->lastparen = prog->lastcloseparen = 0;
3015 RX_MATCH_UTF8_set(rx, utf8_target);
3016 prog->offs[0].start = s - strbeg;
3017 prog->offs[0].end = utf8_target
3018 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3019 : s - strbeg + prog->minlenret;
3020 if ( !(flags & REXEC_NOT_FIRST) )
3021 S_reg_set_capture_string(aTHX_ rx,
3023 sv, flags, utf8_target);
3029 multiline = prog->extflags & RXf_PMf_MULTILINE;
3031 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3033 "String too short [regexec_flags]...\n"));
3037 /* Check validity of program. */
3038 if (UCHARAT(progi->program) != REG_MAGIC) {
3039 Perl_croak(aTHX_ "corrupted regexp program");
3042 RX_MATCH_TAINTED_off(rx);
3043 RX_MATCH_UTF8_set(rx, utf8_target);
3045 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3046 reginfo->intuit = 0;
3047 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3048 reginfo->warned = FALSE;
3050 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3051 /* see how far we have to get to not match where we matched before */
3052 reginfo->till = stringarg + minend;
3054 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3055 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3056 S_cleanup_regmatch_info_aux has executed (registered by
3057 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3058 magic belonging to this SV.
3059 Not newSVsv, either, as it does not COW.
3061 reginfo->sv = newSV(0);
3062 SvSetSV_nosteal(reginfo->sv, sv);
3063 SAVEFREESV(reginfo->sv);
3066 /* reserve next 2 or 3 slots in PL_regmatch_state:
3067 * slot N+0: may currently be in use: skip it
3068 * slot N+1: use for regmatch_info_aux struct
3069 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3070 * slot N+3: ready for use by regmatch()
3074 regmatch_state *old_regmatch_state;
3075 regmatch_slab *old_regmatch_slab;
3076 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3078 /* on first ever match, allocate first slab */
3079 if (!PL_regmatch_slab) {
3080 Newx(PL_regmatch_slab, 1, regmatch_slab);
3081 PL_regmatch_slab->prev = NULL;
3082 PL_regmatch_slab->next = NULL;
3083 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3086 old_regmatch_state = PL_regmatch_state;
3087 old_regmatch_slab = PL_regmatch_slab;
3089 for (i=0; i <= max; i++) {
3091 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3093 reginfo->info_aux_eval =
3094 reginfo->info_aux->info_aux_eval =
3095 &(PL_regmatch_state->u.info_aux_eval);
3097 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3098 PL_regmatch_state = S_push_slab(aTHX);
3101 /* note initial PL_regmatch_state position; at end of match we'll
3102 * pop back to there and free any higher slabs */
3104 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3105 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3106 reginfo->info_aux->poscache = NULL;
3108 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3110 if ((prog->extflags & RXf_EVAL_SEEN))
3111 S_setup_eval_state(aTHX_ reginfo);
3113 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3116 /* If there is a "must appear" string, look for it. */
3118 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3119 /* We have to be careful. If the previous successful match
3120 was from this regex we don't want a subsequent partially
3121 successful match to clobber the old results.
3122 So when we detect this possibility we add a swap buffer
3123 to the re, and switch the buffer each match. If we fail,
3124 we switch it back; otherwise we leave it swapped.
3127 /* do we need a save destructor here for eval dies? */
3128 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3129 DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
3130 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3137 if (prog->recurse_locinput)
3138 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3140 /* Simplest case: anchored match need be tried only once, or with
3141 * MBOL, only at the beginning of each line.
3143 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3144 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3145 * match at the start of the string then it won't match anywhere else
3146 * either; while with /.*.../, if it doesn't match at the beginning,
3147 * the earliest it could match is at the start of the next line */
3149 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3152 if (regtry(reginfo, &s))
3155 if (!(prog->intflags & PREGf_ANCH_MBOL))
3158 /* didn't match at start, try at other newline positions */
3161 dontbother = minlen - 1;
3162 end = HOP3c(strend, -dontbother, strbeg) - 1;
3164 /* skip to next newline */
3166 while (s <= end) { /* note it could be possible to match at the end of the string */
3167 /* NB: newlines are the same in unicode as they are in latin */
3170 if (prog->check_substr || prog->check_utf8) {
3171 /* note that with PREGf_IMPLICIT, intuit can only fail
3172 * or return the start position, so it's of limited utility.
3173 * Nevertheless, I made the decision that the potential for
3174 * quick fail was still worth it - DAPM */
3175 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3179 if (regtry(reginfo, &s))
3183 } /* end anchored search */
3185 if (prog->intflags & PREGf_ANCH_GPOS)
3187 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3188 assert(prog->intflags & PREGf_GPOS_SEEN);
3189 /* For anchored \G, the only position it can match from is
3190 * (ganch-gofs); we already set startpos to this above; if intuit
3191 * moved us on from there, we can't possibly succeed */
3192 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3193 if (s == startpos && regtry(reginfo, &s))
3198 /* Messy cases: unanchored match. */
3199 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3200 /* we have /x+whatever/ */
3201 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3207 if (! prog->anchored_utf8) {
3208 to_utf8_substr(prog);
3210 ch = SvPVX_const(prog->anchored_utf8)[0];
3213 DEBUG_EXECUTE_r( did_match = 1 );
3214 if (regtry(reginfo, &s)) goto got_it;
3216 while (s < strend && *s == ch)
3223 if (! prog->anchored_substr) {
3224 if (! to_byte_substr(prog)) {
3225 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3228 ch = SvPVX_const(prog->anchored_substr)[0];
3231 DEBUG_EXECUTE_r( did_match = 1 );
3232 if (regtry(reginfo, &s)) goto got_it;
3234 while (s < strend && *s == ch)
3239 DEBUG_EXECUTE_r(if (!did_match)
3240 Perl_re_printf( aTHX_
3241 "Did not find anchored character...\n")
3244 else if (prog->anchored_substr != NULL
3245 || prog->anchored_utf8 != NULL
3246 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3247 && prog->float_max_offset < strend - s)) {
3252 char *last1; /* Last position checked before */
3256 if (prog->anchored_substr || prog->anchored_utf8) {
3258 if (! prog->anchored_utf8) {
3259 to_utf8_substr(prog);
3261 must = prog->anchored_utf8;
3264 if (! prog->anchored_substr) {
3265 if (! to_byte_substr(prog)) {
3266 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3269 must = prog->anchored_substr;
3271 back_max = back_min = prog->anchored_offset;
3274 if (! prog->float_utf8) {
3275 to_utf8_substr(prog);
3277 must = prog->float_utf8;
3280 if (! prog->float_substr) {
3281 if (! to_byte_substr(prog)) {
3282 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3285 must = prog->float_substr;
3287 back_max = prog->float_max_offset;
3288 back_min = prog->float_min_offset;
3294 last = HOP3c(strend, /* Cannot start after this */
3295 -(SSize_t)(CHR_SVLEN(must)
3296 - (SvTAIL(must) != 0) + back_min), strbeg);
3298 if (s > reginfo->strbeg)
3299 last1 = HOPc(s, -1);
3301 last1 = s - 1; /* bogus */
3303 /* XXXX check_substr already used to find "s", can optimize if
3304 check_substr==must. */
3306 strend = HOPc(strend, -dontbother);
3307 while ( (s <= last) &&
3308 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3309 (unsigned char*)strend, must,
3310 multiline ? FBMrf_MULTILINE : 0)) ) {
3311 DEBUG_EXECUTE_r( did_match = 1 );
3312 if (HOPc(s, -back_max) > last1) {
3313 last1 = HOPc(s, -back_min);
3314 s = HOPc(s, -back_max);
3317 char * const t = (last1 >= reginfo->strbeg)
3318 ? HOPc(last1, 1) : last1 + 1;
3320 last1 = HOPc(s, -back_min);
3324 while (s <= last1) {
3325 if (regtry(reginfo, &s))
3328 s++; /* to break out of outer loop */
3335 while (s <= last1) {
3336 if (regtry(reginfo, &s))
3342 DEBUG_EXECUTE_r(if (!did_match) {
3343 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3344 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3345 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
3346 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3347 ? "anchored" : "floating"),
3348 quoted, RE_SV_TAIL(must));
3352 else if ( (c = progi->regstclass) ) {
3354 const OPCODE op = OP(progi->regstclass);
3355 /* don't bother with what can't match */
3356 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3357 strend = HOPc(strend, -(minlen - 1));
3360 SV * const prop = sv_newmortal();
3361 regprop(prog, prop, c, reginfo, NULL);
3363 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3365 Perl_re_printf( aTHX_
3366 "Matching stclass %.*s against %s (%d bytes)\n",
3367 (int)SvCUR(prop), SvPVX_const(prop),
3368 quoted, (int)(strend - s));
3371 if (find_byclass(prog, c, s, strend, reginfo))
3373 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
3377 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3385 if (! prog->float_utf8) {
3386 to_utf8_substr(prog);
3388 float_real = prog->float_utf8;
3391 if (! prog->float_substr) {
3392 if (! to_byte_substr(prog)) {
3393 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3396 float_real = prog->float_substr;
3399 little = SvPV_const(float_real, len);
3400 if (SvTAIL(float_real)) {
3401 /* This means that float_real contains an artificial \n on
3402 * the end due to the presence of something like this:
3403 * /foo$/ where we can match both "foo" and "foo\n" at the
3404 * end of the string. So we have to compare the end of the
3405 * string first against the float_real without the \n and
3406 * then against the full float_real with the string. We
3407 * have to watch out for cases where the string might be
3408 * smaller than the float_real or the float_real without
3410 char *checkpos= strend - len;
3412 Perl_re_printf( aTHX_
3413 "%sChecking for float_real.%s\n",
3414 PL_colors[4], PL_colors[5]));
3415 if (checkpos + 1 < strbeg) {
3416 /* can't match, even if we remove the trailing \n
3417 * string is too short to match */
3419 Perl_re_printf( aTHX_
3420 "%sString shorter than required trailing substring, cannot match.%s\n",
3421 PL_colors[4], PL_colors[5]));
3423 } else if (memEQ(checkpos + 1, little, len - 1)) {
3424 /* can match, the end of the string matches without the
3426 last = checkpos + 1;
3427 } else if (checkpos < strbeg) {
3428 /* cant match, string is too short when the "\n" is
3431 Perl_re_printf( aTHX_
3432 "%sString does not contain required trailing substring, cannot match.%s\n",
3433 PL_colors[4], PL_colors[5]));
3435 } else if (!multiline) {
3436 /* non multiline match, so compare with the "\n" at the
3437 * end of the string */
3438 if (memEQ(checkpos, little, len)) {
3442 Perl_re_printf( aTHX_
3443 "%sString does not contain required trailing substring, cannot match.%s\n",
3444 PL_colors[4], PL_colors[5]));
3448 /* multiline match, so we have to search for a place
3449 * where the full string is located */
3455 last = rninstr(s, strend, little, little + len);
3457 last = strend; /* matching "$" */
3460 /* at one point this block contained a comment which was
3461 * probably incorrect, which said that this was a "should not
3462 * happen" case. Even if it was true when it was written I am
3463 * pretty sure it is not anymore, so I have removed the comment
3464 * and replaced it with this one. Yves */
3466 Perl_re_printf( aTHX_
3467 "%sString does not contain required substring, cannot match.%s\n",
3468 PL_colors[4], PL_colors[5]
3472 dontbother = strend - last + prog->float_min_offset;
3474 if (minlen && (dontbother < minlen))
3475 dontbother = minlen - 1;
3476 strend -= dontbother; /* this one's always in bytes! */
3477 /* We don't know much -- general case. */
3480 if (regtry(reginfo, &s))
3489 if (regtry(reginfo, &s))
3491 } while (s++ < strend);
3499 /* s/// doesn't like it if $& is earlier than where we asked it to
3500 * start searching (which can happen on something like /.\G/) */
3501 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3502 && (prog->offs[0].start < stringarg - strbeg))
3504 /* this should only be possible under \G */
3505 assert(prog->intflags & PREGf_GPOS_SEEN);
3506 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3507 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3513 Perl_re_printf( aTHX_
3514 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3521 /* clean up; this will trigger destructors that will free all slabs
3522 * above the current one, and cleanup the regmatch_info_aux
3523 * and regmatch_info_aux_eval sructs */
3525 LEAVE_SCOPE(oldsave);
3527 if (RXp_PAREN_NAMES(prog))
3528 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3530 /* make sure $`, $&, $', and $digit will work later */
3531 if ( !(flags & REXEC_NOT_FIRST) )
3532 S_reg_set_capture_string(aTHX_ rx,
3533 strbeg, reginfo->strend,
3534 sv, flags, utf8_target);
3539 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
3540 PL_colors[4], PL_colors[5]));
3542 /* clean up; this will trigger destructors that will free all slabs
3543 * above the current one, and cleanup the regmatch_info_aux
3544 * and regmatch_info_aux_eval sructs */
3546 LEAVE_SCOPE(oldsave);
3549 /* we failed :-( roll it back */
3550 DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
3551 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3556 Safefree(prog->offs);
3563 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3564 * Do inc before dec, in case old and new rex are the same */
3565 #define SET_reg_curpm(Re2) \
3566 if (reginfo->info_aux_eval) { \
3567 (void)ReREFCNT_inc(Re2); \
3568 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3569 PM_SETRE((PL_reg_curpm), (Re2)); \
3574 - regtry - try match at specific point
3576 STATIC bool /* 0 failure, 1 success */
3577 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3580 REGEXP *const rx = reginfo->prog;
3581 regexp *const prog = ReANY(rx);
3584 U32 depth = 0; /* used by REGCP_SET */
3586 RXi_GET_DECL(prog,progi);
3587 GET_RE_DEBUG_FLAGS_DECL;
3589 PERL_ARGS_ASSERT_REGTRY;
3591 reginfo->cutpoint=NULL;
3593 prog->offs[0].start = *startposp - reginfo->strbeg;
3594 prog->lastparen = 0;
3595 prog->lastcloseparen = 0;
3597 /* XXXX What this code is doing here?!!! There should be no need
3598 to do this again and again, prog->lastparen should take care of
3601 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3602 * Actually, the code in regcppop() (which Ilya may be meaning by
3603 * prog->lastparen), is not needed at all by the test suite
3604 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3605 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3606 * Meanwhile, this code *is* needed for the
3607 * above-mentioned test suite tests to succeed. The common theme
3608 * on those tests seems to be returning null fields from matches.
3609 * --jhi updated by dapm */
3611 if (prog->nparens) {
3612 regexp_paren_pair *pp = prog->offs;
3614 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3622 result = regmatch(reginfo, *startposp, progi->program + 1);
3624 prog->offs[0].end = result;
3627 if (reginfo->cutpoint)
3628 *startposp= reginfo->cutpoint;
3629 REGCP_UNWIND(lastcp);
3634 #define sayYES goto yes
3635 #define sayNO goto no
3636 #define sayNO_SILENT goto no_silent
3638 /* we dont use STMT_START/END here because it leads to
3639 "unreachable code" warnings, which are bogus, but distracting. */
3640 #define CACHEsayNO \
3641 if (ST.cache_mask) \
3642 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3645 /* this is used to determine how far from the left messages like
3646 'failed...' are printed in regexec.c. It should be set such that
3647 messages are inline with the regop output that created them.
3649 #define REPORT_CODE_OFF 29
3650 #define INDENT_CHARS(depth) ((int)(depth) % 20)
3653 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
3657 PerlIO *f= Perl_debug_log;
3658 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
3659 va_start(ap, depth);
3660 PerlIO_printf(f, "%*s|%4"UVuf"| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
3661 result = PerlIO_vprintf(f, fmt, ap);
3665 #endif /* DEBUGGING */
3668 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3669 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3670 #define CHRTEST_NOT_A_CP_1 -999
3671 #define CHRTEST_NOT_A_CP_2 -998
3673 /* grab a new slab and return the first slot in it */
3675 STATIC regmatch_state *
3678 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3681 regmatch_slab *s = PL_regmatch_slab->next;
3683 Newx(s, 1, regmatch_slab);
3684 s->prev = PL_regmatch_slab;
3686 PL_regmatch_slab->next = s;
3688 PL_regmatch_slab = s;
3689 return SLAB_FIRST(s);
3693 /* push a new state then goto it */
3695 #define PUSH_STATE_GOTO(state, node, input) \
3696 pushinput = input; \
3698 st->resume_state = state; \
3701 /* push a new state with success backtracking, then goto it */
3703 #define PUSH_YES_STATE_GOTO(state, node, input) \
3704 pushinput = input; \
3706 st->resume_state = state; \
3707 goto push_yes_state;
3714 regmatch() - main matching routine
3716 This is basically one big switch statement in a loop. We execute an op,
3717 set 'next' to point the next op, and continue. If we come to a point which
3718 we may need to backtrack to on failure such as (A|B|C), we push a
3719 backtrack state onto the backtrack stack. On failure, we pop the top
3720 state, and re-enter the loop at the state indicated. If there are no more
3721 states to pop, we return failure.
3723 Sometimes we also need to backtrack on success; for example /A+/, where
3724 after successfully matching one A, we need to go back and try to
3725 match another one; similarly for lookahead assertions: if the assertion
3726 completes successfully, we backtrack to the state just before the assertion
3727 and then carry on. In these cases, the pushed state is marked as
3728 'backtrack on success too'. This marking is in fact done by a chain of
3729 pointers, each pointing to the previous 'yes' state. On success, we pop to
3730 the nearest yes state, discarding any intermediate failure-only states.
3731 Sometimes a yes state is pushed just to force some cleanup code to be
3732 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3733 it to free the inner regex.
3735 Note that failure backtracking rewinds the cursor position, while
3736 success backtracking leaves it alone.
3738 A pattern is complete when the END op is executed, while a subpattern
3739 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3740 ops trigger the "pop to last yes state if any, otherwise return true"
3743 A common convention in this function is to use A and B to refer to the two
3744 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3745 the subpattern to be matched possibly multiple times, while B is the entire
3746 rest of the pattern. Variable and state names reflect this convention.
3748 The states in the main switch are the union of ops and failure/success of
3749 substates associated with with that op. For example, IFMATCH is the op
3750 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3751 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3752 successfully matched A and IFMATCH_A_fail is a state saying that we have
3753 just failed to match A. Resume states always come in pairs. The backtrack
3754 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3755 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3756 on success or failure.
3758 The struct that holds a backtracking state is actually a big union, with
3759 one variant for each major type of op. The variable st points to the
3760 top-most backtrack struct. To make the code clearer, within each
3761 block of code we #define ST to alias the relevant union.
3763 Here's a concrete example of a (vastly oversimplified) IFMATCH
3769 #define ST st->u.ifmatch
3771 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3772 ST.foo = ...; // some state we wish to save
3774 // push a yes backtrack state with a resume value of
3775 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3777 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3780 case IFMATCH_A: // we have successfully executed A; now continue with B
3782 bar = ST.foo; // do something with the preserved value
3785 case IFMATCH_A_fail: // A failed, so the assertion failed
3786 ...; // do some housekeeping, then ...
3787 sayNO; // propagate the failure
3794 For any old-timers reading this who are familiar with the old recursive
3795 approach, the code above is equivalent to:
3797 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3806 ...; // do some housekeeping, then ...
3807 sayNO; // propagate the failure
3810 The topmost backtrack state, pointed to by st, is usually free. If you
3811 want to claim it, populate any ST.foo fields in it with values you wish to
3812 save, then do one of
3814 PUSH_STATE_GOTO(resume_state, node, newinput);
3815 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3817 which sets that backtrack state's resume value to 'resume_state', pushes a
3818 new free entry to the top of the backtrack stack, then goes to 'node'.
3819 On backtracking, the free slot is popped, and the saved state becomes the
3820 new free state. An ST.foo field in this new top state can be temporarily
3821 accessed to retrieve values, but once the main loop is re-entered, it
3822 becomes available for reuse.
3824 Note that the depth of the backtrack stack constantly increases during the
3825 left-to-right execution of the pattern, rather than going up and down with
3826 the pattern nesting. For example the stack is at its maximum at Z at the
3827 end of the pattern, rather than at X in the following:
3829 /(((X)+)+)+....(Y)+....Z/
3831 The only exceptions to this are lookahead/behind assertions and the cut,
3832 (?>A), which pop all the backtrack states associated with A before
3835 Backtrack state structs are allocated in slabs of about 4K in size.
3836 PL_regmatch_state and st always point to the currently active state,
3837 and PL_regmatch_slab points to the slab currently containing
3838 PL_regmatch_state. The first time regmatch() is called, the first slab is
3839 allocated, and is never freed until interpreter destruction. When the slab
3840 is full, a new one is allocated and chained to the end. At exit from
3841 regmatch(), slabs allocated since entry are freed.
3846 #define DEBUG_STATE_pp(pp) \
3848 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
3849 Perl_re_printf( aTHX_ \
3850 "%*s" pp " %s%s%s%s%s\n", \
3851 INDENT_CHARS(depth), "", \
3852 PL_reg_name[st->resume_state], \
3853 ((st==yes_state||st==mark_state) ? "[" : ""), \
3854 ((st==yes_state) ? "Y" : ""), \
3855 ((st==mark_state) ? "M" : ""), \
3856 ((st==yes_state||st==mark_state) ? "]" : "") \
3861 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3866 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3867 const char *start, const char *end, const char *blurb)
3869 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3871 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3876 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3877 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3879 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3880 start, end - start, 60);
3882 Perl_re_printf( aTHX_
3883 "%s%s REx%s %s against %s\n",
3884 PL_colors[4], blurb, PL_colors[5], s0, s1);
3886 if (utf8_target||utf8_pat)
3887 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
3888 utf8_pat ? "pattern" : "",
3889 utf8_pat && utf8_target ? " and " : "",
3890 utf8_target ? "string" : ""
3896 S_dump_exec_pos(pTHX_ const char *locinput,
3897 const regnode *scan,
3898 const char *loc_regeol,
3899 const char *loc_bostr,
3900 const char *loc_reg_starttry,
3901 const bool utf8_target,
3905 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3906 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3907 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3908 /* The part of the string before starttry has one color
3909 (pref0_len chars), between starttry and current
3910 position another one (pref_len - pref0_len chars),
3911 after the current position the third one.
3912 We assume that pref0_len <= pref_len, otherwise we
3913 decrease pref0_len. */
3914 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3915 ? (5 + taill) - l : locinput - loc_bostr;
3918 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3920 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3922 pref0_len = pref_len - (locinput - loc_reg_starttry);
3923 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3924 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3925 ? (5 + taill) - pref_len : loc_regeol - locinput);
3926 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3930 if (pref0_len > pref_len)
3931 pref0_len = pref_len;
3933 const int is_uni = utf8_target ? 1 : 0;
3935 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3936 (locinput - pref_len),pref0_len, 60, 4, 5);
3938 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3939 (locinput - pref_len + pref0_len),
3940 pref_len - pref0_len, 60, 2, 3);
3942 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3943 locinput, loc_regeol - locinput, 10, 0, 1);
3945 const STRLEN tlen=len0+len1+len2;
3946 Perl_re_printf( aTHX_
3947 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|%4u| ",
3948 (IV)(locinput - loc_bostr),
3951 (docolor ? "" : "> <"),
3953 (int)(tlen > 19 ? 0 : 19 - tlen),
3961 /* reg_check_named_buff_matched()
3962 * Checks to see if a named buffer has matched. The data array of
3963 * buffer numbers corresponding to the buffer is expected to reside
3964 * in the regexp->data->data array in the slot stored in the ARG() of
3965 * node involved. Note that this routine doesn't actually care about the
3966 * name, that information is not preserved from compilation to execution.
3967 * Returns the index of the leftmost defined buffer with the given name
3968 * or 0 if non of the buffers matched.
3971 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3974 RXi_GET_DECL(rex,rexi);
3975 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3976 I32 *nums=(I32*)SvPVX(sv_dat);
3978 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3980 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3981 if ((I32)rex->lastparen >= nums[n] &&
3982 rex->offs[nums[n]].end != -1)
3992 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3993 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3995 /* This function determines if there are one or two characters that match
3996 * the first character of the passed-in EXACTish node <text_node>, and if
3997 * so, returns them in the passed-in pointers.
3999 * If it determines that no possible character in the target string can
4000 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
4001 * the first character in <text_node> requires UTF-8 to represent, and the
4002 * target string isn't in UTF-8.)
4004 * If there are more than two characters that could match the beginning of
4005 * <text_node>, or if more context is required to determine a match or not,
4006 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4008 * The motiviation behind this function is to allow the caller to set up
4009 * tight loops for matching. If <text_node> is of type EXACT, there is
4010 * only one possible character that can match its first character, and so
4011 * the situation is quite simple. But things get much more complicated if
4012 * folding is involved. It may be that the first character of an EXACTFish
4013 * node doesn't participate in any possible fold, e.g., punctuation, so it
4014 * can be matched only by itself. The vast majority of characters that are
4015 * in folds match just two things, their lower and upper-case equivalents.
4016 * But not all are like that; some have multiple possible matches, or match
4017 * sequences of more than one character. This function sorts all that out.
4019 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
4020 * loop of trying to match A*, we know we can't exit where the thing
4021 * following it isn't a B. And something can't be a B unless it is the
4022 * beginning of B. By putting a quick test for that beginning in a tight
4023 * loop, we can rule out things that can't possibly be B without having to
4024 * break out of the loop, thus avoiding work. Similarly, if A is a single
4025 * character, we can make a tight loop matching A*, using the outputs of
4028 * If the target string to match isn't in UTF-8, and there aren't
4029 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4030 * the one or two possible octets (which are characters in this situation)
4031 * that can match. In all cases, if there is only one character that can
4032 * match, *<c1p> and *<c2p> will be identical.
4034 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4035 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4036 * can match the beginning of <text_node>. They should be declared with at
4037 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
4038 * undefined what these contain.) If one or both of the buffers are
4039 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4040 * corresponding invariant. If variant, the corresponding *<c1p> and/or
4041 * *<c2p> will be set to a negative number(s) that shouldn't match any code
4042 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
4043 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4045 const bool utf8_target = reginfo->is_utf8_target;
4047 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4048 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4049 bool use_chrtest_void = FALSE;
4050 const bool is_utf8_pat = reginfo->is_utf8_pat;
4052 /* Used when we have both utf8 input and utf8 output, to avoid converting
4053 * to/from code points */
4054 bool utf8_has_been_setup = FALSE;
4058 U8 *pat = (U8*)STRING(text_node);
4059 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4061 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4063 /* In an exact node, only one thing can be matched, that first
4064 * character. If both the pat and the target are UTF-8, we can just
4065 * copy the input to the output, avoiding finding the code point of
4070 else if (utf8_target) {
4071 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4072 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4073 utf8_has_been_setup = TRUE;
4076 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4079 else { /* an EXACTFish node */
4080 U8 *pat_end = pat + STR_LEN(text_node);
4082 /* An EXACTFL node has at least some characters unfolded, because what
4083 * they match is not known until now. So, now is the time to fold
4084 * the first few of them, as many as are needed to determine 'c1' and
4085 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
4086 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4087 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
4088 * need to fold as many characters as a single character can fold to,
4089 * so that later we can check if the first ones are such a multi-char
4090 * fold. But, in such a pattern only locale-problematic characters
4091 * aren't folded, so we can skip this completely if the first character
4092 * in the node isn't one of the tricky ones */
4093 if (OP(text_node) == EXACTFL) {
4095 if (! is_utf8_pat) {
4096 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4098 folded[0] = folded[1] = 's';
4100 pat_end = folded + 2;
4103 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4108 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4110 *(d++) = (U8) toFOLD_LC(*s);
4115 _to_utf8_fold_flags(s,
4118 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4129 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4130 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4132 /* Multi-character folds require more context to sort out. Also
4133 * PL_utf8_foldclosures used below doesn't handle them, so have to
4134 * be handled outside this routine */
4135 use_chrtest_void = TRUE;
4137 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4138 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4140 /* Load the folds hash, if not already done */
4142 if (! PL_utf8_foldclosures) {
4143 _load_PL_utf8_foldclosures();
4146 /* The fold closures data structure is a hash with the keys
4147 * being the UTF-8 of every character that is folded to, like
4148 * 'k', and the values each an array of all code points that
4149 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
4150 * Multi-character folds are not included */
4151 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4156 /* Not found in the hash, therefore there are no folds
4157 * containing it, so there is only a single character that
4161 else { /* Does participate in folds */
4162 AV* list = (AV*) *listp;
4163 if (av_tindex_nomg(list) != 1) {
4165 /* If there aren't exactly two folds to this, it is
4166 * outside the scope of this function */
4167 use_chrtest_void = TRUE;
4169 else { /* There are two. Get them */
4170 SV** c_p = av_fetch(list, 0, FALSE);
4172 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4176 c_p = av_fetch(list, 1, FALSE);
4178 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4182 /* Folds that cross the 255/256 boundary are forbidden
4183 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4184 * one is ASCIII. Since the pattern character is above
4185 * 255, and its only other match is below 256, the only
4186 * legal match will be to itself. We have thrown away
4187 * the original, so have to compute which is the one
4189 if ((c1 < 256) != (c2 < 256)) {
4190 if ((OP(text_node) == EXACTFL
4191 && ! IN_UTF8_CTYPE_LOCALE)
4192 || ((OP(text_node) == EXACTFA
4193 || OP(text_node) == EXACTFA_NO_TRIE)
4194 && (isASCII(c1) || isASCII(c2))))
4207 else /* Here, c1 is <= 255 */
4209 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4210 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4211 && ((OP(text_node) != EXACTFA
4212 && OP(text_node) != EXACTFA_NO_TRIE)
4215 /* Here, there could be something above Latin1 in the target
4216 * which folds to this character in the pattern. All such
4217 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4218 * than two characters involved in their folds, so are outside
4219 * the scope of this function */
4220 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4221 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4224 use_chrtest_void = TRUE;
4227 else { /* Here nothing above Latin1 can fold to the pattern
4229 switch (OP(text_node)) {
4231 case EXACTFL: /* /l rules */
4232 c2 = PL_fold_locale[c1];
4235 case EXACTF: /* This node only generated for non-utf8
4237 assert(! is_utf8_pat);
4238 if (! utf8_target) { /* /d rules */
4243 /* /u rules for all these. This happens to work for
4244 * EXACTFA as nothing in Latin1 folds to ASCII */
4245 case EXACTFA_NO_TRIE: /* This node only generated for
4246 non-utf8 patterns */
4247 assert(! is_utf8_pat);
4252 c2 = PL_fold_latin1[c1];
4256 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4257 NOT_REACHED; /* NOTREACHED */
4263 /* Here have figured things out. Set up the returns */
4264 if (use_chrtest_void) {
4265 *c2p = *c1p = CHRTEST_VOID;
4267 else if (utf8_target) {
4268 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4269 uvchr_to_utf8(c1_utf8, c1);
4270 uvchr_to_utf8(c2_utf8, c2);
4273 /* Invariants are stored in both the utf8 and byte outputs; Use
4274 * negative numbers otherwise for the byte ones. Make sure that the
4275 * byte ones are the same iff the utf8 ones are the same */
4276 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4277 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4280 ? CHRTEST_NOT_A_CP_1
4281 : CHRTEST_NOT_A_CP_2;
4283 else if (c1 > 255) {
4284 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4289 *c1p = *c2p = c2; /* c2 is the only representable value */
4291 else { /* c1 is representable; see about c2 */
4293 *c2p = (c2 < 256) ? c2 : c1;
4300 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4302 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4303 * between the inputs. See http://www.unicode.org/reports/tr29/. */
4305 PERL_ARGS_ASSERT_ISGCB;
4307 switch (GCB_table[before][after]) {
4314 case GCB_RI_then_RI:
4317 U8 * temp_pos = (U8 *) curpos;
4319 /* Do not break within emoji flag sequences. That is, do not
4320 * break between regional indicator (RI) symbols if there is an
4321 * odd number of RI characters before the break point.
4322 * GB12 ^ (RI RI)* RI × RI
4323 * GB13 [^RI] (RI RI)* RI × RI */
4325 while (backup_one_GCB(strbeg,
4327 utf8_target) == GCB_Regional_Indicator)
4332 return RI_count % 2 != 1;
4335 case GCB_EX_then_EM:
4337 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
4339 U8 * temp_pos = (U8 *) curpos;
4343 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4345 while (prev == GCB_Extend);
4347 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4355 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4356 before, after, GCB_table[before][after]);
4363 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4367 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4369 if (*curpos < strbeg) {
4374 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4375 U8 * prev_prev_char_pos;
4377 if (! prev_char_pos) {
4381 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4382 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4383 *curpos = prev_char_pos;
4384 prev_char_pos = prev_prev_char_pos;
4387 *curpos = (U8 *) strbeg;
4392 if (*curpos - 2 < strbeg) {
4393 *curpos = (U8 *) strbeg;
4397 gcb = getGCB_VAL_CP(*(*curpos - 1));
4403 /* Combining marks attach to most classes that precede them, but this defines
4404 * the exceptions (from TR14) */
4405 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
4406 || prev == LB_Mandatory_Break \
4407 || prev == LB_Carriage_Return \
4408 || prev == LB_Line_Feed \
4409 || prev == LB_Next_Line \
4410 || prev == LB_Space \
4411 || prev == LB_ZWSpace))
4414 S_isLB(pTHX_ LB_enum before,
4416 const U8 * const strbeg,
4417 const U8 * const curpos,
4418 const U8 * const strend,
4419 const bool utf8_target)
4421 U8 * temp_pos = (U8 *) curpos;
4422 LB_enum prev = before;
4424 /* Is the boundary between 'before' and 'after' line-breakable?
4425 * Most of this is just a table lookup of a generated table from Unicode
4426 * rules. But some rules require context to decide, and so have to be
4427 * implemented in code */
4429 PERL_ARGS_ASSERT_ISLB;
4431 /* Rule numbers in the comments below are as of Unicode 9.0 */
4435 switch (LB_table[before][after]) {
4440 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4443 case LB_SP_foo + LB_BREAKABLE:
4444 case LB_SP_foo + LB_NOBREAK:
4445 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4447 /* When we have something following a SP, we have to look at the
4448 * context in order to know what to do.
4450 * SP SP should not reach here because LB7: Do not break before
4451 * spaces. (For two spaces in a row there is nothing that
4452 * overrides that) */
4453 assert(after != LB_Space);
4455 /* Here we have a space followed by a non-space. Mostly this is a
4456 * case of LB18: "Break after spaces". But there are complications
4457 * as the handling of spaces is somewhat tricky. They are in a
4458 * number of rules, which have to be applied in priority order, but
4459 * something earlier in the string can cause a rule to be skipped
4460 * and a lower priority rule invoked. A prime example is LB7 which
4461 * says don't break before a space. But rule LB8 (lower priority)
4462 * says that the first break opportunity after a ZW is after any
4463 * span of spaces immediately after it. If a ZW comes before a SP
4464 * in the input, rule LB8 applies, and not LB7. Other such rules
4465 * involve combining marks which are rules 9 and 10, but they may
4466 * override higher priority rules if they come earlier in the
4467 * string. Since we're doing random access into the middle of the
4468 * string, we have to look for rules that should get applied based
4469 * on both string position and priority. Combining marks do not
4470 * attach to either ZW nor SP, so we don't have to consider them
4473 * To check for LB8, we have to find the first non-space character
4474 * before this span of spaces */
4476 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4478 while (prev == LB_Space);
4480 /* LB8 Break before any character following a zero-width space,
4481 * even if one or more spaces intervene.
4483 * So if we have a ZW just before this span, and to get here this
4484 * is the final space in the span. */
4485 if (prev == LB_ZWSpace) {
4489 /* Here, not ZW SP+. There are several rules that have higher
4490 * priority than LB18 and can be resolved now, as they don't depend
4491 * on anything earlier in the string (except ZW, which we have
4492 * already handled). One of these rules is LB11 Do not break
4493 * before Word joiner, but we have specially encoded that in the
4494 * lookup table so it is caught by the single test below which
4495 * catches the other ones. */
4496 if (LB_table[LB_Space][after] - LB_SP_foo
4497 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4502 /* If we get here, we have to XXX consider combining marks. */
4503 if (prev == LB_Combining_Mark) {
4505 /* What happens with these depends on the character they
4508 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4510 while (prev == LB_Combining_Mark);
4512 /* Most times these attach to and inherit the characteristics
4513 * of that character, but not always, and when not, they are to
4514 * be treated as AL by rule LB10. */
4515 if (! LB_CM_ATTACHES_TO(prev)) {
4516 prev = LB_Alphabetic;
4520 /* Here, we have the character preceding the span of spaces all set
4521 * up. We follow LB18: "Break after spaces" unless the table shows
4522 * that is overriden */
4523 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4527 /* We don't know how to treat the CM except by looking at the first
4528 * non-CM character preceding it. ZWJ is treated as CM */
4530 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4532 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4534 /* Here, 'prev' is that first earlier non-CM character. If the CM
4535 * attatches to it, then it inherits the behavior of 'prev'. If it
4536 * doesn't attach, it is to be treated as an AL */
4537 if (! LB_CM_ATTACHES_TO(prev)) {
4538 prev = LB_Alphabetic;
4543 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4544 case LB_HY_or_BA_then_foo + LB_NOBREAK:
4546 /* LB21a Don't break after Hebrew + Hyphen.
4547 * HL (HY | BA) × */
4549 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4550 == LB_Hebrew_Letter)
4555 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4557 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4558 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4560 /* LB25a (PR | PO) × ( OP | HY )? NU */
4561 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4565 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4568 case LB_SY_or_IS_then_various + LB_BREAKABLE:
4569 case LB_SY_or_IS_then_various + LB_NOBREAK:
4571 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4573 LB_enum temp = prev;
4575 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4577 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4578 if (temp == LB_Numeric) {
4582 return LB_table[prev][after] - LB_SY_or_IS_then_various
4586 case LB_various_then_PO_or_PR + LB_BREAKABLE:
4587 case LB_various_then_PO_or_PR + LB_NOBREAK:
4589 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4591 LB_enum temp = prev;
4592 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4594 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4596 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4597 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4599 if (temp == LB_Numeric) {
4602 return LB_various_then_PO_or_PR;
4605 case LB_RI_then_RI + LB_NOBREAK:
4606 case LB_RI_then_RI + LB_BREAKABLE:
4610 /* LB30a Break between two regional indicator symbols if and
4611 * only if there are an even number of regional indicators
4612 * preceding the position of the break.
4614 * sot (RI RI)* RI × RI
4615 * [^RI] (RI RI)* RI × RI */
4617 while (backup_one_LB(strbeg,
4619 utf8_target) == LB_Regional_Indicator)
4624 return RI_count % 2 == 0;
4632 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4633 before, after, LB_table[before][after]);
4640 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4644 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4646 if (*curpos >= strend) {
4651 *curpos += UTF8SKIP(*curpos);
4652 if (*curpos >= strend) {
4655 lb = getLB_VAL_UTF8(*curpos, strend);
4659 if (*curpos >= strend) {
4662 lb = getLB_VAL_CP(**curpos);
4669 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4673 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4675 if (*curpos < strbeg) {
4680 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4681 U8 * prev_prev_char_pos;
4683 if (! prev_char_pos) {
4687 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4688 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4689 *curpos = prev_char_pos;
4690 prev_char_pos = prev_prev_char_pos;
4693 *curpos = (U8 *) strbeg;
4698 if (*curpos - 2 < strbeg) {
4699 *curpos = (U8 *) strbeg;
4703 lb = getLB_VAL_CP(*(*curpos - 1));
4710 S_isSB(pTHX_ SB_enum before,
4712 const U8 * const strbeg,
4713 const U8 * const curpos,
4714 const U8 * const strend,
4715 const bool utf8_target)
4717 /* returns a boolean indicating if there is a Sentence Boundary Break
4718 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4720 U8 * lpos = (U8 *) curpos;
4721 bool has_para_sep = FALSE;
4722 bool has_sp = FALSE;
4724 PERL_ARGS_ASSERT_ISSB;
4726 /* Break at the start and end of text.
4729 But unstated in Unicode is don't break if the text is empty */
4730 if (before == SB_EDGE || after == SB_EDGE) {
4731 return before != after;
4734 /* SB 3: Do not break within CRLF. */
4735 if (before == SB_CR && after == SB_LF) {
4739 /* Break after paragraph separators. CR and LF are considered
4740 * so because Unicode views text as like word processing text where there
4741 * are no newlines except between paragraphs, and the word processor takes
4742 * care of wrapping without there being hard line-breaks in the text *./
4743 SB4. Sep | CR | LF ÷ */
4744 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4748 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4749 * (See Section 6.2, Replacing Ignore Rules.)
4750 SB5. X (Extend | Format)* → X */
4751 if (after == SB_Extend || after == SB_Format) {
4753 /* Implied is that the these characters attach to everything
4754 * immediately prior to them except for those separator-type
4755 * characters. And the rules earlier have already handled the case
4756 * when one of those immediately precedes the extend char */
4760 if (before == SB_Extend || before == SB_Format) {
4761 U8 * temp_pos = lpos;
4762 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4763 if ( backup != SB_EDGE
4772 /* Here, both 'before' and 'backup' are these types; implied is that we
4773 * don't break between them */
4774 if (backup == SB_Extend || backup == SB_Format) {
4779 /* Do not break after ambiguous terminators like period, if they are
4780 * immediately followed by a number or lowercase letter, if they are
4781 * between uppercase letters, if the first following letter (optionally
4782 * after certain punctuation) is lowercase, or if they are followed by
4783 * "continuation" punctuation such as comma, colon, or semicolon. For
4784 * example, a period may be an abbreviation or numeric period, and thus may
4785 * not mark the end of a sentence.
4787 * SB6. ATerm × Numeric */
4788 if (before == SB_ATerm && after == SB_Numeric) {
4792 /* SB7. (Upper | Lower) ATerm × Upper */
4793 if (before == SB_ATerm && after == SB_Upper) {
4794 U8 * temp_pos = lpos;
4795 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4796 if (backup == SB_Upper || backup == SB_Lower) {
4801 /* The remaining rules that aren't the final one, all require an STerm or
4802 * an ATerm after having backed up over some Close* Sp*, and in one case an
4803 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4804 * So do that backup now, setting flags if either Sp or a paragraph
4805 * separator are found */
4807 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4808 has_para_sep = TRUE;
4809 before = backup_one_SB(strbeg, &lpos, utf8_target);
4812 if (before == SB_Sp) {
4815 before = backup_one_SB(strbeg, &lpos, utf8_target);
4817 while (before == SB_Sp);
4820 while (before == SB_Close) {
4821 before = backup_one_SB(strbeg, &lpos, utf8_target);
4824 /* The next few rules apply only when the backed-up-to is an ATerm, and in
4825 * most cases an STerm */
4826 if (before == SB_STerm || before == SB_ATerm) {
4828 /* So, here the lhs matches
4829 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4830 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4831 * The rules that apply here are:
4833 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
4834 | LF | STerm | ATerm) )* Lower
4835 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
4836 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
4837 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
4838 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
4841 /* And all but SB11 forbid having seen a paragraph separator */
4842 if (! has_para_sep) {
4843 if (before == SB_ATerm) { /* SB8 */
4844 U8 * rpos = (U8 *) curpos;
4845 SB_enum later = after;
4847 while ( later != SB_OLetter
4848 && later != SB_Upper
4849 && later != SB_Lower
4853 && later != SB_STerm
4854 && later != SB_ATerm
4855 && later != SB_EDGE)
4857 later = advance_one_SB(&rpos, strend, utf8_target);
4859 if (later == SB_Lower) {
4864 if ( after == SB_SContinue /* SB8a */
4865 || after == SB_STerm
4866 || after == SB_ATerm)
4871 if (! has_sp) { /* SB9 applies only if there was no Sp* */
4872 if ( after == SB_Close
4882 /* SB10. This and SB9 could probably be combined some way, but khw
4883 * has decided to follow the Unicode rule book precisely for
4884 * simplified maintenance */
4898 /* Otherwise, do not break.
4905 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4909 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4911 if (*curpos >= strend) {
4917 *curpos += UTF8SKIP(*curpos);
4918 if (*curpos >= strend) {
4921 sb = getSB_VAL_UTF8(*curpos, strend);
4922 } while (sb == SB_Extend || sb == SB_Format);
4927 if (*curpos >= strend) {
4930 sb = getSB_VAL_CP(**curpos);
4931 } while (sb == SB_Extend || sb == SB_Format);
4938 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4942 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4944 if (*curpos < strbeg) {
4949 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4950 if (! prev_char_pos) {
4954 /* Back up over Extend and Format. curpos is always just to the right
4955 * of the characater whose value we are getting */
4957 U8 * prev_prev_char_pos;
4958 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4961 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4962 *curpos = prev_char_pos;
4963 prev_char_pos = prev_prev_char_pos;
4966 *curpos = (U8 *) strbeg;
4969 } while (sb == SB_Extend || sb == SB_Format);
4973 if (*curpos - 2 < strbeg) {
4974 *curpos = (U8 *) strbeg;
4978 sb = getSB_VAL_CP(*(*curpos - 1));
4979 } while (sb == SB_Extend || sb == SB_Format);
4986 S_isWB(pTHX_ WB_enum previous,
4989 const U8 * const strbeg,
4990 const U8 * const curpos,
4991 const U8 * const strend,
4992 const bool utf8_target)
4994 /* Return a boolean as to if the boundary between 'before' and 'after' is
4995 * a Unicode word break, using their published algorithm, but tailored for
4996 * Perl by treating spans of white space as one unit. Context may be
4997 * needed to make this determination. If the value for the character
4998 * before 'before' is known, it is passed as 'previous'; otherwise that
4999 * should be set to WB_UNKNOWN. The other input parameters give the
5000 * boundaries and current position in the matching of the string. That
5001 * is, 'curpos' marks the position where the character whose wb value is
5002 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5004 U8 * before_pos = (U8 *) curpos;
5005 U8 * after_pos = (U8 *) curpos;
5006 WB_enum prev = before;
5009 PERL_ARGS_ASSERT_ISWB;
5011 /* Rule numbers in the comments below are as of Unicode 9.0 */
5015 switch (WB_table[before][after]) {
5022 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5023 next = advance_one_WB(&after_pos, strend, utf8_target,
5024 FALSE /* Don't skip Extend nor Format */ );
5025 /* A space immediately preceeding an Extend or Format is attached
5026 * to by them, and hence gets separated from previous spaces.
5027 * Otherwise don't break between horizontal white space */
5028 return next == WB_Extend || next == WB_Format;
5030 /* WB4 Ignore Format and Extend characters, except when they appear at
5031 * the beginning of a region of text. This code currently isn't
5032 * general purpose, but it works as the rules are currently and likely
5033 * to be laid out. The reason it works is that when 'they appear at
5034 * the beginning of a region of text', the rule is to break before
5035 * them, just like any other character. Therefore, the default rule
5036 * applies and we don't have to look in more depth. Should this ever
5037 * change, we would have to have 2 'case' statements, like in the rules
5038 * below, and backup a single character (not spacing over the extend
5039 * ones) and then see if that is one of the region-end characters and
5041 case WB_Ex_or_FO_or_ZWJ_then_foo:
5042 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5045 case WB_DQ_then_HL + WB_BREAKABLE:
5046 case WB_DQ_then_HL + WB_NOBREAK:
5048 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5050 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5051 == WB_Hebrew_Letter)
5056 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5058 case WB_HL_then_DQ + WB_BREAKABLE:
5059 case WB_HL_then_DQ + WB_NOBREAK:
5061 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5063 if (advance_one_WB(&after_pos, strend, utf8_target,
5064 TRUE /* Do skip Extend and Format */ )
5065 == WB_Hebrew_Letter)
5070 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5072 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5073 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5075 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5076 * | Single_Quote) (ALetter | Hebrew_Letter) */
5078 next = advance_one_WB(&after_pos, strend, utf8_target,
5079 TRUE /* Do skip Extend and Format */ );
5081 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5086 return WB_table[before][after]
5087 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5089 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5090 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5092 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5093 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5095 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5096 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5101 return WB_table[before][after]
5102 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5104 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5105 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5107 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5110 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5116 return WB_table[before][after]
5117 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5119 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5120 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5122 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5124 if (advance_one_WB(&after_pos, strend, utf8_target,
5125 TRUE /* Do skip Extend and Format */ )
5131 return WB_table[before][after]
5132 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5134 case WB_RI_then_RI + WB_NOBREAK:
5135 case WB_RI_then_RI + WB_BREAKABLE:
5139 /* Do not break within emoji flag sequences. That is, do not
5140 * break between regional indicator (RI) symbols if there is an
5141 * odd number of RI characters before the potential break
5144 * WB15 ^ (RI RI)* RI × RI
5145 * WB16 [^RI] (RI RI)* RI × RI */
5147 while (backup_one_WB(&previous,
5150 utf8_target) == WB_Regional_Indicator)
5155 return RI_count % 2 != 1;
5163 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5164 before, after, WB_table[before][after]);
5171 S_advance_one_WB(pTHX_ U8 ** curpos,
5172 const U8 * const strend,
5173 const bool utf8_target,
5174 const bool skip_Extend_Format)
5178 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5180 if (*curpos >= strend) {
5186 /* Advance over Extend and Format */
5188 *curpos += UTF8SKIP(*curpos);
5189 if (*curpos >= strend) {
5192 wb = getWB_VAL_UTF8(*curpos, strend);
5193 } while ( skip_Extend_Format
5194 && (wb == WB_Extend || wb == WB_Format));
5199 if (*curpos >= strend) {
5202 wb = getWB_VAL_CP(**curpos);
5203 } while ( skip_Extend_Format
5204 && (wb == WB_Extend || wb == WB_Format));
5211 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5215 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5217 /* If we know what the previous character's break value is, don't have
5219 if (*previous != WB_UNKNOWN) {
5222 /* But we need to move backwards by one */
5224 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5226 *previous = WB_EDGE;
5227 *curpos = (U8 *) strbeg;
5230 *previous = WB_UNKNOWN;
5235 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5238 /* And we always back up over these three types */
5239 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5244 if (*curpos < strbeg) {
5249 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5250 if (! prev_char_pos) {
5254 /* Back up over Extend and Format. curpos is always just to the right
5255 * of the characater whose value we are getting */
5257 U8 * prev_prev_char_pos;
5258 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5262 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5263 *curpos = prev_char_pos;
5264 prev_char_pos = prev_prev_char_pos;
5267 *curpos = (U8 *) strbeg;
5270 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5274 if (*curpos - 2 < strbeg) {
5275 *curpos = (U8 *) strbeg;
5279 wb = getWB_VAL_CP(*(*curpos - 1));
5280 } while (wb == WB_Extend || wb == WB_Format);
5286 #define EVAL_CLOSE_PAREN_IS(st,expr) \
5289 ( ( st )->u.eval.close_paren ) && \
5290 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5293 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
5296 ( ( st )->u.eval.close_paren ) && \
5298 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5302 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5303 (st)->u.eval.close_paren = ( (expr) + 1 )
5305 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5306 (st)->u.eval.close_paren = 0
5308 /* returns -1 on failure, $+[0] on success */
5310 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5313 #if PERL_VERSION < 9 && !defined(PERL_CORE)
5317 const bool utf8_target = reginfo->is_utf8_target;
5318 const U32 uniflags = UTF8_ALLOW_DEFAULT;
5319 REGEXP *rex_sv = reginfo->prog;
5320 regexp *rex = ReANY(rex_sv);
5321 RXi_GET_DECL(rex,rexi);
5322 /* the current state. This is a cached copy of PL_regmatch_state */
5324 /* cache heavy used fields of st in registers */
5327 U32 n = 0; /* general value; init to avoid compiler warning */
5328 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
5329 char *locinput = startpos;
5330 char *pushinput; /* where to continue after a PUSH */
5331 I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
5333 bool result = 0; /* return value of S_regmatch */
5334 int depth = 0; /* depth of backtrack stack */
5335 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5336 const U32 max_nochange_depth =
5337 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5338 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5339 regmatch_state *yes_state = NULL; /* state to pop to on success of
5341 /* mark_state piggy backs on the yes_state logic so that when we unwind
5342 the stack on success we can update the mark_state as we go */
5343 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5344 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5345 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
5347 bool no_final = 0; /* prevent failure from backtracking? */
5348 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
5349 char *startpoint = locinput;
5350 SV *popmark = NULL; /* are we looking for a mark? */
5351 SV *sv_commit = NULL; /* last mark name seen in failure */
5352 SV *sv_yes_mark = NULL; /* last mark name we have seen
5353 during a successful match */
5354 U32 lastopen = 0; /* last open we saw */
5355 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
5356 SV* const oreplsv = GvSVn(PL_replgv);
5357 /* these three flags are set by various ops to signal information to
5358 * the very next op. They have a useful lifetime of exactly one loop
5359 * iteration, and are not preserved or restored by state pushes/pops
5361 bool sw = 0; /* the condition value in (?(cond)a|b) */
5362 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
5363 int logical = 0; /* the following EVAL is:
5367 or the following IFMATCH/UNLESSM is:
5368 false: plain (?=foo)
5369 true: used as a condition: (?(?=foo))
5371 PAD* last_pad = NULL;
5373 U8 gimme = G_SCALAR;
5374 CV *caller_cv = NULL; /* who called us */
5375 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
5376 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
5377 U32 maxopenparen = 0; /* max '(' index seen so far */
5378 int to_complement; /* Invert the result? */
5379 _char_class_number classnum;
5380 bool is_utf8_pat = reginfo->is_utf8_pat;
5383 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5384 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5385 # define SOLARIS_BAD_OPTIMIZER
5386 const U32 *pl_charclass_dup = PL_charclass;
5387 # define PL_charclass pl_charclass_dup
5391 GET_RE_DEBUG_FLAGS_DECL;
5394 /* protect against undef(*^R) */
5395 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5397 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5398 multicall_oldcatch = 0;
5399 PERL_UNUSED_VAR(multicall_cop);
5401 PERL_ARGS_ASSERT_REGMATCH;
5403 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5404 Perl_re_printf( aTHX_ "regmatch start\n");
5407 st = PL_regmatch_state;
5409 /* Note that nextchr is a byte even in UTF */
5412 while (scan != NULL) {
5415 next = scan + NEXT_OFF(scan);
5418 state_num = OP(scan);
5422 if (state_num <= REGNODE_MAX) {
5423 SV * const prop = sv_newmortal();
5424 regnode *rnext = regnext(scan);
5426 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5427 regprop(rex, prop, scan, reginfo, NULL);
5428 Perl_re_printf( aTHX_
5429 "%*s%"IVdf":%s(%"IVdf")\n",
5430 INDENT_CHARS(depth), "",
5431 (IV)(scan - rexi->program),
5433 (PL_regkind[OP(scan)] == END || !rnext) ?
5434 0 : (IV)(rnext - rexi->program));
5441 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5443 switch (state_num) {
5444 case SBOL: /* /^../ and /\A../ */
5445 if (locinput == reginfo->strbeg)
5449 case MBOL: /* /^../m */
5450 if (locinput == reginfo->strbeg ||
5451 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5458 if (locinput == reginfo->ganch)
5462 case KEEPS: /* \K */
5463 /* update the startpoint */
5464 st->u.keeper.val = rex->offs[0].start;
5465 rex->offs[0].start = locinput - reginfo->strbeg;
5466 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5467 NOT_REACHED; /* NOTREACHED */
5469 case KEEPS_next_fail:
5470 /* rollback the start point change */
5471 rex->offs[0].start = st->u.keeper.val;
5473 NOT_REACHED; /* NOTREACHED */
5475 case MEOL: /* /..$/m */
5476 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5480 case SEOL: /* /..$/ */
5481 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5483 if (reginfo->strend - locinput > 1)
5488 if (!NEXTCHR_IS_EOS)
5492 case SANY: /* /./s */
5495 goto increment_locinput;
5497 case REG_ANY: /* /./ */
5498 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5500 goto increment_locinput;
5504 #define ST st->u.trie
5505 case TRIEC: /* (ab|cd) with known charclass */
5506 /* In this case the charclass data is available inline so
5507 we can fail fast without a lot of extra overhead.
5509 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5511 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
5512 depth, PL_colors[4], PL_colors[5])
5515 NOT_REACHED; /* NOTREACHED */
5518 case TRIE: /* (ab|cd) */
5519 /* the basic plan of execution of the trie is:
5520 * At the beginning, run though all the states, and
5521 * find the longest-matching word. Also remember the position
5522 * of the shortest matching word. For example, this pattern:
5525 * when matched against the string "abcde", will generate
5526 * accept states for all words except 3, with the longest
5527 * matching word being 4, and the shortest being 2 (with
5528 * the position being after char 1 of the string).
5530 * Then for each matching word, in word order (i.e. 1,2,4,5),
5531 * we run the remainder of the pattern; on each try setting
5532 * the current position to the character following the word,
5533 * returning to try the next word on failure.
5535 * We avoid having to build a list of words at runtime by
5536 * using a compile-time structure, wordinfo[].prev, which
5537 * gives, for each word, the previous accepting word (if any).
5538 * In the case above it would contain the mappings 1->2, 2->0,
5539 * 3->0, 4->5, 5->1. We can use this table to generate, from
5540 * the longest word (4 above), a list of all words, by
5541 * following the list of prev pointers; this gives us the
5542 * unordered list 4,5,1,2. Then given the current word we have
5543 * just tried, we can go through the list and find the
5544 * next-biggest word to try (so if we just failed on word 2,
5545 * the next in the list is 4).
5547 * Since at runtime we don't record the matching position in
5548 * the string for each word, we have to work that out for
5549 * each word we're about to process. The wordinfo table holds
5550 * the character length of each word; given that we recorded
5551 * at the start: the position of the shortest word and its
5552 * length in chars, we just need to move the pointer the
5553 * difference between the two char lengths. Depending on
5554 * Unicode status and folding, that's cheap or expensive.
5556 * This algorithm is optimised for the case where are only a
5557 * small number of accept states, i.e. 0,1, or maybe 2.
5558 * With lots of accepts states, and having to try all of them,
5559 * it becomes quadratic on number of accept states to find all
5564 /* what type of TRIE am I? (utf8 makes this contextual) */
5565 DECL_TRIE_TYPE(scan);
5567 /* what trie are we using right now */
5568 reg_trie_data * const trie
5569 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5570 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5571 U32 state = trie->startstate;
5573 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5574 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5576 && UTF8_IS_ABOVE_LATIN1(nextchr)
5577 && scan->flags == EXACTL)
5579 /* We only output for EXACTL, as we let the folder
5580 * output this message for EXACTFLU8 to avoid
5582 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5587 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5589 if (trie->states[ state ].wordnum) {
5591 Perl_re_exec_indentf( aTHX_ "%smatched empty string...%s\n",
5592 depth, PL_colors[4], PL_colors[5])
5598 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
5599 depth, PL_colors[4], PL_colors[5])
5606 U8 *uc = ( U8* )locinput;
5610 U8 *uscan = (U8*)NULL;
5611 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5612 U32 charcount = 0; /* how many input chars we have matched */
5613 U32 accepted = 0; /* have we seen any accepting states? */
5615 ST.jump = trie->jump;
5618 ST.longfold = FALSE; /* char longer if folded => it's harder */
5621 /* fully traverse the TRIE; note the position of the
5622 shortest accept state and the wordnum of the longest
5625 while ( state && uc <= (U8*)(reginfo->strend) ) {
5626 U32 base = trie->states[ state ].trans.base;
5630 wordnum = trie->states[ state ].wordnum;
5632 if (wordnum) { /* it's an accept state */
5635 /* record first match position */
5637 ST.firstpos = (U8*)locinput;
5642 ST.firstchars = charcount;
5645 if (!ST.nextword || wordnum < ST.nextword)
5646 ST.nextword = wordnum;
5647 ST.topword = wordnum;
5650 DEBUG_TRIE_EXECUTE_r({
5651 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
5652 Perl_re_exec_indentf( aTHX_
5653 "%sState: %4"UVxf" Accepted: %c ",
5654 depth, PL_colors[4],
5655 (UV)state, (accepted ? 'Y' : 'N'));
5658 /* read a char and goto next state */
5659 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5661 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5662 uscan, len, uvc, charid, foldlen,
5669 base + charid - 1 - trie->uniquecharcount)) >= 0)
5671 && ((U32)offset < trie->lasttrans)
5672 && trie->trans[offset].check == state)
5674 state = trie->trans[offset].next;
5685 DEBUG_TRIE_EXECUTE_r(
5686 Perl_re_printf( aTHX_
5687 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5688 charid, uvc, (UV)state, PL_colors[5] );
5694 /* calculate total number of accept states */
5699 w = trie->wordinfo[w].prev;
5702 ST.accepted = accepted;
5706 Perl_re_exec_indentf( aTHX_ "%sgot %"IVdf" possible matches%s\n",
5708 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5710 goto trie_first_try; /* jump into the fail handler */
5712 NOT_REACHED; /* NOTREACHED */
5714 case TRIE_next_fail: /* we failed - try next alternative */
5718 REGCP_UNWIND(ST.cp);
5719 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5721 if (!--ST.accepted) {
5723 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
5731 /* Find next-highest word to process. Note that this code
5732 * is O(N^2) per trie run (O(N) per branch), so keep tight */
5735 U16 const nextword = ST.nextword;
5736 reg_trie_wordinfo * const wordinfo
5737 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5738 for (word=ST.topword; word; word=wordinfo[word].prev) {
5739 if (word > nextword && (!min || word < min))
5752 ST.lastparen = rex->lastparen;
5753 ST.lastcloseparen = rex->lastcloseparen;
5757 /* find start char of end of current word */
5759 U32 chars; /* how many chars to skip */
5760 reg_trie_data * const trie
5761 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5763 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5765 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5770 /* the hard option - fold each char in turn and find
5771 * its folded length (which may be different */
5772 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5780 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5788 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5793 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5809 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5810 ? ST.jump[ST.nextword]
5814 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
5822 if (ST.accepted > 1 || has_cutgroup) {
5823 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5824 NOT_REACHED; /* NOTREACHED */
5826 /* only one choice left - just continue */
5828 AV *const trie_words
5829 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5830 SV ** const tmp = trie_words
5831 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5832 SV *sv= tmp ? sv_newmortal() : NULL;
5834 Perl_re_exec_indentf( aTHX_ "%sonly one match left, short-circuiting: #%d <%s>%s\n",
5835 depth, PL_colors[4],
5837 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5838 PL_colors[0], PL_colors[1],
5839 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5841 : "not compiled under -Dr",
5845 locinput = (char*)uc;
5846 continue; /* execute rest of RE */
5851 case EXACTL: /* /abc/l */
5852 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5854 /* Complete checking would involve going through every character
5855 * matched by the string to see if any is above latin1. But the
5856 * comparision otherwise might very well be a fast assembly
5857 * language routine, and I (khw) don't think slowing things down
5858 * just to check for this warning is worth it. So this just checks
5859 * the first character */
5860 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5861 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5864 case EXACT: { /* /abc/ */
5865 char *s = STRING(scan);
5867 if (utf8_target != is_utf8_pat) {
5868 /* The target and the pattern have differing utf8ness. */
5870 const char * const e = s + ln;
5873 /* The target is utf8, the pattern is not utf8.
5874 * Above-Latin1 code points can't match the pattern;
5875 * invariants match exactly, and the other Latin1 ones need
5876 * to be downgraded to a single byte in order to do the
5877 * comparison. (If we could be confident that the target
5878 * is not malformed, this could be refactored to have fewer
5879 * tests by just assuming that if the first bytes match, it
5880 * is an invariant, but there are tests in the test suite
5881 * dealing with (??{...}) which violate this) */
5883 if (l >= reginfo->strend
5884 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5888 if (UTF8_IS_INVARIANT(*(U8*)l)) {
5895 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5905 /* The target is not utf8, the pattern is utf8. */
5907 if (l >= reginfo->strend
5908 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5912 if (UTF8_IS_INVARIANT(*(U8*)s)) {
5919 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5931 /* The target and the pattern have the same utf8ness. */
5932 /* Inline the first character, for speed. */
5933 if (reginfo->strend - locinput < ln
5934 || UCHARAT(s) != nextchr
5935 || (ln > 1 && memNE(s, locinput, ln)))
5944 case EXACTFL: { /* /abc/il */
5946 const U8 * fold_array;
5948 U32 fold_utf8_flags;
5950 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5951 folder = foldEQ_locale;
5952 fold_array = PL_fold_locale;
5953 fold_utf8_flags = FOLDEQ_LOCALE;
5956 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
5957 is effectively /u; hence to match, target
5959 if (! utf8_target) {
5962 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5963 | FOLDEQ_S1_FOLDS_SANE;
5964 folder = foldEQ_latin1;
5965 fold_array = PL_fold_latin1;
5968 case EXACTFU_SS: /* /\x{df}/iu */
5969 case EXACTFU: /* /abc/iu */
5970 folder = foldEQ_latin1;
5971 fold_array = PL_fold_latin1;
5972 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5975 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
5977 assert(! is_utf8_pat);
5979 case EXACTFA: /* /abc/iaa */
5980 folder = foldEQ_latin1;
5981 fold_array = PL_fold_latin1;
5982 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5985 case EXACTF: /* /abc/i This node only generated for
5986 non-utf8 patterns */
5987 assert(! is_utf8_pat);
5989 fold_array = PL_fold;
5990 fold_utf8_flags = 0;
5998 || state_num == EXACTFU_SS
5999 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6001 /* Either target or the pattern are utf8, or has the issue where
6002 * the fold lengths may differ. */
6003 const char * const l = locinput;
6004 char *e = reginfo->strend;
6006 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
6007 l, &e, 0, utf8_target, fold_utf8_flags))
6015 /* Neither the target nor the pattern are utf8 */
6016 if (UCHARAT(s) != nextchr
6018 && UCHARAT(s) != fold_array[nextchr])
6022 if (reginfo->strend - locinput < ln)
6024 if (ln > 1 && ! folder(s, locinput, ln))
6030 case NBOUNDL: /* /\B/l */
6034 case BOUNDL: /* /\b/l */
6037 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6039 if (FLAGS(scan) != TRADITIONAL_BOUND) {
6040 if (! IN_UTF8_CTYPE_LOCALE) {
6041 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6042 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6048 if (locinput == reginfo->strbeg)
6049 b1 = isWORDCHAR_LC('\n');
6051 b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
6052 (U8*)(reginfo->strbeg)));
6054 b2 = (NEXTCHR_IS_EOS)
6055 ? isWORDCHAR_LC('\n')
6056 : isWORDCHAR_LC_utf8((U8*)locinput);
6058 else { /* Here the string isn't utf8 */
6059 b1 = (locinput == reginfo->strbeg)
6060 ? isWORDCHAR_LC('\n')
6061 : isWORDCHAR_LC(UCHARAT(locinput - 1));
6062 b2 = (NEXTCHR_IS_EOS)
6063 ? isWORDCHAR_LC('\n')
6064 : isWORDCHAR_LC(nextchr);
6066 if (to_complement ^ (b1 == b2)) {
6072 case NBOUND: /* /\B/ */
6076 case BOUND: /* /\b/ */
6080 goto bound_ascii_match_only;
6082 case NBOUNDA: /* /\B/a */
6086 case BOUNDA: /* /\b/a */
6090 bound_ascii_match_only:
6091 /* Here the string isn't utf8, or is utf8 and only ascii characters
6092 * are to match \w. In the latter case looking at the byte just
6093 * prior to the current one may be just the final byte of a
6094 * multi-byte character. This is ok. There are two cases:
6095 * 1) it is a single byte character, and then the test is doing
6096 * just what it's supposed to.
6097 * 2) it is a multi-byte character, in which case the final byte is
6098 * never mistakable for ASCII, and so the test will say it is
6099 * not a word character, which is the correct answer. */
6100 b1 = (locinput == reginfo->strbeg)
6101 ? isWORDCHAR_A('\n')
6102 : isWORDCHAR_A(UCHARAT(locinput - 1));
6103 b2 = (NEXTCHR_IS_EOS)
6104 ? isWORDCHAR_A('\n')
6105 : isWORDCHAR_A(nextchr);
6106 if (to_complement ^ (b1 == b2)) {
6112 case NBOUNDU: /* /\B/u */
6116 case BOUNDU: /* /\b/u */
6119 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6122 else if (utf8_target) {
6124 switch((bound_type) FLAGS(scan)) {
6125 case TRADITIONAL_BOUND:
6128 b1 = (locinput == reginfo->strbeg)
6129 ? 0 /* isWORDCHAR_L1('\n') */
6130 : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
6131 (U8*)(reginfo->strbeg)));
6132 b2 = (NEXTCHR_IS_EOS)
6133 ? 0 /* isWORDCHAR_L1('\n') */
6134 : isWORDCHAR_utf8((U8*)locinput);
6135 match = cBOOL(b1 != b2);
6139 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6140 match = TRUE; /* GCB always matches at begin and
6144 /* Find the gcb values of previous and current
6145 * chars, then see if is a break point */
6146 match = isGCB(getGCB_VAL_UTF8(
6147 reghop3((U8*)locinput,
6149 (U8*)(reginfo->strbeg)),
6150 (U8*) reginfo->strend),
6151 getGCB_VAL_UTF8((U8*) locinput,
6152 (U8*) reginfo->strend),
6153 (U8*) reginfo->strbeg,
6160 if (locinput == reginfo->strbeg) {
6163 else if (NEXTCHR_IS_EOS) {
6167 match = isLB(getLB_VAL_UTF8(
6168 reghop3((U8*)locinput,
6170 (U8*)(reginfo->strbeg)),
6171 (U8*) reginfo->strend),
6172 getLB_VAL_UTF8((U8*) locinput,
6173 (U8*) reginfo->strend),
6174 (U8*) reginfo->strbeg,
6176 (U8*) reginfo->strend,
6181 case SB_BOUND: /* Always matches at begin and end */
6182 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6186 match = isSB(getSB_VAL_UTF8(
6187 reghop3((U8*)locinput,
6189 (U8*)(reginfo->strbeg)),
6190 (U8*) reginfo->strend),
6191 getSB_VAL_UTF8((U8*) locinput,
6192 (U8*) reginfo->strend),
6193 (U8*) reginfo->strbeg,
6195 (U8*) reginfo->strend,
6201 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6205 match = isWB(WB_UNKNOWN,
6207 reghop3((U8*)locinput,
6209 (U8*)(reginfo->strbeg)),
6210 (U8*) reginfo->strend),
6211 getWB_VAL_UTF8((U8*) locinput,
6212 (U8*) reginfo->strend),
6213 (U8*) reginfo->strbeg,
6215 (U8*) reginfo->strend,
6221 else { /* Not utf8 target */
6222 switch((bound_type) FLAGS(scan)) {
6223 case TRADITIONAL_BOUND:
6226 b1 = (locinput == reginfo->strbeg)
6227 ? 0 /* isWORDCHAR_L1('\n') */
6228 : isWORDCHAR_L1(UCHARAT(locinput - 1));
6229 b2 = (NEXTCHR_IS_EOS)
6230 ? 0 /* isWORDCHAR_L1('\n') */
6231 : isWORDCHAR_L1(nextchr);
6232 match = cBOOL(b1 != b2);
6237 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6238 match = TRUE; /* GCB always matches at begin and
6241 else { /* Only CR-LF combo isn't a GCB in 0-255
6243 match = UCHARAT(locinput - 1) != '\r'
6244 || UCHARAT(locinput) != '\n';
6249 if (locinput == reginfo->strbeg) {
6252 else if (NEXTCHR_IS_EOS) {
6256 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6257 getLB_VAL_CP(UCHARAT(locinput)),
6258 (U8*) reginfo->strbeg,
6260 (U8*) reginfo->strend,
6265 case SB_BOUND: /* Always matches at begin and end */
6266 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6270 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6271 getSB_VAL_CP(UCHARAT(locinput)),
6272 (U8*) reginfo->strbeg,
6274 (U8*) reginfo->strend,
6280 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6284 match = isWB(WB_UNKNOWN,
6285 getWB_VAL_CP(UCHARAT(locinput -1)),
6286 getWB_VAL_CP(UCHARAT(locinput)),
6287 (U8*) reginfo->strbeg,
6289 (U8*) reginfo->strend,
6296 if (to_complement ^ ! match) {
6301 case ANYOFL: /* /[abc]/l */
6302 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6304 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6306 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6309 case ANYOFD: /* /[abc]/d */
6310 case ANYOF: /* /[abc]/ */
6313 if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6314 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6317 locinput += UTF8SKIP(locinput);
6320 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6326 /* The argument (FLAGS) to all the POSIX node types is the class number
6329 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
6333 case POSIXL: /* \w or [:punct:] etc. under /l */
6334 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6338 /* Use isFOO_lc() for characters within Latin1. (Note that
6339 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6340 * wouldn't be invariant) */
6341 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6342 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6350 if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
6351 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6352 goto utf8_posix_above_latin1;
6355 /* Here is a UTF-8 variant code point below 256 and the target is
6357 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6358 EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6359 *(locinput + 1))))))
6364 goto increment_locinput;
6366 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
6370 case POSIXD: /* \w or [:punct:] etc. under /d */
6376 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
6378 if (NEXTCHR_IS_EOS) {
6382 /* All UTF-8 variants match */
6383 if (! UTF8_IS_INVARIANT(nextchr)) {
6384 goto increment_locinput;
6390 case POSIXA: /* \w or [:punct:] etc. under /a */
6393 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6394 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6395 * character is a single byte */
6397 if (NEXTCHR_IS_EOS) {
6403 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6409 /* Here we are either not in utf8, or we matched a utf8-invariant,
6410 * so the next char is the next byte */
6414 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
6418 case POSIXU: /* \w or [:punct:] etc. under /u */
6420 if (NEXTCHR_IS_EOS) {
6424 /* Use _generic_isCC() for characters within Latin1. (Note that
6425 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6426 * wouldn't be invariant) */
6427 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6428 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6435 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
6436 if (! (to_complement
6437 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6445 else { /* Handle above Latin-1 code points */
6446 utf8_posix_above_latin1:
6447 classnum = (_char_class_number) FLAGS(scan);
6448 if (classnum < _FIRST_NON_SWASH_CC) {
6450 /* Here, uses a swash to find such code points. Load if if
6451 * not done already */
6452 if (! PL_utf8_swash_ptrs[classnum]) {
6453 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6454 PL_utf8_swash_ptrs[classnum]
6455 = _core_swash_init("utf8",
6458 PL_XPosix_ptrs[classnum], &flags);
6460 if (! (to_complement
6461 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6462 (U8 *) locinput, TRUE))))
6467 else { /* Here, uses macros to find above Latin-1 code points */
6469 case _CC_ENUM_SPACE:
6470 if (! (to_complement
6471 ^ cBOOL(is_XPERLSPACE_high(locinput))))
6476 case _CC_ENUM_BLANK:
6477 if (! (to_complement
6478 ^ cBOOL(is_HORIZWS_high(locinput))))
6483 case _CC_ENUM_XDIGIT:
6484 if (! (to_complement
6485 ^ cBOOL(is_XDIGIT_high(locinput))))
6490 case _CC_ENUM_VERTSPACE:
6491 if (! (to_complement
6492 ^ cBOOL(is_VERTWS_high(locinput))))
6497 default: /* The rest, e.g. [:cntrl:], can't match
6499 if (! to_complement) {
6505 locinput += UTF8SKIP(locinput);
6509 case CLUMP: /* Match \X: logical Unicode character. This is defined as
6510 a Unicode extended Grapheme Cluster */
6513 if (! utf8_target) {
6515 /* Match either CR LF or '.', as all the other possibilities
6517 locinput++; /* Match the . or CR */
6518 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6520 && locinput < reginfo->strend
6521 && UCHARAT(locinput) == '\n')
6528 /* Get the gcb type for the current character */
6529 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6530 (U8*) reginfo->strend);
6532 /* Then scan through the input until we get to the first
6533 * character whose type is supposed to be a gcb with the
6534 * current character. (There is always a break at the
6536 locinput += UTF8SKIP(locinput);
6537 while (locinput < reginfo->strend) {
6538 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6539 (U8*) reginfo->strend);
6540 if (isGCB(prev_gcb, cur_gcb,
6541 (U8*) reginfo->strbeg, (U8*) locinput,
6548 locinput += UTF8SKIP(locinput);
6555 case NREFFL: /* /\g{name}/il */
6556 { /* The capture buffer cases. The ones beginning with N for the
6557 named buffers just convert to the equivalent numbered and
6558 pretend they were called as the corresponding numbered buffer
6560 /* don't initialize these in the declaration, it makes C++
6565 const U8 *fold_array;
6568 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6569 folder = foldEQ_locale;
6570 fold_array = PL_fold_locale;
6572 utf8_fold_flags = FOLDEQ_LOCALE;
6575 case NREFFA: /* /\g{name}/iaa */
6576 folder = foldEQ_latin1;
6577 fold_array = PL_fold_latin1;
6579 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6582 case NREFFU: /* /\g{name}/iu */
6583 folder = foldEQ_latin1;
6584 fold_array = PL_fold_latin1;
6586 utf8_fold_flags = 0;
6589 case NREFF: /* /\g{name}/i */
6591 fold_array = PL_fold;
6593 utf8_fold_flags = 0;
6596 case NREF: /* /\g{name}/ */
6600 utf8_fold_flags = 0;
6603 /* For the named back references, find the corresponding buffer
6605 n = reg_check_named_buff_matched(rex,scan);
6610 goto do_nref_ref_common;
6612 case REFFL: /* /\1/il */
6613 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6614 folder = foldEQ_locale;
6615 fold_array = PL_fold_locale;
6616 utf8_fold_flags = FOLDEQ_LOCALE;
6619 case REFFA: /* /\1/iaa */
6620 folder = foldEQ_latin1;
6621 fold_array = PL_fold_latin1;
6622 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6625 case REFFU: /* /\1/iu */
6626 folder = foldEQ_latin1;
6627 fold_array = PL_fold_latin1;
6628 utf8_fold_flags = 0;
6631 case REFF: /* /\1/i */
6633 fold_array = PL_fold;
6634 utf8_fold_flags = 0;
6637 case REF: /* /\1/ */
6640 utf8_fold_flags = 0;
6644 n = ARG(scan); /* which paren pair */
6647 ln = rex->offs[n].start;
6648 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6649 if (rex->lastparen < n || ln == -1)
6650 sayNO; /* Do not match unless seen CLOSEn. */
6651 if (ln == rex->offs[n].end)
6654 s = reginfo->strbeg + ln;
6655 if (type != REF /* REF can do byte comparison */
6656 && (utf8_target || type == REFFU || type == REFFL))
6658 char * limit = reginfo->strend;
6660 /* This call case insensitively compares the entire buffer
6661 * at s, with the current input starting at locinput, but
6662 * not going off the end given by reginfo->strend, and
6663 * returns in <limit> upon success, how much of the
6664 * current input was matched */
6665 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6666 locinput, &limit, 0, utf8_target, utf8_fold_flags))
6674 /* Not utf8: Inline the first character, for speed. */
6675 if (!NEXTCHR_IS_EOS &&
6676 UCHARAT(s) != nextchr &&
6678 UCHARAT(s) != fold_array[nextchr]))
6680 ln = rex->offs[n].end - ln;
6681 if (locinput + ln > reginfo->strend)
6683 if (ln > 1 && (type == REF
6684 ? memNE(s, locinput, ln)
6685 : ! folder(s, locinput, ln)))
6691 case NOTHING: /* null op; e.g. the 'nothing' following
6692 * the '*' in m{(a+|b)*}' */
6694 case TAIL: /* placeholder while compiling (A|B|C) */
6698 #define ST st->u.eval
6699 #define CUR_EVAL cur_eval->u.eval
6705 regexp_internal *rei;
6706 regnode *startpoint;
6709 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
6710 arg= (U32)ARG(scan);
6711 if (cur_eval && cur_eval->locinput == locinput) {
6712 if ( ++nochange_depth > max_nochange_depth )
6714 "Pattern subroutine nesting without pos change"
6715 " exceeded limit in regex");
6722 startpoint = scan + ARG2L(scan);
6723 EVAL_CLOSE_PAREN_SET( st, arg );
6724 /* Detect infinite recursion
6726 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6727 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6728 * So we track the position in the string we are at each time
6729 * we recurse and if we try to enter the same routine twice from
6730 * the same position we throw an error.
6732 if ( rex->recurse_locinput[arg] == locinput ) {
6733 /* FIXME: we should show the regop that is failing as part
6734 * of the error message. */
6735 Perl_croak(aTHX_ "Infinite recursion in regex");
6737 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6738 rex->recurse_locinput[arg]= locinput;
6741 GET_RE_DEBUG_FLAGS_DECL;
6743 Perl_re_exec_indentf( aTHX_
6744 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6745 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6751 /* Save all the positions seen so far. */
6752 ST.cp = regcppush(rex, 0, maxopenparen);
6753 REGCP_SET(ST.lastcp);
6755 /* and then jump to the code we share with EVAL */
6756 goto eval_recurse_doit;
6759 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
6760 if (cur_eval && cur_eval->locinput==locinput) {
6761 if ( ++nochange_depth > max_nochange_depth )
6762 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6767 /* execute the code in the {...} */
6771 OP * const oop = PL_op;
6772 COP * const ocurcop = PL_curcop;
6776 /* save *all* paren positions */
6777 regcppush(rex, 0, maxopenparen);
6778 REGCP_SET(runops_cp);
6781 caller_cv = find_runcv(NULL);
6785 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6787 (REGEXP*)(rexi->data->data[n])
6789 nop = (OP*)rexi->data->data[n+1];
6791 else if (rexi->data->what[n] == 'l') { /* literal code */
6793 nop = (OP*)rexi->data->data[n];
6794 assert(CvDEPTH(newcv));
6797 /* literal with own CV */
6798 assert(rexi->data->what[n] == 'L');
6799 newcv = rex->qr_anoncv;
6800 nop = (OP*)rexi->data->data[n];
6803 /* normally if we're about to execute code from the same
6804 * CV that we used previously, we just use the existing
6805 * CX stack entry. However, its possible that in the
6806 * meantime we may have backtracked, popped from the save
6807 * stack, and undone the SAVECOMPPAD(s) associated with
6808 * PUSH_MULTICALL; in which case PL_comppad no longer
6809 * points to newcv's pad. */
6810 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6812 U8 flags = (CXp_SUB_RE |
6813 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6814 if (last_pushed_cv) {
6815 /* PUSH/POP_MULTICALL save and restore the
6816 * caller's PL_comppad; if we call multiple subs
6817 * using the same CX block, we have to save and
6818 * unwind the varying PL_comppad's ourselves,
6819 * especially restoring the right PL_comppad on
6820 * backtrack - so save it on the save stack */
6822 CHANGE_MULTICALL_FLAGS(newcv, flags);
6825 PUSH_MULTICALL_FLAGS(newcv, flags);
6827 last_pushed_cv = newcv;
6830 /* these assignments are just to silence compiler
6832 multicall_cop = NULL;
6834 last_pad = PL_comppad;
6836 /* the initial nextstate you would normally execute
6837 * at the start of an eval (which would cause error
6838 * messages to come from the eval), may be optimised
6839 * away from the execution path in the regex code blocks;
6840 * so manually set PL_curcop to it initially */
6842 OP *o = cUNOPx(nop)->op_first;
6843 assert(o->op_type == OP_NULL);
6844 if (o->op_targ == OP_SCOPE) {
6845 o = cUNOPo->op_first;
6848 assert(o->op_targ == OP_LEAVE);
6849 o = cUNOPo->op_first;
6850 assert(o->op_type == OP_ENTER);
6854 if (o->op_type != OP_STUB) {
6855 assert( o->op_type == OP_NEXTSTATE
6856 || o->op_type == OP_DBSTATE
6857 || (o->op_type == OP_NULL
6858 && ( o->op_targ == OP_NEXTSTATE
6859 || o->op_targ == OP_DBSTATE
6863 PL_curcop = (COP*)o;
6868 DEBUG_STATE_r( Perl_re_printf( aTHX_
6869 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6871 rex->offs[0].end = locinput - reginfo->strbeg;
6872 if (reginfo->info_aux_eval->pos_magic)
6873 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6874 reginfo->sv, reginfo->strbeg,
6875 locinput - reginfo->strbeg);
6878 SV *sv_mrk = get_sv("REGMARK", 1);
6879 sv_setsv(sv_mrk, sv_yes_mark);
6882 /* we don't use MULTICALL here as we want to call the
6883 * first op of the block of interest, rather than the
6884 * first op of the sub. Also, we don't want to free
6885 * the savestack frame */
6886 before = (IV)(SP-PL_stack_base);
6888 CALLRUNOPS(aTHX); /* Scalar context. */
6890 if ((IV)(SP-PL_stack_base) == before)
6891 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
6897 /* before restoring everything, evaluate the returned
6898 * value, so that 'uninit' warnings don't use the wrong
6899 * PL_op or pad. Also need to process any magic vars
6900 * (e.g. $1) *before* parentheses are restored */
6905 if (logical == 0) /* (?{})/ */
6906 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6907 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
6908 sw = cBOOL(SvTRUE(ret));
6911 else { /* /(??{}) */
6912 /* if its overloaded, let the regex compiler handle
6913 * it; otherwise extract regex, or stringify */
6914 if (SvGMAGICAL(ret))
6915 ret = sv_mortalcopy(ret);
6916 if (!SvAMAGIC(ret)) {
6920 if (SvTYPE(sv) == SVt_REGEXP)
6921 re_sv = (REGEXP*) sv;
6922 else if (SvSMAGICAL(ret)) {
6923 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6925 re_sv = (REGEXP *) mg->mg_obj;
6928 /* force any undef warnings here */
6929 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6930 ret = sv_mortalcopy(ret);
6931 (void) SvPV_force_nolen(ret);
6937 /* *** Note that at this point we don't restore
6938 * PL_comppad, (or pop the CxSUB) on the assumption it may
6939 * be used again soon. This is safe as long as nothing
6940 * in the regexp code uses the pad ! */
6942 PL_curcop = ocurcop;
6943 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6944 PL_curpm = PL_reg_curpm;
6950 /* only /(??{})/ from now on */
6953 /* extract RE object from returned value; compiling if
6957 re_sv = reg_temp_copy(NULL, re_sv);
6962 if (SvUTF8(ret) && IN_BYTES) {
6963 /* In use 'bytes': make a copy of the octet
6964 * sequence, but without the flag on */
6966 const char *const p = SvPV(ret, len);
6967 ret = newSVpvn_flags(p, len, SVs_TEMP);
6969 if (rex->intflags & PREGf_USE_RE_EVAL)
6970 pm_flags |= PMf_USE_RE_EVAL;
6972 /* if we got here, it should be an engine which
6973 * supports compiling code blocks and stuff */
6974 assert(rex->engine && rex->engine->op_comp);
6975 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6976 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6977 rex->engine, NULL, NULL,
6978 /* copy /msixn etc to inner pattern */
6983 & (SVs_TEMP | SVs_GMG | SVf_ROK))
6984 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6985 /* This isn't a first class regexp. Instead, it's
6986 caching a regexp onto an existing, Perl visible
6988 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6994 RXp_MATCH_COPIED_off(re);
6995 re->subbeg = rex->subbeg;
6996 re->sublen = rex->sublen;
6997 re->suboffset = rex->suboffset;
6998 re->subcoffset = rex->subcoffset;
7000 re->lastcloseparen = 0;
7003 debug_start_match(re_sv, utf8_target, locinput,
7004 reginfo->strend, "Matching embedded");
7006 startpoint = rei->program + 1;
7007 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7008 * close_paren only for GOSUB */
7009 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7010 /* Save all the seen positions so far. */
7011 ST.cp = regcppush(rex, 0, maxopenparen);
7012 REGCP_SET(ST.lastcp);
7013 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7015 /* run the pattern returned from (??{...}) */
7017 eval_recurse_doit: /* Share code with GOSUB below this line
7018 * At this point we expect the stack context to be
7019 * set up correctly */
7021 /* invalidate the S-L poscache. We're now executing a
7022 * different set of WHILEM ops (and their associated
7023 * indexes) against the same string, so the bits in the
7024 * cache are meaningless. Setting maxiter to zero forces
7025 * the cache to be invalidated and zeroed before reuse.
7026 * XXX This is too dramatic a measure. Ideally we should
7027 * save the old cache and restore when running the outer
7029 reginfo->poscache_maxiter = 0;
7031 /* the new regexp might have a different is_utf8_pat than we do */
7032 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7034 ST.prev_rex = rex_sv;
7035 ST.prev_curlyx = cur_curlyx;
7037 SET_reg_curpm(rex_sv);
7042 ST.prev_eval = cur_eval;
7044 /* now continue from first node in postoned RE */
7045 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
7046 NOT_REACHED; /* NOTREACHED */
7049 case EVAL_AB: /* cleanup after a successful (??{A})B */
7050 /* note: this is called twice; first after popping B, then A */
7052 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
7053 depth, cur_eval, ST.prev_eval);
7056 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7057 if ( cur_eval && CUR_EVAL.close_paren ) {\
7059 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7061 CUR_EVAL.close_paren - 1,\
7065 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7068 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7070 rex_sv = ST.prev_rex;
7071 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7072 SET_reg_curpm(rex_sv);
7073 rex = ReANY(rex_sv);
7074 rexi = RXi_GET(rex);
7076 /* preserve $^R across LEAVE's. See Bug 121070. */
7077 SV *save_sv= GvSV(PL_replgv);
7078 SvREFCNT_inc(save_sv);
7079 regcpblow(ST.cp); /* LEAVE in disguise */
7080 sv_setsv(GvSV(PL_replgv), save_sv);
7081 SvREFCNT_dec(save_sv);
7083 cur_eval = ST.prev_eval;
7084 cur_curlyx = ST.prev_curlyx;
7086 /* Invalidate cache. See "invalidate" comment above. */
7087 reginfo->poscache_maxiter = 0;
7088 if ( nochange_depth )
7091 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7095 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7096 /* note: this is called twice; first after popping B, then A */
7098 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7099 depth, cur_eval, ST.prev_eval);
7102 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7104 rex_sv = ST.prev_rex;
7105 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7106 SET_reg_curpm(rex_sv);
7107 rex = ReANY(rex_sv);
7108 rexi = RXi_GET(rex);
7110 REGCP_UNWIND(ST.lastcp);
7111 regcppop(rex, &maxopenparen);
7112 cur_eval = ST.prev_eval;
7113 cur_curlyx = ST.prev_curlyx;
7115 /* Invalidate cache. See "invalidate" comment above. */
7116 reginfo->poscache_maxiter = 0;
7117 if ( nochange_depth )
7120 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7125 n = ARG(scan); /* which paren pair */
7126 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7127 if (n > maxopenparen)
7129 DEBUG_BUFFERS_r(Perl_re_printf( aTHX_
7130 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
7134 (IV)rex->offs[n].start_tmp,
7140 /* XXX really need to log other places start/end are set too */
7141 #define CLOSE_CAPTURE \
7142 rex->offs[n].start = rex->offs[n].start_tmp; \
7143 rex->offs[n].end = locinput - reginfo->strbeg; \
7144 DEBUG_BUFFERS_r(Perl_re_printf( aTHX_ \
7145 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
7147 PTR2UV(rex->offs), \
7149 (IV)rex->offs[n].start, \
7150 (IV)rex->offs[n].end \
7154 n = ARG(scan); /* which paren pair */
7156 if (n > rex->lastparen)
7158 rex->lastcloseparen = n;
7159 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7164 case ACCEPT: /* (*ACCEPT) */
7166 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7170 cursor && OP(cursor)!=END;
7171 cursor=regnext(cursor))
7173 if ( OP(cursor)==CLOSE ){
7175 if ( n <= lastopen ) {
7177 if (n > rex->lastparen)
7179 rex->lastcloseparen = n;
7180 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7189 case GROUPP: /* (?(1)) */
7190 n = ARG(scan); /* which paren pair */
7191 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7194 case NGROUPP: /* (?(<name>)) */
7195 /* reg_check_named_buff_matched returns 0 for no match */
7196 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7199 case INSUBP: /* (?(R)) */
7201 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7202 * of SCAN is already set up as matches a eval.close_paren */
7203 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7206 case DEFINEP: /* (?(DEFINE)) */
7210 case IFTHEN: /* (?(cond)A|B) */
7211 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7213 next = NEXTOPER(NEXTOPER(scan));
7215 next = scan + ARG(scan);
7216 if (OP(next) == IFTHEN) /* Fake one. */
7217 next = NEXTOPER(NEXTOPER(next));
7221 case LOGICAL: /* modifier for EVAL and IFMATCH */
7222 logical = scan->flags;
7225 /*******************************************************************
7227 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7228 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7229 STAR/PLUS/CURLY/CURLYN are used instead.)
7231 A*B is compiled as <CURLYX><A><WHILEM><B>
7233 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7234 state, which contains the current count, initialised to -1. It also sets
7235 cur_curlyx to point to this state, with any previous value saved in the
7238 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7239 since the pattern may possibly match zero times (i.e. it's a while {} loop
7240 rather than a do {} while loop).
7242 Each entry to WHILEM represents a successful match of A. The count in the
7243 CURLYX block is incremented, another WHILEM state is pushed, and execution
7244 passes to A or B depending on greediness and the current count.
7246 For example, if matching against the string a1a2a3b (where the aN are
7247 substrings that match /A/), then the match progresses as follows: (the
7248 pushed states are interspersed with the bits of strings matched so far):
7251 <CURLYX cnt=0><WHILEM>
7252 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7253 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7254 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7255 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7257 (Contrast this with something like CURLYM, which maintains only a single
7261 a1 <CURLYM cnt=1> a2
7262 a1 a2 <CURLYM cnt=2> a3
7263 a1 a2 a3 <CURLYM cnt=3> b
7266 Each WHILEM state block marks a point to backtrack to upon partial failure
7267 of A or B, and also contains some minor state data related to that
7268 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
7269 overall state, such as the count, and pointers to the A and B ops.
7271 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7272 must always point to the *current* CURLYX block, the rules are:
7274 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7275 and set cur_curlyx to point the new block.
7277 When popping the CURLYX block after a successful or unsuccessful match,
7278 restore the previous cur_curlyx.
7280 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7281 to the outer one saved in the CURLYX block.
7283 When popping the WHILEM block after a successful or unsuccessful B match,
7284 restore the previous cur_curlyx.
7286 Here's an example for the pattern (AI* BI)*BO
7287 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7290 curlyx backtrack stack
7291 ------ ---------------
7293 CO <CO prev=NULL> <WO>
7294 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7295 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7296 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7298 At this point the pattern succeeds, and we work back down the stack to
7299 clean up, restoring as we go:
7301 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7302 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7303 CO <CO prev=NULL> <WO>
7306 *******************************************************************/
7308 #define ST st->u.curlyx
7310 case CURLYX: /* start of /A*B/ (for complex A) */
7312 /* No need to save/restore up to this paren */
7313 I32 parenfloor = scan->flags;
7315 assert(next); /* keep Coverity happy */
7316 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7319 /* XXXX Probably it is better to teach regpush to support
7320 parenfloor > maxopenparen ... */
7321 if (parenfloor > (I32)rex->lastparen)
7322 parenfloor = rex->lastparen; /* Pessimization... */
7324 ST.prev_curlyx= cur_curlyx;
7326 ST.cp = PL_savestack_ix;
7328 /* these fields contain the state of the current curly.
7329 * they are accessed by subsequent WHILEMs */
7330 ST.parenfloor = parenfloor;
7335 ST.count = -1; /* this will be updated by WHILEM */
7336 ST.lastloc = NULL; /* this will be updated by WHILEM */
7338 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7339 NOT_REACHED; /* NOTREACHED */
7342 case CURLYX_end: /* just finished matching all of A*B */
7343 cur_curlyx = ST.prev_curlyx;
7345 NOT_REACHED; /* NOTREACHED */
7347 case CURLYX_end_fail: /* just failed to match all of A*B */
7349 cur_curlyx = ST.prev_curlyx;
7351 NOT_REACHED; /* NOTREACHED */
7355 #define ST st->u.whilem
7357 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
7359 /* see the discussion above about CURLYX/WHILEM */
7364 assert(cur_curlyx); /* keep Coverity happy */
7366 min = ARG1(cur_curlyx->u.curlyx.me);
7367 max = ARG2(cur_curlyx->u.curlyx.me);
7368 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7369 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7370 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7371 ST.cache_offset = 0;
7375 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: matched %ld out of %d..%d\n",
7376 depth, (long)n, min, max)
7379 /* First just match a string of min A's. */
7382 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7384 cur_curlyx->u.curlyx.lastloc = locinput;
7385 REGCP_SET(ST.lastcp);
7387 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7388 NOT_REACHED; /* NOTREACHED */
7391 /* If degenerate A matches "", assume A done. */
7393 if (locinput == cur_curlyx->u.curlyx.lastloc) {
7394 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: empty match detected, trying continuation...\n",
7397 goto do_whilem_B_max;
7400 /* super-linear cache processing.
7402 * The idea here is that for certain types of CURLYX/WHILEM -
7403 * principally those whose upper bound is infinity (and
7404 * excluding regexes that have things like \1 and other very
7405 * non-regular expresssiony things), then if a pattern like
7406 * /....A*.../ fails and we backtrack to the WHILEM, then we
7407 * make a note that this particular WHILEM op was at string
7408 * position 47 (say) when the rest of pattern failed. Then, if
7409 * we ever find ourselves back at that WHILEM, and at string
7410 * position 47 again, we can just fail immediately rather than
7411 * running the rest of the pattern again.
7413 * This is very handy when patterns start to go
7414 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7415 * with a combinatorial explosion of backtracking.
7417 * The cache is implemented as a bit array, with one bit per
7418 * string byte position per WHILEM op (up to 16) - so its
7419 * between 0.25 and 2x the string size.
7421 * To avoid allocating a poscache buffer every time, we do an
7422 * initially countdown; only after we have executed a WHILEM
7423 * op (string-length x #WHILEMs) times do we allocate the
7426 * The top 4 bits of scan->flags byte say how many different
7427 * relevant CURLLYX/WHILEM op pairs there are, while the
7428 * bottom 4-bits is the identifying index number of this
7434 if (!reginfo->poscache_maxiter) {
7435 /* start the countdown: Postpone detection until we
7436 * know the match is not *that* much linear. */
7437 reginfo->poscache_maxiter
7438 = (reginfo->strend - reginfo->strbeg + 1)
7440 /* possible overflow for long strings and many CURLYX's */
7441 if (reginfo->poscache_maxiter < 0)
7442 reginfo->poscache_maxiter = I32_MAX;
7443 reginfo->poscache_iter = reginfo->poscache_maxiter;
7446 if (reginfo->poscache_iter-- == 0) {
7447 /* initialise cache */
7448 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7449 regmatch_info_aux *const aux = reginfo->info_aux;
7450 if (aux->poscache) {
7451 if ((SSize_t)reginfo->poscache_size < size) {
7452 Renew(aux->poscache, size, char);
7453 reginfo->poscache_size = size;
7455 Zero(aux->poscache, size, char);
7458 reginfo->poscache_size = size;
7459 Newxz(aux->poscache, size, char);
7461 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7462 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7463 PL_colors[4], PL_colors[5])
7467 if (reginfo->poscache_iter < 0) {
7468 /* have we already failed at this position? */
7469 SSize_t offset, mask;
7471 reginfo->poscache_iter = -1; /* stop eventual underflow */
7472 offset = (scan->flags & 0xf) - 1
7473 + (locinput - reginfo->strbeg)
7475 mask = 1 << (offset % 8);
7477 if (reginfo->info_aux->poscache[offset] & mask) {
7478 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: (cache) already tried at this position...\n",
7481 sayNO; /* cache records failure */
7483 ST.cache_offset = offset;
7484 ST.cache_mask = mask;
7488 /* Prefer B over A for minimal matching. */
7490 if (cur_curlyx->u.curlyx.minmod) {
7491 ST.save_curlyx = cur_curlyx;
7492 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7493 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
7495 REGCP_SET(ST.lastcp);
7496 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7498 NOT_REACHED; /* NOTREACHED */
7501 /* Prefer A over B for maximal matching. */
7503 if (n < max) { /* More greed allowed? */
7504 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7506 cur_curlyx->u.curlyx.lastloc = locinput;
7507 REGCP_SET(ST.lastcp);
7508 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7509 NOT_REACHED; /* NOTREACHED */
7511 goto do_whilem_B_max;
7513 NOT_REACHED; /* NOTREACHED */
7515 case WHILEM_B_min: /* just matched B in a minimal match */
7516 case WHILEM_B_max: /* just matched B in a maximal match */
7517 cur_curlyx = ST.save_curlyx;
7519 NOT_REACHED; /* NOTREACHED */
7521 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7522 cur_curlyx = ST.save_curlyx;
7523 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7524 cur_curlyx->u.curlyx.count--;
7526 NOT_REACHED; /* NOTREACHED */
7528 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7530 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7531 REGCP_UNWIND(ST.lastcp);
7532 regcppop(rex, &maxopenparen);
7533 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7534 cur_curlyx->u.curlyx.count--;
7536 NOT_REACHED; /* NOTREACHED */
7538 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7539 REGCP_UNWIND(ST.lastcp);
7540 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
7541 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "whilem: failed, trying continuation...\n",
7545 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7546 && ckWARN(WARN_REGEXP)
7547 && !reginfo->warned)
7549 reginfo->warned = TRUE;
7550 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7551 "Complex regular subexpression recursion limit (%d) "
7557 ST.save_curlyx = cur_curlyx;
7558 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7559 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7561 NOT_REACHED; /* NOTREACHED */
7563 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7564 cur_curlyx = ST.save_curlyx;
7565 REGCP_UNWIND(ST.lastcp);
7566 regcppop(rex, &maxopenparen);
7568 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
7569 /* Maximum greed exceeded */
7570 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7571 && ckWARN(WARN_REGEXP)
7572 && !reginfo->warned)
7574 reginfo->warned = TRUE;
7575 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7576 "Complex regular subexpression recursion "
7577 "limit (%d) exceeded",
7580 cur_curlyx->u.curlyx.count--;
7584 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "trying longer...\n", depth)
7586 /* Try grabbing another A and see if it helps. */
7587 cur_curlyx->u.curlyx.lastloc = locinput;
7588 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7590 REGCP_SET(ST.lastcp);
7591 PUSH_STATE_GOTO(WHILEM_A_min,
7592 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7594 NOT_REACHED; /* NOTREACHED */
7597 #define ST st->u.branch
7599 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
7600 next = scan + ARG(scan);
7603 scan = NEXTOPER(scan);
7606 case BRANCH: /* /(...|A|...)/ */
7607 scan = NEXTOPER(scan); /* scan now points to inner node */
7608 ST.lastparen = rex->lastparen;
7609 ST.lastcloseparen = rex->lastcloseparen;
7610 ST.next_branch = next;
7613 /* Now go into the branch */
7615 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7617 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7619 NOT_REACHED; /* NOTREACHED */
7621 case CUTGROUP: /* /(*THEN)/ */
7622 sv_yes_mark = st->u.mark.mark_name = scan->flags
7623 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7625 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7626 NOT_REACHED; /* NOTREACHED */
7628 case CUTGROUP_next_fail:
7631 if (st->u.mark.mark_name)
7632 sv_commit = st->u.mark.mark_name;
7634 NOT_REACHED; /* NOTREACHED */
7638 NOT_REACHED; /* NOTREACHED */
7640 case BRANCH_next_fail: /* that branch failed; try the next, if any */
7645 REGCP_UNWIND(ST.cp);
7646 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7647 scan = ST.next_branch;
7648 /* no more branches? */
7649 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7651 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
7658 continue; /* execute next BRANCH[J] op */
7661 case MINMOD: /* next op will be non-greedy, e.g. A*? */
7666 #define ST st->u.curlym
7668 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7670 /* This is an optimisation of CURLYX that enables us to push
7671 * only a single backtracking state, no matter how many matches
7672 * there are in {m,n}. It relies on the pattern being constant
7673 * length, with no parens to influence future backrefs
7677 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7679 ST.lastparen = rex->lastparen;
7680 ST.lastcloseparen = rex->lastcloseparen;
7682 /* if paren positive, emulate an OPEN/CLOSE around A */
7684 U32 paren = ST.me->flags;
7685 if (paren > maxopenparen)
7686 maxopenparen = paren;
7687 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7695 ST.c1 = CHRTEST_UNINIT;
7698 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7701 curlym_do_A: /* execute the A in /A{m,n}B/ */
7702 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7703 NOT_REACHED; /* NOTREACHED */
7705 case CURLYM_A: /* we've just matched an A */
7707 /* after first match, determine A's length: u.curlym.alen */
7708 if (ST.count == 1) {
7709 if (reginfo->is_utf8_target) {
7710 char *s = st->locinput;
7711 while (s < locinput) {
7717 ST.alen = locinput - st->locinput;
7720 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7723 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7724 depth, (IV) ST.count, (IV)ST.alen)
7727 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7731 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7732 if ( max == REG_INFTY || ST.count < max )
7733 goto curlym_do_A; /* try to match another A */
7735 goto curlym_do_B; /* try to match B */
7737 case CURLYM_A_fail: /* just failed to match an A */
7738 REGCP_UNWIND(ST.cp);
7741 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7742 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7745 curlym_do_B: /* execute the B in /A{m,n}B/ */
7746 if (ST.c1 == CHRTEST_UNINIT) {
7747 /* calculate c1 and c2 for possible match of 1st char
7748 * following curly */
7749 ST.c1 = ST.c2 = CHRTEST_VOID;
7751 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7752 regnode *text_node = ST.B;
7753 if (! HAS_TEXT(text_node))
7754 FIND_NEXT_IMPT(text_node);
7757 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7759 But the former is redundant in light of the latter.
7761 if this changes back then the macro for
7762 IS_TEXT and friends need to change.
7764 if (PL_regkind[OP(text_node)] == EXACT) {
7765 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7766 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7776 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%"IVdf"...\n",
7777 depth, (IV)ST.count)
7779 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7780 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7781 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7782 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7784 /* simulate B failing */
7786 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7788 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7789 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7790 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7792 state_num = CURLYM_B_fail;
7793 goto reenter_switch;
7796 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7797 /* simulate B failing */
7799 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7801 (int) nextchr, ST.c1, ST.c2)
7803 state_num = CURLYM_B_fail;
7804 goto reenter_switch;
7809 /* emulate CLOSE: mark current A as captured */
7810 I32 paren = ST.me->flags;
7812 rex->offs[paren].start
7813 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7814 rex->offs[paren].end = locinput - reginfo->strbeg;
7815 if ((U32)paren > rex->lastparen)
7816 rex->lastparen = paren;
7817 rex->lastcloseparen = paren;
7820 rex->offs[paren].end = -1;
7822 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
7831 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7832 NOT_REACHED; /* NOTREACHED */
7834 case CURLYM_B_fail: /* just failed to match a B */
7835 REGCP_UNWIND(ST.cp);
7836 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7838 I32 max = ARG2(ST.me);
7839 if (max != REG_INFTY && ST.count == max)
7841 goto curlym_do_A; /* try to match a further A */
7843 /* backtrack one A */
7844 if (ST.count == ARG1(ST.me) /* min */)
7847 SET_locinput(HOPc(locinput, -ST.alen));
7848 goto curlym_do_B; /* try to match B */
7851 #define ST st->u.curly
7853 #define CURLY_SETPAREN(paren, success) \
7856 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7857 rex->offs[paren].end = locinput - reginfo->strbeg; \
7858 if (paren > rex->lastparen) \
7859 rex->lastparen = paren; \
7860 rex->lastcloseparen = paren; \
7863 rex->offs[paren].end = -1; \
7864 rex->lastparen = ST.lastparen; \
7865 rex->lastcloseparen = ST.lastcloseparen; \
7869 case STAR: /* /A*B/ where A is width 1 char */
7873 scan = NEXTOPER(scan);
7876 case PLUS: /* /A+B/ where A is width 1 char */
7880 scan = NEXTOPER(scan);
7883 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
7884 ST.paren = scan->flags; /* Which paren to set */
7885 ST.lastparen = rex->lastparen;
7886 ST.lastcloseparen = rex->lastcloseparen;
7887 if (ST.paren > maxopenparen)
7888 maxopenparen = ST.paren;
7889 ST.min = ARG1(scan); /* min to match */
7890 ST.max = ARG2(scan); /* max to match */
7891 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
7896 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7899 case CURLY: /* /A{m,n}B/ where A is width 1 char */
7901 ST.min = ARG1(scan); /* min to match */
7902 ST.max = ARG2(scan); /* max to match */
7903 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7906 * Lookahead to avoid useless match attempts
7907 * when we know what character comes next.
7909 * Used to only do .*x and .*?x, but now it allows
7910 * for )'s, ('s and (?{ ... })'s to be in the way
7911 * of the quantifier and the EXACT-like node. -- japhy
7914 assert(ST.min <= ST.max);
7915 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7916 ST.c1 = ST.c2 = CHRTEST_VOID;
7919 regnode *text_node = next;
7921 if (! HAS_TEXT(text_node))
7922 FIND_NEXT_IMPT(text_node);
7924 if (! HAS_TEXT(text_node))
7925 ST.c1 = ST.c2 = CHRTEST_VOID;
7927 if ( PL_regkind[OP(text_node)] != EXACT ) {
7928 ST.c1 = ST.c2 = CHRTEST_VOID;
7932 /* Currently we only get here when
7934 PL_rekind[OP(text_node)] == EXACT
7936 if this changes back then the macro for IS_TEXT and
7937 friends need to change. */
7938 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7939 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7951 char *li = locinput;
7954 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7960 if (ST.c1 == CHRTEST_VOID)
7961 goto curly_try_B_min;
7963 ST.oldloc = locinput;
7965 /* set ST.maxpos to the furthest point along the
7966 * string that could possibly match */
7967 if (ST.max == REG_INFTY) {
7968 ST.maxpos = reginfo->strend - 1;
7970 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7973 else if (utf8_target) {
7974 int m = ST.max - ST.min;
7975 for (ST.maxpos = locinput;
7976 m >0 && ST.maxpos < reginfo->strend; m--)
7977 ST.maxpos += UTF8SKIP(ST.maxpos);
7980 ST.maxpos = locinput + ST.max - ST.min;
7981 if (ST.maxpos >= reginfo->strend)
7982 ST.maxpos = reginfo->strend - 1;
7984 goto curly_try_B_min_known;
7988 /* avoid taking address of locinput, so it can remain
7990 char *li = locinput;
7991 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7992 if (ST.count < ST.min)
7995 if ((ST.count > ST.min)
7996 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7998 /* A{m,n} must come at the end of the string, there's
7999 * no point in backing off ... */
8001 /* ...except that $ and \Z can match before *and* after
8002 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
8003 We may back off by one in this case. */
8004 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8008 goto curly_try_B_max;
8010 NOT_REACHED; /* NOTREACHED */
8012 case CURLY_B_min_known_fail:
8013 /* failed to find B in a non-greedy match where c1,c2 valid */
8015 REGCP_UNWIND(ST.cp);
8017 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8019 /* Couldn't or didn't -- move forward. */
8020 ST.oldloc = locinput;
8022 locinput += UTF8SKIP(locinput);
8026 curly_try_B_min_known:
8027 /* find the next place where 'B' could work, then call B */
8031 n = (ST.oldloc == locinput) ? 0 : 1;
8032 if (ST.c1 == ST.c2) {
8033 /* set n to utf8_distance(oldloc, locinput) */
8034 while (locinput <= ST.maxpos
8035 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8037 locinput += UTF8SKIP(locinput);
8042 /* set n to utf8_distance(oldloc, locinput) */
8043 while (locinput <= ST.maxpos
8044 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8045 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8047 locinput += UTF8SKIP(locinput);
8052 else { /* Not utf8_target */
8053 if (ST.c1 == ST.c2) {
8054 while (locinput <= ST.maxpos &&
8055 UCHARAT(locinput) != ST.c1)
8059 while (locinput <= ST.maxpos
8060 && UCHARAT(locinput) != ST.c1
8061 && UCHARAT(locinput) != ST.c2)
8064 n = locinput - ST.oldloc;
8066 if (locinput > ST.maxpos)
8069 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8070 * at b; check that everything between oldloc and
8071 * locinput matches */
8072 char *li = ST.oldloc;
8074 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
8076 assert(n == REG_INFTY || locinput == li);
8078 CURLY_SETPAREN(ST.paren, ST.count);
8079 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8081 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
8083 NOT_REACHED; /* NOTREACHED */
8085 case CURLY_B_min_fail:
8086 /* failed to find B in a non-greedy match where c1,c2 invalid */
8088 REGCP_UNWIND(ST.cp);
8090 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8092 /* failed -- move forward one */
8094 char *li = locinput;
8095 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
8102 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8103 ST.count > 0)) /* count overflow ? */
8106 CURLY_SETPAREN(ST.paren, ST.count);
8107 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8109 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8113 NOT_REACHED; /* NOTREACHED */
8116 /* a successful greedy match: now try to match B */
8117 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8120 bool could_match = locinput < reginfo->strend;
8122 /* If it could work, try it. */
8123 if (ST.c1 != CHRTEST_VOID && could_match) {
8124 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8126 could_match = memEQ(locinput,
8131 UTF8SKIP(locinput));
8134 could_match = UCHARAT(locinput) == ST.c1
8135 || UCHARAT(locinput) == ST.c2;
8138 if (ST.c1 == CHRTEST_VOID || could_match) {
8139 CURLY_SETPAREN(ST.paren, ST.count);
8140 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8141 NOT_REACHED; /* NOTREACHED */
8146 case CURLY_B_max_fail:
8147 /* failed to find B in a greedy match */
8149 REGCP_UNWIND(ST.cp);
8151 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8154 if (--ST.count < ST.min)
8156 locinput = HOPc(locinput, -1);
8157 goto curly_try_B_max;
8161 case END: /* last op of main pattern */
8164 /* we've just finished A in /(??{A})B/; now continue with B */
8165 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8166 st->u.eval.prev_rex = rex_sv; /* inner */
8168 /* Save *all* the positions. */
8169 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8170 rex_sv = CUR_EVAL.prev_rex;
8171 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8172 SET_reg_curpm(rex_sv);
8173 rex = ReANY(rex_sv);
8174 rexi = RXi_GET(rex);
8176 st->u.eval.prev_curlyx = cur_curlyx;
8177 cur_curlyx = CUR_EVAL.prev_curlyx;
8179 REGCP_SET(st->u.eval.lastcp);
8181 /* Restore parens of the outer rex without popping the
8183 S_regcp_restore(aTHX_ rex, CUR_EVAL.lastcp,
8186 st->u.eval.prev_eval = cur_eval;
8187 cur_eval = CUR_EVAL.prev_eval;
8189 Perl_re_exec_indentf( aTHX_ "EVAL trying tail ... (cur_eval=%p)\n",
8191 if ( nochange_depth )
8194 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8196 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
8197 locinput); /* match B */
8200 if (locinput < reginfo->till) {
8201 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8202 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8204 (long)(locinput - startpos),
8205 (long)(reginfo->till - startpos),
8208 sayNO_SILENT; /* Cannot match: too short. */
8210 sayYES; /* Success! */
8212 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8214 Perl_re_exec_indentf( aTHX_ "%ssubpattern success...%s\n",
8215 depth, PL_colors[4], PL_colors[5]));
8216 sayYES; /* Success! */
8219 #define ST st->u.ifmatch
8224 case SUSPEND: /* (?>A) */
8226 newstart = locinput;
8229 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
8231 goto ifmatch_trivial_fail_test;
8233 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
8235 ifmatch_trivial_fail_test:
8237 char * const s = HOPBACKc(locinput, scan->flags);
8242 sw = 1 - cBOOL(ST.wanted);
8246 next = scan + ARG(scan);
8254 newstart = locinput;
8258 ST.logical = logical;
8259 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8261 /* execute body of (?...A) */
8262 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8263 NOT_REACHED; /* NOTREACHED */
8266 case IFMATCH_A_fail: /* body of (?...A) failed */
8267 ST.wanted = !ST.wanted;
8270 case IFMATCH_A: /* body of (?...A) succeeded */
8272 sw = cBOOL(ST.wanted);
8274 else if (!ST.wanted)
8277 if (OP(ST.me) != SUSPEND) {
8278 /* restore old position except for (?>...) */
8279 locinput = st->locinput;
8281 scan = ST.me + ARG(ST.me);
8284 continue; /* execute B */
8288 case LONGJMP: /* alternative with many branches compiles to
8289 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8290 next = scan + ARG(scan);
8295 case COMMIT: /* (*COMMIT) */
8296 reginfo->cutpoint = reginfo->strend;
8299 case PRUNE: /* (*PRUNE) */
8301 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8302 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8303 NOT_REACHED; /* NOTREACHED */
8305 case COMMIT_next_fail:
8309 NOT_REACHED; /* NOTREACHED */
8311 case OPFAIL: /* (*FAIL) */
8313 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8315 /* deal with (?(?!)X|Y) properly,
8316 * make sure we trigger the no branch
8317 * of the trailing IFTHEN structure*/
8323 NOT_REACHED; /* NOTREACHED */
8325 #define ST st->u.mark
8326 case MARKPOINT: /* (*MARK:foo) */
8327 ST.prev_mark = mark_state;
8328 ST.mark_name = sv_commit = sv_yes_mark
8329 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8331 ST.mark_loc = locinput;
8332 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8333 NOT_REACHED; /* NOTREACHED */
8335 case MARKPOINT_next:
8336 mark_state = ST.prev_mark;
8338 NOT_REACHED; /* NOTREACHED */
8340 case MARKPOINT_next_fail:
8341 if (popmark && sv_eq(ST.mark_name,popmark))
8343 if (ST.mark_loc > startpoint)
8344 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8345 popmark = NULL; /* we found our mark */
8346 sv_commit = ST.mark_name;
8349 Perl_re_exec_indentf( aTHX_ "%ssetting cutpoint to mark:%"SVf"...%s\n",
8351 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8354 mark_state = ST.prev_mark;
8355 sv_yes_mark = mark_state ?
8356 mark_state->u.mark.mark_name : NULL;
8358 NOT_REACHED; /* NOTREACHED */
8360 case SKIP: /* (*SKIP) */
8362 /* (*SKIP) : if we fail we cut here*/
8363 ST.mark_name = NULL;
8364 ST.mark_loc = locinput;
8365 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8367 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8368 otherwise do nothing. Meaning we need to scan
8370 regmatch_state *cur = mark_state;
8371 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8374 if ( sv_eq( cur->u.mark.mark_name,
8377 ST.mark_name = find;
8378 PUSH_STATE_GOTO( SKIP_next, next, locinput);
8380 cur = cur->u.mark.prev_mark;
8383 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8386 case SKIP_next_fail:
8388 /* (*CUT:NAME) - Set up to search for the name as we
8389 collapse the stack*/
8390 popmark = ST.mark_name;
8392 /* (*CUT) - No name, we cut here.*/
8393 if (ST.mark_loc > startpoint)
8394 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8395 /* but we set sv_commit to latest mark_name if there
8396 is one so they can test to see how things lead to this
8399 sv_commit=mark_state->u.mark.mark_name;
8403 NOT_REACHED; /* NOTREACHED */
8406 case LNBREAK: /* \R */
8407 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8414 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
8415 PTR2UV(scan), OP(scan));
8416 Perl_croak(aTHX_ "regexp memory corruption");
8418 /* this is a point to jump to in order to increment
8419 * locinput by one character */
8421 assert(!NEXTCHR_IS_EOS);
8423 locinput += PL_utf8skip[nextchr];
8424 /* locinput is allowed to go 1 char off the end, but not 2+ */
8425 if (locinput > reginfo->strend)
8434 /* switch break jumps here */
8435 scan = next; /* prepare to execute the next op and ... */
8436 continue; /* ... jump back to the top, reusing st */
8440 /* push a state that backtracks on success */
8441 st->u.yes.prev_yes_state = yes_state;
8445 /* push a new regex state, then continue at scan */
8447 regmatch_state *newst;
8450 regmatch_state *cur = st;
8451 regmatch_state *curyes = yes_state;
8453 regmatch_slab *slab = PL_regmatch_slab;
8454 for (;curd > -1 && (depth-curd < 3);cur--,curd--) {
8455 if (cur < SLAB_FIRST(slab)) {
8457 cur = SLAB_LAST(slab);
8459 Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n",
8461 curd, PL_reg_name[cur->resume_state],
8462 (curyes == cur) ? "yes" : ""
8465 curyes = cur->u.yes.prev_yes_state;
8468 DEBUG_STATE_pp("push")
8471 st->locinput = locinput;
8473 if (newst > SLAB_LAST(PL_regmatch_slab))
8474 newst = S_push_slab(aTHX);
8475 PL_regmatch_state = newst;
8477 locinput = pushinput;
8483 #ifdef SOLARIS_BAD_OPTIMIZER
8484 # undef PL_charclass
8488 * We get here only if there's trouble -- normally "case END" is
8489 * the terminating point.
8491 Perl_croak(aTHX_ "corrupted regexp pointers");
8492 NOT_REACHED; /* NOTREACHED */
8496 /* we have successfully completed a subexpression, but we must now
8497 * pop to the state marked by yes_state and continue from there */
8498 assert(st != yes_state);
8500 while (st != yes_state) {
8502 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8503 PL_regmatch_slab = PL_regmatch_slab->prev;
8504 st = SLAB_LAST(PL_regmatch_slab);
8508 DEBUG_STATE_pp("pop (no final)");
8510 DEBUG_STATE_pp("pop (yes)");
8516 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8517 || yes_state > SLAB_LAST(PL_regmatch_slab))
8519 /* not in this slab, pop slab */
8520 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8521 PL_regmatch_slab = PL_regmatch_slab->prev;
8522 st = SLAB_LAST(PL_regmatch_slab);
8524 depth -= (st - yes_state);
8527 yes_state = st->u.yes.prev_yes_state;
8528 PL_regmatch_state = st;
8531 locinput= st->locinput;
8532 state_num = st->resume_state + no_final;
8533 goto reenter_switch;
8536 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
8537 PL_colors[4], PL_colors[5]));
8539 if (reginfo->info_aux_eval) {
8540 /* each successfully executed (?{...}) block does the equivalent of
8541 * local $^R = do {...}
8542 * When popping the save stack, all these locals would be undone;
8543 * bypass this by setting the outermost saved $^R to the latest
8545 /* I dont know if this is needed or works properly now.
8546 * see code related to PL_replgv elsewhere in this file.
8549 if (oreplsv != GvSV(PL_replgv))
8550 sv_setsv(oreplsv, GvSV(PL_replgv));
8557 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
8559 PL_colors[4], PL_colors[5])
8571 /* there's a previous state to backtrack to */
8573 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8574 PL_regmatch_slab = PL_regmatch_slab->prev;
8575 st = SLAB_LAST(PL_regmatch_slab);
8577 PL_regmatch_state = st;
8578 locinput= st->locinput;
8580 DEBUG_STATE_pp("pop");
8582 if (yes_state == st)
8583 yes_state = st->u.yes.prev_yes_state;
8585 state_num = st->resume_state + 1; /* failure = success + 1 */
8587 goto reenter_switch;
8592 if (rex->intflags & PREGf_VERBARG_SEEN) {
8593 SV *sv_err = get_sv("REGERROR", 1);
8594 SV *sv_mrk = get_sv("REGMARK", 1);
8596 sv_commit = &PL_sv_no;
8598 sv_yes_mark = &PL_sv_yes;
8601 sv_commit = &PL_sv_yes;
8602 sv_yes_mark = &PL_sv_no;
8606 sv_setsv(sv_err, sv_commit);
8607 sv_setsv(sv_mrk, sv_yes_mark);
8611 if (last_pushed_cv) {
8614 PERL_UNUSED_VAR(SP);
8617 assert(!result || locinput - reginfo->strbeg >= 0);
8618 return result ? locinput - reginfo->strbeg : -1;
8622 - regrepeat - repeatedly match something simple, report how many
8624 * What 'simple' means is a node which can be the operand of a quantifier like
8627 * startposp - pointer a pointer to the start position. This is updated
8628 * to point to the byte following the highest successful
8630 * p - the regnode to be repeatedly matched against.
8631 * reginfo - struct holding match state, such as strend
8632 * max - maximum number of things to match.
8633 * depth - (for debugging) backtracking depth.
8636 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8637 regmatch_info *const reginfo, I32 max, int depth)
8639 char *scan; /* Pointer to current position in target string */
8641 char *loceol = reginfo->strend; /* local version */
8642 I32 hardcount = 0; /* How many matches so far */
8643 bool utf8_target = reginfo->is_utf8_target;
8644 unsigned int to_complement = 0; /* Invert the result? */
8646 _char_class_number classnum;
8648 PERL_UNUSED_ARG(depth);
8651 PERL_ARGS_ASSERT_REGREPEAT;
8654 if (max == REG_INFTY)
8656 else if (! utf8_target && loceol - scan > max)
8657 loceol = scan + max;
8659 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8660 * to the maximum of how far we should go in it (leaving it set to the real
8661 * end, if the maximum permissible would take us beyond that). This allows
8662 * us to make the loop exit condition that we haven't gone past <loceol> to
8663 * also mean that we haven't exceeded the max permissible count, saving a
8664 * test each time through the loop. But it assumes that the OP matches a
8665 * single byte, which is true for most of the OPs below when applied to a
8666 * non-UTF-8 target. Those relatively few OPs that don't have this
8667 * characteristic will have to compensate.
8669 * There is no adjustment for UTF-8 targets, as the number of bytes per
8670 * character varies. OPs will have to test both that the count is less
8671 * than the max permissible (using <hardcount> to keep track), and that we
8672 * are still within the bounds of the string (using <loceol>. A few OPs
8673 * match a single byte no matter what the encoding. They can omit the max
8674 * test if, for the UTF-8 case, they do the adjustment that was skipped
8677 * Thus, the code above sets things up for the common case; and exceptional
8678 * cases need extra work; the common case is to make sure <scan> doesn't
8679 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8680 * count doesn't exceed the maximum permissible */
8685 while (scan < loceol && hardcount < max && *scan != '\n') {
8686 scan += UTF8SKIP(scan);
8690 while (scan < loceol && *scan != '\n')
8696 while (scan < loceol && hardcount < max) {
8697 scan += UTF8SKIP(scan);
8705 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8706 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8707 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8711 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8715 /* Can use a simple loop if the pattern char to match on is invariant
8716 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
8717 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8718 * true iff it doesn't matter if the argument is in UTF-8 or not */
8719 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8720 if (utf8_target && loceol - scan > max) {
8721 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8722 * since here, to match at all, 1 char == 1 byte */
8723 loceol = scan + max;
8725 while (scan < loceol && UCHARAT(scan) == c) {
8729 else if (reginfo->is_utf8_pat) {
8731 STRLEN scan_char_len;
8733 /* When both target and pattern are UTF-8, we have to do
8735 while (hardcount < max
8737 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8738 && memEQ(scan, STRING(p), scan_char_len))
8740 scan += scan_char_len;
8744 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8746 /* Target isn't utf8; convert the character in the UTF-8
8747 * pattern to non-UTF8, and do a simple loop */
8748 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8749 while (scan < loceol && UCHARAT(scan) == c) {
8752 } /* else pattern char is above Latin1, can't possibly match the
8757 /* Here, the string must be utf8; pattern isn't, and <c> is
8758 * different in utf8 than not, so can't compare them directly.
8759 * Outside the loop, find the two utf8 bytes that represent c, and
8760 * then look for those in sequence in the utf8 string */
8761 U8 high = UTF8_TWO_BYTE_HI(c);
8762 U8 low = UTF8_TWO_BYTE_LO(c);
8764 while (hardcount < max
8765 && scan + 1 < loceol
8766 && UCHARAT(scan) == high
8767 && UCHARAT(scan + 1) == low)
8775 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
8776 assert(! reginfo->is_utf8_pat);
8779 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8783 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8784 utf8_flags = FOLDEQ_LOCALE;
8787 case EXACTF: /* This node only generated for non-utf8 patterns */
8788 assert(! reginfo->is_utf8_pat);
8793 if (! utf8_target) {
8796 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8797 | FOLDEQ_S2_FOLDS_SANE;
8802 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8806 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8808 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8810 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8813 if (c1 == CHRTEST_VOID) {
8814 /* Use full Unicode fold matching */
8815 char *tmpeol = reginfo->strend;
8816 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8817 while (hardcount < max
8818 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8819 STRING(p), NULL, pat_len,
8820 reginfo->is_utf8_pat, utf8_flags))
8823 tmpeol = reginfo->strend;
8827 else if (utf8_target) {
8829 while (scan < loceol
8831 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8833 scan += UTF8SKIP(scan);
8838 while (scan < loceol
8840 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8841 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8843 scan += UTF8SKIP(scan);
8848 else if (c1 == c2) {
8849 while (scan < loceol && UCHARAT(scan) == c1) {
8854 while (scan < loceol &&
8855 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8864 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8866 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
8867 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8873 while (hardcount < max
8875 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8877 scan += UTF8SKIP(scan);
8881 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan, 0))
8886 /* The argument (FLAGS) to all the POSIX node types is the class number */
8893 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8894 if (! utf8_target) {
8895 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8901 while (hardcount < max && scan < loceol
8902 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8905 scan += UTF8SKIP(scan);
8918 if (utf8_target && loceol - scan > max) {
8920 /* We didn't adjust <loceol> at the beginning of this routine
8921 * because is UTF-8, but it is actually ok to do so, since here, to
8922 * match, 1 char == 1 byte. */
8923 loceol = scan + max;
8925 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8938 if (! utf8_target) {
8939 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8945 /* The complement of something that matches only ASCII matches all
8946 * non-ASCII, plus everything in ASCII that isn't in the class. */
8947 while (hardcount < max && scan < loceol
8948 && (! isASCII_utf8(scan)
8949 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8951 scan += UTF8SKIP(scan);
8962 if (! utf8_target) {
8963 while (scan < loceol && to_complement
8964 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8971 classnum = (_char_class_number) FLAGS(p);
8972 if (classnum < _FIRST_NON_SWASH_CC) {
8974 /* Here, a swash is needed for above-Latin1 code points.
8975 * Process as many Latin1 code points using the built-in rules.
8976 * Go to another loop to finish processing upon encountering
8977 * the first Latin1 code point. We could do that in this loop
8978 * as well, but the other way saves having to test if the swash
8979 * has been loaded every time through the loop: extra space to
8981 while (hardcount < max && scan < loceol) {
8982 if (UTF8_IS_INVARIANT(*scan)) {
8983 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8990 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8991 if (! (to_complement
8992 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
9001 goto found_above_latin1;
9008 /* For these character classes, the knowledge of how to handle
9009 * every code point is compiled in to Perl via a macro. This
9010 * code is written for making the loops as tight as possible.
9011 * It could be refactored to save space instead */
9013 case _CC_ENUM_SPACE:
9014 while (hardcount < max
9016 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
9018 scan += UTF8SKIP(scan);
9022 case _CC_ENUM_BLANK:
9023 while (hardcount < max
9025 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
9027 scan += UTF8SKIP(scan);
9031 case _CC_ENUM_XDIGIT:
9032 while (hardcount < max
9034 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
9036 scan += UTF8SKIP(scan);
9040 case _CC_ENUM_VERTSPACE:
9041 while (hardcount < max
9043 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
9045 scan += UTF8SKIP(scan);
9049 case _CC_ENUM_CNTRL:
9050 while (hardcount < max
9052 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
9054 scan += UTF8SKIP(scan);
9059 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9065 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
9067 /* Load the swash if not already present */
9068 if (! PL_utf8_swash_ptrs[classnum]) {
9069 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9070 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
9074 PL_XPosix_ptrs[classnum], &flags);
9077 while (hardcount < max && scan < loceol
9078 && to_complement ^ cBOOL(_generic_utf8(
9081 swash_fetch(PL_utf8_swash_ptrs[classnum],
9085 scan += UTF8SKIP(scan);
9092 while (hardcount < max && scan < loceol &&
9093 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9098 /* LNBREAK can match one or two latin chars, which is ok, but we
9099 * have to use hardcount in this situation, and throw away the
9100 * adjustment to <loceol> done before the switch statement */
9101 loceol = reginfo->strend;
9102 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9111 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9125 /* These are all 0 width, so match right here or not at all. */
9129 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9130 NOT_REACHED; /* NOTREACHED */
9137 c = scan - *startposp;
9141 GET_RE_DEBUG_FLAGS_DECL;
9143 SV * const prop = sv_newmortal();
9144 regprop(prog, prop, p, reginfo, NULL);
9145 Perl_re_exec_indentf( aTHX_ "%s can match %"IVdf" times out of %"IVdf"...\n",
9146 depth, SvPVX_const(prop),(IV)c,(IV)max);
9154 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9156 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
9157 create a copy so that changes the caller makes won't change the shared one.
9158 If <altsvp> is non-null, will return NULL in it, for back-compat.
9161 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9163 PERL_ARGS_ASSERT_REGCLASS_SWASH;
9169 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9172 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9175 - reginclass - determine if a character falls into a character class
9177 n is the ANYOF-type regnode
9178 p is the target string
9179 p_end points to one byte beyond the end of the target string
9180 utf8_target tells whether p is in UTF-8.
9182 Returns true if matched; false otherwise.
9184 Note that this can be a synthetic start class, a combination of various
9185 nodes, so things you think might be mutually exclusive, such as locale,
9186 aren't. It can match both locale and non-locale
9191 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9194 const char flags = ANYOF_FLAGS(n);
9198 PERL_ARGS_ASSERT_REGINCLASS;
9200 /* If c is not already the code point, get it. Note that
9201 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9202 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9204 c = utf8n_to_uvchr(p, p_end - p, &c_len,
9205 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
9206 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
9207 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
9208 * UTF8_ALLOW_FFFF */
9209 if (c_len == (STRLEN)-1)
9210 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
9211 if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
9212 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9216 /* If this character is potentially in the bitmap, check it */
9217 if (c < NUM_ANYOF_CODE_POINTS) {
9218 if (ANYOF_BITMAP_TEST(n, c))
9221 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9228 else if (flags & ANYOF_LOCALE_FLAGS) {
9229 if ((flags & ANYOFL_FOLD)
9231 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9235 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9239 /* The data structure is arranged so bits 0, 2, 4, ... are set
9240 * if the class includes the Posix character class given by
9241 * bit/2; and 1, 3, 5, ... are set if the class includes the
9242 * complemented Posix class given by int(bit/2). So we loop
9243 * through the bits, each time changing whether we complement
9244 * the result or not. Suppose for the sake of illustration
9245 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
9246 * is set, it means there is a match for this ANYOF node if the
9247 * character is in the class given by the expression (0 / 2 = 0
9248 * = \w). If it is in that class, isFOO_lc() will return 1,
9249 * and since 'to_complement' is 0, the result will stay TRUE,
9250 * and we exit the loop. Suppose instead that bit 0 is 0, but
9251 * bit 1 is 1. That means there is a match if the character
9252 * matches \W. We won't bother to call isFOO_lc() on bit 0,
9253 * but will on bit 1. On the second iteration 'to_complement'
9254 * will be 1, so the exclusive or will reverse things, so we
9255 * are testing for \W. On the third iteration, 'to_complement'
9256 * will be 0, and we would be testing for \s; the fourth
9257 * iteration would test for \S, etc.
9259 * Note that this code assumes that all the classes are closed
9260 * under folding. For example, if a character matches \w, then
9261 * its fold does too; and vice versa. This should be true for
9262 * any well-behaved locale for all the currently defined Posix
9263 * classes, except for :lower: and :upper:, which are handled
9264 * by the pseudo-class :cased: which matches if either of the
9265 * other two does. To get rid of this assumption, an outer
9266 * loop could be used below to iterate over both the source
9267 * character, and its fold (if different) */
9270 int to_complement = 0;
9272 while (count < ANYOF_MAX) {
9273 if (ANYOF_POSIXL_TEST(n, count)
9274 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9287 /* If the bitmap didn't (or couldn't) match, and something outside the
9288 * bitmap could match, try that. */
9290 if (c >= NUM_ANYOF_CODE_POINTS
9291 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9293 match = TRUE; /* Everything above the bitmap matches */
9295 /* Here doesn't match everything above the bitmap. If there is
9296 * some information available beyond the bitmap, we may find a
9297 * match in it. If so, this is most likely because the code point
9298 * is outside the bitmap range. But rarely, it could be because of
9299 * some other reason. If so, various flags are set to indicate
9300 * this possibility. On ANYOFD nodes, there may be matches that
9301 * happen only when the target string is UTF-8; or for other node
9302 * types, because runtime lookup is needed, regardless of the
9303 * UTF-8ness of the target string. Finally, under /il, there may
9304 * be some matches only possible if the locale is a UTF-8 one. */
9305 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
9306 && ( c >= NUM_ANYOF_CODE_POINTS
9307 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9308 && ( UNLIKELY(OP(n) != ANYOFD)
9309 || (utf8_target && ! isASCII_uni(c)
9310 # if NUM_ANYOF_CODE_POINTS > 256
9314 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9315 && IN_UTF8_CTYPE_LOCALE)))
9317 SV* only_utf8_locale = NULL;
9318 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9319 &only_utf8_locale, NULL);
9325 } else { /* Convert to utf8 */
9326 utf8_p = utf8_buffer;
9327 append_utf8_from_native_byte(*p, &utf8_p);
9328 utf8_p = utf8_buffer;
9331 if (swash_fetch(sw, utf8_p, TRUE)) {
9335 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9336 match = _invlist_contains_cp(only_utf8_locale, c);
9340 if (UNICODE_IS_SUPER(c)
9342 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9344 && ckWARN_d(WARN_NON_UNICODE))
9346 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9347 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
9351 #if ANYOF_INVERT != 1
9352 /* Depending on compiler optimization cBOOL takes time, so if don't have to
9354 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9357 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9358 return (flags & ANYOF_INVERT) ^ match;
9362 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9364 /* return the position 'off' UTF-8 characters away from 's', forward if
9365 * 'off' >= 0, backwards if negative. But don't go outside of position
9366 * 'lim', which better be < s if off < 0 */
9368 PERL_ARGS_ASSERT_REGHOP3;
9371 while (off-- && s < lim) {
9372 /* XXX could check well-formedness here */
9377 while (off++ && s > lim) {
9379 if (UTF8_IS_CONTINUED(*s)) {
9380 while (s > lim && UTF8_IS_CONTINUATION(*s))
9382 if (! UTF8_IS_START(*s)) {
9383 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9386 /* XXX could check well-formedness here */
9393 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9395 PERL_ARGS_ASSERT_REGHOP4;
9398 while (off-- && s < rlim) {
9399 /* XXX could check well-formedness here */
9404 while (off++ && s > llim) {
9406 if (UTF8_IS_CONTINUED(*s)) {
9407 while (s > llim && UTF8_IS_CONTINUATION(*s))
9409 if (! UTF8_IS_START(*s)) {
9410 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9413 /* XXX could check well-formedness here */
9419 /* like reghop3, but returns NULL on overrun, rather than returning last
9423 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9425 PERL_ARGS_ASSERT_REGHOPMAYBE3;
9428 while (off-- && s < lim) {
9429 /* XXX could check well-formedness here */
9436 while (off++ && s > lim) {
9438 if (UTF8_IS_CONTINUED(*s)) {
9439 while (s > lim && UTF8_IS_CONTINUATION(*s))
9441 if (! UTF8_IS_START(*s)) {
9442 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9445 /* XXX could check well-formedness here */
9454 /* when executing a regex that may have (?{}), extra stuff needs setting
9455 up that will be visible to the called code, even before the current
9456 match has finished. In particular:
9458 * $_ is localised to the SV currently being matched;
9459 * pos($_) is created if necessary, ready to be updated on each call-out
9461 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9462 isn't set until the current pattern is successfully finished), so that
9463 $1 etc of the match-so-far can be seen;
9464 * save the old values of subbeg etc of the current regex, and set then
9465 to the current string (again, this is normally only done at the end
9470 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9473 regexp *const rex = ReANY(reginfo->prog);
9474 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9476 eval_state->rex = rex;
9479 /* Make $_ available to executed code. */
9480 if (reginfo->sv != DEFSV) {
9482 DEFSV_set(reginfo->sv);
9485 if (!(mg = mg_find_mglob(reginfo->sv))) {
9486 /* prepare for quick setting of pos */
9487 mg = sv_magicext_mglob(reginfo->sv);
9490 eval_state->pos_magic = mg;
9491 eval_state->pos = mg->mg_len;
9492 eval_state->pos_flags = mg->mg_flags;
9495 eval_state->pos_magic = NULL;
9497 if (!PL_reg_curpm) {
9498 /* PL_reg_curpm is a fake PMOP that we can attach the current
9499 * regex to and point PL_curpm at, so that $1 et al are visible
9500 * within a /(?{})/. It's just allocated once per interpreter the
9501 * first time its needed */
9502 Newxz(PL_reg_curpm, 1, PMOP);
9505 SV* const repointer = &PL_sv_undef;
9506 /* this regexp is also owned by the new PL_reg_curpm, which
9507 will try to free it. */
9508 av_push(PL_regex_padav, repointer);
9509 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
9510 PL_regex_pad = AvARRAY(PL_regex_padav);
9514 SET_reg_curpm(reginfo->prog);
9515 eval_state->curpm = PL_curpm;
9516 PL_curpm = PL_reg_curpm;
9517 if (RXp_MATCH_COPIED(rex)) {
9518 /* Here is a serious problem: we cannot rewrite subbeg,
9519 since it may be needed if this match fails. Thus
9520 $` inside (?{}) could fail... */
9521 eval_state->subbeg = rex->subbeg;
9522 eval_state->sublen = rex->sublen;
9523 eval_state->suboffset = rex->suboffset;
9524 eval_state->subcoffset = rex->subcoffset;
9526 eval_state->saved_copy = rex->saved_copy;
9528 RXp_MATCH_COPIED_off(rex);
9531 eval_state->subbeg = NULL;
9532 rex->subbeg = (char *)reginfo->strbeg;
9534 rex->subcoffset = 0;
9535 rex->sublen = reginfo->strend - reginfo->strbeg;
9539 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
9542 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
9544 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9545 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
9548 Safefree(aux->poscache);
9552 /* undo the effects of S_setup_eval_state() */
9554 if (eval_state->subbeg) {
9555 regexp * const rex = eval_state->rex;
9556 rex->subbeg = eval_state->subbeg;
9557 rex->sublen = eval_state->sublen;
9558 rex->suboffset = eval_state->suboffset;
9559 rex->subcoffset = eval_state->subcoffset;
9561 rex->saved_copy = eval_state->saved_copy;
9563 RXp_MATCH_COPIED_on(rex);
9565 if (eval_state->pos_magic)
9567 eval_state->pos_magic->mg_len = eval_state->pos;
9568 eval_state->pos_magic->mg_flags =
9569 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9570 | (eval_state->pos_flags & MGf_BYTES);
9573 PL_curpm = eval_state->curpm;
9576 PL_regmatch_state = aux->old_regmatch_state;
9577 PL_regmatch_slab = aux->old_regmatch_slab;
9579 /* free all slabs above current one - this must be the last action
9580 * of this function, as aux and eval_state are allocated within
9581 * slabs and may be freed here */
9583 s = PL_regmatch_slab->next;
9585 PL_regmatch_slab->next = NULL;
9587 regmatch_slab * const osl = s;
9596 S_to_utf8_substr(pTHX_ regexp *prog)
9598 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9599 * on the converted value */
9603 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9606 if (prog->substrs->data[i].substr
9607 && !prog->substrs->data[i].utf8_substr) {
9608 SV* const sv = newSVsv(prog->substrs->data[i].substr);
9609 prog->substrs->data[i].utf8_substr = sv;
9610 sv_utf8_upgrade(sv);
9611 if (SvVALID(prog->substrs->data[i].substr)) {
9612 if (SvTAIL(prog->substrs->data[i].substr)) {
9613 /* Trim the trailing \n that fbm_compile added last
9615 SvCUR_set(sv, SvCUR(sv) - 1);
9616 /* Whilst this makes the SV technically "invalid" (as its
9617 buffer is no longer followed by "\0") when fbm_compile()
9618 adds the "\n" back, a "\0" is restored. */
9619 fbm_compile(sv, FBMcf_TAIL);
9623 if (prog->substrs->data[i].substr == prog->check_substr)
9624 prog->check_utf8 = sv;
9630 S_to_byte_substr(pTHX_ regexp *prog)
9632 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9633 * on the converted value; returns FALSE if can't be converted. */
9637 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9640 if (prog->substrs->data[i].utf8_substr
9641 && !prog->substrs->data[i].substr) {
9642 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9643 if (! sv_utf8_downgrade(sv, TRUE)) {
9646 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9647 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9648 /* Trim the trailing \n that fbm_compile added last
9650 SvCUR_set(sv, SvCUR(sv) - 1);
9651 fbm_compile(sv, FBMcf_TAIL);
9655 prog->substrs->data[i].substr = sv;
9656 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9657 prog->check_substr = sv;
9665 * ex: set ts=8 sts=4 sw=4 et: