regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which, funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
     /* The RF_* values are single-bit flags; RF_tainted and RF_utf8 are OR'd
      * into PL_reg_flags elsewhere in this file. */
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
     /* True when the RF_utf8 bit is currently set in PL_reg_flags. */
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
     /* Default STATIC to plain 'static' if nothing included above defined it. */
  90 #ifndef STATIC
  91 #define STATIC  static
  92 #endif
  93
  94 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  95  * call if there are no complications: i.e., if everything matchable is
  96  * straightforward in the bitmap.
      *
      * p is the node whose ANYOF flags and bitmap are consulted; c points at the
      * byte to test.  When ANYOF_FLAGS(p) is zero the result is simply
      * ANYOF_BITMAP_TEST(p,*(c)); otherwise reginclass() is called with its last
      * two arguments passed as 0. */
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  98                                               : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
     /* Length/position helpers.  Each picks a character-based computation for
      * utf8 targets and a byte-based one otherwise.
      *
      * CHR_SVLEN(sv): sv_len_utf8(sv) when the caller's local 'utf8_target' is
      *      true, else SvCUR(sv).  The expanding routine must have a variable
      *      named utf8_target in scope.
      * CHR_DIST(a,b): utf8_distance(a,b) when PL_reg_match_utf8 is set, else the
      *      plain pointer difference a - b.
      *
      * NOTE(review): the arguments of CHR_DIST, HOPc, HOPBACKc and the 'pos + off'
      * arm of HOP3 are not parenthesized in the expansion, so callers should
      * pass simple expressions. */
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
     /* HOPc(pos,off): yields a char* that is 'off' positions away from pos.
      *      With PL_reg_match_utf8 set this goes through reghop3(), limited by
      *      PL_regeol when off >= 0 and by PL_bostr when off is negative;
      *      otherwise it is plain pointer arithmetic with no limit check. */
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
     /* HOPBACKc(pos,off): yields a char* that is 'off' positions before pos.
      *      With PL_reg_match_utf8 set the result is whatever
      *      reghopmaybe3(pos, -off, PL_bostr) returns; in the byte case it is
      *      pos - off, or NULL if that would fall before PL_bostr. */
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
     /* HOP3(pos,off,lim): like HOPc but with an explicit limit and a U8* result.
      *      'lim' is only consulted in the utf8 case (passed to reghop3()); the
      *      byte case is unchecked pointer arithmetic.
      * HOP3c(pos,off,lim): HOP3 cast to char*. */
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* These are unrolled below in the CCC_TRY_XXX defines.
      *
      * LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is not yet set, call
      * is_utf8_<class>(str) between ENTER/save_re_context() and LEAVE, then
      * assert both that the call returned true and that PL_utf8_<class> is now
      * set.  'str' is therefore expected to be a sample string that belongs to
      * the class.  Nothing is done when PL_utf8_<class> is already set. */
 122 #ifdef EBCDIC
 123     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 124      * skip the check on EBCDIC platforms */
 125 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 126 #else
 127 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 128     if (!CAT2(PL_utf8_,class)) { \
 129         bool ok; \
 130         ENTER; save_re_context(); \
 131         ok=CAT2(is_utf8_,class)((const U8*)str); \
 132         PERL_UNUSED_VAR(ok); \
 133         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 134 #endif
 135
 136 /* Same as LOAD_UTF8_CHARCLASS but doesn't do an assert to verify that the
      * result is correct: if PL_utf8_<class> is not yet set, is_utf8_<class>() is
      * called on the fixed string " " and its return value is discarded. */
 137 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 138     if (!CAT2(PL_utf8_,class)) { \
 139         bool throw_away PERL_UNUSED_DECL; \
 140         ENTER; save_re_context(); \
 141         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 142         LEAVE; } } STMT_END
 143
     /* Convenience loaders: each passes a class name together with a sample
      * string to LOAD_UTF8_CHARCLASS. */
 144 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 145 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 146 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 147
     /* Loads every X_* class listed below in one go.  The expansion is a plain
      * sequence of statements (the last one without a trailing semicolon), not
      * a single STMT_START/STMT_END unit. */
 148 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 149         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 150         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 151         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 152             * assert should likely and hopefully fail on an EBCDIC machine */ \
 153         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 154                                                                             \
 155         /* No asserts are done for these, in case called on an early        \
 156             * Unicode version in which they map to nothing */               \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 164
 165 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 166
 167 /* The actual code for CCC_TRY, which uses several variables from the routine
 168  * it's callable from.  It is designed to be the bulk of a case statement.
 169  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 170  *      nextchr matches the class.
 171  * UTF8_TEST is the whole test string to use for utf8 targets
 172  * CLASS and STR are handed to LOAD_UTF8_CHARCLASS, which tests whether the
 173  *      swash for the class is present and, if not, loads it
 174  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 175  *      UTF8_TEST test.
 176  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 177  * utf8 and a variant, load the swash if necessary and test using the utf8
 178  * test.  Advance to the next character if test is ok, otherwise fail; If not
 179  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 180  * fails, or advance to the next character
      *
      * The expansion reads and updates the caller's 'locinput' and 'nextchr',
      * reads 'utf8_target' and PL_regeol, fails through sayNO, and finishes every
      * successful path with 'break'.  In the utf8 branch locinput advances by
      * PL_utf8skip[nextchr]; otherwise it advances by one byte. */
 181
 182 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 183     if (locinput >= PL_regeol) {                                              \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 187         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 188         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 189             sayNO;                                                            \
 190         }                                                                     \
 191         locinput += PL_utf8skip[nextchr];                                     \
 192         nextchr = UCHARAT(locinput);                                          \
 193         break;                                                                \
 194     }                                                                         \
 195     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 196         sayNO;                                                                \
 197     }                                                                         \
 198     nextchr = UCHARAT(++locinput);                                            \
 199     break;
 200
 201 /* Handle the non-locale cases for a character class and its complement.  It
 202  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 203  * This is because that code fails when the test succeeds, so we want to have
 204  * the test fail so that the code succeeds.  The swash is stored in a
 205  * predictable PL_ place
      *
      * NAME is the case label for the class itself and NNAME the label for its
      * complement.  FUNC is the non-utf8 test.  CLASS is pasted onto PL_utf8_ to
      * name the swash that swash_fetch() is given for utf8 targets, and CLASS and
      * STR are forwarded unchanged to _CCC_TRY_CODE.  Both generated cases use
      * the same tests; only the POS_OR_NEG argument (! versus PLACEHOLDER)
      * differs. */
 206 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 207                            CLASS, STR)                                        \
 208     case NAME:                                                                \
 209         _CCC_TRY_CODE( !, FUNC,                                               \
 210                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 211                                             (U8*)locinput, TRUE)),            \
 212                           CLASS, STR)                                         \
 213     case NNAME:                                                               \
 214         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218
 219 /* Generate the case statements for both locale and non-locale character
 220  * classes in regmatch for classes that don't have special unicode semantics.
 221  * Locales don't use an immediate swash, but an intermediary special locale
 222  * function that is called on the pointer to the current place in the input
 223  * string.  That function will resolve to needing the same swash.  One might
 224  * think that because we don't know what the locale will match, we shouldn't
 225  * check with the swash loading function that it loaded properly; ie, that we
 226  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 227  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 228  * irrelevant here
      *
      * Parameters, by group:
      *   NAME, NNAME, FUNC     - non-locale cases, generated by
      *                           _CCC_TRY_NONLOCALE at the end of the expansion
      *   NAMEL, NNAMEL         - locale cases; both set RF_tainted in
      *                           PL_reg_flags before testing
      *   LCFUNC, LCFUNC_utf8   - locale tests: LCFUNC is applied to nextchr,
      *                           LCFUNC_utf8 to (U8*)locinput
      *   NAMEA, NNAMEA, FUNCA  - cases tested with FUNCA(nextchr) only (presumably
      *                           the ASCII-restricted variants; confirm against
      *                           the callers).  NAMEA always advances one byte;
      *                           NNAMEA advances by PL_utf8skip[nextchr] on a
      *                           utf8 target and by one byte otherwise
      *   CLASS, STR            - forwarded to _CCC_TRY_CODE for swash loading */
 229 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 230                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 231                 NAMEA, NNAMEA, FUNCA,                                         \
 232                 CLASS, STR)                                                   \
 233     case NAMEL:                                                               \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 236     case NNAMEL:                                                              \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 239                        CLASS, STR)                                            \
 240     case NAMEA:                                                               \
 241         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 242             sayNO;                                                            \
 243         }                                                                     \
 244         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 245         nextchr = UCHARAT(++locinput);                                        \
 246         break;                                                                \
 247     case NNAMEA:                                                              \
 248         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 249             sayNO;                                                            \
 250         }                                                                     \
 251         if (utf8_target) {                                                    \
 252             locinput += PL_utf8skip[nextchr];                                 \
 253             nextchr = UCHARAT(locinput);                                      \
 254         }                                                                     \
 255         else {                                                                \
 256             nextchr = UCHARAT(++locinput);                                    \
 257         }                                                                     \
 258         break;                                                                \
 259     /* Generate the non-locale cases */                                       \
 260     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 261
 262 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 263  * statements to handle separate Unicode semantics nodes
      *
      * It expands to CCC_TRY with every argument except NAMEU, NNAMEU and FUNCU,
      * followed by a second _CCC_TRY_NONLOCALE that generates the NAMEU/NNAMEU
      * cases using FUNCU as the non-utf8 test and the same CLASS and STR. */
 264 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 265                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 266                   NAMEU, NNAMEU, FUNCU,                                        \
 267                   NAMEA, NNAMEA, FUNCA,                                        \
 268                   CLASS, STR)                                                  \
 269     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 270             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 271             NAMEA, NNAMEA, FUNCA,                                              \
 272             CLASS, STR)                                                        \
 273     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 274
 275 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 276
 277 /* for use after a quantifier and before an EXACT-like node -- japhy */
 278 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 279  *
 280  * NOTE that *nothing* that affects backtracking should be in here, specifically
 281  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
 282  * node that is in between two EXACT like nodes when ascertaining what the required
 283  * "follow" character is. This should probably be moved to regex compile time
 284  * although it may be done at run time because of the REF possibility - more
 285  * investigation required. -- demerphq
      *
      * The expansion refers to the caller's 'cur_eval': a CLOSE node counts as
      * jumpable only when cur_eval is NULL or its u.eval.close_paren differs from
      * ARG(rn).  CURLY-kind nodes count only when ARG1(rn) > 0.
 286 */
 287 #define JUMPABLE(rn) (      \
 288     OP(rn) == OPEN ||       \
 289     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 290     OP(rn) == EVAL ||   \
 291     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 292     OP(rn) == PLUS || OP(rn) == MINMOD || \
 293     OP(rn) == KEEPS || \
 294     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 295 )
     /* True when the node's kind, looked up in PL_regkind[], is EXACT. */
 296 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 297
     /* True for EXACT-kind nodes and for nodes whose kind is REF. */
 298 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 299
 300 #if 0
 301 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 302    we don't need this definition.  Note that this disabled branch does not
        define IS_TEXTFU, which the active branch below does. */
 303 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 304 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 305 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 306
 307 #else
 308 /* ... so we use this as it's faster.  Each macro is an exact opcode
        comparison on OP(rn); IS_TEXTFU accepts any of the four opcodes listed. */
 309 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 310 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 311 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 312 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 313
 314 #endif
 315
 316 /*
 317   Search for mandatory following text node; for lookahead, the text must
 318   follow but for lookbehind (rn->flags != 0) we skip to the next step.

       rn is advanced in place for as long as JUMPABLE(rn) holds, so it must be
       a modifiable variable.  How far each step moves depends on the node:
         SUSPEND, or CURLY kind : NEXTOPER(NEXTOPER(rn))
         PLUS                   : NEXTOPER(rn)
         IFMATCH                : NEXTOPER(NEXTOPER(rn)) when rn->flags == 0,
                                  otherwise rn + ARG(rn)
         anything else          : rn + NEXT_OFF(rn)
 319 */
 320 #define FIND_NEXT_IMPT(rn) STMT_START { \
 321     while (JUMPABLE(rn)) { \
 322         const OPCODE type = OP(rn); \
 323         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 324             rn = NEXTOPER(NEXTOPER(rn)); \
 325         else if (type == PLUS) \
 326             rn = NEXTOPER(rn); \
 327         else if (type == IFMATCH) \
 328             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 329         else rn += NEXT_OFF(rn); \
 330     } \
 331 } STMT_END
 332
 333
 334 static void restore_pos(pTHX_ void *arg);
 335
     /* Element counts for the savestack frame written by S_regcppush() and read
      * back by S_regcppop():
      *   REGCP_PAREN_ELEMS - values saved per capture group (offs[p].end,
      *                       .start and .start_tmp)
      *   REGCP_OTHER_ELEMS - values saved once per frame (PL_regsize,
      *                       rex->lastparen, rex->lastcloseparen)
      *   REGCP_FRAME_ELEMS - the single SAVEt_REGCONTEXT cookie word pushed last */
 336 #define REGCP_PAREN_ELEMS 3
 337 #define REGCP_OTHER_ELEMS 3
 338 #define REGCP_FRAME_ELEMS 1
 339 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 340  * are needed for the regexp context stack bookkeeping. */
 341
 342 STATIC CHECKPOINT
 343 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
     /* Save the capture-group state of rex on the savestack.
      *
      * rex:        regexp whose offs[], lastparen and lastcloseparen are saved
      * parenfloor: groups parenfloor+1 .. PL_regsize are saved; lower-numbered
      *             groups are not pushed
      *
      * Pushed, in order: (end, start, start_tmp) for each saved group, then
      * PL_regsize, rex->lastparen, rex->lastcloseparen, and finally a cookie made
      * of SAVEt_REGCONTEXT OR'd with the element count shifted left by
      * SAVE_TIGHT_SHIFT.  S_regcppop() reads these back in the reverse order.
      *
      * Returns the value PL_savestack_ix had on entry.  Croaks if the computed
      * group count is negative or if the element count does not survive the
      * shift round-trip. */
 344 {
 345     dVAR;
 346     const int retval = PL_savestack_ix;
 347     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 348     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 349     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 350     I32 p;
 351     GET_RE_DEBUG_FLAGS_DECL;
 352
 353     PERL_ARGS_ASSERT_REGCPPUSH;
 354
 355     if (paren_elems_to_push < 0)
 356         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 357                    paren_elems_to_push);
 358
         /* Shifting back must give the original count, otherwise bits were lost
          * and the cookie could not encode the frame size. */
 359     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 360         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 361                    " out of range (%lu-%ld)",
 362                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 363
         /* Requested size covers every element pushed below plus the cookie. */
 364     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 365
 366     DEBUG_BUFFERS_r(
 367         if ((int)PL_regsize > (int)parenfloor)
 368             PerlIO_printf(Perl_debug_log,
 369                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 370                 PTR2UV(rex),
 371                 PTR2UV(rex->offs)
 372             );
 373     );
 374     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 375 /* REGCP_PAREN_ELEMS are pushed per pairs of parentheses. */
 376         SSPUSHINT(rex->offs[p].end);
 377         SSPUSHINT(rex->offs[p].start);
 378         SSPUSHINT(rex->offs[p].start_tmp);
 379         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 380             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 381             (UV)p,
 382             (IV)rex->offs[p].start,
 383             (IV)rex->offs[p].start_tmp,
 384             (IV)rex->offs[p].end
 385         ));
 386     }
 387 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 388     SSPUSHINT(PL_regsize);
 389     SSPUSHINT(rex->lastparen);
 390     SSPUSHINT(rex->lastcloseparen);
 391     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 392
 393     return retval;
 394 }
 395
 396 /* These are needed since we do not localize EVAL nodes: */
     /* REGCP_SET(cp): store the current PL_savestack_ix in cp, after an optional
      * DEBUG_STATE_r trace.
      * NOTE(review): the expansion is two statements with no enclosing
      * STMT_START/STMT_END, and relies on the caller for the final ';'. */
 397 #define REGCP_SET(cp)                                           \
 398     DEBUG_STATE_r(                                              \
 399             PerlIO_printf(Perl_debug_log,                       \
 400                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 401                 (IV)PL_savestack_ix));                          \
 402     cp = PL_savestack_ix
 403
     /* REGCP_UNWIND(cp): call regcpblow(cp), tracing first under DEBUG_STATE_r
      * when cp differs from the current PL_savestack_ix.  Like REGCP_SET, the
      * expansion is not a single statement. */
 404 #define REGCP_UNWIND(cp)                                        \
 405     DEBUG_STATE_r(                                              \
 406         if (cp != PL_savestack_ix)                              \
 407             PerlIO_printf(Perl_debug_log,                       \
 408                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 409                 (IV)(cp), (IV)PL_savestack_ix));                \
 410     regcpblow(cp)
 411
     /* UNWIND_PAREN(lp, lcp): set offs[n].end to -1 for every group n in
      * lp+1 .. rex->lastparen, leave rex->lastparen at the value n has when the
      * loop stops, and set rex->lastcloseparen to lcp.  Uses the caller's 'n'
      * and 'rex' variables.
      * NOTE(review): expands to three statements without STMT_START/STMT_END,
      * so it is unsafe as the body of an unbraced if/else. */
 412 #define UNWIND_PAREN(lp, lcp)               \
 413     for (n = rex->lastparen; n > lp; n--)   \
 414         rex->offs[n].end = -1;              \
 415     rex->lastparen = n;                     \
 416     rex->lastcloseparen = lcp;
 417
 418
 419 STATIC void
 420 S_regcppop(pTHX_ regexp *rex)
     /* Pop one frame written by S_regcppush() off the savestack and restore the
      * capture-group state of rex from it.
      *
      * Restores rex->lastcloseparen, rex->lastparen and PL_regsize, then the
      * (start_tmp, start, end) triple of each saved group, walking downwards
      * from group PL_regsize.  A saved 'end' is only written back for groups
      * numbered <= the restored lastparen.  Afterwards every group above
      * lastparen gets end = -1, and those above PL_regsize also get
      * start = -1. */
 421 {
 422     dVAR;
 423     UV i;
 424     U32 paren;
 425     GET_RE_DEBUG_FLAGS_DECL;
 426
 427     PERL_ARGS_ASSERT_REGCPPOP;
 428
 429     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 430     i = SSPOPUV;
 431     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 432     i >>= SAVE_TIGHT_SHIFT; /* Total elements in the frame (parentheses + other). */
 433     rex->lastcloseparen = SSPOPINT;
 434     rex->lastparen = SSPOPINT;
 435     PL_regsize = SSPOPINT;
 436
 437     i -= REGCP_OTHER_ELEMS; /* What remains is the parentheses elements to pop. */
 438     /* Now restore the parentheses context. */
 439     DEBUG_BUFFERS_r(
 440         if (i || rex->lastparen + 1 <= rex->nparens)
 441             PerlIO_printf(Perl_debug_log,
 442                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 443                 PTR2UV(rex),
 444                 PTR2UV(rex->offs)
 445             );
 446     );
 447     paren = PL_regsize;
 448     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 449         I32 tmps;
             /* Popped in the reverse of the order S_regcppush() pushed them. */
 450         rex->offs[paren].start_tmp = SSPOPINT;
 451         rex->offs[paren].start = SSPOPINT;
 452         tmps = SSPOPINT;
             /* The saved end is discarded for groups above lastparen. */
 453         if (paren <= rex->lastparen)
 454             rex->offs[paren].end = tmps;
 455         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 456             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 457             (UV)paren,
 458             (IV)rex->offs[paren].start,
 459             (IV)rex->offs[paren].start_tmp,
 460             (IV)rex->offs[paren].end,
 461             (paren > rex->lastparen ? "(skipped)" : ""));
 462         );
 463         paren--;
 464     }
 465 #if 1
 466     /* It would seem that the similar code in regtry()
 467      * already takes care of this, and in fact it is in
 468      * a better location too, since this code can be #if 0-ed out
 469      * but the code in regtry() is needed or otherwise tests
 470      * requiring null fields (pat.t#187 and split.t#{13,14}
 471      * (as of patchlevel 7877)  will fail.  Then again,
 472      * this code seems to be necessary or otherwise
 473      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 474      * --jhi updated by dapm */
 475     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 476         if (i > PL_regsize)
 477             rex->offs[i].start = -1;
 478         rex->offs[i].end = -1;
 479         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 480             "    \\%"UVuf": %s   ..-1 undeffing\n",
 481             (UV)i,
 482             (i > PL_regsize) ? "-1" : "  "
 483         ));
 484     }
 485 #endif
 486 }
 487
 488 /* Restore the parens and associated vars at savestack position ix,
 489  * but without popping the stack.
      *
      * PL_savestack_ix is temporarily set to ix so that regcppop() reads the
      * frame ending there, and is then put back to its previous value, leaving
      * the saved data on the stack.  ix is presumably the index just past a
      * frame written by regcppush() -- confirm against the callers. */
 490
 491 STATIC void
 492 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 493 {
 494     I32 tmpix = PL_savestack_ix;
 495     PL_savestack_ix = ix;
 496     regcppop(rex);
 497     PL_savestack_ix = tmpix;
 498 }
 499
     /* regcpblow(cp) is simply LEAVE_SCOPE(cp); nothing is copied back into the
      * regexp. */
 500 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 501
 502 /*
 503  * pregexec and friends
 504  */
 505
 506 #ifndef PERL_IN_XSUB_RE
 507 /*
 508  - pregexec - match a regexp against a string
      -
      - Thin wrapper around regexec_flags(): the first six arguments are passed
      - through unchanged, the seventh is NULL, and the flags argument is
      - REXEC_COPY_STR unless nosave is true, in which case it is 0.
      - Returns whatever regexec_flags() returns.
      - Only compiled when PERL_IN_XSUB_RE is not defined.
 509  */
 510 I32
 511 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 512          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 513 /* strend: pointer to null at end of string */
 514 /* strbeg: real beginning of string */
 515 /* minend: end of match must be >=minend after stringarg. */
 516 /* nosave: For optimizations. */
 517 {
 518     PERL_ARGS_ASSERT_PREGEXEC;
 519
 520     return
 521         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 522                       nosave ? 0 : REXEC_COPY_STR);
 523 }
 524 #endif
 525
 526 /*
 527  * Need to implement the following flags for reg_anch:
 528  *
 529  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 530  * USE_INTUIT_ML
 531  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 532  * INTUIT_AUTORITATIVE_ML
 533  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 534  * INTUIT_ONCE_ML
 535  *
 536  * Another flag for this function: SECOND_TIME (so that float substrs
 537  * with giant delta may be not rechecked).
 538  */
 539
 540 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 541
 542 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 543    Otherwise, only SvCUR(sv) is used to get strbeg. */
 544
 545 /* XXXX We assume that strpos is strbeg unless sv. */
 546
 547 /* XXXX Some places assume that there is a fixed substring.
 548         An update may be needed if optimizer marks as "INTUITable"
 549         RExen without fixed substrings.  Similarly, it is assumed that
 550         lengths of all the strings are no more than minlen, thus they
 551         cannot come from lookahead.
 552         (Or minlen should take into account lookahead.)
 553   NOTE: Some of this comment is not correct. minlen does now take account
 554   of lookahead/behind. Further research is required. -- demerphq
 555
 556 */
 557
 558 /* A failure to find a constant substring means that there is no need to make
 559    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 560    finding a substring too deep into the string means that fewer calls to
 561    regtry() should be needed.
 562
 563    REx compiler's optimizer found 4 possible hints:
 564         a) Anchored substring;
 565         b) Fixed substring;
 566         c) Whether we are anchored (beginning-of-line or \G);
 567         d) First node (of those at offset 0) which may distinguish positions;
 568    We use a)b)d) and multiline-part of c), and try to find a position in the
 569    string which does not contradict any of them.
 570  */
 571
 572 /* Most of the decisions we make here should have been made at compile time.
 573    The nodes of the REx which we used for the search should have been
 574    deleted from the finite automaton. */
 575
 576 char *
 577 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 578                      char *strend, const U32 flags, re_scream_pos_data *data)
 579 {
 580     dVAR;
 581     struct regexp *const prog = (struct regexp *)SvANY(rx);
 582     register I32 start_shift = 0;
 583     /* Should be nonnegative! */
 584     register I32 end_shift   = 0;
 585     register char *s;
 586     register SV *check;
 587     char *strbeg;
 588     char *t;
 589     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 590     I32 ml_anch;
 591     register char *other_last = NULL;   /* other substr checked before this */
 592     char *check_at = NULL;              /* check substr found at this pos */
 593     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 594     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 595     RXi_GET_DECL(prog,progi);
 596 #ifdef DEBUGGING
 597     const char * const i_strpos = strpos;
 598 #endif
 599     GET_RE_DEBUG_FLAGS_DECL;
 600
 601     PERL_ARGS_ASSERT_RE_INTUIT_START;
 602     PERL_UNUSED_ARG(flags);
 603     PERL_UNUSED_ARG(data);
 604
 605     RX_MATCH_UTF8_set(rx,utf8_target);
 606
 607     if (RX_UTF8(rx)) {
 608         PL_reg_flags |= RF_utf8;
 609     }
 610     DEBUG_EXECUTE_r(
 611         debug_start_match(rx, utf8_target, strpos, strend,
 612             sv ? "Guessing start of match in sv for"
 613                : "Guessing start of match in string for");
 614               );
 615
 616     /* CHR_DIST() would be more correct here but it makes things slow. */
 617     if (prog->minlen > strend - strpos) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                               "String too short... [re_intuit_start]\n"));
 620         goto fail;
 621     }
 622
 623     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 624     PL_regeol = strend;
 625     if (utf8_target) {
 626         if (!prog->check_utf8 && prog->check_substr)
 627             to_utf8_substr(prog);
 628         check = prog->check_utf8;
 629     } else {
 630         if (!prog->check_substr && prog->check_utf8)
 631             to_byte_substr(prog);
 632         check = prog->check_substr;
 633     }
 634     if (check == &PL_sv_undef) {
 635         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 636                 "Non-utf8 string cannot match utf8 check string\n"));
 637         goto fail;
 638     }
 639     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 640         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 641                      || ( (prog->extflags & RXf_ANCH_BOL)
 642                           && !multiline ) );    /* Check after \n? */
 643
 644         if (!ml_anch) {
 645           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 646                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 647                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 648                && sv && !SvROK(sv)
 649                && (strpos != strbeg)) {
 650               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 651               goto fail;
 652           }
 653           if (prog->check_offset_min == prog->check_offset_max &&
 654               !(prog->extflags & RXf_CANY_SEEN)) {
 655             /* Substring at constant offset from beg-of-str... */
 656             I32 slen;
 657
 658             s = HOP3c(strpos, prog->check_offset_min, strend);
 659
 660             if (SvTAIL(check)) {
 661                 slen = SvCUR(check);    /* >= 1 */
 662
 663                 if ( strend - s > slen || strend - s < slen - 1
 664                      || (strend - s == slen && strend[-1] != '\n')) {
 665                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 666                     goto fail_finish;
 667                 }
 668                 /* Now should match s[0..slen-2] */
 669                 slen--;
 670                 if (slen && (*SvPVX_const(check) != *s
 671                              || (slen > 1
 672                                  && memNE(SvPVX_const(check), s, slen)))) {
 673                   report_neq:
 674                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 675                     goto fail_finish;
 676                 }
 677             }
 678             else if (*SvPVX_const(check) != *s
 679                      || ((slen = SvCUR(check)) > 1
 680                          && memNE(SvPVX_const(check), s, slen)))
 681                 goto report_neq;
 682             check_at = s;
 683             goto success_at_start;
 684           }
 685         }
 686         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 687         s = strpos;
 688         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 689         end_shift = prog->check_end_shift;
 690
 691         if (!ml_anch) {
 692             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 693                                          - (SvTAIL(check) != 0);
 694             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 695
 696             if (end_shift < eshift)
 697                 end_shift = eshift;
 698         }
 699     }
 700     else {                              /* Can match at random position */
 701         ml_anch = 0;
 702         s = strpos;
 703         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 704         end_shift = prog->check_end_shift;
 705
 706         /* end shift should be non negative here */
 707     }
 708
 709 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 710     if (end_shift < 0)
 711         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 712                    (IV)end_shift, RX_PRECOMP(prog));
 713 #endif
 714
 715   restart:
 716     /* Find a possible match in the region s..strend by looking for
 717        the "check" substring in the region corrected by start/end_shift. */
 718
 719     {
 720         I32 srch_start_shift = start_shift;
 721         I32 srch_end_shift = end_shift;
 722         U8* start_point;
 723         U8* end_point;
 724         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 725             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 726             srch_start_shift = strbeg - s;
 727         }
 728     DEBUG_OPTIMISE_MORE_r({
 729         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 730             (IV)prog->check_offset_min,
 731             (IV)srch_start_shift,
 732             (IV)srch_end_shift,
 733             (IV)prog->check_end_shift);
 734     });
 735
 736         if (prog->extflags & RXf_CANY_SEEN) {
 737             start_point= (U8*)(s + srch_start_shift);
 738             end_point= (U8*)(strend - srch_end_shift);
 739         } else {
 740             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 741             end_point= HOP3(strend, -srch_end_shift, strbeg);
 742         }
 743         DEBUG_OPTIMISE_MORE_r({
 744             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 745                 (int)(end_point - start_point),
 746                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 747                 start_point);
 748         });
 749
 750         s = fbm_instr( start_point, end_point,
 751                       check, multiline ? FBMrf_MULTILINE : 0);
 752     }
 753     /* Update the count-of-usability, remove useless subpatterns,
 754         unshift s.  */
 755
 756     DEBUG_EXECUTE_r({
 757         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 758             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 759         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 760                           (s ? "Found" : "Did not find"),
 761             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 762                 ? "anchored" : "floating"),
 763             quoted,
 764             RE_SV_TAIL(check),
 765             (s ? " at offset " : "...\n") );
 766     });
 767
 768     if (!s)
 769         goto fail_finish;
 770     /* Finish the diagnostic message */
 771     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 772
 773     /* XXX dmq: first branch is for positive lookbehind...
 774        Our check string is offset from the beginning of the pattern.
 775        So we need to do any stclass tests offset forward from that
 776        point. I think. :-(
 777      */
 778
 779
 780
 781     check_at=s;
 782
 783
 784     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 785        Start with the other substr.
 786        XXXX no SCREAM optimization yet - and a very coarse implementation
 787        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 788                 *always* match.  Probably should be marked during compile...
 789        Probably it is right to do no SCREAM here...
 790      */
 791
 792     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 793                 : (prog->float_substr && prog->anchored_substr))
 794     {
 795         /* Take into account the "other" substring. */
 796         /* XXXX May be hopelessly wrong for UTF... */
 797         if (!other_last)
 798             other_last = strpos;
 799         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 800           do_other_anchored:
 801             {
 802                 char * const last = HOP3c(s, -start_shift, strbeg);
 803                 char *last1, *last2;
 804                 char * const saved_s = s;
 805                 SV* must;
 806
 807                 t = s - prog->check_offset_max;
 808                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 809                     && (!utf8_target
 810                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 811                             && t > strpos)))
 812                     NOOP;
 813                 else
 814                     t = strpos;
 815                 t = HOP3c(t, prog->anchored_offset, strend);
 816                 if (t < other_last)     /* These positions already checked */
 817                     t = other_last;
 818                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 819                 if (last < last1)
 820                     last1 = last;
 821                 /* XXXX It is not documented what units *_offsets are in.
 822                    We assume bytes, but this is clearly wrong.
 823                    Meaning this code needs to be carefully reviewed for errors.
 824                    dmq.
 825                   */
 826
 827                 /* On end-of-str: see comment below. */
 828                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 829                 if (must == &PL_sv_undef) {
 830                     s = (char*)NULL;
 831                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 832                 }
 833                 else
 834                     s = fbm_instr(
 835                         (unsigned char*)t,
 836                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 837                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 838                         must,
 839                         multiline ? FBMrf_MULTILINE : 0
 840                     );
 841                 DEBUG_EXECUTE_r({
 842                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 843                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 844                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 845                         (s ? "Found" : "Contradicts"),
 846                         quoted, RE_SV_TAIL(must));
 847                 });
 848
 849
 850                 if (!s) {
 851                     if (last1 >= last2) {
 852                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 853                                                 ", giving up...\n"));
 854                         goto fail_finish;
 855                     }
 856                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                         ", trying floating at offset %ld...\n",
 858                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 859                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 860                     s = HOP3c(last, 1, strend);
 861                     goto restart;
 862                 }
 863                 else {
 864                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 865                           (long)(s - i_strpos)));
 866                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 867                     other_last = HOP3c(s, 1, strend);
 868                     s = saved_s;
 869                     if (t == strpos)
 870                         goto try_at_start;
 871                     goto try_at_offset;
 872                 }
 873             }
 874         }
 875         else {          /* Take into account the floating substring. */
 876             char *last, *last1;
 877             char * const saved_s = s;
 878             SV* must;
 879
 880             t = HOP3c(s, -start_shift, strbeg);
 881             last1 = last =
 882                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 883             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 884                 last = HOP3c(t, prog->float_max_offset, strend);
 885             s = HOP3c(t, prog->float_min_offset, strend);
 886             if (s < other_last)
 887                 s = other_last;
 888  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 889             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 890             /* fbm_instr() takes into account exact value of end-of-str
 891                if the check is SvTAIL(ed).  Since false positives are OK,
 892                and end-of-str is not later than strend we are OK. */
 893             if (must == &PL_sv_undef) {
 894                 s = (char*)NULL;
 895                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 896             }
 897             else
 898                 s = fbm_instr((unsigned char*)s,
 899                               (unsigned char*)last + SvCUR(must)
 900                                   - (SvTAIL(must)!=0),
 901                               must, multiline ? FBMrf_MULTILINE : 0);
 902             DEBUG_EXECUTE_r({
 903                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 904                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 905                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 906                     (s ? "Found" : "Contradicts"),
 907                     quoted, RE_SV_TAIL(must));
 908             });
 909             if (!s) {
 910                 if (last1 == last) {
 911                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 912                                             ", giving up...\n"));
 913                     goto fail_finish;
 914                 }
 915                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 916                     ", trying anchored starting at offset %ld...\n",
 917                     (long)(saved_s + 1 - i_strpos)));
 918                 other_last = last;
 919                 s = HOP3c(t, 1, strend);
 920                 goto restart;
 921             }
 922             else {
 923                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 924                       (long)(s - i_strpos)));
 925                 other_last = s; /* Fix this later. --Hugo */
 926                 s = saved_s;
 927                 if (t == strpos)
 928                     goto try_at_start;
 929                 goto try_at_offset;
 930             }
 931         }
 932     }
 933
 934
 935     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 936
 937     DEBUG_OPTIMISE_MORE_r(
 938         PerlIO_printf(Perl_debug_log,
 939             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 940             (IV)prog->check_offset_min,
 941             (IV)prog->check_offset_max,
 942             (IV)(s-strpos),
 943             (IV)(t-strpos),
 944             (IV)(t-s),
 945             (IV)(strend-strpos)
 946         )
 947     );
 948
 949     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 950         && (!utf8_target
 951             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 952                  && t > strpos)))
 953     {
 954         /* Fixed substring is found far enough so that the match
 955            cannot start at strpos. */
 956       try_at_offset:
 957         if (ml_anch && t[-1] != '\n') {
 958             /* Eventually fbm_*() should handle this, but often
 959                anchored_offset is not 0, so this check will not be wasted. */
 960             /* XXXX In the code below we prefer to look for "^" even in
 961                presence of anchored substrings.  And we search even
 962                beyond the found float position.  These pessimizations
 963                are historical artefacts only.  */
 964           find_anchor:
 965             while (t < strend - prog->minlen) {
 966                 if (*t == '\n') {
 967                     if (t < check_at - prog->check_offset_min) {
 968                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 969                             /* Since we moved from the found position,
 970                                we definitely contradict the found anchored
 971                                substr.  Due to the above check we do not
 972                                contradict "check" substr.
 973                                Thus we can arrive here only if check substr
 974                                is float.  Redo checking for "other"=="fixed".
 975                              */
 976                             strpos = t + 1;
 977                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 978                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 979                             goto do_other_anchored;
 980                         }
 981                         /* We don't contradict the found floating substring. */
 982                         /* XXXX Why not check for STCLASS? */
 983                         s = t + 1;
 984                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 985                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 986                         goto set_useful;
 987                     }
 988                     /* Position contradicts check-string */
 989                     /* XXXX probably better to look for check-string
 990                        than for "\n", so one should lower the limit for t? */
 991                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 992                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 993                     other_last = strpos = s = t + 1;
 994                     goto restart;
 995                 }
 996                 t++;
 997             }
 998             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 999                         PL_colors[0], PL_colors[1]));
1000             goto fail_finish;
1001         }
1002         else {
1003             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1004                         PL_colors[0], PL_colors[1]));
1005         }
1006         s = t;
1007       set_useful:
1008         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1009     }
1010     else {
1011         /* The found string does not prohibit matching at strpos,
1012            - no optimization of calling REx engine can be performed,
1013            unless it was an MBOL and we are not after MBOL,
1014            or a future STCLASS check will fail this. */
1015       try_at_start:
1016         /* Even in this situation we may use MBOL flag if strpos is offset
1017            wrt the start of the string. */
1018         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1019             && (strpos != strbeg) && strpos[-1] != '\n'
1020             /* May be due to an implicit anchor of m{.*foo}  */
1021             && !(prog->intflags & PREGf_IMPLICIT))
1022         {
1023             t = strpos;
1024             goto find_anchor;
1025         }
1026         DEBUG_EXECUTE_r( if (ml_anch)
1027             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1028                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1029         );
1030       success_at_start:
1031         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1032             && (utf8_target ? (
1033                 prog->check_utf8                /* Could be deleted already */
1034                 && --BmUSEFUL(prog->check_utf8) < 0
1035                 && (prog->check_utf8 == prog->float_utf8)
1036             ) : (
1037                 prog->check_substr              /* Could be deleted already */
1038                 && --BmUSEFUL(prog->check_substr) < 0
1039                 && (prog->check_substr == prog->float_substr)
1040             )))
1041         {
1042             /* If flags & SOMETHING - do not do it many times on the same match */
1043             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1044             /* XXX Does the destruction order have to change with utf8_target? */
1045             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1046             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1047             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1048             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1049             check = NULL;                       /* abort */
1050             s = strpos;
1051             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1052                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1053             if (prog->intflags & PREGf_IMPLICIT)
1054                 prog->extflags &= ~RXf_ANCH_MBOL;
1055             /* XXXX This is a remnant of the old implementation.  It
1056                     looks wasteful, since now INTUIT can use many
1057                     other heuristics. */
1058             prog->extflags &= ~RXf_USE_INTUIT;
1059             /* XXXX What other flags might need to be cleared in this branch? */
1060         }
1061         else
1062             s = strpos;
1063     }
1064
1065     /* Last resort... */
1066     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1067     /* trie stclasses are too expensive to use here, we are better off to
1068        leave it to regmatch itself */
1069     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1070         /* minlen == 0 is possible if regstclass is \b or \B,
1071            and the fixed substr is ''$.
1072            Since minlen is already taken into account, s+1 is before strend;
1073            accidentally, minlen >= 1 guarantees no false positives at s + 1
1074            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1075            regstclass does not come from lookahead...  */
1076         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1077            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1078         const U8* const str = (U8*)STRING(progi->regstclass);
1079         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1080                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1081                     : 1);
1082         char * endpos;
1083         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1084             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1085         else if (prog->float_substr || prog->float_utf8)
1086             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1087         else
1088             endpos= strend;
1089
1090         if (checked_upto < s)
1091            checked_upto = s;
1092         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1094
1095         t = s;
1096         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1097         if (s) {
1098             checked_upto = s;
1099         } else {
1100 #ifdef DEBUGGING
1101             const char *what = NULL;
1102 #endif
1103             if (endpos == strend) {
1104                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Could not match STCLASS...\n") );
1106                 goto fail;
1107             }
1108             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                                    "This position contradicts STCLASS...\n") );
1110             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1111                 goto fail;
1112             checked_upto = HOPBACKc(endpos, start_shift);
1113             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1114                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1115             /* Contradict one of substrings */
1116             if (prog->anchored_substr || prog->anchored_utf8) {
1117                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1118                     DEBUG_EXECUTE_r( what = "anchored" );
1119                   hop_and_restart:
1120                     s = HOP3c(t, 1, strend);
1121                     if (s + start_shift + end_shift > strend) {
1122                         /* XXXX Should be taken into account earlier? */
1123                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1124                                                "Could not match STCLASS...\n") );
1125                         goto fail;
1126                     }
1127                     if (!check)
1128                         goto giveup;
1129                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1130                                 "Looking for %s substr starting at offset %ld...\n",
1131                                  what, (long)(s + start_shift - i_strpos)) );
1132                     goto restart;
1133                 }
1134                 /* Have both, check_string is floating */
1135                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1136                     goto retry_floating_check;
1137                 /* Recheck anchored substring, but not floating... */
1138                 s = check_at;
1139                 if (!check)
1140                     goto giveup;
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                           "Looking for anchored substr starting at offset %ld...\n",
1143                           (long)(other_last - i_strpos)) );
1144                 goto do_other_anchored;
1145             }
1146             /* Another way we could have checked stclass at the
1147                current position only: */
1148             if (ml_anch) {
1149                 s = t = t + 1;
1150                 if (!check)
1151                     goto giveup;
1152                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1153                           "Looking for /%s^%s/m starting at offset %ld...\n",
1154                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1155                 goto try_at_offset;
1156             }
1157             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1158                 goto fail;
1159             /* Check is floating substring. */
1160           retry_floating_check:
1161             t = check_at - start_shift;
1162             DEBUG_EXECUTE_r( what = "floating" );
1163             goto hop_and_restart;
1164         }
1165         if (t != s) {
1166             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1167                         "By STCLASS: moving %ld --> %ld\n",
1168                                   (long)(t - i_strpos), (long)(s - i_strpos))
1169                    );
1170         }
1171         else {
1172             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1173                                   "Does not contradict STCLASS...\n");
1174                    );
1175         }
1176     }
1177   giveup:
1178     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1179                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1180                           PL_colors[5], (long)(s - i_strpos)) );
1181     return s;
1182
1183   fail_finish:                          /* Substring not found */
1184     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1185         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1186   fail:
1187     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1188                           PL_colors[4], PL_colors[5]));
1189     return NULL;
1190 }
1191
/* DECL_TRIE_TYPE(scan): declare a const local named trie_type in the
 * expanding scope, whose anonymous enum type has the enumerators
 * trie_plain, trie_utf8, trie_utf8_fold and trie_latin_utf8_fold.
 *
 * The initial value is chosen from scan->flags and from a variable named
 * utf8_target that must be visible where the macro is expanded:
 *   scan->flags == EXACT :  utf8_target ? trie_utf8      : trie_plain
 *   otherwise            :  utf8_target ? trie_utf8_fold : trie_latin_utf8_fold
 *
 * Because this expands to a declaration without a trailing semicolon, the
 * user supplies the ';'.  Note that `scan' is not parenthesised in the
 * expansion, so it should be a simple pointer expression.
 */
1192 #define DECL_TRIE_TYPE(scan) \
1193     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1194                     trie_type = ((scan->flags == EXACT) \
1195                               ? (utf8_target ? trie_utf8 : trie_plain) \
1196                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1197
/* REXEC_TRIE_READ_CHAR: fetch the next character to feed to a trie and
 * translate it to the trie's character id.
 *
 * Outputs (all assigned by the macro, so they must be lvalues):
 *   uvc     - the character value that was read
 *   len     - number of bytes consumed from uc (0 when the character came
 *             from the pending fold buffer instead of from uc)
 *   charid  - trie->charmap[uvc] when uvc < 256; otherwise 0, unless
 *             widecharmap is non-NULL and holds an entry keyed by the raw
 *             bytes of uvc (sizeof(UV) bytes), in which case it is
 *             (U16)SvIV of that entry
 *   foldlen, uscan - state of the fold buffer, see below
 *
 * Behaviour per trie_type:
 *   trie_utf8_fold / trie_latin_utf8_fold
 *       If foldlen > 0 there is still folded text pending: decode one
 *       character from uscan with utf8n_to_uvuni(), subtract its length
 *       from foldlen, advance uscan past it, and set len to 0.
 *       Otherwise fold the character at uc into foldbuf (to_utf8_fold()
 *       with len = UTF8SKIP(uc) for trie_utf8_fold; _to_fold_latin1() on
 *       the single byte *uc with len = 1 for trie_latin_utf8_fold), then
 *       reduce foldlen by UNISKIP(uvc) and point uscan that many bytes
 *       into foldbuf.  NOTE(review): this presumably steps over the first
 *       folded character, which is the one returned in uvc -- confirm
 *       against the fold functions' documentation.
 *   trie_utf8
 *       uvc is decoded from uc with utf8n_to_uvuni(), which also sets len.
 *   trie_plain
 *       uvc is the byte *uc and len is 1.
 *
 * The switch has no default case, and the final case (trie_plain) simply
 * runs off the end of the switch.  A local STRLEN skiplen is declared
 * inside the STMT_START/STMT_END block.  Several arguments are expanded
 * more than once, so they should be free of side effects.
 */
1198 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1199 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1200     STRLEN skiplen;                                                                 \
1201     switch (trie_type) {                                                            \
1202     case trie_utf8_fold:                                                            \
1203         if ( foldlen>0 ) {                                                          \
1204             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1205             foldlen -= len;                                                         \
1206             uscan += len;                                                           \
1207             len=0;                                                                  \
1208         } else {                                                                    \
1209             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1210             len = UTF8SKIP(uc);                                                     \
1211             skiplen = UNISKIP( uvc );                                               \
1212             foldlen -= skiplen;                                                     \
1213             uscan = foldbuf + skiplen;                                              \
1214         }                                                                           \
1215         break;                                                                      \
1216     case trie_latin_utf8_fold:                                                      \
1217         if ( foldlen>0 ) {                                                          \
1218             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1219             foldlen -= len;                                                         \
1220             uscan += len;                                                           \
1221             len=0;                                                                  \
1222         } else {                                                                    \
1223             len = 1;                                                                \
1224             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1225             skiplen = UNISKIP( uvc );                                               \
1226             foldlen -= skiplen;                                                     \
1227             uscan = foldbuf + skiplen;                                              \
1228         }                                                                           \
1229         break;                                                                      \
1230     case trie_utf8:                                                                 \
1231         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1232         break;                                                                      \
1233     case trie_plain:                                                                \
1234         uvc = (UV)*uc;                                                              \
1235         len = 1;                                                                    \
1236     }                                                                               \
1237     if (uvc < 256) {                                                                \
1238         charid = trie->charmap[ uvc ];                                              \
1239     }                                                                               \
1240     else {                                                                          \
1241         charid = 0;                                                                 \
1242         if (widecharmap) {                                                          \
1243             SV** const svpp = hv_fetch(widecharmap,                                 \
1244                         (char*)&uvc, sizeof(UV), 0);                                \
1245             if (svpp)                                                               \
1246                 charid = (U16)SvIV(*svpp);                                          \
1247         }                                                                           \
1248     }                                                                               \
1249 } STMT_END
1250
/* REXEC_FBC_EXACTISH_SCAN(CoNd)
 *
 * Step 's' forward one byte at a time while s <= e.  At each position jump
 * to the label 'got_it' when all of the following hold, tested in this order:
 *   - CoNd, the caller-supplied test on the current position, is true;
 *   - 'ln' is 1, or folder(s, pat_string, ln) returns true;
 *   - 'reginfo' is NULL, or regtry(reginfo, &s) returns true.
 * If no position qualifies the loop ends with s > e.
 *
 * The macro expands in the caller's scope: 's', 'e', 'ln', 'folder',
 * 'pat_string', 'reginfo' and the label 'got_it' must be visible there. */
1251 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1252 STMT_START {                                              \
1253     while (s <= e) {                                      \
1254         if ( (CoNd)                                       \
1255              && (ln == 1 || folder(s, pat_string, ln))    \
1256              && (!reginfo || regtry(reginfo, &s)) )       \
1257             goto got_it;                                  \
1258         s++;                                              \
1259     }                                                     \
1260 } STMT_END
1261
/* REXEC_FBC_UTF8_SCAN(CoDe)
 *
 * Run CoDe once per position, advancing 's' by UTF8SKIP(s) each time.  The
 * skip for the current position is stored in 'uskip' by the loop condition,
 * and the loop only continues while s + uskip <= strend, so CoDe is not run
 * for a position whose reported length would extend past 'strend'.
 *
 * CoDe is pasted verbatim as the loop body ahead of the 's += uskip' step,
 * so it must be one or more complete statements.  Uses 's', 'uskip' and
 * 'strend' from the caller's scope. */
1262 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1263 STMT_START {                                          \
1264     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1265         CoDe                                          \
1266         s += uskip;                                   \
1267     }                                                 \
1268 } STMT_END
1269
/* REXEC_FBC_SCAN(CoDe)
 *
 * Byte-wise counterpart of REXEC_FBC_UTF8_SCAN: run CoDe once for every
 * position while s < strend, incrementing 's' by one after each run.
 *
 * CoDe is pasted verbatim as the loop body ahead of the 's++' step, so it
 * must be one or more complete statements.  Uses 's' and 'strend' from the
 * caller's scope. */
1270 #define REXEC_FBC_SCAN(CoDe)                          \
1271 STMT_START {                                          \
1272     while (s < strend) {                              \
1273         CoDe                                          \
1274         s++;                                          \
1275     }                                                 \
1276 } STMT_END
1277
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd)
 *
 * Scan with REXEC_FBC_UTF8_SCAN, evaluating CoNd at every position:
 *   - CoNd true:  if 'tmp' is non-zero and (reginfo is NULL or
 *                 regtry(reginfo, &s) succeeds), jump to 'got_it';
 *                 otherwise set tmp = doevery.
 *   - CoNd false: set tmp = 1.
 * The effect is that when 'doevery' is 0, after one position in a run of
 * consecutive CoNd-true positions has been tried (or skipped), the rest of
 * that run is not tried; a CoNd-false position re-arms 'tmp'.
 *
 * Uses 'tmp', 'doevery', 'reginfo', 's' and the label 'got_it' from the
 * caller's scope, plus whatever REXEC_FBC_UTF8_SCAN needs. */
1278 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1279 REXEC_FBC_UTF8_SCAN(                                  \
1280     if (CoNd) {                                       \
1281         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1282             goto got_it;                              \
1283         else                                          \
1284             tmp = doevery;                            \
1285     }                                                 \
1286     else                                              \
1287         tmp = 1;                                      \
1288 )
1289
/* REXEC_FBC_CLASS_SCAN(CoNd)
 *
 * Byte-wise version of REXEC_FBC_UTF8_CLASS_SCAN, built on REXEC_FBC_SCAN.
 * At every byte position:
 *   - CoNd true:  if 'tmp' is non-zero and (reginfo is NULL or
 *                 regtry(reginfo, &s) succeeds), jump to 'got_it';
 *                 otherwise set tmp = doevery.
 *   - CoNd false: set tmp = 1.
 *
 * Uses 'tmp', 'doevery', 'reginfo', 's', 'strend' and the label 'got_it'
 * from the caller's scope. */
1290 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1291 REXEC_FBC_SCAN(                                       \
1292     if (CoNd) {                                       \
1293         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1294             goto got_it;                              \
1295         else                                          \
1296             tmp = doevery;                            \
1297     }                                                 \
1298     else                                              \
1299         tmp = 1;                                      \
1300 )
1301
/* REXEC_FBC_TRYIT
 *
 * Attempt a match at the current 's': jump to 'got_it' if reginfo is NULL
 * or regtry(reginfo, &s) succeeds.
 *
 * This expands to a bare 'if' with no trailing semicolon and no
 * STMT_START/STMT_END wrapper; the user supplies the ';'.  Because it is an
 * unbraced 'if', an 'else' written directly after the expansion would bind
 * to it, so use it only as a complete statement inside braces. */
1302 #define REXEC_FBC_TRYIT               \
1303 if ((!reginfo || regtry(reginfo, &s))) \
1304     goto got_it
1305
/* REXEC_FBC_CSCAN(CoNdUtF8, CoNd)
 *
 * Class scan that picks its variant from 'utf8_target': when it is true,
 * run REXEC_FBC_UTF8_CLASS_SCAN with CoNdUtF8; otherwise run
 * REXEC_FBC_CLASS_SCAN with CoNd.
 *
 * Expands to a bare if/else (not wrapped in STMT_START/STMT_END), so a ';'
 * written after the invocation is an extra empty statement. */
1306 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1307     if (utf8_target) {                                             \
1308         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1309     }                                                          \
1310     else {                                                     \
1311         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1312     }
1313
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd, CoNdUtF8, CoNd)
 *
 * Same as REXEC_FBC_CSCAN, except that in the utf8_target branch the
 * statement UtFpReLoAd is executed once before the scan starts.  The
 * non-UTF-8 branch does not execute UtFpReLoAd.
 *
 * Expands to a bare if/else (not wrapped in STMT_START/STMT_END), so a ';'
 * written after the invocation is an extra empty statement. */
1314 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1315     if (utf8_target) {                                             \
1316         UtFpReLoAd;                                            \
1317         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1318     }                                                          \
1319     else {                                                     \
1320         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1321     }
1322
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8, CoNd)
 *
 * Same as REXEC_FBC_CSCAN, but first sets the RF_tainted bit in
 * PL_reg_flags.  The flag is set unconditionally, before either scan
 * variant runs.
 *
 * Expands to two statements (the flag update, then a bare if/else) with no
 * STMT_START/STMT_END wrapper, so it must not be used as the unbraced body
 * of an 'if' or loop. */
1323 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1324     PL_reg_flags |= RF_tainted;                                \
1325     if (utf8_target) {                                             \
1326         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1327     }                                                          \
1328     else {                                                     \
1329         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1330     }
1331
/* DUMP_EXEC_POS(li, s, doutf8)
 *
 * Shorthand for dump_exec_pos() that fills in the three middle arguments
 * from the globals PL_regeol, PL_bostr and PL_reg_starttry; 'li', 's' and
 * 'doutf8' are passed through unchanged. */
1332 #define DUMP_EXEC_POS(li,s,doutf8) \
1333     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1334
1335
/* UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)
 *
 * UTF-8 branch of the boundary scan for tests that are applied to single
 * bytes only (it calls nothing but TEST_NON_UTF8).
 *
 * 'tmp' is seeded with TEST_NON_UTF8 of the byte just before 's', or of
 * '\n' when 's' is at PL_bostr.  The string is then walked with
 * REXEC_FBC_UTF8_SCAN; at each position the first byte is tested and, when
 * the result differs from 'tmp', 'tmp' is flipped and IF_SUCCESS runs,
 * otherwise IF_FAIL runs.
 *
 * NOTE(review): 'tmp == ! TEST(...)' acts as "the two results differ" only
 * if TEST_NON_UTF8 yields exactly 0 or 1 -- confirm for every macro passed
 * in. */
1336 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1337         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1338         tmp = TEST_NON_UTF8(tmp);                                              \
1339         REXEC_FBC_UTF8_SCAN(                                                   \
1340             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1341                 tmp = !tmp;                                                    \
1342                 IF_SUCCESS;                                                    \
1343             }                                                                  \
1344             else {                                                             \
1345                 IF_FAIL;                                                       \
1346             }                                                                  \
1347         );                                                                     \
1348
/* UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL)
 *
 * UTF-8 branch of the boundary scan for tests that work on whole
 * characters.
 *
 * 'tmp' first receives the previous character: '\n' when 's' is at
 * PL_bostr, otherwise the value decoded by utf8n_to_uvchr() from the
 * position reghop3() finds one character back.  It is then overwritten
 * with TeSt1_UtF8, an expression the caller writes in terms of 'tmp'.
 * LOAD_UTF8_CHARCLASS_ALNUM() is invoked before scanning.
 *
 * The string is walked with REXEC_FBC_UTF8_SCAN; TeSt2_UtF8 is an
 * expression the caller writes in terms of 's'.  When its result differs
 * from 'tmp', 'tmp' is flipped and IF_SUCCESS runs, otherwise IF_FAIL runs.
 *
 * NOTE(review): 'tmp == ! (TeSt2_UtF8)' acts as "the two results differ"
 * only if TeSt1_UtF8 yields exactly 0 or 1 -- confirm for every expression
 * passed in. */
1349 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1350         if (s == PL_bostr) {                                                   \
1351             tmp = '\n';                                                        \
1352         }                                                                      \
1353         else {                                                                 \
1354             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1355             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1356         }                                                                      \
1357         tmp = TeSt1_UtF8;                                                      \
1358         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1359         REXEC_FBC_UTF8_SCAN(                                                   \
1360             if (tmp == ! (TeSt2_UtF8)) { \
1361                 tmp = !tmp;                                                    \
1362                 IF_SUCCESS;                                                    \
1363             }                                                                  \
1364             else {                                                             \
1365                 IF_FAIL;                                                       \
1366             }                                                                  \
1367         );                                                                     \
1368
1369 /* The only difference between the BOUND and NBOUND cases is that
1370  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1371  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1372  * with the other one being empty */
/* All four wrappers share one signature so call sites look alike:
 *   TEST_NON_UTF8  name of a one-argument test macro, used for bytes;
 *   TEST1_UTF8     expression testing the previous character (via 'tmp');
 *   TEST2_UTF8     expression testing the current character (via 's').
 * The *_NOLOAD variants accept TEST1_UTF8 and TEST2_UTF8 but do not use
 * them: their UTF-8 branch is UTF8_NOLOAD, which applies TEST_NON_UTF8
 * only.  Each action (REXEC_FBC_TRYIT or PLACEHOLDER) is passed twice, once
 * inside the UTF8_LOAD/UTF8_NOLOAD argument for the UTF-8 branch and once
 * directly to FBC_BOUND_COMMON for the non-UTF-8 branch. */
1373 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1374     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1375
1376 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1377     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1378
1379 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1381
1382 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1383     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1384
1385
1386 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1387  * be passed in completely with the variable name being tested, which isn't
1388  * such a clean interface, but this is easier to read than it was before.  We
1389  * are looking for the boundary (or non-boundary) between a word and non-word
1390  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1391  * must be different.  Find the "wordness" of the character just prior to this
1392  * one, and compare it with the wordness of this one.  If they differ, we have
1393  * a boundary.  At the beginning of the string, pretend that the previous
1394  * character was a new-line */
/* Parameters:
 *   UTF8_CODE      statements for the utf8_target branch, already carrying
 *                  their own success/fail actions;
 *   TEST_NON_UTF8  name of a one-argument test macro for the byte branch;
 *   IF_SUCCESS     statement run in the byte branch when the wordness of
 *                  the current byte differs from the previous one;
 *   IF_FAIL        statement run in the byte branch when it does not.
 * After either scan has run to completion, one more match is attempted at
 * the final 's' when prog->minlen is 0 and 'tmp' is true.
 *
 * NOTE(review): that final check does not use IF_SUCCESS/IF_FAIL, so it is
 * identical for the BOUND and NBOUND wrappers -- confirm this is intended
 * for the NBOUND case.
 *
 * Expands to several statements with no STMT_START/STMT_END wrapper, and
 * already ends in ';'. */
1395 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1396     if (utf8_target) {                                                         \
1397                 UTF8_CODE \
1398     }                                                                          \
1399     else {  /* Not utf8 */                                                     \
1400         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1401         tmp = TEST_NON_UTF8(tmp);                                              \
1402         REXEC_FBC_SCAN(                                                        \
1403             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1404                 tmp = !tmp;                                                    \
1405                 IF_SUCCESS;                                                    \
1406             }                                                                  \
1407             else {                                                             \
1408                 IF_FAIL;                                                       \
1409             }                                                                  \
1410         );                                                                     \
1411     }                                                                          \
1412     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1413         goto got_it;
1414
1415 /* We know what class REx starts with.  Try to find this position... */
1416 /* if reginfo is NULL, it's a dry run */
1417 /* annoyingly all the vars in this routine have different names from their counterparts
1418    in regmatch. /grrr */
1419
1420 STATIC char *
1421 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1422     const char *strend, regmatch_info *reginfo)
1423 {
1424         dVAR;
1425         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1426         char *pat_string;   /* The pattern's exactish string */
1427         char *pat_end;      /* ptr to end char of pat_string */
1428         re_fold_t folder;       /* Function for computing non-utf8 folds */
1429         const U8 *fold_array;   /* array for folding ords < 256 */
1430         STRLEN ln;
1431         STRLEN lnc;
1432         register STRLEN uskip;
1433         U8 c1;
1434         U8 c2;
1435         char *e;
1436         register I32 tmp = 1;   /* Scratch variable? */
1437         register const bool utf8_target = PL_reg_match_utf8;
1438         UV utf8_fold_flags = 0;
1439         RXi_GET_DECL(prog,progi);
1440
1441         PERL_ARGS_ASSERT_FIND_BYCLASS;
1442
1443         /* We know what class it must start with. */
1444         switch (OP(c)) {
1445         case ANYOFV:
1446         case ANYOF:
1447             if (utf8_target || OP(c) == ANYOFV) {
1448                 STRLEN inclasslen = strend - s;
1449                 REXEC_FBC_UTF8_CLASS_SCAN(
1450                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1451             }
1452             else {
1453                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1454             }
1455             break;
1456         case CANY:
1457             REXEC_FBC_SCAN(
1458                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1459                     goto got_it;
1460                 else
1461                     tmp = doevery;
1462             );
1463             break;
1464
1465         case EXACTFA:
1466             if (UTF_PATTERN || utf8_target) {
1467                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1468                 goto do_exactf_utf8;
1469             }
1470             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1471             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1472             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1473
1474         case EXACTF:
1475             if (utf8_target) {
1476
1477                 /* regcomp.c already folded this if pattern is in UTF-8 */
1478                 utf8_fold_flags = 0;
1479                 goto do_exactf_utf8;
1480             }
1481             fold_array = PL_fold;
1482             folder = foldEQ;
1483             goto do_exactf_non_utf8;
1484
1485         case EXACTFL:
1486             if (UTF_PATTERN || utf8_target) {
1487                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1488                 goto do_exactf_utf8;
1489             }
1490             fold_array = PL_fold_locale;
1491             folder = foldEQ_locale;
1492             goto do_exactf_non_utf8;
1493
1494         case EXACTFU_SS:
1495             if (UTF_PATTERN) {
1496                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1497             }
1498             goto do_exactf_utf8;
1499
1500         case EXACTFU_TRICKYFOLD:
1501         case EXACTFU:
1502             if (UTF_PATTERN || utf8_target) {
1503                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1504                 goto do_exactf_utf8;
1505             }
1506
1507             /* Any 'ss' in the pattern should have been replaced by regcomp,
1508              * so we don't have to worry here about this single special case
1509              * in the Latin1 range */
1510             fold_array = PL_fold_latin1;
1511             folder = foldEQ_latin1;
1512
1513             /* FALL THROUGH */
1514
1515         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1516                                are no glitches with fold-length differences
1517                                between the target string and pattern */
1518
1519             /* The idea in the non-utf8 EXACTF* cases is to first find the
1520              * first character of the EXACTF* node and then, if necessary,
1521              * case-insensitively compare the full text of the node.  c1 is the
1522              * first character.  c2 is its fold.  This logic will not work for
1523              * Unicode semantics and the german sharp ss, which hence should
1524              * not be compiled into a node that gets here. */
1525             pat_string = STRING(c);
1526             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1527
1528             /* We know that we have to match at least 'ln' bytes (which is the
1529              * same as characters, since not utf8).  If we have to match 3
1530              * characters, and there are only 2 availabe, we know without
1531              * trying that it will fail; so don't start a match past the
1532              * required minimum number from the far end */
1533             e = HOP3c(strend, -((I32)ln), s);
1534
1535             if (!reginfo && e < s) {
1536                 e = s;                  /* Due to minlen logic of intuit() */
1537             }
1538
1539             c1 = *pat_string;
1540             c2 = fold_array[c1];
1541             if (c1 == c2) { /* If char and fold are the same */
1542                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1543             }
1544             else {
1545                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1546             }
1547             break;
1548
1549         do_exactf_utf8:
1550         {
1551             unsigned expansion;
1552
1553
1554             /* If one of the operands is in utf8, we can't use the simpler
1555              * folding above, due to the fact that many different characters
1556              * can have the same fold, or portion of a fold, or different-
1557              * length fold */
1558             pat_string = STRING(c);
1559             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1560             pat_end = pat_string + ln;
1561             lnc = (UTF_PATTERN) /* length to match in characters */
1562                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1563                     : ln;
1564
1565             /* We have 'lnc' characters to match in the pattern, but because of
1566              * multi-character folding, each character in the target can match
1567              * up to 3 characters (Unicode guarantees it will never exceed
1568              * this) if it is utf8-encoded; and up to 2 if not (based on the
1569              * fact that the Latin 1 folds are already determined, and the
1570              * only multi-char fold in that range is the sharp-s folding to
1571              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1572              * string character.  Adjust lnc accordingly, rounding up, so that
1573              * if we need to match at least 4+1/3 chars, that really is 5. */
1574             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1575             lnc = (lnc + expansion - 1) / expansion;
1576
1577             /* As in the non-UTF8 case, if we have to match 3 characters, and
1578              * only 2 are left, it's guaranteed to fail, so don't start a
1579              * match that would require us to go beyond the end of the string
1580              */
1581             e = HOP3c(strend, -((I32)lnc), s);
1582
1583             if (!reginfo && e < s) {
1584                 e = s;                  /* Due to minlen logic of intuit() */
1585             }
1586
1587             /* XXX Note that we could recalculate e to stop the loop earlier,
1588              * as the worst case expansion above will rarely be met, and as we
1589              * go along we would usually find that e moves further to the left.
1590              * This would happen only after we reached the point in the loop
1591              * where if there were no expansion we should fail.  Unclear if
1592              * worth the expense */
1593
1594             while (s <= e) {
1595                 char *my_strend= (char *)strend;
1596                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1597                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1598                     && (!reginfo || regtry(reginfo, &s)) )
1599                 {
1600                     goto got_it;
1601                 }
1602                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1603             }
1604             break;
1605         }
1606         case BOUNDL:
1607             PL_reg_flags |= RF_tainted;
1608             FBC_BOUND(isALNUM_LC,
1609                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1610                       isALNUM_LC_utf8((U8*)s));
1611             break;
1612         case NBOUNDL:
1613             PL_reg_flags |= RF_tainted;
1614             FBC_NBOUND(isALNUM_LC,
1615                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1616                        isALNUM_LC_utf8((U8*)s));
1617             break;
1618         case BOUND:
1619             FBC_BOUND(isWORDCHAR,
1620                       isALNUM_uni(tmp),
1621                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1622             break;
1623         case BOUNDA:
1624             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1625                              isWORDCHAR_A(tmp),
1626                              isWORDCHAR_A((U8*)s));
1627             break;
1628         case NBOUND:
1629             FBC_NBOUND(isWORDCHAR,
1630                        isALNUM_uni(tmp),
1631                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1632             break;
1633         case NBOUNDA:
1634             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1635                               isWORDCHAR_A(tmp),
1636                               isWORDCHAR_A((U8*)s));
1637             break;
1638         case BOUNDU:
1639             FBC_BOUND(isWORDCHAR_L1,
1640                       isALNUM_uni(tmp),
1641                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1642             break;
1643         case NBOUNDU:
1644             FBC_NBOUND(isWORDCHAR_L1,
1645                        isALNUM_uni(tmp),
1646                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647             break;
1648         case ALNUML:
1649             REXEC_FBC_CSCAN_TAINT(
1650                 isALNUM_LC_utf8((U8*)s),
1651                 isALNUM_LC(*s)
1652             );
1653             break;
1654         case ALNUMU:
1655             REXEC_FBC_CSCAN_PRELOAD(
1656                 LOAD_UTF8_CHARCLASS_ALNUM(),
1657                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1658                 isWORDCHAR_L1((U8) *s)
1659             );
1660             break;
1661         case ALNUM:
1662             REXEC_FBC_CSCAN_PRELOAD(
1663                 LOAD_UTF8_CHARCLASS_ALNUM(),
1664                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1665                 isWORDCHAR((U8) *s)
1666             );
1667             break;
1668         case ALNUMA:
1669             /* Don't need to worry about utf8, as it can match only a single
1670              * byte invariant character */
1671             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1672             break;
1673         case NALNUMU:
1674             REXEC_FBC_CSCAN_PRELOAD(
1675                 LOAD_UTF8_CHARCLASS_ALNUM(),
1676                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1677                 ! isWORDCHAR_L1((U8) *s)
1678             );
1679             break;
1680         case NALNUM:
1681             REXEC_FBC_CSCAN_PRELOAD(
1682                 LOAD_UTF8_CHARCLASS_ALNUM(),
1683                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1684                 ! isALNUM(*s)
1685             );
1686             break;
1687         case NALNUMA:
1688             REXEC_FBC_CSCAN(
1689                 !isWORDCHAR_A(*s),
1690                 !isWORDCHAR_A(*s)
1691             );
1692             break;
1693         case NALNUML:
1694             REXEC_FBC_CSCAN_TAINT(
1695                 !isALNUM_LC_utf8((U8*)s),
1696                 !isALNUM_LC(*s)
1697             );
1698             break;
1699         case SPACEU:
1700             REXEC_FBC_CSCAN_PRELOAD(
1701                 LOAD_UTF8_CHARCLASS_SPACE(),
1702                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1703                 isSPACE_L1((U8) *s)
1704             );
1705             break;
1706         case SPACE:
1707             REXEC_FBC_CSCAN_PRELOAD(
1708                 LOAD_UTF8_CHARCLASS_SPACE(),
1709                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1710                 isSPACE((U8) *s)
1711             );
1712             break;
1713         case SPACEA:
1714             /* Don't need to worry about utf8, as it can match only a single
1715              * byte invariant character */
1716             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1717             break;
1718         case SPACEL:
1719             REXEC_FBC_CSCAN_TAINT(
1720                 isSPACE_LC_utf8((U8*)s),
1721                 isSPACE_LC(*s)
1722             );
1723             break;
1724         case NSPACEU:
1725             REXEC_FBC_CSCAN_PRELOAD(
1726                 LOAD_UTF8_CHARCLASS_SPACE(),
1727                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1728                 ! isSPACE_L1((U8) *s)
1729             );
1730             break;
1731         case NSPACE:
1732             REXEC_FBC_CSCAN_PRELOAD(
1733                 LOAD_UTF8_CHARCLASS_SPACE(),
1734                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1735                 ! isSPACE((U8) *s)
1736             );
1737             break;
1738         case NSPACEA:
1739             REXEC_FBC_CSCAN(
1740                 !isSPACE_A(*s),
1741                 !isSPACE_A(*s)
1742             );
1743             break;
1744         case NSPACEL:
1745             REXEC_FBC_CSCAN_TAINT(
1746                 !isSPACE_LC_utf8((U8*)s),
1747                 !isSPACE_LC(*s)
1748             );
1749             break;
1750         case DIGIT:
1751             REXEC_FBC_CSCAN_PRELOAD(
1752                 LOAD_UTF8_CHARCLASS_DIGIT(),
1753                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1754                 isDIGIT(*s)
1755             );
1756             break;
1757         case DIGITA:
1758             /* Don't need to worry about utf8, as it can match only a single
1759              * byte invariant character */
1760             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1761             break;
1762         case DIGITL:
1763             REXEC_FBC_CSCAN_TAINT(
1764                 isDIGIT_LC_utf8((U8*)s),
1765                 isDIGIT_LC(*s)
1766             );
1767             break;
1768         case NDIGIT:
1769             REXEC_FBC_CSCAN_PRELOAD(
1770                 LOAD_UTF8_CHARCLASS_DIGIT(),
1771                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1772                 !isDIGIT(*s)
1773             );
1774             break;
1775         case NDIGITA:
1776             REXEC_FBC_CSCAN(
1777                 !isDIGIT_A(*s),
1778                 !isDIGIT_A(*s)
1779             );
1780             break;
1781         case NDIGITL:
1782             REXEC_FBC_CSCAN_TAINT(
1783                 !isDIGIT_LC_utf8((U8*)s),
1784                 !isDIGIT_LC(*s)
1785             );
1786             break;
1787         case LNBREAK:
1788             REXEC_FBC_CSCAN(
1789                 is_LNBREAK_utf8(s),
1790                 is_LNBREAK_latin1(s)
1791             );
1792             break;
1793         case VERTWS:
1794             REXEC_FBC_CSCAN(
1795                 is_VERTWS_utf8(s),
1796                 is_VERTWS_latin1(s)
1797             );
1798             break;
1799         case NVERTWS:
1800             REXEC_FBC_CSCAN(
1801                 !is_VERTWS_utf8(s),
1802                 !is_VERTWS_latin1(s)
1803             );
1804             break;
1805         case HORIZWS:
1806             REXEC_FBC_CSCAN(
1807                 is_HORIZWS_utf8(s),
1808                 is_HORIZWS_latin1(s)
1809             );
1810             break;
1811         case NHORIZWS:
1812             REXEC_FBC_CSCAN(
1813                 !is_HORIZWS_utf8(s),
1814                 !is_HORIZWS_latin1(s)
1815             );
1816             break;
1817         case POSIXA:
1818             /* Don't need to worry about utf8, as it can match only a single
1819             * byte invariant character.  The flag in this node type is the
1820             * class number to pass to _generic_isCC() to build a mask for
1821             * searching in PL_charclass[] */
1822             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1823             break;
1824         case NPOSIXA:
1825             REXEC_FBC_CSCAN(
1826                 !_generic_isCC_A(*s, FLAGS(c)),
1827                 !_generic_isCC_A(*s, FLAGS(c))
1828             );
1829             break;
1830
1831         case AHOCORASICKC:
1832         case AHOCORASICK:
1833             {
1834                 DECL_TRIE_TYPE(c);
1835                 /* what trie are we using right now */
1836                 reg_ac_data *aho
1837                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1838                 reg_trie_data *trie
1839                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1840                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1841
1842                 const char *last_start = strend - trie->minlen;
1843 #ifdef DEBUGGING
1844                 const char *real_start = s;
1845 #endif
1846                 STRLEN maxlen = trie->maxlen;
1847                 SV *sv_points;
1848                 U8 **points; /* map of where we were in the input string
1849                                 when reading a given char. For ASCII this
1850                                 is unnecessary overhead as the relationship
1851                                 is always 1:1, but for Unicode, especially
1852                                 case folded Unicode this is not true. */
1853                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1854                 U8 *bitmap=NULL;
1855
1856
1857                 GET_RE_DEBUG_FLAGS_DECL;
1858
1859                 /* We can't just allocate points here. We need to wrap it in
1860                  * an SV so it gets freed properly if there is a croak while
1861                  * running the match */
1862                 ENTER;
1863                 SAVETMPS;
1864                 sv_points=newSV(maxlen * sizeof(U8 *));
1865                 SvCUR_set(sv_points,
1866                     maxlen * sizeof(U8 *));
1867                 SvPOK_on(sv_points);
1868                 sv_2mortal(sv_points);
1869                 points=(U8**)SvPV_nolen(sv_points );
1870                 if ( trie_type != trie_utf8_fold
1871                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1872                 {
1873                     if (trie->bitmap)
1874                         bitmap=(U8*)trie->bitmap;
1875                     else
1876                         bitmap=(U8*)ANYOF_BITMAP(c);
1877                 }
1878                 /* this is the Aho-Corasick algorithm modified a touch
1879                    to include special handling for long "unknown char"
1880                    sequences. The basic idea being that we use AC as long
1881                    as we are dealing with a possible matching char, when
1882                    we encounter an unknown char (and we have not encountered
1883                    an accepting state) we scan forward until we find a legal
1884                    starting char.
1885                    AC matching is basically that of trie matching, except
1886                    that when we encounter a failing transition, we fall back
1887                    to the current states "fail state", and try the current char
1888                    again, a process we repeat until we reach the root state,
1889                    state 1, or a legal transition. If we fail on the root state
1890                    then we can either terminate if we have reached an accepting
1891                    state previously, or restart the entire process from the beginning
1892                    if we have not.
1893
1894                  */
1895                 while (s <= last_start) {
1896                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1897                     U8 *uc = (U8*)s;
1898                     U16 charid = 0;
1899                     U32 base = 1;
1900                     U32 state = 1;
1901                     UV uvc = 0;
1902                     STRLEN len = 0;
1903                     STRLEN foldlen = 0;
1904                     U8 *uscan = (U8*)NULL;
1905                     U8 *leftmost = NULL;
1906 #ifdef DEBUGGING
1907                     U32 accepted_word= 0;
1908 #endif
1909                     U32 pointpos = 0;
1910
1911                     while ( state && uc <= (U8*)strend ) {
1912                         int failed=0;
1913                         U32 word = aho->states[ state ].wordnum;
1914
1915                         if( state==1 ) {
1916                             if ( bitmap ) {
1917                                 DEBUG_TRIE_EXECUTE_r(
1918                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1920                                             (char *)uc, utf8_target );
1921                                         PerlIO_printf( Perl_debug_log,
1922                                             " Scanning for legal start char...\n");
1923                                     }
1924                                 );
1925                                 if (utf8_target) {
1926                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1927                                         uc += UTF8SKIP(uc);
1928                                     }
1929                                 } else {
1930                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1931                                         uc++;
1932                                     }
1933                                 }
1934                                 s= (char *)uc;
1935                             }
1936                             if (uc >(U8*)last_start) break;
1937                         }
1938
1939                         if ( word ) {
1940                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1941                             if (!leftmost || lpos < leftmost) {
1942                                 DEBUG_r(accepted_word=word);
1943                                 leftmost= lpos;
1944                             }
1945                             if (base==0) break;
1946
1947                         }
1948                         points[pointpos++ % maxlen]= uc;
1949                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1950                                              uscan, len, uvc, charid, foldlen,
1951                                              foldbuf, uniflags);
1952                         DEBUG_TRIE_EXECUTE_r({
1953                             dump_exec_pos( (char *)uc, c, strend, real_start,
1954                                 s,   utf8_target );
1955                             PerlIO_printf(Perl_debug_log,
1956                                 " Charid:%3u CP:%4"UVxf" ",
1957                                  charid, uvc);
1958                         });
1959
1960                         do {
1961 #ifdef DEBUGGING
1962                             word = aho->states[ state ].wordnum;
1963 #endif
1964                             base = aho->states[ state ].trans.base;
1965
1966                             DEBUG_TRIE_EXECUTE_r({
1967                                 if (failed)
1968                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1969                                         s,   utf8_target );
1970                                 PerlIO_printf( Perl_debug_log,
1971                                     "%sState: %4"UVxf", word=%"UVxf,
1972                                     failed ? " Fail transition to " : "",
1973                                     (UV)state, (UV)word);
1974                             });
1975                             if ( base ) {
1976                                 U32 tmp;
1977                                 I32 offset;
1978                                 if (charid &&
1979                                      ( ((offset = base + charid
1980                                         - 1 - trie->uniquecharcount)) >= 0)
1981                                      && ((U32)offset < trie->lasttrans)
1982                                      && trie->trans[offset].check == state
1983                                      && (tmp=trie->trans[offset].next))
1984                                 {
1985                                     DEBUG_TRIE_EXECUTE_r(
1986                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1987                                     state = tmp;
1988                                     break;
1989                                 }
1990                                 else {
1991                                     DEBUG_TRIE_EXECUTE_r(
1992                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1993                                     failed = 1;
1994                                     state = aho->fail[state];
1995                                 }
1996                             }
1997                             else {
1998                                 /* we must be accepting here */
1999                                 DEBUG_TRIE_EXECUTE_r(
2000                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2001                                 failed = 1;
2002                                 break;
2003                             }
2004                         } while(state);
2005                         uc += len;
2006                         if (failed) {
2007                             if (leftmost)
2008                                 break;
2009                             if (!state) state = 1;
2010                         }
2011                     }
2012                     if ( aho->states[ state ].wordnum ) {
2013                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2014                         if (!leftmost || lpos < leftmost) {
2015                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2016                             leftmost = lpos;
2017                         }
2018                     }
2019                     if (leftmost) {
2020                         s = (char*)leftmost;
2021                         DEBUG_TRIE_EXECUTE_r({
2022                             PerlIO_printf(
2023                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2024                                 (UV)accepted_word, (IV)(s - real_start)
2025                             );
2026                         });
2027                         if (!reginfo || regtry(reginfo, &s)) {
2028                             FREETMPS;
2029                             LEAVE;
2030                             goto got_it;
2031                         }
2032                         s = HOPc(s,1);
2033                         DEBUG_TRIE_EXECUTE_r({
2034                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2035                         });
2036                     } else {
2037                         DEBUG_TRIE_EXECUTE_r(
2038                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2039                         break;
2040                     }
2041                 }
2042                 FREETMPS;
2043                 LEAVE;
2044             }
2045             break;
2046         default:
2047             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2048             break;
2049         }
2050         return 0;
2051       got_it:
2052         return s;
2053 }
2054
2055
2056 /*
2057  - regexec_flags - match a regexp against a string
      *
      * Tries to match the compiled pattern rx against the text between
      * stringarg and strend.  Depending on flags recorded in the compiled
      * program it picks one of several search strategies: a single anchored
      * attempt (optionally retried after newlines), a single GPOS-checked
      * attempt, a scan driven by a required substring, a scan driven by the
      * start class (find_byclass), or a try-every-position fallback.
      *
      * Returns 1 as soon as one attempt succeeds, 0 otherwise.  Croaks when
      * the regexp struct behind rx or stringarg is NULL, or when the compiled
      * program does not begin with REG_MAGIC.
      *
      * On success, unless REXEC_NOT_FIRST is set in flags, prog->subbeg and
      * prog->sublen are updated (to a private copy of the string when
      * REXEC_COPY_STR is set).  On failure, a capture-offset buffer that was
      * swapped out at entry is put back into prog->offs.
2058  */
2059 I32
2060 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2061               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
     /* rx: the compiled regexp to run */
     /* stringarg: position in the string at which matching starts */
2062 /* strend: pointer to null at end of string */
2063 /* strbeg: real beginning of string */
2064 /* minend: end of match must be >=minend after stringarg. */
     /* sv: the scalar being matched; consulted for UTF-8-ness, pos() magic
            and (under PERL_OLD_COPY_ON_WRITE) copy-on-write capture */
2065 /* data: May be used for some additional optimizations.
2066          Currently it is only used, with a UV cast, for transmitting
2067          the ganch offset when doing a /g match. This will change */
2068 /* flags: REXEC_* bits.  This function itself tests REXEC_IGNOREPOS,
              REXEC_CHECKED, REXEC_NOT_FIRST and REXEC_COPY_STR. */
2069 {
2070     dVAR;
2071     struct regexp *const prog = (struct regexp *)SvANY(rx);
2072     /*register*/ char *s;
2073     register regnode *c;
2074     /*register*/ char *startpos = stringarg;
2075     I32 minlen;         /* must match at least this many chars */
2076     I32 dontbother = 0; /* how many characters not to try at end */
2077     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2078     I32 scream_pos = -1;                /* Internal iterator of scream. */
2079     char *scream_olds = NULL;
2080     const bool utf8_target = cBOOL(DO_UTF8(sv));
2081     I32 multiline;
2082     RXi_GET_DECL(prog,progi);
2083     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2084     regexp_paren_pair *swap = NULL; /* previous prog->offs, if swapped out below */
2085     GET_RE_DEBUG_FLAGS_DECL;
2086
2087     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
         /* NOTE(review): data is marked unused here, yet it is read in the
          * GPOS handling further down ("else if (data)"). */
2088     PERL_UNUSED_ARG(data);
2089
2090     /* Be paranoid... */
2091     if (prog == NULL || startpos == NULL) {
2092         Perl_croak(aTHX_ "NULL regexp parameter");
2093         return 0;
2094     }
2095
2096     multiline = prog->extflags & RXf_PMf_MULTILINE;
2097     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2098
2099     RX_MATCH_UTF8_set(rx, utf8_target);
2100     DEBUG_EXECUTE_r(
2101         debug_start_match(rx, utf8_target, startpos, strend,
2102         "Matching");
2103     );
2104
2105     minlen = prog->minlen;
2106
         /* Cheap rejection: fewer bytes remain than the pattern's minimum
          * length (adjusted by check_offset_min when that is negative).
          * Note this jumps to phooey before re_state_eval_setup_done is
          * reset below. */
2107     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2108         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2109                               "String too short [regexec_flags]...\n"));
2110         goto phooey;
2111     }
2112
2113
2114     /* Check validity of program. */
2115     if (UCHARAT(progi->program) != REG_MAGIC) {
2116         Perl_croak(aTHX_ "corrupted regexp program");
2117     }
2118
         /* Reset the per-match global state. */
2119     PL_reg_flags = 0;
2120     PL_reg_state.re_state_eval_setup_done = FALSE;
2121     PL_reg_maxiter = 0;
2122
2123     if (RX_UTF8(rx))
2124         PL_reg_flags |= RF_utf8;
2125
2126     /* Mark beginning of line for ^ and lookbehind. */
2127     reginfo.bol = startpos; /* XXX not used ??? */
2128     PL_bostr  = strbeg;
2129     reginfo.sv = sv;
2130
2131     /* Mark end of line for $ (and such) */
2132     PL_regeol = strend;
2133
2134     /* see how far we have to get to not match where we matched before */
2135     reginfo.till = startpos+minend;
2136
2137     /* If there is a "must appear" string, look for it. */
2138     s = startpos;
2139
         /* Pick reginfo.ganch from, in order of preference: startpos + gofs
          * (REXEC_IGNOREPOS), the string's pos() magic, the data argument,
          * or else the start of the string. */
2140     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2141         MAGIC *mg;
2142         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2143             reginfo.ganch = startpos + prog->gofs;
2144             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2146         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2147                   && SvMAGIC(sv)
2148                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2149                   && mg->mg_len >= 0) {
2150             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2151             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2153
                 /* Pattern anchored at GPOS: the only candidate start is
                  * ganch - gofs; fail if we are already past ganch or if
                  * that start would lie before the string. */
2154             if (prog->extflags & RXf_ANCH_GPOS) {
2155                 if (s > reginfo.ganch)
2156                     goto phooey;
2157                 s = reginfo.ganch - prog->gofs;
2158                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2160                 if (s < strbeg)
2161                     goto phooey;
2162             }
2163         }
2164         else if (data) {
2165             reginfo.ganch = strbeg + PTR2UV(data);
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2168
2169         } else {                                /* pos() not defined */
2170             reginfo.ganch = strbeg;
2171             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2172                  "GPOS: reginfo.ganch = strbeg\n"));
2173         }
2174     }
2175     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2176         /* We have to be careful. If the previous successful match
2177            was from this regex we don't want a subsequent partially
2178            successful match to clobber the old results.
2179            So when we detect this possibility we add a swap buffer
2180            to the re, and switch the buffer each match. If we fail
2181            we switch it back, otherwise we leave it swapped.
2182         */
2183         swap = prog->offs;
2184         /* do we need a save destructor here for eval dies? */
2185         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2186         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2187             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2188             PTR2UV(prog),
2189             PTR2UV(swap),
2190             PTR2UV(prog->offs)
2191         ));
2192     }
         /* Unless the caller says the check was already done (REXEC_CHECKED),
          * and only when the program has a check substring, ask
          * re_intuit_start() for a starting position; a NULL answer means
          * the match cannot succeed. */
2193     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2194         re_scream_pos_data d;
2195
2196         d.scream_olds = &scream_olds;
2197         d.scream_pos = &scream_pos;
2198         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2199         if (!s) {
2200             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2201             goto phooey;        /* not present */
2202         }
2203     }
2204
2205
2206
2207     /* Simplest case:  anchored match need be tried only once. */
2208     /*  [unless only anchor is BOL and multiline is set] */
2209     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2210         if (s == startpos && regtry(&reginfo, &startpos))
2211             goto got_it;
2212         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2213                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2214         {
2215             char *end;
2216
2217             if (minlen)
2218                 dontbother = minlen - 1;
                 /* end is the last position worth trying; presumably HOP3c
                  * steps back dontbother characters without passing strbeg
                  * -- confirm against its definition. */
2219             end = HOP3c(strend, -dontbother, strbeg) - 1;
2220             /* for multiline we only have to try after newlines */
2221             if (prog->check_substr || prog->check_utf8) {
2222                 /* because of the goto we can not easily reuse the macros for bifurcating the
2223                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2224                 if (utf8_target) {
                         /* s == startpos was already tried (and failed) by
                          * the single-shot attempt above, so jump straight to
                          * the advance step inside the loop. */
2225                     if (s == startpos)
2226                         goto after_try_utf8;
2227                     while (1) {
2228                         if (regtry(&reginfo, &s)) {
2229                             goto got_it;
2230                         }
2231                       after_try_utf8:
2232                         if (s > end) {
2233                             goto phooey;
2234                         }
                             /* Advance by one character: either let
                              * re_intuit_start() pick the next candidate or
                              * simply step to the next position. */
2235                         if (prog->extflags & RXf_USE_INTUIT) {
2236                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2237                             if (!s) {
2238                                 goto phooey;
2239                             }
2240                         }
2241                         else {
2242                             s += UTF8SKIP(s);
2243                         }
2244                     }
2245                 } /* end search for check string in unicode */
2246                 else {
                         /* Same loop as above, stepping one byte at a time. */
2247                     if (s == startpos) {
2248                         goto after_try_latin;
2249                     }
2250                     while (1) {
2251                         if (regtry(&reginfo, &s)) {
2252                             goto got_it;
2253                         }
2254                       after_try_latin:
2255                         if (s > end) {
2256                             goto phooey;
2257                         }
2258                         if (prog->extflags & RXf_USE_INTUIT) {
2259                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2260                             if (!s) {
2261                                 goto phooey;
2262                             }
2263                         }
2264                         else {
2265                             s++;
2266                         }
2267                     }
2268                 } /* end search for check string in latin*/
2269             } /* end search for check string */
2270             else { /* search for newline */
2271                 if (s > startpos) {
2272                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2273                     s--;
2274                 }
2275                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2276                 while (s <= end) { /* note it could be possible to match at the end of the string */
2277                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2278                         if (regtry(&reginfo, &s))
2279                             goto got_it;
2280                     }
2281                 }
2282             } /* end search for newline */
2283         } /* end anchored/multiline check string search */
2284         goto phooey;
2285     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2286     {
2287         /* the warning about reginfo.ganch being used without initialization
2288            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2289            and we only enter this block when the same bit is set. */
2290         char *tmp_s = reginfo.ganch - prog->gofs;
2291
             /* Exactly one attempt, at ganch - gofs, provided that position
              * is inside the string. */
2292         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2293             goto got_it;
2294         goto phooey;
2295     }
2296
2297     /* Messy cases:  unanchored match. */
2298     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2299         /* we have /x+whatever/ */
2300         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2301         char ch;
2302 #ifdef DEBUGGING
2303         int did_match = 0;
2304 #endif
             /* Make sure the substring exists in the encoding of the target. */
2305         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2306             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2307         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2308
             /* Try at each occurrence of ch; after a failed try, skip the
              * rest of the run of ch before scanning on. */
2309         if (utf8_target) {
2310             REXEC_FBC_SCAN(
2311                 if (*s == ch) {
2312                     DEBUG_EXECUTE_r( did_match = 1 );
2313                     if (regtry(&reginfo, &s)) goto got_it;
2314                     s += UTF8SKIP(s);
2315                     while (s < strend && *s == ch)
2316                         s += UTF8SKIP(s);
2317                 }
2318             );
2319         }
2320         else {
2321             REXEC_FBC_SCAN(
2322                 if (*s == ch) {
2323                     DEBUG_EXECUTE_r( did_match = 1 );
2324                     if (regtry(&reginfo, &s)) goto got_it;
2325                     s++;
2326                     while (s < strend && *s == ch)
2327                         s++;
2328                 }
2329             );
2330         }
2331         DEBUG_EXECUTE_r(if (!did_match)
2332                 PerlIO_printf(Perl_debug_log,
2333                                   "Did not find anchored character...\n")
2334                );
2335     }
2336     else if (prog->anchored_substr != NULL
2337               || prog->anchored_utf8 != NULL
2338               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2339                   && prog->float_max_offset < strend - s)) {
             /* Required-substring scan: locate each occurrence of `must`
              * with fbm_instr() and try only the start positions that could
              * place it at its recorded offset range. */
2340         SV *must;
2341         I32 back_max;
2342         I32 back_min;
2343         char *last;
2344         char *last1;            /* Last position checked before */
2345 #ifdef DEBUGGING
2346         int did_match = 0;
2347 #endif
             /* Prefer the anchored substring (fixed offset); otherwise use
              * the floating one with its min/max offset range. */
2348         if (prog->anchored_substr || prog->anchored_utf8) {
2349             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2350                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2351             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2352             back_max = back_min = prog->anchored_offset;
2353         } else {
2354             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2355                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2356             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2357             back_max = prog->float_max_offset;
2358             back_min = prog->float_min_offset;
2359         }
2360
2361
2362         if (must == &PL_sv_undef)
2363             /* could not downgrade utf8 check substring, so must fail */
2364             goto phooey;
2365
2366         if (back_min<0) {
2367             last = strend;
2368         } else {
2369             last = HOP3c(strend,        /* Cannot start after this */
2370                   -(I32)(CHR_SVLEN(must)
2371                          - (SvTAIL(must) != 0) + back_min), strbeg);
2372         }
2373         if (s > PL_bostr)
2374             last1 = HOPc(s, -1);
2375         else
2376             last1 = s - 1;      /* bogus */
2377
2378         /* XXXX check_substr already used to find "s", can optimize if
2379            check_substr==must. */
2380         scream_pos = -1;
2381         dontbother = end_shift;
2382         strend = HOPc(strend, -dontbother);
             /* For each hit of `must`, the candidate starts run from
              * (hit - back_max) to (hit - back_min); last1 remembers the last
              * position already covered so that overlapping windows are not
              * retried. */
2383         while ( (s <= last) &&
2384                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2385                                   (unsigned char*)strend, must,
2386                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2387             DEBUG_EXECUTE_r( did_match = 1 );
2388             if (HOPc(s, -back_max) > last1) {
2389                 last1 = HOPc(s, -back_min);
2390                 s = HOPc(s, -back_max);
2391             }
2392             else {
2393                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2394
2395                 last1 = HOPc(s, -back_min);
2396                 s = t;
2397             }
2398             if (utf8_target) {
2399                 while (s <= last1) {
2400                     if (regtry(&reginfo, &s))
2401                         goto got_it;
2402                     s += UTF8SKIP(s);
2403                 }
2404             }
2405             else {
2406                 while (s <= last1) {
2407                     if (regtry(&reginfo, &s))
2408                         goto got_it;
2409                     s++;
2410                 }
2411             }
2412         }
2413         DEBUG_EXECUTE_r(if (!did_match) {
2414             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2415                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2416             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2417                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2418                                ? "anchored" : "floating"),
2419                 quoted, RE_SV_TAIL(must));
2420         });
2421         goto phooey;
2422     }
2423     else if ( (c = progi->regstclass) ) {
             /* Start-class scan: let find_byclass() locate positions whose
              * first character fits the start class and try the match
              * there. */
2424         if (minlen) {
2425             const OPCODE op = OP(progi->regstclass);
2426             /* don't bother with what can't match */
2427             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2428                 strend = HOPc(strend, -(minlen - 1));
2429         }
2430         DEBUG_EXECUTE_r({
2431             SV * const prop = sv_newmortal();
2432             regprop(prog, prop, c);
2433             {
2434                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2435                     s,strend-s,60);
2436                 PerlIO_printf(Perl_debug_log,
2437                     "Matching stclass %.*s against %s (%d bytes)\n",
2438                     (int)SvCUR(prop), SvPVX_const(prop),
2439                      quoted, (int)(strend - s));
2440             }
2441         });
2442         if (find_byclass(prog, c, s, strend, &reginfo))
2443             goto got_it;
2444         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2445     }
2446     else {
             /* Fallback: no usable substring or start class.  First shrink
              * the range using the floating substring (if any) and minlen,
              * then try every remaining position. */
2447         dontbother = 0;
2448         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2449             /* Trim the end. */
2450             char *last= NULL;
2451             SV* float_real;
2452             STRLEN len;
2453             const char *little;
2454
2455             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2456                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2457             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2458
2459             little = SvPV_const(float_real, len);
2460             if (SvTAIL(float_real)) {
2461                     /* This means that float_real contains an artificial \n on the end
2462                      * due to the presence of something like this: /foo$/
2463                      * where we can match both "foo" and "foo\n" at the end of the string.
2464                      * So we have to compare the end of the string first against the float_real
2465                      * without the \n and then against the full float_real with the string.
2466                      * We have to watch out for cases where the string might be smaller
2467                      * than the float_real or the float_real without the \n.
2468                      */
2469                     char *checkpos= strend - len;
2470                     DEBUG_OPTIMISE_r(
2471                         PerlIO_printf(Perl_debug_log,
2472                             "%sChecking for float_real.%s\n",
2473                             PL_colors[4], PL_colors[5]));
2474                     if (checkpos + 1 < strbeg) {
2475                         /* can't match, even if we remove the trailing \n string is too short to match */
2476                         DEBUG_EXECUTE_r(
2477                             PerlIO_printf(Perl_debug_log,
2478                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2479                                 PL_colors[4], PL_colors[5]));
2480                         goto phooey;
2481                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2482                         /* can match, the end of the string matches without the "\n" */
2483                         last = checkpos + 1;
2484                     } else if (checkpos < strbeg) {
2485                         /* cant match, string is too short when the "\n" is included */
2486                         DEBUG_EXECUTE_r(
2487                             PerlIO_printf(Perl_debug_log,
2488                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2489                                 PL_colors[4], PL_colors[5]));
2490                         goto phooey;
2491                     } else if (!multiline) {
2492                         /* non multiline match, so compare with the "\n" at the end of the string */
2493                         if (memEQ(checkpos, little, len)) {
2494                             last= checkpos;
2495                         } else {
2496                             DEBUG_EXECUTE_r(
2497                                 PerlIO_printf(Perl_debug_log,
2498                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2499                                     PL_colors[4], PL_colors[5]));
2500                             goto phooey;
2501                         }
2502                     } else {
2503                         /* multiline match, so we have to search for a place where the full string is located */
2504                         goto find_last;
2505                     }
2506             } else {
                       /* Also reached by goto from the multiline SvTAIL case
                        * above: find the last occurrence of the substring. */
2507                   find_last:
2508                     if (len)
2509                         last = rninstr(s, strend, little, little + len);
2510                     else
2511                         last = strend;  /* matching "$" */
2512             }
2513             if (!last) {
2514                 /* at one point this block contained a comment which was probably
2515                  * incorrect, which said that this was a "should not happen" case.
2516                  * Even if it was true when it was written I am pretty sure it is
2517                  * not anymore, so I have removed the comment and replaced it with
2518                  * this one. Yves */
2519                 DEBUG_EXECUTE_r(
2520                     PerlIO_printf(Perl_debug_log,
2521                         "String does not contain required substring, cannot match.\n"
2522                     ));
2523                 goto phooey;
2524             }
                 /* No start after the last occurrence (less the minimum
                  * offset of the floating substring) needs to be tried. */
2525             dontbother = strend - last + prog->float_min_offset;
2526         }
2527         if (minlen && (dontbother < minlen))
2528             dontbother = minlen - 1;
2529         strend -= dontbother;              /* this one's always in bytes! */
2530         /* We don't know much -- general case. */
             /* Both loops try position s before testing the bound, so a
              * start at the trimmed strend itself is still attempted. */
2531         if (utf8_target) {
2532             for (;;) {
2533                 if (regtry(&reginfo, &s))
2534                     goto got_it;
2535                 if (s >= strend)
2536                     break;
2537                 s += UTF8SKIP(s);
2538             };
2539         }
2540         else {
2541             do {
2542                 if (regtry(&reginfo, &s))
2543                     goto got_it;
2544             } while (s++ < strend);
2545         }
2546     }
2547
2548     /* Failure. */
2549     goto phooey;
2550
     /* Success exit: drop the swapped-out offsets, record taint, restore
      * pos() bookkeeping if eval setup ran, and publish the matched string. */
2551 got_it:
2552     DEBUG_BUFFERS_r(
2553         if (swap)
2554             PerlIO_printf(Perl_debug_log,
2555                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2556                 PTR2UV(prog),
2557                 PTR2UV(swap)
2558             );
2559     );
2560     Safefree(swap);
2561     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2562
2563     if (PL_reg_state.re_state_eval_setup_done)
2564         restore_pos(aTHX_ prog);
2565     if (RXp_PAREN_NAMES(prog))
2566         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2567
2568     /* make sure $`, $&, $', and $digit will work later */
2569     if ( !(flags & REXEC_NOT_FIRST) ) {
2570         RX_MATCH_COPY_FREE(rx);
2571         if (flags & REXEC_COPY_STR) {
                 /* i is the number of bytes, counted from strbeg, that get
                  * saved; PL_regeol is used because strend may have been
                  * trimmed above. */
2572             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2573 #ifdef PERL_OLD_COPY_ON_WRITE
2574             if ((SvIsCOW(sv)
2575                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2576                 if (DEBUG_C_TEST) {
2577                     PerlIO_printf(Perl_debug_log,
2578                                   "Copy on write: regexp capture, type %d\n",
2579                                   (int) SvTYPE(sv));
2580                 }
2581                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2582                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2583                 assert (SvPOKp(prog->saved_copy));
2584             } else
2585 #endif
2586             {
2587                 RX_MATCH_COPIED_on(rx);
2588                 s = savepvn(strbeg, i);
2589                 prog->subbeg = s;
2590             }
2591             prog->sublen = i;
2592         }
2593         else {
                 /* No copy requested: point straight at the caller's buffer. */
2594             prog->subbeg = strbeg;
2595             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2596         }
2597     }
2598
2599     return 1;
2600
     /* Failure exit: restore pos() bookkeeping if eval setup ran and put the
      * previous capture offsets back if they were swapped out at entry. */
2601 phooey:
2602     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2603                           PL_colors[4], PL_colors[5]));
2604     if (PL_reg_state.re_state_eval_setup_done)
2605         restore_pos(aTHX_ prog);
2606     if (swap) {
2607         /* we failed :-( roll it back */
2608         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2609             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2610             PTR2UV(prog),
2611             PTR2UV(prog->offs),
2612             PTR2UV(swap)
2613         ));
2614         Safefree(prog->offs);
2615         prog->offs = swap;
2616     }
2617
2618     return 0;
2619 }
2620
2621
/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
 *
 * Does nothing unless PL_reg_state.re_state_eval_setup_done is set.
 * The reference count of the new rex (Re2) is incremented before that of
 * the old one is decremented, in case old and new rex are the same.
 *
 * Re2 is evaluated twice, so it must not have side effects.
 *
 * The body is wrapped in STMT_START/STMT_END so that the expansion is a
 * single statement: a bare "if (...) { ... }" followed by the caller's
 * ';' would break (or silently re-bind) an enclosing unbraced if/else.
 */
#define SET_reg_curpm(Re2) \
    STMT_START {                                        \
        if (PL_reg_state.re_state_eval_setup_done) {    \
            (void)ReREFCNT_inc(Re2);                    \
            ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
            PM_SETRE((PL_reg_curpm), (Re2));            \
        }                                               \
    } STMT_END
2630
2631
2632 /*
2633  - regtry - try match at specific point
2634  */
2635 STATIC I32                      /* 0 failure, 1 success */
2636 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2637 {
2638     dVAR;
2639     CHECKPOINT lastcp;
2640     REGEXP *const rx = reginfo->prog;
2641     regexp *const prog = (struct regexp *)SvANY(rx);
2642     RXi_GET_DECL(prog,progi);
2643     GET_RE_DEBUG_FLAGS_DECL;
2644
2645     PERL_ARGS_ASSERT_REGTRY;
2646
2647     reginfo->cutpoint=NULL;
2648
2649     if ((prog->extflags & RXf_EVAL_SEEN)
2650         && !PL_reg_state.re_state_eval_setup_done)
2651     {
2652         MAGIC *mg;
2653
2654         PL_reg_state.re_state_eval_setup_done = TRUE;
2655         if (reginfo->sv) {
2656             /* Make $_ available to executed code. */
2657             if (reginfo->sv != DEFSV) {
2658                 SAVE_DEFSV;
2659                 DEFSV_set(reginfo->sv);
2660             }
2661
2662             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2663                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2664                 /* prepare for quick setting of pos */
2665 #ifdef PERL_OLD_COPY_ON_WRITE
2666                 if (SvIsCOW(reginfo->sv))
2667                     sv_force_normal_flags(reginfo->sv, 0);
2668 #endif
2669                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2670                                  &PL_vtbl_mglob, NULL, 0);
2671                 mg->mg_len = -1;
2672             }
2673             PL_reg_magic    = mg;
2674             PL_reg_oldpos   = mg->mg_len;
2675             SAVEDESTRUCTOR_X(restore_pos, prog);
2676         }
2677         if (!PL_reg_curpm) {
2678             Newxz(PL_reg_curpm, 1, PMOP);
2679 #ifdef USE_ITHREADS
2680             {
2681                 SV* const repointer = &PL_sv_undef;
2682                 /* this regexp is also owned by the new PL_reg_curpm, which
2683                    will try to free it.  */
2684                 av_push(PL_regex_padav, repointer);
2685                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2686                 PL_regex_pad = AvARRAY(PL_regex_padav);
2687             }
2688 #endif
2689         }
2690         SET_reg_curpm(rx);
2691         PL_reg_oldcurpm = PL_curpm;
2692         PL_curpm = PL_reg_curpm;
2693         if (RXp_MATCH_COPIED(prog)) {
2694             /*  Here is a serious problem: we cannot rewrite subbeg,
2695                 since it may be needed if this match fails.  Thus
2696                 $` inside (?{}) could fail... */
2697             PL_reg_oldsaved = prog->subbeg;
2698             PL_reg_oldsavedlen = prog->sublen;
2699 #ifdef PERL_OLD_COPY_ON_WRITE
2700             PL_nrs = prog->saved_copy;
2701 #endif
2702             RXp_MATCH_COPIED_off(prog);
2703         }
2704         else
2705             PL_reg_oldsaved = NULL;
2706         prog->subbeg = PL_bostr;
2707         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2708     }
2709 #ifdef DEBUGGING
2710     PL_reg_starttry = *startpos;
2711 #endif
2712     prog->offs[0].start = *startpos - PL_bostr;
2713     PL_reginput = *startpos;
2714     prog->lastparen = 0;
2715     prog->lastcloseparen = 0;
2716     PL_regsize = 0;
2717
2718     /* XXXX What this code is doing here?!!!  There should be no need
2719        to do this again and again, prog->lastparen should take care of
2720        this!  --ilya*/
2721
2722     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2723      * Actually, the code in regcppop() (which Ilya may be meaning by
2724      * prog->lastparen), is not needed at all by the test suite
2725      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2726      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2727      * Meanwhile, this code *is* needed for the
2728      * above-mentioned test suite tests to succeed.  The common theme
2729      * on those tests seems to be returning null fields from matches.
2730      * --jhi updated by dapm */
2731 #if 1
2732     if (prog->nparens) {
2733         regexp_paren_pair *pp = prog->offs;
2734         register I32 i;
2735         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2736             ++pp;
2737             pp->start = -1;
2738             pp->end = -1;
2739         }
2740     }
2741 #endif
2742     REGCP_SET(lastcp);
2743     if (regmatch(reginfo, progi->program + 1)) {
2744         prog->offs[0].end = PL_reginput - PL_bostr;
2745         return 1;
2746     }
2747     if (reginfo->cutpoint)
2748         *startpos= reginfo->cutpoint;
2749     REGCP_UNWIND(lastcp);
2750     return 0;
2751 }
2752
2753
/* Shorthand exits from the op currently being executed.  Each one is a
 * bare jump to the label of the same name (yes, no, no_silent); those
 * labels are not defined in this part of the file. */
2754 #define sayYES goto yes
2755 #define sayNO goto no
2756 #define sayNO_SILENT goto no_silent
2757
2758 /* we don't use STMT_START/END here because it leads to
2759    "unreachable code" warnings, which are bogus, but distracting.

   CACHEsayNO: if ST.cache_mask is non-zero, OR it into
   PL_reg_poscache[ST.cache_offset], then fail via sayNO.

   Because the expansion is an "if" statement followed by a separate
   "goto", it must not be used as the body of an unbraced if/else/loop:
   only the "if" part would be governed by the enclosing condition. */
2760 #define CACHEsayNO \
2761     if (ST.cache_mask) \
2762        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2763     sayNO
2764
2765 /* this is used to determine how far from the left messages like
2766    'failed...' are printed. It should be set such that messages
2767    are inline with the regop output that created them.
2768 */
2769 #define REPORT_CODE_OFF 32
2770
2771
/* Sentinel values for the c1/c2 "next char" test */
2772 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2773 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2774
/* Address of the first / last element of the states[] array of slab s
 * (the array has PERL_REGMATCH_SLAB_SLOTS elements) */
2775 #define SLAB_FIRST(s) (&(s)->states[0])
2776 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2777
2778 /* grab a new slab and return the first slot in it */
2779
2780 STATIC regmatch_state *
2781 S_push_slab(pTHX)
2782 {
2783 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2784     dMY_CXT;
2785 #endif
2786     regmatch_slab *s = PL_regmatch_slab->next;
2787     if (!s) {
2788         Newx(s, 1, regmatch_slab);
2789         s->prev = PL_regmatch_slab;
2790         s->next = NULL;
2791         PL_regmatch_slab->next = s;
2792     }
2793     PL_regmatch_slab = s;
2794     return SLAB_FIRST(s);
2795 }
2796
2797
2798 /* push a new state then goto it
 *
 * Sets scan to node, stores state in st->resume_state, then jumps to the
 * push_state label (not defined in this part of the file).
 *
 * The expansion is three separate statements, the last already ending in
 * ';', so the macro must only be used where a statement sequence is
 * allowed (e.g. inside braces), never as the body of an unbraced
 * if/else/loop. */
2799
2800 #define PUSH_STATE_GOTO(state, node) \
2801     scan = node; \
2802     st->resume_state = state; \
2803     goto push_state;
2804
2805 /* push a new state with success backtracking, then goto it
 *
 * Same as PUSH_STATE_GOTO, except that the jump goes to the
 * push_yes_state label.  The same usage restriction applies. */
2806
2807 #define PUSH_YES_STATE_GOTO(state, node) \
2808     scan = node; \
2809     st->resume_state = state; \
2810     goto push_yes_state;
2811
2812
2813
2814 /*
2815
2816 regmatch() - main matching routine
2817
2818 This is basically one big switch statement in a loop. We execute an op,
2819 set 'next' to point to the next op, and continue. If we come to a point which
2820 we may need to backtrack to on failure such as (A|B|C), we push a
2821 backtrack state onto the backtrack stack. On failure, we pop the top
2822 state, and re-enter the loop at the state indicated. If there are no more
2823 states to pop, we return failure.
2824
2825 Sometimes we also need to backtrack on success; for example /A+/, where
2826 after successfully matching one A, we need to go back and try to
2827 match another one; similarly for lookahead assertions: if the assertion
2828 completes successfully, we backtrack to the state just before the assertion
2829 and then carry on.  In these cases, the pushed state is marked as
2830 'backtrack on success too'. This marking is in fact done by a chain of
2831 pointers, each pointing to the previous 'yes' state. On success, we pop to
2832 the nearest yes state, discarding any intermediate failure-only states.
2833 Sometimes a yes state is pushed just to force some cleanup code to be
2834 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2835 it to free the inner regex.
2836
2837 Note that failure backtracking rewinds the cursor position, while
2838 success backtracking leaves it alone.
2839
2840 A pattern is complete when the END op is executed, while a subpattern
2841 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2842 ops trigger the "pop to last yes state if any, otherwise return true"
2843 behaviour.
2844
2845 A common convention in this function is to use A and B to refer to the two
2846 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2847 the subpattern to be matched possibly multiple times, while B is the entire
2848 rest of the pattern. Variable and state names reflect this convention.
2849
2850 The states in the main switch are the union of ops and failure/success of
2851 substates associated with that op.  For example, IFMATCH is the op
2852 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2853 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2854 successfully matched A and IFMATCH_A_fail is a state saying that we have
2855 just failed to match A. Resume states always come in pairs. The backtrack
2856 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2857 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2858 on success or failure.
2859
2860 The struct that holds a backtracking state is actually a big union, with
2861 one variant for each major type of op. The variable st points to the
2862 top-most backtrack struct. To make the code clearer, within each
2863 block of code we #define ST to alias the relevant union.
2864
2865 Here's a concrete example of a (vastly oversimplified) IFMATCH
2866 implementation:
2867
2868     switch (state) {
2869     ....
2870
2871 #define ST st->u.ifmatch
2872
2873     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2874         ST.foo = ...; // some state we wish to save
2875         ...
2876         // push a yes backtrack state with a resume value of
2877         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2878         // first node of A:
2879         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2880         // NOTREACHED
2881
2882     case IFMATCH_A: // we have successfully executed A; now continue with B
2883         next = B;
2884         bar = ST.foo; // do something with the preserved value
2885         break;
2886
2887     case IFMATCH_A_fail: // A failed, so the assertion failed
2888         ...;   // do some housekeeping, then ...
2889         sayNO; // propagate the failure
2890
2891 #undef ST
2892
2893     ...
2894     }
2895
2896 For any old-timers reading this who are familiar with the old recursive
2897 approach, the code above is equivalent to:
2898
2899     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2900     {
2901         int foo = ...
2902         ...
2903         if (regmatch(A)) {
2904             next = B;
2905             bar = foo;
2906             break;
2907         }
2908         ...;   // do some housekeeping, then ...
2909         sayNO; // propagate the failure
2910     }
2911
2912 The topmost backtrack state, pointed to by st, is usually free. If you
2913 want to claim it, populate any ST.foo fields in it with values you wish to
2914 save, then do one of
2915
2916         PUSH_STATE_GOTO(resume_state, node);
2917         PUSH_YES_STATE_GOTO(resume_state, node);
2918
2919 which sets that backtrack state's resume value to 'resume_state', pushes a
2920 new free entry to the top of the backtrack stack, then goes to 'node'.
2921 On backtracking, the free slot is popped, and the saved state becomes the
2922 new free state. An ST.foo field in this new top state can be temporarily
2923 accessed to retrieve values, but once the main loop is re-entered, it
2924 becomes available for reuse.
2925
2926 Note that the depth of the backtrack stack constantly increases during the
2927 left-to-right execution of the pattern, rather than going up and down with
2928 the pattern nesting. For example the stack is at its maximum at Z at the
2929 end of the pattern, rather than at X in the following:
2930
2931     /(((X)+)+)+....(Y)+....Z/
2932
2933 The only exceptions to this are lookahead/behind assertions and the cut,
2934 (?>A), which pop all the backtrack states associated with A before
2935 continuing.
2936
2937 Backtrack state structs are allocated in slabs of about 4K in size.
2938 PL_regmatch_state and st always point to the currently active state,
2939 and PL_regmatch_slab points to the slab currently containing
2940 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2941 allocated, and is never freed until interpreter destruction. When the slab
2942 is full, a new one is allocated and chained to the end. At exit from
2943 regmatch(), slabs allocated since entry are freed.
2944
2945 */
2946
2947
/* DEBUG_STATE_pp(pp): trace output for a backtrack-state push or pop.
 *
 * pp must be a string literal: it is pasted into the format string by
 * literal concatenation.  Inside DEBUG_STATE_r the macro first dumps the
 * current execution position, then prints, indented by depth*2 columns,
 * the pp tag, the PL_reg_name[] entry for st->resume_state, and a
 * bracketed marker: "Y" if st is yes_state, "M" if st is mark_state.
 *
 * Uses locinput, scan, utf8_target, depth, st, yes_state and mark_state
 * from the scope in which it is expanded.  The expansion already ends
 * with ';'. */
2948 #define DEBUG_STATE_pp(pp)                                  \
2949     DEBUG_STATE_r({                                         \
2950         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2951         PerlIO_printf(Perl_debug_log,                       \
2952             "    %*s"pp" %s%s%s%s%s\n",                     \
2953             depth*2, "",                                    \
2954             PL_reg_name[st->resume_state],                     \
2955             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2956             ((st==yes_state) ? "Y" : ""),                   \
2957             ((st==mark_state) ? "M" : ""),                  \
2958             ((st==yes_state||st==mark_state) ? "]" : "")    \
2959         );                                                  \
2960     });
2961
2962
/* Distance of node x from prog, as an int, or -1 if x is NULL.
 * "prog" is not a parameter: it is taken from the scope in which the
 * macro is expanded. */
2963 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2964
2965 #ifdef DEBUGGING
2966
2967 STATIC void
2968 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2969     const char *start, const char *end, const char *blurb)
2970 {
2971     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2972
2973     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2974
2975     if (!PL_colorset)
2976             reginitcolors();
2977     {
2978         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2979             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2980
2981         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2982             start, end - start, 60);
2983
2984         PerlIO_printf(Perl_debug_log,
2985             "%s%s REx%s %s against %s\n",
2986                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2987
2988         if (utf8_target||utf8_pat)
2989             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2990                 utf8_pat ? "pattern" : "",
2991                 utf8_pat && utf8_target ? " and " : "",
2992                 utf8_target ? "string" : ""
2993             );
2994     }
2995 }
2996
2997 STATIC void
2998 S_dump_exec_pos(pTHX_ const char *locinput,
2999                       const regnode *scan,
3000                       const char *loc_regeol,
3001                       const char *loc_bostr,
3002                       const char *loc_reg_starttry,
3003                       const bool utf8_target)
3004 {
3005     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3006     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3007     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3008     /* The part of the string before starttry has one color
3009        (pref0_len chars), between starttry and current
3010        position another one (pref_len - pref0_len chars),
3011        after the current position the third one.
3012        We assume that pref0_len <= pref_len, otherwise we
3013        decrease pref0_len.  */
3014     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3015         ? (5 + taill) - l : locinput - loc_bostr;
3016     int pref0_len;
3017
3018     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3019
3020     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3021         pref_len++;
3022     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3023     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3024         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3025               ? (5 + taill) - pref_len : loc_regeol - locinput);
3026     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3027         l--;
3028     if (pref0_len < 0)
3029         pref0_len = 0;
3030     if (pref0_len > pref_len)
3031         pref0_len = pref_len;
3032     {
3033         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3034
3035         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3036             (locinput - pref_len),pref0_len, 60, 4, 5);
3037
3038         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3039                     (locinput - pref_len + pref0_len),
3040                     pref_len - pref0_len, 60, 2, 3);
3041
3042         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3043                     locinput, loc_regeol - locinput, 10, 0, 1);
3044
3045         const STRLEN tlen=len0+len1+len2;
3046         PerlIO_printf(Perl_debug_log,
3047                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3048                     (IV)(locinput - loc_bostr),
3049                     len0, s0,
3050                     len1, s1,
3051                     (docolor ? "" : "> <"),
3052                     len2, s2,
3053                     (int)(tlen > 19 ? 0 :  19 - tlen),
3054                     "");
3055     }
3056 }
3057
3058 #endif
3059
3060 /* reg_check_named_buff_matched()
3061  * Checks to see if a named buffer has matched. The data array of
3062  * buffer numbers corresponding to the buffer is expected to reside
3063  * in the regexp->data->data array in the slot stored in the ARG() of
3064  * node involved. Note that this routine doesn't actually care about the
3065  * name, that information is not preserved from compilation to execution.
3066  * Returns the index of the leftmost defined buffer with the given name
3067  * or 0 if non of the buffers matched.
3068  */
3069 STATIC I32
3070 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3071 {
3072     I32 n;
3073     RXi_GET_DECL(rex,rexi);
3074     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3075     I32 *nums=(I32*)SvPVX(sv_dat);
3076
3077     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3078
3079     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3080         if ((I32)rex->lastparen >= nums[n] &&
3081             rex->offs[nums[n]].end != -1)
3082         {
3083             return nums[n];
3084         }
3085     }
3086     return 0;
3087 }
3088
3089
3090 /* free all slabs above current one  - called during LEAVE_SCOPE */
3091
3092 STATIC void
3093 S_clear_backtrack_stack(pTHX_ void *p)
3094 {
3095     regmatch_slab *s = PL_regmatch_slab->next;
3096     PERL_UNUSED_ARG(p);
3097
3098     if (!s)
3099         return;
3100     PL_regmatch_slab->next = NULL;
3101     while (s) {
3102         regmatch_slab * const osl = s;
3103         s = s->next;
3104         Safefree(osl);
3105     }
3106 }
3107
3108
3109 STATIC I32                      /* 0 failure, 1 success */
3110 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3111 {
3112 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3113     dMY_CXT;
3114 #endif
3115     dVAR;
3116     register const bool utf8_target = PL_reg_match_utf8;
3117     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3118     REGEXP *rex_sv = reginfo->prog;
3119     regexp *rex = (struct regexp *)SvANY(rex_sv);
3120     RXi_GET_DECL(rex,rexi);
3121     I32 oldsave;
3122     /* the current state. This is a cached copy of PL_regmatch_state */
3123     register regmatch_state *st;
3124     /* cache heavy used fields of st in registers */
3125     register regnode *scan;
3126     register regnode *next;
3127     register U32 n = 0; /* general value; init to avoid compiler warning */
3128     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3129     register char *locinput = PL_reginput;
3130     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3131
3132     bool result = 0;        /* return value of S_regmatch */
3133     int depth = 0;          /* depth of backtrack stack */
3134     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3135     const U32 max_nochange_depth =
3136         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3137         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3138     regmatch_state *yes_state = NULL; /* state to pop to on success of
3139                                                             subpattern */
3140     /* mark_state piggy backs on the yes_state logic so that when we unwind
3141        the stack on success we can update the mark_state as we go */
3142     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3143     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3144     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3145     U32 state_num;
3146     bool no_final = 0;      /* prevent failure from backtracking? */
3147     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3148     char *startpoint = PL_reginput;
3149     SV *popmark = NULL;     /* are we looking for a mark? */
3150     SV *sv_commit = NULL;   /* last mark name seen in failure */
3151     SV *sv_yes_mark = NULL; /* last mark name we have seen
3152                                during a successful match */
3153     U32 lastopen = 0;       /* last open we saw */
3154     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3155     SV* const oreplsv = GvSV(PL_replgv);
3156     /* these three flags are set by various ops to signal information to
3157      * the very next op. They have a useful lifetime of exactly one loop
3158      * iteration, and are not preserved or restored by state pushes/pops
3159      */
3160     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3161     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3162     int logical = 0;        /* the following EVAL is:
3163                                 0: (?{...})
3164                                 1: (?(?{...})X|Y)
3165                                 2: (??{...})
3166                                or the following IFMATCH/UNLESSM is:
3167                                 false: plain (?=foo)
3168                                 true:  used as a condition: (?(?=foo))
3169                             */
3170     PAD* last_pad = NULL;
3171     dMULTICALL;
3172     I32 gimme = G_SCALAR;
3173     CV *caller_cv = NULL;       /* who called us */
3174     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3175     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3176
3177 #ifdef DEBUGGING
3178     GET_RE_DEBUG_FLAGS_DECL;
3179 #endif
3180
3181     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3182     multicall_oldcatch = 0;
3183     multicall_cv = NULL;
3184     cx = NULL;
3185     PERL_UNUSED_VAR(multicall_cop);
3186     PERL_UNUSED_VAR(newsp);
3187
3188
3189     PERL_ARGS_ASSERT_REGMATCH;
3190
3191     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3192             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3193     }));
3194     /* on first ever call to regmatch, allocate first slab */
3195     if (!PL_regmatch_slab) {
3196         Newx(PL_regmatch_slab, 1, regmatch_slab);
3197         PL_regmatch_slab->prev = NULL;
3198         PL_regmatch_slab->next = NULL;
3199         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3200     }
3201
3202     oldsave = PL_savestack_ix;
3203     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3204     SAVEVPTR(PL_regmatch_slab);
3205     SAVEVPTR(PL_regmatch_state);
3206
3207     /* grab next free state slot */
3208     st = ++PL_regmatch_state;
3209     if (st >  SLAB_LAST(PL_regmatch_slab))
3210         st = PL_regmatch_state = S_push_slab(aTHX);
3211
3212     /* Note that nextchr is a byte even in UTF */
3213     nextchr = UCHARAT(locinput);
3214     scan = prog;
3215     while (scan != NULL) {
3216
3217         DEBUG_EXECUTE_r( {
3218             SV * const prop = sv_newmortal();
3219             regnode *rnext=regnext(scan);
3220             DUMP_EXEC_POS( locinput, scan, utf8_target );
3221             regprop(rex, prop, scan);
3222
3223             PerlIO_printf(Perl_debug_log,
3224                     "%3"IVdf":%*s%s(%"IVdf")\n",
3225                     (IV)(scan - rexi->program), depth*2, "",
3226                     SvPVX_const(prop),
3227                     (PL_regkind[OP(scan)] == END || !rnext) ?
3228                         0 : (IV)(rnext - rexi->program));
3229         });
3230
3231         next = scan + NEXT_OFF(scan);
3232         if (next == scan)
3233             next = NULL;
3234         state_num = OP(scan);
3235
3236       reenter_switch:
3237
3238         switch (state_num) {
3239         case BOL:
3240             if (locinput == PL_bostr)
3241             {
3242                 /* reginfo->till = reginfo->bol; */
3243                 break;
3244             }
3245             sayNO;
3246         case MBOL:
3247             if (locinput == PL_bostr ||
3248                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3249             {
3250                 break;
3251             }
3252             sayNO;
3253         case SBOL:
3254             if (locinput == PL_bostr)
3255                 break;
3256             sayNO;
3257         case GPOS:
3258             if (locinput == reginfo->ganch)
3259                 break;
3260             sayNO;
3261
3262         case KEEPS:
3263             /* update the startpoint */
3264             st->u.keeper.val = rex->offs[0].start;
3265             PL_reginput = locinput;
3266             rex->offs[0].start = locinput - PL_bostr;
3267             PUSH_STATE_GOTO(KEEPS_next, next);
3268             /*NOT-REACHED*/
3269         case KEEPS_next_fail:
3270             /* rollback the start point change */
3271             rex->offs[0].start = st->u.keeper.val;
3272             sayNO_SILENT;
3273             /*NOT-REACHED*/
3274         case EOL:
3275                 goto seol;
3276         case MEOL:
3277             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3278                 sayNO;
3279             break;
3280         case SEOL:
3281           seol:
3282             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3283                 sayNO;
3284             if (PL_regeol - locinput > 1)
3285                 sayNO;
3286             break;
3287         case EOS:
3288             if (PL_regeol != locinput)
3289                 sayNO;
3290             break;
3291         case SANY:
3292             if (!nextchr && locinput >= PL_regeol)
3293                 sayNO;
3294             if (utf8_target) {
3295                 locinput += PL_utf8skip[nextchr];
3296                 if (locinput > PL_regeol)
3297                     sayNO;
3298                 nextchr = UCHARAT(locinput);
3299             }
3300             else
3301                 nextchr = UCHARAT(++locinput);
3302             break;
3303         case CANY:
3304             if (!nextchr && locinput >= PL_regeol)
3305                 sayNO;
3306             nextchr = UCHARAT(++locinput);
3307             break;
3308         case REG_ANY:
3309             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3310                 sayNO;
3311             if (utf8_target) {
3312                 locinput += PL_utf8skip[nextchr];
3313                 if (locinput > PL_regeol)
3314                     sayNO;
3315                 nextchr = UCHARAT(locinput);
3316             }
3317             else
3318                 nextchr = UCHARAT(++locinput);
3319             break;
3320
3321 #undef  ST
3322 #define ST st->u.trie
3323         case TRIEC:
3324             /* In this case the charclass data is available inline so
3325                we can fail fast without a lot of extra overhead.
3326              */
3327             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3328                 DEBUG_EXECUTE_r(
3329                     PerlIO_printf(Perl_debug_log,
3330                               "%*s  %sfailed to match trie start class...%s\n",
3331                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3332                 );
3333                 sayNO_SILENT;
3334                 assert(0); /* NOTREACHED */
3335             }
3336             /* FALL THROUGH */
3337         case TRIE:
3338             /* the basic plan of execution of the trie is:
3339              * At the beginning, run though all the states, and
3340              * find the longest-matching word. Also remember the position
3341              * of the shortest matching word. For example, this pattern:
3342              *    1  2 3 4    5
3343              *    ab|a|x|abcd|abc
3344              * when matched against the string "abcde", will generate
3345              * accept states for all words except 3, with the longest
3346              * matching word being 4, and the shortest being 1 (with
3347              * the position being after char 1 of the string).
3348              *
3349              * Then for each matching word, in word order (i.e. 1,2,4,5),
3350              * we run the remainder of the pattern; on each try setting
3351              * the current position to the character following the word,
3352              * returning to try the next word on failure.
3353              *
3354              * We avoid having to build a list of words at runtime by
3355              * using a compile-time structure, wordinfo[].prev, which
3356              * gives, for each word, the previous accepting word (if any).
3357              * In the case above it would contain the mappings 1->2, 2->0,
3358              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3359              * the longest word (4 above), a list of all words, by
3360              * following the list of prev pointers; this gives us the
3361              * unordered list 4,5,1,2. Then given the current word we have
3362              * just tried, we can go through the list and find the
3363              * next-biggest word to try (so if we just failed on word 2,
3364              * the next in the list is 4).
3365              *
3366              * Since at runtime we don't record the matching position in
3367              * the string for each word, we have to work that out for
3368              * each word we're about to process. The wordinfo table holds
3369              * the character length of each word; given that we recorded
3370              * at the start: the position of the shortest word and its
3371              * length in chars, we just need to move the pointer the
3372              * difference between the two char lengths. Depending on
3373              * Unicode status and folding, that's cheap or expensive.
3374              *
3375              * This algorithm is optimised for the case where there are only a
3376              * small number of accept states, i.e. 0,1, or maybe 2.
3377              * With lots of accept states, and having to try all of them,
3378              * it becomes quadratic on number of accept states to find all
3379              * the next words.
3380              */
3381
3382             {
3383                 /* what type of TRIE am I? (utf8 makes this contextual) */
3384                 DECL_TRIE_TYPE(scan);
3385
3386                 /* what trie are we using right now */
3387                 reg_trie_data * const trie
3388                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3389                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3390                 U32 state = trie->startstate;
3391
3392                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3393                     if (trie->states[ state ].wordnum) {
3394                          DEBUG_EXECUTE_r(
3395                             PerlIO_printf(Perl_debug_log,
3396                                           "%*s  %smatched empty string...%s\n",
3397                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3398                         );
3399                         if (!trie->jump)
3400                             break;
3401                     } else {
3402                         DEBUG_EXECUTE_r(
3403                             PerlIO_printf(Perl_debug_log,
3404                                           "%*s  %sfailed to match trie start class...%s\n",
3405                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3406                         );
3407                         sayNO_SILENT;
3408                    }
3409                 }
3410
3411             {
3412                 U8 *uc = ( U8* )locinput;
3413
3414                 STRLEN len = 0;
3415                 STRLEN foldlen = 0;
3416                 U8 *uscan = (U8*)NULL;
3417                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3418                 U32 charcount = 0; /* how many input chars we have matched */
3419                 U32 accepted = 0; /* have we seen any accepting states? */
3420
3421                 ST.jump = trie->jump;
3422                 ST.me = scan;
3423                 ST.firstpos = NULL;
3424                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3425                 ST.nextword = 0;
3426
3427                 /* fully traverse the TRIE; note the position of the
3428                    shortest accept state and the wordnum of the longest
3429                    accept state */
3430
3431                 while ( state && uc <= (U8*)PL_regeol ) {
3432                     U32 base = trie->states[ state ].trans.base;
3433                     UV uvc = 0;
3434                     U16 charid = 0;
3435                     U16 wordnum;
3436                     wordnum = trie->states[ state ].wordnum;
3437
3438                     if (wordnum) { /* it's an accept state */
3439                         if (!accepted) {
3440                             accepted = 1;
3441                             /* record first match position */
3442                             if (ST.longfold) {
3443                                 ST.firstpos = (U8*)locinput;
3444                                 ST.firstchars = 0;
3445                             }
3446                             else {
3447                                 ST.firstpos = uc;
3448                                 ST.firstchars = charcount;
3449                             }
3450                         }
3451                         if (!ST.nextword || wordnum < ST.nextword)
3452                             ST.nextword = wordnum;
3453                         ST.topword = wordnum;
3454                     }
3455
3456                     DEBUG_TRIE_EXECUTE_r({
3457                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3458                                 PerlIO_printf( Perl_debug_log,
3459                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3460                                     2+depth * 2, "", PL_colors[4],
3461                                     (UV)state, (accepted ? 'Y' : 'N'));
3462                     });
3463
3464                     /* read a char and goto next state */
3465                     if ( base ) {
3466                         I32 offset;
3467                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3468                                              uscan, len, uvc, charid, foldlen,
3469                                              foldbuf, uniflags);
3470                         charcount++;
3471                         if (foldlen>0)
3472                             ST.longfold = TRUE;
3473                         if (charid &&
3474                              ( ((offset =
3475                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3476
3477                              && ((U32)offset < trie->lasttrans)
3478                              && trie->trans[offset].check == state)
3479                         {
3480                             state = trie->trans[offset].next;
3481                         }
3482                         else {
3483                             state = 0;
3484                         }
3485                         uc += len;
3486
3487                     }
3488                     else {
3489                         state = 0;
3490                     }
3491                     DEBUG_TRIE_EXECUTE_r(
3492                         PerlIO_printf( Perl_debug_log,
3493                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3494                             charid, uvc, (UV)state, PL_colors[5] );
3495                     );
3496                 }
3497                 if (!accepted)
3498                    sayNO;
3499
3500                 /* calculate total number of accept states */
3501                 {
3502                     U16 w = ST.topword;
3503                     accepted = 0;
3504                     while (w) {
3505                         w = trie->wordinfo[w].prev;
3506                         accepted++;
3507                     }
3508                     ST.accepted = accepted;
3509                 }
3510
3511                 DEBUG_EXECUTE_r(
3512                     PerlIO_printf( Perl_debug_log,
3513                         "%*s  %sgot %"IVdf" possible matches%s\n",
3514                         REPORT_CODE_OFF + depth * 2, "",
3515                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3516                 );
3517                 goto trie_first_try; /* jump into the fail handler */
3518             }}
3519             assert(0); /* NOTREACHED */
3520
3521         case TRIE_next_fail: /* we failed - try next alternative */
3522             if ( ST.jump) {
3523                 REGCP_UNWIND(ST.cp);
3524                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3525             }
3526             if (!--ST.accepted) {
3527                 DEBUG_EXECUTE_r({
3528                     PerlIO_printf( Perl_debug_log,
3529                         "%*s  %sTRIE failed...%s\n",
3530                         REPORT_CODE_OFF+depth*2, "",
3531                         PL_colors[4],
3532                         PL_colors[5] );
3533                 });
3534                 sayNO_SILENT;
3535             }
3536             {
3537                 /* Find next-highest word to process.  Note that this code
3538                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3539                 register U16 min = 0;
3540                 register U16 word;
3541                 register U16 const nextword = ST.nextword;
3542                 register reg_trie_wordinfo * const wordinfo
3543                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3544                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3545                     if (word > nextword && (!min || word < min))
3546                         min = word;
3547                 }
3548                 ST.nextword = min;
3549             }
3550
3551           trie_first_try:
3552             if (do_cutgroup) {
3553                 do_cutgroup = 0;
3554                 no_final = 0;
3555             }
3556
3557             if ( ST.jump) {
3558                 ST.lastparen = rex->lastparen;
3559                 ST.lastcloseparen = rex->lastcloseparen;
3560                 REGCP_SET(ST.cp);
3561             }
3562
3563             /* find start char of end of current word */
3564             {
3565                 U32 chars; /* how many chars to skip */
3566                 U8 *uc = ST.firstpos;
3567                 reg_trie_data * const trie
3568                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3569
3570                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3571                             >=  ST.firstchars);
3572                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3573                             - ST.firstchars;
3574
3575                 if (ST.longfold) {
3576                     /* the hard option - fold each char in turn and find
3577                      * its folded length (which may be different) */
3578                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3579                     STRLEN foldlen;
3580                     STRLEN len;
3581                     UV uvc;
3582                     U8 *uscan;
3583
3584                     while (chars) {
3585                         if (utf8_target) {
3586                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3587                                                     uniflags);
3588                             uc += len;
3589                         }
3590                         else {
3591                             uvc = *uc;
3592                             uc++;
3593                         }
3594                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3595                         uscan = foldbuf;
3596                         while (foldlen) {
3597                             if (!--chars)
3598                                 break;
3599                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3600                                             uniflags);
3601                             uscan += len;
3602                             foldlen -= len;
3603                         }
3604                     }
3605                 }
3606                 else {
3607                     if (utf8_target)
3608                         while (chars--)
3609                             uc += UTF8SKIP(uc);
3610                     else
3611                         uc += chars;
3612                 }
3613                 PL_reginput = (char *)uc;
3614             }
3615
3616             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3617                             ? ST.jump[ST.nextword]
3618                             : NEXT_OFF(ST.me));
3619
3620             DEBUG_EXECUTE_r({
3621                 PerlIO_printf( Perl_debug_log,
3622                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3623                     REPORT_CODE_OFF+depth*2, "",
3624                     PL_colors[4],
3625                     ST.nextword,
3626                     PL_colors[5]
3627                     );
3628             });
3629
3630             if (ST.accepted > 1 || has_cutgroup) {
3631                 PUSH_STATE_GOTO(TRIE_next, scan);
3632                 assert(0); /* NOTREACHED */
3633             }
3634             /* only one choice left - just continue */
3635             DEBUG_EXECUTE_r({
3636                 AV *const trie_words
3637                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3638                 SV ** const tmp = av_fetch( trie_words,
3639                     ST.nextword-1, 0 );
3640                 SV *sv= tmp ? sv_newmortal() : NULL;
3641
3642                 PerlIO_printf( Perl_debug_log,
3643                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3644                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3645                     ST.nextword,
3646                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3647                             PL_colors[0], PL_colors[1],
3648                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3649                         )
3650                     : "not compiled under -Dr",
3651                     PL_colors[5] );
3652             });
3653
3654             locinput = PL_reginput;
3655             nextchr = UCHARAT(locinput);
3656             continue; /* execute rest of RE */
3657             assert(0); /* NOTREACHED */
3658 #undef  ST
3659
3660         case EXACT: {
3661             char *s = STRING(scan);
3662             ln = STR_LEN(scan);
3663             if (utf8_target != UTF_PATTERN) {
3664                 /* The target and the pattern have differing utf8ness. */
3665                 char *l = locinput;
3666                 const char * const e = s + ln;
3667
3668                 if (utf8_target) {
3669                     /* The target is utf8, the pattern is not utf8. */
3670                     while (s < e) {
3671                         STRLEN ulen;
3672                         if (l >= PL_regeol)
3673                              sayNO;
3674                         if (NATIVE_TO_UNI(*(U8*)s) !=
3675                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3676                                             uniflags))
3677                              sayNO;
3678                         l += ulen;
3679                         s ++;
3680                     }
3681                 }
3682                 else {
3683                     /* The target is not utf8, the pattern is utf8. */
3684                     while (s < e) {
3685                         STRLEN ulen;
3686                         if (l >= PL_regeol)
3687                             sayNO;
3688                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3689                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3690                                            uniflags))
3691                             sayNO;
3692                         s += ulen;
3693                         l ++;
3694                     }
3695                 }
3696                 locinput = l;
3697                 nextchr = UCHARAT(locinput);
3698                 break;
3699             }
3700             /* The target and the pattern have the same utf8ness. */
3701             /* Inline the first character, for speed. */
3702             if (UCHARAT(s) != nextchr)
3703                 sayNO;
3704             if (PL_regeol - locinput < ln)
3705                 sayNO;
3706             if (ln > 1 && memNE(s, locinput, ln))
3707                 sayNO;
3708             locinput += ln;
3709             nextchr = UCHARAT(locinput);
3710             break;
3711             }
3712         case EXACTFL: {
3713             re_fold_t folder;
3714             const U8 * fold_array;
3715             const char * s;
3716             U32 fold_utf8_flags;
3717
3718             PL_reg_flags |= RF_tainted;
3719             folder = foldEQ_locale;
3720             fold_array = PL_fold_locale;
3721             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3722             goto do_exactf;
3723
3724         case EXACTFU_SS:
3725         case EXACTFU_TRICKYFOLD:
3726         case EXACTFU:
3727             folder = foldEQ_latin1;
3728             fold_array = PL_fold_latin1;
3729             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3730             goto do_exactf;
3731
3732         case EXACTFA:
3733             folder = foldEQ_latin1;
3734             fold_array = PL_fold_latin1;
3735             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3736             goto do_exactf;
3737
3738         case EXACTF:
3739             folder = foldEQ;
3740             fold_array = PL_fold;
3741             fold_utf8_flags = 0;
3742
3743           do_exactf:
3744             s = STRING(scan);
3745             ln = STR_LEN(scan);
3746
3747             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3748               /* Either the target or the pattern is utf8, or this node has
3749                * the issue where the fold lengths may differ. */
3750                 const char * const l = locinput;
3751                 char *e = PL_regeol;
3752
3753                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3754                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3755                 {
3756                     sayNO;
3757                 }
3758                 locinput = e;
3759                 nextchr = UCHARAT(locinput);
3760                 break;
3761             }
3762
3763             /* Neither the target nor the pattern are utf8 */
3764             if (UCHARAT(s) != nextchr &&
3765                 UCHARAT(s) != fold_array[nextchr])
3766             {
3767                 sayNO;
3768             }
3769             if (PL_regeol - locinput < ln)
3770                 sayNO;
3771             if (ln > 1 && ! folder(s, locinput, ln))
3772                 sayNO;
3773             locinput += ln;
3774             nextchr = UCHARAT(locinput);
3775             break;
3776         }
3777
3778         /* XXX Could improve efficiency by separating these all out using a
3779          * macro or in-line function.  At that point regcomp.c would no longer
3780          * have to set the FLAGS fields of these */
3781         case BOUNDL:
3782         case NBOUNDL:
3783             PL_reg_flags |= RF_tainted;
3784             /* FALL THROUGH */
3785         case BOUND:
3786         case BOUNDU:
3787         case BOUNDA:
3788         case NBOUND:
3789         case NBOUNDU:
3790         case NBOUNDA:
3791             /* was last char in word? */
3792             if (utf8_target
3793                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3794                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3795             {
3796                 if (locinput == PL_bostr)
3797                     ln = '\n';
3798                 else {
3799                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3800
3801                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3802                 }
3803                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3804                     ln = isALNUM_uni(ln);
3805                     LOAD_UTF8_CHARCLASS_ALNUM();
3806                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3807                 }
3808                 else {
3809                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3810                     n = isALNUM_LC_utf8((U8*)locinput);
3811                 }
3812             }
3813             else {
3814
3815                 /* Here the string isn't utf8, or is utf8 and only ascii
3816                  * characters are to match \w.  In the latter case looking at
3817                  * the byte just prior to the current one may be just the final
3818                  * byte of a multi-byte character.  This is ok.  There are two
3819                  * cases:
3820                  * 1) it is a single byte character, and then the test is doing
3821                  *      just what it's supposed to.
3822                  * 2) it is a multi-byte character, in which case the final
3823                  *      byte is never mistakable for ASCII, and so the test
3824                  *      will say it is not a word character, which is the
3825                  *      correct answer. */
3826                 ln = (locinput != PL_bostr) ?
3827                     UCHARAT(locinput - 1) : '\n';
3828                 switch (FLAGS(scan)) {
3829                     case REGEX_UNICODE_CHARSET:
3830                         ln = isWORDCHAR_L1(ln);
3831                         n = isWORDCHAR_L1(nextchr);
3832                         break;
3833                     case REGEX_LOCALE_CHARSET:
3834                         ln = isALNUM_LC(ln);
3835                         n = isALNUM_LC(nextchr);
3836                         break;
3837                     case REGEX_DEPENDS_CHARSET:
3838                         ln = isALNUM(ln);
3839                         n = isALNUM(nextchr);
3840                         break;
3841                     case REGEX_ASCII_RESTRICTED_CHARSET:
3842                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3843                         ln = isWORDCHAR_A(ln);
3844                         n = isWORDCHAR_A(nextchr);
3845                         break;
3846                     default:
3847                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3848                         break;
3849                 }
3850             }
3851             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3852              * regcomp.sym */
3853             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3854                     sayNO;
3855             break;
3856         case ANYOFV:
3857         case ANYOF:
3858             if (utf8_target || state_num == ANYOFV) {
3859                 STRLEN inclasslen = PL_regeol - locinput;
3860                 if (locinput >= PL_regeol)
3861                     sayNO;
3862
3863                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3864                     sayNO;
3865                 locinput += inclasslen;
3866                 nextchr = UCHARAT(locinput);
3867                 break;
3868             }
3869             else {
3870                 if (nextchr < 0)
3871                     nextchr = UCHARAT(locinput);
3872                 if (!nextchr && locinput >= PL_regeol)
3873                     sayNO;
3874                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3875                     sayNO;
3876                 nextchr = UCHARAT(++locinput);
3877                 break;
3878             }
3879             break;
3880         /* Special char classes - The defines start on line 129 or so */
3881         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3882                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3883                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3884                   ALNUMA, NALNUMA, isWORDCHAR_A,
3885                   alnum, "a");
3886
3887         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3888                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3889                   SPACEU, NSPACEU, isSPACE_L1,
3890                   SPACEA, NSPACEA, isSPACE_A,
3891                   space, " ");
3892
3893         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3894                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3895                 DIGITA, NDIGITA, isDIGIT_A,
3896                 digit, "0");
3897
3898         case POSIXA:
3899             if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
3900                 sayNO;
3901             }
3902             /* Matched a utf8-invariant, so don't have to worry about utf8 */
3903             nextchr = UCHARAT(++locinput);
3904             break;
3905         case NPOSIXA:
3906             if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
3907                 sayNO;
3908             }
3909             if (utf8_target) {
3910                 locinput += PL_utf8skip[nextchr];
3911                 nextchr = UCHARAT(locinput);
3912             }
3913             else {
3914                 nextchr = UCHARAT(++locinput);
3915             }
3916             break;
3917
3918         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3919                        a Unicode extended Grapheme Cluster */
3920             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3921               extended Grapheme Cluster is:
3922
3923                CR LF
3924                | Prepend* Begin Extend*
3925                | .
3926
3927                Begin is (Hangul-syllable | ! Control)
3928                Extend is (Grapheme_Extend | Spacing_Mark)
3929                Control is [ GCB_Control CR LF ]
3930
3931                The discussion below shows how the code for CLUMP is derived
3932                from this regex.  Note that most of these concepts are from
3933                property values of the Grapheme Cluster Boundary (GCB) property.
3934                No code point can have multiple property values for a given
3935                property.  Thus a code point in Prepend can't be in Control, but
3936                it must be in !Control.  This is why Control above includes
3937                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3938                property separately, and so can't be in GCB_Control, even though
3939                they logically are controls.  Control is not the same as gc=cc,
3940                but includes format and other characters as well.
3941
3942                The Unicode definition of Hangul-syllable is:
3943                   ( L+
3944                    | (L* ( ( V | LV ) V* | LVT ) T*)
3945                    | T+
3946                   )
3947                Each of these is a value for the GCB property, and hence must be
3948                disjoint, so the order they are tested is immaterial, so the
3949                above can safely be changed to
3950                    T+
3951                    | L+
3952                    | (L* ( LVT | ( V | LV ) V*) T*)
3953
3954                The last two terms can be combined like this:
3955                    L* ( L
3956                         | (( LVT | ( V | LV ) V*) T*))
3957
3958                And refactored into this:
3959                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3960
3961                That means that if we have seen any L's at all we can quit
3962                there, but if the next character is an LVT, a V, or an LV we
3963                should keep going.
3964
3965                There is a subtlety with Prepend* which showed up in testing.
3966                Note that the Begin, and only the Begin is required in:
3967                 | Prepend* Begin Extend*
3968                Also, Begin contains '! Control'.  A Prepend must be a
3969                '!  Control', which means it must also be a Begin.  What it
3970                comes down to is that if we match Prepend* and then find no
3971                suitable Begin afterwards, that if we backtrack the last
3972                Prepend, that one will be a suitable Begin.
3973             */
3974
3975             if (locinput >= PL_regeol)
3976                 sayNO;
3977             if  (! utf8_target) {
3978
3979                 /* Match either CR LF  or '.', as all the other possibilities
3980                  * require utf8 */
3981                 locinput++;         /* Match the . or CR */
3982                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3983                                        match the LF */
3984                     && locinput < PL_regeol
3985                     && UCHARAT(locinput) == '\n') locinput++;
3986             }
3987             else {
3988
3989                 /* Utf8: See if is ( CR LF ); already know that locinput <
3990                  * PL_regeol, so locinput+1 is in bounds */
3991                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3992                     locinput += 2;
3993                 }
3994                 else {
3995                     /* In case have to backtrack to beginning, then match '.' */
3996                     char *starting = locinput;
3997
3998                     /* In case have to backtrack the last prepend */
3999                     char *previous_prepend = 0;
4000
4001                     LOAD_UTF8_CHARCLASS_GCB();
4002
4003                     /* Match (prepend)* */
4004                     while (locinput < PL_regeol
4005                            && swash_fetch(PL_utf8_X_prepend,
4006                                           (U8*)locinput, utf8_target))
4007                     {
4008                         previous_prepend = locinput;
4009                         locinput += UTF8SKIP(locinput);
4010                     }
4011
4012                     /* As noted above, if we matched a prepend character, but
4013                      * the next thing won't match, back off the last prepend we
4014                      * matched, as it is guaranteed to match the begin */
4015                     if (previous_prepend
4016                         && (locinput >=  PL_regeol
4017                             || ! swash_fetch(PL_utf8_X_begin,
4018                                              (U8*)locinput, utf8_target)))
4019                     {
4020                         locinput = previous_prepend;
4021                     }
4022
4023                     /* Note that here we know PL_regeol > locinput, as we
4024                      * tested that upon input to this switch case, and if we
4025                      * moved locinput forward, we tested the result just above
4026                      * and it either passed, or we backed off so that it will
4027                      * now pass */
4028                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
4029
4030                         /* Here did not match the required 'Begin' in the
4031                          * second term.  So just match the very first
4032                          * character, the '.' of the final term of the regex */
4033                         locinput = starting + UTF8SKIP(starting);
4034                     } else {
4035
4036                         /* Here is the beginning of a character that can have
4037                          * an extender.  It is either a hangul syllable, or a
4038                          * non-control */
4039                         if (swash_fetch(PL_utf8_X_non_hangul,
4040                                         (U8*)locinput, utf8_target))
4041                         {
4042
4043                             /* Here not a Hangul syllable, must be a
4044                              * ('! Control') */
4045                             locinput += UTF8SKIP(locinput);
4046                         } else {
4047
4048                             /* Here is a Hangul syllable.  It can be composed
4049                              * of several individual characters.  One
4050                              * possibility is T+ */
4051                             if (swash_fetch(PL_utf8_X_T,
4052                                             (U8*)locinput, utf8_target))
4053                             {
4054                                 while (locinput < PL_regeol
4055                                         && swash_fetch(PL_utf8_X_T,
4056                                                         (U8*)locinput, utf8_target))
4057                                 {
4058                                     locinput += UTF8SKIP(locinput);
4059                                 }
4060                             } else {
4061
4062                                 /* Here, not T+, but is a Hangul.  That means
4063                                  * it is one of the others: L, LV, LVT or V,
4064                                  * and matches:
4065                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4066
4067                                 /* Match L*           */
4068                                 while (locinput < PL_regeol
4069                                         && swash_fetch(PL_utf8_X_L,
4070                                                         (U8*)locinput, utf8_target))
4071                                 {
4072                                     locinput += UTF8SKIP(locinput);
4073                                 }
4074
4075                                 /* Here, we have exhausted L*.  If the next
4076                                  * character is not an LV, LVT or V, we must
4077                                  * have matched at least one L, i.e. L+ in the
4078                                  * original equation, so we have a complete
4079                                  * hangul syllable and are done. */
4080
4081                                 if (locinput < PL_regeol
4082                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4083                                                     (U8*)locinput, utf8_target))
4084                                 {
4085
4086                                     /* Otherwise keep going.  Must be LV, LVT
4087                                      * or V.  See if LVT */
4088                                     if (swash_fetch(PL_utf8_X_LVT,
4089                                                     (U8*)locinput, utf8_target))
4090                                     {
4091                                         locinput += UTF8SKIP(locinput);
4092                                     } else {
4093
4094                                         /* Must be  V or LV.  Take it, then
4095                                          * match V*     */
4096                                         locinput += UTF8SKIP(locinput);
4097                                         while (locinput < PL_regeol
4098                                                 && swash_fetch(PL_utf8_X_V,
4099                                                          (U8*)locinput, utf8_target))
4100                                         {
4101                                             locinput += UTF8SKIP(locinput);
4102                                         }
4103                                     }
4104
4105                                     /* And any of LV, LVT, or V can be followed
4106                                      * by T*            */
4107                                     while (locinput < PL_regeol
4108                                            && swash_fetch(PL_utf8_X_T,
4109                                                            (U8*)locinput,
4110                                                            utf8_target))
4111                                     {
4112                                         locinput += UTF8SKIP(locinput);
4113                                     }
4114                                 }
4115                             }
4116                         }
4117
4118                         /* Match any extender */
4119                         while (locinput < PL_regeol
4120                                 && swash_fetch(PL_utf8_X_extend,
4121                                                 (U8*)locinput, utf8_target))
4122                         {
4123                             locinput += UTF8SKIP(locinput);
4124                         }
4125                     }
4126                 }
4127                 if (locinput > PL_regeol) sayNO;
4128             }
4129             nextchr = UCHARAT(locinput);
4130             break;
4131
4132         case NREFFL:
4133         {   /* The capture buffer cases.  The ones beginning with N for the
4134                named buffers just convert to the equivalent numbered and
4135                pretend they were called as the corresponding numbered buffer
4136                op.  */
4137             /* don't initialize these in the declaration, it makes C++
4138                unhappy */
4139             char *s;
4140             char type;
4141             re_fold_t folder;
4142             const U8 *fold_array;
4143             UV utf8_fold_flags;
4144
4145             PL_reg_flags |= RF_tainted;
4146             folder = foldEQ_locale;
4147             fold_array = PL_fold_locale;
4148             type = REFFL;
4149             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4150             goto do_nref;
4151
4152         case NREFFA:
4153             folder = foldEQ_latin1;
4154             fold_array = PL_fold_latin1;
4155             type = REFFA;
4156             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4157             goto do_nref;
4158
4159         case NREFFU:
4160             folder = foldEQ_latin1;
4161             fold_array = PL_fold_latin1;
4162             type = REFFU;
4163             utf8_fold_flags = 0;
4164             goto do_nref;
4165
4166         case NREFF:
4167             folder = foldEQ;
4168             fold_array = PL_fold;
4169             type = REFF;
4170             utf8_fold_flags = 0;
4171             goto do_nref;
4172
4173         case NREF:
4174             type = REF;
4175             folder = NULL;
4176             fold_array = NULL;
4177             utf8_fold_flags = 0;
4178           do_nref:
4179
4180             /* For the named back references, find the corresponding buffer
4181              * number */
4182             n = reg_check_named_buff_matched(rex,scan);
4183
4184             if ( ! n ) {
4185                 sayNO;
4186             }
4187             goto do_nref_ref_common;
4188
4189         case REFFL:
4190             PL_reg_flags |= RF_tainted;
4191             folder = foldEQ_locale;
4192             fold_array = PL_fold_locale;
4193             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4194             goto do_ref;
4195
4196         case REFFA:
4197             folder = foldEQ_latin1;
4198             fold_array = PL_fold_latin1;
4199             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4200             goto do_ref;
4201
4202         case REFFU:
4203             folder = foldEQ_latin1;
4204             fold_array = PL_fold_latin1;
4205             utf8_fold_flags = 0;
4206             goto do_ref;
4207
4208         case REFF:
4209             folder = foldEQ;
4210             fold_array = PL_fold;
4211             utf8_fold_flags = 0;
4212             goto do_ref;
4213
4214         case REF:
4215             folder = NULL;
4216             fold_array = NULL;
4217             utf8_fold_flags = 0;
4218
4219           do_ref:
4220             type = OP(scan);
4221             n = ARG(scan);  /* which paren pair */
4222
4223           do_nref_ref_common:
4224             ln = rex->offs[n].start;
4225             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4226             if (rex->lastparen < n || ln == -1)
4227                 sayNO;                  /* Do not match unless seen CLOSEn. */
4228             if (ln == rex->offs[n].end)
4229                 break;
4230
4231             s = PL_bostr + ln;
4232             if (type != REF     /* REF can do byte comparison */
4233                 && (utf8_target || type == REFFU))
4234             { /* XXX handle REFFL better */
4235                 char * limit = PL_regeol;
4236
4237                 /* This call case insensitively compares the entire buffer
4238                     * at s, with the current input starting at locinput, but
4239                     * not going off the end given by PL_regeol, and returns in
4240                     * limit upon success, how much of the current input was
4241                     * matched */
4242                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4243                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4244                 {
4245                     sayNO;
4246                 }
4247                 locinput = limit;
4248                 nextchr = UCHARAT(locinput);
4249                 break;
4250             }
4251
4252             /* Not utf8:  Inline the first character, for speed. */
4253             if (UCHARAT(s) != nextchr &&
4254                 (type == REF ||
4255                  UCHARAT(s) != fold_array[nextchr]))
4256                 sayNO;
4257             ln = rex->offs[n].end - ln;
4258             if (locinput + ln > PL_regeol)
4259                 sayNO;
4260             if (ln > 1 && (type == REF
4261                            ? memNE(s, locinput, ln)
4262                            : ! folder(s, locinput, ln)))
4263                 sayNO;
4264             locinput += ln;
4265             nextchr = UCHARAT(locinput);
4266             break;
4267         }
4268         case NOTHING:
4269         case TAIL:
4270             break;
4271         case BACK:
4272             break;
4273
4274 #undef  ST
4275 #define ST st->u.eval
4276         {
4277             SV *ret;
4278             REGEXP *re_sv;
4279             regexp *re;
4280             regexp_internal *rei;
4281             regnode *startpoint;
4282
4283         case GOSTART:
4284         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4285             if (cur_eval && cur_eval->locinput==locinput) {
4286                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4287                     Perl_croak(aTHX_ "Infinite recursion in regex");
4288                 if ( ++nochange_depth > max_nochange_depth )
4289                     Perl_croak(aTHX_
4290                         "Pattern subroutine nesting without pos change"
4291                         " exceeded limit in regex");
4292             } else {
4293                 nochange_depth = 0;
4294             }
4295             re_sv = rex_sv;
4296             re = rex;
4297             rei = rexi;
4298             if (OP(scan)==GOSUB) {
4299                 startpoint = scan + ARG2L(scan);
4300                 ST.close_paren = ARG(scan);
4301             } else {
4302                 startpoint = rei->program+1;
4303                 ST.close_paren = 0;
4304             }
4305             goto eval_recurse_doit;
4306             assert(0); /* NOTREACHED */
4307         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4308             if (cur_eval && cur_eval->locinput==locinput) {
4309                 if ( ++nochange_depth > max_nochange_depth )
4310                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4311             } else {
4312                 nochange_depth = 0;
4313             }
4314             {
4315                 /* execute the code in the {...} */
4316
4317                 dSP;
4318                 SV ** before;
4319                 OP * const oop = PL_op;
4320                 COP * const ocurcop = PL_curcop;
4321                 OP *nop;
4322                 char *saved_regeol = PL_regeol;
4323                 struct re_save_state saved_state;
4324                 CV *newcv;
4325
4326                 /* save *all* paren positions */
4327                 regcppush(rex, 0);
4328                 REGCP_SET(runops_cp);
4329
4330                 /* To not corrupt the existing regex state while executing the
4331                  * eval we would normally put it on the save stack, like with
4332                  * save_re_context. However, re-evals have a weird scoping so we
4333                  * can't just add ENTER/LEAVE here. With that, things like
4334                  *
4335                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4336                  *
4337                  * would break, as they expect the localisation to be unwound
4338                  * only when the re-engine backtracks through the bit that
4339                  * localised it.
4340                  *
4341                  * What we do instead is just save the state in a local C
4342                  * variable.
4343                  */
4344                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4345
4346                 PL_reg_state.re_reparsing = FALSE;
4347
4348                 if (!caller_cv)
4349                     caller_cv = find_runcv(NULL);
4350
4351                 n = ARG(scan);
4352
4353                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4354                     newcv = ((struct regexp *)SvANY(
4355                                                 (REGEXP*)(rexi->data->data[n])
4356                                             ))->qr_anoncv
4357                                         ;
4358                     nop = (OP*)rexi->data->data[n+1];
4359                 }
4360                 else if (rexi->data->what[n] == 'l') { /* literal code */
4361                     newcv = caller_cv;
4362                     nop = (OP*)rexi->data->data[n];
4363                     assert(CvDEPTH(newcv));
4364                 }
4365                 else {
4366                     /* literal with own CV */
4367                     assert(rexi->data->what[n] == 'L');
4368                     newcv = rex->qr_anoncv;
4369                     nop = (OP*)rexi->data->data[n];
4370                 }
4371
4372                 /* normally if we're about to execute code from the same
4373                  * CV that we used previously, we just use the existing
4374                  * CX stack entry. However, it's possible that in the
4375                  * meantime we may have backtracked, popped from the save
4376                  * stack, and undone the SAVECOMPPAD(s) associated with
4377                  * PUSH_MULTICALL; in which case PL_comppad no longer
4378                  * points to newcv's pad. */
4379                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4380                 {
4381                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4382                     if (last_pushed_cv) {
4383                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4384                     }
4385                     else {
4386                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4387                     }
4388                     last_pushed_cv = newcv;
4389                 }
4390                 last_pad = PL_comppad;
4391
4392                 /* the initial nextstate you would normally execute
4393                  * at the start of an eval (which would cause error
4394                  * messages to come from the eval), may be optimised
4395                  * away from the execution path in the regex code blocks;
4396                  * so manually set PL_curcop to it initially */
4397                 {
4398                     OP *o = cUNOPx(nop)->op_first;
4399                     assert(o->op_type == OP_NULL);
4400                     if (o->op_targ == OP_SCOPE) {
4401                         o = cUNOPo->op_first;
4402                     }
4403                     else {
4404                         assert(o->op_targ == OP_LEAVE);
4405                         o = cUNOPo->op_first;
4406                         assert(o->op_type == OP_ENTER);
4407                         o = o->op_sibling;
4408                     }
4409
4410                     if (o->op_type != OP_STUB) {
4411                         assert(    o->op_type == OP_NEXTSTATE
4412                                 || o->op_type == OP_DBSTATE
4413                                 || (o->op_type == OP_NULL
4414                                     &&  (  o->op_targ == OP_NEXTSTATE
4415                                         || o->op_targ == OP_DBSTATE
4416                                         )
4417                                     )
4418                         );
4419                         PL_curcop = (COP*)o;
4420                     }
4421                 }
4422                 nop = nop->op_next;
4423
4424                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4425                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4426
4427                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4428
4429                 if (sv_yes_mark) {
4430                     SV *sv_mrk = get_sv("REGMARK", 1);
4431                     sv_setsv(sv_mrk, sv_yes_mark);
4432                 }
4433
4434                 /* we don't use MULTICALL here as we want to call the
4435                  * first op of the block of interest, rather than the
4436                  * first op of the sub */
4437                 before = SP;
4438                 PL_op = nop;
4439                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4440                 SPAGAIN;
4441                 if (SP == before)
4442                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4443                 else {
4444                     ret = POPs;
4445                     PUTBACK;
4446                 }
4447
4448                 /* before restoring everything, evaluate the returned
4449                  * value, so that 'uninit' warnings don't use the wrong
4450                  * PL_op or pad. Also need to process any magic vars
4451                  * (e.g. $1) *before* parentheses are restored */
4452
4453                 PL_op = NULL;
4454
4455                 re_sv = NULL;
4456                 if (logical == 0)        /*   (?{})/   */
4457                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4458                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4459                     sw = cBOOL(SvTRUE(ret));
4460                     logical = 0;
4461                 }
4462                 else {                   /*  /(??{})  */
4463                     /*  if it's overloaded, let the regex compiler handle
4464                      *  it; otherwise extract regex, or stringify  */
4465                     if (!SvAMAGIC(ret)) {
4466                         SV *sv = ret;
4467                         if (SvROK(sv))
4468                             sv = SvRV(sv);
4469                         if (SvTYPE(sv) == SVt_REGEXP)
4470                             re_sv = (REGEXP*) sv;
4471                         else if (SvSMAGICAL(sv)) {
4472                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4473                             if (mg)
4474                                 re_sv = (REGEXP *) mg->mg_obj;
4475                         }
4476
4477                         /* force any magic, undef warnings here */
4478                         if (!re_sv) {
4479                             ret = sv_mortalcopy(ret);
4480                             (void) SvPV_force_nolen(ret);
4481                         }
4482                     }
4483
4484                 }
4485
4486                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4487
4488                 /* *** Note that at this point we don't restore
4489                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4490                  * be used again soon. This is safe as long as nothing
4491                  * in the regexp code uses the pad ! */
4492                 PL_op = oop;
4493                 PL_curcop = ocurcop;
4494                 PL_regeol = saved_regeol;
4495                 S_regcp_restore(aTHX_ rex, runops_cp);
4496
4497                 if (logical != 2)
4498                     break;
4499             }
4500
4501                 /* only /(??{})/  from now on */
4502                 logical = 0;
4503                 {
4504                     /* extract RE object from returned value; compiling if
4505                      * necessary */
4506
4507                     if (re_sv) {
4508                         re_sv = reg_temp_copy(NULL, re_sv);
4509                     }
4510                     else {
4511                         U32 pm_flags = 0;
4512                         const I32 osize = PL_regsize;
4513
4514                         if (SvUTF8(ret) && IN_BYTES) {
4515                             /* In use 'bytes': make a copy of the octet
4516                              * sequence, but without the flag on */
4517                             STRLEN len;
4518                             const char *const p = SvPV(ret, len);
4519                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4520                         }
4521                         if (rex->intflags & PREGf_USE_RE_EVAL)
4522                             pm_flags |= PMf_USE_RE_EVAL;
4523
4524                         /* if we got here, it should be an engine which
4525                          * supports compiling code blocks and stuff */
4526                         assert(rex->engine && rex->engine->op_comp);
4527                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4528                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4529                                     rex->engine, NULL, NULL,
4530                                     /* copy /msix etc to inner pattern */
4531                                     scan->flags,
4532                                     pm_flags);
4533
4534                         if (!(SvFLAGS(ret)
4535                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4536                                  | SVs_GMG))) {
4537                             /* This isn't a first class regexp. Instead, it's
4538                                caching a regexp onto an existing, Perl visible
4539                                scalar.  */
4540                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4541                         }
4542                         PL_regsize = osize;
4543                         /* safe to do now that any $1 etc has been
4544                          * interpolated into the new pattern string and
4545                          * compiled */
4546                         S_regcp_restore(aTHX_ rex, runops_cp);
4547                     }
4548                     re = (struct regexp *)SvANY(re_sv);
4549                 }
4550                 RXp_MATCH_COPIED_off(re);
4551                 re->subbeg = rex->subbeg;
4552                 re->sublen = rex->sublen;
4553                 rei = RXi_GET(re);
4554                 DEBUG_EXECUTE_r(
4555                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4556                         "Matching embedded");
4557                 );
4558                 startpoint = rei->program + 1;
4559                 ST.close_paren = 0; /* only used for GOSUB */
4560
4561         eval_recurse_doit: /* Share code with GOSUB below this line */
4562                 /* run the pattern returned from (??{...}) */
4563                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4564                 REGCP_SET(ST.lastcp);
4565
4566                 re->lastparen = 0;
4567                 re->lastcloseparen = 0;
4568
4569                 PL_reginput = locinput;
4570                 PL_regsize = 0;
4571
4572                 /* XXXX This is too dramatic a measure... */
4573                 PL_reg_maxiter = 0;
4574
4575                 ST.toggle_reg_flags = PL_reg_flags;
4576                 if (RX_UTF8(re_sv))
4577                     PL_reg_flags |= RF_utf8;
4578                 else
4579                     PL_reg_flags &= ~RF_utf8;
4580                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4581
4582                 ST.prev_rex = rex_sv;
4583                 ST.prev_curlyx = cur_curlyx;
4584                 rex_sv = re_sv;
4585                 SET_reg_curpm(rex_sv);
4586                 rex = re;
4587                 rexi = rei;
4588                 cur_curlyx = NULL;
4589                 ST.B = next;
4590                 ST.prev_eval = cur_eval;
4591                 cur_eval = st;
4592                 /* now continue from first node in postponed RE */
4593                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4594                 assert(0); /* NOTREACHED */
4595         }
4596
4597         case EVAL_AB: /* cleanup after a successful (??{A})B */
4598             /* note: this is called twice; first after popping B, then A */
4599             PL_reg_flags ^= ST.toggle_reg_flags;
4600             rex_sv = ST.prev_rex;
4601             SET_reg_curpm(rex_sv);
4602             rex = (struct regexp *)SvANY(rex_sv);
4603             rexi = RXi_GET(rex);
4604             regcpblow(ST.cp);
4605             cur_eval = ST.prev_eval;
4606             cur_curlyx = ST.prev_curlyx;
4607
4608             /* XXXX This is too dramatic a measure... */
4609             PL_reg_maxiter = 0;
4610             if ( nochange_depth )
4611                 nochange_depth--;
4612             sayYES;
4613
4614
4615         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4616             /* note: this is called twice; first after popping B, then A */
4617             PL_reg_flags ^= ST.toggle_reg_flags;
4618             rex_sv = ST.prev_rex;
4619             SET_reg_curpm(rex_sv);
4620             rex = (struct regexp *)SvANY(rex_sv);
4621             rexi = RXi_GET(rex);
4622
4623             PL_reginput = locinput;
4624             REGCP_UNWIND(ST.lastcp);
4625             regcppop(rex);
4626             cur_eval = ST.prev_eval;
4627             cur_curlyx = ST.prev_curlyx;
4628             /* XXXX This is too dramatic a measure... */
4629             PL_reg_maxiter = 0;
4630             if ( nochange_depth )
4631                 nochange_depth--;
4632             sayNO_SILENT;
4633 #undef ST
4634
4635         case OPEN:
4636             n = ARG(scan);  /* which paren pair */
4637             rex->offs[n].start_tmp = locinput - PL_bostr;
4638             if (n > PL_regsize)
4639                 PL_regsize = n;
4640             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4641                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4642                 PTR2UV(rex),
4643                 PTR2UV(rex->offs),
4644                 (UV)n,
4645                 (IV)rex->offs[n].start_tmp,
4646                 (UV)PL_regsize
4647             ));
4648             lastopen = n;
4649             break;
4650
4651 /* XXX really need to log other places start/end are set too */
4652 #define CLOSE_CAPTURE \
4653     rex->offs[n].start = rex->offs[n].start_tmp; \
4654     rex->offs[n].end = locinput - PL_bostr; \
4655     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4656         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4657         PTR2UV(rex), \
4658         PTR2UV(rex->offs), \
4659         (UV)n, \
4660         (IV)rex->offs[n].start, \
4661         (IV)rex->offs[n].end \
4662     ))
4663
4664         case CLOSE:
4665             n = ARG(scan);  /* which paren pair */
4666             CLOSE_CAPTURE;
4667             /*if (n > PL_regsize)
4668                 PL_regsize = n;*/
4669             if (n > rex->lastparen)
4670                 rex->lastparen = n;
4671             rex->lastcloseparen = n;
4672             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4673                 goto fake_end;
4674             }
4675             break;
4676         case ACCEPT:
4677             if (ARG(scan)){
4678                 regnode *cursor;
4679                 for (cursor=scan;
4680                      cursor && OP(cursor)!=END;
4681                      cursor=regnext(cursor))
4682                 {
4683                     if ( OP(cursor)==CLOSE ){
4684                         n = ARG(cursor);
4685                         if ( n <= lastopen ) {
4686                             CLOSE_CAPTURE;
4687                             /*if (n > PL_regsize)
4688                             PL_regsize = n;*/
4689                             if (n > rex->lastparen)
4690                                 rex->lastparen = n;
4691                             rex->lastcloseparen = n;
4692                             if ( n == ARG(scan) || (cur_eval &&
4693                                 cur_eval->u.eval.close_paren == n))
4694                                 break;
4695                         }
4696                     }
4697                 }
4698             }
4699             goto fake_end;
4700             /*NOTREACHED*/
4701         case GROUPP:
4702             n = ARG(scan);  /* which paren pair */
4703             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4704             break;
4705         case NGROUPP:
4706             /* reg_check_named_buff_matched returns 0 for no match */
4707             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4708             break;
4709         case INSUBP:
4710             n = ARG(scan);
4711             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4712             break;
4713         case DEFINEP:
4714             sw = 0;
4715             break;
4716         case IFTHEN:
4717             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4718             if (sw)
4719                 next = NEXTOPER(NEXTOPER(scan));
4720             else {
4721                 next = scan + ARG(scan);
4722                 if (OP(next) == IFTHEN) /* Fake one. */
4723                     next = NEXTOPER(NEXTOPER(next));
4724             }
4725             break;
4726         case LOGICAL:
4727             logical = scan->flags;
4728             break;
4729
4730 /*******************************************************************
4731
4732 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4733 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4734 STAR/PLUS/CURLY/CURLYN are used instead.)
4735
4736 A*B is compiled as <CURLYX><A><WHILEM><B>
4737
4738 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4739 state, which contains the current count, initialised to -1. It also sets
4740 cur_curlyx to point to this state, with any previous value saved in the
4741 state block.
4742
4743 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4744 since the pattern may possibly match zero times (i.e. it's a while {} loop
4745 rather than a do {} while loop).
4746
4747 Each entry to WHILEM represents a successful match of A. The count in the
4748 CURLYX block is incremented, another WHILEM state is pushed, and execution
4749 passes to A or B depending on greediness and the current count.
4750
4751 For example, if matching against the string a1a2a3b (where the aN are
4752 substrings that match /A/), then the match progresses as follows: (the
4753 pushed states are interspersed with the bits of strings matched so far):
4754
4755     <CURLYX cnt=-1>
4756     <CURLYX cnt=0><WHILEM>
4757     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4758     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4759     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4760     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4761
4762 (Contrast this with something like CURLYM, which maintains only a single
4763 backtrack state:
4764
4765     <CURLYM cnt=0> a1
4766     a1 <CURLYM cnt=1> a2
4767     a1 a2 <CURLYM cnt=2> a3
4768     a1 a2 a3 <CURLYM cnt=3> b
4769 )
4770
4771 Each WHILEM state block marks a point to backtrack to upon partial failure
4772 of A or B, and also contains some minor state data related to that
4773 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4774 overall state, such as the count, and pointers to the A and B ops.
4775
4776 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4777 must always point to the *current* CURLYX block, the rules are:
4778
4779 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4780 and set cur_curlyx to point to the new block.
4781
4782 When popping the CURLYX block after a successful or unsuccessful match,
4783 restore the previous cur_curlyx.
4784
4785 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4786 to the outer one saved in the CURLYX block.
4787
4788 When popping the WHILEM block after a successful or unsuccessful B match,
4789 restore the previous cur_curlyx.
4790
4791 Here's an example for the pattern (AI* BI)*BO
4792 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4793
4794 cur_
4795 curlyx backtrack stack
4796 ------ ---------------
4797 NULL
4798 CO     <CO prev=NULL> <WO>
4799 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4800 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4801 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4802
4803 At this point the pattern succeeds, and we work back down the stack to
4804 clean up, restoring as we go:
4805
4806 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4807 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4808 CO     <CO prev=NULL> <WO>
4809 NULL
4810
4811 *******************************************************************/
4812
4813 #define ST st->u.curlyx
4814
4815         case CURLYX:    /* start of /A*B/  (for complex A) */
4816         {
4817             /* No need to save/restore up to this paren */
4818             I32 parenfloor = scan->flags;
4819
4820             assert(next); /* keep Coverity happy */
4821             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4822                 next += ARG(next);
4823
4824             /* XXXX Probably it is better to teach regpush to support
4825                parenfloor > PL_regsize... */
4826             if (parenfloor > (I32)rex->lastparen)
4827                 parenfloor = rex->lastparen; /* Pessimization... */
4828
4829             ST.prev_curlyx= cur_curlyx;
4830             cur_curlyx = st;
4831             ST.cp = PL_savestack_ix;
4832
4833             /* these fields contain the state of the current curly.
4834              * they are accessed by subsequent WHILEMs */
4835             ST.parenfloor = parenfloor;
4836             ST.me = scan;
4837             ST.B = next;
4838             ST.minmod = minmod;
4839             minmod = 0;
4840             ST.count = -1;      /* this will be updated by WHILEM */
4841             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4842
4843             PL_reginput = locinput;
4844             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4845             assert(0); /* NOTREACHED */
4846         }
4847
4848         case CURLYX_end: /* just finished matching all of A*B */
4849             cur_curlyx = ST.prev_curlyx;
4850             sayYES;
4851             assert(0); /* NOTREACHED */
4852
4853         case CURLYX_end_fail: /* just failed to match all of A*B */
4854             regcpblow(ST.cp);
4855             cur_curlyx = ST.prev_curlyx;
4856             sayNO;
4857             assert(0); /* NOTREACHED */
4858
4859
4860 #undef ST
4861 #define ST st->u.whilem
4862
4863         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4864         {
4865             /* see the discussion above about CURLYX/WHILEM */
4866             I32 n;
4867             int min = ARG1(cur_curlyx->u.curlyx.me);
4868             int max = ARG2(cur_curlyx->u.curlyx.me);
4869             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4870
4871             assert(cur_curlyx); /* keep Coverity happy */
4872             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4873             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4874             ST.cache_offset = 0;
4875             ST.cache_mask = 0;
4876
4877             PL_reginput = locinput;
4878
4879             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4880                   "%*s  whilem: matched %ld out of %d..%d\n",
4881                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4882             );
4883
4884             /* First just match a string of min A's. */
4885
4886             if (n < min) {
4887                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4888                 cur_curlyx->u.curlyx.lastloc = locinput;
4889                 REGCP_SET(ST.lastcp);
4890
4891                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4892                 assert(0); /* NOTREACHED */
4893             }
4894
4895             /* If degenerate A matches "", assume A done. */
4896
4897             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4898                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4899                    "%*s  whilem: empty match detected, trying continuation...\n",
4900                    REPORT_CODE_OFF+depth*2, "")
4901                 );
4902                 goto do_whilem_B_max;
4903             }
4904
4905             /* super-linear cache processing */
4906
4907             if (scan->flags) {
4908
4909                 if (!PL_reg_maxiter) {
4910                     /* start the countdown: Postpone detection until we
4911                      * know the match is not *that* much linear. */
4912                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4913                     /* possible overflow for long strings and many CURLYX's */
4914                     if (PL_reg_maxiter < 0)
4915                         PL_reg_maxiter = I32_MAX;
4916                     PL_reg_leftiter = PL_reg_maxiter;
4917                 }
4918
4919                 if (PL_reg_leftiter-- == 0) {
4920                     /* initialise cache */
4921                     const I32 size = (PL_reg_maxiter + 7)/8;
4922                     if (PL_reg_poscache) {
4923                         if ((I32)PL_reg_poscache_size < size) {
4924                             Renew(PL_reg_poscache, size, char);
4925                             PL_reg_poscache_size = size;
4926                         }
4927                         Zero(PL_reg_poscache, size, char);
4928                     }
4929                     else {
4930                         PL_reg_poscache_size = size;
4931                         Newxz(PL_reg_poscache, size, char);
4932                     }
4933                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4934       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4935                               PL_colors[4], PL_colors[5])
4936                     );
4937                 }
4938
4939                 if (PL_reg_leftiter < 0) {
4940                     /* have we already failed at this position? */
4941                     I32 offset, mask;
4942                     offset  = (scan->flags & 0xf) - 1
4943                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4944                     mask    = 1 << (offset % 8);
4945                     offset /= 8;
4946                     if (PL_reg_poscache[offset] & mask) {
4947                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4948                             "%*s  whilem: (cache) already tried at this position...\n",
4949                             REPORT_CODE_OFF+depth*2, "")
4950                         );
4951                         sayNO; /* cache records failure */
4952                     }
4953                     ST.cache_offset = offset;
4954                     ST.cache_mask   = mask;
4955                 }
4956             }
4957
4958             /* Prefer B over A for minimal matching. */
4959
4960             if (cur_curlyx->u.curlyx.minmod) {
4961                 ST.save_curlyx = cur_curlyx;
4962                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4963                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4964                 REGCP_SET(ST.lastcp);
4965                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4966                 assert(0); /* NOTREACHED */
4967             }
4968
4969             /* Prefer A over B for maximal matching. */
4970
4971             if (n < max) { /* More greed allowed? */
4972                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4973                 cur_curlyx->u.curlyx.lastloc = locinput;
4974                 REGCP_SET(ST.lastcp);
4975                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4976                 assert(0); /* NOTREACHED */
4977             }
4978             goto do_whilem_B_max;
4979         }
4980         assert(0); /* NOTREACHED */
4981
4982         case WHILEM_B_min: /* just matched B in a minimal match */
4983         case WHILEM_B_max: /* just matched B in a maximal match */
4984             cur_curlyx = ST.save_curlyx;
4985             sayYES;
4986             assert(0); /* NOTREACHED */
4987
4988         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4989             cur_curlyx = ST.save_curlyx;
4990             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4991             cur_curlyx->u.curlyx.count--;
4992             CACHEsayNO;
4993             assert(0); /* NOTREACHED */
4994
4995         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4996             /* FALL THROUGH */
4997         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4998             REGCP_UNWIND(ST.lastcp);
4999             regcppop(rex);
5000             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5001             cur_curlyx->u.curlyx.count--;
5002             CACHEsayNO;
5003             assert(0); /* NOTREACHED */
5004
5005         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5006             REGCP_UNWIND(ST.lastcp);
5007             regcppop(rex);      /* Restore some previous $<digit>s? */
5008             PL_reginput = locinput;
5009             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5010                 "%*s  whilem: failed, trying continuation...\n",
5011                 REPORT_CODE_OFF+depth*2, "")
5012             );
5013           do_whilem_B_max:
5014             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5015                 && ckWARN(WARN_REGEXP)
5016                 && !(PL_reg_flags & RF_warned))
5017             {
5018                 PL_reg_flags |= RF_warned;
5019                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5020                      "Complex regular subexpression recursion limit (%d) "
5021                      "exceeded",
5022                      REG_INFTY - 1);
5023             }
5024
5025             /* now try B */
5026             ST.save_curlyx = cur_curlyx;
5027             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5028             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
5029             assert(0); /* NOTREACHED */
5030
5031         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5032             cur_curlyx = ST.save_curlyx;
5033             REGCP_UNWIND(ST.lastcp);
5034             regcppop(rex);
5035
5036             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5037                 /* Maximum greed exceeded */
5038                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5039                     && ckWARN(WARN_REGEXP)
5040                     && !(PL_reg_flags & RF_warned))
5041                 {
5042                     PL_reg_flags |= RF_warned;
5043                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5044                         "Complex regular subexpression recursion "
5045                         "limit (%d) exceeded",
5046                         REG_INFTY - 1);
5047                 }
5048                 cur_curlyx->u.curlyx.count--;
5049                 CACHEsayNO;
5050             }
5051
5052             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5053                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5054             );
5055             /* Try grabbing another A and see if it helps. */
5056             PL_reginput = locinput;
5057             cur_curlyx->u.curlyx.lastloc = locinput;
5058             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5059             REGCP_SET(ST.lastcp);
5060             PUSH_STATE_GOTO(WHILEM_A_min,
5061                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5062             assert(0); /* NOTREACHED */
5063
5064 #undef  ST
5065 #define ST st->u.branch
5066
5067         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5068             next = scan + ARG(scan);
5069             if (next == scan)
5070                 next = NULL;
5071             scan = NEXTOPER(scan);
5072             /* FALL THROUGH */
5073
5074         case BRANCH:        /*  /(...|A|...)/ */
5075             scan = NEXTOPER(scan); /* scan now points to inner node */
5076             ST.lastparen = rex->lastparen;
5077             ST.lastcloseparen = rex->lastcloseparen;
5078             ST.next_branch = next;
5079             REGCP_SET(ST.cp);
5080             PL_reginput = locinput;
5081
5082             /* Now go into the branch */
5083             if (has_cutgroup) {
5084                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5085             } else {
5086                 PUSH_STATE_GOTO(BRANCH_next, scan);
5087             }
5088             assert(0); /* NOTREACHED */
5089         case CUTGROUP:
5090             PL_reginput = locinput;
5091             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5092                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5093             PUSH_STATE_GOTO(CUTGROUP_next,next);
5094             assert(0); /* NOTREACHED */
5095         case CUTGROUP_next_fail:
5096             do_cutgroup = 1;
5097             no_final = 1;
5098             if (st->u.mark.mark_name)
5099                 sv_commit = st->u.mark.mark_name;
5100             sayNO;
5101             assert(0); /* NOTREACHED */
5102         case BRANCH_next:
5103             sayYES;
5104             assert(0); /* NOTREACHED */
5105         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5106             if (do_cutgroup) {
5107                 do_cutgroup = 0;
5108                 no_final = 0;
5109             }
5110             REGCP_UNWIND(ST.cp);
5111             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5112             scan = ST.next_branch;
5113             /* no more branches? */
5114             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5115                 DEBUG_EXECUTE_r({
5116                     PerlIO_printf( Perl_debug_log,
5117                         "%*s  %sBRANCH failed...%s\n",
5118                         REPORT_CODE_OFF+depth*2, "",
5119                         PL_colors[4],
5120                         PL_colors[5] );
5121                 });
5122                 sayNO_SILENT;
5123             }
5124             continue; /* execute next BRANCH[J] op */
5125             assert(0); /* NOTREACHED */
5126
5127         case MINMOD:
5128             minmod = 1;
5129             break;
5130
5131 #undef  ST
5132 #define ST st->u.curlym
5133
5134         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5135
5136             /* This is an optimisation of CURLYX that enables us to push
5137              * only a single backtracking state, no matter how many matches
5138              * there are in {m,n}. It relies on the pattern being constant
5139              * length, with no parens to influence future backrefs
5140              */
5141
5142             ST.me = scan;
5143             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5144
5145             ST.lastparen      = rex->lastparen;
5146             ST.lastcloseparen = rex->lastcloseparen;
5147
5148             /* if paren positive, emulate an OPEN/CLOSE around A */
5149             if (ST.me->flags) {
5150                 U32 paren = ST.me->flags;
5151                 if (paren > PL_regsize)
5152                     PL_regsize = paren;
5153                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5154             }
5155             ST.A = scan;
5156             ST.B = next;
5157             ST.alen = 0;
5158             ST.count = 0;
5159             ST.minmod = minmod;
5160             minmod = 0;
5161             ST.c1 = CHRTEST_UNINIT;
5162             REGCP_SET(ST.cp);
5163
5164             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5165                 goto curlym_do_B;
5166
5167           curlym_do_A: /* execute the A in /A{m,n}B/  */
5168             PL_reginput = locinput;
5169             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5170             assert(0); /* NOTREACHED */
5171
5172         case CURLYM_A: /* we've just matched an A */
5173             locinput = st->locinput;
5174             nextchr = UCHARAT(locinput);
5175
5176             ST.count++;
5177             /* after first match, determine A's length: u.curlym.alen */
5178             if (ST.count == 1) {
5179                 if (PL_reg_match_utf8) {
5180                     char *s = locinput;
5181                     while (s < PL_reginput) {
5182                         ST.alen++;
5183                         s += UTF8SKIP(s);
5184                     }
5185                 }
5186                 else {
5187                     ST.alen = PL_reginput - locinput;
5188                 }
5189                 if (ST.alen == 0)
5190                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5191             }
5192             DEBUG_EXECUTE_r(
5193                 PerlIO_printf(Perl_debug_log,
5194                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5195                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5196                           (IV) ST.count, (IV)ST.alen)
5197             );
5198
5199             locinput = PL_reginput;
5200
5201             if (cur_eval && cur_eval->u.eval.close_paren &&
5202                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5203                 goto fake_end;
5204
5205             {
5206                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5207                 if ( max == REG_INFTY || ST.count < max )
5208                     goto curlym_do_A; /* try to match another A */
5209             }
5210             goto curlym_do_B; /* try to match B */
5211
5212         case CURLYM_A_fail: /* just failed to match an A */
5213             REGCP_UNWIND(ST.cp);
5214
5215             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5216                 || (cur_eval && cur_eval->u.eval.close_paren &&
5217                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5218                 sayNO;
5219
5220           curlym_do_B: /* execute the B in /A{m,n}B/  */
5221             PL_reginput = locinput;
5222             if (ST.c1 == CHRTEST_UNINIT) {
5223                 /* calculate c1 and c2 for possible match of 1st char
5224                  * following curly */
5225                 ST.c1 = ST.c2 = CHRTEST_VOID;
5226                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5227                     regnode *text_node = ST.B;
5228                     if (! HAS_TEXT(text_node))
5229                         FIND_NEXT_IMPT(text_node);
5230                     /* this used to be
5231
5232                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5233
5234                         But the former is redundant in light of the latter.
5235
5236                         if this changes back then the macro for
5237                         IS_TEXT and friends need to change.
5238                      */
5239                     if (PL_regkind[OP(text_node)] == EXACT)
5240                     {
5241
5242                         ST.c1 = (U8)*STRING(text_node);
5243                         switch (OP(text_node)) {
5244                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5245                             case EXACTFA:
5246                             case EXACTFU_SS:
5247                             case EXACTFU_TRICKYFOLD:
5248                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5249                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5250                             default: ST.c2 = ST.c1;
5251                         }
5252                     }
5253                 }
5254             }
5255
5256             DEBUG_EXECUTE_r(
5257                 PerlIO_printf(Perl_debug_log,
5258                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5259                     (int)(REPORT_CODE_OFF+(depth*2)),
5260                     "", (IV)ST.count)
5261                 );
5262             if (ST.c1 != CHRTEST_VOID
5263                     && UCHARAT(PL_reginput) != ST.c1
5264                     && UCHARAT(PL_reginput) != ST.c2)
5265             {
5266                 /* simulate B failing */
5267                 DEBUG_OPTIMISE_r(
5268                     PerlIO_printf(Perl_debug_log,
5269                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5270                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5271                         (IV)ST.c1,(IV)ST.c2
5272                 ));
5273                 state_num = CURLYM_B_fail;
5274                 goto reenter_switch;
5275             }
5276
5277             if (ST.me->flags) {
5278                 /* emulate CLOSE: mark current A as captured */
5279                 I32 paren = ST.me->flags;
5280                 if (ST.count) {
5281                     rex->offs[paren].start
5282                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5283                     rex->offs[paren].end = PL_reginput - PL_bostr;
5284                     if ((U32)paren > rex->lastparen)
5285                         rex->lastparen = paren;
5286                     rex->lastcloseparen = paren;
5287                 }
5288                 else
5289                     rex->offs[paren].end = -1;
5290                 if (cur_eval && cur_eval->u.eval.close_paren &&
5291                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5292                 {
5293                     if (ST.count)
5294                         goto fake_end;
5295                     else
5296                         sayNO;
5297                 }
5298             }
5299
5300             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5301             assert(0); /* NOTREACHED */
5302
5303         case CURLYM_B_fail: /* just failed to match a B */
5304             REGCP_UNWIND(ST.cp);
5305             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5306             if (ST.minmod) {
5307                 I32 max = ARG2(ST.me);
5308                 if (max != REG_INFTY && ST.count == max)
5309                     sayNO;
5310                 goto curlym_do_A; /* try to match a further A */
5311             }
5312             /* backtrack one A */
5313             if (ST.count == ARG1(ST.me) /* min */)
5314                 sayNO;
5315             ST.count--;
5316             locinput = HOPc(locinput, -ST.alen);
5317             goto curlym_do_B; /* try to match B */
5318
5319 #undef ST
5320 #define ST st->u.curly
5321
/* CURLY_SETPAREN(paren, success): update the capture data for group
 * 'paren' of a quantified width-1 atom.  Does nothing when paren is 0.
 *
 * On success the group spans from HOPc(locinput, -1) to locinput (both
 * stored relative to PL_bostr), and lastparen / lastcloseparen are
 * advanced.  Otherwise the group's end is set to -1 and lastparen /
 * lastcloseparen are restored from the values saved in ST.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and "CURLY_SETPAREN(...);" is safe inside an unbraced if/else.  Both
 * arguments are evaluated more than once, so pass side-effect-free
 * expressions only. */
#define CURLY_SETPAREN(paren, success) \
    do { \
        if (paren) { \
            if (success) { \
                rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
                rex->offs[paren].end = locinput - PL_bostr; \
                if ((paren) > rex->lastparen) \
                    rex->lastparen = (paren); \
                rex->lastcloseparen = (paren); \
            } \
            else { \
                rex->offs[paren].end = -1; \
                rex->lastparen      = ST.lastparen; \
                rex->lastcloseparen = ST.lastcloseparen; \
            } \
        } \
    } while (0)
5337
5338         case STAR:              /*  /A*B/ where A is width 1 */
5339             ST.paren = 0;
5340             ST.min = 0;
5341             ST.max = REG_INFTY;
5342             scan = NEXTOPER(scan);
5343             goto repeat;
5344         case PLUS:              /*  /A+B/ where A is width 1 */
5345             ST.paren = 0;
5346             ST.min = 1;
5347             ST.max = REG_INFTY;
5348             scan = NEXTOPER(scan);
5349             goto repeat;
5350         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5351             ST.paren = scan->flags;     /* Which paren to set */
5352             ST.lastparen      = rex->lastparen;
5353             ST.lastcloseparen = rex->lastcloseparen;
5354             if (ST.paren > PL_regsize)
5355                 PL_regsize = ST.paren;
5356             ST.min = ARG1(scan);  /* min to match */
5357             ST.max = ARG2(scan);  /* max to match */
5358             if (cur_eval && cur_eval->u.eval.close_paren &&
5359                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5360                 ST.min=1;
5361                 ST.max=1;
5362             }
5363             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5364             goto repeat;
5365         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5366             ST.paren = 0;
5367             ST.min = ARG1(scan);  /* min to match */
5368             ST.max = ARG2(scan);  /* max to match */
5369             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5370           repeat:
5371             /*
5372             * Lookahead to avoid useless match attempts
5373             * when we know what character comes next.
5374             *
5375             * Used to only do .*x and .*?x, but now it allows
5376             * for )'s, ('s and (?{ ... })'s to be in the way
5377             * of the quantifier and the EXACT-like node.  -- japhy
5378             */
5379
5380             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5381                 sayNO;
5382             if (HAS_TEXT(next) || JUMPABLE(next)) {
5383                 U8 *s;
5384                 regnode *text_node = next;
5385
5386                 if (! HAS_TEXT(text_node))
5387                     FIND_NEXT_IMPT(text_node);
5388
5389                 if (! HAS_TEXT(text_node))
5390                     ST.c1 = ST.c2 = CHRTEST_VOID;
5391                 else {
5392                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5393                         ST.c1 = ST.c2 = CHRTEST_VOID;
5394                         goto assume_ok_easy;
5395                     }
5396                     else
5397                         s = (U8*)STRING(text_node);
5398
5399                     /*  Currently we only get here when
5400
5401                         PL_regkind[OP(text_node)] == EXACT
5402
5403                         if this changes back then the macro for IS_TEXT and
5404                         friends need to change. */
5405                     if (!UTF_PATTERN) {
5406                         ST.c1 = *s;
5407                         switch (OP(text_node)) {
5408                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5409                             case EXACTFA:
5410                             case EXACTFU_SS:
5411                             case EXACTFU_TRICKYFOLD:
5412                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5413                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5414                             default: ST.c2 = ST.c1; break;
5415                         }
5416                     }
5417                     else { /* UTF_PATTERN */
5418                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5419                              STRLEN ulen;
5420                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5421
5422                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5423                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5424                                                     uniflags);
5425                         }
5426                         else {
5427                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5428                                                      uniflags);
5429                         }
5430                     }
5431                 }
5432             }
5433             else
5434                 ST.c1 = ST.c2 = CHRTEST_VOID;
5435         assume_ok_easy:
5436
5437             ST.A = scan;
5438             ST.B = next;
5439             PL_reginput = locinput;
5440             if (minmod) {
5441                 minmod = 0;
5442                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5443                     sayNO;
5444                 ST.count = ST.min;
5445                 locinput = PL_reginput;
5446                 REGCP_SET(ST.cp);
5447                 if (ST.c1 == CHRTEST_VOID)
5448                     goto curly_try_B_min;
5449
5450                 ST.oldloc = locinput;
5451
5452                 /* set ST.maxpos to the furthest point along the
5453                  * string that could possibly match */
5454                 if  (ST.max == REG_INFTY) {
5455                     ST.maxpos = PL_regeol - 1;
5456                     if (utf8_target)
5457                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5458                             ST.maxpos--;
5459                 }
5460                 else if (utf8_target) {
5461                     int m = ST.max - ST.min;
5462                     for (ST.maxpos = locinput;
5463                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5464                         ST.maxpos += UTF8SKIP(ST.maxpos);
5465                 }
5466                 else {
5467                     ST.maxpos = locinput + ST.max - ST.min;
5468                     if (ST.maxpos >= PL_regeol)
5469                         ST.maxpos = PL_regeol - 1;
5470                 }
5471                 goto curly_try_B_min_known;
5472
5473             }
5474             else {
5475                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5476                 locinput = PL_reginput;
5477                 if (ST.count < ST.min)
5478                     sayNO;
5479                 if ((ST.count > ST.min)
5480                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5481                 {
5482                     /* A{m,n} must come at the end of the string, there's
5483                      * no point in backing off ... */
5484                     ST.min = ST.count;
5485                     /* ...except that $ and \Z can match before *and* after
5486                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5487                        We may back off by one in this case. */
5488                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5489                         ST.min--;
5490                 }
5491                 REGCP_SET(ST.cp);
5492                 goto curly_try_B_max;
5493             }
5494             assert(0); /* NOTREACHED */
5495
5496
5497         case CURLY_B_min_known_fail:
5498             /* failed to find B in a non-greedy match where c1,c2 valid */
5499
5500             PL_reginput = locinput;     /* Could be reset... */
5501             REGCP_UNWIND(ST.cp);
5502             if (ST.paren) {
5503                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5504             }
5505             /* Couldn't or didn't -- move forward. */
5506             ST.oldloc = locinput;
5507             if (utf8_target)
5508                 locinput += UTF8SKIP(locinput);
5509             else
5510                 locinput++;
5511             ST.count++;
5512           curly_try_B_min_known:
5513              /* find the next place where 'B' could work, then call B */
5514             {
5515                 int n;
5516                 if (utf8_target) {
5517                     n = (ST.oldloc == locinput) ? 0 : 1;
5518                     if (ST.c1 == ST.c2) {
5519                         STRLEN len;
5520                         /* set n to utf8_distance(oldloc, locinput) */
5521                         while (locinput <= ST.maxpos &&
5522                                utf8n_to_uvchr((U8*)locinput,
5523                                               UTF8_MAXBYTES, &len,
5524                                               uniflags) != (UV)ST.c1) {
5525                             locinput += len;
5526                             n++;
5527                         }
5528                     }
5529                     else {
5530                         /* set n to utf8_distance(oldloc, locinput) */
5531                         while (locinput <= ST.maxpos) {
5532                             STRLEN len;
5533                             const UV c = utf8n_to_uvchr((U8*)locinput,
5534                                                   UTF8_MAXBYTES, &len,
5535                                                   uniflags);
5536                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5537                                 break;
5538                             locinput += len;
5539                             n++;
5540                         }
5541                     }
5542                 }
5543                 else {
5544                     if (ST.c1 == ST.c2) {
5545                         while (locinput <= ST.maxpos &&
5546                                UCHARAT(locinput) != ST.c1)
5547                             locinput++;
5548                     }
5549                     else {
5550                         while (locinput <= ST.maxpos
5551                                && UCHARAT(locinput) != ST.c1
5552                                && UCHARAT(locinput) != ST.c2)
5553                             locinput++;
5554                     }
5555                     n = locinput - ST.oldloc;
5556                 }
5557                 if (locinput > ST.maxpos)
5558                     sayNO;
5559                 /* PL_reginput == oldloc now */
5560                 if (n) {
5561                     ST.count += n;
5562                     if (regrepeat(rex, ST.A, n, depth) < n)
5563                         sayNO;
5564                 }
5565                 PL_reginput = locinput;
5566                 CURLY_SETPAREN(ST.paren, ST.count);
5567                 if (cur_eval && cur_eval->u.eval.close_paren &&
5568                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5569                     goto fake_end;
5570                 }
5571                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5572             }
5573             assert(0); /* NOTREACHED */
5574
5575
5576         case CURLY_B_min_fail:
5577             /* failed to find B in a non-greedy match where c1,c2 invalid */
5578
5579             REGCP_UNWIND(ST.cp);
5580             if (ST.paren) {
5581                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5582             }
5583             /* failed -- move forward one */
5584             PL_reginput = locinput;
5585             if (regrepeat(rex, ST.A, 1, depth)) {
5586                 ST.count++;
5587                 locinput = PL_reginput;
5588                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5589                         ST.count > 0)) /* count overflow ? */
5590                 {
5591                   curly_try_B_min:
5592                     CURLY_SETPAREN(ST.paren, ST.count);
5593                     if (cur_eval && cur_eval->u.eval.close_paren &&
5594                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5595                         goto fake_end;
5596                     }
5597                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5598                 }
5599             }
5600             sayNO;
5601             assert(0); /* NOTREACHED */
5602
5603
5604         curly_try_B_max:
5605             /* a successful greedy match: now try to match B */
5606             if (cur_eval && cur_eval->u.eval.close_paren &&
5607                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5608                 goto fake_end;
5609             }
5610             {
5611                 UV c = 0;
5612                 if (ST.c1 != CHRTEST_VOID)
5613                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5614                                            UTF8_MAXBYTES, 0, uniflags)
5615                                 : (UV) UCHARAT(PL_reginput);
5616                 /* If it could work, try it. */
5617                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5618                     CURLY_SETPAREN(ST.paren, ST.count);
5619                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5620                     assert(0); /* NOTREACHED */
5621                 }
5622             }
5623             /* FALL THROUGH */
5624         case CURLY_B_max_fail:
5625             /* failed to find B in a greedy match */
5626
5627             REGCP_UNWIND(ST.cp);
5628             if (ST.paren) {
5629                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5630             }
5631             /*  back up. */
5632             if (--ST.count < ST.min)
5633                 sayNO;
5634             PL_reginput = locinput = HOPc(locinput, -1);
5635             goto curly_try_B_max;
5636
5637 #undef ST
5638
5639         case END:
5640             fake_end:
5641             if (cur_eval) {
5642                 /* we've just finished A in /(??{A})B/; now continue with B */
5643                 st->u.eval.toggle_reg_flags
5644                             = cur_eval->u.eval.toggle_reg_flags;
5645                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5646
5647                 st->u.eval.prev_rex = rex_sv;           /* inner */
5648                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5649                 rex_sv = cur_eval->u.eval.prev_rex;
5650                 SET_reg_curpm(rex_sv);
5651                 rex = (struct regexp *)SvANY(rex_sv);
5652                 rexi = RXi_GET(rex);
5653                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5654
5655                 REGCP_SET(st->u.eval.lastcp);
5656                 PL_reginput = locinput;
5657
5658                 /* Restore parens of the outer rex without popping the
5659                  * savestack */
5660                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5661
5662                 st->u.eval.prev_eval = cur_eval;
5663                 cur_eval = cur_eval->u.eval.prev_eval;
5664                 DEBUG_EXECUTE_r(
5665                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5666                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5667                 if ( nochange_depth )
5668                     nochange_depth--;
5669
5670                 PUSH_YES_STATE_GOTO(EVAL_AB,
5671                         st->u.eval.prev_eval->u.eval.B); /* match B */
5672             }
5673
5674             if (locinput < reginfo->till) {
5675                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5676                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5677                                       PL_colors[4],
5678                                       (long)(locinput - PL_reg_starttry),
5679                                       (long)(reginfo->till - PL_reg_starttry),
5680                                       PL_colors[5]));
5681
5682                 sayNO_SILENT;           /* Cannot match: too short. */
5683             }
5684             PL_reginput = locinput;     /* put where regtry can find it */
5685             sayYES;                     /* Success! */
5686
5687         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5688             DEBUG_EXECUTE_r(
5689             PerlIO_printf(Perl_debug_log,
5690                 "%*s  %ssubpattern success...%s\n",
5691                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5692             PL_reginput = locinput;     /* put where regtry can find it */
5693             sayYES;                     /* Success! */
5694
5695 #undef  ST
5696 #define ST st->u.ifmatch
5697
5698         case SUSPEND:   /* (?>A) */
5699             ST.wanted = 1;
5700             PL_reginput = locinput;
5701             goto do_ifmatch;
5702
5703         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5704             ST.wanted = 0;
5705             goto ifmatch_trivial_fail_test;
5706
5707         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5708             ST.wanted = 1;
5709           ifmatch_trivial_fail_test:
5710             if (scan->flags) {
5711                 char * const s = HOPBACKc(locinput, scan->flags);
5712                 if (!s) {
5713                     /* trivial fail */
5714                     if (logical) {
5715                         logical = 0;
5716                         sw = 1 - cBOOL(ST.wanted);
5717                     }
5718                     else if (ST.wanted)
5719                         sayNO;
5720                     next = scan + ARG(scan);
5721                     if (next == scan)
5722                         next = NULL;
5723                     break;
5724                 }
5725                 PL_reginput = s;
5726             }
5727             else
5728                 PL_reginput = locinput;
5729
5730           do_ifmatch:
5731             ST.me = scan;
5732             ST.logical = logical;
5733             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5734
5735             /* execute body of (?...A) */
5736             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5737             assert(0); /* NOTREACHED */
5738
5739         case IFMATCH_A_fail: /* body of (?...A) failed */
5740             ST.wanted = !ST.wanted;
5741             /* FALL THROUGH */
5742
5743         case IFMATCH_A: /* body of (?...A) succeeded */
5744             if (ST.logical) {
5745                 sw = cBOOL(ST.wanted);
5746             }
5747             else if (!ST.wanted)
5748                 sayNO;
5749
5750             if (OP(ST.me) == SUSPEND)
5751                 locinput = PL_reginput;
5752             else {
5753                 locinput = PL_reginput = st->locinput;
5754                 nextchr = UCHARAT(locinput);
5755             }
5756             scan = ST.me + ARG(ST.me);
5757             if (scan == ST.me)
5758                 scan = NULL;
5759             continue; /* execute B */
5760
5761 #undef ST
5762
5763         case LONGJMP:
5764             next = scan + ARG(scan);
5765             if (next == scan)
5766                 next = NULL;
5767             break;
5768         case COMMIT:
5769             reginfo->cutpoint = PL_regeol;
5770             /* FALLTHROUGH */
5771         case PRUNE:
5772             PL_reginput = locinput;
5773             if (!scan->flags)
5774                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5775             PUSH_STATE_GOTO(COMMIT_next,next);
5776             assert(0); /* NOTREACHED */
5777         case COMMIT_next_fail:
5778             no_final = 1;
5779             /* FALLTHROUGH */
5780         case OPFAIL:
5781             sayNO;
5782             assert(0); /* NOTREACHED */
5783
5784 #define ST st->u.mark
5785         case MARKPOINT:
5786             ST.prev_mark = mark_state;
5787             ST.mark_name = sv_commit = sv_yes_mark
5788                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5789             mark_state = st;
5790             ST.mark_loc = PL_reginput = locinput;
5791             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5792             assert(0); /* NOTREACHED */
5793         case MARKPOINT_next:
5794             mark_state = ST.prev_mark;
5795             sayYES;
5796             assert(0); /* NOTREACHED */
5797         case MARKPOINT_next_fail:
5798             if (popmark && sv_eq(ST.mark_name,popmark))
5799             {
5800                 if (ST.mark_loc > startpoint)
5801                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5802                 popmark = NULL; /* we found our mark */
5803                 sv_commit = ST.mark_name;
5804
5805                 DEBUG_EXECUTE_r({
5806                         PerlIO_printf(Perl_debug_log,
5807                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5808                             REPORT_CODE_OFF+depth*2, "",
5809                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5810                 });
5811             }
5812             mark_state = ST.prev_mark;
5813             sv_yes_mark = mark_state ?
5814                 mark_state->u.mark.mark_name : NULL;
5815             sayNO;
5816             assert(0); /* NOTREACHED */
5817         case SKIP:
5818             PL_reginput = locinput;
5819             if (scan->flags) {
5820                 /* (*SKIP) : if we fail we cut here*/
5821                 ST.mark_name = NULL;
5822                 ST.mark_loc = locinput;
5823                 PUSH_STATE_GOTO(SKIP_next,next);
5824             } else {
5825                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5826                    otherwise do nothing.  Meaning we need to scan
5827                  */
5828                 regmatch_state *cur = mark_state;
5829                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5830
5831                 while (cur) {
5832                     if ( sv_eq( cur->u.mark.mark_name,
5833                                 find ) )
5834                     {
5835                         ST.mark_name = find;
5836                         PUSH_STATE_GOTO( SKIP_next, next );
5837                     }
5838                     cur = cur->u.mark.prev_mark;
5839                 }
5840             }
5841             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5842             break;
5843         case SKIP_next_fail:
5844             if (ST.mark_name) {
5845                 /* (*CUT:NAME) - Set up to search for the name as we
5846                    collapse the stack*/
5847                 popmark = ST.mark_name;
5848             } else {
5849                 /* (*CUT) - No name, we cut here.*/
5850                 if (ST.mark_loc > startpoint)
5851                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5852                 /* but we set sv_commit to latest mark_name if there
5853                    is one so they can test to see how things lead to this
5854                    cut */
5855                 if (mark_state)
5856                     sv_commit=mark_state->u.mark.mark_name;
5857             }
5858             no_final = 1;
5859             sayNO;
5860             assert(0); /* NOTREACHED */
5861 #undef ST
5862         case LNBREAK:
5863             if ((n=is_LNBREAK(locinput,utf8_target))) {
5864                 locinput += n;
5865                 nextchr = UCHARAT(locinput);
5866             } else
5867                 sayNO;
5868             break;
5869
5870 #define CASE_CLASS(nAmE)                              \
5871         case nAmE:                                    \
5872             if (locinput >= PL_regeol)                \
5873                 sayNO;                                \
5874             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5875                 locinput += n;                        \
5876                 nextchr = UCHARAT(locinput);          \
5877             } else                                    \
5878                 sayNO;                                \
5879             break;                                    \
5880         case N##nAmE:                                 \
5881             if (locinput >= PL_regeol)                \
5882                 sayNO;                                \
5883             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5884                 sayNO;                                \
5885             } else {                                  \
5886                 locinput += UTF8SKIP(locinput);       \
5887                 nextchr = UCHARAT(locinput);          \
5888             }                                         \
5889             break
5890
5891         CASE_CLASS(VERTWS);
5892         CASE_CLASS(HORIZWS);
5893 #undef CASE_CLASS
5894
5895         default:
5896             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5897                           PTR2UV(scan), OP(scan));
5898             Perl_croak(aTHX_ "regexp memory corruption");
5899
5900         } /* end switch */
5901
5902         /* switch break jumps here */
5903         scan = next; /* prepare to execute the next op and ... */
5904         continue;    /* ... jump back to the top, reusing st */
5905         assert(0); /* NOTREACHED */
5906
5907       push_yes_state:
5908         /* push a state that backtracks on success */
5909         st->u.yes.prev_yes_state = yes_state;
5910         yes_state = st;
5911         /* FALL THROUGH */
5912       push_state:
5913         /* push a new regex state, then continue at scan  */
5914         {
5915             regmatch_state *newst;
5916
5917             DEBUG_STACK_r({
5918                 regmatch_state *cur = st;
5919                 regmatch_state *curyes = yes_state;
5920                 int curd = depth;
5921                 regmatch_slab *slab = PL_regmatch_slab;
5922                 for (;curd > -1;cur--,curd--) {
5923                     if (cur < SLAB_FIRST(slab)) {
5924                         slab = slab->prev;
5925                         cur = SLAB_LAST(slab);
5926                     }
5927                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5928                         REPORT_CODE_OFF + 2 + depth * 2,"",
5929                         curd, PL_reg_name[cur->resume_state],
5930                         (curyes == cur) ? "yes" : ""
5931                     );
5932                     if (curyes == cur)
5933                         curyes = cur->u.yes.prev_yes_state;
5934                 }
5935             } else
5936                 DEBUG_STATE_pp("push")
5937             );
5938             depth++;
5939             st->locinput = locinput;
5940             newst = st+1;
5941             if (newst >  SLAB_LAST(PL_regmatch_slab))
5942                 newst = S_push_slab(aTHX);
5943             PL_regmatch_state = newst;
5944
5945             locinput = PL_reginput;
5946             nextchr = UCHARAT(locinput);
5947             st = newst;
5948             continue;
5949             assert(0); /* NOTREACHED */
5950         }
5951     }
5952
5953     /*
5954     * We get here only if there's trouble -- normally "case END" is
5955     * the terminating point.
5956     */
5957     Perl_croak(aTHX_ "corrupted regexp pointers");
5958     /*NOTREACHED*/
5959     sayNO;
5960
5961 yes:
5962     if (yes_state) {
5963         /* we have successfully completed a subexpression, but we must now
5964          * pop to the state marked by yes_state and continue from there */
5965         assert(st != yes_state);
5966 #ifdef DEBUGGING
5967         while (st != yes_state) {
5968             st--;
5969             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5970                 PL_regmatch_slab = PL_regmatch_slab->prev;
5971                 st = SLAB_LAST(PL_regmatch_slab);
5972             }
5973             DEBUG_STATE_r({
5974                 if (no_final) {
5975                     DEBUG_STATE_pp("pop (no final)");
5976                 } else {
5977                     DEBUG_STATE_pp("pop (yes)");
5978                 }
5979             });
5980             depth--;
5981         }
5982 #else
5983         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5984             || yes_state > SLAB_LAST(PL_regmatch_slab))
5985         {
5986             /* not in this slab, pop slab */
5987             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5988             PL_regmatch_slab = PL_regmatch_slab->prev;
5989             st = SLAB_LAST(PL_regmatch_slab);
5990         }
5991         depth -= (st - yes_state);
5992 #endif
5993         st = yes_state;
5994         yes_state = st->u.yes.prev_yes_state;
5995         PL_regmatch_state = st;
5996
5997         if (no_final) {
5998             locinput= st->locinput;
5999             nextchr = UCHARAT(locinput);
6000         }
6001         state_num = st->resume_state + no_final;
6002         goto reenter_switch;
6003     }
6004
6005     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6006                           PL_colors[4], PL_colors[5]));
6007
6008     if (PL_reg_state.re_state_eval_setup_done) {
6009         /* each successfully executed (?{...}) block does the equivalent of
6010          *   local $^R = do {...}
6011          * When popping the save stack, all these locals would be undone;
6012          * bypass this by setting the outermost saved $^R to the latest
6013          * value */
6014         if (oreplsv != GvSV(PL_replgv))
6015             sv_setsv(oreplsv, GvSV(PL_replgv));
6016     }
6017     result = 1;
6018     goto final_exit;
6019
6020 no:
6021     DEBUG_EXECUTE_r(
6022         PerlIO_printf(Perl_debug_log,
6023             "%*s  %sfailed...%s\n",
6024             REPORT_CODE_OFF+depth*2, "",
6025             PL_colors[4], PL_colors[5])
6026         );
6027
6028 no_silent:
6029     if (no_final) {
6030         if (yes_state) {
6031             goto yes;
6032         } else {
6033             goto final_exit;
6034         }
6035     }
6036     if (depth) {
6037         /* there's a previous state to backtrack to */
6038         st--;
6039         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6040             PL_regmatch_slab = PL_regmatch_slab->prev;
6041             st = SLAB_LAST(PL_regmatch_slab);
6042         }
6043         PL_regmatch_state = st;
6044         locinput= st->locinput;
6045         nextchr = UCHARAT(locinput);
6046
6047         DEBUG_STATE_pp("pop");
6048         depth--;
6049         if (yes_state == st)
6050             yes_state = st->u.yes.prev_yes_state;
6051
6052         state_num = st->resume_state + 1; /* failure = success + 1 */
6053         goto reenter_switch;
6054     }
6055     result = 0;
6056
6057   final_exit:
6058     if (rex->intflags & PREGf_VERBARG_SEEN) {
6059         SV *sv_err = get_sv("REGERROR", 1);
6060         SV *sv_mrk = get_sv("REGMARK", 1);
6061         if (result) {
6062             sv_commit = &PL_sv_no;
6063             if (!sv_yes_mark)
6064                 sv_yes_mark = &PL_sv_yes;
6065         } else {
6066             if (!sv_commit)
6067                 sv_commit = &PL_sv_yes;
6068             sv_yes_mark = &PL_sv_no;
6069         }
6070         sv_setsv(sv_err, sv_commit);
6071         sv_setsv(sv_mrk, sv_yes_mark);
6072     }
6073
6074
6075     if (last_pushed_cv) {
6076         dSP;
6077         POP_MULTICALL;
6078         PERL_UNUSED_VAR(SP);
6079     }
6080
6081     /* clean up; in particular, free all slabs above current one */
6082     LEAVE_SCOPE(oldsave);
6083
6084     return result;
6085 }
6086
6087 /*
6088  - regrepeat - repeatedly match something simple, report how many
6089  */
6090 /*
6091  * [This routine assumes that each thing it matches has length 1.  That
6092  * was true before, but the count is now taken to be scan - reginput rather
6093  * than incremented per character; most UTF-8 loops use hardcount instead.]
6094  */
6095 STATIC I32
6096 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6097 {
6098     dVAR;
6099     register char *scan;
6100     register I32 c;
6101     register char *loceol = PL_regeol;
6102     register I32 hardcount = 0;
6103     register bool utf8_target = PL_reg_match_utf8;
6104     UV utf8_flags;
6105 #ifndef DEBUGGING
6106     PERL_UNUSED_ARG(depth);
6107 #endif
6108
6109     PERL_ARGS_ASSERT_REGREPEAT;
6110
6111     scan = PL_reginput;
6112     if (max == REG_INFTY)
6113         max = I32_MAX;
6114     else if (max < loceol - scan)
6115         loceol = scan + max;
6116     switch (OP(p)) {
6117     case REG_ANY:
6118         if (utf8_target) {
6119             loceol = PL_regeol;
6120             while (scan < loceol && hardcount < max && *scan != '\n') {
6121                 scan += UTF8SKIP(scan);
6122                 hardcount++;
6123             }
6124         } else {
6125             while (scan < loceol && *scan != '\n')
6126                 scan++;
6127         }
6128         break;
6129     case SANY:
6130         if (utf8_target) {
6131             loceol = PL_regeol;
6132             while (scan < loceol && hardcount < max) {
6133                 scan += UTF8SKIP(scan);
6134                 hardcount++;
6135             }
6136         }
6137         else
6138             scan = loceol;
6139         break;
6140     case CANY:
6141         scan = loceol;
6142         break;
6143     case EXACT:
6144         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6145          * means they match only characters in the string that can be expressed
6146          * as a single byte.  For non-utf8 strings, that means a simple match.
6147          * For utf8 strings, the character matched must be an invariant, or
6148          * downgradable to a single byte.  The pattern's utf8ness is
6149          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6150          * it is, it's an invariant */
6151
6152         c = (U8)*STRING(p);
6153         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6154
6155         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6156             while (scan < loceol && UCHARAT(scan) == c) {
6157                 scan++;
6158             }
6159         }
6160         else {
6161
6162             /* Here, the string is utf8, and the pattern char is different
6163              * in utf8 than not, so can't compare them directly.  Outside the
6164              * loop, find the two utf8 bytes that represent c, and then
6165              * look for those in sequence in the utf8 string */
6166             U8 high = UTF8_TWO_BYTE_HI(c);
6167             U8 low = UTF8_TWO_BYTE_LO(c);
6168             loceol = PL_regeol;
6169
6170             while (hardcount < max
6171                     && scan + 1 < loceol
6172                     && UCHARAT(scan) == high
6173                     && UCHARAT(scan + 1) == low)
6174             {
6175                 scan += 2;
6176                 hardcount++;
6177             }
6178         }
6179         break;
6180     case EXACTFA:
6181         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6182         goto do_exactf;
6183
6184     case EXACTFL:
6185         PL_reg_flags |= RF_tainted;
6186         utf8_flags = FOLDEQ_UTF8_LOCALE;
6187         goto do_exactf;
6188
6189     case EXACTF:
6190             utf8_flags = 0;
6191             goto do_exactf;
6192
6193     case EXACTFU_SS:
6194     case EXACTFU_TRICKYFOLD:
6195     case EXACTFU:
6196         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6197
6198         /* The comments for the EXACT case above apply as well to these fold
6199          * ones */
6200
6201     do_exactf:
6202         c = (U8)*STRING(p);
6203         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6204
6205         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6206             char *tmpeol = loceol;
6207             while (hardcount < max
6208                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6209                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6210             {
6211                 scan = tmpeol;
6212                 tmpeol = loceol;
6213                 hardcount++;
6214             }
6215
6216             /* XXX Note that the above handles properly the German sharp s in
6217              * the pattern matching ss in the string.  But it doesn't handle
6218              * properly cases where the string contains say 'LIGATURE ff' and
6219              * the pattern is 'f+'.  This would require, say, a new function or
6220              * revised interface to foldEQ_utf8(), in which the maximum number
6221              * of characters to match could be passed and it would return how
6222              * many actually did.  This is just one of many cases where
6223              * multi-char folds don't work properly, and so the fix is being
6224              * deferred */
6225         }
6226         else {
6227             U8 folded;
6228
6229             /* Here, the string isn't utf8 and c is a single byte; and either
6230              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6231              * doesn't affect c.  Can just do simple comparisons for exact or
6232              * fold matching. */
6233             switch (OP(p)) {
6234                 case EXACTF: folded = PL_fold[c]; break;
6235                 case EXACTFA:
6236                 case EXACTFU_TRICKYFOLD:
6237                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6238                 case EXACTFL: folded = PL_fold_locale[c]; break;
6239                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6240             }
6241             while (scan < loceol &&
6242                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6243             {
6244                 scan++;
6245             }
6246         }
6247         break;
6248     case ANYOFV:
6249     case ANYOF:
6250         if (utf8_target || OP(p) == ANYOFV) {
6251             STRLEN inclasslen;
6252             loceol = PL_regeol;
6253             inclasslen = loceol - scan;
6254             while (hardcount < max
6255                    && ((inclasslen = loceol - scan) > 0)
6256                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6257             {
6258                 scan += inclasslen;
6259                 hardcount++;
6260             }
6261         } else {
6262             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6263                 scan++;
6264         }
6265         break;
6266     case ALNUMU:
6267         if (utf8_target) {
6268     utf8_wordchar:
6269             loceol = PL_regeol;
6270             LOAD_UTF8_CHARCLASS_ALNUM();
6271             while (hardcount < max && scan < loceol &&
6272                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6273             {
6274                 scan += UTF8SKIP(scan);
6275                 hardcount++;
6276             }
6277         } else {
6278             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6279                 scan++;
6280             }
6281         }
6282         break;
6283     case ALNUM:
6284         if (utf8_target)
6285             goto utf8_wordchar;
6286         while (scan < loceol && isALNUM((U8) *scan)) {
6287             scan++;
6288         }
6289         break;
6290     case ALNUMA:
6291         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6292             scan++;
6293         }
6294         break;
6295     case ALNUML:
6296         PL_reg_flags |= RF_tainted;
6297         if (utf8_target) {
6298             loceol = PL_regeol;
6299             while (hardcount < max && scan < loceol &&
6300                    isALNUM_LC_utf8((U8*)scan)) {
6301                 scan += UTF8SKIP(scan);
6302                 hardcount++;
6303             }
6304         } else {
6305             while (scan < loceol && isALNUM_LC(*scan))
6306                 scan++;
6307         }
6308         break;
6309     case NALNUMU:
6310         if (utf8_target) {
6311
6312     utf8_Nwordchar:
6313
6314             loceol = PL_regeol;
6315             LOAD_UTF8_CHARCLASS_ALNUM();
6316             while (hardcount < max && scan < loceol &&
6317                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6318             {
6319                 scan += UTF8SKIP(scan);
6320                 hardcount++;
6321             }
6322         } else {
6323             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6324                 scan++;
6325             }
6326         }
6327         break;
6328     case NALNUM:
6329         if (utf8_target)
6330             goto utf8_Nwordchar;
6331         while (scan < loceol && ! isALNUM((U8) *scan)) {
6332             scan++;
6333         }
6334         break;
6335
6336     case POSIXA:
6337        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6338             scan++;
6339         }
6340         break;
6341     case NPOSIXA:
6342         if (utf8_target) {
6343             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6344                 scan += UTF8SKIP(scan);
6345             }
6346         }
6347         else {
6348             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6349                 scan++;
6350             }
6351         }
6352         break;
6353     case NALNUMA:
6354         if (utf8_target) {
6355             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6356                 scan += UTF8SKIP(scan);
6357             }
6358         }
6359         else {
6360             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6361                 scan++;
6362             }
6363         }
6364         break;
6365     case NALNUML:
6366         PL_reg_flags |= RF_tainted;
6367         if (utf8_target) {
6368             loceol = PL_regeol;
6369             while (hardcount < max && scan < loceol &&
6370                    !isALNUM_LC_utf8((U8*)scan)) {
6371                 scan += UTF8SKIP(scan);
6372                 hardcount++;
6373             }
6374         } else {
6375             while (scan < loceol && !isALNUM_LC(*scan))
6376                 scan++;
6377         }
6378         break;
6379     case SPACEU:
6380         if (utf8_target) {
6381
6382     utf8_space:
6383
6384             loceol = PL_regeol;
6385             LOAD_UTF8_CHARCLASS_SPACE();
6386             while (hardcount < max && scan < loceol &&
6387                    (*scan == ' ' ||
6388                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6389             {
6390                 scan += UTF8SKIP(scan);
6391                 hardcount++;
6392             }
6393             break;
6394         }
6395         else {
6396             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6397                 scan++;
6398             }
6399             break;
6400         }
6401     case SPACE:
6402         if (utf8_target)
6403             goto utf8_space;
6404
6405         while (scan < loceol && isSPACE((U8) *scan)) {
6406             scan++;
6407         }
6408         break;
6409     case SPACEA:
6410         while (scan < loceol && isSPACE_A((U8) *scan)) {
6411             scan++;
6412         }
6413         break;
6414     case SPACEL:
6415         PL_reg_flags |= RF_tainted;
6416         if (utf8_target) {
6417             loceol = PL_regeol;
6418             while (hardcount < max && scan < loceol &&
6419                    isSPACE_LC_utf8((U8*)scan)) {
6420                 scan += UTF8SKIP(scan);
6421                 hardcount++;
6422             }
6423         } else {
6424             while (scan < loceol && isSPACE_LC(*scan))
6425                 scan++;
6426         }
6427         break;
6428     case NSPACEU:
6429         if (utf8_target) {
6430
6431     utf8_Nspace:
6432
6433             loceol = PL_regeol;
6434             LOAD_UTF8_CHARCLASS_SPACE();
6435             while (hardcount < max && scan < loceol &&
6436                    ! (*scan == ' ' ||
6437                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6438             {
6439                 scan += UTF8SKIP(scan);
6440                 hardcount++;
6441             }
6442             break;
6443         }
6444         else {
6445             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6446                 scan++;
6447             }
6448         }
6449         break;
6450     case NSPACE:
6451         if (utf8_target)
6452             goto utf8_Nspace;
6453
6454         while (scan < loceol && ! isSPACE((U8) *scan)) {
6455             scan++;
6456         }
6457         break;
6458     case NSPACEA:
6459         if (utf8_target) {
6460             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6461                 scan += UTF8SKIP(scan);
6462             }
6463         }
6464         else {
6465             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6466                 scan++;
6467             }
6468         }
6469         break;
6470     case NSPACEL:
6471         PL_reg_flags |= RF_tainted;
6472         if (utf8_target) {
6473             loceol = PL_regeol;
6474             while (hardcount < max && scan < loceol &&
6475                    !isSPACE_LC_utf8((U8*)scan)) {
6476                 scan += UTF8SKIP(scan);
6477                 hardcount++;
6478             }
6479         } else {
6480             while (scan < loceol && !isSPACE_LC(*scan))
6481                 scan++;
6482         }
6483         break;
6484     case DIGIT:
6485         if (utf8_target) {
6486             loceol = PL_regeol;
6487             LOAD_UTF8_CHARCLASS_DIGIT();
6488             while (hardcount < max && scan < loceol &&
6489                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6490                 scan += UTF8SKIP(scan);
6491                 hardcount++;
6492             }
6493         } else {
6494             while (scan < loceol && isDIGIT(*scan))
6495                 scan++;
6496         }
6497         break;
6498     case DIGITA:
6499         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6500             scan++;
6501         }
6502         break;
6503     case DIGITL:
6504         PL_reg_flags |= RF_tainted;
6505         if (utf8_target) {
6506             loceol = PL_regeol;
6507             while (hardcount < max && scan < loceol &&
6508                    isDIGIT_LC_utf8((U8*)scan)) {
6509                 scan += UTF8SKIP(scan);
6510                 hardcount++;
6511             }
6512         } else {
6513             while (scan < loceol && isDIGIT_LC(*scan))
6514                 scan++;
6515         }
6516         break;
6517     case NDIGIT:
6518         if (utf8_target) {
6519             loceol = PL_regeol;
6520             LOAD_UTF8_CHARCLASS_DIGIT();
6521             while (hardcount < max && scan < loceol &&
6522                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6523                 scan += UTF8SKIP(scan);
6524                 hardcount++;
6525             }
6526         } else {
6527             while (scan < loceol && !isDIGIT(*scan))
6528                 scan++;
6529         }
6530         break;
6531     case NDIGITA:
6532         if (utf8_target) {
6533             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6534                 scan += UTF8SKIP(scan);
6535             }
6536         }
6537         else {
6538             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6539                 scan++;
6540             }
6541         }
6542         break;
6543     case NDIGITL:
6544         PL_reg_flags |= RF_tainted;
6545         if (utf8_target) {
6546             loceol = PL_regeol;
6547             while (hardcount < max && scan < loceol &&
6548                    !isDIGIT_LC_utf8((U8*)scan)) {
6549                 scan += UTF8SKIP(scan);
6550                 hardcount++;
6551             }
6552         } else {
6553             while (scan < loceol && !isDIGIT_LC(*scan))
6554                 scan++;
6555         }
6556         break;
6557     case LNBREAK:
6558         if (utf8_target) {
6559             loceol = PL_regeol;
6560             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6561                 scan += c;
6562                 hardcount++;
6563             }
6564         } else {
6565             /*
6566               LNBREAK can match two latin chars, which is ok,
6567               because we have a null terminated string, but we
6568               have to use hardcount in this situation
6569             */
6570             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6571                 scan+=c;
6572                 hardcount++;
6573             }
6574         }
6575         break;
6576     case HORIZWS:
6577         if (utf8_target) {
6578             loceol = PL_regeol;
6579             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6580                 scan += c;
6581                 hardcount++;
6582             }
6583         } else {
6584             while (scan < loceol && is_HORIZWS_latin1(scan))
6585                 scan++;
6586         }
6587         break;
6588     case NHORIZWS:
6589         if (utf8_target) {
6590             loceol = PL_regeol;
6591             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6592                 scan += UTF8SKIP(scan);
6593                 hardcount++;
6594             }
6595         } else {
6596             while (scan < loceol && !is_HORIZWS_latin1(scan))
6597                 scan++;
6598
6599         }
6600         break;
6601     case VERTWS:
6602         if (utf8_target) {
6603             loceol = PL_regeol;
6604             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6605                 scan += c;
6606                 hardcount++;
6607             }
6608         } else {
6609             while (scan < loceol && is_VERTWS_latin1(scan))
6610                 scan++;
6611
6612         }
6613         break;
6614     case NVERTWS:
6615         if (utf8_target) {
6616             loceol = PL_regeol;
6617             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6618                 scan += UTF8SKIP(scan);
6619                 hardcount++;
6620             }
6621         } else {
6622             while (scan < loceol && !is_VERTWS_latin1(scan))
6623                 scan++;
6624
6625         }
6626         break;
6627
6628     default:            /* Called on something of 0 width. */
6629         break;          /* So match right here or not at all. */
6630     }
6631
6632     if (hardcount)
6633         c = hardcount;
6634     else
6635         c = scan - PL_reginput;
6636     PL_reginput = scan;
6637
6638     DEBUG_r({
6639         GET_RE_DEBUG_FLAGS_DECL;
6640         DEBUG_EXECUTE_r({
6641             SV * const prop = sv_newmortal();
6642             regprop(prog, prop, p);
6643             PerlIO_printf(Perl_debug_log,
6644                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6645                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6646         });
6647     });
6648
6649     return(c);
6650 }
6651
6652
6653 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6654 /* - regclass_swash - prepare the utf8 swash.  Hands back a newSVsv() copy of
6655  * the shared core version's swash, so caller changes won't alter that one */
6656 SV *
6657 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6658 {
6659     SV *shared_swash;
6660     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6661     shared_swash = core_regclass_swash(prog, node, doinit, listsvp, altsvp);
6662     return newSVsv(shared_swash);
6663 }
6664 #endif
6665
6666 STATIC SV *
6667 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6668 {
6669     /* Returns the swash for the input 'node' in the regex 'prog'.
6670      * If <doinit> is true, will attempt to create the swash if not already
6671      *    done.
6672      * If <listsvp> is non-null, will return the swash initialization string in
6673      *    it.
6674      * If <altsvp> is non-null, will return the alternates to the regular swash
6675      *    in it
6676      * Tied intimately to how regcomp.c sets up the data structure
          *
          * The return value is NULL if no swash was found and none was created
          * (no usable data entry for the node, or <doinit> false, or no
          * initialization string).
          * When <listsvp> is non-null it always receives a newly created SV,
          * which may hold just an empty string; if a set-up swash with an
          * "INVLIST" entry exists, that inversion list's contents are used in
          * place of the initialization string.
          * When <altsvp> is non-null it receives NULL if the node has no
          * multi-char fold array. */
6677
6678     dVAR;
6679     SV *sw  = NULL;     /* the swash that gets returned */
6680     SV *si  = NULL;     /* swash initialization string, ary[0] */
6681     SV *alt = NULL;     /* multi-char fold array, ary[2], if any */
6682     SV*  invlist = NULL;    /* inversion list: ary[3], or the swash's INVLIST */
6683
6684     RXi_GET_DECL(prog,progi);
6685     const struct reg_data * const data = prog ? progi->data : NULL;
6686
6687     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6688
6689     assert(ANYOF_NONBITMAP(node));
6690
         /* Only look anything up if the regex carries auxiliary data */
6691     if (data && data->count) {
6692         const U32 n = ARG(node);    /* index of this node's entry in 'data' */
6693
             /* Only entries tagged 's' are handled here */
6694         if (data->what[n] == 's') {
6695             SV * const rv = MUTABLE_SV(data->data[n]);
6696             AV * const av = MUTABLE_AV(SvRV(rv));
6697             SV **const ary = AvARRAY(av);
6698             bool invlist_has_user_defined_property;
6699
6700             si = *ary;  /* ary[0] = the string to initialize the swash with */
6701
6702             /* Elements 3 and 4 are either both present or both absent. [3] is
6703              * any inversion list generated at compile time; [4] indicates if
6704              * that inversion list has any user-defined properties in it. */
6705             if (av_len(av) >= 3) {
6706                 invlist = ary[3];
6707                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6708             }
6709             else {
6710                 invlist = NULL;
6711                 invlist_has_user_defined_property = FALSE;
6712             }
6713
6714             /* Element [1] is reserved for the set-up swash.  If already there,
6715              * return it; if not, create it and store it there */
6716             if (SvROK(ary[1])) {
6717                 sw = ary[1];
6718             }
6719             else if (si && doinit) {
6720
6721                 sw = _core_swash_init("utf8", /* the utf8 package */
6722                                       "", /* nameless */
6723                                       si,
6724                                       1, /* binary */
6725                                       0, /* not from tr/// */
6726                                       FALSE, /* is error if can't find
6727                                                 property */
6728                                       invlist,
6729                                       invlist_has_user_defined_property);
6730                 (void)av_store(av, 1, sw);
6731             }
6732
6733             /* Element [2] is for any multi-char folds.  Note that is a
6734              * fundamentally flawed design, because can't backtrack and try
6735              * again.  See [perl #89774] */
6736             if (SvTYPE(ary[2]) == SVt_PVAV) {
6737                 alt = ary[2];
6738             }
6739         }
6740     }
6741
         /* Build the string handed back through listsvp, if it was requested */
6742     if (listsvp) {
6743         SV* matches_string = newSVpvn("", 0);
6744         SV** invlistsvp;
6745
6746         /* Use the swash, if any, which has to have incorporated into it all
6747          * possibilities */
6748         if (   sw
6749             && SvROK(sw)
6750             && SvTYPE(SvRV(sw)) == SVt_PVHV
6751             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6752         {
6753             invlist = *invlistsvp;
6754         }
6755         else if (si && si != &PL_sv_undef) {
6756
6757             /* If no swash, use the input initialization string, if available */
6758             sv_catsv(matches_string, si);
6759         }
6760
6761         /* Add the inversion list to whatever we have.  This may have come from
6762          * the swash, or from an input parameter */
6763         if (invlist) {
                 /* NOTE(review): the SV returned by _invlist_contents() is not
                  * released here -- confirm who owns it */
6764             sv_catsv(matches_string, _invlist_contents(invlist));
6765         }
6766         *listsvp = matches_string;
6767     }
6768
6769     if (altsvp)
6770         *altsvp  = alt;
6771
6772     return sw;
6773 }
6774
6775 /*
6776  - reginclass - determine if a character falls into a character class
6777
       prog is the regex that n belongs to; it is only passed on to
         core_regclass_swash()
6778   n is the ANYOF regnode
6779   p is the target string
6780   lenp is pointer to the maximum number of bytes of how far to go in p
6781     (This is assumed without checking to always be at least the current
6782     character's size)
6783   utf8_target tells whether p is in UTF-8.
6784
6785   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6786   from a successful match, the value it points to will be updated to how many
6787   bytes in p were matched.  If there was no match, the value is undefined,
6788   possibly changed from the input.
6789
6790   Note that this can be a synthetic start class, a combination of various
6791   nodes, so things you think might be mutually exclusive, such as locale,
6792   aren't.  It can match both locale and non-locale
6793
       Croaks with "Malformed UTF-8 character (fatal)" if utf8_target is set, the
       first byte at p is not UTF8_IS_INVARIANT, and utf8n_to_uvchr() reports a
       length of (STRLEN)-1 for the character at p.

6794  */
6795
6796 STATIC bool
6797 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6798 {
6799     dVAR;
6800     const char flags = ANYOF_FLAGS(n);
6801     bool match = FALSE;
6802     UV c = *p;          /* first byte for now; becomes the code point below */
6803     STRLEN c_len = 0;   /* number of bytes the first character occupies in p */
6804     STRLEN maxlen;      /* number of bytes of p that may be looked at */
6805
6806     PERL_ARGS_ASSERT_REGINCLASS;
6807
6808     /* If c is not already the code point, get it */
6809     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6810         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6811                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6812                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6813                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6814                  * UTF8_ALLOW_FFFF */
6815         if (c_len == (STRLEN)-1)
6816             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6817     }
6818     else {
6819         c_len = 1;
6820     }
6821
6822     /* Use passed in max length, or one character if none passed in or less
6823      * than one character.  And assume will match just one character.  This is
6824      * overwritten later if matched more. */
6825     if (lenp) {
6826         maxlen = (*lenp > c_len) ? *lenp : c_len;
6827         *lenp = c_len;
6828
6829     }
6830     else {
6831         maxlen = c_len;
6832     }
6833
6834     /* If this character is potentially in the bitmap, check it */
6835     if (c < 256) {
6836         if (ANYOF_BITMAP_TEST(n, c))
6837             match = TRUE;
6838         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6839                 && ! utf8_target
6840                 && ! isASCII(c))
6841         {
6842             match = TRUE;
6843         }
6844
6845         else if (flags & ANYOF_LOCALE) {
6846             PL_reg_flags |= RF_tainted;
6847
6848             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6849                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6850             {
6851                 match = TRUE;
6852             }
6853             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6854                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6855                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6856                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6857                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6858                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6859                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6860                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6861                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6862                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6863                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6864                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6865                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6866                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6867                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6868                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6869                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6870                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6871                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6872                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6873                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6874                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6875                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6876                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6877                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6878                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6879                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6880                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6881                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6882                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6883                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6884                      ) /* How's that for a conditional? */
6885             ) {
6886                 match = TRUE;
6887             }
6888         }
6889     }
6890
6891     /* If the bitmap didn't (or couldn't) match, and something outside the
6892      * bitmap could match, try that.  Locale nodes specify completely the
6893      * behavior of code points in the bit map (otherwise, a utf8 target would
6894      * cause them to be treated as Unicode and not locale), except in
6895      * the very unlikely event when this node is a synthetic start class, which
6896      * could be a combination of locale and non-locale nodes.  So allow locale
6897      * to match for the synthetic start class, which will give a false
6898      * positive that will be resolved when the match is done again as not part
6899      * of the synthetic start class */
6900     if (!match) {
6901         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6902             match = TRUE;       /* Everything above 255 matches */
6903         }
6904         else if (ANYOF_NONBITMAP(n)
6905                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6906                      || (utf8_target
6907                          && (c >=256
6908                              || (! (flags & ANYOF_LOCALE))
6909                              || (flags & ANYOF_IS_SYNTHETIC)))))
6910         {
                 /* 'av' is filled in through the altsvp argument: the
                  * multi-char fold array, or NULL if there is none */
6911             AV *av;
6912             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6913
6914             if (sw) {
6915                 U8 * utf8_p;
6916                 if (utf8_target) {
6917                     utf8_p = (U8 *) p;
6918                 } else {
6919
6920                     /* Not utf8.  Convert as much of the string as available up
6921                      * to the limit of how far the (single) character in the
6922                      * pattern can possibly match (no need to go further).  If
6923                      * the node is a straight ANYOF or not folding, it can't
6924                      * match more than one.  Otherwise, It can match up to how
6925                      * far a single char can fold to.  Since not utf8, each
6926                      * character is a single byte, so the max it can be in
6927                      * bytes is the same as the max it can be in characters */
6928                     STRLEN len = (OP(n) == ANYOF
6929                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6930                                   ? 1
6931                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6932                                     ? maxlen
6933                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6934                     utf8_p = bytes_to_utf8(p, &len);
6935                 }
6936
6937                 if (swash_fetch(sw, utf8_p, TRUE))
6938                     match = TRUE;
6939                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6940
6941                     /* Here, we need to test if the fold of the target string
6942                      * matches.  The non-multi char folds have all been moved to
6943                      * the compilation phase, and the multi-char folds have
6944                      * been stored by regcomp into 'av'; we linearly check to
6945                      * see if any match the target string (folded).   We know
6946                      * that the originals were each one character, but we don't
6947                      * currently know how many characters/bytes each folded to,
6948                      * except we do know that there are small limits imposed by
6949                      * Unicode.  XXX A performance enhancement would be to have
6950                      * regcomp.c store the max number of chars/bytes that are
6951                      * in an av entry, as, say the 0th element.  Even better
6952                      * would be to have a hash of the few characters that can
6953                      * start a multi-char fold to the max number of chars of
6954                      * those folds.
6955                      *
6956                      * If there is a match, we will need to advance (if lenp is
6957                      * specified) the match pointer in the target string.  But
6958                      * what we are comparing here isn't that string directly,
6959                      * but its fold, whose length may differ from the original.
6960                      * As we go along in constructing the fold, therefore, we
6961                      * create a map so that we know how many bytes in the
6962                      * source to advance given that we have matched a certain
6963                      * number of bytes in the fold.  This map is stored in
6964                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6965                      * the fold of the first character that we are folding.
6966                      * Then map_fold_len_back[n] is set to the number of bytes
6967                      * in that first character.  Similarly let m be the
6968                      * corresponding number for the second character to be
6969                      * folded.  Then map_fold_len_back[n+m] is set to the
6970                      * number of bytes occupied by the first two source
6971                      * characters. ... */
6972                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6973                     U8 folded[UTF8_MAXBYTES_CASE+1];
6974                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6975                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6976                                                   chars */
6977
6978                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6979
6980                         /* Here, only need to fold the first char of the target
6981                          * string.  If the source wasn't utf8, is 1 byte long */
6982                         to_utf8_fold(utf8_p, folded, &foldlen);
6983                         total_foldlen = foldlen;
6984                         map_fold_len_back[foldlen] = (utf8_target)
6985                                                      ? UTF8SKIP(utf8_p)
6986                                                      : 1;
6987                     }
6988                     else {
6989
6990                         /* Here, need to fold more than the first char.  Do so
6991                          * up to the limits */
6992                         U8* source_ptr = utf8_p;    /* The source for the fold
6993                                                        is the regex target
6994                                                        string */
6995                         U8* folded_ptr = folded;
6996                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6997                                                        available byte in the
6998                                                        target string */
6999                         U8 i;
7000                         for (i = 0;
7001                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7002                              i++)
7003                         {
7004
7005                             /* Fold the next character */
7006                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7007                             STRLEN this_char_foldlen;
7008                             to_utf8_fold(source_ptr,
7009                                          this_char_folded,
7010                                          &this_char_foldlen);
7011
7012                             /* Bail if it would exceed the byte limit for
7013                              * folding a single char. */
7014                             if (this_char_foldlen + folded_ptr - folded >
7015                                                             UTF8_MAXBYTES_CASE)
7016                             {
7017                                 break;
7018                             }
7019
7020                             /* Add the fold of this character */
7021                             Copy(this_char_folded,
7022                                  folded_ptr,
7023                                  this_char_foldlen,
7024                                  U8);
7025                             source_ptr += UTF8SKIP(source_ptr);
7026                             folded_ptr += this_char_foldlen;
7027                             total_foldlen = folded_ptr - folded;
7028
7029                             /* Create map from the number of bytes in the fold
7030                              * back to the number of bytes in the source.  If
7031                              * the source isn't utf8, the byte count is just
7032                              * the number of characters so far */
7033                             map_fold_len_back[total_foldlen]
7034                                                       = (utf8_target)
7035                                                         ? source_ptr - utf8_p
7036                                                         : i + 1;
7037                         }
7038                         *folded_ptr = '\0';
7039                     }
7040
7041
7042                     /* Do the linear search to see if the fold is in the list
7043                      * of multi-char folds. */
7044                     if (av) {
7045                         I32 i;
7046                         for (i = 0; i <= av_len(av); i++) {
7047                             SV* const sv = *av_fetch(av, i, FALSE);
7048                             STRLEN len;
7049                             const char * const s = SvPV_const(sv, len);
7050
7051                             if (len <= total_foldlen
7052                                 && memEQ(s, (char*)folded, len)
7053
7054                                    /* If 0, means matched a partial char. See
7055                                     * [perl #90536] */
7056                                 && map_fold_len_back[len])
7057                             {
7058
7059                                 /* Advance the target string ptr to account for
7060                                  * this fold, but have to translate from the
7061                                  * folded length to the corresponding source
7062                                  * length. */
7063                                 if (lenp) {
7064                                     *lenp = map_fold_len_back[len];
7065                                 }
7066                                 match = TRUE;
7067                                 break;
7068                             }
7069                         }
7070                     }
7071                 }
7072
7073                 /* If we allocated a string above, free it */
7074                 if (! utf8_target) Safefree(utf8_p);
7075             }
7076         }
7077
             /* This is reached whenever the bitmap tests did not match, whether
              * or not the non-bitmap tests just above then did */
7078         if (UNICODE_IS_SUPER(c)
7079             && (flags & ANYOF_WARN_SUPER)
7080             && ckWARN_d(WARN_NON_UNICODE))
7081         {
7082             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7083                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7084         }
7085     }
7086
         /* An inverted class returns the opposite of what the tests found */
7087     return (flags & ANYOF_INVERT) ? !match : match;
7088 }
7089
7090 STATIC U8 *
7091 S_reghop3(U8 *s, I32 off, const U8* lim)
7092 {
7093     /* return the position 'off' UTF-8 characters away from 's', forward if
7094      * 'off' >= 0, backwards if negative.  But don't go outside of position
7095      * 'lim', which better be < s  if off < 0 */
7096
7097     dVAR;
7098
7099     PERL_ARGS_ASSERT_REGHOP3;
7100
7101     if (off >= 0) {
7102         while (off-- && s < lim) {
7103             /* XXX could check well-formedness here */
7104             s += UTF8SKIP(s);
7105         }
7106     }
7107     else {
7108         while (off++ && s > lim) {
7109             s--;
7110             if (UTF8_IS_CONTINUED(*s)) {
7111                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7112                     s--;
7113             }
7114             /* XXX could check well-formedness here */
7115         }
7116     }
7117     return s;
7118 }
7119
#ifdef XXX_dmq
/* Several places currently make two reghop3() calls where a single call to
   this routine would do.  That conversion has not been carried out yet, so
   the routine stays compiled out for now - dmq
*/
STATIC U8 *
S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
{
    /* Move 'off' UTF-8 characters from 's': forward when 'off' >= 0 (never
     * starting a hop at or beyond 'rlim'), backward when 'off' < 0 (never
     * going below 'llim').  Returns the position reached. */

    dVAR;

    PERL_ARGS_ASSERT_REGHOP4;

    if (off < 0) {
        for (; off < 0 && s > llim; off++) {
            s--;
            if (! UTF8_IS_CONTINUED(*s))
                continue;
            /* Back up to the start of the character, staying >= llim */
            while (s > llim && UTF8_IS_CONTINUATION(*s))
                s--;
            /* XXX could check well-formedness here */
        }
        return s;
    }

    for (; off > 0 && s < rlim; off--) {
        /* XXX could check well-formedness here */
        s += UTF8SKIP(s);
    }
    return s;
}
#endif
7151
7152 STATIC U8 *
7153 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7154 {
7155     dVAR;
7156
7157     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7158
7159     if (off >= 0) {
7160         while (off-- && s < lim) {
7161             /* XXX could check well-formedness here */
7162             s += UTF8SKIP(s);
7163         }
7164         if (off >= 0)
7165             return NULL;
7166     }
7167     else {
7168         while (off++ && s > lim) {
7169             s--;
7170             if (UTF8_IS_CONTINUED(*s)) {
7171                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7172                     s--;
7173             }
7174             /* XXX could check well-formedness here */
7175         }
7176         if (off <= 0)
7177             return NULL;
7178     }
7179     return s;
7180 }
7181
7182 static void
7183 restore_pos(pTHX_ void *arg)
7184 {
7185     dVAR;
7186     regexp * const rex = (regexp *)arg;
7187     if (PL_reg_state.re_state_eval_setup_done) {
7188         if (PL_reg_oldsaved) {
7189             rex->subbeg = PL_reg_oldsaved;
7190             rex->sublen = PL_reg_oldsavedlen;
7191 #ifdef PERL_OLD_COPY_ON_WRITE
7192             rex->saved_copy = PL_nrs;
7193 #endif
7194             RXp_MATCH_COPIED_on(rex);
7195         }
7196         PL_reg_magic->mg_len = PL_reg_oldpos;
7197         PL_reg_state.re_state_eval_setup_done = FALSE;
7198         PL_curpm = PL_reg_oldcurpm;
7199     }
7200 }
7201
7202 STATIC void
7203 S_to_utf8_substr(pTHX_ register regexp *prog)
7204 {
7205     int i = 1;
7206
7207     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7208
7209     do {
7210         if (prog->substrs->data[i].substr
7211             && !prog->substrs->data[i].utf8_substr) {
7212             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7213             prog->substrs->data[i].utf8_substr = sv;
7214             sv_utf8_upgrade(sv);
7215             if (SvVALID(prog->substrs->data[i].substr)) {
7216                 if (SvTAIL(prog->substrs->data[i].substr)) {
7217                     /* Trim the trailing \n that fbm_compile added last
7218                        time.  */
7219                     SvCUR_set(sv, SvCUR(sv) - 1);
7220                     /* Whilst this makes the SV technically "invalid" (as its
7221                        buffer is no longer followed by "\0") when fbm_compile()
7222                        adds the "\n" back, a "\0" is restored.  */
7223                     fbm_compile(sv, FBMcf_TAIL);
7224                 } else
7225                     fbm_compile(sv, 0);
7226             }
7227             if (prog->substrs->data[i].substr == prog->check_substr)
7228                 prog->check_utf8 = sv;
7229         }
7230     } while (i--);
7231 }
7232
7233 STATIC void
7234 S_to_byte_substr(pTHX_ register regexp *prog)
7235 {
7236     dVAR;
7237     int i = 1;
7238
7239     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7240
7241     do {
7242         if (prog->substrs->data[i].utf8_substr
7243             && !prog->substrs->data[i].substr) {
7244             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7245             if (sv_utf8_downgrade(sv, TRUE)) {
7246                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7247                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7248                         /* Trim the trailing \n that fbm_compile added last
7249                            time.  */
7250                         SvCUR_set(sv, SvCUR(sv) - 1);
7251                         fbm_compile(sv, FBMcf_TAIL);
7252                     } else
7253                         fbm_compile(sv, 0);
7254                 }
7255             } else {
7256                 SvREFCNT_dec(sv);
7257                 sv = &PL_sv_undef;
7258             }
7259             prog->substrs->data[i].substr = sv;
7260             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7261                 prog->check_substr = sv;
7262         }
7263     } while (i--);
7264 }
7265
7266 /*
7267  * Local variables:
7268  * c-indentation-style: bsd
7269  * c-basic-offset: 4
7270  * indent-tabs-mode: nil
7271  * End:
7272  *
7273  * ex: set ts=8 sts=4 sw=4 et:
7274  */