regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "inline_invlist.c"
  84 #include "unicode_constants.h"
  85
  86 #define RF_tainted      1       /* tainted information used? e.g. locale */
  87 #define RF_warned       2               /* warned about big count? */
  88
  89 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  90
  91 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124
 125 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 126 #define NEXTCHR_IS_EOS (nextchr < 0)
 127
 128 #define SET_nextchr \
 129     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 130
 131 #define SET_locinput(p) \
 132     locinput = (p);  \
 133     SET_nextchr
 134
 135
 136 /* these are unrolled below in the CCC_TRY_XXX defined */
 137 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 138     if (!CAT2(PL_utf8_,class)) { \
 139         bool ok; \
 140         ENTER; save_re_context(); \
 141         ok=CAT2(is_utf8_,class)((const U8*)str); \
 142         PERL_UNUSED_VAR(ok); \
 143         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 144 /* Doesn't do an assert to verify that is correct */
 145 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 146     if (!CAT2(PL_utf8_,class)) { \
 147         bool throw_away PERL_UNUSED_DECL; \
 148         ENTER; save_re_context(); \
 149         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 150         LEAVE; } } STMT_END
 151
 152 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 153 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 154 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 155
 156 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 157         /* No asserts are done for some of these, in case called on a   */  \
 158         /* Unicode version in which they map to nothing */                  \
 159         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 160         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 161
 162 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 163
 164 /* The actual code for CCC_TRY, which uses several variables from the routine
 165  * it's callable from.  It is designed to be the bulk of a case statement.
 166  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 167  *      nextchr matches the class.
 168  * UTF8_TEST is the whole test string to use for utf8 targets
 169  * LOAD is what to use to test, and if not present to load in the swash for the
 170  *      class
 171  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 172  *      UTF8_TEST test.
 173  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 174  * utf8 and a variant, load the swash if necessary and test using the utf8
 175  * test.  Advance to the next character if test is ok, otherwise fail; If not
 176  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 177  * fails, or advance to the next character */
 178
 179 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 180     if (NEXTCHR_IS_EOS) {                                              \
 181         sayNO;                                                                \
 182     }                                                                         \
 183     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 184         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 185         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 186             sayNO;                                                            \
 187         }                                                                     \
 188     }                                                                         \
 189     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 190             sayNO;                                                            \
 191     }                                                                         \
 192     goto increment_locinput;
 193
 194 /* Handle the non-locale cases for a character class and its complement.  It
 195  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 196  * This is because that code fails when the test succeeds, so we want to have
 197  * the test fail so that the code succeeds.  The swash is stored in a
 198  * predictable PL_ place */
 199 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 200                            CLASS, STR)                                        \
 201     case NAME:                                                                \
 202         _CCC_TRY_CODE( !, FUNC,                                               \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206     case NNAME:                                                               \
 207         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 208                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 209                                             (U8*)locinput, TRUE)),            \
 210                           CLASS, STR)                                         \
 211
 212 /* Generate the case statements for both locale and non-locale character
 213  * classes in regmatch for classes that don't have special unicode semantics.
 214  * Locales don't use an immediate swash, but an intermediary special locale
 215  * function that is called on the pointer to the current place in the input
 216  * string.  That function will resolve to needing the same swash.  One might
 217  * think that because we don't know what the locale will match, we shouldn't
 218  * check with the swash loading function that it loaded properly; ie, that we
 219  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 220  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 221  * irrelevant here */
 222 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 223                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 224                 NAMEA, NNAMEA, FUNCA,                                         \
 225                 CLASS, STR)                                                   \
 226     case NAMEL:                                                               \
 227         PL_reg_flags |= RF_tainted;                                           \
 228         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 229     case NNAMEL:                                                              \
 230         PL_reg_flags |= RF_tainted;                                           \
 231         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 232                        CLASS, STR)                                            \
 233     case NAMEA:                                                               \
 234         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 235             sayNO;                                                            \
 236         }                                                                     \
 237         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 238         locinput++;                                        \
 239         break;                                                                \
 240     case NNAMEA:                                                              \
 241         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 242             sayNO;                                                            \
 243         }                                                                     \
 244         goto increment_locinput;                                              \
 245     /* Generate the non-locale cases */                                       \
 246     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 247
 248 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 249  * statements to handle separate Unicode semantics nodes */
 250 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 251                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 252                   NAMEU, NNAMEU, FUNCU,                                        \
 253                   NAMEA, NNAMEA, FUNCA,                                        \
 254                   CLASS, STR)                                                  \
 255     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 256             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 257             NAMEA, NNAMEA, FUNCA,                                              \
 258             CLASS, STR)                                                        \
 259     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 260
 261 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 262
 263 /* for use after a quantifier and before an EXACT-like node -- japhy */
 264 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 265  *
 266  * NOTE that *nothing* that affects backtracking should be in here, specifically
 267  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 268  * node that is in between two EXACT like nodes when ascertaining what the required
 269  * "follow" character is. This should probably be moved to regex compile time
 270  * although it may be done at run time beause of the REF possibility - more
 271  * investigation required. -- demerphq
 272 */
 273 #define JUMPABLE(rn) (      \
 274     OP(rn) == OPEN ||       \
 275     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 276     OP(rn) == EVAL ||   \
 277     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 278     OP(rn) == PLUS || OP(rn) == MINMOD || \
 279     OP(rn) == KEEPS || \
 280     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 281 )
 282 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 283
 284 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 285
 286 #if 0
 287 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 288    we don't need this definition. */
 289 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 290 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 291 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 292
 293 #else
 294 /* ... so we use this as its faster. */
 295 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 296 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 297 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 299
 300 #endif
 301
 302 /*
 303   Search for mandatory following text node; for lookahead, the text must
 304   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 305 */
 306 #define FIND_NEXT_IMPT(rn) STMT_START { \
 307     while (JUMPABLE(rn)) { \
 308         const OPCODE type = OP(rn); \
 309         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 310             rn = NEXTOPER(NEXTOPER(rn)); \
 311         else if (type == PLUS) \
 312             rn = NEXTOPER(rn); \
 313         else if (type == IFMATCH) \
 314             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 315         else rn += NEXT_OFF(rn); \
 316     } \
 317 } STMT_END
 318
 319
 320 static void restore_pos(pTHX_ void *arg);
 321
 322 #define REGCP_PAREN_ELEMS 3
 323 #define REGCP_OTHER_ELEMS 3
 324 #define REGCP_FRAME_ELEMS 1
 325 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 326  * are needed for the regexp context stack bookkeeping. */
 327
 328 STATIC CHECKPOINT
 329 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 330 {
 331     dVAR;
 332     const int retval = PL_savestack_ix;
 333     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 334     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 335     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 336     I32 p;
 337     GET_RE_DEBUG_FLAGS_DECL;
 338
 339     PERL_ARGS_ASSERT_REGCPPUSH;
 340
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 343                    paren_elems_to_push);
 344
 345     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 346         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 347                    " out of range (%lu-%ld)",
 348                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 349
 350     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 351
 352     DEBUG_BUFFERS_r(
 353         if ((int)PL_regsize > (int)parenfloor)
 354             PerlIO_printf(Perl_debug_log,
 355                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 356                 PTR2UV(rex),
 357                 PTR2UV(rex->offs)
 358             );
 359     );
 360     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 362         SSPUSHINT(rex->offs[p].end);
 363         SSPUSHINT(rex->offs[p].start);
 364         SSPUSHINT(rex->offs[p].start_tmp);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 367             (UV)p,
 368             (IV)rex->offs[p].start,
 369             (IV)rex->offs[p].start_tmp,
 370             (IV)rex->offs[p].end
 371         ));
 372     }
 373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(rex->lastparen);
 376     SSPUSHINT(rex->lastcloseparen);
 377     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 378
 379     return retval;
 380 }
 381
 382 /* These are needed since we do not localize EVAL nodes: */
 383 #define REGCP_SET(cp)                                           \
 384     DEBUG_STATE_r(                                              \
 385             PerlIO_printf(Perl_debug_log,                       \
 386                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 387                 (IV)PL_savestack_ix));                          \
 388     cp = PL_savestack_ix
 389
 390 #define REGCP_UNWIND(cp)                                        \
 391     DEBUG_STATE_r(                                              \
 392         if (cp != PL_savestack_ix)                              \
 393             PerlIO_printf(Perl_debug_log,                       \
 394                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 395                 (IV)(cp), (IV)PL_savestack_ix));                \
 396     regcpblow(cp)
 397
 398 #define UNWIND_PAREN(lp, lcp)               \
 399     for (n = rex->lastparen; n > lp; n--)   \
 400         rex->offs[n].end = -1;              \
 401     rex->lastparen = n;                     \
 402     rex->lastcloseparen = lcp;
 403
 404
 405 STATIC void
 406 S_regcppop(pTHX_ regexp *rex)
 407 {
 408     dVAR;
 409     UV i;
 410     U32 paren;
 411     GET_RE_DEBUG_FLAGS_DECL;
 412
 413     PERL_ARGS_ASSERT_REGCPPOP;
 414
 415     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 416     i = SSPOPUV;
 417     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 418     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 419     rex->lastcloseparen = SSPOPINT;
 420     rex->lastparen = SSPOPINT;
 421     PL_regsize = SSPOPINT;
 422
 423     i -= REGCP_OTHER_ELEMS;
 424     /* Now restore the parentheses context. */
 425     DEBUG_BUFFERS_r(
 426         if (i || rex->lastparen + 1 <= rex->nparens)
 427             PerlIO_printf(Perl_debug_log,
 428                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 429                 PTR2UV(rex),
 430                 PTR2UV(rex->offs)
 431             );
 432     );
 433     paren = PL_regsize;
 434     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 435         I32 tmps;
 436         rex->offs[paren].start_tmp = SSPOPINT;
 437         rex->offs[paren].start = SSPOPINT;
 438         tmps = SSPOPINT;
 439         if (paren <= rex->lastparen)
 440             rex->offs[paren].end = tmps;
 441         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 442             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 443             (UV)paren,
 444             (IV)rex->offs[paren].start,
 445             (IV)rex->offs[paren].start_tmp,
 446             (IV)rex->offs[paren].end,
 447             (paren > rex->lastparen ? "(skipped)" : ""));
 448         );
 449         paren--;
 450     }
 451 #if 1
 452     /* It would seem that the similar code in regtry()
 453      * already takes care of this, and in fact it is in
 454      * a better location to since this code can #if 0-ed out
 455      * but the code in regtry() is needed or otherwise tests
 456      * requiring null fields (pat.t#187 and split.t#{13,14}
 457      * (as of patchlevel 7877)  will fail.  Then again,
 458      * this code seems to be necessary or otherwise
 459      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 460      * --jhi updated by dapm */
 461     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 462         if (i > PL_regsize)
 463             rex->offs[i].start = -1;
 464         rex->offs[i].end = -1;
 465         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 466             "    \\%"UVuf": %s   ..-1 undeffing\n",
 467             (UV)i,
 468             (i > PL_regsize) ? "-1" : "  "
 469         ));
 470     }
 471 #endif
 472 }
 473
 474 /* restore the parens and associated vars at savestack position ix,
 475  * but without popping the stack */
 476
 477 STATIC void
 478 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 479 {
 480     I32 tmpix = PL_savestack_ix;
 481     PL_savestack_ix = ix;
 482     regcppop(rex);
 483     PL_savestack_ix = tmpix;
 484 }
 485
 486 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 487
 488 /*
 489  * pregexec and friends
 490  */
 491
 492 #ifndef PERL_IN_XSUB_RE
 493 /*
 494  - pregexec - match a regexp against a string
 495  */
 496 I32
 497 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 498          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 499 /* stringarg: the point in the string at which to begin matching */
 500 /* strend:    pointer to null at end of string */
 501 /* strbeg:    real beginning of string */
 502 /* minend:    end of match must be >= minend bytes after stringarg. */
 503 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 504  *            itself is accessed via the pointers above */
 505 /* nosave:    For optimizations. */
 506 {
 507     PERL_ARGS_ASSERT_PREGEXEC;
 508
 509     return
 510         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 511                       nosave ? 0 : REXEC_COPY_STR);
 512 }
 513 #endif
 514
 515 /*
 516  * Need to implement the following flags for reg_anch:
 517  *
 518  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 519  * USE_INTUIT_ML
 520  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 521  * INTUIT_AUTORITATIVE_ML
 522  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 523  * INTUIT_ONCE_ML
 524  *
 525  * Another flag for this function: SECOND_TIME (so that float substrs
 526  * with giant delta may be not rechecked).
 527  */
 528
 529 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 530
 531 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 532    Otherwise, only SvCUR(sv) is used to get strbeg. */
 533
 534 /* XXXX We assume that strpos is strbeg unless sv. */
 535
 536 /* XXXX Some places assume that there is a fixed substring.
 537         An update may be needed if optimizer marks as "INTUITable"
 538         RExen without fixed substrings.  Similarly, it is assumed that
 539         lengths of all the strings are no more than minlen, thus they
 540         cannot come from lookahead.
 541         (Or minlen should take into account lookahead.)
 542   NOTE: Some of this comment is not correct. minlen does now take account
 543   of lookahead/behind. Further research is required. -- demerphq
 544
 545 */
 546
 547 /* A failure to find a constant substring means that there is no need to make
 548    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 549    finding a substring too deep into the string means that less calls to
 550    regtry() should be needed.
 551
 552    REx compiler's optimizer found 4 possible hints:
 553         a) Anchored substring;
 554         b) Fixed substring;
 555         c) Whether we are anchored (beginning-of-line or \G);
 556         d) First node (of those at offset 0) which may distinguish positions;
 557    We use a)b)d) and multiline-part of c), and try to find a position in the
 558    string which does not contradict any of them.
 559  */
 560
 561 /* Most of decisions we do here should have been done at compile time.
 562    The nodes of the REx which we used for the search should have been
 563    deleted from the finite automaton. */
 564
 565 char *
 566 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 567                      char *strend, const U32 flags, re_scream_pos_data *data)
 568 {
 569     dVAR;
 570     struct regexp *const prog = (struct regexp *)SvANY(rx);
 571     I32 start_shift = 0;
 572     /* Should be nonnegative! */
 573     I32 end_shift   = 0;
 574     char *s;
 575     SV *check;
 576     char *strbeg;
 577     char *t;
 578     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 579     I32 ml_anch;
 580     char *other_last = NULL;    /* other substr checked before this */
 581     char *check_at = NULL;              /* check substr found at this pos */
 582     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 583     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 584     RXi_GET_DECL(prog,progi);
 585 #ifdef DEBUGGING
 586     const char * const i_strpos = strpos;
 587 #endif
 588     GET_RE_DEBUG_FLAGS_DECL;
 589
 590     PERL_ARGS_ASSERT_RE_INTUIT_START;
 591     PERL_UNUSED_ARG(flags);
 592     PERL_UNUSED_ARG(data);
 593
 594     RX_MATCH_UTF8_set(rx,utf8_target);
 595
 596     if (RX_UTF8(rx)) {
 597         PL_reg_flags |= RF_utf8;
 598     }
 599     DEBUG_EXECUTE_r(
 600         debug_start_match(rx, utf8_target, strpos, strend,
 601             sv ? "Guessing start of match in sv for"
 602                : "Guessing start of match in string for");
 603               );
 604
 605     /* CHR_DIST() would be more correct here but it makes things slow. */
 606     if (prog->minlen > strend - strpos) {
 607         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 608                               "String too short... [re_intuit_start]\n"));
 609         goto fail;
 610     }
 611
 612     /* XXX we need to pass strbeg as a separate arg: the following is
 613      * guesswork and can be wrong... */
 614     if (sv && SvPOK(sv)) {
 615         char * p   = SvPVX(sv);
 616         STRLEN cur = SvCUR(sv);
 617         if (p <= strpos && strpos < p + cur) {
 618             strbeg = p;
 619             assert(p <= strend && strend <= p + cur);
 620         }
 621         else
 622             strbeg = strend - cur;
 623     }
 624     else
 625         strbeg = strpos;
 626
 627     PL_regeol = strend;
 628     if (utf8_target) {
 629         if (!prog->check_utf8 && prog->check_substr)
 630             to_utf8_substr(prog);
 631         check = prog->check_utf8;
 632     } else {
 633         if (!prog->check_substr && prog->check_utf8)
 634             to_byte_substr(prog);
 635         check = prog->check_substr;
 636     }
 637     if (check == &PL_sv_undef) {
 638         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 639                 "Non-utf8 string cannot match utf8 check string\n"));
 640         goto fail;
 641     }
 642     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 643         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 644                      || ( (prog->extflags & RXf_ANCH_BOL)
 645                           && !multiline ) );    /* Check after \n? */
 646
 647         if (!ml_anch) {
 648           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 649                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 650                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 651                && sv && !SvROK(sv)
 652                && (strpos != strbeg)) {
 653               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 654               goto fail;
 655           }
 656           if (prog->check_offset_min == prog->check_offset_max &&
 657               !(prog->extflags & RXf_CANY_SEEN)) {
 658             /* Substring at constant offset from beg-of-str... */
 659             I32 slen;
 660
 661             s = HOP3c(strpos, prog->check_offset_min, strend);
 662
 663             if (SvTAIL(check)) {
 664                 slen = SvCUR(check);    /* >= 1 */
 665
 666                 if ( strend - s > slen || strend - s < slen - 1
 667                      || (strend - s == slen && strend[-1] != '\n')) {
 668                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 669                     goto fail_finish;
 670                 }
 671                 /* Now should match s[0..slen-2] */
 672                 slen--;
 673                 if (slen && (*SvPVX_const(check) != *s
 674                              || (slen > 1
 675                                  && memNE(SvPVX_const(check), s, slen)))) {
 676                   report_neq:
 677                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 678                     goto fail_finish;
 679                 }
 680             }
 681             else if (*SvPVX_const(check) != *s
 682                      || ((slen = SvCUR(check)) > 1
 683                          && memNE(SvPVX_const(check), s, slen)))
 684                 goto report_neq;
 685             check_at = s;
 686             goto success_at_start;
 687           }
 688         }
 689         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 690         s = strpos;
 691         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 692         end_shift = prog->check_end_shift;
 693
 694         if (!ml_anch) {
 695             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 696                                          - (SvTAIL(check) != 0);
 697             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 698
 699             if (end_shift < eshift)
 700                 end_shift = eshift;
 701         }
 702     }
 703     else {                              /* Can match at random position */
 704         ml_anch = 0;
 705         s = strpos;
 706         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 707         end_shift = prog->check_end_shift;
 708
 709         /* end shift should be non negative here */
 710     }
 711
 712 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 713     if (end_shift < 0)
 714         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 715                    (IV)end_shift, RX_PRECOMP(prog));
 716 #endif
 717
 718   restart:
 719     /* Find a possible match in the region s..strend by looking for
 720        the "check" substring in the region corrected by start/end_shift. */
 721
 722     {
 723         I32 srch_start_shift = start_shift;
 724         I32 srch_end_shift = end_shift;
 725         U8* start_point;
 726         U8* end_point;
 727         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 728             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 729             srch_start_shift = strbeg - s;
 730         }
 731     DEBUG_OPTIMISE_MORE_r({
 732         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 733             (IV)prog->check_offset_min,
 734             (IV)srch_start_shift,
 735             (IV)srch_end_shift,
 736             (IV)prog->check_end_shift);
 737     });
 738
 739         if (prog->extflags & RXf_CANY_SEEN) {
 740             start_point= (U8*)(s + srch_start_shift);
 741             end_point= (U8*)(strend - srch_end_shift);
 742         } else {
 743             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 744             end_point= HOP3(strend, -srch_end_shift, strbeg);
 745         }
 746         DEBUG_OPTIMISE_MORE_r({
 747             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 748                 (int)(end_point - start_point),
 749                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 750                 start_point);
 751         });
 752
 753         s = fbm_instr( start_point, end_point,
 754                       check, multiline ? FBMrf_MULTILINE : 0);
 755     }
 756     /* Update the count-of-usability, remove useless subpatterns,
 757         unshift s.  */
 758
 759     DEBUG_EXECUTE_r({
 760         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 761             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 762         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 763                           (s ? "Found" : "Did not find"),
 764             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 765                 ? "anchored" : "floating"),
 766             quoted,
 767             RE_SV_TAIL(check),
 768             (s ? " at offset " : "...\n") );
 769     });
 770
 771     if (!s)
 772         goto fail_finish;
 773     /* Finish the diagnostic message */
 774     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 775
 776     /* XXX dmq: first branch is for positive lookbehind...
 777        Our check string is offset from the beginning of the pattern.
 778        So we need to do any stclass tests offset forward from that
 779        point. I think. :-(
 780      */
 781
 782
 783
 784     check_at=s;
 785
 786
 787     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 788        Start with the other substr.
 789        XXXX no SCREAM optimization yet - and a very coarse implementation
 790        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 791                 *always* match.  Probably should be marked during compile...
 792        Probably it is right to do no SCREAM here...
 793      */
 794
 795     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 796                 : (prog->float_substr && prog->anchored_substr))
 797     {
 798         /* Take into account the "other" substring. */
 799         /* XXXX May be hopelessly wrong for UTF... */
 800         if (!other_last)
 801             other_last = strpos;
 802         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 803           do_other_anchored:
 804             {
 805                 char * const last = HOP3c(s, -start_shift, strbeg);
 806                 char *last1, *last2;
 807                 char * const saved_s = s;
 808                 SV* must;
 809
 810                 t = s - prog->check_offset_max;
 811                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 812                     && (!utf8_target
 813                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 814                             && t > strpos)))
 815                     NOOP;
 816                 else
 817                     t = strpos;
 818                 t = HOP3c(t, prog->anchored_offset, strend);
 819                 if (t < other_last)     /* These positions already checked */
 820                     t = other_last;
 821                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 822                 if (last < last1)
 823                     last1 = last;
 824                 /* XXXX It is not documented what units *_offsets are in.
 825                    We assume bytes, but this is clearly wrong.
 826                    Meaning this code needs to be carefully reviewed for errors.
 827                    dmq.
 828                   */
 829
 830                 /* On end-of-str: see comment below. */
 831                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 832                 if (must == &PL_sv_undef) {
 833                     s = (char*)NULL;
 834                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 835                 }
 836                 else
 837                     s = fbm_instr(
 838                         (unsigned char*)t,
 839                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 840                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 841                         must,
 842                         multiline ? FBMrf_MULTILINE : 0
 843                     );
 844                 DEBUG_EXECUTE_r({
 845                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 846                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 847                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 848                         (s ? "Found" : "Contradicts"),
 849                         quoted, RE_SV_TAIL(must));
 850                 });
 851
 852
 853                 if (!s) {
 854                     if (last1 >= last2) {
 855                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                                                 ", giving up...\n"));
 857                         goto fail_finish;
 858                     }
 859                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860                         ", trying floating at offset %ld...\n",
 861                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 862                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 863                     s = HOP3c(last, 1, strend);
 864                     goto restart;
 865                 }
 866                 else {
 867                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868                           (long)(s - i_strpos)));
 869                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 870                     other_last = HOP3c(s, 1, strend);
 871                     s = saved_s;
 872                     if (t == strpos)
 873                         goto try_at_start;
 874                     goto try_at_offset;
 875                 }
 876             }
 877         }
 878         else {          /* Take into account the floating substring. */
 879             char *last, *last1;
 880             char * const saved_s = s;
 881             SV* must;
 882
 883             t = HOP3c(s, -start_shift, strbeg);
 884             last1 = last =
 885                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 886             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 887                 last = HOP3c(t, prog->float_max_offset, strend);
 888             s = HOP3c(t, prog->float_min_offset, strend);
 889             if (s < other_last)
 890                 s = other_last;
 891  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 892             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 893             /* fbm_instr() takes into account exact value of end-of-str
 894                if the check is SvTAIL(ed).  Since false positives are OK,
 895                and end-of-str is not later than strend we are OK. */
 896             if (must == &PL_sv_undef) {
 897                 s = (char*)NULL;
 898                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 899             }
 900             else
 901                 s = fbm_instr((unsigned char*)s,
 902                               (unsigned char*)last + SvCUR(must)
 903                                   - (SvTAIL(must)!=0),
 904                               must, multiline ? FBMrf_MULTILINE : 0);
 905             DEBUG_EXECUTE_r({
 906                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 907                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 908                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 909                     (s ? "Found" : "Contradicts"),
 910                     quoted, RE_SV_TAIL(must));
 911             });
 912             if (!s) {
 913                 if (last1 == last) {
 914                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 915                                             ", giving up...\n"));
 916                     goto fail_finish;
 917                 }
 918                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 919                     ", trying anchored starting at offset %ld...\n",
 920                     (long)(saved_s + 1 - i_strpos)));
 921                 other_last = last;
 922                 s = HOP3c(t, 1, strend);
 923                 goto restart;
 924             }
 925             else {
 926                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 927                       (long)(s - i_strpos)));
 928                 other_last = s; /* Fix this later. --Hugo */
 929                 s = saved_s;
 930                 if (t == strpos)
 931                     goto try_at_start;
 932                 goto try_at_offset;
 933             }
 934         }
 935     }
 936
 937
 938     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 939
 940     DEBUG_OPTIMISE_MORE_r(
 941         PerlIO_printf(Perl_debug_log,
 942             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 943             (IV)prog->check_offset_min,
 944             (IV)prog->check_offset_max,
 945             (IV)(s-strpos),
 946             (IV)(t-strpos),
 947             (IV)(t-s),
 948             (IV)(strend-strpos)
 949         )
 950     );
 951
 952     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 953         && (!utf8_target
 954             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 955                  && t > strpos)))
 956     {
 957         /* Fixed substring is found far enough so that the match
 958            cannot start at strpos. */
 959       try_at_offset:
 960         if (ml_anch && t[-1] != '\n') {
 961             /* Eventually fbm_*() should handle this, but often
 962                anchored_offset is not 0, so this check will not be wasted. */
 963             /* XXXX In the code below we prefer to look for "^" even in
 964                presence of anchored substrings.  And we search even
 965                beyond the found float position.  These pessimizations
 966                are historical artefacts only.  */
 967           find_anchor:
 968             while (t < strend - prog->minlen) {
 969                 if (*t == '\n') {
 970                     if (t < check_at - prog->check_offset_min) {
 971                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 972                             /* Since we moved from the found position,
 973                                we definitely contradict the found anchored
 974                                substr.  Due to the above check we do not
 975                                contradict "check" substr.
 976                                Thus we can arrive here only if check substr
 977                                is float.  Redo checking for "other"=="fixed".
 978                              */
 979                             strpos = t + 1;
 980                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 981                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 982                             goto do_other_anchored;
 983                         }
 984                         /* We don't contradict the found floating substring. */
 985                         /* XXXX Why not check for STCLASS? */
 986                         s = t + 1;
 987                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 988                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 989                         goto set_useful;
 990                     }
 991                     /* Position contradicts check-string */
 992                     /* XXXX probably better to look for check-string
 993                        than for "\n", so one should lower the limit for t? */
 994                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 995                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 996                     other_last = strpos = s = t + 1;
 997                     goto restart;
 998                 }
 999                 t++;
1000             }
1001             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1002                         PL_colors[0], PL_colors[1]));
1003             goto fail_finish;
1004         }
1005         else {
1006             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1007                         PL_colors[0], PL_colors[1]));
1008         }
1009         s = t;
1010       set_useful:
1011         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1012     }
1013     else {
1014         /* The found string does not prohibit matching at strpos,
1015            - no optimization of calling REx engine can be performed,
1016            unless it was an MBOL and we are not after MBOL,
1017            or a future STCLASS check will fail this. */
1018       try_at_start:
1019         /* Even in this situation we may use MBOL flag if strpos is offset
1020            wrt the start of the string. */
1021         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1022             && (strpos != strbeg) && strpos[-1] != '\n'
1023             /* May be due to an implicit anchor of m{.*foo}  */
1024             && !(prog->intflags & PREGf_IMPLICIT))
1025         {
1026             t = strpos;
1027             goto find_anchor;
1028         }
1029         DEBUG_EXECUTE_r( if (ml_anch)
1030             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1031                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1032         );
1033       success_at_start:
1034         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1035             && (utf8_target ? (
1036                 prog->check_utf8                /* Could be deleted already */
1037                 && --BmUSEFUL(prog->check_utf8) < 0
1038                 && (prog->check_utf8 == prog->float_utf8)
1039             ) : (
1040                 prog->check_substr              /* Could be deleted already */
1041                 && --BmUSEFUL(prog->check_substr) < 0
1042                 && (prog->check_substr == prog->float_substr)
1043             )))
1044         {
1045             /* If flags & SOMETHING - do not do it many times on the same match */
1046             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1047             /* XXX Does the destruction order has to change with utf8_target? */
1048             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1050             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1051             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1052             check = NULL;                       /* abort */
1053             s = strpos;
1054             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1055                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056             if (prog->intflags & PREGf_IMPLICIT)
1057                 prog->extflags &= ~RXf_ANCH_MBOL;
1058             /* XXXX This is a remnant of the old implementation.  It
1059                     looks wasteful, since now INTUIT can use many
1060                     other heuristics. */
1061             prog->extflags &= ~RXf_USE_INTUIT;
1062             /* XXXX What other flags might need to be cleared in this branch? */
1063         }
1064         else
1065             s = strpos;
1066     }
1067
1068     /* Last resort... */
1069     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1070     /* trie stclasses are too expensive to use here, we are better off to
1071        leave it to regmatch itself */
1072     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1073         /* minlen == 0 is possible if regstclass is \b or \B,
1074            and the fixed substr is ''$.
1075            Since minlen is already taken into account, s+1 is before strend;
1076            accidentally, minlen >= 1 guaranties no false positives at s + 1
1077            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1078            regstclass does not come from lookahead...  */
1079         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1080            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1081         const U8* const str = (U8*)STRING(progi->regstclass);
1082         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1084                     : 1);
1085         char * endpos;
1086         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088         else if (prog->float_substr || prog->float_utf8)
1089             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090         else
1091             endpos= strend;
1092
1093         if (checked_upto < s)
1094            checked_upto = s;
1095         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1096                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1097
1098         t = s;
1099         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1100         if (s) {
1101             checked_upto = s;
1102         } else {
1103 #ifdef DEBUGGING
1104             const char *what = NULL;
1105 #endif
1106             if (endpos == strend) {
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                                 "Could not match STCLASS...\n") );
1109                 goto fail;
1110             }
1111             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1112                                    "This position contradicts STCLASS...\n") );
1113             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1114                 goto fail;
1115             checked_upto = HOPBACKc(endpos, start_shift);
1116             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1117                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1118             /* Contradict one of substrings */
1119             if (prog->anchored_substr || prog->anchored_utf8) {
1120                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1121                     DEBUG_EXECUTE_r( what = "anchored" );
1122                   hop_and_restart:
1123                     s = HOP3c(t, 1, strend);
1124                     if (s + start_shift + end_shift > strend) {
1125                         /* XXXX Should be taken into account earlier? */
1126                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                                                "Could not match STCLASS...\n") );
1128                         goto fail;
1129                     }
1130                     if (!check)
1131                         goto giveup;
1132                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1133                                 "Looking for %s substr starting at offset %ld...\n",
1134                                  what, (long)(s + start_shift - i_strpos)) );
1135                     goto restart;
1136                 }
1137                 /* Have both, check_string is floating */
1138                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1139                     goto retry_floating_check;
1140                 /* Recheck anchored substring, but not floating... */
1141                 s = check_at;
1142                 if (!check)
1143                     goto giveup;
1144                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1145                           "Looking for anchored substr starting at offset %ld...\n",
1146                           (long)(other_last - i_strpos)) );
1147                 goto do_other_anchored;
1148             }
1149             /* Another way we could have checked stclass at the
1150                current position only: */
1151             if (ml_anch) {
1152                 s = t = t + 1;
1153                 if (!check)
1154                     goto giveup;
1155                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1156                           "Looking for /%s^%s/m starting at offset %ld...\n",
1157                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1158                 goto try_at_offset;
1159             }
1160             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1161                 goto fail;
1162             /* Check is floating substring. */
1163           retry_floating_check:
1164             t = check_at - start_shift;
1165             DEBUG_EXECUTE_r( what = "floating" );
1166             goto hop_and_restart;
1167         }
1168         if (t != s) {
1169             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1170                         "By STCLASS: moving %ld --> %ld\n",
1171                                   (long)(t - i_strpos), (long)(s - i_strpos))
1172                    );
1173         }
1174         else {
1175             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1176                                   "Does not contradict STCLASS...\n");
1177                    );
1178         }
1179     }
1180   giveup:
1181     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1182                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1183                           PL_colors[5], (long)(s - i_strpos)) );
1184     return s;
1185
1186   fail_finish:                          /* Substring not found */
1187     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1188         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1189   fail:
1190     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1191                           PL_colors[4], PL_colors[5]));
1192     return NULL;
1193 }
1194
1195 #define DECL_TRIE_TYPE(scan) \
1196     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1197                     trie_type = ((scan->flags == EXACT) \
1198                               ? (utf8_target ? trie_utf8 : trie_plain) \
1199                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1200
1201 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1202 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1203     STRLEN skiplen;                                                                 \
1204     switch (trie_type) {                                                            \
1205     case trie_utf8_fold:                                                            \
1206         if ( foldlen>0 ) {                                                          \
1207             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1208             foldlen -= len;                                                         \
1209             uscan += len;                                                           \
1210             len=0;                                                                  \
1211         } else {                                                                    \
1212             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1213             len = UTF8SKIP(uc);                                                     \
1214             skiplen = UNISKIP( uvc );                                               \
1215             foldlen -= skiplen;                                                     \
1216             uscan = foldbuf + skiplen;                                              \
1217         }                                                                           \
1218         break;                                                                      \
1219     case trie_latin_utf8_fold:                                                      \
1220         if ( foldlen>0 ) {                                                          \
1221             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222             foldlen -= len;                                                         \
1223             uscan += len;                                                           \
1224             len=0;                                                                  \
1225         } else {                                                                    \
1226             len = 1;                                                                \
1227             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1228             skiplen = UNISKIP( uvc );                                               \
1229             foldlen -= skiplen;                                                     \
1230             uscan = foldbuf + skiplen;                                              \
1231         }                                                                           \
1232         break;                                                                      \
1233     case trie_utf8:                                                                 \
1234         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1235         break;                                                                      \
1236     case trie_plain:                                                                \
1237         uvc = (UV)*uc;                                                              \
1238         len = 1;                                                                    \
1239     }                                                                               \
1240     if (uvc < 256) {                                                                \
1241         charid = trie->charmap[ uvc ];                                              \
1242     }                                                                               \
1243     else {                                                                          \
1244         charid = 0;                                                                 \
1245         if (widecharmap) {                                                          \
1246             SV** const svpp = hv_fetch(widecharmap,                                 \
1247                         (char*)&uvc, sizeof(UV), 0);                                \
1248             if (svpp)                                                               \
1249                 charid = (U16)SvIV(*svpp);                                          \
1250         }                                                                           \
1251     }                                                                               \
1252 } STMT_END
1253
1254 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1255 STMT_START {                                              \
1256     while (s <= e) {                                      \
1257         if ( (CoNd)                                       \
1258              && (ln == 1 || folder(s, pat_string, ln))    \
1259              && (!reginfo || regtry(reginfo, &s)) )       \
1260             goto got_it;                                  \
1261         s++;                                              \
1262     }                                                     \
1263 } STMT_END
1264
1265 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1266 STMT_START {                                          \
1267     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1268         CoDe                                          \
1269         s += uskip;                                   \
1270     }                                                 \
1271 } STMT_END
1272
1273 #define REXEC_FBC_SCAN(CoDe)                          \
1274 STMT_START {                                          \
1275     while (s < strend) {                              \
1276         CoDe                                          \
1277         s++;                                          \
1278     }                                                 \
1279 } STMT_END
1280
1281 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1282 REXEC_FBC_UTF8_SCAN(                                  \
1283     if (CoNd) {                                       \
1284         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1285             goto got_it;                              \
1286         else                                          \
1287             tmp = doevery;                            \
1288     }                                                 \
1289     else                                              \
1290         tmp = 1;                                      \
1291 )
1292
1293 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1294 REXEC_FBC_SCAN(                                       \
1295     if (CoNd) {                                       \
1296         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1297             goto got_it;                              \
1298         else                                          \
1299             tmp = doevery;                            \
1300     }                                                 \
1301     else                                              \
1302         tmp = 1;                                      \
1303 )
1304
1305 #define REXEC_FBC_TRYIT               \
1306 if ((!reginfo || regtry(reginfo, &s))) \
1307     goto got_it
1308
1309 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1310     if (utf8_target) {                                             \
1311         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1312     }                                                          \
1313     else {                                                     \
1314         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1315     }
1316
1317 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1318     if (utf8_target) {                                             \
1319         UtFpReLoAd;                                            \
1320         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1321     }                                                          \
1322     else {                                                     \
1323         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1324     }
1325
1326 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1327     PL_reg_flags |= RF_tainted;                                \
1328     if (utf8_target) {                                             \
1329         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1330     }                                                          \
1331     else {                                                     \
1332         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1333     }
1334
1335 #define DUMP_EXEC_POS(li,s,doutf8) \
1336     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1337
1338
1339 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1340         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1341         tmp = TEST_NON_UTF8(tmp);                                              \
1342         REXEC_FBC_UTF8_SCAN(                                                   \
1343             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1344                 tmp = !tmp;                                                    \
1345                 IF_SUCCESS;                                                    \
1346             }                                                                  \
1347             else {                                                             \
1348                 IF_FAIL;                                                       \
1349             }                                                                  \
1350         );                                                                     \
1351
1352 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1353         if (s == PL_bostr) {                                                   \
1354             tmp = '\n';                                                        \
1355         }                                                                      \
1356         else {                                                                 \
1357             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1358             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1359         }                                                                      \
1360         tmp = TeSt1_UtF8;                                                      \
1361         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1362         REXEC_FBC_UTF8_SCAN(                                                   \
1363             if (tmp == ! (TeSt2_UtF8)) { \
1364                 tmp = !tmp;                                                    \
1365                 IF_SUCCESS;                                                    \
1366             }                                                                  \
1367             else {                                                             \
1368                 IF_FAIL;                                                       \
1369             }                                                                  \
1370         );                                                                     \
1371
1372 /* The only difference between the BOUND and NBOUND cases is that
1373  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1374  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1375  * with the other one being empty */
1376 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1377     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1378
1379 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1381
1382 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1383     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1384
1385 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1386     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1387
1388
1389 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1390  * be passed in completely with the variable name being tested, which isn't
1391  * such a clean interface, but this is easier to read than it was before.  We
1392  * are looking for the boundary (or non-boundary between a word and non-word
1393  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1394  * must be different.  Find the "wordness" of the character just prior to this
1395  * one, and compare it with the wordness of this one.  If they differ, we have
1396  * a boundary.  At the beginning of the string, pretend that the previous
1397  * character was a new-line */
1398 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1399     if (utf8_target) {                                                         \
1400                 UTF8_CODE \
1401     }                                                                          \
1402     else {  /* Not utf8 */                                                     \
1403         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1404         tmp = TEST_NON_UTF8(tmp);                                              \
1405         REXEC_FBC_SCAN(                                                        \
1406             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1407                 tmp = !tmp;                                                    \
1408                 IF_SUCCESS;                                                    \
1409             }                                                                  \
1410             else {                                                             \
1411                 IF_FAIL;                                                       \
1412             }                                                                  \
1413         );                                                                     \
1414     }                                                                          \
1415     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1416         goto got_it;
1417
1418 /* We know what class REx starts with.  Try to find this position... */
1419 /* if reginfo is NULL, its a dryrun */
1420 /* annoyingly all the vars in this routine have different names from their counterparts
1421    in regmatch. /grrr */
1422
1423 STATIC char *
1424 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1425     const char *strend, regmatch_info *reginfo)
1426 {
1427         dVAR;
1428         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1429         char *pat_string;   /* The pattern's exactish string */
1430         char *pat_end;      /* ptr to end char of pat_string */
1431         re_fold_t folder;       /* Function for computing non-utf8 folds */
1432         const U8 *fold_array;   /* array for folding ords < 256 */
1433         STRLEN ln;
1434         STRLEN lnc;
1435         STRLEN uskip;
1436         U8 c1;
1437         U8 c2;
1438         char *e;
1439         I32 tmp = 1;    /* Scratch variable? */
1440         const bool utf8_target = PL_reg_match_utf8;
1441         UV utf8_fold_flags = 0;
1442         RXi_GET_DECL(prog,progi);
1443
1444         PERL_ARGS_ASSERT_FIND_BYCLASS;
1445
1446         /* We know what class it must start with. */
1447         switch (OP(c)) {
1448         case ANYOFV:
1449         case ANYOF:
1450             if (utf8_target || OP(c) == ANYOFV) {
1451                 STRLEN inclasslen = strend - s;
1452                 REXEC_FBC_UTF8_CLASS_SCAN(
1453                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1454             }
1455             else {
1456                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1457             }
1458             break;
1459         case CANY:
1460             REXEC_FBC_SCAN(
1461                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1462                     goto got_it;
1463                 else
1464                     tmp = doevery;
1465             );
1466             break;
1467
1468         case EXACTFA:
1469             if (UTF_PATTERN || utf8_target) {
1470                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1471                 goto do_exactf_utf8;
1472             }
1473             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1474             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1475             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1476
1477         case EXACTF:
1478             if (utf8_target) {
1479
1480                 /* regcomp.c already folded this if pattern is in UTF-8 */
1481                 utf8_fold_flags = 0;
1482                 goto do_exactf_utf8;
1483             }
1484             fold_array = PL_fold;
1485             folder = foldEQ;
1486             goto do_exactf_non_utf8;
1487
1488         case EXACTFL:
1489             if (UTF_PATTERN || utf8_target) {
1490                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1491                 goto do_exactf_utf8;
1492             }
1493             fold_array = PL_fold_locale;
1494             folder = foldEQ_locale;
1495             goto do_exactf_non_utf8;
1496
1497         case EXACTFU_SS:
1498             if (UTF_PATTERN) {
1499                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1500             }
1501             goto do_exactf_utf8;
1502
1503         case EXACTFU_TRICKYFOLD:
1504         case EXACTFU:
1505             if (UTF_PATTERN || utf8_target) {
1506                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1507                 goto do_exactf_utf8;
1508             }
1509
1510             /* Any 'ss' in the pattern should have been replaced by regcomp,
1511              * so we don't have to worry here about this single special case
1512              * in the Latin1 range */
1513             fold_array = PL_fold_latin1;
1514             folder = foldEQ_latin1;
1515
1516             /* FALL THROUGH */
1517
1518         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1519                                are no glitches with fold-length differences
1520                                between the target string and pattern */
1521
1522             /* The idea in the non-utf8 EXACTF* cases is to first find the
1523              * first character of the EXACTF* node and then, if necessary,
1524              * case-insensitively compare the full text of the node.  c1 is the
1525              * first character.  c2 is its fold.  This logic will not work for
1526              * Unicode semantics and the german sharp ss, which hence should
1527              * not be compiled into a node that gets here. */
1528             pat_string = STRING(c);
1529             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1530
1531             /* We know that we have to match at least 'ln' bytes (which is the
1532              * same as characters, since not utf8).  If we have to match 3
1533              * characters, and there are only 2 availabe, we know without
1534              * trying that it will fail; so don't start a match past the
1535              * required minimum number from the far end */
1536             e = HOP3c(strend, -((I32)ln), s);
1537
1538             if (!reginfo && e < s) {
1539                 e = s;                  /* Due to minlen logic of intuit() */
1540             }
1541
1542             c1 = *pat_string;
1543             c2 = fold_array[c1];
1544             if (c1 == c2) { /* If char and fold are the same */
1545                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1546             }
1547             else {
1548                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1549             }
1550             break;
1551
1552         do_exactf_utf8:
1553         {
1554             unsigned expansion;
1555
1556
1557             /* If one of the operands is in utf8, we can't use the simpler
1558              * folding above, due to the fact that many different characters
1559              * can have the same fold, or portion of a fold, or different-
1560              * length fold */
1561             pat_string = STRING(c);
1562             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1563             pat_end = pat_string + ln;
1564             lnc = (UTF_PATTERN) /* length to match in characters */
1565                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1566                     : ln;
1567
1568             /* We have 'lnc' characters to match in the pattern, but because of
1569              * multi-character folding, each character in the target can match
1570              * up to 3 characters (Unicode guarantees it will never exceed
1571              * this) if it is utf8-encoded; and up to 2 if not (based on the
1572              * fact that the Latin 1 folds are already determined, and the
1573              * only multi-char fold in that range is the sharp-s folding to
1574              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1575              * string character.  Adjust lnc accordingly, rounding up, so that
1576              * if we need to match at least 4+1/3 chars, that really is 5. */
1577             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1578             lnc = (lnc + expansion - 1) / expansion;
1579
1580             /* As in the non-UTF8 case, if we have to match 3 characters, and
1581              * only 2 are left, it's guaranteed to fail, so don't start a
1582              * match that would require us to go beyond the end of the string
1583              */
1584             e = HOP3c(strend, -((I32)lnc), s);
1585
1586             if (!reginfo && e < s) {
1587                 e = s;                  /* Due to minlen logic of intuit() */
1588             }
1589
1590             /* XXX Note that we could recalculate e to stop the loop earlier,
1591              * as the worst case expansion above will rarely be met, and as we
1592              * go along we would usually find that e moves further to the left.
1593              * This would happen only after we reached the point in the loop
1594              * where if there were no expansion we should fail.  Unclear if
1595              * worth the expense */
1596
1597             while (s <= e) {
1598                 char *my_strend= (char *)strend;
1599                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1600                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1601                     && (!reginfo || regtry(reginfo, &s)) )
1602                 {
1603                     goto got_it;
1604                 }
1605                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1606             }
1607             break;
1608         }
1609         case BOUNDL:
1610             PL_reg_flags |= RF_tainted;
1611             FBC_BOUND(isALNUM_LC,
1612                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1613                       isALNUM_LC_utf8((U8*)s));
1614             break;
1615         case NBOUNDL:
1616             PL_reg_flags |= RF_tainted;
1617             FBC_NBOUND(isALNUM_LC,
1618                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1619                        isALNUM_LC_utf8((U8*)s));
1620             break;
1621         case BOUND:
1622             FBC_BOUND(isWORDCHAR,
1623                       isALNUM_uni(tmp),
1624                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1625             break;
1626         case BOUNDA:
1627             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1628                              isWORDCHAR_A(tmp),
1629                              isWORDCHAR_A((U8*)s));
1630             break;
1631         case NBOUND:
1632             FBC_NBOUND(isWORDCHAR,
1633                        isALNUM_uni(tmp),
1634                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1635             break;
1636         case NBOUNDA:
1637             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1638                               isWORDCHAR_A(tmp),
1639                               isWORDCHAR_A((U8*)s));
1640             break;
1641         case BOUNDU:
1642             FBC_BOUND(isWORDCHAR_L1,
1643                       isALNUM_uni(tmp),
1644                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1645             break;
1646         case NBOUNDU:
1647             FBC_NBOUND(isWORDCHAR_L1,
1648                        isALNUM_uni(tmp),
1649                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1650             break;
1651         case ALNUML:
1652             REXEC_FBC_CSCAN_TAINT(
1653                 isALNUM_LC_utf8((U8*)s),
1654                 isALNUM_LC(*s)
1655             );
1656             break;
1657         case ALNUMU:
1658             REXEC_FBC_CSCAN_PRELOAD(
1659                 LOAD_UTF8_CHARCLASS_ALNUM(),
1660                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1661                 isWORDCHAR_L1((U8) *s)
1662             );
1663             break;
1664         case ALNUM:
1665             REXEC_FBC_CSCAN_PRELOAD(
1666                 LOAD_UTF8_CHARCLASS_ALNUM(),
1667                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1668                 isWORDCHAR((U8) *s)
1669             );
1670             break;
1671         case ALNUMA:
1672             /* Don't need to worry about utf8, as it can match only a single
1673              * byte invariant character */
1674             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1675             break;
1676         case NALNUMU:
1677             REXEC_FBC_CSCAN_PRELOAD(
1678                 LOAD_UTF8_CHARCLASS_ALNUM(),
1679                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1680                 ! isWORDCHAR_L1((U8) *s)
1681             );
1682             break;
1683         case NALNUM:
1684             REXEC_FBC_CSCAN_PRELOAD(
1685                 LOAD_UTF8_CHARCLASS_ALNUM(),
1686                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1687                 ! isALNUM(*s)
1688             );
1689             break;
1690         case NALNUMA:
1691             REXEC_FBC_CSCAN(
1692                 !isWORDCHAR_A(*s),
1693                 !isWORDCHAR_A(*s)
1694             );
1695             break;
1696         case NALNUML:
1697             REXEC_FBC_CSCAN_TAINT(
1698                 !isALNUM_LC_utf8((U8*)s),
1699                 !isALNUM_LC(*s)
1700             );
1701             break;
1702         case SPACEU:
1703             REXEC_FBC_CSCAN_PRELOAD(
1704                 LOAD_UTF8_CHARCLASS_SPACE(),
1705                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1706                 isSPACE_L1((U8) *s)
1707             );
1708             break;
1709         case SPACE:
1710             REXEC_FBC_CSCAN_PRELOAD(
1711                 LOAD_UTF8_CHARCLASS_SPACE(),
1712                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1713                 isSPACE((U8) *s)
1714             );
1715             break;
1716         case SPACEA:
1717             /* Don't need to worry about utf8, as it can match only a single
1718              * byte invariant character */
1719             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1720             break;
1721         case SPACEL:
1722             REXEC_FBC_CSCAN_TAINT(
1723                 isSPACE_LC_utf8((U8*)s),
1724                 isSPACE_LC(*s)
1725             );
1726             break;
1727         case NSPACEU:
1728             REXEC_FBC_CSCAN_PRELOAD(
1729                 LOAD_UTF8_CHARCLASS_SPACE(),
1730                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1731                 ! isSPACE_L1((U8) *s)
1732             );
1733             break;
1734         case NSPACE:
1735             REXEC_FBC_CSCAN_PRELOAD(
1736                 LOAD_UTF8_CHARCLASS_SPACE(),
1737                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1738                 ! isSPACE((U8) *s)
1739             );
1740             break;
1741         case NSPACEA:
1742             REXEC_FBC_CSCAN(
1743                 !isSPACE_A(*s),
1744                 !isSPACE_A(*s)
1745             );
1746             break;
1747         case NSPACEL:
1748             REXEC_FBC_CSCAN_TAINT(
1749                 !isSPACE_LC_utf8((U8*)s),
1750                 !isSPACE_LC(*s)
1751             );
1752             break;
1753         case DIGIT:
1754             REXEC_FBC_CSCAN_PRELOAD(
1755                 LOAD_UTF8_CHARCLASS_DIGIT(),
1756                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1757                 isDIGIT(*s)
1758             );
1759             break;
1760         case DIGITA:
1761             /* Don't need to worry about utf8, as it can match only a single
1762              * byte invariant character */
1763             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1764             break;
1765         case DIGITL:
1766             REXEC_FBC_CSCAN_TAINT(
1767                 isDIGIT_LC_utf8((U8*)s),
1768                 isDIGIT_LC(*s)
1769             );
1770             break;
1771         case NDIGIT:
1772             REXEC_FBC_CSCAN_PRELOAD(
1773                 LOAD_UTF8_CHARCLASS_DIGIT(),
1774                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1775                 !isDIGIT(*s)
1776             );
1777             break;
1778         case NDIGITA:
1779             REXEC_FBC_CSCAN(
1780                 !isDIGIT_A(*s),
1781                 !isDIGIT_A(*s)
1782             );
1783             break;
1784         case NDIGITL:
1785             REXEC_FBC_CSCAN_TAINT(
1786                 !isDIGIT_LC_utf8((U8*)s),
1787                 !isDIGIT_LC(*s)
1788             );
1789             break;
1790         case LNBREAK:
1791             REXEC_FBC_CSCAN(
1792                 is_LNBREAK_utf8_safe(s, strend),
1793                 is_LNBREAK_latin1_safe(s, strend)
1794             );
1795             break;
1796         case VERTWS:
1797             REXEC_FBC_CSCAN(
1798                 is_VERTWS_utf8_safe(s, strend),
1799                 is_VERTWS_latin1_safe(s, strend)
1800             );
1801             break;
1802         case NVERTWS:
1803             REXEC_FBC_CSCAN(
1804                 !is_VERTWS_utf8_safe(s, strend),
1805                 !is_VERTWS_latin1_safe(s, strend)
1806             );
1807             break;
1808         case HORIZWS:
1809             REXEC_FBC_CSCAN(
1810                 is_HORIZWS_utf8_safe(s, strend),
1811                 is_HORIZWS_latin1_safe(s, strend)
1812             );
1813             break;
1814         case NHORIZWS:
1815             REXEC_FBC_CSCAN(
1816                 !is_HORIZWS_utf8_safe(s, strend),
1817                 !is_HORIZWS_latin1_safe(s, strend)
1818             );
1819             break;
1820         case POSIXA:
1821             /* Don't need to worry about utf8, as it can match only a single
1822             * byte invariant character.  The flag in this node type is the
1823             * class number to pass to _generic_isCC() to build a mask for
1824             * searching in PL_charclass[] */
1825             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1826             break;
1827         case NPOSIXA:
1828             REXEC_FBC_CSCAN(
1829                 !_generic_isCC_A(*s, FLAGS(c)),
1830                 !_generic_isCC_A(*s, FLAGS(c))
1831             );
1832             break;
1833
1834         case AHOCORASICKC:
1835         case AHOCORASICK:
1836             {
1837                 DECL_TRIE_TYPE(c);
1838                 /* what trie are we using right now */
1839                 reg_ac_data *aho
1840                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1841                 reg_trie_data *trie
1842                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1843                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1844
1845                 const char *last_start = strend - trie->minlen;
1846 #ifdef DEBUGGING
1847                 const char *real_start = s;
1848 #endif
1849                 STRLEN maxlen = trie->maxlen;
1850                 SV *sv_points;
1851                 U8 **points; /* map of where we were in the input string
1852                                 when reading a given char. For ASCII this
1853                                 is unnecessary overhead as the relationship
1854                                 is always 1:1, but for Unicode, especially
1855                                 case folded Unicode this is not true. */
1856                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1857                 U8 *bitmap=NULL;
1858
1859
1860                 GET_RE_DEBUG_FLAGS_DECL;
1861
1862                 /* We can't just allocate points here. We need to wrap it in
1863                  * an SV so it gets freed properly if there is a croak while
1864                  * running the match */
1865                 ENTER;
1866                 SAVETMPS;
1867                 sv_points=newSV(maxlen * sizeof(U8 *));
1868                 SvCUR_set(sv_points,
1869                     maxlen * sizeof(U8 *));
1870                 SvPOK_on(sv_points);
1871                 sv_2mortal(sv_points);
1872                 points=(U8**)SvPV_nolen(sv_points );
1873                 if ( trie_type != trie_utf8_fold
1874                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1875                 {
1876                     if (trie->bitmap)
1877                         bitmap=(U8*)trie->bitmap;
1878                     else
1879                         bitmap=(U8*)ANYOF_BITMAP(c);
1880                 }
1881                 /* this is the Aho-Corasick algorithm modified a touch
1882                    to include special handling for long "unknown char"
1883                    sequences. The basic idea being that we use AC as long
1884                    as we are dealing with a possible matching char, when
1885                    we encounter an unknown char (and we have not encountered
1886                    an accepting state) we scan forward until we find a legal
1887                    starting char.
1888                    AC matching is basically that of trie matching, except
1889                    that when we encounter a failing transition, we fall back
1890                    to the current states "fail state", and try the current char
1891                    again, a process we repeat until we reach the root state,
1892                    state 1, or a legal transition. If we fail on the root state
1893                    then we can either terminate if we have reached an accepting
1894                    state previously, or restart the entire process from the beginning
1895                    if we have not.
1896
1897                  */
1898                 while (s <= last_start) {
1899                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1900                     U8 *uc = (U8*)s;
1901                     U16 charid = 0;
1902                     U32 base = 1;
1903                     U32 state = 1;
1904                     UV uvc = 0;
1905                     STRLEN len = 0;
1906                     STRLEN foldlen = 0;
1907                     U8 *uscan = (U8*)NULL;
1908                     U8 *leftmost = NULL;
1909 #ifdef DEBUGGING
1910                     U32 accepted_word= 0;
1911 #endif
1912                     U32 pointpos = 0;
1913
1914                     while ( state && uc <= (U8*)strend ) {
1915                         int failed=0;
1916                         U32 word = aho->states[ state ].wordnum;
1917
1918                         if( state==1 ) {
1919                             if ( bitmap ) {
1920                                 DEBUG_TRIE_EXECUTE_r(
1921                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1922                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1923                                             (char *)uc, utf8_target );
1924                                         PerlIO_printf( Perl_debug_log,
1925                                             " Scanning for legal start char...\n");
1926                                     }
1927                                 );
1928                                 if (utf8_target) {
1929                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1930                                         uc += UTF8SKIP(uc);
1931                                     }
1932                                 } else {
1933                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1934                                         uc++;
1935                                     }
1936                                 }
1937                                 s= (char *)uc;
1938                             }
1939                             if (uc >(U8*)last_start) break;
1940                         }
1941
1942                         if ( word ) {
1943                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1944                             if (!leftmost || lpos < leftmost) {
1945                                 DEBUG_r(accepted_word=word);
1946                                 leftmost= lpos;
1947                             }
1948                             if (base==0) break;
1949
1950                         }
1951                         points[pointpos++ % maxlen]= uc;
1952                         if (foldlen || uc < (U8*)strend) {
1953                             REXEC_TRIE_READ_CHAR(trie_type, trie,
1954                                              widecharmap, uc,
1955                                              uscan, len, uvc, charid, foldlen,
1956                                              foldbuf, uniflags);
1957                             DEBUG_TRIE_EXECUTE_r({
1958                                 dump_exec_pos( (char *)uc, c, strend,
1959                                             real_start, s, utf8_target);
1960                                 PerlIO_printf(Perl_debug_log,
1961                                     " Charid:%3u CP:%4"UVxf" ",
1962                                      charid, uvc);
1963                             });
1964                         }
1965                         else {
1966                             len = 0;
1967                             charid = 0;
1968                         }
1969
1970
1971                         do {
1972 #ifdef DEBUGGING
1973                             word = aho->states[ state ].wordnum;
1974 #endif
1975                             base = aho->states[ state ].trans.base;
1976
1977                             DEBUG_TRIE_EXECUTE_r({
1978                                 if (failed)
1979                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1980                                         s,   utf8_target );
1981                                 PerlIO_printf( Perl_debug_log,
1982                                     "%sState: %4"UVxf", word=%"UVxf,
1983                                     failed ? " Fail transition to " : "",
1984                                     (UV)state, (UV)word);
1985                             });
1986                             if ( base ) {
1987                                 U32 tmp;
1988                                 I32 offset;
1989                                 if (charid &&
1990                                      ( ((offset = base + charid
1991                                         - 1 - trie->uniquecharcount)) >= 0)
1992                                      && ((U32)offset < trie->lasttrans)
1993                                      && trie->trans[offset].check == state
1994                                      && (tmp=trie->trans[offset].next))
1995                                 {
1996                                     DEBUG_TRIE_EXECUTE_r(
1997                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1998                                     state = tmp;
1999                                     break;
2000                                 }
2001                                 else {
2002                                     DEBUG_TRIE_EXECUTE_r(
2003                                         PerlIO_printf( Perl_debug_log," - fail\n"));
2004                                     failed = 1;
2005                                     state = aho->fail[state];
2006                                 }
2007                             }
2008                             else {
2009                                 /* we must be accepting here */
2010                                 DEBUG_TRIE_EXECUTE_r(
2011                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
2012                                 failed = 1;
2013                                 break;
2014                             }
2015                         } while(state);
2016                         uc += len;
2017                         if (failed) {
2018                             if (leftmost)
2019                                 break;
2020                             if (!state) state = 1;
2021                         }
2022                     }
2023                     if ( aho->states[ state ].wordnum ) {
2024                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2025                         if (!leftmost || lpos < leftmost) {
2026                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2027                             leftmost = lpos;
2028                         }
2029                     }
2030                     if (leftmost) {
2031                         s = (char*)leftmost;
2032                         DEBUG_TRIE_EXECUTE_r({
2033                             PerlIO_printf(
2034                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2035                                 (UV)accepted_word, (IV)(s - real_start)
2036                             );
2037                         });
2038                         if (!reginfo || regtry(reginfo, &s)) {
2039                             FREETMPS;
2040                             LEAVE;
2041                             goto got_it;
2042                         }
2043                         s = HOPc(s,1);
2044                         DEBUG_TRIE_EXECUTE_r({
2045                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2046                         });
2047                     } else {
2048                         DEBUG_TRIE_EXECUTE_r(
2049                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2050                         break;
2051                     }
2052                 }
2053                 FREETMPS;
2054                 LEAVE;
2055             }
2056             break;
2057         default:
2058             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2059             break;
2060         }
2061         return 0;
2062       got_it:
2063         return s;
2064 }
2065
2066
2067 /*
2068  - regexec_flags - match a regexp against a string
2069  */
2070 I32
2071 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2072               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2073 /* stringarg: the point in the string at which to begin matching */
2074 /* strend:    pointer to null at end of string */
2075 /* strbeg:    real beginning of string */
2076 /* minend:    end of match must be >= minend bytes after stringarg. */
2077 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2078  *            itself is accessed via the pointers above */
2079 /* data:      May be used for some additional optimizations.
2080               Currently its only used, with a U32 cast, for transmitting
2081               the ganch offset when doing a /g match. This will change */
2082 /* nosave:    For optimizations. */
2083
2084 {
2085     dVAR;
2086     struct regexp *const prog = (struct regexp *)SvANY(rx);
2087     /*register*/ char *s;
2088     regnode *c;
2089     /*register*/ char *startpos = stringarg;
2090     I32 minlen;         /* must match at least this many chars */
2091     I32 dontbother = 0; /* how many characters not to try at end */
2092     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2093     I32 scream_pos = -1;                /* Internal iterator of scream. */
2094     char *scream_olds = NULL;
2095     const bool utf8_target = cBOOL(DO_UTF8(sv));
2096     I32 multiline;
2097     RXi_GET_DECL(prog,progi);
2098     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2099     regexp_paren_pair *swap = NULL;
2100     GET_RE_DEBUG_FLAGS_DECL;
2101
2102     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2103     PERL_UNUSED_ARG(data);
2104
2105     /* Be paranoid... */
2106     if (prog == NULL || startpos == NULL) {
2107         Perl_croak(aTHX_ "NULL regexp parameter");
2108         return 0;
2109     }
2110
2111     multiline = prog->extflags & RXf_PMf_MULTILINE;
2112     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2113
2114     RX_MATCH_UTF8_set(rx, utf8_target);
2115     DEBUG_EXECUTE_r(
2116         debug_start_match(rx, utf8_target, startpos, strend,
2117         "Matching");
2118     );
2119
2120     minlen = prog->minlen;
2121
2122     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2123         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2124                               "String too short [regexec_flags]...\n"));
2125         goto phooey;
2126     }
2127
2128
2129     /* Check validity of program. */
2130     if (UCHARAT(progi->program) != REG_MAGIC) {
2131         Perl_croak(aTHX_ "corrupted regexp program");
2132     }
2133
2134     PL_reg_flags = 0;
2135     PL_reg_state.re_state_eval_setup_done = FALSE;
2136     PL_reg_maxiter = 0;
2137
2138     if (RX_UTF8(rx))
2139         PL_reg_flags |= RF_utf8;
2140
2141     /* Mark beginning of line for ^ and lookbehind. */
2142     reginfo.bol = startpos; /* XXX not used ??? */
2143     PL_bostr  = strbeg;
2144     reginfo.sv = sv;
2145
2146     /* Mark end of line for $ (and such) */
2147     PL_regeol = strend;
2148
2149     /* see how far we have to get to not match where we matched before */
2150     reginfo.till = startpos+minend;
2151
2152     /* If there is a "must appear" string, look for it. */
2153     s = startpos;
2154
2155     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2156         MAGIC *mg;
2157         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2158             reginfo.ganch = startpos + prog->gofs;
2159             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2160               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2161         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2162                   && SvMAGIC(sv)
2163                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2164                   && mg->mg_len >= 0) {
2165             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2166             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2168
2169             if (prog->extflags & RXf_ANCH_GPOS) {
2170                 if (s > reginfo.ganch)
2171                     goto phooey;
2172                 s = reginfo.ganch - prog->gofs;
2173                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2175                 if (s < strbeg)
2176                     goto phooey;
2177             }
2178         }
2179         else if (data) {
2180             reginfo.ganch = strbeg + PTR2UV(data);
2181             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2182                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2183
2184         } else {                                /* pos() not defined */
2185             reginfo.ganch = strbeg;
2186             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2187                  "GPOS: reginfo.ganch = strbeg\n"));
2188         }
2189     }
2190     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2191         /* We have to be careful. If the previous successful match
2192            was from this regex we don't want a subsequent partially
2193            successful match to clobber the old results.
2194            So when we detect this possibility we add a swap buffer
2195            to the re, and switch the buffer each match. If we fail
2196            we switch it back, otherwise we leave it swapped.
2197         */
2198         swap = prog->offs;
2199         /* do we need a save destructor here for eval dies? */
2200         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2201         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2202             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2203             PTR2UV(prog),
2204             PTR2UV(swap),
2205             PTR2UV(prog->offs)
2206         ));
2207     }
2208     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2209         re_scream_pos_data d;
2210
2211         d.scream_olds = &scream_olds;
2212         d.scream_pos = &scream_pos;
2213         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2214         if (!s) {
2215             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2216             goto phooey;        /* not present */
2217         }
2218     }
2219
2220
2221
2222     /* Simplest case:  anchored match need be tried only once. */
2223     /*  [unless only anchor is BOL and multiline is set] */
2224     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2225         if (s == startpos && regtry(&reginfo, &startpos))
2226             goto got_it;
2227         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2228                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2229         {
2230             char *end;
2231
2232             if (minlen)
2233                 dontbother = minlen - 1;
2234             end = HOP3c(strend, -dontbother, strbeg) - 1;
2235             /* for multiline we only have to try after newlines */
2236             if (prog->check_substr || prog->check_utf8) {
2237                 /* because of the goto we can not easily reuse the macros for bifurcating the
2238                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2239                 if (utf8_target) {
2240                     if (s == startpos)
2241                         goto after_try_utf8;
2242                     while (1) {
2243                         if (regtry(&reginfo, &s)) {
2244                             goto got_it;
2245                         }
2246                       after_try_utf8:
2247                         if (s > end) {
2248                             goto phooey;
2249                         }
2250                         if (prog->extflags & RXf_USE_INTUIT) {
2251                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2252                             if (!s) {
2253                                 goto phooey;
2254                             }
2255                         }
2256                         else {
2257                             s += UTF8SKIP(s);
2258                         }
2259                     }
2260                 } /* end search for check string in unicode */
2261                 else {
2262                     if (s == startpos) {
2263                         goto after_try_latin;
2264                     }
2265                     while (1) {
2266                         if (regtry(&reginfo, &s)) {
2267                             goto got_it;
2268                         }
2269                       after_try_latin:
2270                         if (s > end) {
2271                             goto phooey;
2272                         }
2273                         if (prog->extflags & RXf_USE_INTUIT) {
2274                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2275                             if (!s) {
2276                                 goto phooey;
2277                             }
2278                         }
2279                         else {
2280                             s++;
2281                         }
2282                     }
2283                 } /* end search for check string in latin*/
2284             } /* end search for check string */
2285             else { /* search for newline */
2286                 if (s > startpos) {
2287                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2288                     s--;
2289                 }
2290                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2291                 while (s <= end) { /* note it could be possible to match at the end of the string */
2292                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2293                         if (regtry(&reginfo, &s))
2294                             goto got_it;
2295                     }
2296                 }
2297             } /* end search for newline */
2298         } /* end anchored/multiline check string search */
2299         goto phooey;
2300     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2301     {
2302         /* the warning about reginfo.ganch being used without initialization
2303            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2304            and we only enter this block when the same bit is set. */
2305         char *tmp_s = reginfo.ganch - prog->gofs;
2306
2307         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2308             goto got_it;
2309         goto phooey;
2310     }
2311
2312     /* Messy cases:  unanchored match. */
2313     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2314         /* we have /x+whatever/ */
2315         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2316         char ch;
2317 #ifdef DEBUGGING
2318         int did_match = 0;
2319 #endif
2320         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2321             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2322         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2323
2324         if (utf8_target) {
2325             REXEC_FBC_SCAN(
2326                 if (*s == ch) {
2327                     DEBUG_EXECUTE_r( did_match = 1 );
2328                     if (regtry(&reginfo, &s)) goto got_it;
2329                     s += UTF8SKIP(s);
2330                     while (s < strend && *s == ch)
2331                         s += UTF8SKIP(s);
2332                 }
2333             );
2334         }
2335         else {
2336             REXEC_FBC_SCAN(
2337                 if (*s == ch) {
2338                     DEBUG_EXECUTE_r( did_match = 1 );
2339                     if (regtry(&reginfo, &s)) goto got_it;
2340                     s++;
2341                     while (s < strend && *s == ch)
2342                         s++;
2343                 }
2344             );
2345         }
2346         DEBUG_EXECUTE_r(if (!did_match)
2347                 PerlIO_printf(Perl_debug_log,
2348                                   "Did not find anchored character...\n")
2349                );
2350     }
2351     else if (prog->anchored_substr != NULL
2352               || prog->anchored_utf8 != NULL
2353               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2354                   && prog->float_max_offset < strend - s)) {
2355         SV *must;
2356         I32 back_max;
2357         I32 back_min;
2358         char *last;
2359         char *last1;            /* Last position checked before */
2360 #ifdef DEBUGGING
2361         int did_match = 0;
2362 #endif
2363         if (prog->anchored_substr || prog->anchored_utf8) {
2364             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2365                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2366             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2367             back_max = back_min = prog->anchored_offset;
2368         } else {
2369             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2370                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2371             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2372             back_max = prog->float_max_offset;
2373             back_min = prog->float_min_offset;
2374         }
2375
2376
2377         if (must == &PL_sv_undef)
2378             /* could not downgrade utf8 check substring, so must fail */
2379             goto phooey;
2380
2381         if (back_min<0) {
2382             last = strend;
2383         } else {
2384             last = HOP3c(strend,        /* Cannot start after this */
2385                   -(I32)(CHR_SVLEN(must)
2386                          - (SvTAIL(must) != 0) + back_min), strbeg);
2387         }
2388         if (s > PL_bostr)
2389             last1 = HOPc(s, -1);
2390         else
2391             last1 = s - 1;      /* bogus */
2392
2393         /* XXXX check_substr already used to find "s", can optimize if
2394            check_substr==must. */
2395         scream_pos = -1;
2396         dontbother = end_shift;
2397         strend = HOPc(strend, -dontbother);
2398         while ( (s <= last) &&
2399                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2400                                   (unsigned char*)strend, must,
2401                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2402             DEBUG_EXECUTE_r( did_match = 1 );
2403             if (HOPc(s, -back_max) > last1) {
2404                 last1 = HOPc(s, -back_min);
2405                 s = HOPc(s, -back_max);
2406             }
2407             else {
2408                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2409
2410                 last1 = HOPc(s, -back_min);
2411                 s = t;
2412             }
2413             if (utf8_target) {
2414                 while (s <= last1) {
2415                     if (regtry(&reginfo, &s))
2416                         goto got_it;
2417                     if (s >= last1) {
2418                         s++; /* to break out of outer loop */
2419                         break;
2420                     }
2421                     s += UTF8SKIP(s);
2422                 }
2423             }
2424             else {
2425                 while (s <= last1) {
2426                     if (regtry(&reginfo, &s))
2427                         goto got_it;
2428                     s++;
2429                 }
2430             }
2431         }
2432         DEBUG_EXECUTE_r(if (!did_match) {
2433             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2434                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2435             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2436                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2437                                ? "anchored" : "floating"),
2438                 quoted, RE_SV_TAIL(must));
2439         });
2440         goto phooey;
2441     }
2442     else if ( (c = progi->regstclass) ) {
2443         if (minlen) {
2444             const OPCODE op = OP(progi->regstclass);
2445             /* don't bother with what can't match */
2446             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2447                 strend = HOPc(strend, -(minlen - 1));
2448         }
2449         DEBUG_EXECUTE_r({
2450             SV * const prop = sv_newmortal();
2451             regprop(prog, prop, c);
2452             {
2453                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2454                     s,strend-s,60);
2455                 PerlIO_printf(Perl_debug_log,
2456                     "Matching stclass %.*s against %s (%d bytes)\n",
2457                     (int)SvCUR(prop), SvPVX_const(prop),
2458                      quoted, (int)(strend - s));
2459             }
2460         });
2461         if (find_byclass(prog, c, s, strend, &reginfo))
2462             goto got_it;
2463         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2464     }
2465     else {
2466         dontbother = 0;
2467         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2468             /* Trim the end. */
2469             char *last= NULL;
2470             SV* float_real;
2471             STRLEN len;
2472             const char *little;
2473
2474             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2475                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2476             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2477
2478             little = SvPV_const(float_real, len);
2479             if (SvTAIL(float_real)) {
2480                     /* This means that float_real contains an artificial \n on the end
2481                      * due to the presence of something like this: /foo$/
2482                      * where we can match both "foo" and "foo\n" at the end of the string.
2483                      * So we have to compare the end of the string first against the float_real
2484                      * without the \n and then against the full float_real with the string.
2485                      * We have to watch out for cases where the string might be smaller
2486                      * than the float_real or the float_real without the \n.
2487                      */
2488                     char *checkpos= strend - len;
2489                     DEBUG_OPTIMISE_r(
2490                         PerlIO_printf(Perl_debug_log,
2491                             "%sChecking for float_real.%s\n",
2492                             PL_colors[4], PL_colors[5]));
2493                     if (checkpos + 1 < strbeg) {
2494                         /* can't match, even if we remove the trailing \n string is too short to match */
2495                         DEBUG_EXECUTE_r(
2496                             PerlIO_printf(Perl_debug_log,
2497                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2498                                 PL_colors[4], PL_colors[5]));
2499                         goto phooey;
2500                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2501                         /* can match, the end of the string matches without the "\n" */
2502                         last = checkpos + 1;
2503                     } else if (checkpos < strbeg) {
2504                         /* cant match, string is too short when the "\n" is included */
2505                         DEBUG_EXECUTE_r(
2506                             PerlIO_printf(Perl_debug_log,
2507                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2508                                 PL_colors[4], PL_colors[5]));
2509                         goto phooey;
2510                     } else if (!multiline) {
2511                         /* non multiline match, so compare with the "\n" at the end of the string */
2512                         if (memEQ(checkpos, little, len)) {
2513                             last= checkpos;
2514                         } else {
2515                             DEBUG_EXECUTE_r(
2516                                 PerlIO_printf(Perl_debug_log,
2517                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2518                                     PL_colors[4], PL_colors[5]));
2519                             goto phooey;
2520                         }
2521                     } else {
2522                         /* multiline match, so we have to search for a place where the full string is located */
2523                         goto find_last;
2524                     }
2525             } else {
2526                   find_last:
2527                     if (len)
2528                         last = rninstr(s, strend, little, little + len);
2529                     else
2530                         last = strend;  /* matching "$" */
2531             }
2532             if (!last) {
2533                 /* at one point this block contained a comment which was probably
2534                  * incorrect, which said that this was a "should not happen" case.
2535                  * Even if it was true when it was written I am pretty sure it is
2536                  * not anymore, so I have removed the comment and replaced it with
2537                  * this one. Yves */
2538                 DEBUG_EXECUTE_r(
2539                     PerlIO_printf(Perl_debug_log,
2540                         "String does not contain required substring, cannot match.\n"
2541                     ));
2542                 goto phooey;
2543             }
2544             dontbother = strend - last + prog->float_min_offset;
2545         }
2546         if (minlen && (dontbother < minlen))
2547             dontbother = minlen - 1;
2548         strend -= dontbother;              /* this one's always in bytes! */
2549         /* We don't know much -- general case. */
2550         if (utf8_target) {
2551             for (;;) {
2552                 if (regtry(&reginfo, &s))
2553                     goto got_it;
2554                 if (s >= strend)
2555                     break;
2556                 s += UTF8SKIP(s);
2557             };
2558         }
2559         else {
2560             do {
2561                 if (regtry(&reginfo, &s))
2562                     goto got_it;
2563             } while (s++ < strend);
2564         }
2565     }
2566
2567     /* Failure. */
2568     goto phooey;
2569
2570 got_it:
2571     DEBUG_BUFFERS_r(
2572         if (swap)
2573             PerlIO_printf(Perl_debug_log,
2574                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2575                 PTR2UV(prog),
2576                 PTR2UV(swap)
2577             );
2578     );
2579     Safefree(swap);
2580     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2581
2582     if (PL_reg_state.re_state_eval_setup_done)
2583         restore_pos(aTHX_ prog);
2584     if (RXp_PAREN_NAMES(prog))
2585         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2586
2587     /* make sure $`, $&, $', and $digit will work later */
2588     if ( !(flags & REXEC_NOT_FIRST) ) {
2589         if (flags & REXEC_COPY_STR) {
2590 #ifdef PERL_OLD_COPY_ON_WRITE
2591             if ((SvIsCOW(sv)
2592                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2593                 if (DEBUG_C_TEST) {
2594                     PerlIO_printf(Perl_debug_log,
2595                                   "Copy on write: regexp capture, type %d\n",
2596                                   (int) SvTYPE(sv));
2597                 }
2598                 RX_MATCH_COPY_FREE(rx);
2599                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2600                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2601                 assert (SvPOKp(prog->saved_copy));
2602                 prog->sublen  = PL_regeol - strbeg;
2603                 prog->suboffset = 0;
2604                 prog->subcoffset = 0;
2605             } else
2606 #endif
2607             {
2608                 I32 min = 0;
2609                 I32 max = PL_regeol - strbeg;
2610                 I32 sublen;
2611
2612                 if (    (flags & REXEC_COPY_SKIP_POST)
2613                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2614                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2615                 ) { /* don't copy $' part of string */
2616                     U32 n = 0;
2617                     max = -1;
2618                     /* calculate the right-most part of the string covered
2619                      * by a capture. Due to look-ahead, this may be to
2620                      * the right of $&, so we have to scan all captures */
2621                     while (n <= prog->lastparen) {
2622                         if (prog->offs[n].end > max)
2623                             max = prog->offs[n].end;
2624                         n++;
2625                     }
2626                     if (max == -1)
2627                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2628                                 ? prog->offs[0].start
2629                                 : 0;
2630                     assert(max >= 0 && max <= PL_regeol - strbeg);
2631                 }
2632
2633                 if (    (flags & REXEC_COPY_SKIP_PRE)
2634                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2635                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2636                 ) { /* don't copy $` part of string */
2637                     U32 n = 0;
2638                     min = max;
2639                     /* calculate the left-most part of the string covered
2640                      * by a capture. Due to look-behind, this may be to
2641                      * the left of $&, so we have to scan all captures */
2642                     while (min && n <= prog->lastparen) {
2643                         if (   prog->offs[n].start != -1
2644                             && prog->offs[n].start < min)
2645                         {
2646                             min = prog->offs[n].start;
2647                         }
2648                         n++;
2649                     }
2650                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2651                         && min >  prog->offs[0].end
2652                     )
2653                         min = prog->offs[0].end;
2654
2655                 }
2656
2657                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2658                 sublen = max - min;
2659
2660                 if (RX_MATCH_COPIED(rx)) {
2661                     if (sublen > prog->sublen)
2662                         prog->subbeg =
2663                                 (char*)saferealloc(prog->subbeg, sublen+1);
2664                 }
2665                 else
2666                     prog->subbeg = (char*)safemalloc(sublen+1);
2667                 Copy(strbeg + min, prog->subbeg, sublen, char);
2668                 prog->subbeg[sublen] = '\0';
2669                 prog->suboffset = min;
2670                 prog->sublen = sublen;
2671                 RX_MATCH_COPIED_on(rx);
2672             }
2673             prog->subcoffset = prog->suboffset;
2674             if (prog->suboffset && utf8_target) {
2675                 /* Convert byte offset to chars.
2676                  * XXX ideally should only compute this if @-/@+
2677                  * has been seen, a la PL_sawampersand ??? */
2678
2679                 /* If there's a direct correspondence between the
2680                  * string which we're matching and the original SV,
2681                  * then we can use the utf8 len cache associated with
2682                  * the SV. In particular, it means that under //g,
2683                  * sv_pos_b2u() will use the previously cached
2684                  * position to speed up working out the new length of
2685                  * subcoffset, rather than counting from the start of
2686                  * the string each time. This stops
2687                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2688                  * from going quadratic */
2689                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2690                     sv_pos_b2u(sv, &(prog->subcoffset));
2691                 else
2692                     prog->subcoffset = utf8_length((U8*)strbeg,
2693                                         (U8*)(strbeg+prog->suboffset));
2694             }
2695         }
2696         else {
2697             RX_MATCH_COPY_FREE(rx);
2698             prog->subbeg = strbeg;
2699             prog->suboffset = 0;
2700             prog->subcoffset = 0;
2701             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2702         }
2703     }
2704
2705     return 1;
2706
2707 phooey:
2708     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2709                           PL_colors[4], PL_colors[5]));
2710     if (PL_reg_state.re_state_eval_setup_done)
2711         restore_pos(aTHX_ prog);
2712     if (swap) {
2713         /* we failed :-( roll it back */
2714         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2715             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2716             PTR2UV(prog),
2717             PTR2UV(prog->offs),
2718             PTR2UV(swap)
2719         ));
2720         Safefree(prog->offs);
2721         prog->offs = swap;
2722     }
2723     return 0;
2724 }
2725
2726
2727 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2728  * Do inc before dec, in case old and new rex are the same */
2729 #define SET_reg_curpm(Re2) \
2730     if (PL_reg_state.re_state_eval_setup_done) {    \
2731         (void)ReREFCNT_inc(Re2);                    \
2732         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2733         PM_SETRE((PL_reg_curpm), (Re2));            \
2734     }
2735
2736
2737 /*
2738  - regtry - try match at specific point
2739  */
2740 STATIC I32                      /* 0 failure, 1 success */
2741 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2742 {
2743     dVAR;
2744     CHECKPOINT lastcp;
2745     REGEXP *const rx = reginfo->prog;
2746     regexp *const prog = (struct regexp *)SvANY(rx);
2747     I32 result;
2748     RXi_GET_DECL(prog,progi);
2749     GET_RE_DEBUG_FLAGS_DECL;
2750
2751     PERL_ARGS_ASSERT_REGTRY;
2752
2753     reginfo->cutpoint=NULL;
2754
2755     if ((prog->extflags & RXf_EVAL_SEEN)
2756         && !PL_reg_state.re_state_eval_setup_done)
2757     {
2758         MAGIC *mg;
2759
2760         PL_reg_state.re_state_eval_setup_done = TRUE;
2761         if (reginfo->sv) {
2762             /* Make $_ available to executed code. */
2763             if (reginfo->sv != DEFSV) {
2764                 SAVE_DEFSV;
2765                 DEFSV_set(reginfo->sv);
2766             }
2767
2768             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2769                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2770                 /* prepare for quick setting of pos */
2771 #ifdef PERL_OLD_COPY_ON_WRITE
2772                 if (SvIsCOW(reginfo->sv))
2773                     sv_force_normal_flags(reginfo->sv, 0);
2774 #endif
2775                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2776                                  &PL_vtbl_mglob, NULL, 0);
2777                 mg->mg_len = -1;
2778             }
2779             PL_reg_magic    = mg;
2780             PL_reg_oldpos   = mg->mg_len;
2781             SAVEDESTRUCTOR_X(restore_pos, prog);
2782         }
2783         if (!PL_reg_curpm) {
2784             Newxz(PL_reg_curpm, 1, PMOP);
2785 #ifdef USE_ITHREADS
2786             {
2787                 SV* const repointer = &PL_sv_undef;
2788                 /* this regexp is also owned by the new PL_reg_curpm, which
2789                    will try to free it.  */
2790                 av_push(PL_regex_padav, repointer);
2791                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2792                 PL_regex_pad = AvARRAY(PL_regex_padav);
2793             }
2794 #endif
2795         }
2796         SET_reg_curpm(rx);
2797         PL_reg_oldcurpm = PL_curpm;
2798         PL_curpm = PL_reg_curpm;
2799         if (RXp_MATCH_COPIED(prog)) {
2800             /*  Here is a serious problem: we cannot rewrite subbeg,
2801                 since it may be needed if this match fails.  Thus
2802                 $` inside (?{}) could fail... */
2803             PL_reg_oldsaved = prog->subbeg;
2804             PL_reg_oldsavedlen = prog->sublen;
2805             PL_reg_oldsavedoffset = prog->suboffset;
2806             PL_reg_oldsavedcoffset = prog->suboffset;
2807 #ifdef PERL_OLD_COPY_ON_WRITE
2808             PL_nrs = prog->saved_copy;
2809 #endif
2810             RXp_MATCH_COPIED_off(prog);
2811         }
2812         else
2813             PL_reg_oldsaved = NULL;
2814         prog->subbeg = PL_bostr;
2815         prog->suboffset = 0;
2816         prog->subcoffset = 0;
2817         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2818     }
2819 #ifdef DEBUGGING
2820     PL_reg_starttry = *startposp;
2821 #endif
2822     prog->offs[0].start = *startposp - PL_bostr;
2823     prog->lastparen = 0;
2824     prog->lastcloseparen = 0;
2825     PL_regsize = 0;
2826
2827     /* XXXX What this code is doing here?!!!  There should be no need
2828        to do this again and again, prog->lastparen should take care of
2829        this!  --ilya*/
2830
2831     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2832      * Actually, the code in regcppop() (which Ilya may be meaning by
2833      * prog->lastparen), is not needed at all by the test suite
2834      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2835      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2836      * Meanwhile, this code *is* needed for the
2837      * above-mentioned test suite tests to succeed.  The common theme
2838      * on those tests seems to be returning null fields from matches.
2839      * --jhi updated by dapm */
2840 #if 1
2841     if (prog->nparens) {
2842         regexp_paren_pair *pp = prog->offs;
2843         I32 i;
2844         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2845             ++pp;
2846             pp->start = -1;
2847             pp->end = -1;
2848         }
2849     }
2850 #endif
2851     REGCP_SET(lastcp);
2852     result = regmatch(reginfo, *startposp, progi->program + 1);
2853     if (result != -1) {
2854         prog->offs[0].end = result;
2855         return 1;
2856     }
2857     if (reginfo->cutpoint)
2858         *startposp= reginfo->cutpoint;
2859     REGCP_UNWIND(lastcp);
2860     return 0;
2861 }
2862
2863
2864 #define sayYES goto yes
2865 #define sayNO goto no
2866 #define sayNO_SILENT goto no_silent
2867
2868 /* we dont use STMT_START/END here because it leads to
2869    "unreachable code" warnings, which are bogus, but distracting. */
2870 #define CACHEsayNO \
2871     if (ST.cache_mask) \
2872        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2873     sayNO
2874
2875 /* this is used to determine how far from the left messages like
2876    'failed...' are printed. It should be set such that messages
2877    are inline with the regop output that created them.
2878 */
2879 #define REPORT_CODE_OFF 32
2880
2881
2882 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2883 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2884
2885 #define SLAB_FIRST(s) (&(s)->states[0])
2886 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2887
2888 /* grab a new slab and return the first slot in it */
2889
2890 STATIC regmatch_state *
2891 S_push_slab(pTHX)
2892 {
2893 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2894     dMY_CXT;
2895 #endif
2896     regmatch_slab *s = PL_regmatch_slab->next;
2897     if (!s) {
2898         Newx(s, 1, regmatch_slab);
2899         s->prev = PL_regmatch_slab;
2900         s->next = NULL;
2901         PL_regmatch_slab->next = s;
2902     }
2903     PL_regmatch_slab = s;
2904     return SLAB_FIRST(s);
2905 }
2906
2907
2908 /* push a new state then goto it */
2909
2910 #define PUSH_STATE_GOTO(state, node, input) \
2911     pushinput = input; \
2912     scan = node; \
2913     st->resume_state = state; \
2914     goto push_state;
2915
2916 /* push a new state with success backtracking, then goto it */
2917
2918 #define PUSH_YES_STATE_GOTO(state, node, input) \
2919     pushinput = input; \
2920     scan = node; \
2921     st->resume_state = state; \
2922     goto push_yes_state;
2923
2924
2925
2926
2927 /*
2928
2929 regmatch() - main matching routine
2930
2931 This is basically one big switch statement in a loop. We execute an op,
2932 set 'next' to point the next op, and continue. If we come to a point which
2933 we may need to backtrack to on failure such as (A|B|C), we push a
2934 backtrack state onto the backtrack stack. On failure, we pop the top
2935 state, and re-enter the loop at the state indicated. If there are no more
2936 states to pop, we return failure.
2937
2938 Sometimes we also need to backtrack on success; for example /A+/, where
2939 after successfully matching one A, we need to go back and try to
2940 match another one; similarly for lookahead assertions: if the assertion
2941 completes successfully, we backtrack to the state just before the assertion
2942 and then carry on.  In these cases, the pushed state is marked as
2943 'backtrack on success too'. This marking is in fact done by a chain of
2944 pointers, each pointing to the previous 'yes' state. On success, we pop to
2945 the nearest yes state, discarding any intermediate failure-only states.
2946 Sometimes a yes state is pushed just to force some cleanup code to be
2947 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2948 it to free the inner regex.
2949
2950 Note that failure backtracking rewinds the cursor position, while
2951 success backtracking leaves it alone.
2952
2953 A pattern is complete when the END op is executed, while a subpattern
2954 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2955 ops trigger the "pop to last yes state if any, otherwise return true"
2956 behaviour.
2957
2958 A common convention in this function is to use A and B to refer to the two
2959 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2960 the subpattern to be matched possibly multiple times, while B is the entire
2961 rest of the pattern. Variable and state names reflect this convention.
2962
2963 The states in the main switch are the union of ops and failure/success of
2964 substates associated with with that op.  For example, IFMATCH is the op
2965 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2966 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2967 successfully matched A and IFMATCH_A_fail is a state saying that we have
2968 just failed to match A. Resume states always come in pairs. The backtrack
2969 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2970 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2971 on success or failure.
2972
2973 The struct that holds a backtracking state is actually a big union, with
2974 one variant for each major type of op. The variable st points to the
2975 top-most backtrack struct. To make the code clearer, within each
2976 block of code we #define ST to alias the relevant union.
2977
2978 Here's a concrete example of a (vastly oversimplified) IFMATCH
2979 implementation:
2980
2981     switch (state) {
2982     ....
2983
2984 #define ST st->u.ifmatch
2985
2986     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2987         ST.foo = ...; // some state we wish to save
2988         ...
2989         // push a yes backtrack state with a resume value of
2990         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2991         // first node of A:
2992         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
2993         // NOTREACHED
2994
2995     case IFMATCH_A: // we have successfully executed A; now continue with B
2996         next = B;
2997         bar = ST.foo; // do something with the preserved value
2998         break;
2999
3000     case IFMATCH_A_fail: // A failed, so the assertion failed
3001         ...;   // do some housekeeping, then ...
3002         sayNO; // propagate the failure
3003
3004 #undef ST
3005
3006     ...
3007     }
3008
3009 For any old-timers reading this who are familiar with the old recursive
3010 approach, the code above is equivalent to:
3011
3012     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3013     {
3014         int foo = ...
3015         ...
3016         if (regmatch(A)) {
3017             next = B;
3018             bar = foo;
3019             break;
3020         }
3021         ...;   // do some housekeeping, then ...
3022         sayNO; // propagate the failure
3023     }
3024
3025 The topmost backtrack state, pointed to by st, is usually free. If you
3026 want to claim it, populate any ST.foo fields in it with values you wish to
3027 save, then do one of
3028
3029         PUSH_STATE_GOTO(resume_state, node, newinput);
3030         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3031
3032 which sets that backtrack state's resume value to 'resume_state', pushes a
3033 new free entry to the top of the backtrack stack, then goes to 'node'.
3034 On backtracking, the free slot is popped, and the saved state becomes the
3035 new free state. An ST.foo field in this new top state can be temporarily
3036 accessed to retrieve values, but once the main loop is re-entered, it
3037 becomes available for reuse.
3038
3039 Note that the depth of the backtrack stack constantly increases during the
3040 left-to-right execution of the pattern, rather than going up and down with
3041 the pattern nesting. For example the stack is at its maximum at Z at the
3042 end of the pattern, rather than at X in the following:
3043
3044     /(((X)+)+)+....(Y)+....Z/
3045
3046 The only exceptions to this are lookahead/behind assertions and the cut,
3047 (?>A), which pop all the backtrack states associated with A before
3048 continuing.
3049
3050 Backtrack state structs are allocated in slabs of about 4K in size.
3051 PL_regmatch_state and st always point to the currently active state,
3052 and PL_regmatch_slab points to the slab currently containing
3053 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3054 allocated, and is never freed until interpreter destruction. When the slab
3055 is full, a new one is allocated and chained to the end. At exit from
3056 regmatch(), slabs allocated since entry are freed.
3057
3058 */
3059
3060
3061 #define DEBUG_STATE_pp(pp)                                  \
3062     DEBUG_STATE_r({                                         \
3063         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3064         PerlIO_printf(Perl_debug_log,                       \
3065             "    %*s"pp" %s%s%s%s%s\n",                     \
3066             depth*2, "",                                    \
3067             PL_reg_name[st->resume_state],                     \
3068             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3069             ((st==yes_state) ? "Y" : ""),                   \
3070             ((st==mark_state) ? "M" : ""),                  \
3071             ((st==yes_state||st==mark_state) ? "]" : "")    \
3072         );                                                  \
3073     });
3074
3075
3076 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3077
3078 #ifdef DEBUGGING
3079
3080 STATIC void
3081 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3082     const char *start, const char *end, const char *blurb)
3083 {
3084     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3085
3086     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3087
3088     if (!PL_colorset)
3089             reginitcolors();
3090     {
3091         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3092             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3093
3094         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3095             start, end - start, 60);
3096
3097         PerlIO_printf(Perl_debug_log,
3098             "%s%s REx%s %s against %s\n",
3099                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3100
3101         if (utf8_target||utf8_pat)
3102             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3103                 utf8_pat ? "pattern" : "",
3104                 utf8_pat && utf8_target ? " and " : "",
3105                 utf8_target ? "string" : ""
3106             );
3107     }
3108 }
3109
3110 STATIC void
3111 S_dump_exec_pos(pTHX_ const char *locinput,
3112                       const regnode *scan,
3113                       const char *loc_regeol,
3114                       const char *loc_bostr,
3115                       const char *loc_reg_starttry,
3116                       const bool utf8_target)
3117 {
3118     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3119     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3120     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3121     /* The part of the string before starttry has one color
3122        (pref0_len chars), between starttry and current
3123        position another one (pref_len - pref0_len chars),
3124        after the current position the third one.
3125        We assume that pref0_len <= pref_len, otherwise we
3126        decrease pref0_len.  */
3127     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3128         ? (5 + taill) - l : locinput - loc_bostr;
3129     int pref0_len;
3130
3131     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3132
3133     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3134         pref_len++;
3135     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3136     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3137         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3138               ? (5 + taill) - pref_len : loc_regeol - locinput);
3139     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3140         l--;
3141     if (pref0_len < 0)
3142         pref0_len = 0;
3143     if (pref0_len > pref_len)
3144         pref0_len = pref_len;
3145     {
3146         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3147
3148         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3149             (locinput - pref_len),pref0_len, 60, 4, 5);
3150
3151         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3152                     (locinput - pref_len + pref0_len),
3153                     pref_len - pref0_len, 60, 2, 3);
3154
3155         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3156                     locinput, loc_regeol - locinput, 10, 0, 1);
3157
3158         const STRLEN tlen=len0+len1+len2;
3159         PerlIO_printf(Perl_debug_log,
3160                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3161                     (IV)(locinput - loc_bostr),
3162                     len0, s0,
3163                     len1, s1,
3164                     (docolor ? "" : "> <"),
3165                     len2, s2,
3166                     (int)(tlen > 19 ? 0 :  19 - tlen),
3167                     "");
3168     }
3169 }
3170
3171 #endif
3172
3173 /* reg_check_named_buff_matched()
3174  * Checks to see if a named buffer has matched. The data array of
3175  * buffer numbers corresponding to the buffer is expected to reside
3176  * in the regexp->data->data array in the slot stored in the ARG() of
3177  * node involved. Note that this routine doesn't actually care about the
3178  * name, that information is not preserved from compilation to execution.
3179  * Returns the index of the leftmost defined buffer with the given name
3180  * or 0 if non of the buffers matched.
3181  */
3182 STATIC I32
3183 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3184 {
3185     I32 n;
3186     RXi_GET_DECL(rex,rexi);
3187     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3188     I32 *nums=(I32*)SvPVX(sv_dat);
3189
3190     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3191
3192     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3193         if ((I32)rex->lastparen >= nums[n] &&
3194             rex->offs[nums[n]].end != -1)
3195         {
3196             return nums[n];
3197         }
3198     }
3199     return 0;
3200 }
3201
3202
3203 /* free all slabs above current one  - called during LEAVE_SCOPE */
3204
3205 STATIC void
3206 S_clear_backtrack_stack(pTHX_ void *p)
3207 {
3208     regmatch_slab *s = PL_regmatch_slab->next;
3209     PERL_UNUSED_ARG(p);
3210
3211     if (!s)
3212         return;
3213     PL_regmatch_slab->next = NULL;
3214     while (s) {
3215         regmatch_slab * const osl = s;
3216         s = s->next;
3217         Safefree(osl);
3218     }
3219 }
3220
3221
3222 /* returns -1 on failure, $+[0] on success */
3223 STATIC I32
3224 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3225 {
3226 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3227     dMY_CXT;
3228 #endif
3229     dVAR;
3230     const bool utf8_target = PL_reg_match_utf8;
3231     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3232     REGEXP *rex_sv = reginfo->prog;
3233     regexp *rex = (struct regexp *)SvANY(rex_sv);
3234     RXi_GET_DECL(rex,rexi);
3235     I32 oldsave;
3236     /* the current state. This is a cached copy of PL_regmatch_state */
3237     regmatch_state *st;
3238     /* cache heavy used fields of st in registers */
3239     regnode *scan;
3240     regnode *next;
3241     U32 n = 0;  /* general value; init to avoid compiler warning */
3242     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3243     char *locinput = startpos;
3244     char *pushinput; /* where to continue after a PUSH */
3245     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3246
3247     bool result = 0;        /* return value of S_regmatch */
3248     int depth = 0;          /* depth of backtrack stack */
3249     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3250     const U32 max_nochange_depth =
3251         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3252         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3253     regmatch_state *yes_state = NULL; /* state to pop to on success of
3254                                                             subpattern */
3255     /* mark_state piggy backs on the yes_state logic so that when we unwind
3256        the stack on success we can update the mark_state as we go */
3257     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3258     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3259     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3260     U32 state_num;
3261     bool no_final = 0;      /* prevent failure from backtracking? */
3262     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3263     char *startpoint = locinput;
3264     SV *popmark = NULL;     /* are we looking for a mark? */
3265     SV *sv_commit = NULL;   /* last mark name seen in failure */
3266     SV *sv_yes_mark = NULL; /* last mark name we have seen
3267                                during a successful match */
3268     U32 lastopen = 0;       /* last open we saw */
3269     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3270     SV* const oreplsv = GvSV(PL_replgv);
3271     /* these three flags are set by various ops to signal information to
3272      * the very next op. They have a useful lifetime of exactly one loop
3273      * iteration, and are not preserved or restored by state pushes/pops
3274      */
3275     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3276     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3277     int logical = 0;        /* the following EVAL is:
3278                                 0: (?{...})
3279                                 1: (?(?{...})X|Y)
3280                                 2: (??{...})
3281                                or the following IFMATCH/UNLESSM is:
3282                                 false: plain (?=foo)
3283                                 true:  used as a condition: (?(?=foo))
3284                             */
3285     PAD* last_pad = NULL;
3286     dMULTICALL;
3287     I32 gimme = G_SCALAR;
3288     CV *caller_cv = NULL;       /* who called us */
3289     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3290     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3291
3292 #ifdef DEBUGGING
3293     GET_RE_DEBUG_FLAGS_DECL;
3294 #endif
3295
3296     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3297     multicall_oldcatch = 0;
3298     multicall_cv = NULL;
3299     cx = NULL;
3300     PERL_UNUSED_VAR(multicall_cop);
3301     PERL_UNUSED_VAR(newsp);
3302
3303
3304     PERL_ARGS_ASSERT_REGMATCH;
3305
3306     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3307             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3308     }));
3309     /* on first ever call to regmatch, allocate first slab */
3310     if (!PL_regmatch_slab) {
3311         Newx(PL_regmatch_slab, 1, regmatch_slab);
3312         PL_regmatch_slab->prev = NULL;
3313         PL_regmatch_slab->next = NULL;
3314         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3315     }
3316
3317     oldsave = PL_savestack_ix;
3318     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3319     SAVEVPTR(PL_regmatch_slab);
3320     SAVEVPTR(PL_regmatch_state);
3321
3322     /* grab next free state slot */
3323     st = ++PL_regmatch_state;
3324     if (st >  SLAB_LAST(PL_regmatch_slab))
3325         st = PL_regmatch_state = S_push_slab(aTHX);
3326
3327     /* Note that nextchr is a byte even in UTF */
3328     SET_nextchr;
3329     scan = prog;
3330     while (scan != NULL) {
3331
3332         DEBUG_EXECUTE_r( {
3333             SV * const prop = sv_newmortal();
3334             regnode *rnext=regnext(scan);
3335             DUMP_EXEC_POS( locinput, scan, utf8_target );
3336             regprop(rex, prop, scan);
3337
3338             PerlIO_printf(Perl_debug_log,
3339                     "%3"IVdf":%*s%s(%"IVdf")\n",
3340                     (IV)(scan - rexi->program), depth*2, "",
3341                     SvPVX_const(prop),
3342                     (PL_regkind[OP(scan)] == END || !rnext) ?
3343                         0 : (IV)(rnext - rexi->program));
3344         });
3345
3346         next = scan + NEXT_OFF(scan);
3347         if (next == scan)
3348             next = NULL;
3349         state_num = OP(scan);
3350
3351       reenter_switch:
3352
3353         SET_nextchr;
3354
3355         switch (state_num) {
3356         case BOL: /*  /^../  */
3357             if (locinput == PL_bostr)
3358             {
3359                 /* reginfo->till = reginfo->bol; */
3360                 break;
3361             }
3362             sayNO;
3363
3364         case MBOL: /*  /^../m  */
3365             if (locinput == PL_bostr ||
3366                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3367             {
3368                 break;
3369             }
3370             sayNO;
3371
3372         case SBOL: /*  /^../s  */
3373             if (locinput == PL_bostr)
3374                 break;
3375             sayNO;
3376
3377         case GPOS: /*  \G  */
3378             if (locinput == reginfo->ganch)
3379                 break;
3380             sayNO;
3381
3382         case KEEPS: /*   \K  */
3383             /* update the startpoint */
3384             st->u.keeper.val = rex->offs[0].start;
3385             rex->offs[0].start = locinput - PL_bostr;
3386             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3387             /*NOT-REACHED*/
3388         case KEEPS_next_fail:
3389             /* rollback the start point change */
3390             rex->offs[0].start = st->u.keeper.val;
3391             sayNO_SILENT;
3392             /*NOT-REACHED*/
3393
3394         case EOL: /* /..$/  */
3395                 goto seol;
3396
3397         case MEOL: /* /..$/m  */
3398             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3399                 sayNO;
3400             break;
3401
3402         case SEOL: /* /..$/s  */
3403           seol:
3404             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3405                 sayNO;
3406             if (PL_regeol - locinput > 1)
3407                 sayNO;
3408             break;
3409
3410         case EOS: /*  \z  */
3411             if (!NEXTCHR_IS_EOS)
3412                 sayNO;
3413             break;
3414
3415         case SANY: /*  /./s  */
3416             if (NEXTCHR_IS_EOS)
3417                 sayNO;
3418             goto increment_locinput;
3419
3420         case CANY: /*  \C  */
3421             if (NEXTCHR_IS_EOS)
3422                 sayNO;
3423             locinput++;
3424             break;
3425
3426         case REG_ANY: /*  /./  */
3427             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3428                 sayNO;
3429             goto increment_locinput;
3430
3431
3432 #undef  ST
3433 #define ST st->u.trie
3434         case TRIEC: /* (ab|cd) with known charclass */
3435             /* In this case the charclass data is available inline so
3436                we can fail fast without a lot of extra overhead.
3437              */
3438             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3439                 DEBUG_EXECUTE_r(
3440                     PerlIO_printf(Perl_debug_log,
3441                               "%*s  %sfailed to match trie start class...%s\n",
3442                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3443                 );
3444                 sayNO_SILENT;
3445                 assert(0); /* NOTREACHED */
3446             }
3447             /* FALL THROUGH */
3448         case TRIE:  /* (ab|cd)  */
3449             /* the basic plan of execution of the trie is:
3450              * At the beginning, run though all the states, and
3451              * find the longest-matching word. Also remember the position
3452              * of the shortest matching word. For example, this pattern:
3453              *    1  2 3 4    5
3454              *    ab|a|x|abcd|abc
3455              * when matched against the string "abcde", will generate
3456              * accept states for all words except 3, with the longest
3457              * matching word being 4, and the shortest being 2 (with
3458              * the position being after char 1 of the string).
3459              *
3460              * Then for each matching word, in word order (i.e. 1,2,4,5),
3461              * we run the remainder of the pattern; on each try setting
3462              * the current position to the character following the word,
3463              * returning to try the next word on failure.
3464              *
3465              * We avoid having to build a list of words at runtime by
3466              * using a compile-time structure, wordinfo[].prev, which
3467              * gives, for each word, the previous accepting word (if any).
3468              * In the case above it would contain the mappings 1->2, 2->0,
3469              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3470              * the longest word (4 above), a list of all words, by
3471              * following the list of prev pointers; this gives us the
3472              * unordered list 4,5,1,2. Then given the current word we have
3473              * just tried, we can go through the list and find the
3474              * next-biggest word to try (so if we just failed on word 2,
3475              * the next in the list is 4).
3476              *
3477              * Since at runtime we don't record the matching position in
3478              * the string for each word, we have to work that out for
3479              * each word we're about to process. The wordinfo table holds
3480              * the character length of each word; given that we recorded
3481              * at the start: the position of the shortest word and its
3482              * length in chars, we just need to move the pointer the
3483              * difference between the two char lengths. Depending on
3484              * Unicode status and folding, that's cheap or expensive.
3485              *
3486              * This algorithm is optimised for the case where are only a
3487              * small number of accept states, i.e. 0,1, or maybe 2.
3488              * With lots of accepts states, and having to try all of them,
3489              * it becomes quadratic on number of accept states to find all
3490              * the next words.
3491              */
3492
3493             {
3494                 /* what type of TRIE am I? (utf8 makes this contextual) */
3495                 DECL_TRIE_TYPE(scan);
3496
3497                 /* what trie are we using right now */
3498                 reg_trie_data * const trie
3499                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3500                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3501                 U32 state = trie->startstate;
3502
3503                 if (   trie->bitmap
3504                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3505                 {
3506                     if (trie->states[ state ].wordnum) {
3507                          DEBUG_EXECUTE_r(
3508                             PerlIO_printf(Perl_debug_log,
3509                                           "%*s  %smatched empty string...%s\n",
3510                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3511                         );
3512                         if (!trie->jump)
3513                             break;
3514                     } else {
3515                         DEBUG_EXECUTE_r(
3516                             PerlIO_printf(Perl_debug_log,
3517                                           "%*s  %sfailed to match trie start class...%s\n",
3518                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3519                         );
3520                         sayNO_SILENT;
3521                    }
3522                 }
3523
3524             {
3525                 U8 *uc = ( U8* )locinput;
3526
3527                 STRLEN len = 0;
3528                 STRLEN foldlen = 0;
3529                 U8 *uscan = (U8*)NULL;
3530                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3531                 U32 charcount = 0; /* how many input chars we have matched */
3532                 U32 accepted = 0; /* have we seen any accepting states? */
3533
3534                 ST.jump = trie->jump;
3535                 ST.me = scan;
3536                 ST.firstpos = NULL;
3537                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3538                 ST.nextword = 0;
3539
3540                 /* fully traverse the TRIE; note the position of the
3541                    shortest accept state and the wordnum of the longest
3542                    accept state */
3543
3544                 while ( state && uc <= (U8*)PL_regeol ) {
3545                     U32 base = trie->states[ state ].trans.base;
3546                     UV uvc = 0;
3547                     U16 charid = 0;
3548                     U16 wordnum;
3549                     wordnum = trie->states[ state ].wordnum;
3550
3551                     if (wordnum) { /* it's an accept state */
3552                         if (!accepted) {
3553                             accepted = 1;
3554                             /* record first match position */
3555                             if (ST.longfold) {
3556                                 ST.firstpos = (U8*)locinput;
3557                                 ST.firstchars = 0;
3558                             }
3559                             else {
3560                                 ST.firstpos = uc;
3561                                 ST.firstchars = charcount;
3562                             }
3563                         }
3564                         if (!ST.nextword || wordnum < ST.nextword)
3565                             ST.nextword = wordnum;
3566                         ST.topword = wordnum;
3567                     }
3568
3569                     DEBUG_TRIE_EXECUTE_r({
3570                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3571                                 PerlIO_printf( Perl_debug_log,
3572                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3573                                     2+depth * 2, "", PL_colors[4],
3574                                     (UV)state, (accepted ? 'Y' : 'N'));
3575                     });
3576
3577                     /* read a char and goto next state */
3578                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3579                         I32 offset;
3580                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3581                                              uscan, len, uvc, charid, foldlen,
3582                                              foldbuf, uniflags);
3583                         charcount++;
3584                         if (foldlen>0)
3585                             ST.longfold = TRUE;
3586                         if (charid &&
3587                              ( ((offset =
3588                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3589
3590                              && ((U32)offset < trie->lasttrans)
3591                              && trie->trans[offset].check == state)
3592                         {
3593                             state = trie->trans[offset].next;
3594                         }
3595                         else {
3596                             state = 0;
3597                         }
3598                         uc += len;
3599
3600                     }
3601                     else {
3602                         state = 0;
3603                     }
3604                     DEBUG_TRIE_EXECUTE_r(
3605                         PerlIO_printf( Perl_debug_log,
3606                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3607                             charid, uvc, (UV)state, PL_colors[5] );
3608                     );
3609                 }
3610                 if (!accepted)
3611                    sayNO;
3612
3613                 /* calculate total number of accept states */
3614                 {
3615                     U16 w = ST.topword;
3616                     accepted = 0;
3617                     while (w) {
3618                         w = trie->wordinfo[w].prev;
3619                         accepted++;
3620                     }
3621                     ST.accepted = accepted;
3622                 }
3623
3624                 DEBUG_EXECUTE_r(
3625                     PerlIO_printf( Perl_debug_log,
3626                         "%*s  %sgot %"IVdf" possible matches%s\n",
3627                         REPORT_CODE_OFF + depth * 2, "",
3628                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3629                 );
3630                 goto trie_first_try; /* jump into the fail handler */
3631             }}
3632             assert(0); /* NOTREACHED */
3633
3634         case TRIE_next_fail: /* we failed - try next alternative */
3635         {
3636             U8 *uc;
3637             if ( ST.jump) {
3638                 REGCP_UNWIND(ST.cp);
3639                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3640             }
3641             if (!--ST.accepted) {
3642                 DEBUG_EXECUTE_r({
3643                     PerlIO_printf( Perl_debug_log,
3644                         "%*s  %sTRIE failed...%s\n",
3645                         REPORT_CODE_OFF+depth*2, "",
3646                         PL_colors[4],
3647                         PL_colors[5] );
3648                 });
3649                 sayNO_SILENT;
3650             }
3651             {
3652                 /* Find next-highest word to process.  Note that this code
3653                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3654                 U16 min = 0;
3655                 U16 word;
3656                 U16 const nextword = ST.nextword;
3657                 reg_trie_wordinfo * const wordinfo
3658                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3659                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3660                     if (word > nextword && (!min || word < min))
3661                         min = word;
3662                 }
3663                 ST.nextword = min;
3664             }
3665
3666           trie_first_try:
3667             if (do_cutgroup) {
3668                 do_cutgroup = 0;
3669                 no_final = 0;
3670             }
3671
3672             if ( ST.jump) {
3673                 ST.lastparen = rex->lastparen;
3674                 ST.lastcloseparen = rex->lastcloseparen;
3675                 REGCP_SET(ST.cp);
3676             }
3677
3678             /* find start char of end of current word */
3679             {
3680                 U32 chars; /* how many chars to skip */
3681                 reg_trie_data * const trie
3682                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3683
3684                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3685                             >=  ST.firstchars);
3686                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3687                             - ST.firstchars;
3688                 uc = ST.firstpos;
3689
3690                 if (ST.longfold) {
3691                     /* the hard option - fold each char in turn and find
3692                      * its folded length (which may be different */
3693                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3694                     STRLEN foldlen;
3695                     STRLEN len;
3696                     UV uvc;
3697                     U8 *uscan;
3698
3699                     while (chars) {
3700                         if (utf8_target) {
3701                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3702                                                     uniflags);
3703                             uc += len;
3704                         }
3705                         else {
3706                             uvc = *uc;
3707                             uc++;
3708                         }
3709                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3710                         uscan = foldbuf;
3711                         while (foldlen) {
3712                             if (!--chars)
3713                                 break;
3714                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3715                                             uniflags);
3716                             uscan += len;
3717                             foldlen -= len;
3718                         }
3719                     }
3720                 }
3721                 else {
3722                     if (utf8_target)
3723                         while (chars--)
3724                             uc += UTF8SKIP(uc);
3725                     else
3726                         uc += chars;
3727                 }
3728             }
3729
3730             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3731                             ? ST.jump[ST.nextword]
3732                             : NEXT_OFF(ST.me));
3733
3734             DEBUG_EXECUTE_r({
3735                 PerlIO_printf( Perl_debug_log,
3736                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3737                     REPORT_CODE_OFF+depth*2, "",
3738                     PL_colors[4],
3739                     ST.nextword,
3740                     PL_colors[5]
3741                     );
3742             });
3743
3744             if (ST.accepted > 1 || has_cutgroup) {
3745                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3746                 assert(0); /* NOTREACHED */
3747             }
3748             /* only one choice left - just continue */
3749             DEBUG_EXECUTE_r({
3750                 AV *const trie_words
3751                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3752                 SV ** const tmp = av_fetch( trie_words,
3753                     ST.nextword-1, 0 );
3754                 SV *sv= tmp ? sv_newmortal() : NULL;
3755
3756                 PerlIO_printf( Perl_debug_log,
3757                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3758                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3759                     ST.nextword,
3760                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3761                             PL_colors[0], PL_colors[1],
3762                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3763                         )
3764                     : "not compiled under -Dr",
3765                     PL_colors[5] );
3766             });
3767
3768             locinput = (char*)uc;
3769             continue; /* execute rest of RE */
3770             assert(0); /* NOTREACHED */
3771         }
3772 #undef  ST
3773
3774         case EXACT: {            /*  /abc/        */
3775             char *s = STRING(scan);
3776             ln = STR_LEN(scan);
3777             if (utf8_target != UTF_PATTERN) {
3778                 /* The target and the pattern have differing utf8ness. */
3779                 char *l = locinput;
3780                 const char * const e = s + ln;
3781
3782                 if (utf8_target) {
3783                     /* The target is utf8, the pattern is not utf8.
3784                      * Above-Latin1 code points can't match the pattern;
3785                      * invariants match exactly, and the other Latin1 ones need
3786                      * to be downgraded to a single byte in order to do the
3787                      * comparison.  (If we could be confident that the target
3788                      * is not malformed, this could be refactored to have fewer
3789                      * tests by just assuming that if the first bytes match, it
3790                      * is an invariant, but there are tests in the test suite
3791                      * dealing with (??{...}) which violate this) */
3792                     while (s < e) {
3793                         if (l >= PL_regeol)
3794                              sayNO;
3795                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
3796                             sayNO;
3797                         }
3798                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
3799                             if (*l != *s) {
3800                                 sayNO;
3801                             }
3802                             l++;
3803                         }
3804                         else {
3805                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
3806                                 sayNO;
3807                             }
3808                             l += 2;
3809                         }
3810                         s++;
3811                     }
3812                 }
3813                 else {
3814                     /* The target is not utf8, the pattern is utf8. */
3815                     while (s < e) {
3816                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
3817                         {
3818                             sayNO;
3819                         }
3820                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
3821                             if (*s != *l) {
3822                                 sayNO;
3823                             }
3824                             s++;
3825                         }
3826                         else {
3827                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
3828                                 sayNO;
3829                             }
3830                             s += 2;
3831                         }
3832                         l++;
3833                     }
3834                 }
3835                 locinput = l;
3836                 break;
3837             }
3838             /* The target and the pattern have the same utf8ness. */
3839             /* Inline the first character, for speed. */
3840             if (UCHARAT(s) != nextchr)
3841                 sayNO;
3842             if (PL_regeol - locinput < ln)
3843                 sayNO;
3844             if (ln > 1 && memNE(s, locinput, ln))
3845                 sayNO;
3846             locinput += ln;
3847             break;
3848             }
3849
3850         case EXACTFL: {          /*  /abc/il      */
3851             re_fold_t folder;
3852             const U8 * fold_array;
3853             const char * s;
3854             U32 fold_utf8_flags;
3855
3856             PL_reg_flags |= RF_tainted;
3857             folder = foldEQ_locale;
3858             fold_array = PL_fold_locale;
3859             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3860             goto do_exactf;
3861
3862         case EXACTFU_SS:         /*  /\x{df}/iu   */
3863         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
3864         case EXACTFU:            /*  /abc/iu      */
3865             folder = foldEQ_latin1;
3866             fold_array = PL_fold_latin1;
3867             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3868             goto do_exactf;
3869
3870         case EXACTFA:            /*  /abc/iaa     */
3871             folder = foldEQ_latin1;
3872             fold_array = PL_fold_latin1;
3873             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3874             goto do_exactf;
3875
3876         case EXACTF:             /*  /abc/i       */
3877             folder = foldEQ;
3878             fold_array = PL_fold;
3879             fold_utf8_flags = 0;
3880
3881           do_exactf:
3882             s = STRING(scan);
3883             ln = STR_LEN(scan);
3884
3885             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3886               /* Either target or the pattern are utf8, or has the issue where
3887                * the fold lengths may differ. */
3888                 const char * const l = locinput;
3889                 char *e = PL_regeol;
3890
3891                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3892                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3893                 {
3894                     sayNO;
3895                 }
3896                 locinput = e;
3897                 break;
3898             }
3899
3900             /* Neither the target nor the pattern are utf8 */
3901             if (UCHARAT(s) != nextchr &&
3902                 UCHARAT(s) != fold_array[nextchr])
3903             {
3904                 sayNO;
3905             }
3906             if (PL_regeol - locinput < ln)
3907                 sayNO;
3908             if (ln > 1 && ! folder(s, locinput, ln))
3909                 sayNO;
3910             locinput += ln;
3911             break;
3912         }
3913
3914         /* XXX Could improve efficiency by separating these all out using a
3915          * macro or in-line function.  At that point regcomp.c would no longer
3916          * have to set the FLAGS fields of these */
3917         case BOUNDL:  /*  /\b/l  */
3918         case NBOUNDL: /*  /\B/l  */
3919             PL_reg_flags |= RF_tainted;
3920             /* FALL THROUGH */
3921         case BOUND:   /*  /\b/   */
3922         case BOUNDU:  /*  /\b/u  */
3923         case BOUNDA:  /*  /\b/a  */
3924         case NBOUND:  /*  /\B/   */
3925         case NBOUNDU: /*  /\B/u  */
3926         case NBOUNDA: /*  /\B/a  */
3927             /* was last char in word? */
3928             if (utf8_target
3929                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3930                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3931             {
3932                 if (locinput == PL_bostr)
3933                     ln = '\n';
3934                 else {
3935                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3936
3937                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3938                 }
3939                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3940                     ln = isALNUM_uni(ln);
3941                     if (NEXTCHR_IS_EOS)
3942                         n = 0;
3943                     else {
3944                         LOAD_UTF8_CHARCLASS_ALNUM();
3945                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
3946                                                                 utf8_target);
3947                     }
3948                 }
3949                 else {
3950                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3951                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
3952                 }
3953             }
3954             else {
3955
3956                 /* Here the string isn't utf8, or is utf8 and only ascii
3957                  * characters are to match \w.  In the latter case looking at
3958                  * the byte just prior to the current one may be just the final
3959                  * byte of a multi-byte character.  This is ok.  There are two
3960                  * cases:
3961                  * 1) it is a single byte character, and then the test is doing
3962                  *      just what it's supposed to.
3963                  * 2) it is a multi-byte character, in which case the final
3964                  *      byte is never mistakable for ASCII, and so the test
3965                  *      will say it is not a word character, which is the
3966                  *      correct answer. */
3967                 ln = (locinput != PL_bostr) ?
3968                     UCHARAT(locinput - 1) : '\n';
3969                 switch (FLAGS(scan)) {
3970                     case REGEX_UNICODE_CHARSET:
3971                         ln = isWORDCHAR_L1(ln);
3972                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
3973                         break;
3974                     case REGEX_LOCALE_CHARSET:
3975                         ln = isALNUM_LC(ln);
3976                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
3977                         break;
3978                     case REGEX_DEPENDS_CHARSET:
3979                         ln = isALNUM(ln);
3980                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
3981                         break;
3982                     case REGEX_ASCII_RESTRICTED_CHARSET:
3983                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3984                         ln = isWORDCHAR_A(ln);
3985                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
3986                         break;
3987                     default:
3988                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3989                         break;
3990                 }
3991             }
3992             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3993              * regcomp.sym */
3994             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3995                     sayNO;
3996             break;
3997
3998         case ANYOFV: /*  /[abx{df}]/i  */
3999         case ANYOF:  /*  /[abc]/       */
4000             if (NEXTCHR_IS_EOS)
4001                 sayNO;
4002             if (utf8_target || state_num == ANYOFV) {
4003                 STRLEN inclasslen = PL_regeol - locinput;
4004                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4005                     sayNO;
4006                 locinput += inclasslen;
4007                 break;
4008             }
4009             else {
4010                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4011                     sayNO;
4012                 locinput++;
4013                 break;
4014             }
4015             break;
4016
4017         /* Special char classes: \d, \w etc.
4018          * The defines start on line 166 or so */
4019         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4020                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4021                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4022                   ALNUMA, NALNUMA, isWORDCHAR_A,
4023                   alnum, "a");
4024
4025         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
4026                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4027                   SPACEU, NSPACEU, isSPACE_L1,
4028                   SPACEA, NSPACEA, isSPACE_A,
4029                   space, " ");
4030
4031         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4032                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4033                 DIGITA, NDIGITA, isDIGIT_A,
4034                 digit, "0");
4035
4036         case POSIXA: /* /[[:ascii:]]/ etc */
4037             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4038                 sayNO;
4039             }
4040             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4041             locinput++;
4042             break;
4043
4044         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4045             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4046                 sayNO;
4047             }
4048             goto increment_locinput;
4049
4050         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4051                        a Unicode extended Grapheme Cluster */
4052             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4053               extended Grapheme Cluster is:
4054
4055                CR LF
4056                | Prepend* Begin Extend*
4057                | .
4058
4059                Begin is:           ( Special_Begin | ! Control )
4060                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4061                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4062                Control is:         [ GCB_Control  CR  LF ]
4063                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4064
4065                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4066                we can rewrite
4067
4068                    Begin is ( Regular_Begin + Special Begin )
4069
4070                It turns out that 98.4% of all Unicode code points match
4071                Regular_Begin.  Doing it this way eliminates a table match in
4072                the previous implementation for almost all Unicode code points.
4073
4074                There is a subtlety with Prepend* which showed up in testing.
4075                Note that the Begin, and only the Begin is required in:
4076                 | Prepend* Begin Extend*
4077                Also, Begin contains '! Control'.  A Prepend must be a
4078                '!  Control', which means it must also be a Begin.  What it
4079                comes down to is that if we match Prepend* and then find no
4080                suitable Begin afterwards, that if we backtrack the last
4081                Prepend, that one will be a suitable Begin.
4082             */
4083
4084             if (NEXTCHR_IS_EOS)
4085                 sayNO;
4086             if  (! utf8_target) {
4087
4088                 /* Match either CR LF  or '.', as all the other possibilities
4089                  * require utf8 */
4090                 locinput++;         /* Match the . or CR */
4091                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4092                                        match the LF */
4093                     && locinput < PL_regeol
4094                     && UCHARAT(locinput) == '\n') locinput++;
4095             }
4096             else {
4097
4098                 /* Utf8: See if is ( CR LF ); already know that locinput <
4099                  * PL_regeol, so locinput+1 is in bounds */
4100                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4101                         && UCHARAT(locinput + 1) == '\n')
4102                 {
4103                     locinput += 2;
4104                 }
4105                 else {
4106                     STRLEN len;
4107
4108                     /* In case have to backtrack to beginning, then match '.' */
4109                     char *starting = locinput;
4110
4111                     /* In case have to backtrack the last prepend */
4112                     char *previous_prepend = 0;
4113
4114                     LOAD_UTF8_CHARCLASS_GCB();
4115
4116                     /* Match (prepend)*   */
4117                     while (locinput < PL_regeol
4118                            && (len = is_GCB_Prepend_utf8(locinput)))
4119                     {
4120                         previous_prepend = locinput;
4121                         locinput += len;
4122                     }
4123
4124                     /* As noted above, if we matched a prepend character, but
4125                      * the next thing won't match, back off the last prepend we
4126                      * matched, as it is guaranteed to match the begin */
4127                     if (previous_prepend
4128                         && (locinput >=  PL_regeol
4129                             || (! swash_fetch(PL_utf8_X_regular_begin,
4130                                              (U8*)locinput, utf8_target)
4131                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4132                         )
4133                     {
4134                         locinput = previous_prepend;
4135                     }
4136
4137                     /* Note that here we know PL_regeol > locinput, as we
4138                      * tested that upon input to this switch case, and if we
4139                      * moved locinput forward, we tested the result just above
4140                      * and it either passed, or we backed off so that it will
4141                      * now pass */
4142                     if (swash_fetch(PL_utf8_X_regular_begin,
4143                                     (U8*)locinput, utf8_target)) {
4144                         locinput += UTF8SKIP(locinput);
4145                     }
4146                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4147
4148                         /* Here did not match the required 'Begin' in the
4149                          * second term.  So just match the very first
4150                          * character, the '.' of the final term of the regex */
4151                         locinput = starting + UTF8SKIP(starting);
4152                         goto exit_utf8;
4153                     } else {
4154
4155                         /* Here is a special begin.  It can be composed of
4156                          * several individual characters.  One possibility is
4157                          * RI+ */
4158                         if ((len = is_GCB_RI_utf8(locinput))) {
4159                             locinput += len;
4160                             while (locinput < PL_regeol
4161                                    && (len = is_GCB_RI_utf8(locinput)))
4162                             {
4163                                 locinput += len;
4164                             }
4165                         } else if ((len = is_GCB_T_utf8(locinput))) {
4166                             /* Another possibility is T+ */
4167                             locinput += len;
4168                             while (locinput < PL_regeol
4169                                 && (len = is_GCB_T_utf8(locinput)))
4170                             {
4171                                 locinput += len;
4172                             }
4173                         } else {
4174
4175                             /* Here, neither RI+ nor T+; must be some other
4176                              * Hangul.  That means it is one of the others: L,
4177                              * LV, LVT or V, and matches:
4178                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4179
4180                             /* Match L*           */
4181                             while (locinput < PL_regeol
4182                                    && (len = is_GCB_L_utf8(locinput)))
4183                             {
4184                                 locinput += len;
4185                             }
4186
4187                             /* Here, have exhausted L*.  If the next character
4188                              * is not an LV, LVT nor V, it means we had to have
4189                              * at least one L, so matches L+ in the original
4190                              * equation, we have a complete hangul syllable.
4191                              * Are done. */
4192
4193                             if (locinput < PL_regeol
4194                                 && is_GCB_LV_LVT_V_utf8(locinput))
4195                             {
4196
4197                                 /* Otherwise keep going.  Must be LV, LVT or V.
4198                                  * See if LVT */
4199                                 if (is_utf8_X_LVT((U8*)locinput)) {
4200                                     locinput += UTF8SKIP(locinput);
4201                                 } else {
4202
4203                                     /* Must be  V or LV.  Take it, then match
4204                                      * V*     */
4205                                     locinput += UTF8SKIP(locinput);
4206                                     while (locinput < PL_regeol
4207                                            && (len = is_GCB_V_utf8(locinput)))
4208                                     {
4209                                         locinput += len;
4210                                     }
4211                                 }
4212
4213                                 /* And any of LV, LVT, or V can be followed
4214                                  * by T*            */
4215                                 while (locinput < PL_regeol
4216                                        && (len = is_GCB_T_utf8(locinput)))
4217                                 {
4218                                     locinput += len;
4219                                 }
4220                             }
4221                         }
4222                     }
4223
4224                     /* Match any extender */
4225                     while (locinput < PL_regeol
4226                             && swash_fetch(PL_utf8_X_extend,
4227                                             (U8*)locinput, utf8_target))
4228                     {
4229                         locinput += UTF8SKIP(locinput);
4230                     }
4231                 }
4232             exit_utf8:
4233                 if (locinput > PL_regeol) sayNO;
4234             }
4235             break;
4236
4237         case NREFFL:  /*  /\g{name}/il  */
4238         {   /* The capture buffer cases.  The ones beginning with N for the
4239                named buffers just convert to the equivalent numbered and
4240                pretend they were called as the corresponding numbered buffer
4241                op.  */
4242             /* don't initialize these in the declaration, it makes C++
4243                unhappy */
4244             char *s;
4245             char type;
4246             re_fold_t folder;
4247             const U8 *fold_array;
4248             UV utf8_fold_flags;
4249
4250             PL_reg_flags |= RF_tainted;
4251             folder = foldEQ_locale;
4252             fold_array = PL_fold_locale;
4253             type = REFFL;
4254             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4255             goto do_nref;
4256
4257         case NREFFA:  /*  /\g{name}/iaa  */
4258             folder = foldEQ_latin1;
4259             fold_array = PL_fold_latin1;
4260             type = REFFA;
4261             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4262             goto do_nref;
4263
4264         case NREFFU:  /*  /\g{name}/iu  */
4265             folder = foldEQ_latin1;
4266             fold_array = PL_fold_latin1;
4267             type = REFFU;
4268             utf8_fold_flags = 0;
4269             goto do_nref;
4270
4271         case NREFF:  /*  /\g{name}/i  */
4272             folder = foldEQ;
4273             fold_array = PL_fold;
4274             type = REFF;
4275             utf8_fold_flags = 0;
4276             goto do_nref;
4277
4278         case NREF:  /*  /\g{name}/   */
4279             type = REF;
4280             folder = NULL;
4281             fold_array = NULL;
4282             utf8_fold_flags = 0;
4283           do_nref:
4284
4285             /* For the named back references, find the corresponding buffer
4286              * number */
4287             n = reg_check_named_buff_matched(rex,scan);
4288
4289             if ( ! n ) {
4290                 sayNO;
4291             }
4292             goto do_nref_ref_common;
4293
4294         case REFFL:  /*  /\1/il  */
4295             PL_reg_flags |= RF_tainted;
4296             folder = foldEQ_locale;
4297             fold_array = PL_fold_locale;
4298             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4299             goto do_ref;
4300
4301         case REFFA:  /*  /\1/iaa  */
4302             folder = foldEQ_latin1;
4303             fold_array = PL_fold_latin1;
4304             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4305             goto do_ref;
4306
4307         case REFFU:  /*  /\1/iu  */
4308             folder = foldEQ_latin1;
4309             fold_array = PL_fold_latin1;
4310             utf8_fold_flags = 0;
4311             goto do_ref;
4312
4313         case REFF:  /*  /\1/i  */
4314             folder = foldEQ;
4315             fold_array = PL_fold;
4316             utf8_fold_flags = 0;
4317             goto do_ref;
4318
4319         case REF:  /*  /\1/    */
4320             folder = NULL;
4321             fold_array = NULL;
4322             utf8_fold_flags = 0;
4323
4324           do_ref:
4325             type = OP(scan);
4326             n = ARG(scan);  /* which paren pair */
4327
4328           do_nref_ref_common:
4329             ln = rex->offs[n].start;
4330             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4331             if (rex->lastparen < n || ln == -1)
4332                 sayNO;                  /* Do not match unless seen CLOSEn. */
4333             if (ln == rex->offs[n].end)
4334                 break;
4335
4336             s = PL_bostr + ln;
4337             if (type != REF     /* REF can do byte comparison */
4338                 && (utf8_target || type == REFFU))
4339             { /* XXX handle REFFL better */
4340                 char * limit = PL_regeol;
4341
4342                 /* This call case insensitively compares the entire buffer
4343                     * at s, with the current input starting at locinput, but
4344                     * not going off the end given by PL_regeol, and returns in
4345                     * limit upon success, how much of the current input was
4346                     * matched */
4347                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4348                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4349                 {
4350                     sayNO;
4351                 }
4352                 locinput = limit;
4353                 break;
4354             }
4355
4356             /* Not utf8:  Inline the first character, for speed. */
4357             if (!NEXTCHR_IS_EOS &&
4358                 UCHARAT(s) != nextchr &&
4359                 (type == REF ||
4360                  UCHARAT(s) != fold_array[nextchr]))
4361                 sayNO;
4362             ln = rex->offs[n].end - ln;
4363             if (locinput + ln > PL_regeol)
4364                 sayNO;
4365             if (ln > 1 && (type == REF
4366                            ? memNE(s, locinput, ln)
4367                            : ! folder(s, locinput, ln)))
4368                 sayNO;
4369             locinput += ln;
4370             break;
4371         }
4372
4373         case NOTHING: /* null op; e.g. the 'nothing' following
4374                        * the '*' in m{(a+|b)*}' */
4375             break;
4376         case TAIL: /* placeholder while compiling (A|B|C) */
4377             break;
4378
4379         case BACK: /* ??? doesn't appear to be used ??? */
4380             break;
4381
4382 #undef  ST
4383 #define ST st->u.eval
4384         {
4385             SV *ret;
4386             REGEXP *re_sv;
4387             regexp *re;
4388             regexp_internal *rei;
4389             regnode *startpoint;
4390
4391         case GOSTART: /*  (?R)  */
4392         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4393             if (cur_eval && cur_eval->locinput==locinput) {
4394                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4395                     Perl_croak(aTHX_ "Infinite recursion in regex");
4396                 if ( ++nochange_depth > max_nochange_depth )
4397                     Perl_croak(aTHX_
4398                         "Pattern subroutine nesting without pos change"
4399                         " exceeded limit in regex");
4400             } else {
4401                 nochange_depth = 0;
4402             }
4403             re_sv = rex_sv;
4404             re = rex;
4405             rei = rexi;
4406             if (OP(scan)==GOSUB) {
4407                 startpoint = scan + ARG2L(scan);
4408                 ST.close_paren = ARG(scan);
4409             } else {
4410                 startpoint = rei->program+1;
4411                 ST.close_paren = 0;
4412             }
4413             goto eval_recurse_doit;
4414             assert(0); /* NOTREACHED */
4415
4416         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4417             if (cur_eval && cur_eval->locinput==locinput) {
4418                 if ( ++nochange_depth > max_nochange_depth )
4419                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4420             } else {
4421                 nochange_depth = 0;
4422             }
4423             {
4424                 /* execute the code in the {...} */
4425
4426                 dSP;
4427                 SV ** before;
4428                 OP * const oop = PL_op;
4429                 COP * const ocurcop = PL_curcop;
4430                 OP *nop;
4431                 char *saved_regeol = PL_regeol;
4432                 struct re_save_state saved_state;
4433                 CV *newcv;
4434
4435                 /* save *all* paren positions */
4436                 regcppush(rex, 0);
4437                 REGCP_SET(runops_cp);
4438
4439                 /* To not corrupt the existing regex state while executing the
4440                  * eval we would normally put it on the save stack, like with
4441                  * save_re_context. However, re-evals have a weird scoping so we
4442                  * can't just add ENTER/LEAVE here. With that, things like
4443                  *
4444                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4445                  *
4446                  * would break, as they expect the localisation to be unwound
4447                  * only when the re-engine backtracks through the bit that
4448                  * localised it.
4449                  *
4450                  * What we do instead is just saving the state in a local c
4451                  * variable.
4452                  */
4453                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4454
4455                 PL_reg_state.re_reparsing = FALSE;
4456
4457                 if (!caller_cv)
4458                     caller_cv = find_runcv(NULL);
4459
4460                 n = ARG(scan);
4461
4462                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4463                     newcv = ((struct regexp *)SvANY(
4464                                                 (REGEXP*)(rexi->data->data[n])
4465                                             ))->qr_anoncv
4466                                         ;
4467                     nop = (OP*)rexi->data->data[n+1];
4468                 }
4469                 else if (rexi->data->what[n] == 'l') { /* literal code */
4470                     newcv = caller_cv;
4471                     nop = (OP*)rexi->data->data[n];
4472                     assert(CvDEPTH(newcv));
4473                 }
4474                 else {
4475                     /* literal with own CV */
4476                     assert(rexi->data->what[n] == 'L');
4477                     newcv = rex->qr_anoncv;
4478                     nop = (OP*)rexi->data->data[n];
4479                 }
4480
4481                 /* normally if we're about to execute code from the same
4482                  * CV that we used previously, we just use the existing
4483                  * CX stack entry. However, its possible that in the
4484                  * meantime we may have backtracked, popped from the save
4485                  * stack, and undone the SAVECOMPPAD(s) associated with
4486                  * PUSH_MULTICALL; in which case PL_comppad no longer
4487                  * points to newcv's pad. */
4488                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4489                 {
4490                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4491                     if (last_pushed_cv) {
4492                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4493                     }
4494                     else {
4495                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4496                     }
4497                     last_pushed_cv = newcv;
4498                 }
4499                 last_pad = PL_comppad;
4500
4501                 /* the initial nextstate you would normally execute
4502                  * at the start of an eval (which would cause error
4503                  * messages to come from the eval), may be optimised
4504                  * away from the execution path in the regex code blocks;
4505                  * so manually set PL_curcop to it initially */
4506                 {
4507                     OP *o = cUNOPx(nop)->op_first;
4508                     assert(o->op_type == OP_NULL);
4509                     if (o->op_targ == OP_SCOPE) {
4510                         o = cUNOPo->op_first;
4511                     }
4512                     else {
4513                         assert(o->op_targ == OP_LEAVE);
4514                         o = cUNOPo->op_first;
4515                         assert(o->op_type == OP_ENTER);
4516                         o = o->op_sibling;
4517                     }
4518
4519                     if (o->op_type != OP_STUB) {
4520                         assert(    o->op_type == OP_NEXTSTATE
4521                                 || o->op_type == OP_DBSTATE
4522                                 || (o->op_type == OP_NULL
4523                                     &&  (  o->op_targ == OP_NEXTSTATE
4524                                         || o->op_targ == OP_DBSTATE
4525                                         )
4526                                     )
4527                         );
4528                         PL_curcop = (COP*)o;
4529                     }
4530                 }
4531                 nop = nop->op_next;
4532
4533                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4534                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4535
4536                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4537
4538                 if (sv_yes_mark) {
4539                     SV *sv_mrk = get_sv("REGMARK", 1);
4540                     sv_setsv(sv_mrk, sv_yes_mark);
4541                 }
4542
4543                 /* we don't use MULTICALL here as we want to call the
4544                  * first op of the block of interest, rather than the
4545                  * first op of the sub */
4546                 before = SP;
4547                 PL_op = nop;
4548                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4549                 SPAGAIN;
4550                 if (SP == before)
4551                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4552                 else {
4553                     ret = POPs;
4554                     PUTBACK;
4555                 }
4556
4557                 /* before restoring everything, evaluate the returned
4558                  * value, so that 'uninit' warnings don't use the wrong
4559                  * PL_op or pad. Also need to process any magic vars
4560                  * (e.g. $1) *before* parentheses are restored */
4561
4562                 PL_op = NULL;
4563
4564                 re_sv = NULL;
4565                 if (logical == 0)        /*   (?{})/   */
4566                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4567                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4568                     sw = cBOOL(SvTRUE(ret));
4569                     logical = 0;
4570                 }
4571                 else {                   /*  /(??{})  */
4572                     /*  if its overloaded, let the regex compiler handle
4573                      *  it; otherwise extract regex, or stringify  */
4574                     if (!SvAMAGIC(ret)) {
4575                         SV *sv = ret;
4576                         if (SvROK(sv))
4577                             sv = SvRV(sv);
4578                         if (SvTYPE(sv) == SVt_REGEXP)
4579                             re_sv = (REGEXP*) sv;
4580                         else if (SvSMAGICAL(sv)) {
4581                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4582                             if (mg)
4583                                 re_sv = (REGEXP *) mg->mg_obj;
4584                         }
4585
4586                         /* force any magic, undef warnings here */
4587                         if (!re_sv) {
4588                             ret = sv_mortalcopy(ret);
4589                             (void) SvPV_force_nolen(ret);
4590                         }
4591                     }
4592
4593                 }
4594
4595                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4596
4597                 /* *** Note that at this point we don't restore
4598                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4599                  * be used again soon. This is safe as long as nothing
4600                  * in the regexp code uses the pad ! */
4601                 PL_op = oop;
4602                 PL_curcop = ocurcop;
4603                 PL_regeol = saved_regeol;
4604                 S_regcp_restore(aTHX_ rex, runops_cp);
4605
4606                 if (logical != 2)
4607                     break;
4608             }
4609
4610                 /* only /(??{})/  from now on */
4611                 logical = 0;
4612                 {
4613                     /* extract RE object from returned value; compiling if
4614                      * necessary */
4615
4616                     if (re_sv) {
4617                         re_sv = reg_temp_copy(NULL, re_sv);
4618                     }
4619                     else {
4620                         U32 pm_flags = 0;
4621                         const I32 osize = PL_regsize;
4622
4623                         if (SvUTF8(ret) && IN_BYTES) {
4624                             /* In use 'bytes': make a copy of the octet
4625                              * sequence, but without the flag on */
4626                             STRLEN len;
4627                             const char *const p = SvPV(ret, len);
4628                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4629                         }
4630                         if (rex->intflags & PREGf_USE_RE_EVAL)
4631                             pm_flags |= PMf_USE_RE_EVAL;
4632
4633                         /* if we got here, it should be an engine which
4634                          * supports compiling code blocks and stuff */
4635                         assert(rex->engine && rex->engine->op_comp);
4636                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4637                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4638                                     rex->engine, NULL, NULL,
4639                                     /* copy /msix etc to inner pattern */
4640                                     scan->flags,
4641                                     pm_flags);
4642
4643                         if (!(SvFLAGS(ret)
4644                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4645                                  | SVs_GMG))) {
4646                             /* This isn't a first class regexp. Instead, it's
4647                                caching a regexp onto an existing, Perl visible
4648                                scalar.  */
4649                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4650                         }
4651                         PL_regsize = osize;
4652                         /* safe to do now that any $1 etc has been
4653                          * interpolated into the new pattern string and
4654                          * compiled */
4655                         S_regcp_restore(aTHX_ rex, runops_cp);
4656                     }
4657                     re = (struct regexp *)SvANY(re_sv);
4658                 }
4659                 RXp_MATCH_COPIED_off(re);
4660                 re->subbeg = rex->subbeg;
4661                 re->sublen = rex->sublen;
4662                 re->suboffset = rex->suboffset;
4663                 re->subcoffset = rex->subcoffset;
4664                 rei = RXi_GET(re);
4665                 DEBUG_EXECUTE_r(
4666                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4667                         "Matching embedded");
4668                 );
4669                 startpoint = rei->program + 1;
4670                 ST.close_paren = 0; /* only used for GOSUB */
4671
4672         eval_recurse_doit: /* Share code with GOSUB below this line */
4673                 /* run the pattern returned from (??{...}) */
4674                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4675                 REGCP_SET(ST.lastcp);
4676
4677                 re->lastparen = 0;
4678                 re->lastcloseparen = 0;
4679
4680                 PL_regsize = 0;
4681
4682                 /* XXXX This is too dramatic a measure... */
4683                 PL_reg_maxiter = 0;
4684
4685                 ST.toggle_reg_flags = PL_reg_flags;
4686                 if (RX_UTF8(re_sv))
4687                     PL_reg_flags |= RF_utf8;
4688                 else
4689                     PL_reg_flags &= ~RF_utf8;
4690                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4691
4692                 ST.prev_rex = rex_sv;
4693                 ST.prev_curlyx = cur_curlyx;
4694                 rex_sv = re_sv;
4695                 SET_reg_curpm(rex_sv);
4696                 rex = re;
4697                 rexi = rei;
4698                 cur_curlyx = NULL;
4699                 ST.B = next;
4700                 ST.prev_eval = cur_eval;
4701                 cur_eval = st;
4702                 /* now continue from first node in postoned RE */
4703                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
4704                 assert(0); /* NOTREACHED */
4705         }
4706
4707         case EVAL_AB: /* cleanup after a successful (??{A})B */
4708             /* note: this is called twice; first after popping B, then A */
4709             PL_reg_flags ^= ST.toggle_reg_flags;
4710             rex_sv = ST.prev_rex;
4711             SET_reg_curpm(rex_sv);
4712             rex = (struct regexp *)SvANY(rex_sv);
4713             rexi = RXi_GET(rex);
4714             regcpblow(ST.cp);
4715             cur_eval = ST.prev_eval;
4716             cur_curlyx = ST.prev_curlyx;
4717
4718             /* XXXX This is too dramatic a measure... */
4719             PL_reg_maxiter = 0;
4720             if ( nochange_depth )
4721                 nochange_depth--;
4722             sayYES;
4723
4724
4725         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4726             /* note: this is called twice; first after popping B, then A */
4727             PL_reg_flags ^= ST.toggle_reg_flags;
4728             rex_sv = ST.prev_rex;
4729             SET_reg_curpm(rex_sv);
4730             rex = (struct regexp *)SvANY(rex_sv);
4731             rexi = RXi_GET(rex);
4732
4733             REGCP_UNWIND(ST.lastcp);
4734             regcppop(rex);
4735             cur_eval = ST.prev_eval;
4736             cur_curlyx = ST.prev_curlyx;
4737             /* XXXX This is too dramatic a measure... */
4738             PL_reg_maxiter = 0;
4739             if ( nochange_depth )
4740                 nochange_depth--;
4741             sayNO_SILENT;
4742 #undef ST
4743
4744         case OPEN: /*  (  */
4745             n = ARG(scan);  /* which paren pair */
4746             rex->offs[n].start_tmp = locinput - PL_bostr;
4747             if (n > PL_regsize)
4748                 PL_regsize = n;
4749             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4750                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4751                 PTR2UV(rex),
4752                 PTR2UV(rex->offs),
4753                 (UV)n,
4754                 (IV)rex->offs[n].start_tmp,
4755                 (UV)PL_regsize
4756             ));
4757             lastopen = n;
4758             break;
4759
4760 /* XXX really need to log other places start/end are set too */
4761 #define CLOSE_CAPTURE \
4762     rex->offs[n].start = rex->offs[n].start_tmp; \
4763     rex->offs[n].end = locinput - PL_bostr; \
4764     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4765         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4766         PTR2UV(rex), \
4767         PTR2UV(rex->offs), \
4768         (UV)n, \
4769         (IV)rex->offs[n].start, \
4770         (IV)rex->offs[n].end \
4771     ))
4772
4773         case CLOSE:  /*  )  */
4774             n = ARG(scan);  /* which paren pair */
4775             CLOSE_CAPTURE;
4776             /*if (n > PL_regsize)
4777                 PL_regsize = n;*/
4778             if (n > rex->lastparen)
4779                 rex->lastparen = n;
4780             rex->lastcloseparen = n;
4781             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4782                 goto fake_end;
4783             }
4784             break;
4785
4786         case ACCEPT:  /*  (*ACCEPT)  */
4787             if (ARG(scan)){
4788                 regnode *cursor;
4789                 for (cursor=scan;
4790                      cursor && OP(cursor)!=END;
4791                      cursor=regnext(cursor))
4792                 {
4793                     if ( OP(cursor)==CLOSE ){
4794                         n = ARG(cursor);
4795                         if ( n <= lastopen ) {
4796                             CLOSE_CAPTURE;
4797                             /*if (n > PL_regsize)
4798                             PL_regsize = n;*/
4799                             if (n > rex->lastparen)
4800                                 rex->lastparen = n;
4801                             rex->lastcloseparen = n;
4802                             if ( n == ARG(scan) || (cur_eval &&
4803                                 cur_eval->u.eval.close_paren == n))
4804                                 break;
4805                         }
4806                     }
4807                 }
4808             }
4809             goto fake_end;
4810             /*NOTREACHED*/
4811
4812         case GROUPP:  /*  (?(1))  */
4813             n = ARG(scan);  /* which paren pair */
4814             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4815             break;
4816
4817         case NGROUPP:  /*  (?(<name>))  */
4818             /* reg_check_named_buff_matched returns 0 for no match */
4819             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4820             break;
4821
4822         case INSUBP:   /*  (?(R))  */
4823             n = ARG(scan);
4824             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4825             break;
4826
4827         case DEFINEP:  /*  (?(DEFINE))  */
4828             sw = 0;
4829             break;
4830
4831         case IFTHEN:   /*  (?(cond)A|B)  */
4832             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4833             if (sw)
4834                 next = NEXTOPER(NEXTOPER(scan));
4835             else {
4836                 next = scan + ARG(scan);
4837                 if (OP(next) == IFTHEN) /* Fake one. */
4838                     next = NEXTOPER(NEXTOPER(next));
4839             }
4840             break;
4841
4842         case LOGICAL:  /* modifier for EVAL and IFMATCH */
4843             logical = scan->flags;
4844             break;
4845
4846 /*******************************************************************
4847
4848 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4849 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4850 STAR/PLUS/CURLY/CURLYN are used instead.)
4851
4852 A*B is compiled as <CURLYX><A><WHILEM><B>
4853
4854 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4855 state, which contains the current count, initialised to -1. It also sets
4856 cur_curlyx to point to this state, with any previous value saved in the
4857 state block.
4858
4859 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4860 since the pattern may possibly match zero times (i.e. it's a while {} loop
4861 rather than a do {} while loop).
4862
4863 Each entry to WHILEM represents a successful match of A. The count in the
4864 CURLYX block is incremented, another WHILEM state is pushed, and execution
4865 passes to A or B depending on greediness and the current count.
4866
4867 For example, if matching against the string a1a2a3b (where the aN are
4868 substrings that match /A/), then the match progresses as follows: (the
4869 pushed states are interspersed with the bits of strings matched so far):
4870
4871     <CURLYX cnt=-1>
4872     <CURLYX cnt=0><WHILEM>
4873     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4874     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4875     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4876     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4877
4878 (Contrast this with something like CURLYM, which maintains only a single
4879 backtrack state:
4880
4881     <CURLYM cnt=0> a1
4882     a1 <CURLYM cnt=1> a2
4883     a1 a2 <CURLYM cnt=2> a3
4884     a1 a2 a3 <CURLYM cnt=3> b
4885 )
4886
4887 Each WHILEM state block marks a point to backtrack to upon partial failure
4888 of A or B, and also contains some minor state data related to that
4889 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4890 overall state, such as the count, and pointers to the A and B ops.
4891
4892 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4893 must always point to the *current* CURLYX block, the rules are:
4894
4895 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4896 and set cur_curlyx to point the new block.
4897
4898 When popping the CURLYX block after a successful or unsuccessful match,
4899 restore the previous cur_curlyx.
4900
4901 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4902 to the outer one saved in the CURLYX block.
4903
4904 When popping the WHILEM block after a successful or unsuccessful B match,
4905 restore the previous cur_curlyx.
4906
4907 Here's an example for the pattern (AI* BI)*BO
4908 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4909
4910 cur_
4911 curlyx backtrack stack
4912 ------ ---------------
4913 NULL
4914 CO     <CO prev=NULL> <WO>
4915 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4916 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4917 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4918
4919 At this point the pattern succeeds, and we work back down the stack to
4920 clean up, restoring as we go:
4921
4922 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4923 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4924 CO     <CO prev=NULL> <WO>
4925 NULL
4926
4927 *******************************************************************/
4928
4929 #define ST st->u.curlyx
4930
4931         case CURLYX:    /* start of /A*B/  (for complex A) */
4932         {
4933             /* No need to save/restore up to this paren */
4934             I32 parenfloor = scan->flags;
4935
4936             assert(next); /* keep Coverity happy */
4937             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4938                 next += ARG(next);
4939
4940             /* XXXX Probably it is better to teach regpush to support
4941                parenfloor > PL_regsize... */
4942             if (parenfloor > (I32)rex->lastparen)
4943                 parenfloor = rex->lastparen; /* Pessimization... */
4944
4945             ST.prev_curlyx= cur_curlyx;
4946             cur_curlyx = st;
4947             ST.cp = PL_savestack_ix;
4948
4949             /* these fields contain the state of the current curly.
4950              * they are accessed by subsequent WHILEMs */
4951             ST.parenfloor = parenfloor;
4952             ST.me = scan;
4953             ST.B = next;
4954             ST.minmod = minmod;
4955             minmod = 0;
4956             ST.count = -1;      /* this will be updated by WHILEM */
4957             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4958
4959             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
4960             assert(0); /* NOTREACHED */
4961         }
4962
4963         case CURLYX_end: /* just finished matching all of A*B */
4964             cur_curlyx = ST.prev_curlyx;
4965             sayYES;
4966             assert(0); /* NOTREACHED */
4967
4968         case CURLYX_end_fail: /* just failed to match all of A*B */
4969             regcpblow(ST.cp);
4970             cur_curlyx = ST.prev_curlyx;
4971             sayNO;
4972             assert(0); /* NOTREACHED */
4973
4974
4975 #undef ST
4976 #define ST st->u.whilem
4977
4978         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4979         {
4980             /* see the discussion above about CURLYX/WHILEM */
4981             I32 n;
4982             int min = ARG1(cur_curlyx->u.curlyx.me);
4983             int max = ARG2(cur_curlyx->u.curlyx.me);
4984             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4985
4986             assert(cur_curlyx); /* keep Coverity happy */
4987             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4988             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4989             ST.cache_offset = 0;
4990             ST.cache_mask = 0;
4991
4992
4993             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4994                   "%*s  whilem: matched %ld out of %d..%d\n",
4995                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4996             );
4997
4998             /* First just match a string of min A's. */
4999
5000             if (n < min) {
5001                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5002                 cur_curlyx->u.curlyx.lastloc = locinput;
5003                 REGCP_SET(ST.lastcp);
5004
5005                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5006                 assert(0); /* NOTREACHED */
5007             }
5008
5009             /* If degenerate A matches "", assume A done. */
5010
5011             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5012                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5013                    "%*s  whilem: empty match detected, trying continuation...\n",
5014                    REPORT_CODE_OFF+depth*2, "")
5015                 );
5016                 goto do_whilem_B_max;
5017             }
5018
5019             /* super-linear cache processing */
5020
5021             if (scan->flags) {
5022
5023                 if (!PL_reg_maxiter) {
5024                     /* start the countdown: Postpone detection until we
5025                      * know the match is not *that* much linear. */
5026                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5027                     /* possible overflow for long strings and many CURLYX's */
5028                     if (PL_reg_maxiter < 0)
5029                         PL_reg_maxiter = I32_MAX;
5030                     PL_reg_leftiter = PL_reg_maxiter;
5031                 }
5032
5033                 if (PL_reg_leftiter-- == 0) {
5034                     /* initialise cache */
5035                     const I32 size = (PL_reg_maxiter + 7)/8;
5036                     if (PL_reg_poscache) {
5037                         if ((I32)PL_reg_poscache_size < size) {
5038                             Renew(PL_reg_poscache, size, char);
5039                             PL_reg_poscache_size = size;
5040                         }
5041                         Zero(PL_reg_poscache, size, char);
5042                     }
5043                     else {
5044                         PL_reg_poscache_size = size;
5045                         Newxz(PL_reg_poscache, size, char);
5046                     }
5047                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5048       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5049                               PL_colors[4], PL_colors[5])
5050                     );
5051                 }
5052
5053                 if (PL_reg_leftiter < 0) {
5054                     /* have we already failed at this position? */
5055                     I32 offset, mask;
5056                     offset  = (scan->flags & 0xf) - 1
5057                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5058                     mask    = 1 << (offset % 8);
5059                     offset /= 8;
5060                     if (PL_reg_poscache[offset] & mask) {
5061                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5062                             "%*s  whilem: (cache) already tried at this position...\n",
5063                             REPORT_CODE_OFF+depth*2, "")
5064                         );
5065                         sayNO; /* cache records failure */
5066                     }
5067                     ST.cache_offset = offset;
5068                     ST.cache_mask   = mask;
5069                 }
5070             }
5071
5072             /* Prefer B over A for minimal matching. */
5073
5074             if (cur_curlyx->u.curlyx.minmod) {
5075                 ST.save_curlyx = cur_curlyx;
5076                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5077                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5078                 REGCP_SET(ST.lastcp);
5079                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5080                                     locinput);
5081                 assert(0); /* NOTREACHED */
5082             }
5083
5084             /* Prefer A over B for maximal matching. */
5085
5086             if (n < max) { /* More greed allowed? */
5087                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5088                 cur_curlyx->u.curlyx.lastloc = locinput;
5089                 REGCP_SET(ST.lastcp);
5090                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5091                 assert(0); /* NOTREACHED */
5092             }
5093             goto do_whilem_B_max;
5094         }
5095         assert(0); /* NOTREACHED */
5096
5097         case WHILEM_B_min: /* just matched B in a minimal match */
5098         case WHILEM_B_max: /* just matched B in a maximal match */
5099             cur_curlyx = ST.save_curlyx;
5100             sayYES;
5101             assert(0); /* NOTREACHED */
5102
5103         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5104             cur_curlyx = ST.save_curlyx;
5105             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5106             cur_curlyx->u.curlyx.count--;
5107             CACHEsayNO;
5108             assert(0); /* NOTREACHED */
5109
5110         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5111             /* FALL THROUGH */
5112         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5113             REGCP_UNWIND(ST.lastcp);
5114             regcppop(rex);
5115             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5116             cur_curlyx->u.curlyx.count--;
5117             CACHEsayNO;
5118             assert(0); /* NOTREACHED */
5119
5120         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5121             REGCP_UNWIND(ST.lastcp);
5122             regcppop(rex);      /* Restore some previous $<digit>s? */
5123             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5124                 "%*s  whilem: failed, trying continuation...\n",
5125                 REPORT_CODE_OFF+depth*2, "")
5126             );
5127           do_whilem_B_max:
5128             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5129                 && ckWARN(WARN_REGEXP)
5130                 && !(PL_reg_flags & RF_warned))
5131             {
5132                 PL_reg_flags |= RF_warned;
5133                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5134                      "Complex regular subexpression recursion limit (%d) "
5135                      "exceeded",
5136                      REG_INFTY - 1);
5137             }
5138
5139             /* now try B */
5140             ST.save_curlyx = cur_curlyx;
5141             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5142             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5143                                 locinput);
5144             assert(0); /* NOTREACHED */
5145
5146         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5147             cur_curlyx = ST.save_curlyx;
5148             REGCP_UNWIND(ST.lastcp);
5149             regcppop(rex);
5150
5151             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5152                 /* Maximum greed exceeded */
5153                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5154                     && ckWARN(WARN_REGEXP)
5155                     && !(PL_reg_flags & RF_warned))
5156                 {
5157                     PL_reg_flags |= RF_warned;
5158                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5159                         "Complex regular subexpression recursion "
5160                         "limit (%d) exceeded",
5161                         REG_INFTY - 1);
5162                 }
5163                 cur_curlyx->u.curlyx.count--;
5164                 CACHEsayNO;
5165             }
5166
5167             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5168                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5169             );
5170             /* Try grabbing another A and see if it helps. */
5171             cur_curlyx->u.curlyx.lastloc = locinput;
5172             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5173             REGCP_SET(ST.lastcp);
5174             PUSH_STATE_GOTO(WHILEM_A_min,
5175                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5176                 locinput);
5177             assert(0); /* NOTREACHED */
5178
5179 #undef  ST
5180 #define ST st->u.branch
5181
5182         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5183             next = scan + ARG(scan);
5184             if (next == scan)
5185                 next = NULL;
5186             scan = NEXTOPER(scan);
5187             /* FALL THROUGH */
5188
5189         case BRANCH:        /*  /(...|A|...)/ */
5190             scan = NEXTOPER(scan); /* scan now points to inner node */
5191             ST.lastparen = rex->lastparen;
5192             ST.lastcloseparen = rex->lastcloseparen;
5193             ST.next_branch = next;
5194             REGCP_SET(ST.cp);
5195
5196             /* Now go into the branch */
5197             if (has_cutgroup) {
5198                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5199             } else {
5200                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5201             }
5202             assert(0); /* NOTREACHED */
5203
5204         case CUTGROUP:  /*  /(*THEN)/  */
5205             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5206                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5207             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5208             assert(0); /* NOTREACHED */
5209
5210         case CUTGROUP_next_fail:
5211             do_cutgroup = 1;
5212             no_final = 1;
5213             if (st->u.mark.mark_name)
5214                 sv_commit = st->u.mark.mark_name;
5215             sayNO;
5216             assert(0); /* NOTREACHED */
5217
5218         case BRANCH_next:
5219             sayYES;
5220             assert(0); /* NOTREACHED */
5221
5222         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5223             if (do_cutgroup) {
5224                 do_cutgroup = 0;
5225                 no_final = 0;
5226             }
5227             REGCP_UNWIND(ST.cp);
5228             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5229             scan = ST.next_branch;
5230             /* no more branches? */
5231             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5232                 DEBUG_EXECUTE_r({
5233                     PerlIO_printf( Perl_debug_log,
5234                         "%*s  %sBRANCH failed...%s\n",
5235                         REPORT_CODE_OFF+depth*2, "",
5236                         PL_colors[4],
5237                         PL_colors[5] );
5238                 });
5239                 sayNO_SILENT;
5240             }
5241             continue; /* execute next BRANCH[J] op */
5242             assert(0); /* NOTREACHED */
5243
5244         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5245             minmod = 1;
5246             break;
5247
5248 #undef  ST
5249 #define ST st->u.curlym
5250
5251         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5252
5253             /* This is an optimisation of CURLYX that enables us to push
5254              * only a single backtracking state, no matter how many matches
5255              * there are in {m,n}. It relies on the pattern being constant
5256              * length, with no parens to influence future backrefs
5257              */
5258
5259             ST.me = scan;
5260             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5261
5262             ST.lastparen      = rex->lastparen;
5263             ST.lastcloseparen = rex->lastcloseparen;
5264
5265             /* if paren positive, emulate an OPEN/CLOSE around A */
5266             if (ST.me->flags) {
5267                 U32 paren = ST.me->flags;
5268                 if (paren > PL_regsize)
5269                     PL_regsize = paren;
5270                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5271             }
5272             ST.A = scan;
5273             ST.B = next;
5274             ST.alen = 0;
5275             ST.count = 0;
5276             ST.minmod = minmod;
5277             minmod = 0;
5278             ST.c1 = CHRTEST_UNINIT;
5279             REGCP_SET(ST.cp);
5280
5281             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5282                 goto curlym_do_B;
5283
5284           curlym_do_A: /* execute the A in /A{m,n}B/  */
5285             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5286             assert(0); /* NOTREACHED */
5287
5288         case CURLYM_A: /* we've just matched an A */
5289             ST.count++;
5290             /* after first match, determine A's length: u.curlym.alen */
5291             if (ST.count == 1) {
5292                 if (PL_reg_match_utf8) {
5293                     char *s = st->locinput;
5294                     while (s < locinput) {
5295                         ST.alen++;
5296                         s += UTF8SKIP(s);
5297                     }
5298                 }
5299                 else {
5300                     ST.alen = locinput - st->locinput;
5301                 }
5302                 if (ST.alen == 0)
5303                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5304             }
5305             DEBUG_EXECUTE_r(
5306                 PerlIO_printf(Perl_debug_log,
5307                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5308                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5309                           (IV) ST.count, (IV)ST.alen)
5310             );
5311
5312             if (cur_eval && cur_eval->u.eval.close_paren &&
5313                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5314                 goto fake_end;
5315
5316             {
5317                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5318                 if ( max == REG_INFTY || ST.count < max )
5319                     goto curlym_do_A; /* try to match another A */
5320             }
5321             goto curlym_do_B; /* try to match B */
5322
5323         case CURLYM_A_fail: /* just failed to match an A */
5324             REGCP_UNWIND(ST.cp);
5325
5326             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5327                 || (cur_eval && cur_eval->u.eval.close_paren &&
5328                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5329                 sayNO;
5330
5331           curlym_do_B: /* execute the B in /A{m,n}B/  */
5332             if (ST.c1 == CHRTEST_UNINIT) {
5333                 /* calculate c1 and c2 for possible match of 1st char
5334                  * following curly */
5335                 ST.c1 = ST.c2 = CHRTEST_VOID;
5336                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5337                     regnode *text_node = ST.B;
5338                     if (! HAS_TEXT(text_node))
5339                         FIND_NEXT_IMPT(text_node);
5340                     /* this used to be
5341
5342                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5343
5344                         But the former is redundant in light of the latter.
5345
5346                         if this changes back then the macro for
5347                         IS_TEXT and friends need to change.
5348                      */
5349                     if (PL_regkind[OP(text_node)] == EXACT)
5350                     {
5351
5352                         ST.c1 = (U8)*STRING(text_node);
5353                         switch (OP(text_node)) {
5354                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5355                             case EXACTFA:
5356                             case EXACTFU_SS:
5357                             case EXACTFU_TRICKYFOLD:
5358                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5359                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5360                             default: ST.c2 = ST.c1;
5361                         }
5362                     }
5363                 }
5364             }
5365
5366             DEBUG_EXECUTE_r(
5367                 PerlIO_printf(Perl_debug_log,
5368                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5369                     (int)(REPORT_CODE_OFF+(depth*2)),
5370                     "", (IV)ST.count)
5371                 );
5372             if (       !NEXTCHR_IS_EOS
5373                     && ST.c1 != CHRTEST_VOID
5374                     && nextchr != ST.c1
5375                     && nextchr != ST.c2)
5376             {
5377                 /* simulate B failing */
5378                 DEBUG_OPTIMISE_r(
5379                     PerlIO_printf(Perl_debug_log,
5380                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5381                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5382                         (IV)ST.c1,(IV)ST.c2
5383                 ));
5384                 state_num = CURLYM_B_fail;
5385                 goto reenter_switch;
5386             }
5387
5388             if (ST.me->flags) {
5389                 /* emulate CLOSE: mark current A as captured */
5390                 I32 paren = ST.me->flags;
5391                 if (ST.count) {
5392                     rex->offs[paren].start
5393                         = HOPc(locinput, -ST.alen) - PL_bostr;
5394                     rex->offs[paren].end = locinput - PL_bostr;
5395                     if ((U32)paren > rex->lastparen)
5396                         rex->lastparen = paren;
5397                     rex->lastcloseparen = paren;
5398                 }
5399                 else
5400                     rex->offs[paren].end = -1;
5401                 if (cur_eval && cur_eval->u.eval.close_paren &&
5402                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5403                 {
5404                     if (ST.count)
5405                         goto fake_end;
5406                     else
5407                         sayNO;
5408                 }
5409             }
5410
5411             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5412             assert(0); /* NOTREACHED */
5413
5414         case CURLYM_B_fail: /* just failed to match a B */
5415             REGCP_UNWIND(ST.cp);
5416             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5417             if (ST.minmod) {
5418                 I32 max = ARG2(ST.me);
5419                 if (max != REG_INFTY && ST.count == max)
5420                     sayNO;
5421                 goto curlym_do_A; /* try to match a further A */
5422             }
5423             /* backtrack one A */
5424             if (ST.count == ARG1(ST.me) /* min */)
5425                 sayNO;
5426             ST.count--;
5427             SET_locinput(HOPc(locinput, -ST.alen));
5428             goto curlym_do_B; /* try to match B */
5429
5430 #undef ST
5431 #define ST st->u.curly
5432
5433 #define CURLY_SETPAREN(paren, success) \
5434     if (paren) { \
5435         if (success) { \
5436             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5437             rex->offs[paren].end = locinput - PL_bostr; \
5438             if (paren > rex->lastparen) \
5439                 rex->lastparen = paren; \
5440             rex->lastcloseparen = paren; \
5441         } \
5442         else { \
5443             rex->offs[paren].end = -1; \
5444             rex->lastparen      = ST.lastparen; \
5445             rex->lastcloseparen = ST.lastcloseparen; \
5446         } \
5447     }
5448
5449         case STAR:              /*  /A*B/ where A is width 1 */
5450             ST.paren = 0;
5451             ST.min = 0;
5452             ST.max = REG_INFTY;
5453             scan = NEXTOPER(scan);
5454             goto repeat;
5455
5456         case PLUS:              /*  /A+B/ where A is width 1 */
5457             ST.paren = 0;
5458             ST.min = 1;
5459             ST.max = REG_INFTY;
5460             scan = NEXTOPER(scan);
5461             goto repeat;
5462
5463         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5464             ST.paren = scan->flags;     /* Which paren to set */
5465             ST.lastparen      = rex->lastparen;
5466             ST.lastcloseparen = rex->lastcloseparen;
5467             if (ST.paren > PL_regsize)
5468                 PL_regsize = ST.paren;
5469             ST.min = ARG1(scan);  /* min to match */
5470             ST.max = ARG2(scan);  /* max to match */
5471             if (cur_eval && cur_eval->u.eval.close_paren &&
5472                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5473                 ST.min=1;
5474                 ST.max=1;
5475             }
5476             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5477             goto repeat;
5478
5479         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5480             ST.paren = 0;
5481             ST.min = ARG1(scan);  /* min to match */
5482             ST.max = ARG2(scan);  /* max to match */
5483             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5484           repeat:
5485             /*
5486             * Lookahead to avoid useless match attempts
5487             * when we know what character comes next.
5488             *
5489             * Used to only do .*x and .*?x, but now it allows
5490             * for )'s, ('s and (?{ ... })'s to be in the way
5491             * of the quantifier and the EXACT-like node.  -- japhy
5492             */
5493
5494             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5495                 sayNO;
5496             if (HAS_TEXT(next) || JUMPABLE(next)) {
5497                 U8 *s;
5498                 regnode *text_node = next;
5499
5500                 if (! HAS_TEXT(text_node))
5501                     FIND_NEXT_IMPT(text_node);
5502
5503                 if (! HAS_TEXT(text_node))
5504                     ST.c1 = ST.c2 = CHRTEST_VOID;
5505                 else {
5506                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5507                         ST.c1 = ST.c2 = CHRTEST_VOID;
5508                         goto assume_ok_easy;
5509                     }
5510                     else
5511                         s = (U8*)STRING(text_node);
5512
5513                     /*  Currently we only get here when
5514
5515                         PL_rekind[OP(text_node)] == EXACT
5516
5517                         if this changes back then the macro for IS_TEXT and
5518                         friends need to change. */
5519                     if (!UTF_PATTERN) {
5520                         ST.c1 = *s;
5521                         switch (OP(text_node)) {
5522                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5523                             case EXACTFA:
5524                             case EXACTFU_SS:
5525                             case EXACTFU_TRICKYFOLD:
5526                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5527                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5528                             default: ST.c2 = ST.c1; break;
5529                         }
5530                     }
5531                     else { /* UTF_PATTERN */
5532                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5533                              STRLEN ulen;
5534                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5535
5536                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5537                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5538                                                     uniflags);
5539                         }
5540                         else {
5541                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5542                                                      uniflags);
5543                         }
5544                     }
5545                 }
5546             }
5547             else
5548                 ST.c1 = ST.c2 = CHRTEST_VOID;
5549         assume_ok_easy:
5550
5551             ST.A = scan;
5552             ST.B = next;
5553             if (minmod) {
5554                 char *li = locinput;
5555                 minmod = 0;
5556                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5557                     sayNO;
5558                 SET_locinput(li);
5559                 ST.count = ST.min;
5560                 REGCP_SET(ST.cp);
5561                 if (ST.c1 == CHRTEST_VOID)
5562                     goto curly_try_B_min;
5563
5564                 ST.oldloc = locinput;
5565
5566                 /* set ST.maxpos to the furthest point along the
5567                  * string that could possibly match */
5568                 if  (ST.max == REG_INFTY) {
5569                     ST.maxpos = PL_regeol - 1;
5570                     if (utf8_target)
5571                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5572                             ST.maxpos--;
5573                 }
5574                 else if (utf8_target) {
5575                     int m = ST.max - ST.min;
5576                     for (ST.maxpos = locinput;
5577                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5578                         ST.maxpos += UTF8SKIP(ST.maxpos);
5579                 }
5580                 else {
5581                     ST.maxpos = locinput + ST.max - ST.min;
5582                     if (ST.maxpos >= PL_regeol)
5583                         ST.maxpos = PL_regeol - 1;
5584                 }
5585                 goto curly_try_B_min_known;
5586
5587             }
5588             else {
5589                 /* avoid taking address of locinput, so it can remain
5590                  * a register var */
5591                 char *li = locinput;
5592                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5593                 if (ST.count < ST.min)
5594                     sayNO;
5595                 SET_locinput(li);
5596                 if ((ST.count > ST.min)
5597                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5598                 {
5599                     /* A{m,n} must come at the end of the string, there's
5600                      * no point in backing off ... */
5601                     ST.min = ST.count;
5602                     /* ...except that $ and \Z can match before *and* after
5603                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5604                        We may back off by one in this case. */
5605                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5606                         ST.min--;
5607                 }
5608                 REGCP_SET(ST.cp);
5609                 goto curly_try_B_max;
5610             }
5611             assert(0); /* NOTREACHED */
5612
5613
5614         case CURLY_B_min_known_fail:
5615             /* failed to find B in a non-greedy match where c1,c2 valid */
5616
5617             REGCP_UNWIND(ST.cp);
5618             if (ST.paren) {
5619                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5620             }
5621             /* Couldn't or didn't -- move forward. */
5622             ST.oldloc = locinput;
5623             if (utf8_target)
5624                 locinput += UTF8SKIP(locinput);
5625             else
5626                 locinput++;
5627             ST.count++;
5628           curly_try_B_min_known:
5629              /* find the next place where 'B' could work, then call B */
5630             {
5631                 int n;
5632                 if (utf8_target) {
5633                     n = (ST.oldloc == locinput) ? 0 : 1;
5634                     if (ST.c1 == ST.c2) {
5635                         STRLEN len;
5636                         /* set n to utf8_distance(oldloc, locinput) */
5637                         while (locinput <= ST.maxpos &&
5638                                utf8n_to_uvchr((U8*)locinput,
5639                                               UTF8_MAXBYTES, &len,
5640                                               uniflags) != (UV)ST.c1) {
5641                             locinput += len;
5642                             n++;
5643                         }
5644                     }
5645                     else {
5646                         /* set n to utf8_distance(oldloc, locinput) */
5647                         while (locinput <= ST.maxpos) {
5648                             STRLEN len;
5649                             const UV c = utf8n_to_uvchr((U8*)locinput,
5650                                                   UTF8_MAXBYTES, &len,
5651                                                   uniflags);
5652                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5653                                 break;
5654                             locinput += len;
5655                             n++;
5656                         }
5657                     }
5658                 }
5659                 else {
5660                     if (ST.c1 == ST.c2) {
5661                         while (locinput <= ST.maxpos &&
5662                                UCHARAT(locinput) != ST.c1)
5663                             locinput++;
5664                     }
5665                     else {
5666                         while (locinput <= ST.maxpos
5667                                && UCHARAT(locinput) != ST.c1
5668                                && UCHARAT(locinput) != ST.c2)
5669                             locinput++;
5670                     }
5671                     n = locinput - ST.oldloc;
5672                 }
5673                 if (locinput > ST.maxpos)
5674                     sayNO;
5675                 if (n) {
5676                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5677                      * at b; check that everything between oldloc and
5678                      * locinput matches */
5679                     char *li = ST.oldloc;
5680                     ST.count += n;
5681                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
5682                         sayNO;
5683                     assert(n == REG_INFTY || locinput == li);
5684                 }
5685                 CURLY_SETPAREN(ST.paren, ST.count);
5686                 if (cur_eval && cur_eval->u.eval.close_paren &&
5687                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5688                     goto fake_end;
5689                 }
5690                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5691             }
5692             assert(0); /* NOTREACHED */
5693
5694
5695         case CURLY_B_min_fail:
5696             /* failed to find B in a non-greedy match where c1,c2 invalid */
5697
5698             REGCP_UNWIND(ST.cp);
5699             if (ST.paren) {
5700                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5701             }
5702             /* failed -- move forward one */
5703             {
5704                 char *li = locinput;
5705                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5706                     sayNO;
5707                 }
5708                 locinput = li;
5709             }
5710             {
5711                 ST.count++;
5712                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5713                         ST.count > 0)) /* count overflow ? */
5714                 {
5715                   curly_try_B_min:
5716                     CURLY_SETPAREN(ST.paren, ST.count);
5717                     if (cur_eval && cur_eval->u.eval.close_paren &&
5718                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5719                         goto fake_end;
5720                     }
5721                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
5722                 }
5723             }
5724             assert(0); /* NOTREACHED */
5725
5726
5727         curly_try_B_max:
5728             /* a successful greedy match: now try to match B */
5729             if (cur_eval && cur_eval->u.eval.close_paren &&
5730                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5731                 goto fake_end;
5732             }
5733             {
5734                 UV c = 0;
5735                 if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
5736                     c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
5737                                            UTF8_MAXBYTES, 0, uniflags)
5738                                 : (UV) UCHARAT(locinput);
5739                 /* If it could work, try it. */
5740                 if (ST.c1 == CHRTEST_VOID
5741                     || (locinput < PL_regeol &&
5742                         (c == (UV)ST.c1 || c == (UV)ST.c2)))
5743                 {
5744                     CURLY_SETPAREN(ST.paren, ST.count);
5745                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
5746                     assert(0); /* NOTREACHED */
5747                 }
5748             }
5749             /* FALL THROUGH */
5750
5751         case CURLY_B_max_fail:
5752             /* failed to find B in a greedy match */
5753
5754             REGCP_UNWIND(ST.cp);
5755             if (ST.paren) {
5756                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5757             }
5758             /*  back up. */
5759             if (--ST.count < ST.min)
5760                 sayNO;
5761             locinput = HOPc(locinput, -1);
5762             goto curly_try_B_max;
5763
5764 #undef ST
5765
5766         case END: /*  last op of main pattern  */
5767             fake_end:
5768             if (cur_eval) {
5769                 /* we've just finished A in /(??{A})B/; now continue with B */
5770                 st->u.eval.toggle_reg_flags
5771                             = cur_eval->u.eval.toggle_reg_flags;
5772                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5773
5774                 st->u.eval.prev_rex = rex_sv;           /* inner */
5775                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5776                 rex_sv = cur_eval->u.eval.prev_rex;
5777                 SET_reg_curpm(rex_sv);
5778                 rex = (struct regexp *)SvANY(rex_sv);
5779                 rexi = RXi_GET(rex);
5780                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5781
5782                 REGCP_SET(st->u.eval.lastcp);
5783
5784                 /* Restore parens of the outer rex without popping the
5785                  * savestack */
5786                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5787
5788                 st->u.eval.prev_eval = cur_eval;
5789                 cur_eval = cur_eval->u.eval.prev_eval;
5790                 DEBUG_EXECUTE_r(
5791                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5792                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5793                 if ( nochange_depth )
5794                     nochange_depth--;
5795
5796                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5797                                     locinput); /* match B */
5798             }
5799
5800             if (locinput < reginfo->till) {
5801                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5802                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5803                                       PL_colors[4],
5804                                       (long)(locinput - PL_reg_starttry),
5805                                       (long)(reginfo->till - PL_reg_starttry),
5806                                       PL_colors[5]));
5807
5808                 sayNO_SILENT;           /* Cannot match: too short. */
5809             }
5810             sayYES;                     /* Success! */
5811
5812         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5813             DEBUG_EXECUTE_r(
5814             PerlIO_printf(Perl_debug_log,
5815                 "%*s  %ssubpattern success...%s\n",
5816                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5817             sayYES;                     /* Success! */
5818
5819 #undef  ST
5820 #define ST st->u.ifmatch
5821
5822         {
5823             char *newstart;
5824
5825         case SUSPEND:   /* (?>A) */
5826             ST.wanted = 1;
5827             newstart = locinput;
5828             goto do_ifmatch;
5829
5830         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5831             ST.wanted = 0;
5832             goto ifmatch_trivial_fail_test;
5833
5834         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5835             ST.wanted = 1;
5836           ifmatch_trivial_fail_test:
5837             if (scan->flags) {
5838                 char * const s = HOPBACKc(locinput, scan->flags);
5839                 if (!s) {
5840                     /* trivial fail */
5841                     if (logical) {
5842                         logical = 0;
5843                         sw = 1 - cBOOL(ST.wanted);
5844                     }
5845                     else if (ST.wanted)
5846                         sayNO;
5847                     next = scan + ARG(scan);
5848                     if (next == scan)
5849                         next = NULL;
5850                     break;
5851                 }
5852                 newstart = s;
5853             }
5854             else
5855                 newstart = locinput;
5856
5857           do_ifmatch:
5858             ST.me = scan;
5859             ST.logical = logical;
5860             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5861
5862             /* execute body of (?...A) */
5863             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
5864             assert(0); /* NOTREACHED */
5865         }
5866
5867         case IFMATCH_A_fail: /* body of (?...A) failed */
5868             ST.wanted = !ST.wanted;
5869             /* FALL THROUGH */
5870
5871         case IFMATCH_A: /* body of (?...A) succeeded */
5872             if (ST.logical) {
5873                 sw = cBOOL(ST.wanted);
5874             }
5875             else if (!ST.wanted)
5876                 sayNO;
5877
5878             if (OP(ST.me) != SUSPEND) {
5879                 /* restore old position except for (?>...) */
5880                 locinput = st->locinput;
5881             }
5882             scan = ST.me + ARG(ST.me);
5883             if (scan == ST.me)
5884                 scan = NULL;
5885             continue; /* execute B */
5886
5887 #undef ST
5888
5889         case LONGJMP: /*  alternative with many branches compiles to
5890                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
5891             next = scan + ARG(scan);
5892             if (next == scan)
5893                 next = NULL;
5894             break;
5895
5896         case COMMIT:  /*  (*COMMIT)  */
5897             reginfo->cutpoint = PL_regeol;
5898             /* FALLTHROUGH */
5899
5900         case PRUNE:   /*  (*PRUNE)   */
5901             if (!scan->flags)
5902                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5903             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
5904             assert(0); /* NOTREACHED */
5905
5906         case COMMIT_next_fail:
5907             no_final = 1;
5908             /* FALLTHROUGH */
5909
5910         case OPFAIL:   /* (*FAIL)  */
5911             sayNO;
5912             assert(0); /* NOTREACHED */
5913
5914 #define ST st->u.mark
5915         case MARKPOINT: /*  (*MARK:foo)  */
5916             ST.prev_mark = mark_state;
5917             ST.mark_name = sv_commit = sv_yes_mark
5918                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5919             mark_state = st;
5920             ST.mark_loc = locinput;
5921             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
5922             assert(0); /* NOTREACHED */
5923
5924         case MARKPOINT_next:
5925             mark_state = ST.prev_mark;
5926             sayYES;
5927             assert(0); /* NOTREACHED */
5928
5929         case MARKPOINT_next_fail:
5930             if (popmark && sv_eq(ST.mark_name,popmark))
5931             {
5932                 if (ST.mark_loc > startpoint)
5933                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5934                 popmark = NULL; /* we found our mark */
5935                 sv_commit = ST.mark_name;
5936
5937                 DEBUG_EXECUTE_r({
5938                         PerlIO_printf(Perl_debug_log,
5939                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5940                             REPORT_CODE_OFF+depth*2, "",
5941                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5942                 });
5943             }
5944             mark_state = ST.prev_mark;
5945             sv_yes_mark = mark_state ?
5946                 mark_state->u.mark.mark_name : NULL;
5947             sayNO;
5948             assert(0); /* NOTREACHED */
5949
5950         case SKIP:  /*  (*SKIP)  */
5951             if (scan->flags) {
5952                 /* (*SKIP) : if we fail we cut here*/
5953                 ST.mark_name = NULL;
5954                 ST.mark_loc = locinput;
5955                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5956             } else {
5957                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5958                    otherwise do nothing.  Meaning we need to scan
5959                  */
5960                 regmatch_state *cur = mark_state;
5961                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5962
5963                 while (cur) {
5964                     if ( sv_eq( cur->u.mark.mark_name,
5965                                 find ) )
5966                     {
5967                         ST.mark_name = find;
5968                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
5969                     }
5970                     cur = cur->u.mark.prev_mark;
5971                 }
5972             }
5973             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5974             break;
5975
5976         case SKIP_next_fail:
5977             if (ST.mark_name) {
5978                 /* (*CUT:NAME) - Set up to search for the name as we
5979                    collapse the stack*/
5980                 popmark = ST.mark_name;
5981             } else {
5982                 /* (*CUT) - No name, we cut here.*/
5983                 if (ST.mark_loc > startpoint)
5984                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5985                 /* but we set sv_commit to latest mark_name if there
5986                    is one so they can test to see how things lead to this
5987                    cut */
5988                 if (mark_state)
5989                     sv_commit=mark_state->u.mark.mark_name;
5990             }
5991             no_final = 1;
5992             sayNO;
5993             assert(0); /* NOTREACHED */
5994 #undef ST
5995
5996         case LNBREAK: /* \R */
5997             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
5998                 locinput += n;
5999             } else
6000                 sayNO;
6001             break;
6002
6003 #define CASE_CLASS(nAmE)                              \
6004         case nAmE:                                    \
6005             if (NEXTCHR_IS_EOS)                       \
6006                 sayNO;                                \
6007             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6008                 locinput += n;                        \
6009             } else                                    \
6010                 sayNO;                                \
6011             break;                                    \
6012         case N##nAmE:                                 \
6013             if (NEXTCHR_IS_EOS)                       \
6014                 sayNO;                                \
6015             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6016                 sayNO;                                \
6017             } else {                                  \
6018                 locinput += UTF8SKIP(locinput);       \
6019             }                                         \
6020             break
6021
6022         CASE_CLASS(VERTWS);  /*  \v \V  */
6023         CASE_CLASS(HORIZWS); /*  \h \H  */
6024 #undef CASE_CLASS
6025
6026         default:
6027             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6028                           PTR2UV(scan), OP(scan));
6029             Perl_croak(aTHX_ "regexp memory corruption");
6030
6031         /* this is a point to jump to in order to increment
6032          * locinput by one character */
6033         increment_locinput:
6034             if (utf8_target) {
6035                 locinput += PL_utf8skip[nextchr];
6036                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6037                 if (locinput > PL_regeol)
6038                     sayNO;
6039             }
6040             else
6041                 locinput++;
6042             break;
6043
6044         } /* end switch */
6045
6046         /* switch break jumps here */
6047         scan = next; /* prepare to execute the next op and ... */
6048         continue;    /* ... jump back to the top, reusing st */
6049         assert(0); /* NOTREACHED */
6050
6051       push_yes_state:
6052         /* push a state that backtracks on success */
6053         st->u.yes.prev_yes_state = yes_state;
6054         yes_state = st;
6055         /* FALL THROUGH */
6056       push_state:
6057         /* push a new regex state, then continue at scan  */
6058         {
6059             regmatch_state *newst;
6060
6061             DEBUG_STACK_r({
6062                 regmatch_state *cur = st;
6063                 regmatch_state *curyes = yes_state;
6064                 int curd = depth;
6065                 regmatch_slab *slab = PL_regmatch_slab;
6066                 for (;curd > -1;cur--,curd--) {
6067                     if (cur < SLAB_FIRST(slab)) {
6068                         slab = slab->prev;
6069                         cur = SLAB_LAST(slab);
6070                     }
6071                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6072                         REPORT_CODE_OFF + 2 + depth * 2,"",
6073                         curd, PL_reg_name[cur->resume_state],
6074                         (curyes == cur) ? "yes" : ""
6075                     );
6076                     if (curyes == cur)
6077                         curyes = cur->u.yes.prev_yes_state;
6078                 }
6079             } else
6080                 DEBUG_STATE_pp("push")
6081             );
6082             depth++;
6083             st->locinput = locinput;
6084             newst = st+1;
6085             if (newst >  SLAB_LAST(PL_regmatch_slab))
6086                 newst = S_push_slab(aTHX);
6087             PL_regmatch_state = newst;
6088
6089             locinput = pushinput;
6090             st = newst;
6091             continue;
6092             assert(0); /* NOTREACHED */
6093         }
6094     }
6095
6096     /*
6097     * We get here only if there's trouble -- normally "case END" is
6098     * the terminating point.
6099     */
6100     Perl_croak(aTHX_ "corrupted regexp pointers");
6101     /*NOTREACHED*/
6102     sayNO;
6103
6104 yes:
6105     if (yes_state) {
6106         /* we have successfully completed a subexpression, but we must now
6107          * pop to the state marked by yes_state and continue from there */
6108         assert(st != yes_state);
6109 #ifdef DEBUGGING
6110         while (st != yes_state) {
6111             st--;
6112             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6113                 PL_regmatch_slab = PL_regmatch_slab->prev;
6114                 st = SLAB_LAST(PL_regmatch_slab);
6115             }
6116             DEBUG_STATE_r({
6117                 if (no_final) {
6118                     DEBUG_STATE_pp("pop (no final)");
6119                 } else {
6120                     DEBUG_STATE_pp("pop (yes)");
6121                 }
6122             });
6123             depth--;
6124         }
6125 #else
6126         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6127             || yes_state > SLAB_LAST(PL_regmatch_slab))
6128         {
6129             /* not in this slab, pop slab */
6130             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6131             PL_regmatch_slab = PL_regmatch_slab->prev;
6132             st = SLAB_LAST(PL_regmatch_slab);
6133         }
6134         depth -= (st - yes_state);
6135 #endif
6136         st = yes_state;
6137         yes_state = st->u.yes.prev_yes_state;
6138         PL_regmatch_state = st;
6139
6140         if (no_final)
6141             locinput= st->locinput;
6142         state_num = st->resume_state + no_final;
6143         goto reenter_switch;
6144     }
6145
6146     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6147                           PL_colors[4], PL_colors[5]));
6148
6149     if (PL_reg_state.re_state_eval_setup_done) {
6150         /* each successfully executed (?{...}) block does the equivalent of
6151          *   local $^R = do {...}
6152          * When popping the save stack, all these locals would be undone;
6153          * bypass this by setting the outermost saved $^R to the latest
6154          * value */
6155         if (oreplsv != GvSV(PL_replgv))
6156             sv_setsv(oreplsv, GvSV(PL_replgv));
6157     }
6158     result = 1;
6159     goto final_exit;
6160
6161 no:
6162     DEBUG_EXECUTE_r(
6163         PerlIO_printf(Perl_debug_log,
6164             "%*s  %sfailed...%s\n",
6165             REPORT_CODE_OFF+depth*2, "",
6166             PL_colors[4], PL_colors[5])
6167         );
6168
6169 no_silent:
6170     if (no_final) {
6171         if (yes_state) {
6172             goto yes;
6173         } else {
6174             goto final_exit;
6175         }
6176     }
6177     if (depth) {
6178         /* there's a previous state to backtrack to */
6179         st--;
6180         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6181             PL_regmatch_slab = PL_regmatch_slab->prev;
6182             st = SLAB_LAST(PL_regmatch_slab);
6183         }
6184         PL_regmatch_state = st;
6185         locinput= st->locinput;
6186
6187         DEBUG_STATE_pp("pop");
6188         depth--;
6189         if (yes_state == st)
6190             yes_state = st->u.yes.prev_yes_state;
6191
6192         state_num = st->resume_state + 1; /* failure = success + 1 */
6193         goto reenter_switch;
6194     }
6195     result = 0;
6196
6197   final_exit:
6198     if (rex->intflags & PREGf_VERBARG_SEEN) {
6199         SV *sv_err = get_sv("REGERROR", 1);
6200         SV *sv_mrk = get_sv("REGMARK", 1);
6201         if (result) {
6202             sv_commit = &PL_sv_no;
6203             if (!sv_yes_mark)
6204                 sv_yes_mark = &PL_sv_yes;
6205         } else {
6206             if (!sv_commit)
6207                 sv_commit = &PL_sv_yes;
6208             sv_yes_mark = &PL_sv_no;
6209         }
6210         sv_setsv(sv_err, sv_commit);
6211         sv_setsv(sv_mrk, sv_yes_mark);
6212     }
6213
6214
6215     if (last_pushed_cv) {
6216         dSP;
6217         POP_MULTICALL;
6218         PERL_UNUSED_VAR(SP);
6219     }
6220
6221     /* clean up; in particular, free all slabs above current one */
6222     LEAVE_SCOPE(oldsave);
6223
6224     assert(!result ||  locinput - PL_bostr >= 0);
6225     return result ?  locinput - PL_bostr : -1;
6226 }
6227
6228 /*
6229  - regrepeat - repeatedly match something simple, report how many
6230  *
6231  * startposp - pointer a pointer to the start position.  This is updated
6232  *             to point to the byte following the highest successful
6233  *             match.
6234  * p         - the regnode to be repeatedly matched against.
6235  * max       - maximum number of characters to match.
6236  * depth     - (for debugging) backtracking depth.
6237  */
6238 STATIC I32
6239 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6240 {
6241     dVAR;
6242     char *scan;
6243     I32 c;
6244     char *loceol = PL_regeol;
6245     I32 hardcount = 0;
6246     bool utf8_target = PL_reg_match_utf8;
6247     UV utf8_flags;
6248 #ifndef DEBUGGING
6249     PERL_UNUSED_ARG(depth);
6250 #endif
6251
6252     PERL_ARGS_ASSERT_REGREPEAT;
6253
6254     scan = *startposp;
6255     if (max == REG_INFTY)
6256         max = I32_MAX;
6257     else if (max < loceol - scan)
6258         loceol = scan + max;
6259     switch (OP(p)) {
6260     case REG_ANY:
6261         if (utf8_target) {
6262             loceol = PL_regeol;
6263             while (scan < loceol && hardcount < max && *scan != '\n') {
6264                 scan += UTF8SKIP(scan);
6265                 hardcount++;
6266             }
6267         } else {
6268             while (scan < loceol && *scan != '\n')
6269                 scan++;
6270         }
6271         break;
6272     case SANY:
6273         if (utf8_target) {
6274             loceol = PL_regeol;
6275             while (scan < loceol && hardcount < max) {
6276                 scan += UTF8SKIP(scan);
6277                 hardcount++;
6278             }
6279         }
6280         else
6281             scan = loceol;
6282         break;
6283     case CANY:
6284         scan = loceol;
6285         break;
6286     case EXACT:
6287         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6288          * means they match only characters in the string that can be expressed
6289          * as a single byte.  For non-utf8 strings, that means a simple match.
6290          * For utf8 strings, the character matched must be an invariant, or
6291          * downgradable to a single byte.  The pattern's utf8ness is
6292          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6293          * it is, it's an invariant */
6294
6295         c = (U8)*STRING(p);
6296         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6297
6298         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6299             while (scan < loceol && UCHARAT(scan) == c) {
6300                 scan++;
6301             }
6302         }
6303         else {
6304
6305             /* Here, the string is utf8, and the pattern char is different
6306              * in utf8 than not, so can't compare them directly.  Outside the
6307              * loop, find the two utf8 bytes that represent c, and then
6308              * look for those in sequence in the utf8 string */
6309             U8 high = UTF8_TWO_BYTE_HI(c);
6310             U8 low = UTF8_TWO_BYTE_LO(c);
6311             loceol = PL_regeol;
6312
6313             while (hardcount < max
6314                     && scan + 1 < loceol
6315                     && UCHARAT(scan) == high
6316                     && UCHARAT(scan + 1) == low)
6317             {
6318                 scan += 2;
6319                 hardcount++;
6320             }
6321         }
6322         break;
6323     case EXACTFA:
6324         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6325         goto do_exactf;
6326
6327     case EXACTFL:
6328         PL_reg_flags |= RF_tainted;
6329         utf8_flags = FOLDEQ_UTF8_LOCALE;
6330         goto do_exactf;
6331
6332     case EXACTF:
6333             utf8_flags = 0;
6334             goto do_exactf;
6335
6336     case EXACTFU_SS:
6337     case EXACTFU_TRICKYFOLD:
6338     case EXACTFU:
6339         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6340
6341         /* The comments for the EXACT case above apply as well to these fold
6342          * ones */
6343
6344     do_exactf:
6345         c = (U8)*STRING(p);
6346         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6347
6348         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6349             char *tmpeol = loceol;
6350             while (hardcount < max
6351                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6352                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6353             {
6354                 scan = tmpeol;
6355                 tmpeol = loceol;
6356                 hardcount++;
6357             }
6358
6359             /* XXX Note that the above handles properly the German sharp s in
6360              * the pattern matching ss in the string.  But it doesn't handle
6361              * properly cases where the string contains say 'LIGATURE ff' and
6362              * the pattern is 'f+'.  This would require, say, a new function or
6363              * revised interface to foldEQ_utf8(), in which the maximum number
6364              * of characters to match could be passed and it would return how
6365              * many actually did.  This is just one of many cases where
6366              * multi-char folds don't work properly, and so the fix is being
6367              * deferred */
6368         }
6369         else {
6370             U8 folded;
6371
6372             /* Here, the string isn't utf8 and c is a single byte; and either
6373              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6374              * doesn't affect c.  Can just do simple comparisons for exact or
6375              * fold matching. */
6376             switch (OP(p)) {
6377                 case EXACTF: folded = PL_fold[c]; break;
6378                 case EXACTFA:
6379                 case EXACTFU_TRICKYFOLD:
6380                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6381                 case EXACTFL: folded = PL_fold_locale[c]; break;
6382                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6383             }
6384             while (scan < loceol &&
6385                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6386             {
6387                 scan++;
6388             }
6389         }
6390         break;
6391     case ANYOFV:
6392     case ANYOF:
6393         if (utf8_target || OP(p) == ANYOFV) {
6394             STRLEN inclasslen;
6395             loceol = PL_regeol;
6396             inclasslen = loceol - scan;
6397             while (hardcount < max
6398                    && ((inclasslen = loceol - scan) > 0)
6399                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6400             {
6401                 scan += inclasslen;
6402                 hardcount++;
6403             }
6404         } else {
6405             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6406                 scan++;
6407         }
6408         break;
6409     case ALNUMU:
6410         if (utf8_target) {
6411     utf8_wordchar:
6412             loceol = PL_regeol;
6413             LOAD_UTF8_CHARCLASS_ALNUM();
6414             while (hardcount < max && scan < loceol &&
6415                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6416             {
6417                 scan += UTF8SKIP(scan);
6418                 hardcount++;
6419             }
6420         } else {
6421             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6422                 scan++;
6423             }
6424         }
6425         break;
6426     case ALNUM:
6427         if (utf8_target)
6428             goto utf8_wordchar;
6429         while (scan < loceol && isALNUM((U8) *scan)) {
6430             scan++;
6431         }
6432         break;
6433     case ALNUMA:
6434         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6435             scan++;
6436         }
6437         break;
6438     case ALNUML:
6439         PL_reg_flags |= RF_tainted;
6440         if (utf8_target) {
6441             loceol = PL_regeol;
6442             while (hardcount < max && scan < loceol &&
6443                    isALNUM_LC_utf8((U8*)scan)) {
6444                 scan += UTF8SKIP(scan);
6445                 hardcount++;
6446             }
6447         } else {
6448             while (scan < loceol && isALNUM_LC(*scan))
6449                 scan++;
6450         }
6451         break;
6452     case NALNUMU:
6453         if (utf8_target) {
6454
6455     utf8_Nwordchar:
6456
6457             loceol = PL_regeol;
6458             LOAD_UTF8_CHARCLASS_ALNUM();
6459             while (hardcount < max && scan < loceol &&
6460                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6461             {
6462                 scan += UTF8SKIP(scan);
6463                 hardcount++;
6464             }
6465         } else {
6466             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6467                 scan++;
6468             }
6469         }
6470         break;
6471     case NALNUM:
6472         if (utf8_target)
6473             goto utf8_Nwordchar;
6474         while (scan < loceol && ! isALNUM((U8) *scan)) {
6475             scan++;
6476         }
6477         break;
6478
6479     case POSIXA:
6480        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6481             scan++;
6482         }
6483         break;
6484     case NPOSIXA:
6485         if (utf8_target) {
6486             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6487                 scan += UTF8SKIP(scan);
6488             }
6489         }
6490         else {
6491             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6492                 scan++;
6493             }
6494         }
6495         break;
6496     case NALNUMA:
6497         if (utf8_target) {
6498             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6499                 scan += UTF8SKIP(scan);
6500             }
6501         }
6502         else {
6503             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6504                 scan++;
6505             }
6506         }
6507         break;
6508     case NALNUML:
6509         PL_reg_flags |= RF_tainted;
6510         if (utf8_target) {
6511             loceol = PL_regeol;
6512             while (hardcount < max && scan < loceol &&
6513                    !isALNUM_LC_utf8((U8*)scan)) {
6514                 scan += UTF8SKIP(scan);
6515                 hardcount++;
6516             }
6517         } else {
6518             while (scan < loceol && !isALNUM_LC(*scan))
6519                 scan++;
6520         }
6521         break;
6522     case SPACEU:
6523         if (utf8_target) {
6524
6525     utf8_space:
6526
6527             loceol = PL_regeol;
6528             LOAD_UTF8_CHARCLASS_SPACE();
6529             while (hardcount < max && scan < loceol &&
6530                    (*scan == ' ' ||
6531                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6532             {
6533                 scan += UTF8SKIP(scan);
6534                 hardcount++;
6535             }
6536             break;
6537         }
6538         else {
6539             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6540                 scan++;
6541             }
6542             break;
6543         }
6544     case SPACE:
6545         if (utf8_target)
6546             goto utf8_space;
6547
6548         while (scan < loceol && isSPACE((U8) *scan)) {
6549             scan++;
6550         }
6551         break;
6552     case SPACEA:
6553         while (scan < loceol && isSPACE_A((U8) *scan)) {
6554             scan++;
6555         }
6556         break;
6557     case SPACEL:
6558         PL_reg_flags |= RF_tainted;
6559         if (utf8_target) {
6560             loceol = PL_regeol;
6561             while (hardcount < max && scan < loceol &&
6562                    isSPACE_LC_utf8((U8*)scan)) {
6563                 scan += UTF8SKIP(scan);
6564                 hardcount++;
6565             }
6566         } else {
6567             while (scan < loceol && isSPACE_LC(*scan))
6568                 scan++;
6569         }
6570         break;
6571     case NSPACEU:
6572         if (utf8_target) {
6573
6574     utf8_Nspace:
6575
6576             loceol = PL_regeol;
6577             LOAD_UTF8_CHARCLASS_SPACE();
6578             while (hardcount < max && scan < loceol &&
6579                    ! (*scan == ' ' ||
6580                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6581             {
6582                 scan += UTF8SKIP(scan);
6583                 hardcount++;
6584             }
6585             break;
6586         }
6587         else {
6588             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6589                 scan++;
6590             }
6591         }
6592         break;
6593     case NSPACE:
6594         if (utf8_target)
6595             goto utf8_Nspace;
6596
6597         while (scan < loceol && ! isSPACE((U8) *scan)) {
6598             scan++;
6599         }
6600         break;
6601     case NSPACEA:
6602         if (utf8_target) {
6603             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6604                 scan += UTF8SKIP(scan);
6605             }
6606         }
6607         else {
6608             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6609                 scan++;
6610             }
6611         }
6612         break;
6613     case NSPACEL:
6614         PL_reg_flags |= RF_tainted;
6615         if (utf8_target) {
6616             loceol = PL_regeol;
6617             while (hardcount < max && scan < loceol &&
6618                    !isSPACE_LC_utf8((U8*)scan)) {
6619                 scan += UTF8SKIP(scan);
6620                 hardcount++;
6621             }
6622         } else {
6623             while (scan < loceol && !isSPACE_LC(*scan))
6624                 scan++;
6625         }
6626         break;
6627     case DIGIT:
6628         if (utf8_target) {
6629             loceol = PL_regeol;
6630             LOAD_UTF8_CHARCLASS_DIGIT();
6631             while (hardcount < max && scan < loceol &&
6632                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6633                 scan += UTF8SKIP(scan);
6634                 hardcount++;
6635             }
6636         } else {
6637             while (scan < loceol && isDIGIT(*scan))
6638                 scan++;
6639         }
6640         break;
6641     case DIGITA:
6642         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6643             scan++;
6644         }
6645         break;
6646     case DIGITL:
6647         PL_reg_flags |= RF_tainted;
6648         if (utf8_target) {
6649             loceol = PL_regeol;
6650             while (hardcount < max && scan < loceol &&
6651                    isDIGIT_LC_utf8((U8*)scan)) {
6652                 scan += UTF8SKIP(scan);
6653                 hardcount++;
6654             }
6655         } else {
6656             while (scan < loceol && isDIGIT_LC(*scan))
6657                 scan++;
6658         }
6659         break;
6660     case NDIGIT:
6661         if (utf8_target) {
6662             loceol = PL_regeol;
6663             LOAD_UTF8_CHARCLASS_DIGIT();
6664             while (hardcount < max && scan < loceol &&
6665                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6666                 scan += UTF8SKIP(scan);
6667                 hardcount++;
6668             }
6669         } else {
6670             while (scan < loceol && !isDIGIT(*scan))
6671                 scan++;
6672         }
6673         break;
6674     case NDIGITA:
6675         if (utf8_target) {
6676             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6677                 scan += UTF8SKIP(scan);
6678             }
6679         }
6680         else {
6681             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6682                 scan++;
6683             }
6684         }
6685         break;
6686     case NDIGITL:
6687         PL_reg_flags |= RF_tainted;
6688         if (utf8_target) {
6689             loceol = PL_regeol;
6690             while (hardcount < max && scan < loceol &&
6691                    !isDIGIT_LC_utf8((U8*)scan)) {
6692                 scan += UTF8SKIP(scan);
6693                 hardcount++;
6694             }
6695         } else {
6696             while (scan < loceol && !isDIGIT_LC(*scan))
6697                 scan++;
6698         }
6699         break;
6700     case LNBREAK:
6701         if (utf8_target) {
6702             loceol = PL_regeol;
6703             while (hardcount < max && scan < loceol &&
6704                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
6705                 scan += c;
6706                 hardcount++;
6707             }
6708         } else {
6709             /*
6710               LNBREAK can match two latin chars, which is ok,
6711               because we have a null terminated string, but we
6712               have to use hardcount in this situation
6713             */
6714             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
6715                 scan+=c;
6716                 hardcount++;
6717             }
6718         }
6719         break;
6720     case HORIZWS:
6721         if (utf8_target) {
6722             loceol = PL_regeol;
6723             while (hardcount < max && scan < loceol &&
6724                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
6725             {
6726                 scan += c;
6727                 hardcount++;
6728             }
6729         } else {
6730             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
6731                 scan++;
6732         }
6733         break;
6734     case NHORIZWS:
6735         if (utf8_target) {
6736             loceol = PL_regeol;
6737             while (hardcount < max && scan < loceol &&
6738                         !is_HORIZWS_utf8_safe(scan, loceol))
6739             {
6740                 scan += UTF8SKIP(scan);
6741                 hardcount++;
6742             }
6743         } else {
6744             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
6745                 scan++;
6746
6747         }
6748         break;
6749     case VERTWS:
6750         if (utf8_target) {
6751             loceol = PL_regeol;
6752             while (hardcount < max && scan < loceol &&
6753                             (c=is_VERTWS_utf8_safe(scan, loceol)))
6754             {
6755                 scan += c;
6756                 hardcount++;
6757             }
6758         } else {
6759             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
6760                 scan++;
6761
6762         }
6763         break;
6764     case NVERTWS:
6765         if (utf8_target) {
6766             loceol = PL_regeol;
6767             while (hardcount < max && scan < loceol &&
6768                                 !is_VERTWS_utf8_safe(scan, loceol))
6769             {
6770                 scan += UTF8SKIP(scan);
6771                 hardcount++;
6772             }
6773         } else {
6774             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
6775                 scan++;
6776
6777         }
6778         break;
6779
6780     default:            /* Called on something of 0 width. */
6781         break;          /* So match right here or not at all. */
6782     }
6783
6784     if (hardcount)
6785         c = hardcount;
6786     else
6787         c = scan - *startposp;
6788     *startposp = scan;
6789
6790     DEBUG_r({
6791         GET_RE_DEBUG_FLAGS_DECL;
6792         DEBUG_EXECUTE_r({
6793             SV * const prop = sv_newmortal();
6794             regprop(prog, prop, p);
6795             PerlIO_printf(Perl_debug_log,
6796                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6797                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6798         });
6799     });
6800
6801     return(c);
6802 }
6803
6804
6805 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6806 /*
6807 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6808 create a copy so that changes the caller makes won't change the shared one
6809  */
6810 SV *
6811 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6812 {
6813     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6814     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6815 }
6816 #endif
6817
6818 STATIC SV *
6819 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6820 {
6821     /* Returns the swash for the input 'node' in the regex 'prog'.
6822      * If <doinit> is true, will attempt to create the swash if not already
6823      *    done.
6824      * If <listsvp> is non-null, will return the swash initialization string in
6825      *    it.
6826      * If <altsvp> is non-null, will return the alternates to the regular swash
6827      *    in it
6828      * Tied intimately to how regcomp.c sets up the data structure */
6829
6830     dVAR;
6831     SV *sw  = NULL;
6832     SV *si  = NULL;
6833     SV *alt = NULL;
6834     SV*  invlist = NULL;
6835
6836     RXi_GET_DECL(prog,progi);
6837     const struct reg_data * const data = prog ? progi->data : NULL;
6838
6839     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6840
6841     assert(ANYOF_NONBITMAP(node));
6842
6843     if (data && data->count) {
6844         const U32 n = ARG(node);
6845
6846         if (data->what[n] == 's') {
6847             SV * const rv = MUTABLE_SV(data->data[n]);
6848             AV * const av = MUTABLE_AV(SvRV(rv));
6849             SV **const ary = AvARRAY(av);
6850             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6851
6852             si = *ary;  /* ary[0] = the string to initialize the swash with */
6853
6854             /* Elements 3 and 4 are either both present or both absent. [3] is
6855              * any inversion list generated at compile time; [4] indicates if
6856              * that inversion list has any user-defined properties in it. */
6857             if (av_len(av) >= 3) {
6858                 invlist = ary[3];
6859                 if (SvUV(ary[4])) {
6860                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6861                 }
6862             }
6863             else {
6864                 invlist = NULL;
6865             }
6866
6867             /* Element [1] is reserved for the set-up swash.  If already there,
6868              * return it; if not, create it and store it there */
6869             if (SvROK(ary[1])) {
6870                 sw = ary[1];
6871             }
6872             else if (si && doinit) {
6873
6874                 sw = _core_swash_init("utf8", /* the utf8 package */
6875                                       "", /* nameless */
6876                                       si,
6877                                       1, /* binary */
6878                                       0, /* not from tr/// */
6879                                       invlist,
6880                                       &swash_init_flags);
6881                 (void)av_store(av, 1, sw);
6882             }
6883
6884             /* Element [2] is for any multi-char folds.  Note that is a
6885              * fundamentally flawed design, because can't backtrack and try
6886              * again.  See [perl #89774] */
6887             if (SvTYPE(ary[2]) == SVt_PVAV) {
6888                 alt = ary[2];
6889             }
6890         }
6891     }
6892
6893     if (listsvp) {
6894         SV* matches_string = newSVpvn("", 0);
6895
6896         /* Use the swash, if any, which has to have incorporated into it all
6897          * possibilities */
6898         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6899             && (si && si != &PL_sv_undef))
6900         {
6901
6902             /* If no swash, use the input initialization string, if available */
6903             sv_catsv(matches_string, si);
6904         }
6905
6906         /* Add the inversion list to whatever we have.  This may have come from
6907          * the swash, or from an input parameter */
6908         if (invlist) {
6909             sv_catsv(matches_string, _invlist_contents(invlist));
6910         }
6911         *listsvp = matches_string;
6912     }
6913
6914     if (altsvp)
6915         *altsvp  = alt;
6916
6917     return sw;
6918 }
6919
6920 /*
6921  - reginclass - determine if a character falls into a character class
6922
6923   n is the ANYOF regnode
6924   p is the target string
6925   lenp is pointer to the maximum number of bytes of how far to go in p
6926     (This is assumed wthout checking to always be at least the current
6927     character's size)
6928   utf8_target tells whether p is in UTF-8.
6929
6930   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6931   from a successful match, the value it points to will be updated to how many
6932   bytes in p were matched.  If there was no match, the value is undefined,
6933   possibly changed from the input.
6934
6935   Note that this can be a synthetic start class, a combination of various
6936   nodes, so things you think might be mutually exclusive, such as locale,
6937   aren't.  It can match both locale and non-locale
6938
6939  */
6940
6941 STATIC bool
6942 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6943 {
6944     dVAR;
6945     const char flags = ANYOF_FLAGS(n);
6946     bool match = FALSE;
6947     UV c = *p;
6948     STRLEN c_len = 0;
6949     STRLEN maxlen;
6950
6951     PERL_ARGS_ASSERT_REGINCLASS;
6952
6953     /* If c is not already the code point, get it */
6954     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6955         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6956                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6957                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6958                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6959                  * UTF8_ALLOW_FFFF */
6960         if (c_len == (STRLEN)-1)
6961             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6962     }
6963     else {
6964         c_len = 1;
6965     }
6966
6967     /* Use passed in max length, or one character if none passed in or less
6968      * than one character.  And assume will match just one character.  This is
6969      * overwritten later if matched more. */
6970     if (lenp) {
6971         maxlen = (*lenp > c_len) ? *lenp : c_len;
6972         *lenp = c_len;
6973
6974     }
6975     else {
6976         maxlen = c_len;
6977     }
6978
6979     /* If this character is potentially in the bitmap, check it */
6980     if (c < 256) {
6981         if (ANYOF_BITMAP_TEST(n, c))
6982             match = TRUE;
6983         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6984                 && ! utf8_target
6985                 && ! isASCII(c))
6986         {
6987             match = TRUE;
6988         }
6989
6990         else if (flags & ANYOF_LOCALE) {
6991             PL_reg_flags |= RF_tainted;
6992
6993             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6994                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6995             {
6996                 match = TRUE;
6997             }
6998             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6999                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
7000                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
7001                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
7002                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
7003                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
7004                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
7005                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
7006                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7007                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
7008                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
7009                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
7010                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
7011                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
7012                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
7013                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
7014                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
7015                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
7016                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
7017                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
7018                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
7019                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
7020                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
7021                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
7022                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
7023                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
7024                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
7025                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
7026                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
7027                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
7028                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
7029                      ) /* How's that for a conditional? */
7030             ) {
7031                 match = TRUE;
7032             }
7033         }
7034     }
7035
7036     /* If the bitmap didn't (or couldn't) match, and something outside the
7037      * bitmap could match, try that.  Locale nodes specifiy completely the
7038      * behavior of code points in the bit map (otherwise, a utf8 target would
7039      * cause them to be treated as Unicode and not locale), except in
7040      * the very unlikely event when this node is a synthetic start class, which
7041      * could be a combination of locale and non-locale nodes.  So allow locale
7042      * to match for the synthetic start class, which will give a false
7043      * positive that will be resolved when the match is done again as not part
7044      * of the synthetic start class */
7045     if (!match) {
7046         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7047             match = TRUE;       /* Everything above 255 matches */
7048         }
7049         else if (ANYOF_NONBITMAP(n)
7050                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7051                      || (utf8_target
7052                          && (c >=256
7053                              || (! (flags & ANYOF_LOCALE))
7054                              || (flags & ANYOF_IS_SYNTHETIC)))))
7055         {
7056             AV *av;
7057             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7058
7059             if (sw) {
7060                 U8 * utf8_p;
7061                 if (utf8_target) {
7062                     utf8_p = (U8 *) p;
7063                 } else {
7064
7065                     /* Not utf8.  Convert as much of the string as available up
7066                      * to the limit of how far the (single) character in the
7067                      * pattern can possibly match (no need to go further).  If
7068                      * the node is a straight ANYOF or not folding, it can't
7069                      * match more than one.  Otherwise, It can match up to how
7070                      * far a single char can fold to.  Since not utf8, each
7071                      * character is a single byte, so the max it can be in
7072                      * bytes is the same as the max it can be in characters */
7073                     STRLEN len = (OP(n) == ANYOF
7074                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7075                                   ? 1
7076                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7077                                     ? maxlen
7078                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
7079                     utf8_p = bytes_to_utf8(p, &len);
7080                 }
7081
7082                 if (swash_fetch(sw, utf8_p, TRUE))
7083                     match = TRUE;
7084                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
7085
7086                     /* Here, we need to test if the fold of the target string
7087                      * matches.  The non-multi char folds have all been moved to
7088                      * the compilation phase, and the multi-char folds have
7089                      * been stored by regcomp into 'av'; we linearly check to
7090                      * see if any match the target string (folded).   We know
7091                      * that the originals were each one character, but we don't
7092                      * currently know how many characters/bytes each folded to,
7093                      * except we do know that there are small limits imposed by
7094                      * Unicode.  XXX A performance enhancement would be to have
7095                      * regcomp.c store the max number of chars/bytes that are
7096                      * in an av entry, as, say the 0th element.  Even better
7097                      * would be to have a hash of the few characters that can
7098                      * start a multi-char fold to the max number of chars of
7099                      * those folds.
7100                      *
7101                      * If there is a match, we will need to advance (if lenp is
7102                      * specified) the match pointer in the target string.  But
7103                      * what we are comparing here isn't that string directly,
7104                      * but its fold, whose length may differ from the original.
7105                      * As we go along in constructing the fold, therefore, we
7106                      * create a map so that we know how many bytes in the
7107                      * source to advance given that we have matched a certain
7108                      * number of bytes in the fold.  This map is stored in
7109                      * 'map_fold_len_back'.  Let n mean the number of bytes in
7110                      * the fold of the first character that we are folding.
7111                      * Then map_fold_len_back[n] is set to the number of bytes
7112                      * in that first character.  Similarly let m be the
7113                      * corresponding number for the second character to be
7114                      * folded.  Then map_fold_len_back[n+m] is set to the
7115                      * number of bytes occupied by the first two source
7116                      * characters. ... */
7117                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
7118                     U8 folded[UTF8_MAXBYTES_CASE+1];
7119                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
7120                     STRLEN total_foldlen = 0; /* num bytes in fold of all
7121                                                   chars */
7122
7123                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7124
7125                         /* Here, only need to fold the first char of the target
7126                          * string.  It the source wasn't utf8, is 1 byte long */
7127                         to_utf8_fold(utf8_p, folded, &foldlen);
7128                         total_foldlen = foldlen;
7129                         map_fold_len_back[foldlen] = (utf8_target)
7130                                                      ? UTF8SKIP(utf8_p)
7131                                                      : 1;
7132                     }
7133                     else {
7134
7135                         /* Here, need to fold more than the first char.  Do so
7136                          * up to the limits */
7137                         U8* source_ptr = utf8_p;    /* The source for the fold
7138                                                        is the regex target
7139                                                        string */
7140                         U8* folded_ptr = folded;
7141                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
7142                                                        available byte in the
7143                                                        target string */
7144                         U8 i;
7145                         for (i = 0;
7146                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7147                              i++)
7148                         {
7149
7150                             /* Fold the next character */
7151                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7152                             STRLEN this_char_foldlen;
7153                             to_utf8_fold(source_ptr,
7154                                          this_char_folded,
7155                                          &this_char_foldlen);
7156
7157                             /* Bail if it would exceed the byte limit for
7158                              * folding a single char. */
7159                             if (this_char_foldlen + folded_ptr - folded >
7160                                                             UTF8_MAXBYTES_CASE)
7161                             {
7162                                 break;
7163                             }
7164
7165                             /* Add the fold of this character */
7166                             Copy(this_char_folded,
7167                                  folded_ptr,
7168                                  this_char_foldlen,
7169                                  U8);
7170                             source_ptr += UTF8SKIP(source_ptr);
7171                             folded_ptr += this_char_foldlen;
7172                             total_foldlen = folded_ptr - folded;
7173
7174                             /* Create map from the number of bytes in the fold
7175                              * back to the number of bytes in the source.  If
7176                              * the source isn't utf8, the byte count is just
7177                              * the number of characters so far */
7178                             map_fold_len_back[total_foldlen]
7179                                                       = (utf8_target)
7180                                                         ? source_ptr - utf8_p
7181                                                         : i + 1;
7182                         }
7183                         *folded_ptr = '\0';
7184                     }
7185
7186
7187                     /* Do the linear search to see if the fold is in the list
7188                      * of multi-char folds. */
7189                     if (av) {
7190                         I32 i;
7191                         for (i = 0; i <= av_len(av); i++) {
7192                             SV* const sv = *av_fetch(av, i, FALSE);
7193                             STRLEN len;
7194                             const char * const s = SvPV_const(sv, len);
7195
7196                             if (len <= total_foldlen
7197                                 && memEQ(s, (char*)folded, len)
7198
7199                                    /* If 0, means matched a partial char. See
7200                                     * [perl #90536] */
7201                                 && map_fold_len_back[len])
7202                             {
7203
7204                                 /* Advance the target string ptr to account for
7205                                  * this fold, but have to translate from the
7206                                  * folded length to the corresponding source
7207                                  * length. */
7208                                 if (lenp) {
7209                                     *lenp = map_fold_len_back[len];
7210                                 }
7211                                 match = TRUE;
7212                                 break;
7213                             }
7214                         }
7215                     }
7216                 }
7217
7218                 /* If we allocated a string above, free it */
7219                 if (! utf8_target) Safefree(utf8_p);
7220             }
7221         }
7222
7223         if (UNICODE_IS_SUPER(c)
7224             && (flags & ANYOF_WARN_SUPER)
7225             && ckWARN_d(WARN_NON_UNICODE))
7226         {
7227             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7228                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7229         }
7230     }
7231
7232     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7233     return cBOOL(flags & ANYOF_INVERT) ^ match;
7234 }
7235
7236 STATIC U8 *
7237 S_reghop3(U8 *s, I32 off, const U8* lim)
7238 {
7239     /* return the position 'off' UTF-8 characters away from 's', forward if
7240      * 'off' >= 0, backwards if negative.  But don't go outside of position
7241      * 'lim', which better be < s  if off < 0 */
7242
7243     dVAR;
7244
7245     PERL_ARGS_ASSERT_REGHOP3;
7246
7247     if (off >= 0) {
7248         while (off-- && s < lim) {
7249             /* XXX could check well-formedness here */
7250             s += UTF8SKIP(s);
7251         }
7252     }
7253     else {
7254         while (off++ && s > lim) {
7255             s--;
7256             if (UTF8_IS_CONTINUED(*s)) {
7257                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7258                     s--;
7259             }
7260             /* XXX could check well-formedness here */
7261         }
7262     }
7263     return s;
7264 }
7265
7266 #ifdef XXX_dmq
7267 /* there are a bunch of places where we use two reghop3's that should
7268    be replaced with this routine. but since thats not done yet
7269    we ifdef it out - dmq
7270 */
7271 STATIC U8 *
7272 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7273 {
7274     dVAR;
7275
7276     PERL_ARGS_ASSERT_REGHOP4;
7277
7278     if (off >= 0) {
7279         while (off-- && s < rlim) {
7280             /* XXX could check well-formedness here */
7281             s += UTF8SKIP(s);
7282         }
7283     }
7284     else {
7285         while (off++ && s > llim) {
7286             s--;
7287             if (UTF8_IS_CONTINUED(*s)) {
7288                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7289                     s--;
7290             }
7291             /* XXX could check well-formedness here */
7292         }
7293     }
7294     return s;
7295 }
7296 #endif
7297
7298 STATIC U8 *
7299 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7300 {
7301     dVAR;
7302
7303     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7304
7305     if (off >= 0) {
7306         while (off-- && s < lim) {
7307             /* XXX could check well-formedness here */
7308             s += UTF8SKIP(s);
7309         }
7310         if (off >= 0)
7311             return NULL;
7312     }
7313     else {
7314         while (off++ && s > lim) {
7315             s--;
7316             if (UTF8_IS_CONTINUED(*s)) {
7317                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7318                     s--;
7319             }
7320             /* XXX could check well-formedness here */
7321         }
7322         if (off <= 0)
7323             return NULL;
7324     }
7325     return s;
7326 }
7327
7328 static void
7329 restore_pos(pTHX_ void *arg)
7330 {
7331     dVAR;
7332     regexp * const rex = (regexp *)arg;
7333     if (PL_reg_state.re_state_eval_setup_done) {
7334         if (PL_reg_oldsaved) {
7335             rex->subbeg = PL_reg_oldsaved;
7336             rex->sublen = PL_reg_oldsavedlen;
7337             rex->suboffset = PL_reg_oldsavedoffset;
7338             rex->subcoffset = PL_reg_oldsavedcoffset;
7339 #ifdef PERL_OLD_COPY_ON_WRITE
7340             rex->saved_copy = PL_nrs;
7341 #endif
7342             RXp_MATCH_COPIED_on(rex);
7343         }
7344         PL_reg_magic->mg_len = PL_reg_oldpos;
7345         PL_reg_state.re_state_eval_setup_done = FALSE;
7346         PL_curpm = PL_reg_oldcurpm;
7347     }
7348 }
7349
7350 STATIC void
7351 S_to_utf8_substr(pTHX_ register regexp *prog)
7352 {
7353     int i = 1;
7354
7355     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7356
7357     do {
7358         if (prog->substrs->data[i].substr
7359             && !prog->substrs->data[i].utf8_substr) {
7360             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7361             prog->substrs->data[i].utf8_substr = sv;
7362             sv_utf8_upgrade(sv);
7363             if (SvVALID(prog->substrs->data[i].substr)) {
7364                 if (SvTAIL(prog->substrs->data[i].substr)) {
7365                     /* Trim the trailing \n that fbm_compile added last
7366                        time.  */
7367                     SvCUR_set(sv, SvCUR(sv) - 1);
7368                     /* Whilst this makes the SV technically "invalid" (as its
7369                        buffer is no longer followed by "\0") when fbm_compile()
7370                        adds the "\n" back, a "\0" is restored.  */
7371                     fbm_compile(sv, FBMcf_TAIL);
7372                 } else
7373                     fbm_compile(sv, 0);
7374             }
7375             if (prog->substrs->data[i].substr == prog->check_substr)
7376                 prog->check_utf8 = sv;
7377         }
7378     } while (i--);
7379 }
7380
7381 STATIC void
7382 S_to_byte_substr(pTHX_ register regexp *prog)
7383 {
7384     dVAR;
7385     int i = 1;
7386
7387     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7388
7389     do {
7390         if (prog->substrs->data[i].utf8_substr
7391             && !prog->substrs->data[i].substr) {
7392             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7393             if (sv_utf8_downgrade(sv, TRUE)) {
7394                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7395                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7396                         /* Trim the trailing \n that fbm_compile added last
7397                            time.  */
7398                         SvCUR_set(sv, SvCUR(sv) - 1);
7399                         fbm_compile(sv, FBMcf_TAIL);
7400                     } else
7401                         fbm_compile(sv, 0);
7402                 }
7403             } else {
7404                 SvREFCNT_dec(sv);
7405                 sv = &PL_sv_undef;
7406             }
7407             prog->substrs->data[i].substr = sv;
7408             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7409                 prog->check_substr = sv;
7410         }
7411     } while (i--);
7412 }
7413
7414 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7415  * pre-composed Hangul syllables, which are all in a contiguous block and
7416  * arranged there in such a way so as to facilitate alorithmic determination of
7417  * their characteristics.  As such, they don't need a swash, but can be
7418  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7419  * is a GCB=LV */
7420 #define SBASE 0xAC00    /* Start of block */
7421 #define SCount 11172    /* Length of block */
7422 #define TCount 28
7423
7424 #if 0   /* This routine is not currently used */
7425 PERL_STATIC_INLINE bool
7426 S_is_utf8_X_LV(pTHX_ const U8 *p)
7427 {
7428     /* Unlike most other similarly named routines here, this does not create a
7429      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7430
7431     dVAR;
7432
7433     UV cp = valid_utf8_to_uvchr(p, NULL);
7434
7435     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7436
7437     /* The earliest Unicode releases did not have these precomposed Hangul
7438      * syllables.  Set to point to undef in that case, so will return false on
7439      * every call */
7440     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7441         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7442         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7443             SvREFCNT_dec(PL_utf8_X_LV);
7444             PL_utf8_X_LV = &PL_sv_undef;
7445         }
7446     }
7447
7448     return (PL_utf8_X_LV != &PL_sv_undef
7449             && cp >= SBASE && cp < SBASE + SCount
7450             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7451 }
7452 #endif
7453
7454 PERL_STATIC_INLINE bool
7455 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7456 {
7457     /* Unlike most other similarly named routines here, this does not create a
7458      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7459
7460     dVAR;
7461
7462     UV cp = valid_utf8_to_uvchr(p, NULL);
7463
7464     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7465
7466     /* The earliest Unicode releases did not have these precomposed Hangul
7467      * syllables.  Set to point to undef in that case, so will return false on
7468      * every call */
7469     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7470         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7471         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7472             SvREFCNT_dec(PL_utf8_X_LVT);
7473             PL_utf8_X_LVT = &PL_sv_undef;
7474         }
7475     }
7476
7477     return (PL_utf8_X_LVT != &PL_sv_undef
7478             && cp >= SBASE && cp < SBASE + SCount
7479             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7480 }
7481
7482 /*
7483  * Local variables:
7484  * c-indentation-style: bsd
7485  * c-basic-offset: 4
7486  * indent-tabs-mode: nil
7487  * End:
7488  *
7489  * ex: set ts=8 sts=4 sw=4 et:
7490  */