regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
     /* Nonzero when the RF_utf8 bit is set in PL_reg_flags.  (The RF_* values
      * are bit flags; RF_tainted and RF_utf8 are OR'ed into PL_reg_flags in
      * this file.) */
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
     /* Default STATIC to plain "static" unless it has been defined already. */
  90 #ifndef STATIC
  91 #define STATIC  static
  92 #endif
  93
  94 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  95  * call if there are no complications: i.e., if everything matchable is
  96  * straightforward in the bitmap.
      * When ANYOF_FLAGS(p) is zero, the byte *(c) is tested directly with
      * ANYOF_BITMAP_TEST; otherwise reginclass() is called with its last two
      * arguments set to 0. */
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  98                                               : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
     /* Distance from b to a: utf8_distance(a,b) when PL_reg_match_utf8 is set,
      * plain pointer difference otherwise.  CHR_SVLEN above relies on a
      * variable named utf8_target being in scope at the point of use. */
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : (a) - (b))
 106
     /* The HOP* macros move a string pointer by 'off' positions.  When
      * PL_reg_match_utf8 is set they delegate to reghop3()/reghopmaybe3();
      * otherwise they use plain pointer arithmetic.  Every parameter and every
      * expansion is fully parenthesized so that compound arguments (e.g.
      * "a - b") and operators applied to the result bind as written.
      * Arguments can be evaluated more than once: do not pass expressions
      * with side effects.
      *
      * HOPc: the limit handed to reghop3() is PL_regeol for off >= 0 and
      * PL_bostr for negative off. */
 107 #define HOPc(pos,off) \
 108         ((char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)(pos), (off), (U8*)((off) >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)((pos) + (off))))
     /* HOPBACKc: move back by 'off'.  In the non-utf8 case the result is NULL
      * when pos - off would fall before PL_bostr. */
 111 #define HOPBACKc(pos, off) \
 112         ((char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)(pos), -(off), (U8*)PL_bostr) \
 114             : ((pos) - (off) >= PL_bostr)       \
 115                 ? (U8*)(pos) - (off)            \
 116                 : NULL))
 117
     /* HOP3: like HOPc but with an explicit limit 'lim' and a U8* result;
      * HOP3c is the same with the result cast to char*. */
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), (off), (U8*)(lim)) : (U8*)((pos) + (off)))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defines */
     /* LOAD_UTF8_CHARCLASS(class,str): if PL_utf8_<class> is not set yet, call
      * is_utf8_<class>(str) between ENTER/save_re_context() and LEAVE, then
      * assert both that the call returned true and that PL_utf8_<class> is
      * now set.  The EBCDIC variant drops 'str' and the asserts. */
 122 #ifdef EBCDIC
 123     /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
 124      * skip the check on EBCDIC platforms */
 125 #   define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
 126 #else
 127 #   define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 128     if (!CAT2(PL_utf8_,class)) { \
 129         bool ok; \
 130         ENTER; save_re_context(); \
 131         ok=CAT2(is_utf8_,class)((const U8*)str); \
 132         PERL_UNUSED_VAR(ok); \
 133         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 134 #endif
 135
 136 /* Same as above, but always probes with " ", discards the result, and does
      * no assert to verify that the load succeeded */
 137 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 138     if (!CAT2(PL_utf8_,class)) { \
 139         bool throw_away PERL_UNUSED_DECL; \
 140         ENTER; save_re_context(); \
 141         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 142         LEAVE; } } STMT_END
 143
     /* Convenience wrappers: each names a class and a sample string that is
      * passed to the is_utf8_<class>() probe. */
 144 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 145 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 146 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 147
     /* Loads every class used for grapheme cluster boundaries.  Note that the
      * expansion is a sequence of statements whose last one has no trailing
      * semicolon; the caller supplies it. */
 148 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 149         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 150         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 151         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 152             * assert should likely and hopefully fail on an EBCDIC machine */ \
 153         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 154                                                                             \
 155         /* No asserts are done for these, in case called on an early        \
 156             * Unicode version in which they map to nothing */               \
 157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 160         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 161         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 162         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 163         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 164
 165 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 166
 167 /* The actual code for CCC_TRY, which uses several variables from the routine
 168  * it's callable from.  It is designed to be the bulk of a case statement.
 169  * FUNC is the macro or function to call on non-utf8 targets that indicates if
 170  *      nextchr matches the class.
 171  * UTF8_TEST is the whole test string to use for utf8 targets
 172  * CLASS and STR are passed to LOAD_UTF8_CHARCLASS, which loads the swash for
 173  *      the class if it is not present yet
 174  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 175  *      UTF8_TEST test.
 176  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 177  * utf8 and a variant, load the swash if necessary and test using the utf8
 178  * test.  Advance to the next character if test is ok, otherwise fail; If not
 179  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 180  * fails, or advance to the next character.
      * The expansion reads and updates locinput and nextchr, reads utf8_target
      * and PL_regeol, fails through sayNO, and finishes each non-failing path
      * with "break". */
 181
 182 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 183     if (locinput >= PL_regeol) {                                              \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 187         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 188         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 189             sayNO;                                                            \
 190         }                                                                     \
 191         locinput += PL_utf8skip[nextchr];                                     \
 192         nextchr = UCHARAT(locinput);                                          \
 193         break;                                                                \
 194     }                                                                         \
 195     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 196         sayNO;                                                                \
 197     }                                                                         \
 198     nextchr = UCHARAT(++locinput);                                            \
 199     break;
 200
 201 /* Handle the non-locale cases for a character class and its complement.  It
 202  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 203  * This is because that code fails when the test succeeds, so we want to have
 204  * the test fail so that the code succeeds.  The swash is stored in a
 205  * predictable PL_ place.
      * NAME is the case label for the class itself, NNAME the label for its
      * complement; FUNC is the non-utf8 test.  For utf8 targets both cases test
      * with swash_fetch() on PL_utf8_<CLASS> at locinput.  The final line of
      * the definition ends in a backslash, so the line that follows it is
      * joined to the macro as well. */
 206 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 207                            CLASS, STR)                                        \
 208     case NAME:                                                                \
 209         _CCC_TRY_CODE( !, FUNC,                                               \
 210                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 211                                             (U8*)locinput, TRUE)),            \
 212                           CLASS, STR)                                         \
 213     case NNAME:                                                               \
 214         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 215                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 216                                             (U8*)locinput, TRUE)),            \
 217                           CLASS, STR)                                         \
 218
 219 /* Generate the case statements for both locale and non-locale character
 220  * classes in regmatch for classes that don't have special unicode semantics.
 221  * Locales don't use an immediate swash, but an intermediary special locale
 222  * function that is called on the pointer to the current place in the input
 223  * string.  That function will resolve to needing the same swash.  One might
 224  * think that because we don't know what the locale will match, we shouldn't
 225  * check with the swash loading function that it loaded properly; ie, that we
 226  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 227  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 228  * irrelevant here.
      *
      * Parameters, as used by the expansion:
      *   NAME/NNAME, FUNC      - passed through to _CCC_TRY_NONLOCALE
      *   NAMEL/NNAMEL          - locale case labels; both set RF_tainted in
      *                           PL_reg_flags before testing with LCFUNC
      *                           (non-utf8) or LCFUNC_utf8 (utf8)
      *   NAMEA/NNAMEA, FUNCA   - case labels tested only with FUNCA(nextchr).
      *                           NAMEA always advances one byte; NNAMEA
      *                           advances by PL_utf8skip[nextchr] on a utf8
      *                           target and by one byte otherwise
      *   CLASS, STR            - swash class name and probe string */
 229 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 230                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 231                 NAMEA, NNAMEA, FUNCA,                                         \
 232                 CLASS, STR)                                                   \
 233     case NAMEL:                                                               \
 234         PL_reg_flags |= RF_tainted;                                           \
 235         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 236     case NNAMEL:                                                              \
 237         PL_reg_flags |= RF_tainted;                                           \
 238         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 239                        CLASS, STR)                                            \
 240     case NAMEA:                                                               \
 241         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 242             sayNO;                                                            \
 243         }                                                                     \
 244         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 245         nextchr = UCHARAT(++locinput);                                        \
 246         break;                                                                \
 247     case NNAMEA:                                                              \
 248         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 249             sayNO;                                                            \
 250         }                                                                     \
 251         if (utf8_target) {                                                    \
 252             locinput += PL_utf8skip[nextchr];                                 \
 253             nextchr = UCHARAT(locinput);                                      \
 254         }                                                                     \
 255         else {                                                                \
 256             nextchr = UCHARAT(++locinput);                                    \
 257         }                                                                     \
 258         break;                                                                \
 259     /* Generate the non-locale cases */                                       \
 260     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 261
 262 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 263  * statements to handle separate Unicode semantics nodes.
      * It expands to CCC_TRY with the shared arguments, followed by a second
      * _CCC_TRY_NONLOCALE that generates the NAMEU/NNAMEU cases using FUNCU as
      * the non-utf8 test. */
 264 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 265                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 266                   NAMEU, NNAMEU, FUNCU,                                        \
 267                   NAMEA, NNAMEA, FUNCA,                                        \
 268                   CLASS, STR)                                                  \
 269     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 270             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 271             NAMEA, NNAMEA, FUNCA,                                              \
 272             CLASS, STR)                                                        \
 273     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 274
 275 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 276
 277 /* for use after a quantifier and before an EXACT-like node -- japhy */
 278 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 279  *
 280  * NOTE that *nothing* that affects backtracking should be in here, specifically
 281  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 282  * node that is in between two EXACT like nodes when ascertaining what the required
 283  * "follow" character is. This should probably be moved to regex compile time
 284  * although it may be done at run time because of the REF possibility - more
 285  * investigation required. -- demerphq
      *
      * The CLOSE test reads a variable named cur_eval from the scope where the
      * macro is used.  A CURLY-kind node only counts when ARG1(rn) > 0.
 286 */
 287 #define JUMPABLE(rn) (      \
 288     OP(rn) == OPEN ||       \
 289     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 290     OP(rn) == EVAL ||   \
 291     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 292     OP(rn) == PLUS || OP(rn) == MINMOD || \
 293     OP(rn) == KEEPS || \
 294     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 295 )
     /* True when the node's kind, per PL_regkind, is EXACT. */
 296 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 297
     /* True for EXACT-kind nodes and for REF-kind nodes. */
 298 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 299
 300 #if 0
 301 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 302    we don't need this definition.  (This disabled branch also has no
      IS_TEXTFU.) */
 303 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 304 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 305 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 306
 307 #else
 308 /* ... so we use this as it's faster.  Each macro compares OP(rn) against the
      * opcode(s) listed; no REF-type opcodes are included. */
 309 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 310 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 311 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 312 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 313
 314 #endif
 315
 316 /*
 317   Search for mandatory following text node; for lookahead, the text must
 318   follow but for lookbehind (rn->flags != 0) we skip to the next step.
      
       rn is advanced in place for as long as JUMPABLE(rn) holds:
         - SUSPEND and CURLY-kind nodes: NEXTOPER applied twice
         - PLUS: NEXTOPER applied once
         - IFMATCH: NEXTOPER twice when rn->flags == 0, else rn + ARG(rn)
         - anything else: rn += NEXT_OFF(rn)
 319 */
 320 #define FIND_NEXT_IMPT(rn) STMT_START { \
 321     while (JUMPABLE(rn)) { \
 322         const OPCODE type = OP(rn); \
 323         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 324             rn = NEXTOPER(NEXTOPER(rn)); \
 325         else if (type == PLUS) \
 326             rn = NEXTOPER(rn); \
 327         else if (type == IFMATCH) \
 328             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 329         else rn += NEXT_OFF(rn); \
 330     } \
 331 } STMT_END
 332
 333
 334 static void restore_pos(pTHX_ void *arg); /* forward declaration of a file-local function */
 335
 336 #define REGCP_PAREN_ELEMS 3 /* saved per paren by regcppush(): end, start, start_tmp */
 337 #define REGCP_OTHER_ELEMS 3 /* saved once: PL_regsize, lastparen, lastcloseparen */
 338 #define REGCP_FRAME_ELEMS 1 /* the trailing SAVEt_REGCONTEXT cookie */
 339 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 340  * are needed for the regexp context stack bookkeeping. */
 341
     /* Push the current capture-group state onto the savestack.
      *
      * For every paren p in parenfloor+1 .. PL_regsize three ints are pushed
      * (rex->offs[p].end, .start, .start_tmp), followed by PL_regsize,
      * rex->lastparen and rex->lastcloseparen, and finally one UV combining
      * SAVEt_REGCONTEXT with the element count shifted left by
      * SAVE_TIGHT_SHIFT.  regcppop() reads these back in reverse order.
      *
      * Croaks if the number of paren elements is negative or if the shifted
      * element count does not survive shifting back.
      *
      * Returns the value PL_savestack_ix had on entry.
      */
 342 STATIC CHECKPOINT
 343 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 344 {
 345     dVAR;
 346     const int retval = PL_savestack_ix; /* savestack index before any push */
 347     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 348     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 349     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 350     I32 p;
 351     GET_RE_DEBUG_FLAGS_DECL;
 352
 353     PERL_ARGS_ASSERT_REGCPPUSH;
 354
 355     if (paren_elems_to_push < 0)
 356         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 357                    paren_elems_to_push);
 358
     /* Make sure no bits were lost when the count was shifted left. */
 359     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 360         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 361                    " out of range (%lu-%ld)",
 362                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 363
     /* Reserve room for everything pushed below, including the cookie. */
 364     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 365
 366     DEBUG_BUFFERS_r(
 367         if ((int)PL_regsize > (int)parenfloor)
 368             PerlIO_printf(Perl_debug_log,
 369                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 370                 PTR2UV(rex),
 371                 PTR2UV(rex->offs)
 372             );
 373     );
 374     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 375 /* REGCP_PAREN_ELEMS elements are pushed per pair of parentheses. */
 376         SSPUSHINT(rex->offs[p].end);
 377         SSPUSHINT(rex->offs[p].start);
 378         SSPUSHINT(rex->offs[p].start_tmp);
 379         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 380             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 381             (UV)p,
 382             (IV)rex->offs[p].start,
 383             (IV)rex->offs[p].start_tmp,
 384             (IV)rex->offs[p].end
 385         ));
 386     }
 387 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 388     SSPUSHINT(PL_regsize);
 389     SSPUSHINT(rex->lastparen);
 390     SSPUSHINT(rex->lastcloseparen);
 391     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 392
 393     return retval;
 394 }
 395
 396 /* These are needed since we do not localize EVAL nodes: */
     /* REGCP_SET(cp): record the current PL_savestack_ix in cp (after an
      * optional debug message).  The expansion ends in an assignment without a
      * trailing semicolon. */
 397 #define REGCP_SET(cp)                                           \
 398     DEBUG_STATE_r(                                              \
 399             PerlIO_printf(Perl_debug_log,                       \
 400                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 401                 (IV)PL_savestack_ix));                          \
 402     cp = PL_savestack_ix
 403
     /* REGCP_UNWIND(cp): optional debug message when cp differs from the
      * current PL_savestack_ix, then regcpblow(cp). */
 404 #define REGCP_UNWIND(cp)                                        \
 405     DEBUG_STATE_r(                                              \
 406         if (cp != PL_savestack_ix)                              \
 407             PerlIO_printf(Perl_debug_log,                       \
 408                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 409                 (IV)(cp), (IV)PL_savestack_ix));                \
 410     regcpblow(cp)
 411
     /* UNWIND_PAREN(lp, lcp): set offs[n].end to -1 for every paren above lp,
      * then set rex->lastparen to lp's level and rex->lastcloseparen to lcp.
      * Uses the variables n and rex from the enclosing scope.
      * NOTE: this expands to several statements that are not wrapped in
      * STMT_START/STMT_END, so it must not be used as the unbraced body of an
      * if/else/loop. */
 412 #define UNWIND_PAREN(lp, lcp)               \
 413     for (n = rex->lastparen; n > lp; n--)   \
 414         rex->offs[n].end = -1;              \
 415     rex->lastparen = n;                     \
 416     rex->lastcloseparen = lcp;
 417
 418
     /* Pop one regcppush() record from the savestack and restore the capture
      * state from it.
      *
      * Reads, in order: the cookie UV (asserting that it is tagged
      * SAVEt_REGCONTEXT), rex->lastcloseparen, rex->lastparen and PL_regsize,
      * then start_tmp/start/end for each saved paren, counting down from
      * PL_regsize.  A saved end offset is only stored back when the paren is
      * <= rex->lastparen.  Afterwards every paren in lastparen+1 .. nparens
      * gets end = -1, and start = -1 as well when it is above PL_regsize.
      */
 419 STATIC void
 420 S_regcppop(pTHX_ regexp *rex)
 421 {
 422     dVAR;
 423     UV i;
 424     U32 paren;
 425     GET_RE_DEBUG_FLAGS_DECL;
 426
 427     PERL_ARGS_ASSERT_REGCPPOP;
 428
 429     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 430     i = SSPOPUV;
 431     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 432     i >>= SAVE_TIGHT_SHIFT; /* Total saved elements: parentheses plus REGCP_OTHER_ELEMS. */
 433     rex->lastcloseparen = SSPOPINT;
 434     rex->lastparen = SSPOPINT;
 435     PL_regsize = SSPOPINT;
 436
 437     i -= REGCP_OTHER_ELEMS; /* i now counts only the parentheses elements */
 438     /* Now restore the parentheses context. */
 439     DEBUG_BUFFERS_r(
 440         if (i || rex->lastparen + 1 <= rex->nparens)
 441             PerlIO_printf(Perl_debug_log,
 442                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 443                 PTR2UV(rex),
 444                 PTR2UV(rex->offs)
 445             );
 446     );
 447     paren = PL_regsize;
 448     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 449         I32 tmps;
 450         rex->offs[paren].start_tmp = SSPOPINT;
 451         rex->offs[paren].start = SSPOPINT;
 452         tmps = SSPOPINT;
     /* The saved end is always popped, but only applied to parens that
      * are not above the restored lastparen. */
 453         if (paren <= rex->lastparen)
 454             rex->offs[paren].end = tmps;
 455         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 456             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 457             (UV)paren,
 458             (IV)rex->offs[paren].start,
 459             (IV)rex->offs[paren].start_tmp,
 460             (IV)rex->offs[paren].end,
 461             (paren > rex->lastparen ? "(skipped)" : ""));
 462         );
 463         paren--;
 464     }
 465 #if 1
 466     /* It would seem that the similar code in regtry()
 467      * already takes care of this, and in fact it is in
 468      * a better location too, since this code can be #if 0-ed out
 469      * but the code in regtry() is needed or otherwise tests
 470      * requiring null fields (pat.t#187 and split.t#{13,14}
 471      * (as of patchlevel 7877)  will fail.  Then again,
 472      * this code seems to be necessary or otherwise
 473      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 474      * --jhi updated by dapm */
 475     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 476         if (i > PL_regsize)
 477             rex->offs[i].start = -1;
 478         rex->offs[i].end = -1;
 479         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 480             "    \\%"UVuf": %s   ..-1 undeffing\n",
 481             (UV)i,
 482             (i > PL_regsize) ? "-1" : "  "
 483         ));
 484     }
 485 #endif
 486 }
 487
 488 /* Re-apply the paren state saved at savestack position ix by running
 489  * regcppop() from there; PL_savestack_ix is put back to its entry value. */
 490
 491 STATIC void
 492 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 493 {
 494     const I32 saved_top = PL_savestack_ix;
 495     PL_savestack_ix = ix;           /* let regcppop() read from ix */
 496     regcppop(rex);
 497     PL_savestack_ix = saved_top;    /* nothing is really popped */
 498 }
 499
 500 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Expands to LEAVE_SCOPE(cp); ignores regcppush()ed data. */
 501
 502 /*
 503  * pregexec and friends
 504  */
 505
 506 #ifndef PERL_IN_XSUB_RE
 507 /*
 508  - pregexec - match a regexp against a string by calling regexec_flags()
 509  */
 510 I32
 511 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 512          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 513 /* strend: address of the null that terminates the string */
 514 /* strbeg: the real start of the string */
 515 /* minend: the match must end at least minend past stringarg */
 516 /* nosave: for optimizations; when nonzero REXEC_COPY_STR is not passed on */
 517 {
 518     const U32 exec_flags = nosave ? 0 : REXEC_COPY_STR;
 519
 520     PERL_ARGS_ASSERT_PREGEXEC;
 521     return regexec_flags(prog, stringarg, strend, strbeg, minend, screamer,
 522                          NULL, exec_flags);
 523 }
 524 #endif
 525
 526 /*
 527  * Need to implement the following flags for reg_anch:
 528  *
 529  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 530  * USE_INTUIT_ML
 531  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 532  * INTUIT_AUTORITATIVE_ML
 533  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 534  * INTUIT_ONCE_ML
 535  *
 536  * Another flag for this function: SECOND_TIME (so that float substrs
 537  * with giant delta may be not rechecked).
 538  */
 539
 540 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 541
 542 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 543    Otherwise, only SvCUR(sv) is used to get strbeg. */
 544
 545 /* XXXX We assume that strpos is strbeg unless sv. */
 546
 547 /* XXXX Some places assume that there is a fixed substring.
 548         An update may be needed if optimizer marks as "INTUITable"
 549         RExen without fixed substrings.  Similarly, it is assumed that
 550         lengths of all the strings are no more than minlen, thus they
 551         cannot come from lookahead.
 552         (Or minlen should take into account lookahead.)
 553   NOTE: Some of this comment is not correct. minlen does now take account
 554   of lookahead/behind. Further research is required. -- demerphq
 555
 556 */
 557
 558 /* A failure to find a constant substring means that there is no need to make
 559    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
 560    finding a substring too deep into the string means that fewer calls to
 561    regtry() should be needed.
 562
 563    REx compiler's optimizer found 4 possible hints:
 564         a) Anchored substring;
 565         b) Fixed substring;
 566         c) Whether we are anchored (beginning-of-line or \G);
 567         d) First node (of those at offset 0) which may distinguish positions;
 568    We use a)b)d) and multiline-part of c), and try to find a position in the
 569    string which does not contradict any of them.
 570  */
 571
 572 /* Most of the decisions we make here should have been made at compile time.
 573    The nodes of the REx which we used for the search should have been
 574    deleted from the finite automaton. */
 575
 576 char *
 577 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 578                      char *strend, const U32 flags, re_scream_pos_data *data)
 579 {
 580     dVAR;
 581     struct regexp *const prog = (struct regexp *)SvANY(rx);
 582     register I32 start_shift = 0;
 583     /* Should be nonnegative! */
 584     register I32 end_shift   = 0;
 585     register char *s;
 586     register SV *check;
 587     char *strbeg;
 588     char *t;
 589     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 590     I32 ml_anch;
 591     register char *other_last = NULL;   /* other substr checked before this */
 592     char *check_at = NULL;              /* check substr found at this pos */
 593     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 594     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 595     RXi_GET_DECL(prog,progi);
 596 #ifdef DEBUGGING
 597     const char * const i_strpos = strpos;
 598 #endif
 599     GET_RE_DEBUG_FLAGS_DECL;
 600
 601     PERL_ARGS_ASSERT_RE_INTUIT_START;
 602     PERL_UNUSED_ARG(flags);
 603     PERL_UNUSED_ARG(data);
 604
 605     RX_MATCH_UTF8_set(rx,utf8_target);
 606
 607     if (RX_UTF8(rx)) {
 608         PL_reg_flags |= RF_utf8;
 609     }
 610     DEBUG_EXECUTE_r(
 611         debug_start_match(rx, utf8_target, strpos, strend,
 612             sv ? "Guessing start of match in sv for"
 613                : "Guessing start of match in string for");
 614               );
 615
 616     /* CHR_DIST() would be more correct here but it makes things slow. */
 617     if (prog->minlen > strend - strpos) {
 618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 619                               "String too short... [re_intuit_start]\n"));
 620         goto fail;
 621     }
 622
 623     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 624     PL_regeol = strend;
 625     if (utf8_target) {
 626         if (!prog->check_utf8 && prog->check_substr)
 627             to_utf8_substr(prog);
 628         check = prog->check_utf8;
 629     } else {
 630         if (!prog->check_substr && prog->check_utf8)
 631             to_byte_substr(prog);
 632         check = prog->check_substr;
 633     }
 634     if (check == &PL_sv_undef) {
 635         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 636                 "Non-utf8 string cannot match utf8 check string\n"));
 637         goto fail;
 638     }
 639     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 640         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 641                      || ( (prog->extflags & RXf_ANCH_BOL)
 642                           && !multiline ) );    /* Check after \n? */
 643
 644         if (!ml_anch) {
 645           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 646                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 647                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 648                && sv && !SvROK(sv)
 649                && (strpos != strbeg)) {
 650               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 651               goto fail;
 652           }
 653           if (prog->check_offset_min == prog->check_offset_max &&
 654               !(prog->extflags & RXf_CANY_SEEN)) {
 655             /* Substring at constant offset from beg-of-str... */
 656             I32 slen;
 657
 658             s = HOP3c(strpos, prog->check_offset_min, strend);
 659
 660             if (SvTAIL(check)) {
 661                 slen = SvCUR(check);    /* >= 1 */
 662
 663                 if ( strend - s > slen || strend - s < slen - 1
 664                      || (strend - s == slen && strend[-1] != '\n')) {
 665                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 666                     goto fail_finish;
 667                 }
 668                 /* Now should match s[0..slen-2] */
 669                 slen--;
 670                 if (slen && (*SvPVX_const(check) != *s
 671                              || (slen > 1
 672                                  && memNE(SvPVX_const(check), s, slen)))) {
 673                   report_neq:
 674                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 675                     goto fail_finish;
 676                 }
 677             }
 678             else if (*SvPVX_const(check) != *s
 679                      || ((slen = SvCUR(check)) > 1
 680                          && memNE(SvPVX_const(check), s, slen)))
 681                 goto report_neq;
 682             check_at = s;
 683             goto success_at_start;
 684           }
 685         }
 686         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 687         s = strpos;
 688         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 689         end_shift = prog->check_end_shift;
 690
 691         if (!ml_anch) {
 692             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 693                                          - (SvTAIL(check) != 0);
 694             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 695
 696             if (end_shift < eshift)
 697                 end_shift = eshift;
 698         }
 699     }
 700     else {                              /* Can match at random position */
 701         ml_anch = 0;
 702         s = strpos;
 703         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 704         end_shift = prog->check_end_shift;
 705
 706         /* end shift should be non negative here */
 707     }
 708
 709 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 710     if (end_shift < 0)
 711         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 712                    (IV)end_shift, RX_PRECOMP(prog));
 713 #endif
 714
 715   restart:
 716     /* Find a possible match in the region s..strend by looking for
 717        the "check" substring in the region corrected by start/end_shift. */
 718
 719     {
 720         I32 srch_start_shift = start_shift;
 721         I32 srch_end_shift = end_shift;
 722         U8* start_point;
 723         U8* end_point;
 724         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 725             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 726             srch_start_shift = strbeg - s;
 727         }
 728     DEBUG_OPTIMISE_MORE_r({
 729         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 730             (IV)prog->check_offset_min,
 731             (IV)srch_start_shift,
 732             (IV)srch_end_shift,
 733             (IV)prog->check_end_shift);
 734     });
 735
 736         if (prog->extflags & RXf_CANY_SEEN) {
 737             start_point= (U8*)(s + srch_start_shift);
 738             end_point= (U8*)(strend - srch_end_shift);
 739         } else {
 740             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 741             end_point= HOP3(strend, -srch_end_shift, strbeg);
 742         }
 743         DEBUG_OPTIMISE_MORE_r({
 744             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 745                 (int)(end_point - start_point),
 746                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 747                 start_point);
 748         });
 749
 750         s = fbm_instr( start_point, end_point,
 751                       check, multiline ? FBMrf_MULTILINE : 0);
 752     }
 753     /* Update the count-of-usability, remove useless subpatterns,
 754         unshift s.  */
 755
 756     DEBUG_EXECUTE_r({
 757         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 758             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 759         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 760                           (s ? "Found" : "Did not find"),
 761             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 762                 ? "anchored" : "floating"),
 763             quoted,
 764             RE_SV_TAIL(check),
 765             (s ? " at offset " : "...\n") );
 766     });
 767
 768     if (!s)
 769         goto fail_finish;
 770     /* Finish the diagnostic message */
 771     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 772
 773     /* XXX dmq: first branch is for positive lookbehind...
 774        Our check string is offset from the beginning of the pattern.
 775        So we need to do any stclass tests offset forward from that
 776        point. I think. :-(
 777      */
 778
 779
 780
 781     check_at=s;
 782
 783
 784     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 785        Start with the other substr.
 786        XXXX no SCREAM optimization yet - and a very coarse implementation
 787        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 788                 *always* match.  Probably should be marked during compile...
 789        Probably it is right to do no SCREAM here...
 790      */
 791
 792     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 793                 : (prog->float_substr && prog->anchored_substr))
 794     {
 795         /* Take into account the "other" substring. */
 796         /* XXXX May be hopelessly wrong for UTF... */
 797         if (!other_last)
 798             other_last = strpos;
 799         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 800           do_other_anchored:
 801             {
 802                 char * const last = HOP3c(s, -start_shift, strbeg);
 803                 char *last1, *last2;
 804                 char * const saved_s = s;
 805                 SV* must;
 806
 807                 t = s - prog->check_offset_max;
 808                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 809                     && (!utf8_target
 810                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 811                             && t > strpos)))
 812                     NOOP;
 813                 else
 814                     t = strpos;
 815                 t = HOP3c(t, prog->anchored_offset, strend);
 816                 if (t < other_last)     /* These positions already checked */
 817                     t = other_last;
 818                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 819                 if (last < last1)
 820                     last1 = last;
 821                 /* XXXX It is not documented what units *_offsets are in.
 822                    We assume bytes, but this is clearly wrong.
 823                    Meaning this code needs to be carefully reviewed for errors.
 824                    dmq.
 825                   */
 826
 827                 /* On end-of-str: see comment below. */
 828                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 829                 if (must == &PL_sv_undef) {
 830                     s = (char*)NULL;
 831                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 832                 }
 833                 else
 834                     s = fbm_instr(
 835                         (unsigned char*)t,
 836                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 837                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 838                         must,
 839                         multiline ? FBMrf_MULTILINE : 0
 840                     );
 841                 DEBUG_EXECUTE_r({
 842                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 843                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 844                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 845                         (s ? "Found" : "Contradicts"),
 846                         quoted, RE_SV_TAIL(must));
 847                 });
 848
 849
 850                 if (!s) {
 851                     if (last1 >= last2) {
 852                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 853                                                 ", giving up...\n"));
 854                         goto fail_finish;
 855                     }
 856                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                         ", trying floating at offset %ld...\n",
 858                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 859                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 860                     s = HOP3c(last, 1, strend);
 861                     goto restart;
 862                 }
 863                 else {
 864                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 865                           (long)(s - i_strpos)));
 866                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 867                     other_last = HOP3c(s, 1, strend);
 868                     s = saved_s;
 869                     if (t == strpos)
 870                         goto try_at_start;
 871                     goto try_at_offset;
 872                 }
 873             }
 874         }
 875         else {          /* Take into account the floating substring. */
 876             char *last, *last1;
 877             char * const saved_s = s;
 878             SV* must;
 879
 880             t = HOP3c(s, -start_shift, strbeg);
 881             last1 = last =
 882                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 883             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 884                 last = HOP3c(t, prog->float_max_offset, strend);
 885             s = HOP3c(t, prog->float_min_offset, strend);
 886             if (s < other_last)
 887                 s = other_last;
 888  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 889             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 890             /* fbm_instr() takes into account exact value of end-of-str
 891                if the check is SvTAIL(ed).  Since false positives are OK,
 892                and end-of-str is not later than strend we are OK. */
 893             if (must == &PL_sv_undef) {
 894                 s = (char*)NULL;
 895                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 896             }
 897             else
 898                 s = fbm_instr((unsigned char*)s,
 899                               (unsigned char*)last + SvCUR(must)
 900                                   - (SvTAIL(must)!=0),
 901                               must, multiline ? FBMrf_MULTILINE : 0);
 902             DEBUG_EXECUTE_r({
 903                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 904                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 905                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 906                     (s ? "Found" : "Contradicts"),
 907                     quoted, RE_SV_TAIL(must));
 908             });
 909             if (!s) {
 910                 if (last1 == last) {
 911                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 912                                             ", giving up...\n"));
 913                     goto fail_finish;
 914                 }
 915                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 916                     ", trying anchored starting at offset %ld...\n",
 917                     (long)(saved_s + 1 - i_strpos)));
 918                 other_last = last;
 919                 s = HOP3c(t, 1, strend);
 920                 goto restart;
 921             }
 922             else {
 923                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 924                       (long)(s - i_strpos)));
 925                 other_last = s; /* Fix this later. --Hugo */
 926                 s = saved_s;
 927                 if (t == strpos)
 928                     goto try_at_start;
 929                 goto try_at_offset;
 930             }
 931         }
 932     }
 933
 934
 935     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 936
 937     DEBUG_OPTIMISE_MORE_r(
 938         PerlIO_printf(Perl_debug_log,
 939             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 940             (IV)prog->check_offset_min,
 941             (IV)prog->check_offset_max,
 942             (IV)(s-strpos),
 943             (IV)(t-strpos),
 944             (IV)(t-s),
 945             (IV)(strend-strpos)
 946         )
 947     );
 948
 949     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 950         && (!utf8_target
 951             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 952                  && t > strpos)))
 953     {
 954         /* Fixed substring is found far enough so that the match
 955            cannot start at strpos. */
 956       try_at_offset:
 957         if (ml_anch && t[-1] != '\n') {
 958             /* Eventually fbm_*() should handle this, but often
 959                anchored_offset is not 0, so this check will not be wasted. */
 960             /* XXXX In the code below we prefer to look for "^" even in
 961                presence of anchored substrings.  And we search even
 962                beyond the found float position.  These pessimizations
 963                are historical artefacts only.  */
 964           find_anchor:
 965             while (t < strend - prog->minlen) {
 966                 if (*t == '\n') {
 967                     if (t < check_at - prog->check_offset_min) {
 968                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 969                             /* Since we moved from the found position,
 970                                we definitely contradict the found anchored
 971                                substr.  Due to the above check we do not
 972                                contradict "check" substr.
 973                                Thus we can arrive here only if check substr
 974                                is float.  Redo checking for "other"=="fixed".
 975                              */
 976                             strpos = t + 1;
 977                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 978                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 979                             goto do_other_anchored;
 980                         }
 981                         /* We don't contradict the found floating substring. */
 982                         /* XXXX Why not check for STCLASS? */
 983                         s = t + 1;
 984                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 985                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 986                         goto set_useful;
 987                     }
 988                     /* Position contradicts check-string */
 989                     /* XXXX probably better to look for check-string
 990                        than for "\n", so one should lower the limit for t? */
 991                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 992                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 993                     other_last = strpos = s = t + 1;
 994                     goto restart;
 995                 }
 996                 t++;
 997             }
 998             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 999                         PL_colors[0], PL_colors[1]));
1000             goto fail_finish;
1001         }
1002         else {
1003             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1004                         PL_colors[0], PL_colors[1]));
1005         }
1006         s = t;
1007       set_useful:
1008         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1009     }
1010     else {
1011         /* The found string does not prohibit matching at strpos,
1012            - no optimization of calling REx engine can be performed,
1013            unless it was an MBOL and we are not after MBOL,
1014            or a future STCLASS check will fail this. */
1015       try_at_start:
1016         /* Even in this situation we may use MBOL flag if strpos is offset
1017            wrt the start of the string. */
1018         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1019             && (strpos != strbeg) && strpos[-1] != '\n'
1020             /* May be due to an implicit anchor of m{.*foo}  */
1021             && !(prog->intflags & PREGf_IMPLICIT))
1022         {
1023             t = strpos;
1024             goto find_anchor;
1025         }
1026         DEBUG_EXECUTE_r( if (ml_anch)
1027             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1028                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1029         );
1030       success_at_start:
1031         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1032             && (utf8_target ? (
1033                 prog->check_utf8                /* Could be deleted already */
1034                 && --BmUSEFUL(prog->check_utf8) < 0
1035                 && (prog->check_utf8 == prog->float_utf8)
1036             ) : (
1037                 prog->check_substr              /* Could be deleted already */
1038                 && --BmUSEFUL(prog->check_substr) < 0
1039                 && (prog->check_substr == prog->float_substr)
1040             )))
1041         {
1042             /* If flags & SOMETHING - do not do it many times on the same match */
1043             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1044             /* XXX Does the destruction order has to change with utf8_target? */
1045             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1046             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1047             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1048             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1049             check = NULL;                       /* abort */
1050             s = strpos;
1051             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1052                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1053             if (prog->intflags & PREGf_IMPLICIT)
1054                 prog->extflags &= ~RXf_ANCH_MBOL;
1055             /* XXXX This is a remnant of the old implementation.  It
1056                     looks wasteful, since now INTUIT can use many
1057                     other heuristics. */
1058             prog->extflags &= ~RXf_USE_INTUIT;
1059             /* XXXX What other flags might need to be cleared in this branch? */
1060         }
1061         else
1062             s = strpos;
1063     }
1064
1065     /* Last resort... */
1066     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1067     /* trie stclasses are too expensive to use here, we are better off to
1068        leave it to regmatch itself */
1069     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1070         /* minlen == 0 is possible if regstclass is \b or \B,
1071            and the fixed substr is ''$.
1072            Since minlen is already taken into account, s+1 is before strend;
1073            accidentally, minlen >= 1 guaranties no false positives at s + 1
1074            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1075            regstclass does not come from lookahead...  */
1076         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1077            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1078         const U8* const str = (U8*)STRING(progi->regstclass);
1079         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1080                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1081                     : 1);
1082         char * endpos;
1083         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1084             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1085         else if (prog->float_substr || prog->float_utf8)
1086             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1087         else
1088             endpos= strend;
1089
1090         if (checked_upto < s)
1091            checked_upto = s;
1092         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1093                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1094
1095         t = s;
1096         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1097         if (s) {
1098             checked_upto = s;
1099         } else {
1100 #ifdef DEBUGGING
1101             const char *what = NULL;
1102 #endif
1103             if (endpos == strend) {
1104                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Could not match STCLASS...\n") );
1106                 goto fail;
1107             }
1108             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109                                    "This position contradicts STCLASS...\n") );
1110             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1111                 goto fail;
1112             checked_upto = HOPBACKc(endpos, start_shift);
1113             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1114                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1115             /* Contradict one of substrings */
1116             if (prog->anchored_substr || prog->anchored_utf8) {
1117                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1118                     DEBUG_EXECUTE_r( what = "anchored" );
1119                   hop_and_restart:
1120                     s = HOP3c(t, 1, strend);
1121                     if (s + start_shift + end_shift > strend) {
1122                         /* XXXX Should be taken into account earlier? */
1123                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1124                                                "Could not match STCLASS...\n") );
1125                         goto fail;
1126                     }
1127                     if (!check)
1128                         goto giveup;
1129                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1130                                 "Looking for %s substr starting at offset %ld...\n",
1131                                  what, (long)(s + start_shift - i_strpos)) );
1132                     goto restart;
1133                 }
1134                 /* Have both, check_string is floating */
1135                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1136                     goto retry_floating_check;
1137                 /* Recheck anchored substring, but not floating... */
1138                 s = check_at;
1139                 if (!check)
1140                     goto giveup;
1141                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142                           "Looking for anchored substr starting at offset %ld...\n",
1143                           (long)(other_last - i_strpos)) );
1144                 goto do_other_anchored;
1145             }
1146             /* Another way we could have checked stclass at the
1147                current position only: */
1148             if (ml_anch) {
1149                 s = t = t + 1;
1150                 if (!check)
1151                     goto giveup;
1152                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1153                           "Looking for /%s^%s/m starting at offset %ld...\n",
1154                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1155                 goto try_at_offset;
1156             }
1157             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1158                 goto fail;
1159             /* Check is floating substring. */
1160           retry_floating_check:
1161             t = check_at - start_shift;
1162             DEBUG_EXECUTE_r( what = "floating" );
1163             goto hop_and_restart;
1164         }
1165         if (t != s) {
1166             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1167                         "By STCLASS: moving %ld --> %ld\n",
1168                                   (long)(t - i_strpos), (long)(s - i_strpos))
1169                    );
1170         }
1171         else {
1172             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1173                                   "Does not contradict STCLASS...\n");
1174                    );
1175         }
1176     }
1177   giveup:
1178     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1179                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1180                           PL_colors[5], (long)(s - i_strpos)) );
1181     return s;
1182
1183   fail_finish:                          /* Substring not found */
1184     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1185         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1186   fail:
1187     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1188                           PL_colors[4], PL_colors[5]));
1189     return NULL;
1190 }
1191
     /* DECL_TRIE_TYPE(scan): declare a const local named 'trie_type' (of an
      * anonymous enum type) in the scope where the macro is expanded.
      *
      * The value is picked from two inputs:
      *   - scan->flags == EXACT : trie_utf8 if utf8_target is true,
      *                            trie_plain otherwise;
      *   - any other flags value: trie_utf8_fold if utf8_target is true,
      *                            trie_latin_utf8_fold otherwise.
      *
      * 'utf8_target' is not a macro parameter; it must already be visible at
      * the expansion site.  Because the expansion is a declaration, the macro
      * has to be used where a declaration is legal.  'scan' is pasted without
      * surrounding parentheses, so pass a plain pointer expression.
      */
1192 #define DECL_TRIE_TYPE(scan) \
1193     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1194                     trie_type = ((scan->flags == EXACT) \
1195                               ? (utf8_target ? trie_utf8 : trie_plain) \
1196                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1197
     /* REXEC_TRIE_READ_CHAR: fetch the next character to feed to a trie and
      * translate it to the trie's character id.
      *
      * Outputs (all must be assignable lvalues; arguments are evaluated more
      * than once):
      *   uvc     - the character code that was read
      *   len     - value stored alongside uvc: UTF8SKIP(uc) or 1 when a new
      *             character is read from 'uc', the decoder's reported length
      *             for trie_utf8, and 0 when the character came out of the
      *             pending fold buffer
      *   charid  - trie->charmap[uvc] when uvc < 256; otherwise the integer
      *             value found in 'widecharmap' under the raw bytes of uvc
      *             (sizeof(UV) long), or 0 when there is no widecharmap or
      *             no entry
      *
      * Per trie_type:
      *   trie_utf8_fold / trie_latin_utf8_fold
      *       If foldlen > 0, part of an earlier fold is still pending: decode
      *       the next code point from 'uscan', then advance 'uscan' and
      *       shrink 'foldlen' by the decoded length and set len to 0.
      *       Otherwise fold the character at 'uc' into 'foldbuf' (with
      *       to_utf8_fold() or _to_fold_latin1() respectively), take uvc from
      *       that call's return value, and leave 'uscan'/'foldlen' describing
      *       what remains of foldbuf after the first UNISKIP(uvc) bytes.
      *       NOTE(review): this presumes the fold routine returns the first
      *       code point of the folded sequence - confirm against its docs.
      *   trie_utf8
      *       Decode one code point directly from 'uc'.
      *   trie_plain
      *       Take the single byte *uc; len is 1.
      *
      * The switch has no default case: for any other trie_type value uvc and
      * len are left as they were on entry.
      */
1198 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1199 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1200     STRLEN skiplen;                                                                 \
1201     switch (trie_type) {                                                            \
1202     case trie_utf8_fold:                                                            \
1203         if ( foldlen>0 ) {                                                          \
1204             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1205             foldlen -= len;                                                         \
1206             uscan += len;                                                           \
1207             len=0;                                                                  \
1208         } else {                                                                    \
1209             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1210             len = UTF8SKIP(uc);                                                     \
1211             skiplen = UNISKIP( uvc );                                               \
1212             foldlen -= skiplen;                                                     \
1213             uscan = foldbuf + skiplen;                                              \
1214         }                                                                           \
1215         break;                                                                      \
1216     case trie_latin_utf8_fold:                                                      \
1217         if ( foldlen>0 ) {                                                          \
1218             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1219             foldlen -= len;                                                         \
1220             uscan += len;                                                           \
1221             len=0;                                                                  \
1222         } else {                                                                    \
1223             len = 1;                                                                \
1224             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1225             skiplen = UNISKIP( uvc );                                               \
1226             foldlen -= skiplen;                                                     \
1227             uscan = foldbuf + skiplen;                                              \
1228         }                                                                           \
1229         break;                                                                      \
1230     case trie_utf8:                                                                 \
1231         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1232         break;                                                                      \
1233     case trie_plain:                                                                \
1234         uvc = (UV)*uc;                                                              \
1235         len = 1;                                                                    \
1236     }                                                                               \
1237     if (uvc < 256) {                                                                \
1238         charid = trie->charmap[ uvc ];                                              \
1239     }                                                                               \
1240     else {                                                                          \
1241         charid = 0;                                                                 \
1242         if (widecharmap) {                                                          \
1243             SV** const svpp = hv_fetch(widecharmap,                                 \
1244                         (char*)&uvc, sizeof(UV), 0);                                \
1245             if (svpp)                                                               \
1246                 charid = (U16)SvIV(*svpp);                                          \
1247         }                                                                           \
1248     }                                                                               \
1249 } STMT_END
1250 /* REXEC_FBC_EXACTISH_SCAN(CoNd): byte-at-a-time scan used by the non-utf8
      * EXACTF* cases of S_find_byclass().  For every position from 's' up to
      * and including 'e' it jumps to 'got_it' when all of these hold, tested
      * in this order:
      *   - CoNd, the caller's test on the byte at 's';
      *   - ln == 1, or folder(s, pat_string, ln) returns true;
      *   - reginfo is NULL (a dry run), or regtry(reginfo, &s) succeeds.
      * Otherwise 's' advances by one.  The macro expands in the caller's scope
      * and relies on its 's', 'e', 'ln', 'folder', 'pat_string', 'reginfo' and
      * on the label 'got_it'. */
1251 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1252 STMT_START {                                              \
1253     while (s <= e) {                                      \
1254         if ( (CoNd)                                       \
1255              && (ln == 1 || folder(s, pat_string, ln))    \
1256              && (!reginfo || regtry(reginfo, &s)) )       \
1257             goto got_it;                                  \
1258         s++;                                              \
1259     }                                                     \
1260 } STMT_END
1261 /* REXEC_FBC_UTF8_SCAN(CoDe): run the statement(s) CoDe once per character
      * of a UTF-8 target.  Each iteration stores UTF8SKIP(s) in the caller's
      * 'uskip' and continues only while s + uskip <= strend, i.e. while the
      * whole character starting at 's' ends at or before 'strend'.  After
      * CoDe, 's' moves forward by 'uskip'.  CoDe is pasted as-is (no ';' is
      * added after it) and may leave the loop with a 'goto'. */
1262 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1263 STMT_START {                                          \
1264     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1265         CoDe                                          \
1266         s += uskip;                                   \
1267     }                                                 \
1268 } STMT_END
1269 /* REXEC_FBC_SCAN(CoDe): byte-wise counterpart of REXEC_FBC_UTF8_SCAN.  Runs
      * the statement(s) CoDe for every position while s < strend, then
      * increments 's' by one.  CoDe is pasted as-is (no ';' is added after it)
      * and may leave the loop with a 'goto'. */
1270 #define REXEC_FBC_SCAN(CoDe)                          \
1271 STMT_START {                                          \
1272     while (s < strend) {                              \
1273         CoDe                                          \
1274         s++;                                          \
1275     }                                                 \
1276 } STMT_END
1277 /* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): scan a UTF-8 target for a character
      * satisfying CoNd and attempt a match there.  The caller's 'tmp' acts as
      * a "may try here" flag:
      *   - CoNd true and tmp non-zero: jump to 'got_it' if reginfo is NULL or
      *     regtry(reginfo, &s) succeeds;
      *   - CoNd true but no jump was taken: tmp is set to 'doevery', so when
      *     'doevery' is 0 the following characters of the same run are not
      *     tried;
      *   - CoNd false: tmp is reset to 1, re-arming the next matching
      *     character. */
1278 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1279 REXEC_FBC_UTF8_SCAN(                                  \
1280     if (CoNd) {                                       \
1281         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1282             goto got_it;                              \
1283         else                                          \
1284             tmp = doevery;                            \
1285     }                                                 \
1286     else                                              \
1287         tmp = 1;                                      \
1288 )
1289 /* REXEC_FBC_CLASS_SCAN(CoNd): same logic as REXEC_FBC_UTF8_CLASS_SCAN, but
      * stepping one byte at a time via REXEC_FBC_SCAN.  When CoNd holds and
      * 'tmp' is non-zero a match is attempted (jump to 'got_it' if reginfo is
      * NULL or regtry() succeeds); if no jump is taken 'tmp' becomes
      * 'doevery'.  When CoNd fails 'tmp' is reset to 1. */
1290 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1291 REXEC_FBC_SCAN(                                       \
1292     if (CoNd) {                                       \
1293         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1294             goto got_it;                              \
1295         else                                          \
1296             tmp = doevery;                            \
1297     }                                                 \
1298     else                                              \
1299         tmp = 1;                                      \
1300 )
1301 /* REXEC_FBC_TRYIT: attempt a match at 's' -- jump to 'got_it' if reginfo
      * is NULL (dry run) or regtry(reginfo, &s) succeeds.  Expands to a bare
      * 'if' with no braces and no trailing ';': the user supplies the ';',
      * and an 'else' written directly after it would attach to this 'if'. */
1302 #define REXEC_FBC_TRYIT               \
1303 if ((!reginfo || regtry(reginfo, &s))) \
1304     goto got_it
1305 /* REXEC_FBC_CSCAN(CoNdUtF8,CoNd): class scan that picks the stepping mode
      * from the caller's 'utf8_target': CoNdUtF8 is used with the
      * per-character scan when the target is UTF-8, CoNd with the per-byte
      * scan otherwise.  Expands to a plain if/else (not wrapped in
      * STMT_START/STMT_END). */
1306 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1307     if (utf8_target) {                                             \
1308         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1309     }                                                          \
1310     else {                                                     \
1311         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1312     }
1313 /* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd): like REXEC_FBC_CSCAN,
      * but first executes the statement UtFpReLoAd in the UTF-8 branch only
      * (callers in this file pass a LOAD_UTF8_CHARCLASS_*() call) before
      * scanning with CoNdUtF8.  The non-UTF-8 branch scans with CoNd and does
      * not run the preload. */
1314 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1315     if (utf8_target) {                                             \
1316         UtFpReLoAd;                                            \
1317         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1318     }                                                          \
1319     else {                                                     \
1320         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1321     }
1322 /* REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd): like REXEC_FBC_CSCAN, but first
      * sets RF_tainted in PL_reg_flags (the callers in this file are the
      * locale node types).  Note that this expands to two statements, an
      * assignment followed by an if/else, so it must only be used where a
      * statement sequence is valid, not as the body of an unbraced 'if'. */
1323 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1324     PL_reg_flags |= RF_tainted;                                \
1325     if (utf8_target) {                                             \
1326         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1327     }                                                          \
1328     else {                                                     \
1329         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1330     }
1331 /* DUMP_EXEC_POS(li,s,doutf8): shorthand for dump_exec_pos() that supplies
      * PL_regeol, PL_bostr and PL_reg_starttry for the middle three arguments
      * and passes li, s and doutf8 through unchanged. */
1332 #define DUMP_EXEC_POS(li,s,doutf8) \
1333     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1334
1335 /* UTF8_NOLOAD and UTF8_LOAD build the UTF-8 branch that is handed to
      * FBC_BOUND_COMMON as its UTF8_CODE argument.  Both keep, in the caller's
      * 'tmp', the result of the class test for the character before 's', then
      * walk the target with REXEC_FBC_UTF8_SCAN.  At each position, if the
      * test result for the current character differs from 'tmp', 'tmp' is
      * flipped and IF_SUCCESS runs; otherwise IF_FAIL runs.  When
      * s == PL_bostr the previous character is taken to be '\n'.
      *
      * UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)
      *   Applies the single-byte test TEST_NON_UTF8 to the byte at s - 1 and
      *   to the first byte of each character.
      *
      * UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL)
      *   Steps back one character with reghop3() and decodes it with
      *   utf8n_to_uvchr() into 'tmp'.  TeSt1_UtF8 is an expression evaluated
      *   once to turn that into the initial state (callers in this file write
      *   it in terms of 'tmp'); TeSt2_UtF8 is an expression evaluated at every
      *   position (callers write it in terms of 's').
      *   LOAD_UTF8_CHARCLASS_ALNUM() is invoked before the scan.
      *
      * Each definition ends with a trailing backslash, so the blank line that
      * follows it is part of the macro; keep that blank line in place. */
1336 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1337         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1338         tmp = TEST_NON_UTF8(tmp);                                              \
1339         REXEC_FBC_UTF8_SCAN(                                                   \
1340             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1341                 tmp = !tmp;                                                    \
1342                 IF_SUCCESS;                                                    \
1343             }                                                                  \
1344             else {                                                             \
1345                 IF_FAIL;                                                       \
1346             }                                                                  \
1347         );                                                                     \
1348
1349 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1350         if (s == PL_bostr) {                                                   \
1351             tmp = '\n';                                                        \
1352         }                                                                      \
1353         else {                                                                 \
1354             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1355             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1356         }                                                                      \
1357         tmp = TeSt1_UtF8;                                                      \
1358         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1359         REXEC_FBC_UTF8_SCAN(                                                   \
1360             if (tmp == ! (TeSt2_UtF8)) { \
1361                 tmp = !tmp;                                                    \
1362                 IF_SUCCESS;                                                    \
1363             }                                                                  \
1364             else {                                                             \
1365                 IF_FAIL;                                                       \
1366             }                                                                  \
1367         );                                                                     \
1368
1369 /* The only difference between the BOUND and NBOUND cases is that
1370  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1371  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1372  * with the other one being PLACEHOLDER (defined outside this section;
      * presumably it expands to nothing).
      *
      * All four wrappers take the same three arguments so that call sites look
      * alike:
      *   TEST_NON_UTF8  single-byte test macro, applied to a byte value
      *   TEST1_UTF8     expression giving the wordness of the previous
      *                  character (passed to UTF8_LOAD as TeSt1_UtF8)
      *   TEST2_UTF8     expression giving the wordness of the character at
      *                  's' (passed to UTF8_LOAD as TeSt2_UtF8)
      * The *_NOLOAD variants accept TEST1_UTF8 and TEST2_UTF8 but ignore them:
      * their UTF-8 branch is UTF8_NOLOAD, which uses TEST_NON_UTF8 only. */
1373 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1374     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1375
1376 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1377     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1378
1379 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1380     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1381
1382 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1383     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1384
1385
1386 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1387  * be passed in completely with the variable name being tested, which isn't
1388  * such a clean interface, but this is easier to read than it was before.  We
1389  * are looking for the boundary (or non-boundary) between a word and non-word
1390  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1391  * must be different.  Find the "wordness" of the character just prior to this
1392  * one, and compare it with the wordness of this one.  If they differ, we have
1393  * a boundary.  At the beginning of the string, pretend that the previous
1394  * character was a new-line.
      *
      * Arguments:
      *   UTF8_CODE      complete statement sequence for a UTF-8 target (a
      *                  UTF8_LOAD(...) or UTF8_NOLOAD(...) expansion)
      *   TEST_NON_UTF8  single-byte wordness test used for a non-UTF-8 target
      *   IF_SUCCESS     statement run at a position where wordness changes
      *   IF_FAIL        statement run at a position where it does not
      *
      * After the scan, one more match attempt is made at the final 's' when
      * prog->minlen is 0 and 'tmp' is true.
      * NOTE(review): this closing test uses 'tmp' for the BOUND and the NBOUND
      * expansions alike; confirm that is intended for NBOUND.
      *
      * Expands to several statements in the caller's scope and already ends
      * with ';'. */
1395 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1396     if (utf8_target) {                                                         \
1397                 UTF8_CODE \
1398     }                                                                          \
1399     else {  /* Not utf8 */                                                     \
1400         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1401         tmp = TEST_NON_UTF8(tmp);                                              \
1402         REXEC_FBC_SCAN(                                                        \
1403             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1404                 tmp = !tmp;                                                    \
1405                 IF_SUCCESS;                                                    \
1406             }                                                                  \
1407             else {                                                             \
1408                 IF_FAIL;                                                       \
1409             }                                                                  \
1410         );                                                                     \
1411     }                                                                          \
1412     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1413         goto got_it;
1414
1415 /* We know what class REx starts with.  Try to find this position... */
1416 /* if reginfo is NULL, it's a dry run */
1417 /* annoyingly all the vars in this routine have different names from their counterparts
1418    in regmatch. /grrr */
1419
1420 STATIC char *
1421 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1422     const char *strend, regmatch_info *reginfo)
1423 {
1424         dVAR;
1425         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1426         char *pat_string;   /* The pattern's exactish string */
1427         char *pat_end;      /* ptr to end char of pat_string */
1428         re_fold_t folder;       /* Function for computing non-utf8 folds */
1429         const U8 *fold_array;   /* array for folding ords < 256 */
1430         STRLEN ln;
1431         STRLEN lnc;
1432         register STRLEN uskip;
1433         U8 c1;
1434         U8 c2;
1435         char *e;
1436         register I32 tmp = 1;   /* Scratch variable? */
1437         register const bool utf8_target = PL_reg_match_utf8;
1438         UV utf8_fold_flags = 0;
1439         RXi_GET_DECL(prog,progi);
1440
1441         PERL_ARGS_ASSERT_FIND_BYCLASS;
1442
1443         /* We know what class it must start with. */
1444         switch (OP(c)) {
1445         case ANYOFV:
1446         case ANYOF:
1447             if (utf8_target || OP(c) == ANYOFV) {
1448                 STRLEN inclasslen = strend - s;
1449                 REXEC_FBC_UTF8_CLASS_SCAN(
1450                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1451             }
1452             else {
1453                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1454             }
1455             break;
1456         case CANY:
1457             REXEC_FBC_SCAN(
1458                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1459                     goto got_it;
1460                 else
1461                     tmp = doevery;
1462             );
1463             break;
1464
1465         case EXACTFA:
1466             if (UTF_PATTERN || utf8_target) {
1467                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1468                 goto do_exactf_utf8;
1469             }
1470             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1471             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1472             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1473
1474         case EXACTF:
1475             if (utf8_target) {
1476
1477                 /* regcomp.c already folded this if pattern is in UTF-8 */
1478                 utf8_fold_flags = 0;
1479                 goto do_exactf_utf8;
1480             }
1481             fold_array = PL_fold;
1482             folder = foldEQ;
1483             goto do_exactf_non_utf8;
1484
1485         case EXACTFL:
1486             if (UTF_PATTERN || utf8_target) {
1487                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1488                 goto do_exactf_utf8;
1489             }
1490             fold_array = PL_fold_locale;
1491             folder = foldEQ_locale;
1492             goto do_exactf_non_utf8;
1493
1494         case EXACTFU_SS:
1495             if (UTF_PATTERN) {
1496                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1497             }
1498             goto do_exactf_utf8;
1499
1500         case EXACTFU_TRICKYFOLD:
1501         case EXACTFU:
1502             if (UTF_PATTERN || utf8_target) {
1503                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1504                 goto do_exactf_utf8;
1505             }
1506
1507             /* Any 'ss' in the pattern should have been replaced by regcomp,
1508              * so we don't have to worry here about this single special case
1509              * in the Latin1 range */
1510             fold_array = PL_fold_latin1;
1511             folder = foldEQ_latin1;
1512
1513             /* FALL THROUGH */
1514
1515         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1516                                are no glitches with fold-length differences
1517                                between the target string and pattern */
1518
1519             /* The idea in the non-utf8 EXACTF* cases is to first find the
1520              * first character of the EXACTF* node and then, if necessary,
1521              * case-insensitively compare the full text of the node.  c1 is the
1522              * first character.  c2 is its fold.  This logic will not work for
1523              * Unicode semantics and the german sharp ss, which hence should
1524              * not be compiled into a node that gets here. */
1525             pat_string = STRING(c);
1526             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1527
1528             /* We know that we have to match at least 'ln' bytes (which is the
1529              * same as characters, since not utf8).  If we have to match 3
1530              * characters, and there are only 2 available, we know without
1531              * trying that it will fail; so don't start a match past the
1532              * required minimum number from the far end */
1533             e = HOP3c(strend, -((I32)ln), s);
1534
1535             if (!reginfo && e < s) {
1536                 e = s;                  /* Due to minlen logic of intuit() */
1537             }
1538
1539             c1 = *pat_string;
1540             c2 = fold_array[c1];
1541             if (c1 == c2) { /* If char and fold are the same */
1542                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1543             }
1544             else {
1545                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1546             }
1547             break;
1548
1549         do_exactf_utf8:
1550         {
1551             unsigned expansion;
1552
1553
1554             /* If one of the operands is in utf8, we can't use the simpler
1555              * folding above, due to the fact that many different characters
1556              * can have the same fold, or portion of a fold, or different-
1557              * length fold */
1558             pat_string = STRING(c);
1559             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1560             pat_end = pat_string + ln;
1561             lnc = (UTF_PATTERN) /* length to match in characters */
1562                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1563                     : ln;
1564
1565             /* We have 'lnc' characters to match in the pattern, but because of
1566              * multi-character folding, each character in the target can match
1567              * up to 3 characters (Unicode guarantees it will never exceed
1568              * this) if it is utf8-encoded; and up to 2 if not (based on the
1569              * fact that the Latin 1 folds are already determined, and the
1570              * only multi-char fold in that range is the sharp-s folding to
1571              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1572              * string character.  Adjust lnc accordingly, rounding up, so that
1573              * if we need to match at least 4+1/3 chars, that really is 5. */
1574             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1575             lnc = (lnc + expansion - 1) / expansion;
1576
1577             /* As in the non-UTF8 case, if we have to match 3 characters, and
1578              * only 2 are left, it's guaranteed to fail, so don't start a
1579              * match that would require us to go beyond the end of the string
1580              */
1581             e = HOP3c(strend, -((I32)lnc), s);
1582
1583             if (!reginfo && e < s) {
1584                 e = s;                  /* Due to minlen logic of intuit() */
1585             }
1586
1587             /* XXX Note that we could recalculate e to stop the loop earlier,
1588              * as the worst case expansion above will rarely be met, and as we
1589              * go along we would usually find that e moves further to the left.
1590              * This would happen only after we reached the point in the loop
1591              * where if there were no expansion we should fail.  Unclear if
1592              * worth the expense */
1593
1594             while (s <= e) {
1595                 char *my_strend= (char *)strend;
1596                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1597                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1598                     && (!reginfo || regtry(reginfo, &s)) )
1599                 {
1600                     goto got_it;
1601                 }
1602                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1603             }
1604             break;
1605         }
1606         case BOUNDL:
1607             PL_reg_flags |= RF_tainted;
1608             FBC_BOUND(isALNUM_LC,
1609                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1610                       isALNUM_LC_utf8((U8*)s));
1611             break;
1612         case NBOUNDL:
1613             PL_reg_flags |= RF_tainted;
1614             FBC_NBOUND(isALNUM_LC,
1615                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1616                        isALNUM_LC_utf8((U8*)s));
1617             break;
1618         case BOUND:
1619             FBC_BOUND(isWORDCHAR,
1620                       isALNUM_uni(tmp),
1621                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1622             break;
1623         case BOUNDA:
1624             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1625                              isWORDCHAR_A(tmp),
1626                              isWORDCHAR_A((U8*)s));
1627             break;
1628         case NBOUND:
1629             FBC_NBOUND(isWORDCHAR,
1630                        isALNUM_uni(tmp),
1631                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1632             break;
1633         case NBOUNDA:
1634             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1635                               isWORDCHAR_A(tmp),
1636                               isWORDCHAR_A((U8*)s));
1637             break;
1638         case BOUNDU:
1639             FBC_BOUND(isWORDCHAR_L1,
1640                       isALNUM_uni(tmp),
1641                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1642             break;
1643         case NBOUNDU:
1644             FBC_NBOUND(isWORDCHAR_L1,
1645                        isALNUM_uni(tmp),
1646                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647             break;
1648         case ALNUML:
1649             REXEC_FBC_CSCAN_TAINT(
1650                 isALNUM_LC_utf8((U8*)s),
1651                 isALNUM_LC(*s)
1652             );
1653             break;
1654         case ALNUMU:
1655             REXEC_FBC_CSCAN_PRELOAD(
1656                 LOAD_UTF8_CHARCLASS_ALNUM(),
1657                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1658                 isWORDCHAR_L1((U8) *s)
1659             );
1660             break;
1661         case ALNUM:
1662             REXEC_FBC_CSCAN_PRELOAD(
1663                 LOAD_UTF8_CHARCLASS_ALNUM(),
1664                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1665                 isWORDCHAR((U8) *s)
1666             );
1667             break;
1668         case ALNUMA:
1669             /* Don't need to worry about utf8, as it can match only a single
1670              * byte invariant character */
1671             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1672             break;
1673         case NALNUMU:
1674             REXEC_FBC_CSCAN_PRELOAD(
1675                 LOAD_UTF8_CHARCLASS_ALNUM(),
1676                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1677                 ! isWORDCHAR_L1((U8) *s)
1678             );
1679             break;
1680         case NALNUM:
1681             REXEC_FBC_CSCAN_PRELOAD(
1682                 LOAD_UTF8_CHARCLASS_ALNUM(),
1683                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1684                 ! isALNUM(*s)
1685             );
1686             break;
1687         case NALNUMA:
1688             REXEC_FBC_CSCAN(
1689                 !isWORDCHAR_A(*s),
1690                 !isWORDCHAR_A(*s)
1691             );
1692             break;
1693         case NALNUML:
1694             REXEC_FBC_CSCAN_TAINT(
1695                 !isALNUM_LC_utf8((U8*)s),
1696                 !isALNUM_LC(*s)
1697             );
1698             break;
1699         case SPACEU:
1700             REXEC_FBC_CSCAN_PRELOAD(
1701                 LOAD_UTF8_CHARCLASS_SPACE(),
1702                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1703                 isSPACE_L1((U8) *s)
1704             );
1705             break;
1706         case SPACE:
1707             REXEC_FBC_CSCAN_PRELOAD(
1708                 LOAD_UTF8_CHARCLASS_SPACE(),
1709                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1710                 isSPACE((U8) *s)
1711             );
1712             break;
1713         case SPACEA:
1714             /* Don't need to worry about utf8, as it can match only a single
1715              * byte invariant character */
1716             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1717             break;
1718         case SPACEL:
1719             REXEC_FBC_CSCAN_TAINT(
1720                 isSPACE_LC_utf8((U8*)s),
1721                 isSPACE_LC(*s)
1722             );
1723             break;
1724         case NSPACEU:
1725             REXEC_FBC_CSCAN_PRELOAD(
1726                 LOAD_UTF8_CHARCLASS_SPACE(),
1727                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1728                 ! isSPACE_L1((U8) *s)
1729             );
1730             break;
1731         case NSPACE:
1732             REXEC_FBC_CSCAN_PRELOAD(
1733                 LOAD_UTF8_CHARCLASS_SPACE(),
1734                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1735                 ! isSPACE((U8) *s)
1736             );
1737             break;
1738         case NSPACEA:
1739             REXEC_FBC_CSCAN(
1740                 !isSPACE_A(*s),
1741                 !isSPACE_A(*s)
1742             );
1743             break;
1744         case NSPACEL:
1745             REXEC_FBC_CSCAN_TAINT(
1746                 !isSPACE_LC_utf8((U8*)s),
1747                 !isSPACE_LC(*s)
1748             );
1749             break;
1750         case DIGIT:
1751             REXEC_FBC_CSCAN_PRELOAD(
1752                 LOAD_UTF8_CHARCLASS_DIGIT(),
1753                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1754                 isDIGIT(*s)
1755             );
1756             break;
1757         case DIGITA:
1758             /* Don't need to worry about utf8, as it can match only a single
1759              * byte invariant character */
1760             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1761             break;
1762         case DIGITL:
1763             REXEC_FBC_CSCAN_TAINT(
1764                 isDIGIT_LC_utf8((U8*)s),
1765                 isDIGIT_LC(*s)
1766             );
1767             break;
1768         case NDIGIT:
1769             REXEC_FBC_CSCAN_PRELOAD(
1770                 LOAD_UTF8_CHARCLASS_DIGIT(),
1771                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1772                 !isDIGIT(*s)
1773             );
1774             break;
1775         case NDIGITA:
1776             REXEC_FBC_CSCAN(
1777                 !isDIGIT_A(*s),
1778                 !isDIGIT_A(*s)
1779             );
1780             break;
1781         case NDIGITL:
1782             REXEC_FBC_CSCAN_TAINT(
1783                 !isDIGIT_LC_utf8((U8*)s),
1784                 !isDIGIT_LC(*s)
1785             );
1786             break;
1787         case LNBREAK:
1788             REXEC_FBC_CSCAN(
1789                 is_LNBREAK_utf8(s),
1790                 is_LNBREAK_latin1(s)
1791             );
1792             break;
1793         case VERTWS:
1794             REXEC_FBC_CSCAN(
1795                 is_VERTWS_utf8(s),
1796                 is_VERTWS_latin1(s)
1797             );
1798             break;
1799         case NVERTWS:
1800             REXEC_FBC_CSCAN(
1801                 !is_VERTWS_utf8(s),
1802                 !is_VERTWS_latin1(s)
1803             );
1804             break;
1805         case HORIZWS:
1806             REXEC_FBC_CSCAN(
1807                 is_HORIZWS_utf8(s),
1808                 is_HORIZWS_latin1(s)
1809             );
1810             break;
1811         case NHORIZWS:
1812             REXEC_FBC_CSCAN(
1813                 !is_HORIZWS_utf8(s),
1814                 !is_HORIZWS_latin1(s)
1815             );
1816             break;
1817         case AHOCORASICKC:
1818         case AHOCORASICK:
1819             {
1820                 DECL_TRIE_TYPE(c);
1821                 /* what trie are we using right now */
1822                 reg_ac_data *aho
1823                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1824                 reg_trie_data *trie
1825                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1826                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1827
1828                 const char *last_start = strend - trie->minlen;
1829 #ifdef DEBUGGING
1830                 const char *real_start = s;
1831 #endif
1832                 STRLEN maxlen = trie->maxlen;
1833                 SV *sv_points;
1834                 U8 **points; /* map of where we were in the input string
1835                                 when reading a given char. For ASCII this
1836                                 is unnecessary overhead as the relationship
1837                                 is always 1:1, but for Unicode, especially
1838                                 case folded Unicode this is not true. */
1839                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1840                 U8 *bitmap=NULL;
1841
1842
1843                 GET_RE_DEBUG_FLAGS_DECL;
1844
1845                 /* We can't just allocate points here. We need to wrap it in
1846                  * an SV so it gets freed properly if there is a croak while
1847                  * running the match */
1848                 ENTER;
1849                 SAVETMPS;
1850                 sv_points=newSV(maxlen * sizeof(U8 *));
1851                 SvCUR_set(sv_points,
1852                     maxlen * sizeof(U8 *));
1853                 SvPOK_on(sv_points);
1854                 sv_2mortal(sv_points);
1855                 points=(U8**)SvPV_nolen(sv_points );
1856                 if ( trie_type != trie_utf8_fold
1857                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1858                 {
1859                     if (trie->bitmap)
1860                         bitmap=(U8*)trie->bitmap;
1861                     else
1862                         bitmap=(U8*)ANYOF_BITMAP(c);
1863                 }
1864                 /* this is the Aho-Corasick algorithm modified a touch
1865                    to include special handling for long "unknown char"
1866                    sequences. The basic idea being that we use AC as long
1867                    as we are dealing with a possible matching char, when
1868                    we encounter an unknown char (and we have not encountered
1869                    an accepting state) we scan forward until we find a legal
1870                    starting char.
1871                    AC matching is basically that of trie matching, except
1872                    that when we encounter a failing transition, we fall back
1873                    to the current states "fail state", and try the current char
1874                    again, a process we repeat until we reach the root state,
1875                    state 1, or a legal transition. If we fail on the root state
1876                    then we can either terminate if we have reached an accepting
1877                    state previously, or restart the entire process from the beginning
1878                    if we have not.
1879
1880                  */
1881                 while (s <= last_start) {
1882                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1883                     U8 *uc = (U8*)s;
1884                     U16 charid = 0;
1885                     U32 base = 1;
1886                     U32 state = 1;
1887                     UV uvc = 0;
1888                     STRLEN len = 0;
1889                     STRLEN foldlen = 0;
1890                     U8 *uscan = (U8*)NULL;
1891                     U8 *leftmost = NULL;
1892 #ifdef DEBUGGING
1893                     U32 accepted_word= 0;
1894 #endif
1895                     U32 pointpos = 0;
1896
1897                     while ( state && uc <= (U8*)strend ) {
1898                         int failed=0;
1899                         U32 word = aho->states[ state ].wordnum;
1900
1901                         if( state==1 ) {
1902                             if ( bitmap ) {
1903                                 DEBUG_TRIE_EXECUTE_r(
1904                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1905                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1906                                             (char *)uc, utf8_target );
1907                                         PerlIO_printf( Perl_debug_log,
1908                                             " Scanning for legal start char...\n");
1909                                     }
1910                                 );
1911                                 if (utf8_target) {
1912                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1913                                         uc += UTF8SKIP(uc);
1914                                     }
1915                                 } else {
1916                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1917                                         uc++;
1918                                     }
1919                                 }
1920                                 s= (char *)uc;
1921                             }
1922                             if (uc >(U8*)last_start) break;
1923                         }
1924
1925                         if ( word ) {
1926                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1927                             if (!leftmost || lpos < leftmost) {
1928                                 DEBUG_r(accepted_word=word);
1929                                 leftmost= lpos;
1930                             }
1931                             if (base==0) break;
1932
1933                         }
1934                         points[pointpos++ % maxlen]= uc;
1935                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1936                                              uscan, len, uvc, charid, foldlen,
1937                                              foldbuf, uniflags);
1938                         DEBUG_TRIE_EXECUTE_r({
1939                             dump_exec_pos( (char *)uc, c, strend, real_start,
1940                                 s,   utf8_target );
1941                             PerlIO_printf(Perl_debug_log,
1942                                 " Charid:%3u CP:%4"UVxf" ",
1943                                  charid, uvc);
1944                         });
1945
1946                         do {
1947 #ifdef DEBUGGING
1948                             word = aho->states[ state ].wordnum;
1949 #endif
1950                             base = aho->states[ state ].trans.base;
1951
1952                             DEBUG_TRIE_EXECUTE_r({
1953                                 if (failed)
1954                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1955                                         s,   utf8_target );
1956                                 PerlIO_printf( Perl_debug_log,
1957                                     "%sState: %4"UVxf", word=%"UVxf,
1958                                     failed ? " Fail transition to " : "",
1959                                     (UV)state, (UV)word);
1960                             });
1961                             if ( base ) {
1962                                 U32 tmp;
1963                                 I32 offset;
1964                                 if (charid &&
1965                                      ( ((offset = base + charid
1966                                         - 1 - trie->uniquecharcount)) >= 0)
1967                                      && ((U32)offset < trie->lasttrans)
1968                                      && trie->trans[offset].check == state
1969                                      && (tmp=trie->trans[offset].next))
1970                                 {
1971                                     DEBUG_TRIE_EXECUTE_r(
1972                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1973                                     state = tmp;
1974                                     break;
1975                                 }
1976                                 else {
1977                                     DEBUG_TRIE_EXECUTE_r(
1978                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1979                                     failed = 1;
1980                                     state = aho->fail[state];
1981                                 }
1982                             }
1983                             else {
1984                                 /* we must be accepting here */
1985                                 DEBUG_TRIE_EXECUTE_r(
1986                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1987                                 failed = 1;
1988                                 break;
1989                             }
1990                         } while(state);
1991                         uc += len;
1992                         if (failed) {
1993                             if (leftmost)
1994                                 break;
1995                             if (!state) state = 1;
1996                         }
1997                     }
1998                     if ( aho->states[ state ].wordnum ) {
1999                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2000                         if (!leftmost || lpos < leftmost) {
2001                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2002                             leftmost = lpos;
2003                         }
2004                     }
2005                     if (leftmost) {
2006                         s = (char*)leftmost;
2007                         DEBUG_TRIE_EXECUTE_r({
2008                             PerlIO_printf(
2009                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2010                                 (UV)accepted_word, (IV)(s - real_start)
2011                             );
2012                         });
2013                         if (!reginfo || regtry(reginfo, &s)) {
2014                             FREETMPS;
2015                             LEAVE;
2016                             goto got_it;
2017                         }
2018                         s = HOPc(s,1);
2019                         DEBUG_TRIE_EXECUTE_r({
2020                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2021                         });
2022                     } else {
2023                         DEBUG_TRIE_EXECUTE_r(
2024                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2025                         break;
2026                     }
2027                 }
2028                 FREETMPS;
2029                 LEAVE;
2030             }
2031             break;
2032         default:
2033             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2034             break;
2035         }
2036         return 0;
2037       got_it:
2038         return s;
2039 }
2040
2041
2042 /*
2043  - regexec_flags - match a regexp against a string
2044  */
2045 I32
2046 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2047               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
     /* Try to match the compiled regexp rx against the text that starts at
      * stringarg and ends at strend.
      *
      * Returns 1 when a match is found and 0 when it is not.  On success,
      * unless REXEC_NOT_FIRST is set in flags, prog->subbeg and prog->sublen
      * are filled in (from a private copy of the string when REXEC_COPY_STR
      * is set).  Croaks when prog or the start pointer is NULL, or when the
      * REG_MAGIC check on progi->program fails.
      *
      * The function picks one search strategy, in this order: anchored
      * pattern, \G-only check, single-character "skip" scan, scan driven by a
      * required substring, scan driven by the start class, and finally a
      * plain position-by-position scan. */
     /* stringarg: position at which matching starts (copied to startpos) */
2048 /* strend: pointer to null at end of string */
2049 /* strbeg: real beginning of string */
2050 /* minend: end of match must be >=minend after stringarg. */
     /* sv: the scalar being matched; consulted for DO_UTF8(), for pos() magic,
            stored in reginfo.sv and handed to re_intuit_start(). */
2051 /* data: May be used for some additional optimizations.
2052          Currently it's only used, via PTR2UV(), as the ganch offset from
2053          strbeg when RXf_GPOS_SEEN is set and neither REXEC_IGNOREPOS nor usable pos() magic applies. This will change */
2054 /* flags: REXEC_* bits. This function tests REXEC_IGNOREPOS, REXEC_CHECKED, REXEC_NOT_FIRST and REXEC_COPY_STR, and passes flags on to re_intuit_start(). */
2055 {
2056     dVAR;
2057     struct regexp *const prog = (struct regexp *)SvANY(rx);
2058     /*register*/ char *s;
2059     register regnode *c;
2060     /*register*/ char *startpos = stringarg;
2061     I32 minlen;         /* must match at least this many chars */
2062     I32 dontbother = 0; /* how many characters not to try at end */
2063     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2064     I32 scream_pos = -1;                /* Internal iterator of scream. */
2065     char *scream_olds = NULL;
2066     const bool utf8_target = cBOOL(DO_UTF8(sv));
2067     I32 multiline;
2068     RXi_GET_DECL(prog,progi);
2069     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2070     regexp_paren_pair *swap = NULL;     /* previous prog->offs, kept so a failed match can be rolled back */
2071     GET_RE_DEBUG_FLAGS_DECL;
2072
2073     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
         /* NOTE(review): data is nevertheless read in the GPOS handling
            below, so this marker looks stale -- confirm before removing. */
2074     PERL_UNUSED_ARG(data);
2075
2076     /* Be paranoid... */
2077     if (prog == NULL || startpos == NULL) {
2078         Perl_croak(aTHX_ "NULL regexp parameter");
2079         return 0;
2080     }
2081
2082     multiline = prog->extflags & RXf_PMf_MULTILINE;
2083     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2084
2085     RX_MATCH_UTF8_set(rx, utf8_target);
2086     DEBUG_EXECUTE_r(
2087         debug_start_match(rx, utf8_target, startpos, strend,
2088         "Matching");
2089     );
2090
2091     minlen = prog->minlen;
2092
         /* Cheap rejection: fewer bytes remain than minlen (reduced by
            check_offset_min when that offset is negative). */
2093     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2094         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2095                               "String too short [regexec_flags]...\n"));
2096         goto phooey;
2097     }
2098
2099
2100     /* Check validity of program. */
2101     if (UCHARAT(progi->program) != REG_MAGIC) {
2102         Perl_croak(aTHX_ "corrupted regexp program");
2103     }
2104
         /* Reset the interpreter-global state used during a match. */
2105     PL_reg_flags = 0;
2106     PL_reg_state.re_state_eval_setup_done = FALSE;
2107     PL_reg_maxiter = 0;
2108
2109     if (RX_UTF8(rx))
2110         PL_reg_flags |= RF_utf8;
2111
2112     /* Mark beginning of line for ^ and lookbehind. */
2113     reginfo.bol = startpos; /* XXX not used ??? */
2114     PL_bostr  = strbeg;
2115     reginfo.sv = sv;
2116
2117     /* Mark end of line for $ (and such) */
2118     PL_regeol = strend;
2119
2120     /* see how far we have to get to not match where we matched before */
2121     reginfo.till = startpos+minend;
2122
2123     /* If there is a "must appear" string, look for it. */
2124     s = startpos;
2125
         /* Work out reginfo.ganch, trying in order: REXEC_IGNOREPOS, pos()
            magic on sv, the data argument, and finally strbeg. */
2126     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2127         MAGIC *mg;
2128         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2129             reginfo.ganch = startpos + prog->gofs;
2130             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2131               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2132         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2133                   && SvMAGIC(sv)
2134                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2135                   && mg->mg_len >= 0) {
2136             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2137             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2138                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2139
                 /* Pattern anchored at \G: the only candidate start is
                    ganch - gofs, and it must lie inside the string and not
                    before the requested start. */
2140             if (prog->extflags & RXf_ANCH_GPOS) {
2141                 if (s > reginfo.ganch)
2142                     goto phooey;
2143                 s = reginfo.ganch - prog->gofs;
2144                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2146                 if (s < strbeg)
2147                     goto phooey;
2148             }
2149         }
2150         else if (data) {
2151             reginfo.ganch = strbeg + PTR2UV(data);
2152             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2153                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2154
2155         } else {                                /* pos() not defined */
2156             reginfo.ganch = strbeg;
2157             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2158                  "GPOS: reginfo.ganch = strbeg\n"));
2159         }
2160     }
2161     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2162         /* We have to be careful. If the previous successful match
2163            was from this regex we don't want a subsequent partially
2164            successful match to clobber the old results.
2165            So when we detect this possibility we add a swap buffer
2166            to the re, and switch the buffer each match. If we fail
2167            we switch it back, otherwise we leave it swapped.
2168         */
2169         swap = prog->offs;
2170         /* do we need a save destructor here for eval dies? */
2171         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2172         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2173             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2174             PTR2UV(prog),
2175             PTR2UV(swap),
2176             PTR2UV(prog->offs)
2177         ));
2178     }
         /* Unless the caller set REXEC_CHECKED, let re_intuit_start() choose
            the position to start from when a check substring exists; a NULL
            result means the match cannot succeed. */
2179     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2180         re_scream_pos_data d;
2181
2182         d.scream_olds = &scream_olds;
2183         d.scream_pos = &scream_pos;
2184         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2185         if (!s) {
2186             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2187             goto phooey;        /* not present */
2188         }
2189     }
2190
2191
2192
2193     /* Simplest case:  anchored match need be tried only once. */
2194     /*  [unless only anchor is BOL and multiline is set] */
2195     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2196         if (s == startpos && regtry(&reginfo, &startpos))
2197             goto got_it;
2198         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2199                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2200         {
2201             char *end;
2202
2203             if (minlen)
2204                 dontbother = minlen - 1;
                 /* NOTE(review): end looks like the last position from which
                    minlen characters can still fit -- confirm against the
                    HOP3c definition. */
2205             end = HOP3c(strend, -dontbother, strbeg) - 1;
2206             /* for multiline we only have to try after newlines */
2207             if (prog->check_substr || prog->check_utf8) {
2208                 /* because of the goto we can not easily reuse the macros for bifurcating the
2209                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2210                 if (utf8_target) {
                         /* s == startpos was already tried (and failed)
                            above, so jump past the regtry() into the part of
                            the loop that advances s. */
2211                     if (s == startpos)
2212                         goto after_try_utf8;
2213                     while (1) {
2214                         if (regtry(&reginfo, &s)) {
2215                             goto got_it;
2216                         }
2217                       after_try_utf8:
2218                         if (s > end) {
2219                             goto phooey;
2220                         }
2221                         if (prog->extflags & RXf_USE_INTUIT) {
2222                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2223                             if (!s) {
2224                                 goto phooey;
2225                             }
2226                         }
2227                         else {
2228                             s += UTF8SKIP(s);
2229                         }
2230                     }
2231                 } /* end search for check string in unicode */
2232                 else {
                         /* Same loop as above, stepping one byte at a time. */
2233                     if (s == startpos) {
2234                         goto after_try_latin;
2235                     }
2236                     while (1) {
2237                         if (regtry(&reginfo, &s)) {
2238                             goto got_it;
2239                         }
2240                       after_try_latin:
2241                         if (s > end) {
2242                             goto phooey;
2243                         }
2244                         if (prog->extflags & RXf_USE_INTUIT) {
2245                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2246                             if (!s) {
2247                                 goto phooey;
2248                             }
2249                         }
2250                         else {
2251                             s++;
2252                         }
2253                     }
2254                 } /* end search for check string in latin*/
2255             } /* end search for check string */
2256             else { /* search for newline */
2257                 if (s > startpos) {
2258                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2259                     s--;
2260                 }
2261                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2262                 while (s <= end) { /* note it could be possible to match at the end of the string */
2263                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2264                         if (regtry(&reginfo, &s))
2265                             goto got_it;
2266                     }
2267                 }
2268             } /* end search for newline */
2269         } /* end anchored/multiline check string search */
2270         goto phooey;
2271     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2272     {
2273         /* the warning about reginfo.ganch being used without initialization
2274            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2275            and we only enter this block when the same bit is set. */
2276         char *tmp_s = reginfo.ganch - prog->gofs;
2277
             /* Exactly one candidate position; try it once and finish. */
2278         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2279             goto got_it;
2280         goto phooey;
2281     }
2282
2283     /* Messy cases:  unanchored match. */
2284     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2285         /* we have /x+whatever/ */
2286         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2287         char ch;
2288 #ifdef DEBUGGING
2289         int did_match = 0;
2290 #endif
             /* If the substring form for the target's encoding is missing,
                call the conversion helper first (presumably it fills the
                missing form in -- its body is not in this part of the
                file). */
2291         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2292             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2293         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2294
             /* After a failed try at a position holding ch, skip the rest of
                the run of ch before looking for the next candidate. */
2295         if (utf8_target) {
2296             REXEC_FBC_SCAN(
2297                 if (*s == ch) {
2298                     DEBUG_EXECUTE_r( did_match = 1 );
2299                     if (regtry(&reginfo, &s)) goto got_it;
2300                     s += UTF8SKIP(s);
2301                     while (s < strend && *s == ch)
2302                         s += UTF8SKIP(s);
2303                 }
2304             );
2305         }
2306         else {
2307             REXEC_FBC_SCAN(
2308                 if (*s == ch) {
2309                     DEBUG_EXECUTE_r( did_match = 1 );
2310                     if (regtry(&reginfo, &s)) goto got_it;
2311                     s++;
2312                     while (s < strend && *s == ch)
2313                         s++;
2314                 }
2315             );
2316         }
2317         DEBUG_EXECUTE_r(if (!did_match)
2318                 PerlIO_printf(Perl_debug_log,
2319                                   "Did not find anchored character...\n")
2320                );
2321     }
2322     else if (prog->anchored_substr != NULL
2323               || prog->anchored_utf8 != NULL
2324               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2325                   && prog->float_max_offset < strend - s)) {
             /* Scan driven by a required substring ("must"): find each
                occurrence with fbm_instr() and try only the start positions
                between back_max and back_min before it. */
2326         SV *must;
2327         I32 back_max;
2328         I32 back_min;
2329         char *last;
2330         char *last1;            /* Last position checked before */
2331 #ifdef DEBUGGING
2332         int did_match = 0;
2333 #endif
2334         if (prog->anchored_substr || prog->anchored_utf8) {
2335             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2336                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2337             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2338             back_max = back_min = prog->anchored_offset;
2339         } else {
2340             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2341                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2342             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2343             back_max = prog->float_max_offset;
2344             back_min = prog->float_min_offset;
2345         }
2346
2347
2348         if (must == &PL_sv_undef)
2349             /* could not downgrade utf8 check substring, so must fail */
2350             goto phooey;
2351
2352         if (back_min<0) {
2353             last = strend;
2354         } else {
2355             last = HOP3c(strend,        /* Cannot start after this */
2356                   -(I32)(CHR_SVLEN(must)
2357                          - (SvTAIL(must) != 0) + back_min), strbeg);
2358         }
2359         if (s > PL_bostr)
2360             last1 = HOPc(s, -1);
2361         else
2362             last1 = s - 1;      /* bogus */
2363
2364         /* XXXX check_substr already used to find "s", can optimize if
2365            check_substr==must. */
2366         scream_pos = -1;
             /* end_shift is never changed from its initial 0 in this
                function, so dontbother is 0 here. */
2367         dontbother = end_shift;
2368         strend = HOPc(strend, -dontbother);
2369         while ( (s <= last) &&
2370                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2371                                   (unsigned char*)strend, must,
2372                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2373             DEBUG_EXECUTE_r( did_match = 1 );
                 /* Start either back_max before the hit or just after the
                    last position already tried, whichever is later, so no
                    position is tried twice. */
2374             if (HOPc(s, -back_max) > last1) {
2375                 last1 = HOPc(s, -back_min);
2376                 s = HOPc(s, -back_max);
2377             }
2378             else {
2379                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2380
2381                 last1 = HOPc(s, -back_min);
2382                 s = t;
2383             }
2384             if (utf8_target) {
2385                 while (s <= last1) {
2386                     if (regtry(&reginfo, &s))
2387                         goto got_it;
2388                     s += UTF8SKIP(s);
2389                 }
2390             }
2391             else {
2392                 while (s <= last1) {
2393                     if (regtry(&reginfo, &s))
2394                         goto got_it;
2395                     s++;
2396                 }
2397             }
2398         }
2399         DEBUG_EXECUTE_r(if (!did_match) {
2400             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2401                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2402             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2403                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2404                                ? "anchored" : "floating"),
2405                 quoted, RE_SV_TAIL(must));
2406         });
2407         goto phooey;
2408     }
2409     else if ( (c = progi->regstclass) ) {
             /* Scan driven by the start class: find_byclass() does the
                searching and calls regtry() itself via reginfo. */
2410         if (minlen) {
2411             const OPCODE op = OP(progi->regstclass);
2412             /* don't bother with what can't match */
2413             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2414                 strend = HOPc(strend, -(minlen - 1));
2415         }
2416         DEBUG_EXECUTE_r({
2417             SV * const prop = sv_newmortal();
2418             regprop(prog, prop, c);
2419             {
2420                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2421                     s,strend-s,60);
2422                 PerlIO_printf(Perl_debug_log,
2423                     "Matching stclass %.*s against %s (%d bytes)\n",
2424                     (int)SvCUR(prop), SvPVX_const(prop),
2425                      quoted, (int)(strend - s));
2426             }
2427         });
2428         if (find_byclass(prog, c, s, strend, &reginfo))
2429             goto got_it;
2430         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2431     }
2432     else {
2433         dontbother = 0;
2434         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2435             /* Trim the end. */
2436             char *last= NULL;
2437             SV* float_real;
2438             STRLEN len;
2439             const char *little;
2440
2441             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2442                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2443             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2444
2445             little = SvPV_const(float_real, len);
2446             if (SvTAIL(float_real)) {
2447                     /* This means that float_real contains an artificial \n on the end
2448                      * due to the presence of something like this: /foo$/
2449                      * where we can match both "foo" and "foo\n" at the end of the string.
2450                      * So we have to compare the end of the string first against the float_real
2451                      * without the \n and then against the full float_real with the string.
2452                      * We have to watch out for cases where the string might be smaller
2453                      * than the float_real or the float_real without the \n.
2454                      */
2455                     char *checkpos= strend - len;
2456                     DEBUG_OPTIMISE_r(
2457                         PerlIO_printf(Perl_debug_log,
2458                             "%sChecking for float_real.%s\n",
2459                             PL_colors[4], PL_colors[5]));
2460                     if (checkpos + 1 < strbeg) {
2461                         /* can't match, even if we remove the trailing \n string is too short to match */
2462                         DEBUG_EXECUTE_r(
2463                             PerlIO_printf(Perl_debug_log,
2464                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2465                                 PL_colors[4], PL_colors[5]));
2466                         goto phooey;
2467                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2468                         /* can match, the end of the string matches without the "\n" */
2469                         last = checkpos + 1;
2470                     } else if (checkpos < strbeg) {
2471                         /* can't match, string is too short when the "\n" is included */
2472                         DEBUG_EXECUTE_r(
2473                             PerlIO_printf(Perl_debug_log,
2474                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2475                                 PL_colors[4], PL_colors[5]));
2476                         goto phooey;
2477                     } else if (!multiline) {
2478                         /* non multiline match, so compare with the "\n" at the end of the string */
2479                         if (memEQ(checkpos, little, len)) {
2480                             last= checkpos;
2481                         } else {
2482                             DEBUG_EXECUTE_r(
2483                                 PerlIO_printf(Perl_debug_log,
2484                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2485                                     PL_colors[4], PL_colors[5]));
2486                             goto phooey;
2487                         }
2488                     } else {
2489                         /* multiline match, so we have to search for a place where the full string is located */
2490                         goto find_last;
2491                     }
2492             } else {
2493                   find_last:
                         /* Locate the last occurrence of the floating
                            substring in s..strend; an empty substring is
                            treated as matching at strend. */
2494                     if (len)
2495                         last = rninstr(s, strend, little, little + len);
2496                     else
2497                         last = strend;  /* matching "$" */
2498             }
2499             if (!last) {
2500                 /* at one point this block contained a comment which was probably
2501                  * incorrect, which said that this was a "should not happen" case.
2502                  * Even if it was true when it was written I am pretty sure it is
2503                  * not anymore, so I have removed the comment and replaced it with
2504                  * this one. Yves */
2505                 DEBUG_EXECUTE_r(
2506                     PerlIO_printf(Perl_debug_log,
2507                         "String does not contain required substring, cannot match.\n"
2508                     ));
2509                 goto phooey;
2510             }
2511             dontbother = strend - last + prog->float_min_offset;
2512         }
2513         if (minlen && (dontbother < minlen))
2514             dontbother = minlen - 1;
2515         strend -= dontbother;              /* this one's always in bytes! */
2516         /* We don't know much -- general case. */
             /* Both loops try every position from s up to and including the
                trimmed strend. */
2517         if (utf8_target) {
2518             for (;;) {
2519                 if (regtry(&reginfo, &s))
2520                     goto got_it;
2521                 if (s >= strend)
2522                     break;
2523                 s += UTF8SKIP(s);
2524             };
2525         }
2526         else {
2527             do {
2528                 if (regtry(&reginfo, &s))
2529                     goto got_it;
2530             } while (s++ < strend);
2531         }
2532     }
2533
2534     /* Failure. */
2535     goto phooey;
2536
     /* Success path: keep the freshly filled prog->offs and release the
        buffer that was set aside in swap (if any). */
2537 got_it:
2538     DEBUG_BUFFERS_r(
2539         if (swap)
2540             PerlIO_printf(Perl_debug_log,
2541                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2542                 PTR2UV(prog),
2543                 PTR2UV(swap)
2544             );
2545     );
2546     Safefree(swap);
2547     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2548
2549     if (PL_reg_state.re_state_eval_setup_done)
2550         restore_pos(aTHX_ prog);
         /* Restart iteration over the named-capture hash, when there is one. */
2551     if (RXp_PAREN_NAMES(prog))
2552         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2553
2554     /* make sure $`, $&, $', and $digit will work later */
2555     if ( !(flags & REXEC_NOT_FIRST) ) {
2556         RX_MATCH_COPY_FREE(rx);
2557         if (flags & REXEC_COPY_STR) {
                 /* NOTE(review): i is (PL_regeol - startpos) +
                    (stringarg - strbeg), which equals PL_regeol - strbeg
                    only while startpos still equals stringarg -- confirm
                    that regtry() never moves *startpos. */
2558             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2559 #ifdef PERL_OLD_COPY_ON_WRITE
2560             if ((SvIsCOW(sv)
2561                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2562                 if (DEBUG_C_TEST) {
2563                     PerlIO_printf(Perl_debug_log,
2564                                   "Copy on write: regexp capture, type %d\n",
2565                                   (int) SvTYPE(sv));
2566                 }
2567                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2568                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2569                 assert (SvPOKp(prog->saved_copy));
2570             } else
2571 #endif
2572             {
                     /* s is no longer needed as a scan pointer and is reused
                        to hold the private copy of the string. */
2573                 RX_MATCH_COPIED_on(rx);
2574                 s = savepvn(strbeg, i);
2575                 prog->subbeg = s;
2576             }
2577             prog->sublen = i;
2578         }
2579         else {
2580             prog->subbeg = strbeg;
2581             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2582         }
2583     }
2584
2585     return 1;
2586
     /* Failure path: undo the eval-related setup and put back the offsets
        buffer of the previous successful match. */
2587 phooey:
2588     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2589                           PL_colors[4], PL_colors[5]));
2590     if (PL_reg_state.re_state_eval_setup_done)
2591         restore_pos(aTHX_ prog);
2592     if (swap) {
2593         /* we failed :-( roll it back */
2594         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2595             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2596             PTR2UV(prog),
2597             PTR2UV(prog->offs),
2598             PTR2UV(swap)
2599         ));
2600         Safefree(prog->offs);
2601         prog->offs = swap;
2602     }
2603
2604     return 0;
2605 }
2606
2607
2608 /* Make PL_reg_curpm refer to the regexp Re2, handling ref counting.
2609  * Do inc before dec, in case old and new rex are the same.
      *
      * Does nothing unless PL_reg_state.re_state_eval_setup_done is set.
      * Re2 is expanded twice (once for the increment, once for PM_SETRE),
      * so pass an expression without side effects.
      * The expansion is a bare if-block, not a do { } while (0) wrapper:
      * do not use it as the unbraced body of an if that has an else. */
2610 #define SET_reg_curpm(Re2) \
2611     if (PL_reg_state.re_state_eval_setup_done) {    \
2612         (void)ReREFCNT_inc(Re2);                    \
2613         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2614         PM_SETRE((PL_reg_curpm), (Re2));            \
2615     }
2616
2617
2618 /*
2619  - regtry - try match at specific point
      -
      - Runs regmatch() on reginfo->prog with the match starting at *startpos.
      -
      - For a pattern flagged RXf_EVAL_SEEN, and only while
      - PL_reg_state.re_state_eval_setup_done is still false, a one-off setup
      - is done first: reginfo->sv (if any) becomes DEFSV, its
      - PERL_MAGIC_regex_global magic is found or created, PL_curpm is
      - switched to PL_reg_curpm, and prog->subbeg/sublen are set to span
      - PL_bostr .. PL_regeol.
      -
      - Returns 1 on success, having stored the match end in
      - prog->offs[0].end.  Returns 0 on failure, after REGCP_UNWIND(lastcp);
      - in that case *startpos is overwritten with reginfo->cutpoint if
      - regmatch() left one set.
2620  */
2621 STATIC I32                      /* 0 failure, 1 success */
2622 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2623 {
2624     dVAR;
2625     CHECKPOINT lastcp;
2626     REGEXP *const rx = reginfo->prog;
2627     regexp *const prog = (struct regexp *)SvANY(rx);
2628     RXi_GET_DECL(prog,progi);
2629     GET_RE_DEBUG_FLAGS_DECL;
2630
2631     PERL_ARGS_ASSERT_REGTRY;
2632
         /* start every attempt with no cut point recorded */
2633     reginfo->cutpoint=NULL;
2634
         /* one-off setup, skipped once re_state_eval_setup_done is set */
2635     if ((prog->extflags & RXf_EVAL_SEEN)
2636         && !PL_reg_state.re_state_eval_setup_done)
2637     {
2638         MAGIC *mg;
2639
2640         PL_reg_state.re_state_eval_setup_done = TRUE;
2641         if (reginfo->sv) {
2642             /* Make $_ available to executed code. */
2643             if (reginfo->sv != DEFSV) {
2644                 SAVE_DEFSV;
2645                 DEFSV_set(reginfo->sv);
2646             }
2647
                 /* reuse the SV's regex_global magic when it already has
                  * one (mg is assigned inside the condition); otherwise
                  * attach a new one with mg_len set to -1 */
2648             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2649                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2650                 /* prepare for quick setting of pos */
2651 #ifdef PERL_OLD_COPY_ON_WRITE
2652                 if (SvIsCOW(reginfo->sv))
2653                     sv_force_normal_flags(reginfo->sv, 0);
2654 #endif
2655                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2656                                  &PL_vtbl_mglob, NULL, 0);
2657                 mg->mg_len = -1;
2658             }
                 /* remember the magic and its current mg_len, and register
                  * restore_pos (with prog as its argument) on the save stack */
2659             PL_reg_magic    = mg;
2660             PL_reg_oldpos   = mg->mg_len;
2661             SAVEDESTRUCTOR_X(restore_pos, prog);
2662         }
             /* allocate the PMOP held in PL_reg_curpm the first time round */
2663         if (!PL_reg_curpm) {
2664             Newxz(PL_reg_curpm, 1, PMOP);
2665 #ifdef USE_ITHREADS
2666             {
2667                 SV* const repointer = &PL_sv_undef;
2668                 /* this regexp is also owned by the new PL_reg_curpm, which
2669                    will try to free it.  */
2670                 av_push(PL_regex_padav, repointer);
2671                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2672                 PL_regex_pad = AvARRAY(PL_regex_padav);
2673             }
2674 #endif
2675         }
             /* attach rx to PL_reg_curpm, then make that PMOP the current
              * one, keeping the previous PL_curpm in PL_reg_oldcurpm */
2676         SET_reg_curpm(rx);
2677         PL_reg_oldcurpm = PL_curpm;
2678         PL_curpm = PL_reg_curpm;
2679         if (RXp_MATCH_COPIED(prog)) {
2680             /*  Here is a serious problem: we cannot rewrite subbeg,
2681                 since it may be needed if this match fails.  Thus
2682                 $` inside (?{}) could fail... */
                 /* stash the existing copy in PL_reg_oldsaved/_len and clear
                  * the "copied" flag on prog */
2683             PL_reg_oldsaved = prog->subbeg;
2684             PL_reg_oldsavedlen = prog->sublen;
2685 #ifdef PERL_OLD_COPY_ON_WRITE
2686             PL_nrs = prog->saved_copy;
2687 #endif
2688             RXp_MATCH_COPIED_off(prog);
2689         }
2690         else
2691             PL_reg_oldsaved = NULL;
2692         prog->subbeg = PL_bostr;
2693         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2694     }
2695 #ifdef DEBUGGING
2696     PL_reg_starttry = *startpos;
2697 #endif
         /* per-attempt state: match start is stored as an offset from
          * PL_bostr, and the paren bookkeeping is reset */
2698     prog->offs[0].start = *startpos - PL_bostr;
2699     PL_reginput = *startpos;
2700     prog->lastparen = 0;
2701     prog->lastcloseparen = 0;
2702     PL_regsize = 0;
2703
2704     /* XXXX What this code is doing here?!!!  There should be no need
2705        to do this again and again, prog->lastparen should take care of
2706        this!  --ilya*/
2707
2708     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2709      * Actually, the code in regcppop() (which Ilya may be meaning by
2710      * prog->lastparen), is not needed at all by the test suite
2711      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2712      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2713      * Meanwhile, this code *is* needed for the
2714      * above-mentioned test suite tests to succeed.  The common theme
2715      * on those tests seems to be returning null fields from matches.
2716      * --jhi updated by dapm */
2717 #if 1
2718     if (prog->nparens) {
             /* lastparen was zeroed just above, so the loop body runs
              * nparens times; pp is pre-incremented, hence offs[1] up to
              * offs[nparens] get start = end = -1 and offs[0] is untouched */
2719         regexp_paren_pair *pp = prog->offs;
2720         register I32 i;
2721         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2722             ++pp;
2723             pp->start = -1;
2724             pp->end = -1;
2725         }
2726     }
2727 #endif
         /* record a checkpoint, then run the matcher from program + 1 */
2728     REGCP_SET(lastcp);
2729     if (regmatch(reginfo, progi->program + 1)) {
2730         prog->offs[0].end = PL_reginput - PL_bostr;
2731         return 1;
2732     }
         /* failure: hand any cut point back to the caller through *startpos */
2733     if (reginfo->cutpoint)
2734         *startpos= reginfo->cutpoint;
2735     REGCP_UNWIND(lastcp);
2736     return 0;
2737 }
2738
2739
     /* Shorthand jumps to the labels yes, no and no_silent; those labels must
        exist in whichever function uses these macros. */
2740 #define sayYES goto yes
2741 #define sayNO goto no
2742 #define sayNO_SILENT goto no_silent
2743
2744 /* we don't use STMT_START/END here because it leads to
2745    "unreachable code" warnings, which are bogus, but distracting.
        If ST.cache_mask is non-zero it is OR-ed into
        PL_reg_poscache[ST.cache_offset]; sayNO then follows either way.
        Because the expansion is two statements, only the cache update would
        be governed by an enclosing unbraced if: use it inside braces. */
2746 #define CACHEsayNO \
2747     if (ST.cache_mask) \
2748        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2749     sayNO
2750
2751 /* this is used to determine how far from the left messages like
2752    'failed...' are printed. It should be set such that messages
2753    are inline with the regop output that created them.
2754 */
2755 #define REPORT_CODE_OFF 32
2756
2757
2758 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2759 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2760
     /* address of the first / last entry of the states[] array in slab s */
2761 #define SLAB_FIRST(s) (&(s)->states[0])
2762 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2763
2764 /* Step to the next slab (creating it if needed); return its first slot. */
2765 STATIC regmatch_state *
2766 S_push_slab(pTHX)
2767 {
2768 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2769     dMY_CXT;
2770 #endif
2771     regmatch_slab * const cur = PL_regmatch_slab;
2772     regmatch_slab *fresh = cur->next;
2773     if (fresh == NULL) {        /* nothing chained after cur: extend */
2774         Newx(fresh, 1, regmatch_slab);
2775         fresh->next = NULL;
2776         fresh->prev = cur;
2777         cur->next = fresh;
2778     }
2779     PL_regmatch_slab = fresh;   /* the following slab is now current */
2780     return SLAB_FIRST(fresh);
2781 }
2782
2783
2784 /* push a new state then goto it
        Sets scan to node and st->resume_state to state, then jumps to the
        push_state label.  The expansion is three bare statements ending in
        its own ';', and it relies on scan, st and the label being available
        in the expanding function; use it only as a full statement inside
        braces. */
2785
2786 #define PUSH_STATE_GOTO(state, node) \
2787     scan = node; \
2788     st->resume_state = state; \
2789     goto push_state;
2790
2791 /* push a new state with success backtracking, then goto it
        Same as PUSH_STATE_GOTO except that it jumps to the push_yes_state
        label instead. */
2792
2793 #define PUSH_YES_STATE_GOTO(state, node) \
2794     scan = node; \
2795     st->resume_state = state; \
2796     goto push_yes_state;
2797
2798
2799
2800 /*
2801
2802 regmatch() - main matching routine
2803
2804 This is basically one big switch statement in a loop. We execute an op,
2805 set 'next' to point to the next op, and continue. If we come to a point which
2806 we may need to backtrack to on failure such as (A|B|C), we push a
2807 backtrack state onto the backtrack stack. On failure, we pop the top
2808 state, and re-enter the loop at the state indicated. If there are no more
2809 states to pop, we return failure.
2810
2811 Sometimes we also need to backtrack on success; for example /A+/, where
2812 after successfully matching one A, we need to go back and try to
2813 match another one; similarly for lookahead assertions: if the assertion
2814 completes successfully, we backtrack to the state just before the assertion
2815 and then carry on.  In these cases, the pushed state is marked as
2816 'backtrack on success too'. This marking is in fact done by a chain of
2817 pointers, each pointing to the previous 'yes' state. On success, we pop to
2818 the nearest yes state, discarding any intermediate failure-only states.
2819 Sometimes a yes state is pushed just to force some cleanup code to be
2820 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2821 it to free the inner regex.
2822
2823 Note that failure backtracking rewinds the cursor position, while
2824 success backtracking leaves it alone.
2825
2826 A pattern is complete when the END op is executed, while a subpattern
2827 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2828 ops trigger the "pop to last yes state if any, otherwise return true"
2829 behaviour.
2830
2831 A common convention in this function is to use A and B to refer to the two
2832 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2833 the subpattern to be matched possibly multiple times, while B is the entire
2834 rest of the pattern. Variable and state names reflect this convention.
2835
2836 The states in the main switch are the union of ops and failure/success of
2837 substates associated with that op.  For example, IFMATCH is the op
2838 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2839 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2840 successfully matched A and IFMATCH_A_fail is a state saying that we have
2841 just failed to match A. Resume states always come in pairs. The backtrack
2842 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2843 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2844 on success or failure.
2845
2846 The struct that holds a backtracking state is actually a big union, with
2847 one variant for each major type of op. The variable st points to the
2848 top-most backtrack struct. To make the code clearer, within each
2849 block of code we #define ST to alias the relevant union.
2850
2851 Here's a concrete example of a (vastly oversimplified) IFMATCH
2852 implementation:
2853
2854     switch (state) {
2855     ....
2856
2857 #define ST st->u.ifmatch
2858
2859     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2860         ST.foo = ...; // some state we wish to save
2861         ...
2862         // push a yes backtrack state with a resume value of
2863         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2864         // first node of A:
2865         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2866         // NOTREACHED
2867
2868     case IFMATCH_A: // we have successfully executed A; now continue with B
2869         next = B;
2870         bar = ST.foo; // do something with the preserved value
2871         break;
2872
2873     case IFMATCH_A_fail: // A failed, so the assertion failed
2874         ...;   // do some housekeeping, then ...
2875         sayNO; // propagate the failure
2876
2877 #undef ST
2878
2879     ...
2880     }
2881
2882 For any old-timers reading this who are familiar with the old recursive
2883 approach, the code above is equivalent to:
2884
2885     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2886     {
2887         int foo = ...
2888         ...
2889         if (regmatch(A)) {
2890             next = B;
2891             bar = foo;
2892             break;
2893         }
2894         ...;   // do some housekeeping, then ...
2895         sayNO; // propagate the failure
2896     }
2897
2898 The topmost backtrack state, pointed to by st, is usually free. If you
2899 want to claim it, populate any ST.foo fields in it with values you wish to
2900 save, then do one of
2901
2902         PUSH_STATE_GOTO(resume_state, node);
2903         PUSH_YES_STATE_GOTO(resume_state, node);
2904
2905 which sets that backtrack state's resume value to 'resume_state', pushes a
2906 new free entry to the top of the backtrack stack, then goes to 'node'.
2907 On backtracking, the free slot is popped, and the saved state becomes the
2908 new free state. An ST.foo field in this new top state can be temporarily
2909 accessed to retrieve values, but once the main loop is re-entered, it
2910 becomes available for reuse.
2911
2912 Note that the depth of the backtrack stack constantly increases during the
2913 left-to-right execution of the pattern, rather than going up and down with
2914 the pattern nesting. For example the stack is at its maximum at Z at the
2915 end of the pattern, rather than at X in the following:
2916
2917     /(((X)+)+)+....(Y)+....Z/
2918
2919 The only exceptions to this are lookahead/behind assertions and the cut,
2920 (?>A), which pop all the backtrack states associated with A before
2921 continuing.
2922
2923 Backtrack state structs are allocated in slabs of about 4K in size.
2924 PL_regmatch_state and st always point to the currently active state,
2925 and PL_regmatch_slab points to the slab currently containing
2926 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2927 allocated, and is never freed until interpreter destruction. When the slab
2928 is full, a new one is allocated and chained to the end. At exit from
2929 regmatch(), slabs allocated since entry are freed.
2930
2931 */
2932
2933
     /* Debug helper, active only through DEBUG_STATE_r: dumps the current
        execution position, then prints one line made of the prefix pp, the
        PL_reg_name[] entry for st->resume_state, and a bracketed tag holding
        "Y" when st is yes_state and/or "M" when st is mark_state.
        pp is pasted between two string literals, so it must itself be a
        string literal.  Uses locinput, scan, utf8_target, depth, st,
        yes_state and mark_state from the expanding scope. */
2934 #define DEBUG_STATE_pp(pp)                                  \
2935     DEBUG_STATE_r({                                         \
2936         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2937         PerlIO_printf(Perl_debug_log,                       \
2938             "    %*s"pp" %s%s%s%s%s\n",                     \
2939             depth*2, "",                                    \
2940             PL_reg_name[st->resume_state],                     \
2941             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2942             ((st==yes_state) ? "Y" : ""),                   \
2943             ((st==mark_state) ? "M" : ""),                  \
2944             ((st==yes_state||st==mark_state) ? "]" : "")    \
2945         );                                                  \
2946     });
2947
2948
     /* offset of node x from 'prog' (which must be in scope where this is
        used), as an int; -1 when x is NULL */
2949 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2950
2951 #ifdef DEBUGGING
2952
2953 /* Debug aid: print "<blurb> REx <pattern> against <text>", followed by a
2954  * line saying which of the pattern and the target text are UTF-8. */
2955 STATIC void
2956 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2957     const char *start, const char *end, const char *blurb)
2958 {
2959     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2960
2961     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2962
2963     if (!PL_colorset) {
2964         reginitcolors();
2965     }
2966     {
2967         /* display forms of the pattern and of the text being matched */
2968         RE_PV_QUOTED_DECL(quoted_pat, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2969             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2970         RE_PV_QUOTED_DECL(quoted_str, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2971             start, end - start, 60);
2972         PerlIO_printf(Perl_debug_log, "%s%s REx%s %s against %s\n",
2973             PL_colors[4], blurb, PL_colors[5], quoted_pat, quoted_str);
2974         if (!utf8_pat && !utf8_target)
2975             return;         /* neither side is UTF-8: nothing to add */
2976         PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2977             utf8_pat ? "pattern" : "",
2978             (utf8_pat && utf8_target) ? " and " : "",
2979             utf8_target ? "string" : "");
2980     }
2981 }
2982
     /* Debug aid: print where the matcher currently is in the target string.
      *
      * Output is the offset of locinput from loc_bostr, then, between '<' and
      * '>', up to three pieces of text: the part of the shown prefix that lies
      * before loc_reg_starttry, the part between loc_reg_starttry and
      * locinput, and the text from locinput towards loc_regeol.  When no
      * colours are configured the literal "> <" is printed at the current
      * position.  The line is then padded so the three pieces occupy at
      * least 19 columns, and finished with '|' (no newline).
      *
      * scan is only consulted for its opcode: when it is CANY the text is
      * not treated as UTF-8 for display even if utf8_target is true. */
2983 STATIC void
2984 S_dump_exec_pos(pTHX_ const char *locinput,
2985                       const regnode *scan,
2986                       const char *loc_regeol,
2987                       const char *loc_bostr,
2988                       const char *loc_reg_starttry,
2989                       const bool utf8_target)
2990 {
2991     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2992     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
         /* l: bytes budgeted after locinput, min(taill, bytes remaining) */
2993     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2994     /* The part of the string before starttry has one color
2995        (pref0_len chars), between starttry and current
2996        position another one (pref_len - pref0_len chars),
2997        after the current position the third one.
2998        We assume that pref0_len <= pref_len, otherwise we
2999        decrease pref0_len.  */
         /* pref_len: bytes shown before locinput; takes whatever part of the
            (5 + taill) budget l did not use, capped at the distance back to
            loc_bostr */
3000     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3001         ? (5 + taill) - l : locinput - loc_bostr;
3002     int pref0_len;
3003
3004     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3005
         /* lengthen the prefix while its first byte is a UTF-8 continuation
            byte, so that it does not start inside a character.
            NOTE(review): nothing here stops this scan at loc_bostr; it
            presumably relies on the target being well-formed UTF-8 - confirm */
3006     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3007         pref_len++;
         /* number of prefix bytes that precede loc_reg_starttry (may come out
            negative or too large; clamped below) */
3008     pref0_len = pref_len  - (locinput - loc_reg_starttry);
         /* if the prefix left room in the budget and more text remains, let
            the tail grow into it */
3009     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3010         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3011               ? (5 + taill) - pref_len : loc_regeol - locinput);
         /* shorten the tail while the byte just past it is a continuation
            byte, so that it does not end inside a character */
3012     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3013         l--;
3014     if (pref0_len < 0)
3015         pref0_len = 0;
3016     if (pref0_len > pref_len)
3017         pref0_len = pref_len;
3018     {
3019         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3020
             /* s0: prefix bytes before starttry (colour arguments 4, 5) */
3021         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3022             (locinput - pref_len),pref0_len, 60, 4, 5);
3023
             /* s1: from starttry up to locinput (colour arguments 2, 3) */
3024         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3025                     (locinput - pref_len + pref0_len),
3026                     pref_len - pref0_len, 60, 2, 3);
3027
             /* s2: from locinput on (colour arguments 0, 1).  Note that the
                whole remainder is passed with 10 as the following argument;
                l itself is not passed here */
3028         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3029                     locinput, loc_regeol - locinput, 10, 0, 1);
3030
3031         const STRLEN tlen=len0+len1+len2;
             /* the trailing "%*s" with an empty string pads the three pieces
                out to 19 columns when they are shorter than that */
3032         PerlIO_printf(Perl_debug_log,
3033                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3034                     (IV)(locinput - loc_bostr),
3035                     len0, s0,
3036                     len1, s1,
3037                     (docolor ? "" : "> <"),
3038                     len2, s2,
3039                     (int)(tlen > 19 ? 0 :  19 - tlen),
3040                     "");
3041     }
3042 }
3043
3044 #endif
3045
3046 /* reg_check_named_buff_matched()
3047  * Given a node whose ARG() selects a slot in rexi->data->data, treat
3048  * the SV stored there as a list of capture buffer numbers: its PV area
3049  * is read as an array of I32 and its IV as the number of entries.  The
3050  * buffers' names play no part here.
3051  * Returns the first listed buffer number that is no greater than
3052  * rex->lastparen and whose recorded end offset is not -1, or 0 if none
3053  * of the listed buffers qualifies.
3054  */
3055 STATIC I32
3056 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3057 {
3058     RXi_GET_DECL(rex,rexi);
3059     SV *buf_list = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3060     I32 *buf_nums = (I32*)SvPVX(buf_list);
3061     I32 idx;
3062
3063     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3064
3065     for (idx = 0; idx < SvIVX(buf_list); idx++) {
3066         const I32 paren = buf_nums[idx];
3067         if (paren > (I32)rex->lastparen)
3068             continue;               /* numbered beyond lastparen: skip */
3069         if (rex->offs[paren].end != -1)
3070             return paren;
3071     }
3072     return 0;
3073 }
3074
3075
3076 /* Release every slab chained after PL_regmatch_slab and cut the chain
3077  * there - called during LEAVE_SCOPE; the argument p is not used. */
3078 STATIC void
3079 S_clear_backtrack_stack(pTHX_ void *p)
3080 {
3081     regmatch_slab *doomed = PL_regmatch_slab->next;
3082     regmatch_slab *after;
3083     PERL_UNUSED_ARG(p);
3084
3085     if (doomed == NULL)
3086         return;                 /* chain already ends at the current slab */
3087     PL_regmatch_slab->next = NULL;
3088     for (; doomed != NULL; doomed = after) {
3089         after = doomed->next;   /* fetch the link before the slab is freed */
3090         Safefree(doomed);
3091     }
3092 }
3093
3094
3095 STATIC I32                      /* 0 failure, 1 success */
3096 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3097 {
3098 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3099     dMY_CXT;
3100 #endif
3101     dVAR;
3102     register const bool utf8_target = PL_reg_match_utf8;
3103     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3104     REGEXP *rex_sv = reginfo->prog;
3105     regexp *rex = (struct regexp *)SvANY(rex_sv);
3106     RXi_GET_DECL(rex,rexi);
3107     I32 oldsave;
3108     /* the current state. This is a cached copy of PL_regmatch_state */
3109     register regmatch_state *st;
3110     /* cache heavy used fields of st in registers */
3111     register regnode *scan;
3112     register regnode *next;
3113     register U32 n = 0; /* general value; init to avoid compiler warning */
3114     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3115     register char *locinput = PL_reginput;
3116     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3117
3118     bool result = 0;        /* return value of S_regmatch */
3119     int depth = 0;          /* depth of backtrack stack */
3120     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3121     const U32 max_nochange_depth =
3122         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3123         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3124     regmatch_state *yes_state = NULL; /* state to pop to on success of
3125                                                             subpattern */
3126     /* mark_state piggy backs on the yes_state logic so that when we unwind
3127        the stack on success we can update the mark_state as we go */
3128     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3129     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3130     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3131     U32 state_num;
3132     bool no_final = 0;      /* prevent failure from backtracking? */
3133     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3134     char *startpoint = PL_reginput;
3135     SV *popmark = NULL;     /* are we looking for a mark? */
3136     SV *sv_commit = NULL;   /* last mark name seen in failure */
3137     SV *sv_yes_mark = NULL; /* last mark name we have seen
3138                                during a successful match */
3139     U32 lastopen = 0;       /* last open we saw */
3140     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3141     SV* const oreplsv = GvSV(PL_replgv);
3142     /* these three flags are set by various ops to signal information to
3143      * the very next op. They have a useful lifetime of exactly one loop
3144      * iteration, and are not preserved or restored by state pushes/pops
3145      */
3146     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3147     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3148     int logical = 0;        /* the following EVAL is:
3149                                 0: (?{...})
3150                                 1: (?(?{...})X|Y)
3151                                 2: (??{...})
3152                                or the following IFMATCH/UNLESSM is:
3153                                 false: plain (?=foo)
3154                                 true:  used as a condition: (?(?=foo))
3155                             */
3156     PAD* last_pad = NULL;
3157     dMULTICALL;
3158     I32 gimme = G_SCALAR;
3159     CV *caller_cv = NULL;       /* who called us */
3160     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3161     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3162
3163 #ifdef DEBUGGING
3164     GET_RE_DEBUG_FLAGS_DECL;
3165 #endif
3166
3167     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3168     multicall_oldcatch = 0;
3169     multicall_cv = NULL;
3170     cx = NULL;
3171     PERL_UNUSED_VAR(multicall_cop);
3172     PERL_UNUSED_VAR(newsp);
3173
3174
3175     PERL_ARGS_ASSERT_REGMATCH;
3176
3177     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3178             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3179     }));
3180     /* on first ever call to regmatch, allocate first slab */
3181     if (!PL_regmatch_slab) {
3182         Newx(PL_regmatch_slab, 1, regmatch_slab);
3183         PL_regmatch_slab->prev = NULL;
3184         PL_regmatch_slab->next = NULL;
3185         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3186     }
3187
3188     oldsave = PL_savestack_ix;
3189     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3190     SAVEVPTR(PL_regmatch_slab);
3191     SAVEVPTR(PL_regmatch_state);
3192
3193     /* grab next free state slot */
3194     st = ++PL_regmatch_state;
3195     if (st >  SLAB_LAST(PL_regmatch_slab))
3196         st = PL_regmatch_state = S_push_slab(aTHX);
3197
3198     /* Note that nextchr is a byte even in UTF */
3199     nextchr = UCHARAT(locinput);
3200     scan = prog;
3201     while (scan != NULL) {
3202
3203         DEBUG_EXECUTE_r( {
3204             SV * const prop = sv_newmortal();
3205             regnode *rnext=regnext(scan);
3206             DUMP_EXEC_POS( locinput, scan, utf8_target );
3207             regprop(rex, prop, scan);
3208
3209             PerlIO_printf(Perl_debug_log,
3210                     "%3"IVdf":%*s%s(%"IVdf")\n",
3211                     (IV)(scan - rexi->program), depth*2, "",
3212                     SvPVX_const(prop),
3213                     (PL_regkind[OP(scan)] == END || !rnext) ?
3214                         0 : (IV)(rnext - rexi->program));
3215         });
3216
3217         next = scan + NEXT_OFF(scan);
3218         if (next == scan)
3219             next = NULL;
3220         state_num = OP(scan);
3221
3222       reenter_switch:
3223
3224         switch (state_num) {
3225         case BOL:
3226             if (locinput == PL_bostr)
3227             {
3228                 /* reginfo->till = reginfo->bol; */
3229                 break;
3230             }
3231             sayNO;
3232         case MBOL:
3233             if (locinput == PL_bostr ||
3234                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3235             {
3236                 break;
3237             }
3238             sayNO;
3239         case SBOL:
3240             if (locinput == PL_bostr)
3241                 break;
3242             sayNO;
3243         case GPOS:
3244             if (locinput == reginfo->ganch)
3245                 break;
3246             sayNO;
3247
3248         case KEEPS:
3249             /* update the startpoint */
3250             st->u.keeper.val = rex->offs[0].start;
3251             PL_reginput = locinput;
3252             rex->offs[0].start = locinput - PL_bostr;
3253             PUSH_STATE_GOTO(KEEPS_next, next);
3254             /*NOT-REACHED*/
3255         case KEEPS_next_fail:
3256             /* rollback the start point change */
3257             rex->offs[0].start = st->u.keeper.val;
3258             sayNO_SILENT;
3259             /*NOT-REACHED*/
3260         case EOL:
3261                 goto seol;
3262         case MEOL:
3263             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3264                 sayNO;
3265             break;
3266         case SEOL:
3267           seol:
3268             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3269                 sayNO;
3270             if (PL_regeol - locinput > 1)
3271                 sayNO;
3272             break;
3273         case EOS:
3274             if (PL_regeol != locinput)
3275                 sayNO;
3276             break;
3277         case SANY:
3278             if (!nextchr && locinput >= PL_regeol)
3279                 sayNO;
3280             if (utf8_target) {
3281                 locinput += PL_utf8skip[nextchr];
3282                 if (locinput > PL_regeol)
3283                     sayNO;
3284                 nextchr = UCHARAT(locinput);
3285             }
3286             else
3287                 nextchr = UCHARAT(++locinput);
3288             break;
3289         case CANY:
3290             if (!nextchr && locinput >= PL_regeol)
3291                 sayNO;
3292             nextchr = UCHARAT(++locinput);
3293             break;
3294         case REG_ANY:
3295             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3296                 sayNO;
3297             if (utf8_target) {
3298                 locinput += PL_utf8skip[nextchr];
3299                 if (locinput > PL_regeol)
3300                     sayNO;
3301                 nextchr = UCHARAT(locinput);
3302             }
3303             else
3304                 nextchr = UCHARAT(++locinput);
3305             break;
3306
3307 #undef  ST
3308 #define ST st->u.trie
3309         case TRIEC:
3310             /* In this case the charclass data is available inline so
3311                we can fail fast without a lot of extra overhead.
3312              */
3313             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3314                 DEBUG_EXECUTE_r(
3315                     PerlIO_printf(Perl_debug_log,
3316                               "%*s  %sfailed to match trie start class...%s\n",
3317                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3318                 );
3319                 sayNO_SILENT;
3320                 assert(0); /* NOTREACHED */
3321             }
3322             /* FALL THROUGH */
3323         case TRIE:
3324             /* the basic plan of execution of the trie is:
3325              * At the beginning, run though all the states, and
3326              * find the longest-matching word. Also remember the position
3327              * of the shortest matching word. For example, this pattern:
3328              *    1  2 3 4    5
3329              *    ab|a|x|abcd|abc
3330              * when matched against the string "abcde", will generate
3331              * accept states for all words except 3, with the longest
3332              * matching word being 4, and the shortest being 1 (with
3333              * the position being after char 1 of the string).
3334              *
3335              * Then for each matching word, in word order (i.e. 1,2,4,5),
3336              * we run the remainder of the pattern; on each try setting
3337              * the current position to the character following the word,
3338              * returning to try the next word on failure.
3339              *
3340              * We avoid having to build a list of words at runtime by
3341              * using a compile-time structure, wordinfo[].prev, which
3342              * gives, for each word, the previous accepting word (if any).
3343              * In the case above it would contain the mappings 1->2, 2->0,
3344              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3345              * the longest word (4 above), a list of all words, by
3346              * following the list of prev pointers; this gives us the
3347              * unordered list 4,5,1,2. Then given the current word we have
3348              * just tried, we can go through the list and find the
3349              * next-biggest word to try (so if we just failed on word 2,
3350              * the next in the list is 4).
3351              *
3352              * Since at runtime we don't record the matching position in
3353              * the string for each word, we have to work that out for
3354              * each word we're about to process. The wordinfo table holds
3355              * the character length of each word; given that we recorded
3356              * at the start: the position of the shortest word and its
3357              * length in chars, we just need to move the pointer the
3358              * difference between the two char lengths. Depending on
3359              * Unicode status and folding, that's cheap or expensive.
3360              *
3361              * This algorithm is optimised for the case where are only a
3362              * small number of accept states, i.e. 0,1, or maybe 2.
3363              * With lots of accepts states, and having to try all of them,
3364              * it becomes quadratic on number of accept states to find all
3365              * the next words.
3366              */
3367
3368             {
3369                 /* what type of TRIE am I? (utf8 makes this contextual) */
3370                 DECL_TRIE_TYPE(scan);
3371
3372                 /* what trie are we using right now */
3373                 reg_trie_data * const trie
3374                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3375                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3376                 U32 state = trie->startstate;
3377
3378                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3379                     if (trie->states[ state ].wordnum) {
3380                          DEBUG_EXECUTE_r(
3381                             PerlIO_printf(Perl_debug_log,
3382                                           "%*s  %smatched empty string...%s\n",
3383                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3384                         );
3385                         if (!trie->jump)
3386                             break;
3387                     } else {
3388                         DEBUG_EXECUTE_r(
3389                             PerlIO_printf(Perl_debug_log,
3390                                           "%*s  %sfailed to match trie start class...%s\n",
3391                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3392                         );
3393                         sayNO_SILENT;
3394                    }
3395                 }
3396
3397             {
3398                 U8 *uc = ( U8* )locinput;
3399
3400                 STRLEN len = 0;
3401                 STRLEN foldlen = 0;
3402                 U8 *uscan = (U8*)NULL;
3403                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3404                 U32 charcount = 0; /* how many input chars we have matched */
3405                 U32 accepted = 0; /* have we seen any accepting states? */
3406
3407                 ST.jump = trie->jump;
3408                 ST.me = scan;
3409                 ST.firstpos = NULL;
3410                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3411                 ST.nextword = 0;
3412
3413                 /* fully traverse the TRIE; note the position of the
3414                    shortest accept state and the wordnum of the longest
3415                    accept state */
3416
3417                 while ( state && uc <= (U8*)PL_regeol ) {
3418                     U32 base = trie->states[ state ].trans.base;
3419                     UV uvc = 0;
3420                     U16 charid = 0;
3421                     U16 wordnum;
3422                     wordnum = trie->states[ state ].wordnum;
3423
3424                     if (wordnum) { /* it's an accept state */
3425                         if (!accepted) {
3426                             accepted = 1;
3427                             /* record first match position */
3428                             if (ST.longfold) {
3429                                 ST.firstpos = (U8*)locinput;
3430                                 ST.firstchars = 0;
3431                             }
3432                             else {
3433                                 ST.firstpos = uc;
3434                                 ST.firstchars = charcount;
3435                             }
3436                         }
3437                         if (!ST.nextword || wordnum < ST.nextword)
3438                             ST.nextword = wordnum;
3439                         ST.topword = wordnum;
3440                     }
3441
3442                     DEBUG_TRIE_EXECUTE_r({
3443                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3444                                 PerlIO_printf( Perl_debug_log,
3445                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3446                                     2+depth * 2, "", PL_colors[4],
3447                                     (UV)state, (accepted ? 'Y' : 'N'));
3448                     });
3449
3450                     /* read a char and goto next state */
3451                     if ( base ) {
3452                         I32 offset;
3453                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3454                                              uscan, len, uvc, charid, foldlen,
3455                                              foldbuf, uniflags);
3456                         charcount++;
3457                         if (foldlen>0)
3458                             ST.longfold = TRUE;
3459                         if (charid &&
3460                              ( ((offset =
3461                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3462
3463                              && ((U32)offset < trie->lasttrans)
3464                              && trie->trans[offset].check == state)
3465                         {
3466                             state = trie->trans[offset].next;
3467                         }
3468                         else {
3469                             state = 0;
3470                         }
3471                         uc += len;
3472
3473                     }
3474                     else {
3475                         state = 0;
3476                     }
3477                     DEBUG_TRIE_EXECUTE_r(
3478                         PerlIO_printf( Perl_debug_log,
3479                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3480                             charid, uvc, (UV)state, PL_colors[5] );
3481                     );
3482                 }
3483                 if (!accepted)
3484                    sayNO;
3485
3486                 /* calculate total number of accept states */
3487                 {
3488                     U16 w = ST.topword;
3489                     accepted = 0;
3490                     while (w) {
3491                         w = trie->wordinfo[w].prev;
3492                         accepted++;
3493                     }
3494                     ST.accepted = accepted;
3495                 }
3496
3497                 DEBUG_EXECUTE_r(
3498                     PerlIO_printf( Perl_debug_log,
3499                         "%*s  %sgot %"IVdf" possible matches%s\n",
3500                         REPORT_CODE_OFF + depth * 2, "",
3501                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3502                 );
3503                 goto trie_first_try; /* jump into the fail handler */
3504             }}
3505             assert(0); /* NOTREACHED */
3506
3507         case TRIE_next_fail: /* we failed - try next alternative */
3508             if ( ST.jump) {
3509                 REGCP_UNWIND(ST.cp);
3510                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3511             }
3512             if (!--ST.accepted) {
3513                 DEBUG_EXECUTE_r({
3514                     PerlIO_printf( Perl_debug_log,
3515                         "%*s  %sTRIE failed...%s\n",
3516                         REPORT_CODE_OFF+depth*2, "",
3517                         PL_colors[4],
3518                         PL_colors[5] );
3519                 });
3520                 sayNO_SILENT;
3521             }
3522             {
3523                 /* Find next-highest word to process.  Note that this code
3524                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3525                 register U16 min = 0;
3526                 register U16 word;
3527                 register U16 const nextword = ST.nextword;
3528                 register reg_trie_wordinfo * const wordinfo
3529                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3530                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3531                     if (word > nextword && (!min || word < min))
3532                         min = word;
3533                 }
3534                 ST.nextword = min;
3535             }
3536
3537           trie_first_try:
3538             if (do_cutgroup) {
3539                 do_cutgroup = 0;
3540                 no_final = 0;
3541             }
3542
3543             if ( ST.jump) {
3544                 ST.lastparen = rex->lastparen;
3545                 ST.lastcloseparen = rex->lastcloseparen;
3546                 REGCP_SET(ST.cp);
3547             }
3548
3549             /* find the input position just past the end of the current word */
3550             {
3551                 U32 chars; /* how many chars to skip */
3552                 U8 *uc = ST.firstpos;
3553                 reg_trie_data * const trie
3554                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3555
3556                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3557                             >=  ST.firstchars);
3558                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3559                             - ST.firstchars;
3560
3561                 if (ST.longfold) {
3562                     /* the hard option - fold each char in turn and find
3563                      * its folded length (which may be different) */
3564                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3565                     STRLEN foldlen;
3566                     STRLEN len;
3567                     UV uvc;
3568                     U8 *uscan;
3569
3570                     while (chars) {
3571                         if (utf8_target) {
3572                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3573                                                     uniflags);
3574                             uc += len;
3575                         }
3576                         else {
3577                             uvc = *uc;
3578                             uc++;
3579                         }
3580                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3581                         uscan = foldbuf;
3582                         while (foldlen) {
3583                             if (!--chars)
3584                                 break;
3585                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3586                                             uniflags);
3587                             uscan += len;
3588                             foldlen -= len;
3589                         }
3590                     }
3591                 }
3592                 else {
3593                     if (utf8_target)
3594                         while (chars--)
3595                             uc += UTF8SKIP(uc);
3596                     else
3597                         uc += chars;
3598                 }
3599                 PL_reginput = (char *)uc;
3600             }
3601
3602             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3603                             ? ST.jump[ST.nextword]
3604                             : NEXT_OFF(ST.me));
3605
3606             DEBUG_EXECUTE_r({
3607                 PerlIO_printf( Perl_debug_log,
3608                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3609                     REPORT_CODE_OFF+depth*2, "",
3610                     PL_colors[4],
3611                     ST.nextword,
3612                     PL_colors[5]
3613                     );
3614             });
3615
3616             if (ST.accepted > 1 || has_cutgroup) {
3617                 PUSH_STATE_GOTO(TRIE_next, scan);
3618                 assert(0); /* NOTREACHED */
3619             }
3620             /* only one choice left - just continue */
3621             DEBUG_EXECUTE_r({
3622                 AV *const trie_words
3623                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3624                 SV ** const tmp = av_fetch( trie_words,
3625                     ST.nextword-1, 0 );
3626                 SV *sv= tmp ? sv_newmortal() : NULL;
3627
3628                 PerlIO_printf( Perl_debug_log,
3629                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3630                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3631                     ST.nextword,
3632                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3633                             PL_colors[0], PL_colors[1],
3634                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3635                         )
3636                     : "not compiled under -Dr",
3637                     PL_colors[5] );
3638             });
3639
3640             locinput = PL_reginput;
3641             nextchr = UCHARAT(locinput);
3642             continue; /* execute rest of RE */
3643             assert(0); /* NOTREACHED */
3644 #undef  ST
3645
3646         case EXACT: { /* match the node's literal string exactly (no case folding) */
3647             char *s = STRING(scan); /* cursor over the pattern node's string */
3648             ln = STR_LEN(scan); /* its length; used below as a byte count (s + ln, memNE) */
3649             if (utf8_target != UTF_PATTERN) {
3650                 /* The target and the pattern have differing utf8ness, so
                      * compare code point by code point instead of byte by byte. */
3651                 char *l = locinput; /* cursor over the target string */
3652                 const char * const e = s + ln; /* one past the end of the pattern string */
3653
3654                 if (utf8_target) {
3655                     /* The target is utf8, the pattern is not utf8. */
3656                     while (s < e) {
3657                         STRLEN ulen; /* byte length of the target char just decoded */
3658                         if (l >= PL_regeol) /* target exhausted before the pattern string */
3659                              sayNO;
3660                         if (NATIVE_TO_UNI(*(U8*)s) !=
3661                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3662                                             uniflags))
3663                              sayNO;
3664                         l += ulen; /* target: advance by the decoded length */
3665                         s ++; /* pattern: advance one byte */
3666                     }
3667                 }
3668                 else {
3669                     /* The target is not utf8, the pattern is utf8. */
3670                     while (s < e) {
3671                         STRLEN ulen; /* byte length of the pattern char just decoded */
3672                         if (l >= PL_regeol) /* target exhausted before the pattern string */
3673                             sayNO;
3674                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3675                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3676                                            uniflags))
3677                             sayNO;
3678                         s += ulen; /* pattern: advance by the decoded length */
3679                         l ++; /* target: advance one byte */
3680                     }
3681                 }
3682                 locinput = l; /* whole string matched: continue after the consumed text */
3683                 nextchr = UCHARAT(locinput);
3684                 break;
3685             }
3686             /* The target and the pattern have the same utf8ness. */
3687             /* Inline the first character, for speed. */
3688             if (UCHARAT(s) != nextchr)
3689                 sayNO;
3690             if (PL_regeol - locinput < ln) /* fewer than ln bytes left in the target */
3691                 sayNO;
3692             if (ln > 1 && memNE(s, locinput, ln)) /* ln <= 1 is already decided by the first-char test */
3693                 sayNO;
3694             locinput += ln;
3695             nextchr = UCHARAT(locinput);
3696             break;
3697             }
3698         case EXACTFL: { /* case-insensitive literal match; each variant below picks its fold rules, then all share do_exactf */
3699             re_fold_t folder; /* whole-string fold comparison, called on the non-utf8 path */
3700             const U8 * fold_array; /* per-byte fold table, used for the first-character quick test */
3701             const char * s; /* the pattern node's string */
3702             U32 fold_utf8_flags; /* flags passed to foldEQ_utf8_flags() on the utf8 path */
3703
3704             PL_reg_flags |= RF_tainted; /* set only by the locale variant; NOTE(review): presumably because locale rules influence the result - confirm */
3705             folder = foldEQ_locale;
3706             fold_array = PL_fold_locale;
3707             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3708             goto do_exactf;
3709
3710         case EXACTFU_SS:
3711         case EXACTFU_TRICKYFOLD:
3712         case EXACTFU:
3713             folder = foldEQ_latin1;
3714             fold_array = PL_fold_latin1;
3715             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0; /* s1 in the call below is the pattern string */
3716             goto do_exactf;
3717
3718         case EXACTFA:
3719             folder = foldEQ_latin1;
3720             fold_array = PL_fold_latin1;
3721             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3722             goto do_exactf;
3723
3724         case EXACTF:
3725             folder = foldEQ;
3726             fold_array = PL_fold;
3727             fold_utf8_flags = 0; /* no goto needed: falls straight into do_exactf */
3728
3729           do_exactf:
3730             s = STRING(scan);
3731             ln = STR_LEN(scan);
3732
3733             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3734               /* Either the target or the pattern is utf8, or the node is
3735                * EXACTFU_SS, which has the issue where the fold lengths may differ. */
3736                 const char * const l = locinput;
3737                 char *e = PL_regeol; /* passed by address; its value afterwards becomes the new locinput */
3738
3739                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3740                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3741                 {
3742                     sayNO;
3743                 }
3744                 locinput = e;
3745                 nextchr = UCHARAT(locinput);
3746                 break;
3747             }
3748
3749             /* Neither the target nor the pattern is utf8 (and the node is not EXACTFU_SS) */
3750             if (UCHARAT(s) != nextchr &&
3751                 UCHARAT(s) != fold_array[nextchr])
3752             { /* quick reject: first pattern byte equals neither nextchr nor its fold */
3753                 sayNO;
3754             }
3755             if (PL_regeol - locinput < ln) /* fewer than ln bytes left in the target */
3756                 sayNO;
3757             if (ln > 1 && ! folder(s, locinput, ln)) /* ln <= 1 is already decided by the quick test above */
3758                 sayNO;
3759             locinput += ln;
3760             nextchr = UCHARAT(locinput);
3761             break;
3762         }
3763
3764         /* XXX Could improve efficiency by separating these all out using a
3765          * macro or in-line function.  At that point regcomp.c would no longer
3766          * have to set the FLAGS fields of these */
3767         case BOUNDL:
3768         case NBOUNDL:
3769             PL_reg_flags |= RF_tainted; /* set only for the locale variants */
3770             /* FALL THROUGH */
3771         case BOUND:
3772         case BOUNDU:
3773         case BOUNDA:
3774         case NBOUND:
3775         case NBOUNDU:
3776         case NBOUNDA:
3777             /* Compute ln = "was the previous char a word char?" and n = "is the current char a word char?" */
3778             if (utf8_target
3779                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3780                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3781             {
3782                 if (locinput == PL_bostr)
3783                     ln = '\n'; /* at string start: use newline as the stand-in previous char */
3784                 else {
3785                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); /* presumably the start of the char before locinput - see reghop3 */
3786
3787                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); /* ln temporarily holds the previous code point */
3788                 }
3789                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3790                     ln = isALNUM_uni(ln); /* from here on ln is a truth value, not a code point */
3791                     LOAD_UTF8_CHARCLASS_ALNUM();
3792                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3793                 }
3794                 else {
3795                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3796                     n = isALNUM_LC_utf8((U8*)locinput);
3797                 }
3798             }
3799             else {
3800
3801                 /* Here the string isn't utf8, or is utf8 and only ascii
3802                  * characters are to match \w.  In the latter case looking at
3803                  * the byte just prior to the current one may be just the final
3804                  * byte of a multi-byte character.  This is ok.  There are two
3805                  * cases:
3806                  * 1) it is a single byte character, and then the test is doing
3807                  *      just what it's supposed to.
3808                  * 2) it is a multi-byte character, in which case the final
3809                  *      byte is never mistakable for ASCII, and so the test
3810                  *      will say it is not a word character, which is the
3811                  *      correct answer. */
3812                 ln = (locinput != PL_bostr) ?
3813                     UCHARAT(locinput - 1) : '\n';
3814                 switch (FLAGS(scan)) { /* the node's FLAGS field selects the character-set rules */
3815                     case REGEX_UNICODE_CHARSET:
3816                         ln = isWORDCHAR_L1(ln);
3817                         n = isWORDCHAR_L1(nextchr);
3818                         break;
3819                     case REGEX_LOCALE_CHARSET:
3820                         ln = isALNUM_LC(ln);
3821                         n = isALNUM_LC(nextchr);
3822                         break;
3823                     case REGEX_DEPENDS_CHARSET:
3824                         ln = isALNUM(ln);
3825                         n = isALNUM(nextchr);
3826                         break;
3827                     case REGEX_ASCII_RESTRICTED_CHARSET:
3828                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3829                         ln = isWORDCHAR_A(ln);
3830                         n = isWORDCHAR_A(nextchr);
3831                         break;
3832                     default:
3833                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3834                         break;
3835                 }
3836             }
3837             /* (!ln == !n) is true when both sides agree, i.e. there is no boundary here.  The BOUND ops fail in that case and the NBOUND ops
3838              * fail in the opposite case.  Note this requires that all BOUNDs be lower than all NBOUNDs in regcomp.sym */
3839             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3840                     sayNO;
3841             break;
3842         case ANYOFV: /* character class match; ANYOFV always takes the reginclass() path below */
3843         case ANYOF:
3844             if (utf8_target || state_num == ANYOFV) {
3845                 STRLEN inclasslen = PL_regeol - locinput; /* starts as the bytes remaining; passed by address to reginclass() */
3846                 if (locinput >= PL_regeol) /* nothing left to match against */
3847                     sayNO;
3848
3849                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3850                     sayNO;
3851                 locinput += inclasslen; /* advance by the length left in inclasslen after the call */
3852                 nextchr = UCHARAT(locinput);
3853                 break;
3854             }
3855             else {
3856                 if (nextchr < 0) /* negative nextchr: re-read it from the input */
3857                     nextchr = UCHARAT(locinput);
3858                 if (!nextchr && locinput >= PL_regeol) /* a zero byte at or past the end is not matchable input */
3859                     sayNO;
3860                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3861                     sayNO;
3862                 nextchr = UCHARAT(++locinput); /* this path consumes exactly one byte */
3863                 break;
3864             }
3865             break; /* not reached: both branches above end in break */
3866         /* Special char classes - The defines start on line 129 or so */
3867         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3868                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3869                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3870                   ALNUMA, NALNUMA, isWORDCHAR_A,
3871                   alnum, "a");
3872
3873         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3874                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3875                   SPACEU, NSPACEU, isSPACE_L1,
3876                   SPACEA, NSPACEA, isSPACE_A,
3877                   space, " ");
3878
3879         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3880                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3881                 DIGITA, NDIGITA, isDIGIT_A,
3882                 digit, "0");
3883
3884         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3885                        a Unicode extended Grapheme Cluster */
3886             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3887               extended Grapheme Cluster is:
3888
3889                CR LF
3890                | Prepend* Begin Extend*
3891                | .
3892
3893                Begin is (Hangul-syllable | ! Control)
3894                Extend is (Grapheme_Extend | Spacing_Mark)
3895                Control is [ GCB_Control CR LF ]
3896
3897                The discussion below shows how the code for CLUMP is derived
3898                from this regex.  Note that most of these concepts are from
3899                property values of the Grapheme Cluster Boundary (GCB) property.
3900                No code point can have multiple property values for a given
3901                property.  Thus a code point in Prepend can't be in Control, but
3902                it must be in !Control.  This is why Control above includes
3903                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3904                property separately, and so can't be in GCB_Control, even though
3905                they logically are controls.  Control is not the same as gc=cc,
3906                but includes format and other characters as well.
3907
3908                The Unicode definition of Hangul-syllable is:
3909                   ( L+
3910                    | (L* ( ( V | LV ) V* | LVT ) T*)
3911                    | T+
3912                   )
3913                Each of these is a value for the GCB property, and hence must be
3914                disjoint, so the order they are tested is immaterial, so the
3915                above can safely be changed to
3916                    T+
3917                    | L+
3918                    | (L* ( LVT | ( V | LV ) V*) T*)
3919
3920                The last two terms can be combined like this:
3921                    L* ( L
3922                         | (( LVT | ( V | LV ) V*) T*))
3923
3924                And refactored into this:
3925                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3926
3927                That means that if we have seen any L's at all we can quit
3928                there, but if the next character is an LVT, a V, or an LV we
3929                should keep going.
3930
3931                There is a subtlety with Prepend* which showed up in testing.
3932                Note that the Begin, and only the Begin is required in:
3933                 | Prepend* Begin Extend*
3934                Also, Begin contains '! Control'.  A Prepend must be a
3935                '!  Control', which means it must also be a Begin.  What it
3936                comes down to is that if we match Prepend* and then find no
3937                suitable Begin afterwards, that if we backtrack the last
3938                Prepend, that one will be a suitable Begin.
3939             */
3940
3941             if (locinput >= PL_regeol)
3942                 sayNO;
3943             if  (! utf8_target) {
3944
3945                 /* Match either CR LF  or '.', as all the other possibilities
3946                  * require utf8 */
3947                 locinput++;         /* Match the . or CR */
3948                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3949                                        match the LF */
3950                     && locinput < PL_regeol
3951                     && UCHARAT(locinput) == '\n') locinput++;
3952             }
3953             else {
3954
3955                 /* Utf8: See if is ( CR LF ); already know that locinput <
3956                  * PL_regeol, so locinput+1 is in bounds */
3957                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3958                     locinput += 2;
3959                 }
3960                 else {
3961                     /* In case have to backtrack to beginning, then match '.' */
3962                     char *starting = locinput;
3963
3964                     /* In case have to backtrack the last prepend */
3965                     char *previous_prepend = 0;
3966
3967                     LOAD_UTF8_CHARCLASS_GCB();
3968
3969                     /* Match (prepend)* */
3970                     while (locinput < PL_regeol
3971                            && swash_fetch(PL_utf8_X_prepend,
3972                                           (U8*)locinput, utf8_target))
3973                     {
3974                         previous_prepend = locinput;
3975                         locinput += UTF8SKIP(locinput);
3976                     }
3977
3978                     /* As noted above, if we matched a prepend character, but
3979                      * the next thing won't match, back off the last prepend we
3980                      * matched, as it is guaranteed to match the begin */
3981                     if (previous_prepend
3982                         && (locinput >=  PL_regeol
3983                             || ! swash_fetch(PL_utf8_X_begin,
3984                                              (U8*)locinput, utf8_target)))
3985                     {
3986                         locinput = previous_prepend;
3987                     }
3988
3989                     /* Note that here we know PL_regeol > locinput, as we
3990                      * tested that upon input to this switch case, and if we
3991                      * moved locinput forward, we tested the result just above
3992                      * and it either passed, or we backed off so that it will
3993                      * now pass */
3994                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3995
3996                         /* Here did not match the required 'Begin' in the
3997                          * second term.  So just match the very first
3998                          * character, the '.' of the final term of the regex */
3999                         locinput = starting + UTF8SKIP(starting);
4000                     } else {
4001
4002                         /* Here is the beginning of a character that can have
4003                          * an extender.  It is either a hangul syllable, or a
4004                          * non-control */
4005                         if (swash_fetch(PL_utf8_X_non_hangul,
4006                                         (U8*)locinput, utf8_target))
4007                         {
4008
4009                             /* Here not a Hangul syllable, so must be a
4010                              * ('! Control') */
4011                             locinput += UTF8SKIP(locinput);
4012                         } else {
4013
4014                             /* Here is a Hangul syllable.  It can be composed
4015                              * of several individual characters.  One
4016                              * possibility is T+ */
4017                             if (swash_fetch(PL_utf8_X_T,
4018                                             (U8*)locinput, utf8_target))
4019                             {
4020                                 while (locinput < PL_regeol
4021                                         && swash_fetch(PL_utf8_X_T,
4022                                                         (U8*)locinput, utf8_target))
4023                                 {
4024                                     locinput += UTF8SKIP(locinput);
4025                                 }
4026                             } else {
4027
4028                                 /* Here, not T+, but is a Hangul.  That means
4029                                  * it is one of the others: L, LV, LVT or V,
4030                                  * and matches:
4031                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4032
4033                                 /* Match L*           */
4034                                 while (locinput < PL_regeol
4035                                         && swash_fetch(PL_utf8_X_L,
4036                                                         (U8*)locinput, utf8_target))
4037                                 {
4038                                     locinput += UTF8SKIP(locinput);
4039                                 }
4040
4041                                 /* Here, have exhausted L*.  If the next
4042                                  * character is not an LV, LVT nor V, it means
4043                                  * we had to have at least one L, which matches
4044                                  * L+ in the original equation, so we have a
4045                                  * complete hangul syllable and are done. */
4046
4047                                 if (locinput < PL_regeol
4048                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
4049                                                     (U8*)locinput, utf8_target))
4050                                 {
4051
4052                                     /* Otherwise keep going.  Must be LV, LVT
4053                                      * or V.  See if LVT */
4054                                     if (swash_fetch(PL_utf8_X_LVT,
4055                                                     (U8*)locinput, utf8_target))
4056                                     {
4057                                         locinput += UTF8SKIP(locinput);
4058                                     } else {
4059
4060                                         /* Must be  V or LV.  Take it, then
4061                                          * match V*     */
4062                                         locinput += UTF8SKIP(locinput);
4063                                         while (locinput < PL_regeol
4064                                                 && swash_fetch(PL_utf8_X_V,
4065                                                          (U8*)locinput, utf8_target))
4066                                         {
4067                                             locinput += UTF8SKIP(locinput);
4068                                         }
4069                                     }
4070
4071                                     /* And any of LV, LVT, or V can be followed
4072                                      * by T*            */
4073                                     while (locinput < PL_regeol
4074                                            && swash_fetch(PL_utf8_X_T,
4075                                                            (U8*)locinput,
4076                                                            utf8_target))
4077                                     {
4078                                         locinput += UTF8SKIP(locinput);
4079                                     }
4080                                 }
4081                             }
4082                         }
4083
4084                         /* Match any extender */
4085                         while (locinput < PL_regeol
4086                                 && swash_fetch(PL_utf8_X_extend,
4087                                                 (U8*)locinput, utf8_target))
4088                         {
4089                             locinput += UTF8SKIP(locinput);
4090                         }
4091                     }
4092                 }
4093                 if (locinput > PL_regeol) sayNO;
4094             }
4095             nextchr = UCHARAT(locinput);
4096             break;
4097
4098         case NREFFL:
4099         {   /* The capture buffer cases.  The ones beginning with N for the
4100                named buffers just convert to the equivalent numbered and
4101                pretend they were called as the corresponding numbered buffer
4102                op.  */
4103             /* don't initialize these in the declaration, it makes C++
4104                unhappy */
4105             char *s;
4106             char type;
4107             re_fold_t folder;
4108             const U8 *fold_array;
4109             UV utf8_fold_flags;
4110
4111             PL_reg_flags |= RF_tainted;
4112             folder = foldEQ_locale;
4113             fold_array = PL_fold_locale;
4114             type = REFFL;
4115             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4116             goto do_nref;
4117
4118         case NREFFA:
4119             folder = foldEQ_latin1;
4120             fold_array = PL_fold_latin1;
4121             type = REFFA;
4122             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4123             goto do_nref;
4124
4125         case NREFFU:
4126             folder = foldEQ_latin1;
4127             fold_array = PL_fold_latin1;
4128             type = REFFU;
4129             utf8_fold_flags = 0;
4130             goto do_nref;
4131
4132         case NREFF:
4133             folder = foldEQ;
4134             fold_array = PL_fold;
4135             type = REFF;
4136             utf8_fold_flags = 0;
4137             goto do_nref;
4138
4139         case NREF:
4140             type = REF;
4141             folder = NULL;
4142             fold_array = NULL;
4143             utf8_fold_flags = 0;
4144           do_nref:
4145
4146             /* For the named back references, find the corresponding buffer
4147              * number */
4148             n = reg_check_named_buff_matched(rex,scan);
4149
4150             if ( ! n ) {
4151                 sayNO;
4152             }
4153             goto do_nref_ref_common;
4154
4155         case REFFL:
4156             PL_reg_flags |= RF_tainted;
4157             folder = foldEQ_locale;
4158             fold_array = PL_fold_locale;
4159             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4160             goto do_ref;
4161
4162         case REFFA:
4163             folder = foldEQ_latin1;
4164             fold_array = PL_fold_latin1;
4165             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4166             goto do_ref;
4167
4168         case REFFU:
4169             folder = foldEQ_latin1;
4170             fold_array = PL_fold_latin1;
4171             utf8_fold_flags = 0;
4172             goto do_ref;
4173
4174         case REFF:
4175             folder = foldEQ;
4176             fold_array = PL_fold;
4177             utf8_fold_flags = 0;
4178             goto do_ref;
4179
4180         case REF:
4181             folder = NULL;
4182             fold_array = NULL;
4183             utf8_fold_flags = 0;
4184
4185           do_ref:
4186             type = OP(scan);
4187             n = ARG(scan);  /* which paren pair */
4188
4189           do_nref_ref_common:
4190             ln = rex->offs[n].start;
4191             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4192             if (rex->lastparen < n || ln == -1)
4193                 sayNO;                  /* Do not match unless seen CLOSEn. */
4194             if (ln == rex->offs[n].end)
4195                 break;
4196
4197             s = PL_bostr + ln;
4198             if (type != REF     /* REF can do byte comparison */
4199                 && (utf8_target || type == REFFU))
4200             { /* XXX handle REFFL better */
4201                 char * limit = PL_regeol;
4202
4203                 /* This call case insensitively compares the entire buffer
4204                     * at s, with the current input starting at locinput, but
4205                     * not going off the end given by PL_regeol, and returns in
4206                     * limit upon success, how much of the current input was
4207                     * matched */
4208                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4209                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4210                 {
4211                     sayNO;
4212                 }
4213                 locinput = limit;
4214                 nextchr = UCHARAT(locinput);
4215                 break;
4216             }
4217
4218             /* Not utf8:  Inline the first character, for speed. */
4219             if (UCHARAT(s) != nextchr &&
4220                 (type == REF ||
4221                  UCHARAT(s) != fold_array[nextchr]))
4222                 sayNO;
4223             ln = rex->offs[n].end - ln;
4224             if (locinput + ln > PL_regeol)
4225                 sayNO;
4226             if (ln > 1 && (type == REF
4227                            ? memNE(s, locinput, ln)
4228                            : ! folder(s, locinput, ln)))
4229                 sayNO;
4230             locinput += ln;
4231             nextchr = UCHARAT(locinput);
4232             break;
4233         }
4234         case NOTHING:
4235         case TAIL:
4236             break;
4237         case BACK:
4238             break;
4239
4240 #undef  ST
4241 #define ST st->u.eval
4242         {
4243             SV *ret;
4244             REGEXP *re_sv;
4245             regexp *re;
4246             regexp_internal *rei;
4247             regnode *startpoint;
4248
4249         case GOSTART:
4250         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4251             if (cur_eval && cur_eval->locinput==locinput) {
4252                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4253                     Perl_croak(aTHX_ "Infinite recursion in regex");
4254                 if ( ++nochange_depth > max_nochange_depth )
4255                     Perl_croak(aTHX_
4256                         "Pattern subroutine nesting without pos change"
4257                         " exceeded limit in regex");
4258             } else {
4259                 nochange_depth = 0;
4260             }
4261             re_sv = rex_sv;
4262             re = rex;
4263             rei = rexi;
4264             if (OP(scan)==GOSUB) {
4265                 startpoint = scan + ARG2L(scan);
4266                 ST.close_paren = ARG(scan);
4267             } else {
4268                 startpoint = rei->program+1;
4269                 ST.close_paren = 0;
4270             }
4271             goto eval_recurse_doit;
4272             assert(0); /* NOTREACHED */
4273         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4274             if (cur_eval && cur_eval->locinput==locinput) {
4275                 if ( ++nochange_depth > max_nochange_depth )
4276                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4277             } else {
4278                 nochange_depth = 0;
4279             }
4280             {
4281                 /* execute the code in the {...} */
4282
4283                 dSP;
4284                 SV ** before;
4285                 OP * const oop = PL_op;
4286                 COP * const ocurcop = PL_curcop;
4287                 OP *nop;
4288                 char *saved_regeol = PL_regeol;
4289                 struct re_save_state saved_state;
4290                 CV *newcv;
4291
4292                 /* save *all* paren positions */
4293                 regcppush(rex, 0);
4294                 REGCP_SET(runops_cp);
4295
4296                 /* To not corrupt the existing regex state while executing the
4297                  * eval we would normally put it on the save stack, like with
4298                  * save_re_context. However, re-evals have a weird scoping so we
4299                  * can't just add ENTER/LEAVE here. With that, things like
4300                  *
4301                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4302                  *
4303                  * would break, as they expect the localisation to be unwound
4304                  * only when the re-engine backtracks through the bit that
4305                  * localised it.
4306                  *
4307                  * What we do instead is to just save the state in a local C
4308                  * variable.
4309                  */
4310                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4311
4312                 PL_reg_state.re_reparsing = FALSE;
4313
4314                 if (!caller_cv)
4315                     caller_cv = find_runcv(NULL);
4316
4317                 n = ARG(scan);
4318
4319                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4320                     newcv = ((struct regexp *)SvANY(
4321                                                 (REGEXP*)(rexi->data->data[n])
4322                                             ))->qr_anoncv
4323                                         ;
4324                     nop = (OP*)rexi->data->data[n+1];
4325                 }
4326                 else if (rexi->data->what[n] == 'l') { /* literal code */
4327                     newcv = caller_cv;
4328                     nop = (OP*)rexi->data->data[n];
4329                     assert(CvDEPTH(newcv));
4330                 }
4331                 else {
4332                     /* literal with own CV */
4333                     assert(rexi->data->what[n] == 'L');
4334                     newcv = rex->qr_anoncv;
4335                     nop = (OP*)rexi->data->data[n];
4336                 }
4337
4338                 /* normally if we're about to execute code from the same
4339                  * CV that we used previously, we just use the existing
4340                  * CX stack entry. However, it's possible that in the
4341                  * meantime we may have backtracked, popped from the save
4342                  * stack, and undone the SAVECOMPPAD(s) associated with
4343                  * PUSH_MULTICALL; in which case PL_comppad no longer
4344                  * points to newcv's pad. */
4345                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4346                 {
4347                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4348                     if (last_pushed_cv) {
4349                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4350                     }
4351                     else {
4352                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4353                     }
4354                     last_pushed_cv = newcv;
4355                 }
4356                 last_pad = PL_comppad;
4357
4358                 /* the initial nextstate you would normally execute
4359                  * at the start of an eval (which would cause error
4360                  * messages to come from the eval), may be optimised
4361                  * away from the execution path in the regex code blocks;
4362                  * so manually set PL_curcop to it initially */
4363                 {
4364                     OP *o = cUNOPx(nop)->op_first;
4365                     assert(o->op_type == OP_NULL);
4366                     if (o->op_targ == OP_SCOPE) {
4367                         o = cUNOPo->op_first;
4368                     }
4369                     else {
4370                         assert(o->op_targ == OP_LEAVE);
4371                         o = cUNOPo->op_first;
4372                         assert(o->op_type == OP_ENTER);
4373                         o = o->op_sibling;
4374                     }
4375
4376                     if (o->op_type != OP_STUB) {
4377                         assert(    o->op_type == OP_NEXTSTATE
4378                                 || o->op_type == OP_DBSTATE
4379                                 || (o->op_type == OP_NULL
4380                                     &&  (  o->op_targ == OP_NEXTSTATE
4381                                         || o->op_targ == OP_DBSTATE
4382                                         )
4383                                     )
4384                         );
4385                         PL_curcop = (COP*)o;
4386                     }
4387                 }
4388                 nop = nop->op_next;
4389
4390                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4391                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4392
4393                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4394
4395                 if (sv_yes_mark) {
4396                     SV *sv_mrk = get_sv("REGMARK", 1);
4397                     sv_setsv(sv_mrk, sv_yes_mark);
4398                 }
4399
4400                 /* we don't use MULTICALL here as we want to call the
4401                  * first op of the block of interest, rather than the
4402                  * first op of the sub */
4403                 before = SP;
4404                 PL_op = nop;
4405                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4406                 SPAGAIN;
4407                 if (SP == before)
4408                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4409                 else {
4410                     ret = POPs;
4411                     PUTBACK;
4412                 }
4413
4414                 /* before restoring everything, evaluate the returned
4415                  * value, so that 'uninit' warnings don't use the wrong
4416                  * PL_op or pad. Also need to process any magic vars
4417                  * (e.g. $1) *before* parentheses are restored */
4418
4419                 PL_op = NULL;
4420
4421                 re_sv = NULL;
4422                 if (logical == 0)        /*   (?{})/   */
4423                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4424                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4425                     sw = cBOOL(SvTRUE(ret));
4426                     logical = 0;
4427                 }
4428                 else {                   /*  /(??{})  */
4429                     /*  if it's overloaded, let the regex compiler handle
4430                      *  it; otherwise extract regex, or stringify  */
4431                     if (!SvAMAGIC(ret)) {
4432                         SV *sv = ret;
4433                         if (SvROK(sv))
4434                             sv = SvRV(sv);
4435                         if (SvTYPE(sv) == SVt_REGEXP)
4436                             re_sv = (REGEXP*) sv;
4437                         else if (SvSMAGICAL(sv)) {
4438                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4439                             if (mg)
4440                                 re_sv = (REGEXP *) mg->mg_obj;
4441                         }
4442
4443                         /* force any magic, undef warnings here */
4444                         if (!re_sv) {
4445                             ret = sv_mortalcopy(ret);
4446                             (void) SvPV_force_nolen(ret);
4447                         }
4448                     }
4449
4450                 }
4451
4452                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4453
4454                 /* *** Note that at this point we don't restore
4455                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4456                  * be used again soon. This is safe as long as nothing
4457                  * in the regexp code uses the pad ! */
4458                 PL_op = oop;
4459                 PL_curcop = ocurcop;
4460                 PL_regeol = saved_regeol;
4461                 S_regcp_restore(aTHX_ rex, runops_cp);
4462
4463                 if (logical != 2)
4464                     break;
4465             }
4466
4467                 /* only /(??{})/  from now on */
4468                 logical = 0;
4469                 {
4470                     /* extract RE object from returned value; compiling if
4471                      * necessary */
4472
4473                     if (re_sv) {
4474                         re_sv = reg_temp_copy(NULL, re_sv);
4475                     }
4476                     else {
4477                         U32 pm_flags = 0;
4478                         const I32 osize = PL_regsize;
4479
4480                         if (SvUTF8(ret) && IN_BYTES) {
4481                             /* In use 'bytes': make a copy of the octet
4482                              * sequence, but without the flag on */
4483                             STRLEN len;
4484                             const char *const p = SvPV(ret, len);
4485                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4486                         }
4487                         if (rex->intflags & PREGf_USE_RE_EVAL)
4488                             pm_flags |= PMf_USE_RE_EVAL;
4489
4490                         /* if we got here, it should be an engine which
4491                          * supports compiling code blocks and stuff */
4492                         assert(rex->engine && rex->engine->op_comp);
4493                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4494                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4495                                     rex->engine, NULL, NULL,
4496                                     /* copy /msix etc to inner pattern */
4497                                     scan->flags,
4498                                     pm_flags);
4499
4500                         if (!(SvFLAGS(ret)
4501                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4502                                  | SVs_GMG))) {
4503                             /* This isn't a first class regexp. Instead, it's
4504                                caching a regexp onto an existing, Perl visible
4505                                scalar.  */
4506                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4507                         }
4508                         PL_regsize = osize;
4509                         /* safe to do now that any $1 etc has been
4510                          * interpolated into the new pattern string and
4511                          * compiled */
4512                         S_regcp_restore(aTHX_ rex, runops_cp);
4513                     }
4514                     re = (struct regexp *)SvANY(re_sv);
4515                 }
4516                 RXp_MATCH_COPIED_off(re);
4517                 re->subbeg = rex->subbeg;
4518                 re->sublen = rex->sublen;
4519                 rei = RXi_GET(re);
4520                 DEBUG_EXECUTE_r(
4521                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4522                         "Matching embedded");
4523                 );
4524                 startpoint = rei->program + 1;
4525                 ST.close_paren = 0; /* only used for GOSUB */
4526
4527         eval_recurse_doit: /* Share code with GOSUB below this line */
4528                 /* run the pattern returned from (??{...}) */
4529                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4530                 REGCP_SET(ST.lastcp);
4531
4532                 re->lastparen = 0;
4533                 re->lastcloseparen = 0;
4534
4535                 PL_reginput = locinput;
4536                 PL_regsize = 0;
4537
4538                 /* XXXX This is too dramatic a measure... */
4539                 PL_reg_maxiter = 0;
4540
4541                 ST.toggle_reg_flags = PL_reg_flags;
4542                 if (RX_UTF8(re_sv))
4543                     PL_reg_flags |= RF_utf8;
4544                 else
4545                     PL_reg_flags &= ~RF_utf8;
4546                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4547
4548                 ST.prev_rex = rex_sv;
4549                 ST.prev_curlyx = cur_curlyx;
4550                 rex_sv = re_sv;
4551                 SET_reg_curpm(rex_sv);
4552                 rex = re;
4553                 rexi = rei;
4554                 cur_curlyx = NULL;
4555                 ST.B = next;
4556                 ST.prev_eval = cur_eval;
4557                 cur_eval = st;
4558                 /* now continue from first node in postponed RE */
4559                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4560                 assert(0); /* NOTREACHED */
4561         }
4562
4563         case EVAL_AB: /* cleanup after a successful (??{A})B */
4564             /* note: this is called twice; first after popping B, then A */
4565             PL_reg_flags ^= ST.toggle_reg_flags;
4566             rex_sv = ST.prev_rex;
4567             SET_reg_curpm(rex_sv);
4568             rex = (struct regexp *)SvANY(rex_sv);
4569             rexi = RXi_GET(rex);
4570             regcpblow(ST.cp);
4571             cur_eval = ST.prev_eval;
4572             cur_curlyx = ST.prev_curlyx;
4573
4574             /* XXXX This is too dramatic a measure... */
4575             PL_reg_maxiter = 0;
4576             if ( nochange_depth )
4577                 nochange_depth--;
4578             sayYES;
4579
4580
4581         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4582             /* note: this is called twice; first after popping B, then A */
4583             PL_reg_flags ^= ST.toggle_reg_flags;
4584             rex_sv = ST.prev_rex;
4585             SET_reg_curpm(rex_sv);
4586             rex = (struct regexp *)SvANY(rex_sv);
4587             rexi = RXi_GET(rex);
4588
4589             PL_reginput = locinput;
4590             REGCP_UNWIND(ST.lastcp);
4591             regcppop(rex);
4592             cur_eval = ST.prev_eval;
4593             cur_curlyx = ST.prev_curlyx;
4594             /* XXXX This is too dramatic a measure... */
4595             PL_reg_maxiter = 0;
4596             if ( nochange_depth )
4597                 nochange_depth--;
4598             sayNO_SILENT;
4599 #undef ST
4600
4601         case OPEN:
4602             n = ARG(scan);  /* which paren pair */
4603             rex->offs[n].start_tmp = locinput - PL_bostr;
4604             if (n > PL_regsize)
4605                 PL_regsize = n;
4606             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4607                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4608                 PTR2UV(rex),
4609                 PTR2UV(rex->offs),
4610                 (UV)n,
4611                 (IV)rex->offs[n].start_tmp,
4612                 (UV)PL_regsize
4613             ));
4614             lastopen = n;
4615             break;
4616
4617 /* XXX really need to log other places start/end are set too */
4618 #define CLOSE_CAPTURE \
4619     rex->offs[n].start = rex->offs[n].start_tmp; \
4620     rex->offs[n].end = locinput - PL_bostr; \
4621     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4622         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4623         PTR2UV(rex), \
4624         PTR2UV(rex->offs), \
4625         (UV)n, \
4626         (IV)rex->offs[n].start, \
4627         (IV)rex->offs[n].end \
4628     ))
4629
4630         case CLOSE:
4631             n = ARG(scan);  /* which paren pair */
4632             CLOSE_CAPTURE;
4633             /*if (n > PL_regsize)
4634                 PL_regsize = n;*/
4635             if (n > rex->lastparen)
4636                 rex->lastparen = n;
4637             rex->lastcloseparen = n;
4638             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4639                 goto fake_end;
4640             }
4641             break;
4642         case ACCEPT:
4643             if (ARG(scan)){
4644                 regnode *cursor;
4645                 for (cursor=scan;
4646                      cursor && OP(cursor)!=END;
4647                      cursor=regnext(cursor))
4648                 {
4649                     if ( OP(cursor)==CLOSE ){
4650                         n = ARG(cursor);
4651                         if ( n <= lastopen ) {
4652                             CLOSE_CAPTURE;
4653                             /*if (n > PL_regsize)
4654                             PL_regsize = n;*/
4655                             if (n > rex->lastparen)
4656                                 rex->lastparen = n;
4657                             rex->lastcloseparen = n;
4658                             if ( n == ARG(scan) || (cur_eval &&
4659                                 cur_eval->u.eval.close_paren == n))
4660                                 break;
4661                         }
4662                     }
4663                 }
4664             }
4665             goto fake_end;
4666             /*NOTREACHED*/
4667         case GROUPP:
4668             n = ARG(scan);  /* which paren pair */
4669             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4670             break;
4671         case NGROUPP:
4672             /* reg_check_named_buff_matched returns 0 for no match */
4673             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4674             break;
4675         case INSUBP:
4676             n = ARG(scan);
4677             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4678             break;
4679         case DEFINEP:
4680             sw = 0;
4681             break;
4682         case IFTHEN:
4683             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4684             if (sw)
4685                 next = NEXTOPER(NEXTOPER(scan));
4686             else {
4687                 next = scan + ARG(scan);
4688                 if (OP(next) == IFTHEN) /* Fake one. */
4689                     next = NEXTOPER(NEXTOPER(next));
4690             }
4691             break;
4692         case LOGICAL:
4693             logical = scan->flags;
4694             break;
4695
4696 /*******************************************************************
4697
4698 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4699 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4700 STAR/PLUS/CURLY/CURLYN are used instead.)
4701
4702 A*B is compiled as <CURLYX><A><WHILEM><B>
4703
4704 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4705 state, which contains the current count, initialised to -1. It also sets
4706 cur_curlyx to point to this state, with any previous value saved in the
4707 state block.
4708
4709 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4710 since the pattern may possibly match zero times (i.e. it's a while {} loop
4711 rather than a do {} while loop).
4712
4713 Each entry to WHILEM represents a successful match of A. The count in the
4714 CURLYX block is incremented, another WHILEM state is pushed, and execution
4715 passes to A or B depending on greediness and the current count.
4716
4717 For example, if matching against the string a1a2a3b (where the aN are
4718 substrings that match /A/), then the match progresses as follows: (the
4719 pushed states are interspersed with the bits of strings matched so far):
4720
4721     <CURLYX cnt=-1>
4722     <CURLYX cnt=0><WHILEM>
4723     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4724     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4725     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4726     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4727
4728 (Contrast this with something like CURLYM, which maintains only a single
4729 backtrack state:
4730
4731     <CURLYM cnt=0> a1
4732     a1 <CURLYM cnt=1> a2
4733     a1 a2 <CURLYM cnt=2> a3
4734     a1 a2 a3 <CURLYM cnt=3> b
4735 )
4736
4737 Each WHILEM state block marks a point to backtrack to upon partial failure
4738 of A or B, and also contains some minor state data related to that
4739 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4740 overall state, such as the count, and pointers to the A and B ops.
4741
4742 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4743 must always point to the *current* CURLYX block, the rules are:
4744
4745 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4746 and set cur_curlyx to point to the new block.
4747
4748 When popping the CURLYX block after a successful or unsuccessful match,
4749 restore the previous cur_curlyx.
4750
4751 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4752 to the outer one saved in the CURLYX block.
4753
4754 When popping the WHILEM block after a successful or unsuccessful B match,
4755 restore the previous cur_curlyx.
4756
4757 Here's an example for the pattern (AI* BI)*BO
4758 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4759
4760 cur_
4761 curlyx backtrack stack
4762 ------ ---------------
4763 NULL
4764 CO     <CO prev=NULL> <WO>
4765 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4766 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4767 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4768
4769 At this point the pattern succeeds, and we work back down the stack to
4770 clean up, restoring as we go:
4771
4772 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4773 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4774 CO     <CO prev=NULL> <WO>
4775 NULL
4776
4777 *******************************************************************/
4778
4779 #define ST st->u.curlyx
4780
4781         case CURLYX:    /* start of /A*B/  (for complex A) */
4782         {
4783             /* No need to save/restore up to this paren */
4784             I32 parenfloor = scan->flags;
4785
4786             assert(next); /* keep Coverity happy */
4787             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4788                 next += ARG(next);
4789
4790             /* XXXX Probably it is better to teach regpush to support
4791                parenfloor > PL_regsize... */
4792             if (parenfloor > (I32)rex->lastparen)
4793                 parenfloor = rex->lastparen; /* Pessimization... */
4794
4795             ST.prev_curlyx= cur_curlyx;
4796             cur_curlyx = st;
4797             ST.cp = PL_savestack_ix;
4798
4799             /* these fields contain the state of the current curly.
4800              * they are accessed by subsequent WHILEMs */
4801             ST.parenfloor = parenfloor;
4802             ST.me = scan;
4803             ST.B = next;
4804             ST.minmod = minmod;
4805             minmod = 0;
4806             ST.count = -1;      /* this will be updated by WHILEM */
4807             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4808
4809             PL_reginput = locinput;
4810             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4811             assert(0); /* NOTREACHED */
4812         }
4813
4814         case CURLYX_end: /* just finished matching all of A*B */
4815             cur_curlyx = ST.prev_curlyx;
4816             sayYES;
4817             assert(0); /* NOTREACHED */
4818
4819         case CURLYX_end_fail: /* just failed to match all of A*B */
4820             regcpblow(ST.cp);
4821             cur_curlyx = ST.prev_curlyx;
4822             sayNO;
4823             assert(0); /* NOTREACHED */
4824
4825
4826 #undef ST
4827 #define ST st->u.whilem
4828
4829         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4830         {
4831             /* see the discussion above about CURLYX/WHILEM */
4832             I32 n;
4833             int min = ARG1(cur_curlyx->u.curlyx.me);
4834             int max = ARG2(cur_curlyx->u.curlyx.me);
4835             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4836
4837             assert(cur_curlyx); /* keep Coverity happy */
4838             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4839             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4840             ST.cache_offset = 0;
4841             ST.cache_mask = 0;
4842
4843             PL_reginput = locinput;
4844
4845             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4846                   "%*s  whilem: matched %ld out of %d..%d\n",
4847                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4848             );
4849
4850             /* First just match a string of min A's. */
4851
4852             if (n < min) {
4853                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4854                 cur_curlyx->u.curlyx.lastloc = locinput;
4855                 REGCP_SET(ST.lastcp);
4856
4857                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4858                 assert(0); /* NOTREACHED */
4859             }
4860
4861             /* If degenerate A matches "", assume A done. */
4862
4863             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4864                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4865                    "%*s  whilem: empty match detected, trying continuation...\n",
4866                    REPORT_CODE_OFF+depth*2, "")
4867                 );
4868                 goto do_whilem_B_max;
4869             }
4870
4871             /* super-linear cache processing */
4872
4873             if (scan->flags) {
4874
4875                 if (!PL_reg_maxiter) {
4876                     /* start the countdown: Postpone detection until we
4877                      * know the match is not *that* much linear. */
4878                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4879                     /* possible overflow for long strings and many CURLYX's */
4880                     if (PL_reg_maxiter < 0)
4881                         PL_reg_maxiter = I32_MAX;
4882                     PL_reg_leftiter = PL_reg_maxiter;
4883                 }
4884
4885                 if (PL_reg_leftiter-- == 0) {
4886                     /* initialise cache */
4887                     const I32 size = (PL_reg_maxiter + 7)/8;
4888                     if (PL_reg_poscache) {
4889                         if ((I32)PL_reg_poscache_size < size) {
4890                             Renew(PL_reg_poscache, size, char);
4891                             PL_reg_poscache_size = size;
4892                         }
4893                         Zero(PL_reg_poscache, size, char);
4894                     }
4895                     else {
4896                         PL_reg_poscache_size = size;
4897                         Newxz(PL_reg_poscache, size, char);
4898                     }
4899                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4900       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4901                               PL_colors[4], PL_colors[5])
4902                     );
4903                 }
4904
4905                 if (PL_reg_leftiter < 0) {
4906                     /* have we already failed at this position? */
4907                     I32 offset, mask;
4908                     offset  = (scan->flags & 0xf) - 1
4909                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4910                     mask    = 1 << (offset % 8);
4911                     offset /= 8;
4912                     if (PL_reg_poscache[offset] & mask) {
4913                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4914                             "%*s  whilem: (cache) already tried at this position...\n",
4915                             REPORT_CODE_OFF+depth*2, "")
4916                         );
4917                         sayNO; /* cache records failure */
4918                     }
4919                     ST.cache_offset = offset;
4920                     ST.cache_mask   = mask;
4921                 }
4922             }
4923
4924             /* Prefer B over A for minimal matching. */
4925
4926             if (cur_curlyx->u.curlyx.minmod) {
4927                 ST.save_curlyx = cur_curlyx;
4928                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4929                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4930                 REGCP_SET(ST.lastcp);
4931                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4932                 assert(0); /* NOTREACHED */
4933             }
4934
4935             /* Prefer A over B for maximal matching. */
4936
4937             if (n < max) { /* More greed allowed? */
4938                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4939                 cur_curlyx->u.curlyx.lastloc = locinput;
4940                 REGCP_SET(ST.lastcp);
4941                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4942                 assert(0); /* NOTREACHED */
4943             }
4944             goto do_whilem_B_max;
4945         }
4946         assert(0); /* NOTREACHED */
4947
4948         case WHILEM_B_min: /* just matched B in a minimal match */
4949         case WHILEM_B_max: /* just matched B in a maximal match */
4950             cur_curlyx = ST.save_curlyx;
4951             sayYES;
4952             assert(0); /* NOTREACHED */
4953
4954         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4955             cur_curlyx = ST.save_curlyx;
4956             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4957             cur_curlyx->u.curlyx.count--;
4958             CACHEsayNO;
4959             assert(0); /* NOTREACHED */
4960
4961         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4962             /* FALL THROUGH */
4963         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4964             REGCP_UNWIND(ST.lastcp);
4965             regcppop(rex);
4966             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4967             cur_curlyx->u.curlyx.count--;
4968             CACHEsayNO;
4969             assert(0); /* NOTREACHED */
4970
4971         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4972             REGCP_UNWIND(ST.lastcp);
4973             regcppop(rex);      /* Restore some previous $<digit>s? */
4974             PL_reginput = locinput;
4975             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4976                 "%*s  whilem: failed, trying continuation...\n",
4977                 REPORT_CODE_OFF+depth*2, "")
4978             );
4979           do_whilem_B_max:
4980             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4981                 && ckWARN(WARN_REGEXP)
4982                 && !(PL_reg_flags & RF_warned))
4983             {
4984                 PL_reg_flags |= RF_warned;
4985                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4986                      "Complex regular subexpression recursion limit (%d) "
4987                      "exceeded",
4988                      REG_INFTY - 1);
4989             }
4990
4991             /* now try B */
4992             ST.save_curlyx = cur_curlyx;
4993             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4994             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4995             assert(0); /* NOTREACHED */
4996
4997         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4998             cur_curlyx = ST.save_curlyx;
4999             REGCP_UNWIND(ST.lastcp);
5000             regcppop(rex);
5001
5002             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5003                 /* Maximum greed exceeded */
5004                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5005                     && ckWARN(WARN_REGEXP)
5006                     && !(PL_reg_flags & RF_warned))
5007                 {
5008                     PL_reg_flags |= RF_warned;
5009                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5010                         "Complex regular subexpression recursion "
5011                         "limit (%d) exceeded",
5012                         REG_INFTY - 1);
5013                 }
5014                 cur_curlyx->u.curlyx.count--;
5015                 CACHEsayNO;
5016             }
5017
5018             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5019                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5020             );
5021             /* Try grabbing another A and see if it helps. */
5022             PL_reginput = locinput;
5023             cur_curlyx->u.curlyx.lastloc = locinput;
5024             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5025             REGCP_SET(ST.lastcp);
5026             PUSH_STATE_GOTO(WHILEM_A_min,
5027                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5028             assert(0); /* NOTREACHED */
5029
5030 #undef  ST
5031 #define ST st->u.branch
5032
5033         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5034             next = scan + ARG(scan);
5035             if (next == scan)
5036                 next = NULL;
5037             scan = NEXTOPER(scan);
5038             /* FALL THROUGH */
5039
5040         case BRANCH:        /*  /(...|A|...)/ */
5041             scan = NEXTOPER(scan); /* scan now points to inner node */
5042             ST.lastparen = rex->lastparen;
5043             ST.lastcloseparen = rex->lastcloseparen;
5044             ST.next_branch = next;
5045             REGCP_SET(ST.cp);
5046             PL_reginput = locinput;
5047
5048             /* Now go into the branch */
5049             if (has_cutgroup) {
5050                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5051             } else {
5052                 PUSH_STATE_GOTO(BRANCH_next, scan);
5053             }
5054             assert(0); /* NOTREACHED */
5055         case CUTGROUP:
5056             PL_reginput = locinput;
5057             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5058                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5059             PUSH_STATE_GOTO(CUTGROUP_next,next);
5060             assert(0); /* NOTREACHED */
5061         case CUTGROUP_next_fail:
5062             do_cutgroup = 1;
5063             no_final = 1;
5064             if (st->u.mark.mark_name)
5065                 sv_commit = st->u.mark.mark_name;
5066             sayNO;
5067             assert(0); /* NOTREACHED */
5068         case BRANCH_next:
5069             sayYES;
5070             assert(0); /* NOTREACHED */
5071         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5072             if (do_cutgroup) {
5073                 do_cutgroup = 0;
5074                 no_final = 0;
5075             }
5076             REGCP_UNWIND(ST.cp);
5077             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5078             scan = ST.next_branch;
5079             /* no more branches? */
5080             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5081                 DEBUG_EXECUTE_r({
5082                     PerlIO_printf( Perl_debug_log,
5083                         "%*s  %sBRANCH failed...%s\n",
5084                         REPORT_CODE_OFF+depth*2, "",
5085                         PL_colors[4],
5086                         PL_colors[5] );
5087                 });
5088                 sayNO_SILENT;
5089             }
5090             continue; /* execute next BRANCH[J] op */
5091             assert(0); /* NOTREACHED */
5092
5093         case MINMOD:
5094             minmod = 1;
5095             break;
5096
5097 #undef  ST
5098 #define ST st->u.curlym
5099
5100         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5101
5102             /* This is an optimisation of CURLYX that enables us to push
5103              * only a single backtracking state, no matter how many matches
5104              * there are in {m,n}. It relies on the pattern being constant
5105              * length, with no parens to influence future backrefs
5106              */
5107
5108             ST.me = scan;
5109             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5110
5111             ST.lastparen      = rex->lastparen;
5112             ST.lastcloseparen = rex->lastcloseparen;
5113
5114             /* if paren positive, emulate an OPEN/CLOSE around A */
5115             if (ST.me->flags) {
5116                 U32 paren = ST.me->flags;
5117                 if (paren > PL_regsize)
5118                     PL_regsize = paren;
5119                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5120             }
5121             ST.A = scan;
5122             ST.B = next;
5123             ST.alen = 0;
5124             ST.count = 0;
5125             ST.minmod = minmod;
5126             minmod = 0;
5127             ST.c1 = CHRTEST_UNINIT;
5128             REGCP_SET(ST.cp);
5129
5130             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5131                 goto curlym_do_B;
5132
5133           curlym_do_A: /* execute the A in /A{m,n}B/  */
5134             PL_reginput = locinput;
5135             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5136             assert(0); /* NOTREACHED */
5137
5138         case CURLYM_A: /* we've just matched an A */
5139             locinput = st->locinput;
5140             nextchr = UCHARAT(locinput);
5141
5142             ST.count++;
5143             /* after first match, determine A's length: u.curlym.alen */
5144             if (ST.count == 1) {
5145                 if (PL_reg_match_utf8) {
5146                     char *s = locinput;
5147                     while (s < PL_reginput) {
5148                         ST.alen++;
5149                         s += UTF8SKIP(s);
5150                     }
5151                 }
5152                 else {
5153                     ST.alen = PL_reginput - locinput;
5154                 }
5155                 if (ST.alen == 0)
5156                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5157             }
5158             DEBUG_EXECUTE_r(
5159                 PerlIO_printf(Perl_debug_log,
5160                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5161                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5162                           (IV) ST.count, (IV)ST.alen)
5163             );
5164
5165             locinput = PL_reginput;
5166
5167             if (cur_eval && cur_eval->u.eval.close_paren &&
5168                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5169                 goto fake_end;
5170
5171             {
5172                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5173                 if ( max == REG_INFTY || ST.count < max )
5174                     goto curlym_do_A; /* try to match another A */
5175             }
5176             goto curlym_do_B; /* try to match B */
5177
5178         case CURLYM_A_fail: /* just failed to match an A */
5179             REGCP_UNWIND(ST.cp);
5180
5181             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5182                 || (cur_eval && cur_eval->u.eval.close_paren &&
5183                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5184                 sayNO;
5185
5186           curlym_do_B: /* execute the B in /A{m,n}B/  */
5187             PL_reginput = locinput;
5188             if (ST.c1 == CHRTEST_UNINIT) {
5189                 /* calculate c1 and c2 for possible match of 1st char
5190                  * following curly */
5191                 ST.c1 = ST.c2 = CHRTEST_VOID;
5192                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5193                     regnode *text_node = ST.B;
5194                     if (! HAS_TEXT(text_node))
5195                         FIND_NEXT_IMPT(text_node);
5196                     /* this used to be
5197
5198                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5199
5200                         But the former is redundant in light of the latter.
5201
5202                         If this changes back then the macros for
5203                         IS_TEXT and friends need to change.
5204                      */
5205                     if (PL_regkind[OP(text_node)] == EXACT)
5206                     {
5207
5208                         ST.c1 = (U8)*STRING(text_node);
5209                         switch (OP(text_node)) {
5210                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5211                             case EXACTFA:
5212                             case EXACTFU_SS:
5213                             case EXACTFU_TRICKYFOLD:
5214                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5215                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5216                             default: ST.c2 = ST.c1;
5217                         }
5218                     }
5219                 }
5220             }
5221
5222             DEBUG_EXECUTE_r(
5223                 PerlIO_printf(Perl_debug_log,
5224                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5225                     (int)(REPORT_CODE_OFF+(depth*2)),
5226                     "", (IV)ST.count)
5227                 );
5228             if (ST.c1 != CHRTEST_VOID
5229                     && UCHARAT(PL_reginput) != ST.c1
5230                     && UCHARAT(PL_reginput) != ST.c2)
5231             {
5232                 /* simulate B failing */
5233                 DEBUG_OPTIMISE_r(
5234                     PerlIO_printf(Perl_debug_log,
5235                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5236                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5237                         (IV)ST.c1,(IV)ST.c2
5238                 ));
5239                 state_num = CURLYM_B_fail;
5240                 goto reenter_switch;
5241             }
5242
5243             if (ST.me->flags) {
5244                 /* emulate CLOSE: mark current A as captured */
5245                 I32 paren = ST.me->flags;
5246                 if (ST.count) {
5247                     rex->offs[paren].start
5248                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5249                     rex->offs[paren].end = PL_reginput - PL_bostr;
5250                     if ((U32)paren > rex->lastparen)
5251                         rex->lastparen = paren;
5252                     rex->lastcloseparen = paren;
5253                 }
5254                 else
5255                     rex->offs[paren].end = -1;
5256                 if (cur_eval && cur_eval->u.eval.close_paren &&
5257                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5258                 {
5259                     if (ST.count)
5260                         goto fake_end;
5261                     else
5262                         sayNO;
5263                 }
5264             }
5265
5266             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5267             assert(0); /* NOTREACHED */
5268
5269         case CURLYM_B_fail: /* just failed to match a B */
5270             REGCP_UNWIND(ST.cp);
5271             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5272             if (ST.minmod) {
5273                 I32 max = ARG2(ST.me);
5274                 if (max != REG_INFTY && ST.count == max)
5275                     sayNO;
5276                 goto curlym_do_A; /* try to match a further A */
5277             }
5278             /* backtrack one A */
5279             if (ST.count == ARG1(ST.me) /* min */)
5280                 sayNO;
5281             ST.count--;
5282             locinput = HOPc(locinput, -ST.alen);
5283             goto curlym_do_B; /* try to match B */
5284
5285 #undef ST
5286 #define ST st->u.curly
5287
/* CURLY_SETPAREN(paren, success) -- capture-group bookkeeping used by the
 * CURLY states (ST is st->u.curly at this point in the file).
 *
 *   paren   - capture group number; when it is 0 the macro does nothing.
 *   success - non-zero: record the group as running from
 *             HOPc(locinput, -1) to locinput (both stored as offsets from
 *             PL_bostr), raise rex->lastparen if paren is larger, and set
 *             rex->lastcloseparen to paren.
 *             zero: set the group's end offset to -1 and restore
 *             rex->lastparen / rex->lastcloseparen from the copies saved
 *             in ST.
 *
 * paren is evaluated several times, so pass a side-effect-free expression.
 * The expansion relies on rex, locinput, PL_bostr and ST being in scope at
 * the point of use.
 *
 * The body is wrapped in do { } while (0) so that "CURLY_SETPAREN(p, s);"
 * is exactly one statement and therefore safe as the body of an unbraced
 * if/else (a bare "if (...) { }" expansion would capture a following else).
 */
#define CURLY_SETPAREN(paren, success) \
    do { \
        if (paren) { \
            if (success) { \
                rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
                rex->offs[paren].end = locinput - PL_bostr; \
                if ((paren) > rex->lastparen) \
                    rex->lastparen = (paren); \
                rex->lastcloseparen = (paren); \
            } \
            else { \
                rex->offs[paren].end = -1; \
                rex->lastparen      = ST.lastparen; \
                rex->lastcloseparen = ST.lastcloseparen; \
            } \
        } \
    } while (0)
5303
5304         case STAR:              /*  /A*B/ where A is width 1 */
5305             ST.paren = 0;
5306             ST.min = 0;
5307             ST.max = REG_INFTY;
5308             scan = NEXTOPER(scan);
5309             goto repeat;
5310         case PLUS:              /*  /A+B/ where A is width 1 */
5311             ST.paren = 0;
5312             ST.min = 1;
5313             ST.max = REG_INFTY;
5314             scan = NEXTOPER(scan);
5315             goto repeat;
5316         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5317             ST.paren = scan->flags;     /* Which paren to set */
5318             ST.lastparen      = rex->lastparen;
5319             ST.lastcloseparen = rex->lastcloseparen;
5320             if (ST.paren > PL_regsize)
5321                 PL_regsize = ST.paren;
5322             ST.min = ARG1(scan);  /* min to match */
5323             ST.max = ARG2(scan);  /* max to match */
5324             if (cur_eval && cur_eval->u.eval.close_paren &&
5325                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5326                 ST.min=1;
5327                 ST.max=1;
5328             }
5329             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5330             goto repeat;
5331         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5332             ST.paren = 0;
5333             ST.min = ARG1(scan);  /* min to match */
5334             ST.max = ARG2(scan);  /* max to match */
5335             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5336           repeat:
5337             /*
5338             * Lookahead to avoid useless match attempts
5339             * when we know what character comes next.
5340             *
5341             * Used to only do .*x and .*?x, but now it allows
5342             * for )'s, ('s and (?{ ... })'s to be in the way
5343             * of the quantifier and the EXACT-like node.  -- japhy
5344             */
5345
5346             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5347                 sayNO;
5348             if (HAS_TEXT(next) || JUMPABLE(next)) {
5349                 U8 *s;
5350                 regnode *text_node = next;
5351
5352                 if (! HAS_TEXT(text_node))
5353                     FIND_NEXT_IMPT(text_node);
5354
5355                 if (! HAS_TEXT(text_node))
5356                     ST.c1 = ST.c2 = CHRTEST_VOID;
5357                 else {
5358                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5359                         ST.c1 = ST.c2 = CHRTEST_VOID;
5360                         goto assume_ok_easy;
5361                     }
5362                     else
5363                         s = (U8*)STRING(text_node);
5364
5365                     /*  Currently we only get here when
5366
5367                         PL_regkind[OP(text_node)] == EXACT
5368
5369                         If this changes back then the macros for IS_TEXT and
5370                         friends need to change. */
5371                     if (!UTF_PATTERN) {
5372                         ST.c1 = *s;
5373                         switch (OP(text_node)) {
5374                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5375                             case EXACTFA:
5376                             case EXACTFU_SS:
5377                             case EXACTFU_TRICKYFOLD:
5378                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5379                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5380                             default: ST.c2 = ST.c1; break;
5381                         }
5382                     }
5383                     else { /* UTF_PATTERN */
5384                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5385                              STRLEN ulen;
5386                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5387
5388                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5389                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5390                                                     uniflags);
5391                         }
5392                         else {
5393                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5394                                                      uniflags);
5395                         }
5396                     }
5397                 }
5398             }
5399             else
5400                 ST.c1 = ST.c2 = CHRTEST_VOID;
5401         assume_ok_easy:
5402
5403             ST.A = scan;
5404             ST.B = next;
5405             PL_reginput = locinput;
5406             if (minmod) {
5407                 minmod = 0;
5408                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5409                     sayNO;
5410                 ST.count = ST.min;
5411                 locinput = PL_reginput;
5412                 REGCP_SET(ST.cp);
5413                 if (ST.c1 == CHRTEST_VOID)
5414                     goto curly_try_B_min;
5415
5416                 ST.oldloc = locinput;
5417
5418                 /* set ST.maxpos to the furthest point along the
5419                  * string that could possibly match */
5420                 if  (ST.max == REG_INFTY) {
5421                     ST.maxpos = PL_regeol - 1;
5422                     if (utf8_target)
5423                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5424                             ST.maxpos--;
5425                 }
5426                 else if (utf8_target) {
5427                     int m = ST.max - ST.min;
5428                     for (ST.maxpos = locinput;
5429                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5430                         ST.maxpos += UTF8SKIP(ST.maxpos);
5431                 }
5432                 else {
5433                     ST.maxpos = locinput + ST.max - ST.min;
5434                     if (ST.maxpos >= PL_regeol)
5435                         ST.maxpos = PL_regeol - 1;
5436                 }
5437                 goto curly_try_B_min_known;
5438
5439             }
5440             else {
5441                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5442                 locinput = PL_reginput;
5443                 if (ST.count < ST.min)
5444                     sayNO;
5445                 if ((ST.count > ST.min)
5446                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5447                 {
5448                     /* A{m,n} must come at the end of the string, there's
5449                      * no point in backing off ... */
5450                     ST.min = ST.count;
5451                     /* ...except that $ and \Z can match before *and* after
5452                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5453                        We may back off by one in this case. */
5454                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5455                         ST.min--;
5456                 }
5457                 REGCP_SET(ST.cp);
5458                 goto curly_try_B_max;
5459             }
5460             assert(0); /* NOTREACHED */
5461
5462
5463         case CURLY_B_min_known_fail:
5464             /* failed to find B in a non-greedy match where c1,c2 valid */
5465
5466             PL_reginput = locinput;     /* Could be reset... */
5467             REGCP_UNWIND(ST.cp);
5468             if (ST.paren) {
5469                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5470             }
5471             /* Couldn't or didn't -- move forward. */
5472             ST.oldloc = locinput;
5473             if (utf8_target)
5474                 locinput += UTF8SKIP(locinput);
5475             else
5476                 locinput++;
5477             ST.count++;
5478           curly_try_B_min_known:
5479              /* find the next place where 'B' could work, then call B */
5480             {
5481                 int n;
5482                 if (utf8_target) {
5483                     n = (ST.oldloc == locinput) ? 0 : 1;
5484                     if (ST.c1 == ST.c2) {
5485                         STRLEN len;
5486                         /* set n to utf8_distance(oldloc, locinput) */
5487                         while (locinput <= ST.maxpos &&
5488                                utf8n_to_uvchr((U8*)locinput,
5489                                               UTF8_MAXBYTES, &len,
5490                                               uniflags) != (UV)ST.c1) {
5491                             locinput += len;
5492                             n++;
5493                         }
5494                     }
5495                     else {
5496                         /* set n to utf8_distance(oldloc, locinput) */
5497                         while (locinput <= ST.maxpos) {
5498                             STRLEN len;
5499                             const UV c = utf8n_to_uvchr((U8*)locinput,
5500                                                   UTF8_MAXBYTES, &len,
5501                                                   uniflags);
5502                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5503                                 break;
5504                             locinput += len;
5505                             n++;
5506                         }
5507                     }
5508                 }
5509                 else {
5510                     if (ST.c1 == ST.c2) {
5511                         while (locinput <= ST.maxpos &&
5512                                UCHARAT(locinput) != ST.c1)
5513                             locinput++;
5514                     }
5515                     else {
5516                         while (locinput <= ST.maxpos
5517                                && UCHARAT(locinput) != ST.c1
5518                                && UCHARAT(locinput) != ST.c2)
5519                             locinput++;
5520                     }
5521                     n = locinput - ST.oldloc;
5522                 }
5523                 if (locinput > ST.maxpos)
5524                     sayNO;
5525                 /* PL_reginput == oldloc now */
5526                 if (n) {
5527                     ST.count += n;
5528                     if (regrepeat(rex, ST.A, n, depth) < n)
5529                         sayNO;
5530                 }
5531                 PL_reginput = locinput;
5532                 CURLY_SETPAREN(ST.paren, ST.count);
5533                 if (cur_eval && cur_eval->u.eval.close_paren &&
5534                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5535                     goto fake_end;
5536                 }
5537                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5538             }
5539             assert(0); /* NOTREACHED */
5540
5541
5542         case CURLY_B_min_fail:
5543             /* failed to find B in a non-greedy match where c1,c2 invalid */
5544
5545             REGCP_UNWIND(ST.cp);
5546             if (ST.paren) {
5547                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5548             }
5549             /* failed -- move forward one */
5550             PL_reginput = locinput;
5551             if (regrepeat(rex, ST.A, 1, depth)) {
5552                 ST.count++;
5553                 locinput = PL_reginput;
5554                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5555                         ST.count > 0)) /* count overflow ? */
5556                 {
5557                   curly_try_B_min:
5558                     CURLY_SETPAREN(ST.paren, ST.count);
5559                     if (cur_eval && cur_eval->u.eval.close_paren &&
5560                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5561                         goto fake_end;
5562                     }
5563                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5564                 }
5565             }
5566             sayNO;
5567             assert(0); /* NOTREACHED */
5568
5569
5570         curly_try_B_max:
5571             /* a successful greedy match: now try to match B */
5572             if (cur_eval && cur_eval->u.eval.close_paren &&
5573                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5574                 goto fake_end;
5575             }
5576             {
5577                 UV c = 0;
5578                 if (ST.c1 != CHRTEST_VOID)
5579                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5580                                            UTF8_MAXBYTES, 0, uniflags)
5581                                 : (UV) UCHARAT(PL_reginput);
5582                 /* If it could work, try it. */
5583                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5584                     CURLY_SETPAREN(ST.paren, ST.count);
5585                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5586                     assert(0); /* NOTREACHED */
5587                 }
5588             }
5589             /* FALL THROUGH */
5590         case CURLY_B_max_fail:
5591             /* failed to find B in a greedy match */
5592
5593             REGCP_UNWIND(ST.cp);
5594             if (ST.paren) {
5595                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5596             }
5597             /*  back up. */
5598             if (--ST.count < ST.min)
5599                 sayNO;
5600             PL_reginput = locinput = HOPc(locinput, -1);
5601             goto curly_try_B_max;
5602
5603 #undef ST
5604
5605         case END:
5606             fake_end:
5607             if (cur_eval) {
5608                 /* we've just finished A in /(??{A})B/; now continue with B */
5609                 st->u.eval.toggle_reg_flags
5610                             = cur_eval->u.eval.toggle_reg_flags;
5611                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5612
5613                 st->u.eval.prev_rex = rex_sv;           /* inner */
5614                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5615                 rex_sv = cur_eval->u.eval.prev_rex;
5616                 SET_reg_curpm(rex_sv);
5617                 rex = (struct regexp *)SvANY(rex_sv);
5618                 rexi = RXi_GET(rex);
5619                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5620
5621                 REGCP_SET(st->u.eval.lastcp);
5622                 PL_reginput = locinput;
5623
5624                 /* Restore parens of the outer rex without popping the
5625                  * savestack */
5626                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5627
5628                 st->u.eval.prev_eval = cur_eval;
5629                 cur_eval = cur_eval->u.eval.prev_eval;
5630                 DEBUG_EXECUTE_r(
5631                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5632                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5633                 if ( nochange_depth )
5634                     nochange_depth--;
5635
5636                 PUSH_YES_STATE_GOTO(EVAL_AB,
5637                         st->u.eval.prev_eval->u.eval.B); /* match B */
5638             }
5639
5640             if (locinput < reginfo->till) {
5641                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5642                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5643                                       PL_colors[4],
5644                                       (long)(locinput - PL_reg_starttry),
5645                                       (long)(reginfo->till - PL_reg_starttry),
5646                                       PL_colors[5]));
5647
5648                 sayNO_SILENT;           /* Cannot match: too short. */
5649             }
5650             PL_reginput = locinput;     /* put where regtry can find it */
5651             sayYES;                     /* Success! */
5652
5653         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5654             DEBUG_EXECUTE_r(
5655             PerlIO_printf(Perl_debug_log,
5656                 "%*s  %ssubpattern success...%s\n",
5657                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5658             PL_reginput = locinput;     /* put where regtry can find it */
5659             sayYES;                     /* Success! */
5660
5661 #undef  ST
5662 #define ST st->u.ifmatch
5663
5664         case SUSPEND:   /* (?>A) */
5665             ST.wanted = 1;
5666             PL_reginput = locinput;
5667             goto do_ifmatch;
5668
5669         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5670             ST.wanted = 0;
5671             goto ifmatch_trivial_fail_test;
5672
5673         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5674             ST.wanted = 1;
5675           ifmatch_trivial_fail_test:
5676             if (scan->flags) {
5677                 char * const s = HOPBACKc(locinput, scan->flags);
5678                 if (!s) {
5679                     /* trivial fail */
5680                     if (logical) {
5681                         logical = 0;
5682                         sw = 1 - cBOOL(ST.wanted);
5683                     }
5684                     else if (ST.wanted)
5685                         sayNO;
5686                     next = scan + ARG(scan);
5687                     if (next == scan)
5688                         next = NULL;
5689                     break;
5690                 }
5691                 PL_reginput = s;
5692             }
5693             else
5694                 PL_reginput = locinput;
5695
5696           do_ifmatch:
5697             ST.me = scan;
5698             ST.logical = logical;
5699             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5700
5701             /* execute body of (?...A) */
5702             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5703             assert(0); /* NOTREACHED */
5704
5705         case IFMATCH_A_fail: /* body of (?...A) failed */
5706             ST.wanted = !ST.wanted;
5707             /* FALL THROUGH */
5708
5709         case IFMATCH_A: /* body of (?...A) succeeded */
5710             if (ST.logical) {
5711                 sw = cBOOL(ST.wanted);
5712             }
5713             else if (!ST.wanted)
5714                 sayNO;
5715
5716             if (OP(ST.me) == SUSPEND)
5717                 locinput = PL_reginput;
5718             else {
5719                 locinput = PL_reginput = st->locinput;
5720                 nextchr = UCHARAT(locinput);
5721             }
5722             scan = ST.me + ARG(ST.me);
5723             if (scan == ST.me)
5724                 scan = NULL;
5725             continue; /* execute B */
5726
5727 #undef ST
5728
5729         case LONGJMP:
5730             next = scan + ARG(scan);
5731             if (next == scan)
5732                 next = NULL;
5733             break;
5734         case COMMIT:
5735             reginfo->cutpoint = PL_regeol;
5736             /* FALLTHROUGH */
5737         case PRUNE:
5738             PL_reginput = locinput;
5739             if (!scan->flags)
5740                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5741             PUSH_STATE_GOTO(COMMIT_next,next);
5742             assert(0); /* NOTREACHED */
5743         case COMMIT_next_fail:
5744             no_final = 1;
5745             /* FALLTHROUGH */
5746         case OPFAIL:
5747             sayNO;
5748             assert(0); /* NOTREACHED */
5749
5750 #define ST st->u.mark
5751         case MARKPOINT:
5752             ST.prev_mark = mark_state;
5753             ST.mark_name = sv_commit = sv_yes_mark
5754                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5755             mark_state = st;
5756             ST.mark_loc = PL_reginput = locinput;
5757             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5758             assert(0); /* NOTREACHED */
5759         case MARKPOINT_next:
5760             mark_state = ST.prev_mark;
5761             sayYES;
5762             assert(0); /* NOTREACHED */
5763         case MARKPOINT_next_fail:
5764             if (popmark && sv_eq(ST.mark_name,popmark))
5765             {
5766                 if (ST.mark_loc > startpoint)
5767                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5768                 popmark = NULL; /* we found our mark */
5769                 sv_commit = ST.mark_name;
5770
5771                 DEBUG_EXECUTE_r({
5772                         PerlIO_printf(Perl_debug_log,
5773                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5774                             REPORT_CODE_OFF+depth*2, "",
5775                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5776                 });
5777             }
5778             mark_state = ST.prev_mark;
5779             sv_yes_mark = mark_state ?
5780                 mark_state->u.mark.mark_name : NULL;
5781             sayNO;
5782             assert(0); /* NOTREACHED */
5783         case SKIP:
5784             PL_reginput = locinput;
5785             if (scan->flags) {
5786                 /* (*SKIP) : if we fail we cut here*/
5787                 ST.mark_name = NULL;
5788                 ST.mark_loc = locinput;
5789                 PUSH_STATE_GOTO(SKIP_next,next);
5790             } else {
5791                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5792                    otherwise do nothing.  Meaning we need to scan
5793                  */
5794                 regmatch_state *cur = mark_state;
5795                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5796
5797                 while (cur) {
5798                     if ( sv_eq( cur->u.mark.mark_name,
5799                                 find ) )
5800                     {
5801                         ST.mark_name = find;
5802                         PUSH_STATE_GOTO( SKIP_next, next );
5803                     }
5804                     cur = cur->u.mark.prev_mark;
5805                 }
5806             }
5807             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5808             break;
5809         case SKIP_next_fail:
5810             if (ST.mark_name) {
5811                 /* (*CUT:NAME) - Set up to search for the name as we
5812                    collapse the stack*/
5813                 popmark = ST.mark_name;
5814             } else {
5815                 /* (*CUT) - No name, we cut here.*/
5816                 if (ST.mark_loc > startpoint)
5817                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5818                 /* but we set sv_commit to latest mark_name if there
5819                    is one so they can test to see how things lead to this
5820                    cut */
5821                 if (mark_state)
5822                     sv_commit=mark_state->u.mark.mark_name;
5823             }
5824             no_final = 1;
5825             sayNO;
5826             assert(0); /* NOTREACHED */
5827 #undef ST
5828         case LNBREAK:
5829             if ((n=is_LNBREAK(locinput,utf8_target))) {
5830                 locinput += n;
5831                 nextchr = UCHARAT(locinput);
5832             } else
5833                 sayNO;
5834             break;
5835
5836 #define CASE_CLASS(nAmE)                              \
5837         case nAmE:                                    \
5838             if (locinput >= PL_regeol)                \
5839                 sayNO;                                \
5840             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5841                 locinput += n;                        \
5842                 nextchr = UCHARAT(locinput);          \
5843             } else                                    \
5844                 sayNO;                                \
5845             break;                                    \
5846         case N##nAmE:                                 \
5847             if (locinput >= PL_regeol)                \
5848                 sayNO;                                \
5849             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5850                 sayNO;                                \
5851             } else {                                  \
5852                 locinput += UTF8SKIP(locinput);       \
5853                 nextchr = UCHARAT(locinput);          \
5854             }                                         \
5855             break
5856
5857         CASE_CLASS(VERTWS);
5858         CASE_CLASS(HORIZWS);
5859 #undef CASE_CLASS
5860
5861         default:
5862             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5863                           PTR2UV(scan), OP(scan));
5864             Perl_croak(aTHX_ "regexp memory corruption");
5865
5866         } /* end switch */
5867
5868         /* switch break jumps here */
5869         scan = next; /* prepare to execute the next op and ... */
5870         continue;    /* ... jump back to the top, reusing st */
5871         assert(0); /* NOTREACHED */
5872
5873       push_yes_state:
5874         /* push a state that backtracks on success */
5875         st->u.yes.prev_yes_state = yes_state;
5876         yes_state = st;
5877         /* FALL THROUGH */
5878       push_state:
5879         /* push a new regex state, then continue at scan  */
5880         {
5881             regmatch_state *newst;
5882
5883             DEBUG_STACK_r({
5884                 regmatch_state *cur = st;
5885                 regmatch_state *curyes = yes_state;
5886                 int curd = depth;
5887                 regmatch_slab *slab = PL_regmatch_slab;
5888                 for (;curd > -1;cur--,curd--) {
5889                     if (cur < SLAB_FIRST(slab)) {
5890                         slab = slab->prev;
5891                         cur = SLAB_LAST(slab);
5892                     }
5893                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5894                         REPORT_CODE_OFF + 2 + depth * 2,"",
5895                         curd, PL_reg_name[cur->resume_state],
5896                         (curyes == cur) ? "yes" : ""
5897                     );
5898                     if (curyes == cur)
5899                         curyes = cur->u.yes.prev_yes_state;
5900                 }
5901             } else
5902                 DEBUG_STATE_pp("push")
5903             );
5904             depth++;
5905             st->locinput = locinput;
5906             newst = st+1;
5907             if (newst >  SLAB_LAST(PL_regmatch_slab))
5908                 newst = S_push_slab(aTHX);
5909             PL_regmatch_state = newst;
5910
5911             locinput = PL_reginput;
5912             nextchr = UCHARAT(locinput);
5913             st = newst;
5914             continue;
5915             assert(0); /* NOTREACHED */
5916         }
5917     }
5918
5919     /*
5920     * We get here only if there's trouble -- normally "case END" is
5921     * the terminating point.
5922     */
5923     Perl_croak(aTHX_ "corrupted regexp pointers");
5924     /*NOTREACHED*/
5925     sayNO;
5926
5927 yes:
5928     if (yes_state) {
5929         /* we have successfully completed a subexpression, but we must now
5930          * pop to the state marked by yes_state and continue from there */
5931         assert(st != yes_state);
5932 #ifdef DEBUGGING
5933         while (st != yes_state) {
5934             st--;
5935             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5936                 PL_regmatch_slab = PL_regmatch_slab->prev;
5937                 st = SLAB_LAST(PL_regmatch_slab);
5938             }
5939             DEBUG_STATE_r({
5940                 if (no_final) {
5941                     DEBUG_STATE_pp("pop (no final)");
5942                 } else {
5943                     DEBUG_STATE_pp("pop (yes)");
5944                 }
5945             });
5946             depth--;
5947         }
5948 #else
5949         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5950             || yes_state > SLAB_LAST(PL_regmatch_slab))
5951         {
5952             /* not in this slab, pop slab */
5953             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5954             PL_regmatch_slab = PL_regmatch_slab->prev;
5955             st = SLAB_LAST(PL_regmatch_slab);
5956         }
5957         depth -= (st - yes_state);
5958 #endif
5959         st = yes_state;
5960         yes_state = st->u.yes.prev_yes_state;
5961         PL_regmatch_state = st;
5962
5963         if (no_final) {
5964             locinput= st->locinput;
5965             nextchr = UCHARAT(locinput);
5966         }
5967         state_num = st->resume_state + no_final;
5968         goto reenter_switch;
5969     }
5970
5971     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5972                           PL_colors[4], PL_colors[5]));
5973
5974     if (PL_reg_state.re_state_eval_setup_done) {
5975         /* each successfully executed (?{...}) block does the equivalent of
5976          *   local $^R = do {...}
5977          * When popping the save stack, all these locals would be undone;
5978          * bypass this by setting the outermost saved $^R to the latest
5979          * value */
5980         if (oreplsv != GvSV(PL_replgv))
5981             sv_setsv(oreplsv, GvSV(PL_replgv));
5982     }
5983     result = 1;
5984     goto final_exit;
5985
5986 no:
5987     DEBUG_EXECUTE_r(
5988         PerlIO_printf(Perl_debug_log,
5989             "%*s  %sfailed...%s\n",
5990             REPORT_CODE_OFF+depth*2, "",
5991             PL_colors[4], PL_colors[5])
5992         );
5993
5994 no_silent:
5995     if (no_final) {
5996         if (yes_state) {
5997             goto yes;
5998         } else {
5999             goto final_exit;
6000         }
6001     }
6002     if (depth) {
6003         /* there's a previous state to backtrack to */
6004         st--;
6005         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6006             PL_regmatch_slab = PL_regmatch_slab->prev;
6007             st = SLAB_LAST(PL_regmatch_slab);
6008         }
6009         PL_regmatch_state = st;
6010         locinput= st->locinput;
6011         nextchr = UCHARAT(locinput);
6012
6013         DEBUG_STATE_pp("pop");
6014         depth--;
6015         if (yes_state == st)
6016             yes_state = st->u.yes.prev_yes_state;
6017
6018         state_num = st->resume_state + 1; /* failure = success + 1 */
6019         goto reenter_switch;
6020     }
6021     result = 0;
6022
6023   final_exit:
6024     if (rex->intflags & PREGf_VERBARG_SEEN) {
6025         SV *sv_err = get_sv("REGERROR", 1);
6026         SV *sv_mrk = get_sv("REGMARK", 1);
6027         if (result) {
6028             sv_commit = &PL_sv_no;
6029             if (!sv_yes_mark)
6030                 sv_yes_mark = &PL_sv_yes;
6031         } else {
6032             if (!sv_commit)
6033                 sv_commit = &PL_sv_yes;
6034             sv_yes_mark = &PL_sv_no;
6035         }
6036         sv_setsv(sv_err, sv_commit);
6037         sv_setsv(sv_mrk, sv_yes_mark);
6038     }
6039
6040
6041     if (last_pushed_cv) {
6042         dSP;
6043         POP_MULTICALL;
6044         PERL_UNUSED_VAR(SP);
6045     }
6046
6047     /* clean up; in particular, free all slabs above current one */
6048     LEAVE_SCOPE(oldsave);
6049
6050     return result;
6051 }
6052
6053 /*
6054  - regrepeat - repeatedly match something simple, report how many
6055  */
6056 /*
6057  * [This routine assumes that it only matches things of length 1.  That was
6058  * true before too, but the count is now taken as scan - PL_reginput rather
6059  * than incremented per character.  [Except utf8, which tallies hardcount.]]
6060  */
6061 STATIC I32
6062 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6063 {
6064     dVAR;
6065     register char *scan;
6066     register I32 c;
6067     register char *loceol = PL_regeol;
6068     register I32 hardcount = 0;
6069     register bool utf8_target = PL_reg_match_utf8;
6070     UV utf8_flags;
6071 #ifndef DEBUGGING
6072     PERL_UNUSED_ARG(depth);
6073 #endif
6074
6075     PERL_ARGS_ASSERT_REGREPEAT;
6076
6077     scan = PL_reginput;
6078     if (max == REG_INFTY)
6079         max = I32_MAX;
6080     else if (max < loceol - scan)
6081         loceol = scan + max;
6082     switch (OP(p)) {
6083     case REG_ANY:
6084         if (utf8_target) {
6085             loceol = PL_regeol;
6086             while (scan < loceol && hardcount < max && *scan != '\n') {
6087                 scan += UTF8SKIP(scan);
6088                 hardcount++;
6089             }
6090         } else {
6091             while (scan < loceol && *scan != '\n')
6092                 scan++;
6093         }
6094         break;
6095     case SANY:
6096         if (utf8_target) {
6097             loceol = PL_regeol;
6098             while (scan < loceol && hardcount < max) {
6099                 scan += UTF8SKIP(scan);
6100                 hardcount++;
6101             }
6102         }
6103         else
6104             scan = loceol;
6105         break;
6106     case CANY:
6107         scan = loceol;
6108         break;
6109     case EXACT:
6110         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6111          * means they match only characters in the string that can be expressed
6112          * as a single byte.  For non-utf8 strings, that means a simple match.
6113          * For utf8 strings, the character matched must be an invariant, or
6114          * downgradable to a single byte.  The pattern's utf8ness is
6115          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6116          * it is, it's an invariant */
6117
6118         c = (U8)*STRING(p);
6119         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6120
6121         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6122             while (scan < loceol && UCHARAT(scan) == c) {
6123                 scan++;
6124             }
6125         }
6126         else {
6127
6128             /* Here, the string is utf8, and the pattern char is different
6129              * in utf8 than not, so can't compare them directly.  Outside the
6130              * loop, find the two utf8 bytes that represent c, and then
6131              * look for those in sequence in the utf8 string */
6132             U8 high = UTF8_TWO_BYTE_HI(c);
6133             U8 low = UTF8_TWO_BYTE_LO(c);
6134             loceol = PL_regeol;
6135
6136             while (hardcount < max
6137                     && scan + 1 < loceol
6138                     && UCHARAT(scan) == high
6139                     && UCHARAT(scan + 1) == low)
6140             {
6141                 scan += 2;
6142                 hardcount++;
6143             }
6144         }
6145         break;
6146     case EXACTFA:
6147         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6148         goto do_exactf;
6149
6150     case EXACTFL:
6151         PL_reg_flags |= RF_tainted;
6152         utf8_flags = FOLDEQ_UTF8_LOCALE;
6153         goto do_exactf;
6154
6155     case EXACTF:
6156             utf8_flags = 0;
6157             goto do_exactf;
6158
6159     case EXACTFU_SS:
6160     case EXACTFU_TRICKYFOLD:
6161     case EXACTFU:
6162         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6163
6164         /* The comments for the EXACT case above apply as well to these fold
6165          * ones */
6166
6167     do_exactf:
6168         c = (U8)*STRING(p);
6169         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6170
6171         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6172             char *tmpeol = loceol;
6173             while (hardcount < max
6174                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6175                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6176             {
6177                 scan = tmpeol;
6178                 tmpeol = loceol;
6179                 hardcount++;
6180             }
6181
6182             /* XXX Note that the above handles properly the German sharp s in
6183              * the pattern matching ss in the string.  But it doesn't handle
6184              * properly cases where the string contains say 'LIGATURE ff' and
6185              * the pattern is 'f+'.  This would require, say, a new function or
6186              * revised interface to foldEQ_utf8(), in which the maximum number
6187              * of characters to match could be passed and it would return how
6188              * many actually did.  This is just one of many cases where
6189              * multi-char folds don't work properly, and so the fix is being
6190              * deferred */
6191         }
6192         else {
6193             U8 folded;
6194
6195             /* Here, the string isn't utf8 and c is a single byte; and either
6196              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6197              * doesn't affect c.  Can just do simple comparisons for exact or
6198              * fold matching. */
6199             switch (OP(p)) {
6200                 case EXACTF: folded = PL_fold[c]; break;
6201                 case EXACTFA:
6202                 case EXACTFU_TRICKYFOLD:
6203                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6204                 case EXACTFL: folded = PL_fold_locale[c]; break;
6205                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6206             }
6207             while (scan < loceol &&
6208                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6209             {
6210                 scan++;
6211             }
6212         }
6213         break;
6214     case ANYOFV:
6215     case ANYOF:
6216         if (utf8_target || OP(p) == ANYOFV) {
6217             STRLEN inclasslen;
6218             loceol = PL_regeol;
6219             inclasslen = loceol - scan;
6220             while (hardcount < max
6221                    && ((inclasslen = loceol - scan) > 0)
6222                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6223             {
6224                 scan += inclasslen;
6225                 hardcount++;
6226             }
6227         } else {
6228             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6229                 scan++;
6230         }
6231         break;
6232     case ALNUMU:
6233         if (utf8_target) {
6234     utf8_wordchar:
6235             loceol = PL_regeol;
6236             LOAD_UTF8_CHARCLASS_ALNUM();
6237             while (hardcount < max && scan < loceol &&
6238                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6239             {
6240                 scan += UTF8SKIP(scan);
6241                 hardcount++;
6242             }
6243         } else {
6244             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6245                 scan++;
6246             }
6247         }
6248         break;
6249     case ALNUM:
6250         if (utf8_target)
6251             goto utf8_wordchar;
6252         while (scan < loceol && isALNUM((U8) *scan)) {
6253             scan++;
6254         }
6255         break;
6256     case ALNUMA:
6257         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6258             scan++;
6259         }
6260         break;
6261     case ALNUML:
6262         PL_reg_flags |= RF_tainted;
6263         if (utf8_target) {
6264             loceol = PL_regeol;
6265             while (hardcount < max && scan < loceol &&
6266                    isALNUM_LC_utf8((U8*)scan)) {
6267                 scan += UTF8SKIP(scan);
6268                 hardcount++;
6269             }
6270         } else {
6271             while (scan < loceol && isALNUM_LC(*scan))
6272                 scan++;
6273         }
6274         break;
6275     case NALNUMU:
6276         if (utf8_target) {
6277
6278     utf8_Nwordchar:
6279
6280             loceol = PL_regeol;
6281             LOAD_UTF8_CHARCLASS_ALNUM();
6282             while (hardcount < max && scan < loceol &&
6283                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6284             {
6285                 scan += UTF8SKIP(scan);
6286                 hardcount++;
6287             }
6288         } else {
6289             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6290                 scan++;
6291             }
6292         }
6293         break;
6294     case NALNUM:
6295         if (utf8_target)
6296             goto utf8_Nwordchar;
6297         while (scan < loceol && ! isALNUM((U8) *scan)) {
6298             scan++;
6299         }
6300         break;
6301     case NALNUMA:
6302         if (utf8_target) {
6303             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6304                 scan += UTF8SKIP(scan);
6305             }
6306         }
6307         else {
6308             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6309                 scan++;
6310             }
6311         }
6312         break;
6313     case NALNUML:
6314         PL_reg_flags |= RF_tainted;
6315         if (utf8_target) {
6316             loceol = PL_regeol;
6317             while (hardcount < max && scan < loceol &&
6318                    !isALNUM_LC_utf8((U8*)scan)) {
6319                 scan += UTF8SKIP(scan);
6320                 hardcount++;
6321             }
6322         } else {
6323             while (scan < loceol && !isALNUM_LC(*scan))
6324                 scan++;
6325         }
6326         break;
6327     case SPACEU:
6328         if (utf8_target) {
6329
6330     utf8_space:
6331
6332             loceol = PL_regeol;
6333             LOAD_UTF8_CHARCLASS_SPACE();
6334             while (hardcount < max && scan < loceol &&
6335                    (*scan == ' ' ||
6336                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6337             {
6338                 scan += UTF8SKIP(scan);
6339                 hardcount++;
6340             }
6341             break;
6342         }
6343         else {
6344             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6345                 scan++;
6346             }
6347             break;
6348         }
6349     case SPACE:
6350         if (utf8_target)
6351             goto utf8_space;
6352
6353         while (scan < loceol && isSPACE((U8) *scan)) {
6354             scan++;
6355         }
6356         break;
6357     case SPACEA:
6358         while (scan < loceol && isSPACE_A((U8) *scan)) {
6359             scan++;
6360         }
6361         break;
6362     case SPACEL:
6363         PL_reg_flags |= RF_tainted;
6364         if (utf8_target) {
6365             loceol = PL_regeol;
6366             while (hardcount < max && scan < loceol &&
6367                    isSPACE_LC_utf8((U8*)scan)) {
6368                 scan += UTF8SKIP(scan);
6369                 hardcount++;
6370             }
6371         } else {
6372             while (scan < loceol && isSPACE_LC(*scan))
6373                 scan++;
6374         }
6375         break;
6376     case NSPACEU:
6377         if (utf8_target) {
6378
6379     utf8_Nspace:
6380
6381             loceol = PL_regeol;
6382             LOAD_UTF8_CHARCLASS_SPACE();
6383             while (hardcount < max && scan < loceol &&
6384                    ! (*scan == ' ' ||
6385                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6386             {
6387                 scan += UTF8SKIP(scan);
6388                 hardcount++;
6389             }
6390             break;
6391         }
6392         else {
6393             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6394                 scan++;
6395             }
6396         }
6397         break;
6398     case NSPACE:
6399         if (utf8_target)
6400             goto utf8_Nspace;
6401
6402         while (scan < loceol && ! isSPACE((U8) *scan)) {
6403             scan++;
6404         }
6405         break;
6406     case NSPACEA:
6407         if (utf8_target) {
6408             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6409                 scan += UTF8SKIP(scan);
6410             }
6411         }
6412         else {
6413             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6414                 scan++;
6415             }
6416         }
6417         break;
6418     case NSPACEL:
6419         PL_reg_flags |= RF_tainted;
6420         if (utf8_target) {
6421             loceol = PL_regeol;
6422             while (hardcount < max && scan < loceol &&
6423                    !isSPACE_LC_utf8((U8*)scan)) {
6424                 scan += UTF8SKIP(scan);
6425                 hardcount++;
6426             }
6427         } else {
6428             while (scan < loceol && !isSPACE_LC(*scan))
6429                 scan++;
6430         }
6431         break;
6432     case DIGIT:
6433         if (utf8_target) {
6434             loceol = PL_regeol;
6435             LOAD_UTF8_CHARCLASS_DIGIT();
6436             while (hardcount < max && scan < loceol &&
6437                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6438                 scan += UTF8SKIP(scan);
6439                 hardcount++;
6440             }
6441         } else {
6442             while (scan < loceol && isDIGIT(*scan))
6443                 scan++;
6444         }
6445         break;
6446     case DIGITA:
6447         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6448             scan++;
6449         }
6450         break;
6451     case DIGITL:
6452         PL_reg_flags |= RF_tainted;
6453         if (utf8_target) {
6454             loceol = PL_regeol;
6455             while (hardcount < max && scan < loceol &&
6456                    isDIGIT_LC_utf8((U8*)scan)) {
6457                 scan += UTF8SKIP(scan);
6458                 hardcount++;
6459             }
6460         } else {
6461             while (scan < loceol && isDIGIT_LC(*scan))
6462                 scan++;
6463         }
6464         break;
6465     case NDIGIT:
6466         if (utf8_target) {
6467             loceol = PL_regeol;
6468             LOAD_UTF8_CHARCLASS_DIGIT();
6469             while (hardcount < max && scan < loceol &&
6470                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6471                 scan += UTF8SKIP(scan);
6472                 hardcount++;
6473             }
6474         } else {
6475             while (scan < loceol && !isDIGIT(*scan))
6476                 scan++;
6477         }
6478         break;
6479     case NDIGITA:
6480         if (utf8_target) {
6481             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6482                 scan += UTF8SKIP(scan);
6483             }
6484         }
6485         else {
6486             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6487                 scan++;
6488             }
6489         }
6490         break;
6491     case NDIGITL:
6492         PL_reg_flags |= RF_tainted;
6493         if (utf8_target) {
6494             loceol = PL_regeol;
6495             while (hardcount < max && scan < loceol &&
6496                    !isDIGIT_LC_utf8((U8*)scan)) {
6497                 scan += UTF8SKIP(scan);
6498                 hardcount++;
6499             }
6500         } else {
6501             while (scan < loceol && !isDIGIT_LC(*scan))
6502                 scan++;
6503         }
6504         break;
6505     case LNBREAK:
6506         if (utf8_target) {
6507             loceol = PL_regeol;
6508             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6509                 scan += c;
6510                 hardcount++;
6511             }
6512         } else {
6513             /*
6514               LNBREAK can match two latin chars, which is ok,
6515               because we have a null terminated string, but we
6516               have to use hardcount in this situation
6517             */
6518             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6519                 scan+=c;
6520                 hardcount++;
6521             }
6522         }
6523         break;
6524     case HORIZWS:
6525         if (utf8_target) {
6526             loceol = PL_regeol;
6527             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6528                 scan += c;
6529                 hardcount++;
6530             }
6531         } else {
6532             while (scan < loceol && is_HORIZWS_latin1(scan))
6533                 scan++;
6534         }
6535         break;
6536     case NHORIZWS:
6537         if (utf8_target) {
6538             loceol = PL_regeol;
6539             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6540                 scan += UTF8SKIP(scan);
6541                 hardcount++;
6542             }
6543         } else {
6544             while (scan < loceol && !is_HORIZWS_latin1(scan))
6545                 scan++;
6546
6547         }
6548         break;
6549     case VERTWS:
6550         if (utf8_target) {
6551             loceol = PL_regeol;
6552             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6553                 scan += c;
6554                 hardcount++;
6555             }
6556         } else {
6557             while (scan < loceol && is_VERTWS_latin1(scan))
6558                 scan++;
6559
6560         }
6561         break;
6562     case NVERTWS:
6563         if (utf8_target) {
6564             loceol = PL_regeol;
6565             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6566                 scan += UTF8SKIP(scan);
6567                 hardcount++;
6568             }
6569         } else {
6570             while (scan < loceol && !is_VERTWS_latin1(scan))
6571                 scan++;
6572
6573         }
6574         break;
6575
6576     default:            /* Called on something of 0 width. */
6577         break;          /* So match right here or not at all. */
6578     }
6579
6580     if (hardcount)
6581         c = hardcount;
6582     else
6583         c = scan - PL_reginput;
6584     PL_reginput = scan;
6585
6586     DEBUG_r({
6587         GET_RE_DEBUG_FLAGS_DECL;
6588         DEBUG_EXECUTE_r({
6589             SV * const prop = sv_newmortal();
6590             regprop(prog, prop, p);
6591             PerlIO_printf(Perl_debug_log,
6592                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6593                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6594         });
6595     });
6596
6597     return(c);
6598 }
6599
6600
6601 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6602 /*
6603 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6604 create a copy so that changes the caller makes won't change the shared one
6605  */
6606 SV *
6607 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6608 {
6609     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6610     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6611 }
6612 #endif
6613
6614 STATIC SV *
6615 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6616 {
6617     /* Returns the swash for the input 'node' in the regex 'prog'.
6618      * If <doinit> is true, will attempt to create the swash if not already
6619      *    done.
6620      * If <listsvp> is non-null, will return the swash initialization string in
6621      *    it.
6622      * If <altsvp> is non-null, will return the alternates to the regular swash
6623      *    in it
6624      * Tied intimately to how regcomp.c sets up the data structure */
6625
6626     dVAR;
6627     SV *sw  = NULL;
6628     SV *si  = NULL;
6629     SV *alt = NULL;
6630     SV*  invlist = NULL;
6631
6632     RXi_GET_DECL(prog,progi);
6633     const struct reg_data * const data = prog ? progi->data : NULL;
6634
6635     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6636
6637     assert(ANYOF_NONBITMAP(node));
6638
6639     if (data && data->count) {
6640         const U32 n = ARG(node);
6641
6642         if (data->what[n] == 's') {
6643             SV * const rv = MUTABLE_SV(data->data[n]);
6644             AV * const av = MUTABLE_AV(SvRV(rv));
6645             SV **const ary = AvARRAY(av);
6646             bool invlist_has_user_defined_property;
6647
6648             si = *ary;  /* ary[0] = the string to initialize the swash with */
6649
6650             /* Elements 3 and 4 are either both present or both absent. [3] is
6651              * any inversion list generated at compile time; [4] indicates if
6652              * that inversion list has any user-defined properties in it. */
6653             if (av_len(av) >= 3) {
6654                 invlist = ary[3];
6655                 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6656             }
6657             else {
6658                 invlist = NULL;
6659                 invlist_has_user_defined_property = FALSE;
6660             }
6661
6662             /* Element [1] is reserved for the set-up swash.  If already there,
6663              * return it; if not, create it and store it there */
6664             if (SvROK(ary[1])) {
6665                 sw = ary[1];
6666             }
6667             else if (si && doinit) {
6668
6669                 sw = _core_swash_init("utf8", /* the utf8 package */
6670                                       "", /* nameless */
6671                                       si,
6672                                       1, /* binary */
6673                                       0, /* not from tr/// */
6674                                       FALSE, /* is error if can't find
6675                                                 property */
6676                                       invlist,
6677                                       invlist_has_user_defined_property);
6678                 (void)av_store(av, 1, sw);
6679             }
6680
6681             /* Element [2] is for any multi-char folds.  Note that is a
6682              * fundamentally flawed design, because can't backtrack and try
6683              * again.  See [perl #89774] */
6684             if (SvTYPE(ary[2]) == SVt_PVAV) {
6685                 alt = ary[2];
6686             }
6687         }
6688     }
6689
6690     if (listsvp) {
6691         SV* matches_string = newSVpvn("", 0);
6692         SV** invlistsvp;
6693
6694         /* Use the swash, if any, which has to have incorporated into it all
6695          * possibilities */
6696         if (   sw
6697             && SvROK(sw)
6698             && SvTYPE(SvRV(sw)) == SVt_PVHV
6699             && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6700         {
6701             invlist = *invlistsvp;
6702         }
6703         else if (si && si != &PL_sv_undef) {
6704
6705             /* If no swash, use the input nitialization string, if available */
6706             sv_catsv(matches_string, si);
6707         }
6708
6709         /* Add the inversion list to whatever we have.  This may have come from
6710          * the swash, or from an input parameter */
6711         if (invlist) {
6712             sv_catsv(matches_string, _invlist_contents(invlist));
6713         }
6714         *listsvp = matches_string;
6715     }
6716
6717     if (altsvp)
6718         *altsvp  = alt;
6719
6720     return sw;
6721 }
6722
6723 /*
6724  - reginclass - determine if a character falls into a character class
6725
6726   n is the ANYOF regnode
6727   p is the target string
6728   lenp is pointer to the maximum number of bytes of how far to go in p
6729     (This is assumed wthout checking to always be at least the current
6730     character's size)
6731   utf8_target tells whether p is in UTF-8.
6732
6733   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6734   from a successful match, the value it points to will be updated to how many
6735   bytes in p were matched.  If there was no match, the value is undefined,
6736   possibly changed from the input.
6737
6738   Note that this can be a synthetic start class, a combination of various
6739   nodes, so things you think might be mutually exclusive, such as locale,
6740   aren't.  It can match both locale and non-locale
6741
6742  */
6743
6744 STATIC bool
6745 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6746 {
6747     dVAR;
6748     const char flags = ANYOF_FLAGS(n);
6749     bool match = FALSE;
6750     UV c = *p;
6751     STRLEN c_len = 0;
6752     STRLEN maxlen;
6753
6754     PERL_ARGS_ASSERT_REGINCLASS;
6755
6756     /* If c is not already the code point, get it */
6757     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6758         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6759                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6760                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6761                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6762                  * UTF8_ALLOW_FFFF */
6763         if (c_len == (STRLEN)-1)
6764             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6765     }
6766     else {
6767         c_len = 1;
6768     }
6769
6770     /* Use passed in max length, or one character if none passed in or less
6771      * than one character.  And assume will match just one character.  This is
6772      * overwritten later if matched more. */
6773     if (lenp) {
6774         maxlen = (*lenp > c_len) ? *lenp : c_len;
6775         *lenp = c_len;
6776
6777     }
6778     else {
6779         maxlen = c_len;
6780     }
6781
6782     /* If this character is potentially in the bitmap, check it */
6783     if (c < 256) {
6784         if (ANYOF_BITMAP_TEST(n, c))
6785             match = TRUE;
6786         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6787                 && ! utf8_target
6788                 && ! isASCII(c))
6789         {
6790             match = TRUE;
6791         }
6792
6793         else if (flags & ANYOF_LOCALE) {
6794             PL_reg_flags |= RF_tainted;
6795
6796             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6797                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6798             {
6799                 match = TRUE;
6800             }
6801             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6802                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6803                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6804                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6805                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6806                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6807                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6808                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6809                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6810                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6811                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6812                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6813                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6814                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6815                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6816                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6817                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6818                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6819                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6820                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6821                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6822                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6823                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6824                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6825                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6826                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6827                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6828                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6829                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6830                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6831                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6832                      ) /* How's that for a conditional? */
6833             ) {
6834                 match = TRUE;
6835             }
6836         }
6837     }
6838
6839     /* If the bitmap didn't (or couldn't) match, and something outside the
6840      * bitmap could match, try that.  Locale nodes specifiy completely the
6841      * behavior of code points in the bit map (otherwise, a utf8 target would
6842      * cause them to be treated as Unicode and not locale), except in
6843      * the very unlikely event when this node is a synthetic start class, which
6844      * could be a combination of locale and non-locale nodes.  So allow locale
6845      * to match for the synthetic start class, which will give a false
6846      * positive that will be resolved when the match is done again as not part
6847      * of the synthetic start class */
6848     if (!match) {
6849         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6850             match = TRUE;       /* Everything above 255 matches */
6851         }
6852         else if (ANYOF_NONBITMAP(n)
6853                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6854                      || (utf8_target
6855                          && (c >=256
6856                              || (! (flags & ANYOF_LOCALE))
6857                              || (flags & ANYOF_IS_SYNTHETIC)))))
6858         {
6859             AV *av;
6860             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6861
6862             if (sw) {
6863                 U8 * utf8_p;
6864                 if (utf8_target) {
6865                     utf8_p = (U8 *) p;
6866                 } else {
6867
6868                     /* Not utf8.  Convert as much of the string as available up
6869                      * to the limit of how far the (single) character in the
6870                      * pattern can possibly match (no need to go further).  If
6871                      * the node is a straight ANYOF or not folding, it can't
6872                      * match more than one.  Otherwise, It can match up to how
6873                      * far a single char can fold to.  Since not utf8, each
6874                      * character is a single byte, so the max it can be in
6875                      * bytes is the same as the max it can be in characters */
6876                     STRLEN len = (OP(n) == ANYOF
6877                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6878                                   ? 1
6879                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6880                                     ? maxlen
6881                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6882                     utf8_p = bytes_to_utf8(p, &len);
6883                 }
6884
6885                 if (swash_fetch(sw, utf8_p, TRUE))
6886                     match = TRUE;
6887                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6888
6889                     /* Here, we need to test if the fold of the target string
6890                      * matches.  The non-multi char folds have all been moved to
6891                      * the compilation phase, and the multi-char folds have
6892                      * been stored by regcomp into 'av'; we linearly check to
6893                      * see if any match the target string (folded).   We know
6894                      * that the originals were each one character, but we don't
6895                      * currently know how many characters/bytes each folded to,
6896                      * except we do know that there are small limits imposed by
6897                      * Unicode.  XXX A performance enhancement would be to have
6898                      * regcomp.c store the max number of chars/bytes that are
6899                      * in an av entry, as, say the 0th element.  Even better
6900                      * would be to have a hash of the few characters that can
6901                      * start a multi-char fold to the max number of chars of
6902                      * those folds.
6903                      *
6904                      * If there is a match, we will need to advance (if lenp is
6905                      * specified) the match pointer in the target string.  But
6906                      * what we are comparing here isn't that string directly,
6907                      * but its fold, whose length may differ from the original.
6908                      * As we go along in constructing the fold, therefore, we
6909                      * create a map so that we know how many bytes in the
6910                      * source to advance given that we have matched a certain
6911                      * number of bytes in the fold.  This map is stored in
6912                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6913                      * the fold of the first character that we are folding.
6914                      * Then map_fold_len_back[n] is set to the number of bytes
6915                      * in that first character.  Similarly let m be the
6916                      * corresponding number for the second character to be
6917                      * folded.  Then map_fold_len_back[n+m] is set to the
6918                      * number of bytes occupied by the first two source
6919                      * characters. ... */
6920                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6921                     U8 folded[UTF8_MAXBYTES_CASE+1];
6922                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6923                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6924                                                   chars */
6925
6926                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6927
6928                         /* Here, only need to fold the first char of the target
6929                          * string.  It the source wasn't utf8, is 1 byte long */
6930                         to_utf8_fold(utf8_p, folded, &foldlen);
6931                         total_foldlen = foldlen;
6932                         map_fold_len_back[foldlen] = (utf8_target)
6933                                                      ? UTF8SKIP(utf8_p)
6934                                                      : 1;
6935                     }
6936                     else {
6937
6938                         /* Here, need to fold more than the first char.  Do so
6939                          * up to the limits */
6940                         U8* source_ptr = utf8_p;    /* The source for the fold
6941                                                        is the regex target
6942                                                        string */
6943                         U8* folded_ptr = folded;
6944                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6945                                                        available byte in the
6946                                                        target string */
6947                         U8 i;
6948                         for (i = 0;
6949                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6950                              i++)
6951                         {
6952
6953                             /* Fold the next character */
6954                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6955                             STRLEN this_char_foldlen;
6956                             to_utf8_fold(source_ptr,
6957                                          this_char_folded,
6958                                          &this_char_foldlen);
6959
6960                             /* Bail if it would exceed the byte limit for
6961                              * folding a single char. */
6962                             if (this_char_foldlen + folded_ptr - folded >
6963                                                             UTF8_MAXBYTES_CASE)
6964                             {
6965                                 break;
6966                             }
6967
6968                             /* Add the fold of this character */
6969                             Copy(this_char_folded,
6970                                  folded_ptr,
6971                                  this_char_foldlen,
6972                                  U8);
6973                             source_ptr += UTF8SKIP(source_ptr);
6974                             folded_ptr += this_char_foldlen;
6975                             total_foldlen = folded_ptr - folded;
6976
6977                             /* Create map from the number of bytes in the fold
6978                              * back to the number of bytes in the source.  If
6979                              * the source isn't utf8, the byte count is just
6980                              * the number of characters so far */
6981                             map_fold_len_back[total_foldlen]
6982                                                       = (utf8_target)
6983                                                         ? source_ptr - utf8_p
6984                                                         : i + 1;
6985                         }
6986                         *folded_ptr = '\0';
6987                     }
6988
6989
6990                     /* Do the linear search to see if the fold is in the list
6991                      * of multi-char folds. */
6992                     if (av) {
6993                         I32 i;
6994                         for (i = 0; i <= av_len(av); i++) {
6995                             SV* const sv = *av_fetch(av, i, FALSE);
6996                             STRLEN len;
6997                             const char * const s = SvPV_const(sv, len);
6998
6999                             if (len <= total_foldlen
7000                                 && memEQ(s, (char*)folded, len)
7001
7002                                    /* If 0, means matched a partial char. See
7003                                     * [perl #90536] */
7004                                 && map_fold_len_back[len])
7005                             {
7006
7007                                 /* Advance the target string ptr to account for
7008                                  * this fold, but have to translate from the
7009                                  * folded length to the corresponding source
7010                                  * length. */
7011                                 if (lenp) {
7012                                     *lenp = map_fold_len_back[len];
7013                                 }
7014                                 match = TRUE;
7015                                 break;
7016                             }
7017                         }
7018                     }
7019                 }
7020
7021                 /* If we allocated a string above, free it */
7022                 if (! utf8_target) Safefree(utf8_p);
7023             }
7024         }
7025     }
7026
7027     return (flags & ANYOF_INVERT) ? !match : match;
7028 }
7029
7030 STATIC U8 *
7031 S_reghop3(U8 *s, I32 off, const U8* lim)
7032 {
7033     /* return the position 'off' UTF-8 characters away from 's', forward if
7034      * 'off' >= 0, backwards if negative.  But don't go outside of position
7035      * 'lim', which better be < s  if off < 0 */
7036
7037     dVAR;
7038
7039     PERL_ARGS_ASSERT_REGHOP3;
7040
7041     if (off >= 0) {
7042         while (off-- && s < lim) {
7043             /* XXX could check well-formedness here */
7044             s += UTF8SKIP(s);
7045         }
7046     }
7047     else {
7048         while (off++ && s > lim) {
7049             s--;
7050             if (UTF8_IS_CONTINUED(*s)) {
7051                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7052                     s--;
7053             }
7054             /* XXX could check well-formedness here */
7055         }
7056     }
7057     return s;
7058 }
7059
7060 #ifdef XXX_dmq
/* There are a bunch of places where we use two reghop3's that should
   be replaced with this routine, but since that's not done yet
   we ifdef it out - dmq
*/
7065 STATIC U8 *
7066 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7067 {
7068     dVAR;
7069
7070     PERL_ARGS_ASSERT_REGHOP4;
7071
7072     if (off >= 0) {
7073         while (off-- && s < rlim) {
7074             /* XXX could check well-formedness here */
7075             s += UTF8SKIP(s);
7076         }
7077     }
7078     else {
7079         while (off++ && s > llim) {
7080             s--;
7081             if (UTF8_IS_CONTINUED(*s)) {
7082                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7083                     s--;
7084             }
7085             /* XXX could check well-formedness here */
7086         }
7087     }
7088     return s;
7089 }
7090 #endif
7091
7092 STATIC U8 *
7093 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7094 {
7095     dVAR;
7096
7097     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7098
7099     if (off >= 0) {
7100         while (off-- && s < lim) {
7101             /* XXX could check well-formedness here */
7102             s += UTF8SKIP(s);
7103         }
7104         if (off >= 0)
7105             return NULL;
7106     }
7107     else {
7108         while (off++ && s > lim) {
7109             s--;
7110             if (UTF8_IS_CONTINUED(*s)) {
7111                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7112                     s--;
7113             }
7114             /* XXX could check well-formedness here */
7115         }
7116         if (off <= 0)
7117             return NULL;
7118     }
7119     return s;
7120 }
7121
7122 static void
7123 restore_pos(pTHX_ void *arg)
7124 {
7125     dVAR;
7126     regexp * const rex = (regexp *)arg;
7127     if (PL_reg_state.re_state_eval_setup_done) {
7128         if (PL_reg_oldsaved) {
7129             rex->subbeg = PL_reg_oldsaved;
7130             rex->sublen = PL_reg_oldsavedlen;
7131 #ifdef PERL_OLD_COPY_ON_WRITE
7132             rex->saved_copy = PL_nrs;
7133 #endif
7134             RXp_MATCH_COPIED_on(rex);
7135         }
7136         PL_reg_magic->mg_len = PL_reg_oldpos;
7137         PL_reg_state.re_state_eval_setup_done = FALSE;
7138         PL_curpm = PL_reg_oldcurpm;
7139     }
7140 }
7141
7142 STATIC void
7143 S_to_utf8_substr(pTHX_ register regexp *prog)
7144 {
7145     int i = 1;
7146
7147     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7148
7149     do {
7150         if (prog->substrs->data[i].substr
7151             && !prog->substrs->data[i].utf8_substr) {
7152             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7153             prog->substrs->data[i].utf8_substr = sv;
7154             sv_utf8_upgrade(sv);
7155             if (SvVALID(prog->substrs->data[i].substr)) {
7156                 if (SvTAIL(prog->substrs->data[i].substr)) {
7157                     /* Trim the trailing \n that fbm_compile added last
7158                        time.  */
7159                     SvCUR_set(sv, SvCUR(sv) - 1);
7160                     /* Whilst this makes the SV technically "invalid" (as its
7161                        buffer is no longer followed by "\0") when fbm_compile()
7162                        adds the "\n" back, a "\0" is restored.  */
7163                     fbm_compile(sv, FBMcf_TAIL);
7164                 } else
7165                     fbm_compile(sv, 0);
7166             }
7167             if (prog->substrs->data[i].substr == prog->check_substr)
7168                 prog->check_utf8 = sv;
7169         }
7170     } while (i--);
7171 }
7172
7173 STATIC void
7174 S_to_byte_substr(pTHX_ register regexp *prog)
7175 {
7176     dVAR;
7177     int i = 1;
7178
7179     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7180
7181     do {
7182         if (prog->substrs->data[i].utf8_substr
7183             && !prog->substrs->data[i].substr) {
7184             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7185             if (sv_utf8_downgrade(sv, TRUE)) {
7186                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7187                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7188                         /* Trim the trailing \n that fbm_compile added last
7189                            time.  */
7190                         SvCUR_set(sv, SvCUR(sv) - 1);
7191                         fbm_compile(sv, FBMcf_TAIL);
7192                     } else
7193                         fbm_compile(sv, 0);
7194                 }
7195             } else {
7196                 SvREFCNT_dec(sv);
7197                 sv = &PL_sv_undef;
7198             }
7199             prog->substrs->data[i].substr = sv;
7200             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7201                 prog->check_substr = sv;
7202         }
7203     } while (i--);
7204 }
7205
7206 /*
7207  * Local variables:
7208  * c-indentation-style: bsd
7209  * c-basic-offset: 4
7210  * indent-tabs-mode: nil
7211  * End:
7212  *
7213  * ex: set ts=8 sts=4 sw=4 et:
7214  */