regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "inline_invlist.c"
  84 #include "unicode_constants.h"
  85
  86 #define RF_tainted      1       /* tainted information used? e.g. locale */
  87 #define RF_warned       2               /* warned about big count? */
  88
  89 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  90
  91 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { \
 127         bool ok; \
 128         ENTER; save_re_context(); \
 129         ok=CAT2(is_utf8_,class)((const U8*)str); \
 130         PERL_UNUSED_VAR(ok); \
 131         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 132 /* Doesn't do an assert to verify that is correct */
 133 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 134     if (!CAT2(PL_utf8_,class)) { \
 135         bool throw_away PERL_UNUSED_DECL; \
 136         ENTER; save_re_context(); \
 137         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 138         LEAVE; } } STMT_END
 139
 140 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 141 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 142 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 143
 144 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 145         /* No asserts are done for some of these, in case called on a   */  \
 146         /* Unicode version in which they map to nothing */                  \
 147         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 148         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 149
 150 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 151
 152 /* The actual code for CCC_TRY, which uses several variables from the routine
 153  * it's callable from.  It is designed to be the bulk of a case statement.
 154  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 155  *      nextchr matches the class.
 156  * UTF8_TEST is the whole test string to use for utf8 targets
 157  * LOAD is what to use to test, and if not present to load in the swash for the
 158  *      class
 159  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 160  *      UTF8_TEST test.
 161  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 162  * utf8 and a variant, load the swash if necessary and test using the utf8
 163  * test.  Advance to the next character if test is ok, otherwise fail; If not
 164  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 165  * fails, or advance to the next character */
 166
 167 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 168     if (locinput >= PL_regeol) {                                              \
 169         sayNO;                                                                \
 170     }                                                                         \
 171     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 172         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 173         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 174             sayNO;                                                            \
 175         }                                                                     \
 176         locinput += PL_utf8skip[nextchr];                                     \
 177         nextchr = UCHARAT(locinput);                                          \
 178         break;                                                                \
 179     }                                                                         \
 180     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 181         sayNO;                                                                \
 182     }                                                                         \
 183     nextchr = UCHARAT(++locinput);                                            \
 184     break;
 185
 186 /* Handle the non-locale cases for a character class and its complement.  It
 187  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 188  * This is because that code fails when the test succeeds, so we want to have
 189  * the test fail so that the code succeeds.  The swash is stored in a
 190  * predictable PL_ place */
 191 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 192                            CLASS, STR)                                        \
 193     case NAME:                                                                \
 194         _CCC_TRY_CODE( !, FUNC,                                               \
 195                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 196                                             (U8*)locinput, TRUE)),            \
 197                           CLASS, STR)                                         \
 198     case NNAME:                                                               \
 199         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 200                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 201                                             (U8*)locinput, TRUE)),            \
 202                           CLASS, STR)                                         \
 203
 204 /* Generate the case statements for both locale and non-locale character
 205  * classes in regmatch for classes that don't have special unicode semantics.
 206  * Locales don't use an immediate swash, but an intermediary special locale
 207  * function that is called on the pointer to the current place in the input
 208  * string.  That function will resolve to needing the same swash.  One might
 209  * think that because we don't know what the locale will match, we shouldn't
 210  * check with the swash loading function that it loaded properly; ie, that we
 211  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 212  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 213  * irrelevant here */
 214 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 215                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 216                 NAMEA, NNAMEA, FUNCA,                                         \
 217                 CLASS, STR)                                                   \
 218     case NAMEL:                                                               \
 219         PL_reg_flags |= RF_tainted;                                           \
 220         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 221     case NNAMEL:                                                              \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 224                        CLASS, STR)                                            \
 225     case NAMEA:                                                               \
 226         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 227             sayNO;                                                            \
 228         }                                                                     \
 229         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 230         nextchr = UCHARAT(++locinput);                                        \
 231         break;                                                                \
 232     case NNAMEA:                                                              \
 233         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 234             sayNO;                                                            \
 235         }                                                                     \
 236         if (utf8_target) {                                                    \
 237             locinput += PL_utf8skip[nextchr];                                 \
 238             nextchr = UCHARAT(locinput);                                      \
 239         }                                                                     \
 240         else {                                                                \
 241             nextchr = UCHARAT(++locinput);                                    \
 242         }                                                                     \
 243         break;                                                                \
 244     /* Generate the non-locale cases */                                       \
 245     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 246
 247 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 248  * statements to handle separate Unicode semantics nodes */
 249 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 250                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 251                   NAMEU, NNAMEU, FUNCU,                                        \
 252                   NAMEA, NNAMEA, FUNCA,                                        \
 253                   CLASS, STR)                                                  \
 254     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 255             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 256             NAMEA, NNAMEA, FUNCA,                                              \
 257             CLASS, STR)                                                        \
 258     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 259
 260 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 261
 262 /* for use after a quantifier and before an EXACT-like node -- japhy */
 263 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 264  *
 265  * NOTE that *nothing* that affects backtracking should be in here, specifically
 266  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 267  * node that is in between two EXACT like nodes when ascertaining what the required
 268  * "follow" character is. This should probably be moved to regex compile time
 269  * although it may be done at run time beause of the REF possibility - more
 270  * investigation required. -- demerphq
 271 */
 272 #define JUMPABLE(rn) (      \
 273     OP(rn) == OPEN ||       \
 274     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 275     OP(rn) == EVAL ||   \
 276     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 277     OP(rn) == PLUS || OP(rn) == MINMOD || \
 278     OP(rn) == KEEPS || \
 279     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 280 )
 281 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 282
 283 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 284
 285 #if 0
 286 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 287    we don't need this definition. */
 288 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 289 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 290 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 291
 292 #else
 293 /* ... so we use this as its faster. */
 294 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 295 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 296 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 297 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 298
 299 #endif
 300
 301 /*
 302   Search for mandatory following text node; for lookahead, the text must
 303   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 304 */
 305 #define FIND_NEXT_IMPT(rn) STMT_START { \
 306     while (JUMPABLE(rn)) { \
 307         const OPCODE type = OP(rn); \
 308         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 309             rn = NEXTOPER(NEXTOPER(rn)); \
 310         else if (type == PLUS) \
 311             rn = NEXTOPER(rn); \
 312         else if (type == IFMATCH) \
 313             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 314         else rn += NEXT_OFF(rn); \
 315     } \
 316 } STMT_END
 317
 318
 319 static void restore_pos(pTHX_ void *arg);
 320
 321 #define REGCP_PAREN_ELEMS 3
 322 #define REGCP_OTHER_ELEMS 3
 323 #define REGCP_FRAME_ELEMS 1
 324 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 325  * are needed for the regexp context stack bookkeeping. */
 326
 327 STATIC CHECKPOINT
 328 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 329 {
 330     dVAR;
 331     const int retval = PL_savestack_ix;
 332     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 333     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 334     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 335     I32 p;
 336     GET_RE_DEBUG_FLAGS_DECL;
 337
 338     PERL_ARGS_ASSERT_REGCPPUSH;
 339
 340     if (paren_elems_to_push < 0)
 341         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 342                    paren_elems_to_push);
 343
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350
 351     DEBUG_BUFFERS_r(
 352         if ((int)PL_regsize > (int)parenfloor)
 353             PerlIO_printf(Perl_debug_log,
 354                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 355                 PTR2UV(rex),
 356                 PTR2UV(rex->offs)
 357             );
 358     );
 359     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 361         SSPUSHINT(rex->offs[p].end);
 362         SSPUSHINT(rex->offs[p].start);
 363         SSPUSHINT(rex->offs[p].start_tmp);
 364         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 365             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 366             (UV)p,
 367             (IV)rex->offs[p].start,
 368             (IV)rex->offs[p].start_tmp,
 369             (IV)rex->offs[p].end
 370         ));
 371     }
 372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 373     SSPUSHINT(PL_regsize);
 374     SSPUSHINT(rex->lastparen);
 375     SSPUSHINT(rex->lastcloseparen);
 376     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 377
 378     return retval;
 379 }
 380
 381 /* These are needed since we do not localize EVAL nodes: */
 382 #define REGCP_SET(cp)                                           \
 383     DEBUG_STATE_r(                                              \
 384             PerlIO_printf(Perl_debug_log,                       \
 385                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 386                 (IV)PL_savestack_ix));                          \
 387     cp = PL_savestack_ix
 388
 389 #define REGCP_UNWIND(cp)                                        \
 390     DEBUG_STATE_r(                                              \
 391         if (cp != PL_savestack_ix)                              \
 392             PerlIO_printf(Perl_debug_log,                       \
 393                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 394                 (IV)(cp), (IV)PL_savestack_ix));                \
 395     regcpblow(cp)
 396
 397 #define UNWIND_PAREN(lp, lcp)               \
 398     for (n = rex->lastparen; n > lp; n--)   \
 399         rex->offs[n].end = -1;              \
 400     rex->lastparen = n;                     \
 401     rex->lastcloseparen = lcp;
 402
 403
 404 STATIC void
 405 S_regcppop(pTHX_ regexp *rex)
 406 {
 407     dVAR;
 408     UV i;
 409     U32 paren;
 410     GET_RE_DEBUG_FLAGS_DECL;
 411
 412     PERL_ARGS_ASSERT_REGCPPOP;
 413
 414     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 415     i = SSPOPUV;
 416     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 417     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 418     rex->lastcloseparen = SSPOPINT;
 419     rex->lastparen = SSPOPINT;
 420     PL_regsize = SSPOPINT;
 421
 422     i -= REGCP_OTHER_ELEMS;
 423     /* Now restore the parentheses context. */
 424     DEBUG_BUFFERS_r(
 425         if (i || rex->lastparen + 1 <= rex->nparens)
 426             PerlIO_printf(Perl_debug_log,
 427                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 428                 PTR2UV(rex),
 429                 PTR2UV(rex->offs)
 430             );
 431     );
 432     paren = PL_regsize;
 433     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 434         I32 tmps;
 435         rex->offs[paren].start_tmp = SSPOPINT;
 436         rex->offs[paren].start = SSPOPINT;
 437         tmps = SSPOPINT;
 438         if (paren <= rex->lastparen)
 439             rex->offs[paren].end = tmps;
 440         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 441             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 442             (UV)paren,
 443             (IV)rex->offs[paren].start,
 444             (IV)rex->offs[paren].start_tmp,
 445             (IV)rex->offs[paren].end,
 446             (paren > rex->lastparen ? "(skipped)" : ""));
 447         );
 448         paren--;
 449     }
 450 #if 1
 451     /* It would seem that the similar code in regtry()
 452      * already takes care of this, and in fact it is in
 453      * a better location to since this code can #if 0-ed out
 454      * but the code in regtry() is needed or otherwise tests
 455      * requiring null fields (pat.t#187 and split.t#{13,14}
 456      * (as of patchlevel 7877)  will fail.  Then again,
 457      * this code seems to be necessary or otherwise
 458      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 459      * --jhi updated by dapm */
 460     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 461         if (i > PL_regsize)
 462             rex->offs[i].start = -1;
 463         rex->offs[i].end = -1;
 464         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 465             "    \\%"UVuf": %s   ..-1 undeffing\n",
 466             (UV)i,
 467             (i > PL_regsize) ? "-1" : "  "
 468         ));
 469     }
 470 #endif
 471 }
 472
 473 /* restore the parens and associated vars at savestack position ix,
 474  * but without popping the stack */
 475
 476 STATIC void
 477 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 478 {
 479     I32 tmpix = PL_savestack_ix;
 480     PL_savestack_ix = ix;
 481     regcppop(rex);
 482     PL_savestack_ix = tmpix;
 483 }
 484
 485 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 486
 487 /*
 488  * pregexec and friends
 489  */
 490
 491 #ifndef PERL_IN_XSUB_RE
 492 /*
 493  - pregexec - match a regexp against a string
 494  */
 495 I32
 496 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 497          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 498 /* stringarg: the point in the string at which to begin matching */
 499 /* strend:    pointer to null at end of string */
 500 /* strbeg:    real beginning of string */
 501 /* minend:    end of match must be >= minend bytes after stringarg. */
 502 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 503  *            itself is accessed via the pointers above */
 504 /* nosave:    For optimizations. */
 505 {
 506     PERL_ARGS_ASSERT_PREGEXEC;
 507
 508     return
 509         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 510                       nosave ? 0 : REXEC_COPY_STR);
 511 }
 512 #endif
 513
 514 /*
 515  * Need to implement the following flags for reg_anch:
 516  *
 517  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 518  * USE_INTUIT_ML
 519  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 520  * INTUIT_AUTORITATIVE_ML
 521  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 522  * INTUIT_ONCE_ML
 523  *
 524  * Another flag for this function: SECOND_TIME (so that float substrs
 525  * with giant delta may be not rechecked).
 526  */
 527
 528 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 529
 530 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 531    Otherwise, only SvCUR(sv) is used to get strbeg. */
 532
 533 /* XXXX We assume that strpos is strbeg unless sv. */
 534
 535 /* XXXX Some places assume that there is a fixed substring.
 536         An update may be needed if optimizer marks as "INTUITable"
 537         RExen without fixed substrings.  Similarly, it is assumed that
 538         lengths of all the strings are no more than minlen, thus they
 539         cannot come from lookahead.
 540         (Or minlen should take into account lookahead.)
 541   NOTE: Some of this comment is not correct. minlen does now take account
 542   of lookahead/behind. Further research is required. -- demerphq
 543
 544 */
 545
 546 /* A failure to find a constant substring means that there is no need to make
 547    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 548    finding a substring too deep into the string means that less calls to
 549    regtry() should be needed.
 550
 551    REx compiler's optimizer found 4 possible hints:
 552         a) Anchored substring;
 553         b) Fixed substring;
 554         c) Whether we are anchored (beginning-of-line or \G);
 555         d) First node (of those at offset 0) which may distinguish positions;
 556    We use a)b)d) and multiline-part of c), and try to find a position in the
 557    string which does not contradict any of them.
 558  */
 559
 560 /* Most of decisions we do here should have been done at compile time.
 561    The nodes of the REx which we used for the search should have been
 562    deleted from the finite automaton. */
 563
 564 char *
 565 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 566                      char *strend, const U32 flags, re_scream_pos_data *data)
 567 {
 568     dVAR;
 569     struct regexp *const prog = (struct regexp *)SvANY(rx);
 570     I32 start_shift = 0;
 571     /* Should be nonnegative! */
 572     I32 end_shift   = 0;
 573     char *s;
 574     SV *check;
 575     char *strbeg;
 576     char *t;
 577     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 578     I32 ml_anch;
 579     char *other_last = NULL;    /* other substr checked before this */
 580     char *check_at = NULL;              /* check substr found at this pos */
 581     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 582     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 583     RXi_GET_DECL(prog,progi);
 584 #ifdef DEBUGGING
 585     const char * const i_strpos = strpos;
 586 #endif
 587     GET_RE_DEBUG_FLAGS_DECL;
 588
 589     PERL_ARGS_ASSERT_RE_INTUIT_START;
 590     PERL_UNUSED_ARG(flags);
 591     PERL_UNUSED_ARG(data);
 592
 593     RX_MATCH_UTF8_set(rx,utf8_target);
 594
 595     if (RX_UTF8(rx)) {
 596         PL_reg_flags |= RF_utf8;
 597     }
 598     DEBUG_EXECUTE_r(
 599         debug_start_match(rx, utf8_target, strpos, strend,
 600             sv ? "Guessing start of match in sv for"
 601                : "Guessing start of match in string for");
 602               );
 603
 604     /* CHR_DIST() would be more correct here but it makes things slow. */
 605     if (prog->minlen > strend - strpos) {
 606         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 607                               "String too short... [re_intuit_start]\n"));
 608         goto fail;
 609     }
 610
 611     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 612     PL_regeol = strend;
 613     if (utf8_target) {
 614         if (!prog->check_utf8 && prog->check_substr)
 615             to_utf8_substr(prog);
 616         check = prog->check_utf8;
 617     } else {
 618         if (!prog->check_substr && prog->check_utf8)
 619             to_byte_substr(prog);
 620         check = prog->check_substr;
 621     }
 622     if (check == &PL_sv_undef) {
 623         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 624                 "Non-utf8 string cannot match utf8 check string\n"));
 625         goto fail;
 626     }
 627     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 628         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 629                      || ( (prog->extflags & RXf_ANCH_BOL)
 630                           && !multiline ) );    /* Check after \n? */
 631
 632         if (!ml_anch) {
 633           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 634                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 635                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 636                && sv && !SvROK(sv)
 637                && (strpos != strbeg)) {
 638               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 639               goto fail;
 640           }
 641           if (prog->check_offset_min == prog->check_offset_max &&
 642               !(prog->extflags & RXf_CANY_SEEN)) {
 643             /* Substring at constant offset from beg-of-str... */
 644             I32 slen;
 645
 646             s = HOP3c(strpos, prog->check_offset_min, strend);
 647
 648             if (SvTAIL(check)) {
 649                 slen = SvCUR(check);    /* >= 1 */
 650
 651                 if ( strend - s > slen || strend - s < slen - 1
 652                      || (strend - s == slen && strend[-1] != '\n')) {
 653                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 654                     goto fail_finish;
 655                 }
 656                 /* Now should match s[0..slen-2] */
 657                 slen--;
 658                 if (slen && (*SvPVX_const(check) != *s
 659                              || (slen > 1
 660                                  && memNE(SvPVX_const(check), s, slen)))) {
 661                   report_neq:
 662                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 663                     goto fail_finish;
 664                 }
 665             }
 666             else if (*SvPVX_const(check) != *s
 667                      || ((slen = SvCUR(check)) > 1
 668                          && memNE(SvPVX_const(check), s, slen)))
 669                 goto report_neq;
 670             check_at = s;
 671             goto success_at_start;
 672           }
 673         }
 674         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 675         s = strpos;
 676         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 677         end_shift = prog->check_end_shift;
 678
 679         if (!ml_anch) {
 680             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 681                                          - (SvTAIL(check) != 0);
 682             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 683
 684             if (end_shift < eshift)
 685                 end_shift = eshift;
 686         }
 687     }
 688     else {                              /* Can match at random position */
 689         ml_anch = 0;
 690         s = strpos;
 691         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 692         end_shift = prog->check_end_shift;
 693
 694         /* end shift should be non negative here */
 695     }
 696
 697 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 698     if (end_shift < 0)
 699         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 700                    (IV)end_shift, RX_PRECOMP(prog));
 701 #endif
 702
 703   restart:
 704     /* Find a possible match in the region s..strend by looking for
 705        the "check" substring in the region corrected by start/end_shift. */
 706
 707     {
 708         I32 srch_start_shift = start_shift;
 709         I32 srch_end_shift = end_shift;
 710         U8* start_point;
 711         U8* end_point;
 712         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 713             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 714             srch_start_shift = strbeg - s;
 715         }
 716     DEBUG_OPTIMISE_MORE_r({
 717         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 718             (IV)prog->check_offset_min,
 719             (IV)srch_start_shift,
 720             (IV)srch_end_shift,
 721             (IV)prog->check_end_shift);
 722     });
 723
 724         if (prog->extflags & RXf_CANY_SEEN) {
 725             start_point= (U8*)(s + srch_start_shift);
 726             end_point= (U8*)(strend - srch_end_shift);
 727         } else {
 728             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 729             end_point= HOP3(strend, -srch_end_shift, strbeg);
 730         }
 731         DEBUG_OPTIMISE_MORE_r({
 732             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 733                 (int)(end_point - start_point),
 734                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 735                 start_point);
 736         });
 737
 738         s = fbm_instr( start_point, end_point,
 739                       check, multiline ? FBMrf_MULTILINE : 0);
 740     }
 741     /* Update the count-of-usability, remove useless subpatterns,
 742         unshift s.  */
 743
 744     DEBUG_EXECUTE_r({
 745         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 746             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 747         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 748                           (s ? "Found" : "Did not find"),
 749             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 750                 ? "anchored" : "floating"),
 751             quoted,
 752             RE_SV_TAIL(check),
 753             (s ? " at offset " : "...\n") );
 754     });
 755
 756     if (!s)
 757         goto fail_finish;
 758     /* Finish the diagnostic message */
 759     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 760
 761     /* XXX dmq: first branch is for positive lookbehind...
 762        Our check string is offset from the beginning of the pattern.
 763        So we need to do any stclass tests offset forward from that
 764        point. I think. :-(
 765      */
 766
 767
 768
 769     check_at=s;
 770
 771
 772     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 773        Start with the other substr.
 774        XXXX no SCREAM optimization yet - and a very coarse implementation
 775        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 776                 *always* match.  Probably should be marked during compile...
 777        Probably it is right to do no SCREAM here...
 778      */
 779
 780     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 781                 : (prog->float_substr && prog->anchored_substr))
 782     {
 783         /* Take into account the "other" substring. */
 784         /* XXXX May be hopelessly wrong for UTF... */
 785         if (!other_last)
 786             other_last = strpos;
 787         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 788           do_other_anchored:
 789             {
 790                 char * const last = HOP3c(s, -start_shift, strbeg);
 791                 char *last1, *last2;
 792                 char * const saved_s = s;
 793                 SV* must;
 794
 795                 t = s - prog->check_offset_max;
 796                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 797                     && (!utf8_target
 798                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 799                             && t > strpos)))
 800                     NOOP;
 801                 else
 802                     t = strpos;
 803                 t = HOP3c(t, prog->anchored_offset, strend);
 804                 if (t < other_last)     /* These positions already checked */
 805                     t = other_last;
 806                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 807                 if (last < last1)
 808                     last1 = last;
 809                 /* XXXX It is not documented what units *_offsets are in.
 810                    We assume bytes, but this is clearly wrong.
 811                    Meaning this code needs to be carefully reviewed for errors.
 812                    dmq.
 813                   */
 814
 815                 /* On end-of-str: see comment below. */
 816                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 817                 if (must == &PL_sv_undef) {
 818                     s = (char*)NULL;
 819                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 820                 }
 821                 else
 822                     s = fbm_instr(
 823                         (unsigned char*)t,
 824                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 825                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 826                         must,
 827                         multiline ? FBMrf_MULTILINE : 0
 828                     );
 829                 DEBUG_EXECUTE_r({
 830                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 831                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 832                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 833                         (s ? "Found" : "Contradicts"),
 834                         quoted, RE_SV_TAIL(must));
 835                 });
 836
 837
 838                 if (!s) {
 839                     if (last1 >= last2) {
 840                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 841                                                 ", giving up...\n"));
 842                         goto fail_finish;
 843                     }
 844                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 845                         ", trying floating at offset %ld...\n",
 846                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 847                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 848                     s = HOP3c(last, 1, strend);
 849                     goto restart;
 850                 }
 851                 else {
 852                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 853                           (long)(s - i_strpos)));
 854                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 855                     other_last = HOP3c(s, 1, strend);
 856                     s = saved_s;
 857                     if (t == strpos)
 858                         goto try_at_start;
 859                     goto try_at_offset;
 860                 }
 861             }
 862         }
 863         else {          /* Take into account the floating substring. */
 864             char *last, *last1;
 865             char * const saved_s = s;
 866             SV* must;
 867
 868             t = HOP3c(s, -start_shift, strbeg);
 869             last1 = last =
 870                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 871             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 872                 last = HOP3c(t, prog->float_max_offset, strend);
 873             s = HOP3c(t, prog->float_min_offset, strend);
 874             if (s < other_last)
 875                 s = other_last;
 876  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 877             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 878             /* fbm_instr() takes into account exact value of end-of-str
 879                if the check is SvTAIL(ed).  Since false positives are OK,
 880                and end-of-str is not later than strend we are OK. */
 881             if (must == &PL_sv_undef) {
 882                 s = (char*)NULL;
 883                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 884             }
 885             else
 886                 s = fbm_instr((unsigned char*)s,
 887                               (unsigned char*)last + SvCUR(must)
 888                                   - (SvTAIL(must)!=0),
 889                               must, multiline ? FBMrf_MULTILINE : 0);
 890             DEBUG_EXECUTE_r({
 891                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 892                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 893                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 894                     (s ? "Found" : "Contradicts"),
 895                     quoted, RE_SV_TAIL(must));
 896             });
 897             if (!s) {
 898                 if (last1 == last) {
 899                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 900                                             ", giving up...\n"));
 901                     goto fail_finish;
 902                 }
 903                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 904                     ", trying anchored starting at offset %ld...\n",
 905                     (long)(saved_s + 1 - i_strpos)));
 906                 other_last = last;
 907                 s = HOP3c(t, 1, strend);
 908                 goto restart;
 909             }
 910             else {
 911                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 912                       (long)(s - i_strpos)));
 913                 other_last = s; /* Fix this later. --Hugo */
 914                 s = saved_s;
 915                 if (t == strpos)
 916                     goto try_at_start;
 917                 goto try_at_offset;
 918             }
 919         }
 920     }
 921
 922
 923     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 924
 925     DEBUG_OPTIMISE_MORE_r(
 926         PerlIO_printf(Perl_debug_log,
 927             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 928             (IV)prog->check_offset_min,
 929             (IV)prog->check_offset_max,
 930             (IV)(s-strpos),
 931             (IV)(t-strpos),
 932             (IV)(t-s),
 933             (IV)(strend-strpos)
 934         )
 935     );
 936
 937     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 938         && (!utf8_target
 939             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 940                  && t > strpos)))
 941     {
 942         /* Fixed substring is found far enough so that the match
 943            cannot start at strpos. */
 944       try_at_offset:
 945         if (ml_anch && t[-1] != '\n') {
 946             /* Eventually fbm_*() should handle this, but often
 947                anchored_offset is not 0, so this check will not be wasted. */
 948             /* XXXX In the code below we prefer to look for "^" even in
 949                presence of anchored substrings.  And we search even
 950                beyond the found float position.  These pessimizations
 951                are historical artefacts only.  */
 952           find_anchor:
 953             while (t < strend - prog->minlen) {
 954                 if (*t == '\n') {
 955                     if (t < check_at - prog->check_offset_min) {
 956                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 957                             /* Since we moved from the found position,
 958                                we definitely contradict the found anchored
 959                                substr.  Due to the above check we do not
 960                                contradict "check" substr.
 961                                Thus we can arrive here only if check substr
 962                                is float.  Redo checking for "other"=="fixed".
 963                              */
 964                             strpos = t + 1;
 965                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 966                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 967                             goto do_other_anchored;
 968                         }
 969                         /* We don't contradict the found floating substring. */
 970                         /* XXXX Why not check for STCLASS? */
 971                         s = t + 1;
 972                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 973                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 974                         goto set_useful;
 975                     }
 976                     /* Position contradicts check-string */
 977                     /* XXXX probably better to look for check-string
 978                        than for "\n", so one should lower the limit for t? */
 979                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 980                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 981                     other_last = strpos = s = t + 1;
 982                     goto restart;
 983                 }
 984                 t++;
 985             }
 986             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 987                         PL_colors[0], PL_colors[1]));
 988             goto fail_finish;
 989         }
 990         else {
 991             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 992                         PL_colors[0], PL_colors[1]));
 993         }
 994         s = t;
 995       set_useful:
 996         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 997     }
 998     else {
 999         /* The found string does not prohibit matching at strpos,
1000            - no optimization of calling REx engine can be performed,
1001            unless it was an MBOL and we are not after MBOL,
1002            or a future STCLASS check will fail this. */
1003       try_at_start:
1004         /* Even in this situation we may use MBOL flag if strpos is offset
1005            wrt the start of the string. */
1006         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1007             && (strpos != strbeg) && strpos[-1] != '\n'
1008             /* May be due to an implicit anchor of m{.*foo}  */
1009             && !(prog->intflags & PREGf_IMPLICIT))
1010         {
1011             t = strpos;
1012             goto find_anchor;
1013         }
1014         DEBUG_EXECUTE_r( if (ml_anch)
1015             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1016                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1017         );
1018       success_at_start:
1019         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1020             && (utf8_target ? (
1021                 prog->check_utf8                /* Could be deleted already */
1022                 && --BmUSEFUL(prog->check_utf8) < 0
1023                 && (prog->check_utf8 == prog->float_utf8)
1024             ) : (
1025                 prog->check_substr              /* Could be deleted already */
1026                 && --BmUSEFUL(prog->check_substr) < 0
1027                 && (prog->check_substr == prog->float_substr)
1028             )))
1029         {
1030             /* If flags & SOMETHING - do not do it many times on the same match */
1031             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1032             /* XXX Does the destruction order has to change with utf8_target? */
1033             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1034             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1035             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1036             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1037             check = NULL;                       /* abort */
1038             s = strpos;
1039             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1040                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1041             if (prog->intflags & PREGf_IMPLICIT)
1042                 prog->extflags &= ~RXf_ANCH_MBOL;
1043             /* XXXX This is a remnant of the old implementation.  It
1044                     looks wasteful, since now INTUIT can use many
1045                     other heuristics. */
1046             prog->extflags &= ~RXf_USE_INTUIT;
1047             /* XXXX What other flags might need to be cleared in this branch? */
1048         }
1049         else
1050             s = strpos;
1051     }
1052
1053     /* Last resort... */
1054     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1055     /* trie stclasses are too expensive to use here, we are better off to
1056        leave it to regmatch itself */
1057     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1058         /* minlen == 0 is possible if regstclass is \b or \B,
1059            and the fixed substr is ''$.
1060            Since minlen is already taken into account, s+1 is before strend;
1061            accidentally, minlen >= 1 guaranties no false positives at s + 1
1062            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1063            regstclass does not come from lookahead...  */
1064         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1065            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1066         const U8* const str = (U8*)STRING(progi->regstclass);
1067         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1068                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1069                     : 1);
1070         char * endpos;
1071         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1072             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1073         else if (prog->float_substr || prog->float_utf8)
1074             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1075         else
1076             endpos= strend;
1077
1078         if (checked_upto < s)
1079            checked_upto = s;
1080         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1081                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1082
1083         t = s;
1084         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1085         if (s) {
1086             checked_upto = s;
1087         } else {
1088 #ifdef DEBUGGING
1089             const char *what = NULL;
1090 #endif
1091             if (endpos == strend) {
1092                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1093                                 "Could not match STCLASS...\n") );
1094                 goto fail;
1095             }
1096             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097                                    "This position contradicts STCLASS...\n") );
1098             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1099                 goto fail;
1100             checked_upto = HOPBACKc(endpos, start_shift);
1101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1102                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1103             /* Contradict one of substrings */
1104             if (prog->anchored_substr || prog->anchored_utf8) {
1105                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1106                     DEBUG_EXECUTE_r( what = "anchored" );
1107                   hop_and_restart:
1108                     s = HOP3c(t, 1, strend);
1109                     if (s + start_shift + end_shift > strend) {
1110                         /* XXXX Should be taken into account earlier? */
1111                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1112                                                "Could not match STCLASS...\n") );
1113                         goto fail;
1114                     }
1115                     if (!check)
1116                         goto giveup;
1117                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1118                                 "Looking for %s substr starting at offset %ld...\n",
1119                                  what, (long)(s + start_shift - i_strpos)) );
1120                     goto restart;
1121                 }
1122                 /* Have both, check_string is floating */
1123                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1124                     goto retry_floating_check;
1125                 /* Recheck anchored substring, but not floating... */
1126                 s = check_at;
1127                 if (!check)
1128                     goto giveup;
1129                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1130                           "Looking for anchored substr starting at offset %ld...\n",
1131                           (long)(other_last - i_strpos)) );
1132                 goto do_other_anchored;
1133             }
1134             /* Another way we could have checked stclass at the
1135                current position only: */
1136             if (ml_anch) {
1137                 s = t = t + 1;
1138                 if (!check)
1139                     goto giveup;
1140                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141                           "Looking for /%s^%s/m starting at offset %ld...\n",
1142                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1143                 goto try_at_offset;
1144             }
1145             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1146                 goto fail;
1147             /* Check is floating substring. */
1148           retry_floating_check:
1149             t = check_at - start_shift;
1150             DEBUG_EXECUTE_r( what = "floating" );
1151             goto hop_and_restart;
1152         }
1153         if (t != s) {
1154             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155                         "By STCLASS: moving %ld --> %ld\n",
1156                                   (long)(t - i_strpos), (long)(s - i_strpos))
1157                    );
1158         }
1159         else {
1160             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1161                                   "Does not contradict STCLASS...\n");
1162                    );
1163         }
1164     }
1165   giveup:
1166     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1167                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1168                           PL_colors[5], (long)(s - i_strpos)) );
1169     return s;
1170
1171   fail_finish:                          /* Substring not found */
1172     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1173         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1174   fail:
1175     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1176                           PL_colors[4], PL_colors[5]));
1177     return NULL;
1178 }
1179
1180 #define DECL_TRIE_TYPE(scan) \
1181     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1182                     trie_type = ((scan->flags == EXACT) \
1183                               ? (utf8_target ? trie_utf8 : trie_plain) \
1184                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1185
1186 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1187 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1188     STRLEN skiplen;                                                                 \
1189     switch (trie_type) {                                                            \
1190     case trie_utf8_fold:                                                            \
1191         if ( foldlen>0 ) {                                                          \
1192             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1193             foldlen -= len;                                                         \
1194             uscan += len;                                                           \
1195             len=0;                                                                  \
1196         } else {                                                                    \
1197             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1198             len = UTF8SKIP(uc);                                                     \
1199             skiplen = UNISKIP( uvc );                                               \
1200             foldlen -= skiplen;                                                     \
1201             uscan = foldbuf + skiplen;                                              \
1202         }                                                                           \
1203         break;                                                                      \
1204     case trie_latin_utf8_fold:                                                      \
1205         if ( foldlen>0 ) {                                                          \
1206             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1207             foldlen -= len;                                                         \
1208             uscan += len;                                                           \
1209             len=0;                                                                  \
1210         } else {                                                                    \
1211             len = 1;                                                                \
1212             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1213             skiplen = UNISKIP( uvc );                                               \
1214             foldlen -= skiplen;                                                     \
1215             uscan = foldbuf + skiplen;                                              \
1216         }                                                                           \
1217         break;                                                                      \
1218     case trie_utf8:                                                                 \
1219         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1220         break;                                                                      \
1221     case trie_plain:                                                                \
1222         uvc = (UV)*uc;                                                              \
1223         len = 1;                                                                    \
1224     }                                                                               \
1225     if (uvc < 256) {                                                                \
1226         charid = trie->charmap[ uvc ];                                              \
1227     }                                                                               \
1228     else {                                                                          \
1229         charid = 0;                                                                 \
1230         if (widecharmap) {                                                          \
1231             SV** const svpp = hv_fetch(widecharmap,                                 \
1232                         (char*)&uvc, sizeof(UV), 0);                                \
1233             if (svpp)                                                               \
1234                 charid = (U16)SvIV(*svpp);                                          \
1235         }                                                                           \
1236     }                                                                               \
1237 } STMT_END
1238
1239 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1240 STMT_START {                                              \
1241     while (s <= e) {                                      \
1242         if ( (CoNd)                                       \
1243              && (ln == 1 || folder(s, pat_string, ln))    \
1244              && (!reginfo || regtry(reginfo, &s)) )       \
1245             goto got_it;                                  \
1246         s++;                                              \
1247     }                                                     \
1248 } STMT_END
1249
1250 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1251 STMT_START {                                          \
1252     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1253         CoDe                                          \
1254         s += uskip;                                   \
1255     }                                                 \
1256 } STMT_END
1257
1258 #define REXEC_FBC_SCAN(CoDe)                          \
1259 STMT_START {                                          \
1260     while (s < strend) {                              \
1261         CoDe                                          \
1262         s++;                                          \
1263     }                                                 \
1264 } STMT_END
1265
1266 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1267 REXEC_FBC_UTF8_SCAN(                                  \
1268     if (CoNd) {                                       \
1269         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1270             goto got_it;                              \
1271         else                                          \
1272             tmp = doevery;                            \
1273     }                                                 \
1274     else                                              \
1275         tmp = 1;                                      \
1276 )
1277
1278 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1279 REXEC_FBC_SCAN(                                       \
1280     if (CoNd) {                                       \
1281         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1282             goto got_it;                              \
1283         else                                          \
1284             tmp = doevery;                            \
1285     }                                                 \
1286     else                                              \
1287         tmp = 1;                                      \
1288 )
1289
1290 #define REXEC_FBC_TRYIT               \
1291 if ((!reginfo || regtry(reginfo, &s))) \
1292     goto got_it
1293
1294 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1295     if (utf8_target) {                                             \
1296         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1297     }                                                          \
1298     else {                                                     \
1299         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1300     }
1301
1302 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1303     if (utf8_target) {                                             \
1304         UtFpReLoAd;                                            \
1305         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1306     }                                                          \
1307     else {                                                     \
1308         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1309     }
1310
1311 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1312     PL_reg_flags |= RF_tainted;                                \
1313     if (utf8_target) {                                             \
1314         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1315     }                                                          \
1316     else {                                                     \
1317         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1318     }
1319
1320 #define DUMP_EXEC_POS(li,s,doutf8) \
1321     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1322
1323
1324 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1325         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1326         tmp = TEST_NON_UTF8(tmp);                                              \
1327         REXEC_FBC_UTF8_SCAN(                                                   \
1328             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1329                 tmp = !tmp;                                                    \
1330                 IF_SUCCESS;                                                    \
1331             }                                                                  \
1332             else {                                                             \
1333                 IF_FAIL;                                                       \
1334             }                                                                  \
1335         );                                                                     \
1336
1337 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1338         if (s == PL_bostr) {                                                   \
1339             tmp = '\n';                                                        \
1340         }                                                                      \
1341         else {                                                                 \
1342             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1343             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1344         }                                                                      \
1345         tmp = TeSt1_UtF8;                                                      \
1346         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1347         REXEC_FBC_UTF8_SCAN(                                                   \
1348             if (tmp == ! (TeSt2_UtF8)) { \
1349                 tmp = !tmp;                                                    \
1350                 IF_SUCCESS;                                                    \
1351             }                                                                  \
1352             else {                                                             \
1353                 IF_FAIL;                                                       \
1354             }                                                                  \
1355         );                                                                     \
1356
1357 /* The only difference between the BOUND and NBOUND cases is that
1358  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1359  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1360  * with the other one being empty */
1361 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1362     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1363
1364 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1365     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1366
1367 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1368     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1369
1370 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1371     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1372
1373
1374 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1375  * be passed in completely with the variable name being tested, which isn't
1376  * such a clean interface, but this is easier to read than it was before.  We
1377  * are looking for the boundary (or non-boundary between a word and non-word
1378  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1379  * must be different.  Find the "wordness" of the character just prior to this
1380  * one, and compare it with the wordness of this one.  If they differ, we have
1381  * a boundary.  At the beginning of the string, pretend that the previous
1382  * character was a new-line */
1383 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1384     if (utf8_target) {                                                         \
1385                 UTF8_CODE \
1386     }                                                                          \
1387     else {  /* Not utf8 */                                                     \
1388         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1389         tmp = TEST_NON_UTF8(tmp);                                              \
1390         REXEC_FBC_SCAN(                                                        \
1391             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1392                 tmp = !tmp;                                                    \
1393                 IF_SUCCESS;                                                    \
1394             }                                                                  \
1395             else {                                                             \
1396                 IF_FAIL;                                                       \
1397             }                                                                  \
1398         );                                                                     \
1399     }                                                                          \
1400     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1401         goto got_it;
1402
1403 /* We know what class REx starts with.  Try to find this position... */
1404 /* if reginfo is NULL, its a dryrun */
1405 /* annoyingly all the vars in this routine have different names from their counterparts
1406    in regmatch. /grrr */
1407
1408 STATIC char *
1409 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1410     const char *strend, regmatch_info *reginfo)
1411 {
1412         dVAR;
1413         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1414         char *pat_string;   /* The pattern's exactish string */
1415         char *pat_end;      /* ptr to end char of pat_string */
1416         re_fold_t folder;       /* Function for computing non-utf8 folds */
1417         const U8 *fold_array;   /* array for folding ords < 256 */
1418         STRLEN ln;
1419         STRLEN lnc;
1420         STRLEN uskip;
1421         U8 c1;
1422         U8 c2;
1423         char *e;
1424         I32 tmp = 1;    /* Scratch variable? */
1425         const bool utf8_target = PL_reg_match_utf8;
1426         UV utf8_fold_flags = 0;
1427         RXi_GET_DECL(prog,progi);
1428
1429         PERL_ARGS_ASSERT_FIND_BYCLASS;
1430
1431         /* We know what class it must start with. */
1432         switch (OP(c)) {
1433         case ANYOFV:
1434         case ANYOF:
1435             if (utf8_target || OP(c) == ANYOFV) {
1436                 STRLEN inclasslen = strend - s;
1437                 REXEC_FBC_UTF8_CLASS_SCAN(
1438                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1439             }
1440             else {
1441                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1442             }
1443             break;
1444         case CANY:
1445             REXEC_FBC_SCAN(
1446                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1447                     goto got_it;
1448                 else
1449                     tmp = doevery;
1450             );
1451             break;
1452
1453         case EXACTFA:
1454             if (UTF_PATTERN || utf8_target) {
1455                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1456                 goto do_exactf_utf8;
1457             }
1458             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1459             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1460             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1461
1462         case EXACTF:
1463             if (utf8_target) {
1464
1465                 /* regcomp.c already folded this if pattern is in UTF-8 */
1466                 utf8_fold_flags = 0;
1467                 goto do_exactf_utf8;
1468             }
1469             fold_array = PL_fold;
1470             folder = foldEQ;
1471             goto do_exactf_non_utf8;
1472
1473         case EXACTFL:
1474             if (UTF_PATTERN || utf8_target) {
1475                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1476                 goto do_exactf_utf8;
1477             }
1478             fold_array = PL_fold_locale;
1479             folder = foldEQ_locale;
1480             goto do_exactf_non_utf8;
1481
1482         case EXACTFU_SS:
1483             if (UTF_PATTERN) {
1484                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1485             }
1486             goto do_exactf_utf8;
1487
1488         case EXACTFU_TRICKYFOLD:
1489         case EXACTFU:
1490             if (UTF_PATTERN || utf8_target) {
1491                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1492                 goto do_exactf_utf8;
1493             }
1494
1495             /* Any 'ss' in the pattern should have been replaced by regcomp,
1496              * so we don't have to worry here about this single special case
1497              * in the Latin1 range */
1498             fold_array = PL_fold_latin1;
1499             folder = foldEQ_latin1;
1500
1501             /* FALL THROUGH */
1502
1503         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1504                                are no glitches with fold-length differences
1505                                between the target string and pattern */
1506
1507             /* The idea in the non-utf8 EXACTF* cases is to first find the
1508              * first character of the EXACTF* node and then, if necessary,
1509              * case-insensitively compare the full text of the node.  c1 is the
1510              * first character.  c2 is its fold.  This logic will not work for
1511              * Unicode semantics and the german sharp ss, which hence should
1512              * not be compiled into a node that gets here. */
1513             pat_string = STRING(c);
1514             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1515
1516             /* We know that we have to match at least 'ln' bytes (which is the
1517              * same as characters, since not utf8).  If we have to match 3
1518              * characters, and there are only 2 availabe, we know without
1519              * trying that it will fail; so don't start a match past the
1520              * required minimum number from the far end */
1521             e = HOP3c(strend, -((I32)ln), s);
1522
1523             if (!reginfo && e < s) {
1524                 e = s;                  /* Due to minlen logic of intuit() */
1525             }
1526
1527             c1 = *pat_string;
1528             c2 = fold_array[c1];
1529             if (c1 == c2) { /* If char and fold are the same */
1530                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1531             }
1532             else {
1533                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1534             }
1535             break;
1536
1537         do_exactf_utf8:
1538         {
1539             unsigned expansion;
1540
1541
1542             /* If one of the operands is in utf8, we can't use the simpler
1543              * folding above, due to the fact that many different characters
1544              * can have the same fold, or portion of a fold, or different-
1545              * length fold */
1546             pat_string = STRING(c);
1547             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1548             pat_end = pat_string + ln;
1549             lnc = (UTF_PATTERN) /* length to match in characters */
1550                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1551                     : ln;
1552
1553             /* We have 'lnc' characters to match in the pattern, but because of
1554              * multi-character folding, each character in the target can match
1555              * up to 3 characters (Unicode guarantees it will never exceed
1556              * this) if it is utf8-encoded; and up to 2 if not (based on the
1557              * fact that the Latin 1 folds are already determined, and the
1558              * only multi-char fold in that range is the sharp-s folding to
1559              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1560              * string character.  Adjust lnc accordingly, rounding up, so that
1561              * if we need to match at least 4+1/3 chars, that really is 5. */
1562             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1563             lnc = (lnc + expansion - 1) / expansion;
1564
1565             /* As in the non-UTF8 case, if we have to match 3 characters, and
1566              * only 2 are left, it's guaranteed to fail, so don't start a
1567              * match that would require us to go beyond the end of the string
1568              */
1569             e = HOP3c(strend, -((I32)lnc), s);
1570
1571             if (!reginfo && e < s) {
1572                 e = s;                  /* Due to minlen logic of intuit() */
1573             }
1574
1575             /* XXX Note that we could recalculate e to stop the loop earlier,
1576              * as the worst case expansion above will rarely be met, and as we
1577              * go along we would usually find that e moves further to the left.
1578              * This would happen only after we reached the point in the loop
1579              * where if there were no expansion we should fail.  Unclear if
1580              * worth the expense */
1581
1582             while (s <= e) {
1583                 char *my_strend= (char *)strend;
1584                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1585                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1586                     && (!reginfo || regtry(reginfo, &s)) )
1587                 {
1588                     goto got_it;
1589                 }
1590                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1591             }
1592             break;
1593         }
1594         case BOUNDL:
1595             PL_reg_flags |= RF_tainted;
1596             FBC_BOUND(isALNUM_LC,
1597                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1598                       isALNUM_LC_utf8((U8*)s));
1599             break;
1600         case NBOUNDL:
1601             PL_reg_flags |= RF_tainted;
1602             FBC_NBOUND(isALNUM_LC,
1603                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1604                        isALNUM_LC_utf8((U8*)s));
1605             break;
1606         case BOUND:
1607             FBC_BOUND(isWORDCHAR,
1608                       isALNUM_uni(tmp),
1609                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1610             break;
1611         case BOUNDA:
1612             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1613                              isWORDCHAR_A(tmp),
1614                              isWORDCHAR_A((U8*)s));
1615             break;
1616         case NBOUND:
1617             FBC_NBOUND(isWORDCHAR,
1618                        isALNUM_uni(tmp),
1619                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1620             break;
1621         case NBOUNDA:
1622             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1623                               isWORDCHAR_A(tmp),
1624                               isWORDCHAR_A((U8*)s));
1625             break;
1626         case BOUNDU:
1627             FBC_BOUND(isWORDCHAR_L1,
1628                       isALNUM_uni(tmp),
1629                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1630             break;
1631         case NBOUNDU:
1632             FBC_NBOUND(isWORDCHAR_L1,
1633                        isALNUM_uni(tmp),
1634                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1635             break;
1636         case ALNUML:
1637             REXEC_FBC_CSCAN_TAINT(
1638                 isALNUM_LC_utf8((U8*)s),
1639                 isALNUM_LC(*s)
1640             );
1641             break;
1642         case ALNUMU:
1643             REXEC_FBC_CSCAN_PRELOAD(
1644                 LOAD_UTF8_CHARCLASS_ALNUM(),
1645                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1646                 isWORDCHAR_L1((U8) *s)
1647             );
1648             break;
1649         case ALNUM:
1650             REXEC_FBC_CSCAN_PRELOAD(
1651                 LOAD_UTF8_CHARCLASS_ALNUM(),
1652                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1653                 isWORDCHAR((U8) *s)
1654             );
1655             break;
1656         case ALNUMA:
1657             /* Don't need to worry about utf8, as it can match only a single
1658              * byte invariant character */
1659             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1660             break;
1661         case NALNUMU:
1662             REXEC_FBC_CSCAN_PRELOAD(
1663                 LOAD_UTF8_CHARCLASS_ALNUM(),
1664                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1665                 ! isWORDCHAR_L1((U8) *s)
1666             );
1667             break;
1668         case NALNUM:
1669             REXEC_FBC_CSCAN_PRELOAD(
1670                 LOAD_UTF8_CHARCLASS_ALNUM(),
1671                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1672                 ! isALNUM(*s)
1673             );
1674             break;
1675         case NALNUMA:
1676             REXEC_FBC_CSCAN(
1677                 !isWORDCHAR_A(*s),
1678                 !isWORDCHAR_A(*s)
1679             );
1680             break;
1681         case NALNUML:
1682             REXEC_FBC_CSCAN_TAINT(
1683                 !isALNUM_LC_utf8((U8*)s),
1684                 !isALNUM_LC(*s)
1685             );
1686             break;
1687         case SPACEU:
1688             REXEC_FBC_CSCAN_PRELOAD(
1689                 LOAD_UTF8_CHARCLASS_SPACE(),
1690                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1691                 isSPACE_L1((U8) *s)
1692             );
1693             break;
1694         case SPACE:
1695             REXEC_FBC_CSCAN_PRELOAD(
1696                 LOAD_UTF8_CHARCLASS_SPACE(),
1697                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1698                 isSPACE((U8) *s)
1699             );
1700             break;
1701         case SPACEA:
1702             /* Don't need to worry about utf8, as it can match only a single
1703              * byte invariant character */
1704             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1705             break;
1706         case SPACEL:
1707             REXEC_FBC_CSCAN_TAINT(
1708                 isSPACE_LC_utf8((U8*)s),
1709                 isSPACE_LC(*s)
1710             );
1711             break;
1712         case NSPACEU:
1713             REXEC_FBC_CSCAN_PRELOAD(
1714                 LOAD_UTF8_CHARCLASS_SPACE(),
1715                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1716                 ! isSPACE_L1((U8) *s)
1717             );
1718             break;
1719         case NSPACE:
1720             REXEC_FBC_CSCAN_PRELOAD(
1721                 LOAD_UTF8_CHARCLASS_SPACE(),
1722                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1723                 ! isSPACE((U8) *s)
1724             );
1725             break;
1726         case NSPACEA:
1727             REXEC_FBC_CSCAN(
1728                 !isSPACE_A(*s),
1729                 !isSPACE_A(*s)
1730             );
1731             break;
1732         case NSPACEL:
1733             REXEC_FBC_CSCAN_TAINT(
1734                 !isSPACE_LC_utf8((U8*)s),
1735                 !isSPACE_LC(*s)
1736             );
1737             break;
1738         case DIGIT:
1739             REXEC_FBC_CSCAN_PRELOAD(
1740                 LOAD_UTF8_CHARCLASS_DIGIT(),
1741                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1742                 isDIGIT(*s)
1743             );
1744             break;
1745         case DIGITA:
1746             /* Don't need to worry about utf8, as it can match only a single
1747              * byte invariant character */
1748             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1749             break;
1750         case DIGITL:
1751             REXEC_FBC_CSCAN_TAINT(
1752                 isDIGIT_LC_utf8((U8*)s),
1753                 isDIGIT_LC(*s)
1754             );
1755             break;
1756         case NDIGIT:
1757             REXEC_FBC_CSCAN_PRELOAD(
1758                 LOAD_UTF8_CHARCLASS_DIGIT(),
1759                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1760                 !isDIGIT(*s)
1761             );
1762             break;
1763         case NDIGITA:
1764             REXEC_FBC_CSCAN(
1765                 !isDIGIT_A(*s),
1766                 !isDIGIT_A(*s)
1767             );
1768             break;
1769         case NDIGITL:
1770             REXEC_FBC_CSCAN_TAINT(
1771                 !isDIGIT_LC_utf8((U8*)s),
1772                 !isDIGIT_LC(*s)
1773             );
1774             break;
1775         case LNBREAK:
1776             REXEC_FBC_CSCAN(
1777                 is_LNBREAK_utf8(s),
1778                 is_LNBREAK_latin1(s)
1779             );
1780             break;
1781         case VERTWS:
1782             REXEC_FBC_CSCAN(
1783                 is_VERTWS_utf8(s),
1784                 is_VERTWS_latin1(s)
1785             );
1786             break;
1787         case NVERTWS:
1788             REXEC_FBC_CSCAN(
1789                 !is_VERTWS_utf8(s),
1790                 !is_VERTWS_latin1(s)
1791             );
1792             break;
1793         case HORIZWS:
1794             REXEC_FBC_CSCAN(
1795                 is_HORIZWS_utf8(s),
1796                 is_HORIZWS_latin1(s)
1797             );
1798             break;
1799         case NHORIZWS:
1800             REXEC_FBC_CSCAN(
1801                 !is_HORIZWS_utf8(s),
1802                 !is_HORIZWS_latin1(s)
1803             );
1804             break;
1805         case POSIXA:
1806             /* Don't need to worry about utf8, as it can match only a single
1807             * byte invariant character.  The flag in this node type is the
1808             * class number to pass to _generic_isCC() to build a mask for
1809             * searching in PL_charclass[] */
1810             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1811             break;
1812         case NPOSIXA:
1813             REXEC_FBC_CSCAN(
1814                 !_generic_isCC_A(*s, FLAGS(c)),
1815                 !_generic_isCC_A(*s, FLAGS(c))
1816             );
1817             break;
1818
1819         case AHOCORASICKC:
1820         case AHOCORASICK:
1821             {
1822                 DECL_TRIE_TYPE(c);
1823                 /* what trie are we using right now */
1824                 reg_ac_data *aho
1825                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1826                 reg_trie_data *trie
1827                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1828                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1829
1830                 const char *last_start = strend - trie->minlen;
1831 #ifdef DEBUGGING
1832                 const char *real_start = s;
1833 #endif
1834                 STRLEN maxlen = trie->maxlen;
1835                 SV *sv_points;
1836                 U8 **points; /* map of where we were in the input string
1837                                 when reading a given char. For ASCII this
1838                                 is unnecessary overhead as the relationship
1839                                 is always 1:1, but for Unicode, especially
1840                                 case folded Unicode this is not true. */
1841                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1842                 U8 *bitmap=NULL;
1843
1844
1845                 GET_RE_DEBUG_FLAGS_DECL;
1846
1847                 /* We can't just allocate points here. We need to wrap it in
1848                  * an SV so it gets freed properly if there is a croak while
1849                  * running the match */
1850                 ENTER;
1851                 SAVETMPS;
1852                 sv_points=newSV(maxlen * sizeof(U8 *));
1853                 SvCUR_set(sv_points,
1854                     maxlen * sizeof(U8 *));
1855                 SvPOK_on(sv_points);
1856                 sv_2mortal(sv_points);
1857                 points=(U8**)SvPV_nolen(sv_points );
1858                 if ( trie_type != trie_utf8_fold
1859                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1860                 {
1861                     if (trie->bitmap)
1862                         bitmap=(U8*)trie->bitmap;
1863                     else
1864                         bitmap=(U8*)ANYOF_BITMAP(c);
1865                 }
1866                 /* this is the Aho-Corasick algorithm modified a touch
1867                    to include special handling for long "unknown char"
1868                    sequences. The basic idea being that we use AC as long
1869                    as we are dealing with a possible matching char, when
1870                    we encounter an unknown char (and we have not encountered
1871                    an accepting state) we scan forward until we find a legal
1872                    starting char.
1873                    AC matching is basically that of trie matching, except
1874                    that when we encounter a failing transition, we fall back
1875                    to the current states "fail state", and try the current char
1876                    again, a process we repeat until we reach the root state,
1877                    state 1, or a legal transition. If we fail on the root state
1878                    then we can either terminate if we have reached an accepting
1879                    state previously, or restart the entire process from the beginning
1880                    if we have not.
1881
1882                  */
1883                 while (s <= last_start) {
1884                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885                     U8 *uc = (U8*)s;
1886                     U16 charid = 0;
1887                     U32 base = 1;
1888                     U32 state = 1;
1889                     UV uvc = 0;
1890                     STRLEN len = 0;
1891                     STRLEN foldlen = 0;
1892                     U8 *uscan = (U8*)NULL;
1893                     U8 *leftmost = NULL;
1894 #ifdef DEBUGGING
1895                     U32 accepted_word= 0;
1896 #endif
1897                     U32 pointpos = 0;
1898
1899                     while ( state && uc <= (U8*)strend ) {
1900                         int failed=0;
1901                         U32 word = aho->states[ state ].wordnum;
1902
1903                         if( state==1 ) {
1904                             if ( bitmap ) {
1905                                 DEBUG_TRIE_EXECUTE_r(
1906                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1908                                             (char *)uc, utf8_target );
1909                                         PerlIO_printf( Perl_debug_log,
1910                                             " Scanning for legal start char...\n");
1911                                     }
1912                                 );
1913                                 if (utf8_target) {
1914                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915                                         uc += UTF8SKIP(uc);
1916                                     }
1917                                 } else {
1918                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1919                                         uc++;
1920                                     }
1921                                 }
1922                                 s= (char *)uc;
1923                             }
1924                             if (uc >(U8*)last_start) break;
1925                         }
1926
1927                         if ( word ) {
1928                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929                             if (!leftmost || lpos < leftmost) {
1930                                 DEBUG_r(accepted_word=word);
1931                                 leftmost= lpos;
1932                             }
1933                             if (base==0) break;
1934
1935                         }
1936                         points[pointpos++ % maxlen]= uc;
1937                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1938                                              uscan, len, uvc, charid, foldlen,
1939                                              foldbuf, uniflags);
1940                         DEBUG_TRIE_EXECUTE_r({
1941                             dump_exec_pos( (char *)uc, c, strend, real_start,
1942                                 s,   utf8_target );
1943                             PerlIO_printf(Perl_debug_log,
1944                                 " Charid:%3u CP:%4"UVxf" ",
1945                                  charid, uvc);
1946                         });
1947
1948                         do {
1949 #ifdef DEBUGGING
1950                             word = aho->states[ state ].wordnum;
1951 #endif
1952                             base = aho->states[ state ].trans.base;
1953
1954                             DEBUG_TRIE_EXECUTE_r({
1955                                 if (failed)
1956                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1957                                         s,   utf8_target );
1958                                 PerlIO_printf( Perl_debug_log,
1959                                     "%sState: %4"UVxf", word=%"UVxf,
1960                                     failed ? " Fail transition to " : "",
1961                                     (UV)state, (UV)word);
1962                             });
1963                             if ( base ) {
1964                                 U32 tmp;
1965                                 I32 offset;
1966                                 if (charid &&
1967                                      ( ((offset = base + charid
1968                                         - 1 - trie->uniquecharcount)) >= 0)
1969                                      && ((U32)offset < trie->lasttrans)
1970                                      && trie->trans[offset].check == state
1971                                      && (tmp=trie->trans[offset].next))
1972                                 {
1973                                     DEBUG_TRIE_EXECUTE_r(
1974                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1975                                     state = tmp;
1976                                     break;
1977                                 }
1978                                 else {
1979                                     DEBUG_TRIE_EXECUTE_r(
1980                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1981                                     failed = 1;
1982                                     state = aho->fail[state];
1983                                 }
1984                             }
1985                             else {
1986                                 /* we must be accepting here */
1987                                 DEBUG_TRIE_EXECUTE_r(
1988                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1989                                 failed = 1;
1990                                 break;
1991                             }
1992                         } while(state);
1993                         uc += len;
1994                         if (failed) {
1995                             if (leftmost)
1996                                 break;
1997                             if (!state) state = 1;
1998                         }
1999                     }
2000                     if ( aho->states[ state ].wordnum ) {
2001                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2002                         if (!leftmost || lpos < leftmost) {
2003                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2004                             leftmost = lpos;
2005                         }
2006                     }
2007                     if (leftmost) {
2008                         s = (char*)leftmost;
2009                         DEBUG_TRIE_EXECUTE_r({
2010                             PerlIO_printf(
2011                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2012                                 (UV)accepted_word, (IV)(s - real_start)
2013                             );
2014                         });
2015                         if (!reginfo || regtry(reginfo, &s)) {
2016                             FREETMPS;
2017                             LEAVE;
2018                             goto got_it;
2019                         }
2020                         s = HOPc(s,1);
2021                         DEBUG_TRIE_EXECUTE_r({
2022                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2023                         });
2024                     } else {
2025                         DEBUG_TRIE_EXECUTE_r(
2026                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2027                         break;
2028                     }
2029                 }
2030                 FREETMPS;
2031                 LEAVE;
2032             }
2033             break;
2034         default:
2035             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2036             break;
2037         }
2038         return 0;
2039       got_it:
2040         return s;
2041 }
2042
2043
2044 /*
2045  - regexec_flags - match a regexp against a string
2046  */
2047 I32
2048 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2049               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2050 /* stringarg: the point in the string at which to begin matching */
2051 /* strend:    pointer to null at end of string */
2052 /* strbeg:    real beginning of string */
2053 /* minend:    end of match must be >= minend bytes after stringarg. */
2054 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2055  *            itself is accessed via the pointers above */
2056 /* data:      May be used for some additional optimizations.
2057               Currently its only used, with a U32 cast, for transmitting
2058               the ganch offset when doing a /g match. This will change */
2059 /* nosave:    For optimizations. */
2060
2061 {
2062     dVAR;
2063     struct regexp *const prog = (struct regexp *)SvANY(rx);
2064     /*register*/ char *s;
2065     regnode *c;
2066     /*register*/ char *startpos = stringarg;
2067     I32 minlen;         /* must match at least this many chars */
2068     I32 dontbother = 0; /* how many characters not to try at end */
2069     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2070     I32 scream_pos = -1;                /* Internal iterator of scream. */
2071     char *scream_olds = NULL;
2072     const bool utf8_target = cBOOL(DO_UTF8(sv));
2073     I32 multiline;
2074     RXi_GET_DECL(prog,progi);
2075     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2076     regexp_paren_pair *swap = NULL;
2077     GET_RE_DEBUG_FLAGS_DECL;
2078
2079     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2080     PERL_UNUSED_ARG(data);
2081
2082     /* Be paranoid... */
2083     if (prog == NULL || startpos == NULL) {
2084         Perl_croak(aTHX_ "NULL regexp parameter");
2085         return 0;
2086     }
2087
2088     multiline = prog->extflags & RXf_PMf_MULTILINE;
2089     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2090
2091     RX_MATCH_UTF8_set(rx, utf8_target);
2092     DEBUG_EXECUTE_r(
2093         debug_start_match(rx, utf8_target, startpos, strend,
2094         "Matching");
2095     );
2096
2097     minlen = prog->minlen;
2098
2099     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2100         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2101                               "String too short [regexec_flags]...\n"));
2102         goto phooey;
2103     }
2104
2105
2106     /* Check validity of program. */
2107     if (UCHARAT(progi->program) != REG_MAGIC) {
2108         Perl_croak(aTHX_ "corrupted regexp program");
2109     }
2110
2111     PL_reg_flags = 0;
2112     PL_reg_state.re_state_eval_setup_done = FALSE;
2113     PL_reg_maxiter = 0;
2114
2115     if (RX_UTF8(rx))
2116         PL_reg_flags |= RF_utf8;
2117
2118     /* Mark beginning of line for ^ and lookbehind. */
2119     reginfo.bol = startpos; /* XXX not used ??? */
2120     PL_bostr  = strbeg;
2121     reginfo.sv = sv;
2122
2123     /* Mark end of line for $ (and such) */
2124     PL_regeol = strend;
2125
2126     /* see how far we have to get to not match where we matched before */
2127     reginfo.till = startpos+minend;
2128
2129     /* If there is a "must appear" string, look for it. */
2130     s = startpos;
2131
2132     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2133         MAGIC *mg;
2134         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2135             reginfo.ganch = startpos + prog->gofs;
2136             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2137               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2138         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2139                   && SvMAGIC(sv)
2140                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141                   && mg->mg_len >= 0) {
2142             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2143             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2144                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2145
2146             if (prog->extflags & RXf_ANCH_GPOS) {
2147                 if (s > reginfo.ganch)
2148                     goto phooey;
2149                 s = reginfo.ganch - prog->gofs;
2150                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2151                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2152                 if (s < strbeg)
2153                     goto phooey;
2154             }
2155         }
2156         else if (data) {
2157             reginfo.ganch = strbeg + PTR2UV(data);
2158             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161         } else {                                /* pos() not defined */
2162             reginfo.ganch = strbeg;
2163             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164                  "GPOS: reginfo.ganch = strbeg\n"));
2165         }
2166     }
2167     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2168         /* We have to be careful. If the previous successful match
2169            was from this regex we don't want a subsequent partially
2170            successful match to clobber the old results.
2171            So when we detect this possibility we add a swap buffer
2172            to the re, and switch the buffer each match. If we fail
2173            we switch it back, otherwise we leave it swapped.
2174         */
2175         swap = prog->offs;
2176         /* do we need a save destructor here for eval dies? */
2177         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2178         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180             PTR2UV(prog),
2181             PTR2UV(swap),
2182             PTR2UV(prog->offs)
2183         ));
2184     }
2185     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2186         re_scream_pos_data d;
2187
2188         d.scream_olds = &scream_olds;
2189         d.scream_pos = &scream_pos;
2190         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2191         if (!s) {
2192             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2193             goto phooey;        /* not present */
2194         }
2195     }
2196
2197
2198
2199     /* Simplest case:  anchored match need be tried only once. */
2200     /*  [unless only anchor is BOL and multiline is set] */
2201     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2202         if (s == startpos && regtry(&reginfo, &startpos))
2203             goto got_it;
2204         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2206         {
2207             char *end;
2208
2209             if (minlen)
2210                 dontbother = minlen - 1;
2211             end = HOP3c(strend, -dontbother, strbeg) - 1;
2212             /* for multiline we only have to try after newlines */
2213             if (prog->check_substr || prog->check_utf8) {
2214                 /* because of the goto we can not easily reuse the macros for bifurcating the
2215                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216                 if (utf8_target) {
2217                     if (s == startpos)
2218                         goto after_try_utf8;
2219                     while (1) {
2220                         if (regtry(&reginfo, &s)) {
2221                             goto got_it;
2222                         }
2223                       after_try_utf8:
2224                         if (s > end) {
2225                             goto phooey;
2226                         }
2227                         if (prog->extflags & RXf_USE_INTUIT) {
2228                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229                             if (!s) {
2230                                 goto phooey;
2231                             }
2232                         }
2233                         else {
2234                             s += UTF8SKIP(s);
2235                         }
2236                     }
2237                 } /* end search for check string in unicode */
2238                 else {
2239                     if (s == startpos) {
2240                         goto after_try_latin;
2241                     }
2242                     while (1) {
2243                         if (regtry(&reginfo, &s)) {
2244                             goto got_it;
2245                         }
2246                       after_try_latin:
2247                         if (s > end) {
2248                             goto phooey;
2249                         }
2250                         if (prog->extflags & RXf_USE_INTUIT) {
2251                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252                             if (!s) {
2253                                 goto phooey;
2254                             }
2255                         }
2256                         else {
2257                             s++;
2258                         }
2259                     }
2260                 } /* end search for check string in latin*/
2261             } /* end search for check string */
2262             else { /* search for newline */
2263                 if (s > startpos) {
2264                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2265                     s--;
2266                 }
2267                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268                 while (s <= end) { /* note it could be possible to match at the end of the string */
2269                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2270                         if (regtry(&reginfo, &s))
2271                             goto got_it;
2272                     }
2273                 }
2274             } /* end search for newline */
2275         } /* end anchored/multiline check string search */
2276         goto phooey;
2277     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2278     {
2279         /* the warning about reginfo.ganch being used without initialization
2280            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2281            and we only enter this block when the same bit is set. */
2282         char *tmp_s = reginfo.ganch - prog->gofs;
2283
2284         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2285             goto got_it;
2286         goto phooey;
2287     }
2288
2289     /* Messy cases:  unanchored match. */
2290     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2291         /* we have /x+whatever/ */
2292         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2293         char ch;
2294 #ifdef DEBUGGING
2295         int did_match = 0;
2296 #endif
2297         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2300
2301         if (utf8_target) {
2302             REXEC_FBC_SCAN(
2303                 if (*s == ch) {
2304                     DEBUG_EXECUTE_r( did_match = 1 );
2305                     if (regtry(&reginfo, &s)) goto got_it;
2306                     s += UTF8SKIP(s);
2307                     while (s < strend && *s == ch)
2308                         s += UTF8SKIP(s);
2309                 }
2310             );
2311         }
2312         else {
2313             REXEC_FBC_SCAN(
2314                 if (*s == ch) {
2315                     DEBUG_EXECUTE_r( did_match = 1 );
2316                     if (regtry(&reginfo, &s)) goto got_it;
2317                     s++;
2318                     while (s < strend && *s == ch)
2319                         s++;
2320                 }
2321             );
2322         }
2323         DEBUG_EXECUTE_r(if (!did_match)
2324                 PerlIO_printf(Perl_debug_log,
2325                                   "Did not find anchored character...\n")
2326                );
2327     }
2328     else if (prog->anchored_substr != NULL
2329               || prog->anchored_utf8 != NULL
2330               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2331                   && prog->float_max_offset < strend - s)) {
2332         SV *must;
2333         I32 back_max;
2334         I32 back_min;
2335         char *last;
2336         char *last1;            /* Last position checked before */
2337 #ifdef DEBUGGING
2338         int did_match = 0;
2339 #endif
2340         if (prog->anchored_substr || prog->anchored_utf8) {
2341             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2344             back_max = back_min = prog->anchored_offset;
2345         } else {
2346             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2349             back_max = prog->float_max_offset;
2350             back_min = prog->float_min_offset;
2351         }
2352
2353
2354         if (must == &PL_sv_undef)
2355             /* could not downgrade utf8 check substring, so must fail */
2356             goto phooey;
2357
2358         if (back_min<0) {
2359             last = strend;
2360         } else {
2361             last = HOP3c(strend,        /* Cannot start after this */
2362                   -(I32)(CHR_SVLEN(must)
2363                          - (SvTAIL(must) != 0) + back_min), strbeg);
2364         }
2365         if (s > PL_bostr)
2366             last1 = HOPc(s, -1);
2367         else
2368             last1 = s - 1;      /* bogus */
2369
2370         /* XXXX check_substr already used to find "s", can optimize if
2371            check_substr==must. */
2372         scream_pos = -1;
2373         dontbother = end_shift;
2374         strend = HOPc(strend, -dontbother);
2375         while ( (s <= last) &&
2376                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2377                                   (unsigned char*)strend, must,
2378                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2379             DEBUG_EXECUTE_r( did_match = 1 );
2380             if (HOPc(s, -back_max) > last1) {
2381                 last1 = HOPc(s, -back_min);
2382                 s = HOPc(s, -back_max);
2383             }
2384             else {
2385                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2386
2387                 last1 = HOPc(s, -back_min);
2388                 s = t;
2389             }
2390             if (utf8_target) {
2391                 while (s <= last1) {
2392                     if (regtry(&reginfo, &s))
2393                         goto got_it;
2394                     s += UTF8SKIP(s);
2395                 }
2396             }
2397             else {
2398                 while (s <= last1) {
2399                     if (regtry(&reginfo, &s))
2400                         goto got_it;
2401                     s++;
2402                 }
2403             }
2404         }
2405         DEBUG_EXECUTE_r(if (!did_match) {
2406             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2407                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2409                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2410                                ? "anchored" : "floating"),
2411                 quoted, RE_SV_TAIL(must));
2412         });
2413         goto phooey;
2414     }
2415     else if ( (c = progi->regstclass) ) {
2416         if (minlen) {
2417             const OPCODE op = OP(progi->regstclass);
2418             /* don't bother with what can't match */
2419             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2420                 strend = HOPc(strend, -(minlen - 1));
2421         }
2422         DEBUG_EXECUTE_r({
2423             SV * const prop = sv_newmortal();
2424             regprop(prog, prop, c);
2425             {
2426                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2427                     s,strend-s,60);
2428                 PerlIO_printf(Perl_debug_log,
2429                     "Matching stclass %.*s against %s (%d bytes)\n",
2430                     (int)SvCUR(prop), SvPVX_const(prop),
2431                      quoted, (int)(strend - s));
2432             }
2433         });
2434         if (find_byclass(prog, c, s, strend, &reginfo))
2435             goto got_it;
2436         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2437     }
2438     else {
2439         dontbother = 0;
2440         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2441             /* Trim the end. */
2442             char *last= NULL;
2443             SV* float_real;
2444             STRLEN len;
2445             const char *little;
2446
2447             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2450
2451             little = SvPV_const(float_real, len);
2452             if (SvTAIL(float_real)) {
2453                     /* This means that float_real contains an artificial \n on the end
2454                      * due to the presence of something like this: /foo$/
2455                      * where we can match both "foo" and "foo\n" at the end of the string.
2456                      * So we have to compare the end of the string first against the float_real
2457                      * without the \n and then against the full float_real with the string.
2458                      * We have to watch out for cases where the string might be smaller
2459                      * than the float_real or the float_real without the \n.
2460                      */
2461                     char *checkpos= strend - len;
2462                     DEBUG_OPTIMISE_r(
2463                         PerlIO_printf(Perl_debug_log,
2464                             "%sChecking for float_real.%s\n",
2465                             PL_colors[4], PL_colors[5]));
2466                     if (checkpos + 1 < strbeg) {
2467                         /* can't match, even if we remove the trailing \n string is too short to match */
2468                         DEBUG_EXECUTE_r(
2469                             PerlIO_printf(Perl_debug_log,
2470                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2471                                 PL_colors[4], PL_colors[5]));
2472                         goto phooey;
2473                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2474                         /* can match, the end of the string matches without the "\n" */
2475                         last = checkpos + 1;
2476                     } else if (checkpos < strbeg) {
2477                         /* cant match, string is too short when the "\n" is included */
2478                         DEBUG_EXECUTE_r(
2479                             PerlIO_printf(Perl_debug_log,
2480                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2481                                 PL_colors[4], PL_colors[5]));
2482                         goto phooey;
2483                     } else if (!multiline) {
2484                         /* non multiline match, so compare with the "\n" at the end of the string */
2485                         if (memEQ(checkpos, little, len)) {
2486                             last= checkpos;
2487                         } else {
2488                             DEBUG_EXECUTE_r(
2489                                 PerlIO_printf(Perl_debug_log,
2490                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2491                                     PL_colors[4], PL_colors[5]));
2492                             goto phooey;
2493                         }
2494                     } else {
2495                         /* multiline match, so we have to search for a place where the full string is located */
2496                         goto find_last;
2497                     }
2498             } else {
2499                   find_last:
2500                     if (len)
2501                         last = rninstr(s, strend, little, little + len);
2502                     else
2503                         last = strend;  /* matching "$" */
2504             }
2505             if (!last) {
2506                 /* at one point this block contained a comment which was probably
2507                  * incorrect, which said that this was a "should not happen" case.
2508                  * Even if it was true when it was written I am pretty sure it is
2509                  * not anymore, so I have removed the comment and replaced it with
2510                  * this one. Yves */
2511                 DEBUG_EXECUTE_r(
2512                     PerlIO_printf(Perl_debug_log,
2513                         "String does not contain required substring, cannot match.\n"
2514                     ));
2515                 goto phooey;
2516             }
2517             dontbother = strend - last + prog->float_min_offset;
2518         }
2519         if (minlen && (dontbother < minlen))
2520             dontbother = minlen - 1;
2521         strend -= dontbother;              /* this one's always in bytes! */
2522         /* We don't know much -- general case. */
2523         if (utf8_target) {
2524             for (;;) {
2525                 if (regtry(&reginfo, &s))
2526                     goto got_it;
2527                 if (s >= strend)
2528                     break;
2529                 s += UTF8SKIP(s);
2530             };
2531         }
2532         else {
2533             do {
2534                 if (regtry(&reginfo, &s))
2535                     goto got_it;
2536             } while (s++ < strend);
2537         }
2538     }
2539
2540     /* Failure. */
2541     goto phooey;
2542
2543 got_it:
2544     DEBUG_BUFFERS_r(
2545         if (swap)
2546             PerlIO_printf(Perl_debug_log,
2547                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548                 PTR2UV(prog),
2549                 PTR2UV(swap)
2550             );
2551     );
2552     Safefree(swap);
2553     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2554
2555     if (PL_reg_state.re_state_eval_setup_done)
2556         restore_pos(aTHX_ prog);
2557     if (RXp_PAREN_NAMES(prog))
2558         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2559
2560     /* make sure $`, $&, $', and $digit will work later */
2561     if ( !(flags & REXEC_NOT_FIRST) ) {
2562         if (flags & REXEC_COPY_STR) {
2563 #ifdef PERL_OLD_COPY_ON_WRITE
2564             if ((SvIsCOW(sv)
2565                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2566                 if (DEBUG_C_TEST) {
2567                     PerlIO_printf(Perl_debug_log,
2568                                   "Copy on write: regexp capture, type %d\n",
2569                                   (int) SvTYPE(sv));
2570                 }
2571                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2572                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2573                 assert (SvPOKp(prog->saved_copy));
2574                 prog->sublen  = PL_regeol - strbeg;
2575                 prog->suboffset = 0;
2576                 prog->subcoffset = 0;
2577             } else
2578 #endif
2579             {
2580                 I32 min = 0;
2581                 I32 max = PL_regeol - strbeg;
2582                 I32 sublen;
2583
2584                 if (    (flags & REXEC_COPY_SKIP_POST)
2585                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2586                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2587                 ) { /* don't copy $' part of string */
2588                     U32 n = 0;
2589                     max = -1;
2590                     /* calculate the right-most part of the string covered
2591                      * by a capture. Due to look-ahead, this may be to
2592                      * the right of $&, so we have to scan all captures */
2593                     while (n <= prog->lastparen) {
2594                         if (prog->offs[n].end > max)
2595                             max = prog->offs[n].end;
2596                         n++;
2597                     }
2598                     if (max == -1)
2599                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2600                                 ? prog->offs[0].start
2601                                 : 0;
2602                     assert(max >= 0 && max <= PL_regeol - strbeg);
2603                 }
2604
2605                 if (    (flags & REXEC_COPY_SKIP_PRE)
2606                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2607                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2608                 ) { /* don't copy $` part of string */
2609                     U32 n = 0;
2610                     min = max;
2611                     /* calculate the left-most part of the string covered
2612                      * by a capture. Due to look-behind, this may be to
2613                      * the left of $&, so we have to scan all captures */
2614                     while (min && n <= prog->lastparen) {
2615                         if (   prog->offs[n].start != -1
2616                             && prog->offs[n].start < min)
2617                         {
2618                             min = prog->offs[n].start;
2619                         }
2620                         n++;
2621                     }
2622                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2623                         && min >  prog->offs[0].end
2624                     )
2625                         min = prog->offs[0].end;
2626
2627                 }
2628
2629                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2630                 sublen = max - min;
2631
2632                 if (RX_MATCH_COPIED(rx)) {
2633                     if (sublen > prog->sublen)
2634                         prog->subbeg =
2635                                 (char*)saferealloc(prog->subbeg, sublen+1);
2636                 }
2637                 else
2638                     prog->subbeg = (char*)safemalloc(sublen+1);
2639                 Copy(strbeg + min, prog->subbeg, sublen, char);
2640                 prog->subbeg[sublen] = '\0';
2641                 prog->suboffset = min;
2642                 prog->sublen = sublen;
2643             }
2644             RX_MATCH_COPIED_on(rx);
2645             prog->subcoffset = prog->suboffset;
2646             if (prog->suboffset && utf8_target) {
2647                 /* Convert byte offset to chars.
2648                  * XXX ideally should only compute this if @-/@+
2649                  * has been seen, a la PL_sawampersand ??? */
2650
2651                 /* If there's a direct correspondence between the
2652                  * string which we're matching and the original SV,
2653                  * then we can use the utf8 len cache associated with
2654                  * the SV. In particular, it means that under //g,
2655                  * sv_pos_b2u() will use the previously cached
2656                  * position to speed up working out the new length of
2657                  * subcoffset, rather than counting from the start of
2658                  * the string each time. This stops
2659                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2660                  * from going quadratic */
2661                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2662                     sv_pos_b2u(sv, &(prog->subcoffset));
2663                 else
2664                     prog->subcoffset = utf8_length((U8*)strbeg,
2665                                         (U8*)(strbeg+prog->suboffset));
2666             }
2667         }
2668         else {
2669             RX_MATCH_COPY_FREE(rx);
2670             prog->subbeg = strbeg;
2671             prog->suboffset = 0;
2672             prog->subcoffset = 0;
2673             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2674         }
2675     }
2676
2677     return 1;
2678
2679 phooey:
2680     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2681                           PL_colors[4], PL_colors[5]));
2682     if (PL_reg_state.re_state_eval_setup_done)
2683         restore_pos(aTHX_ prog);
2684     if (swap) {
2685         /* we failed :-( roll it back */
2686         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2687             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2688             PTR2UV(prog),
2689             PTR2UV(prog->offs),
2690             PTR2UV(swap)
2691         ));
2692         Safefree(prog->offs);
2693         prog->offs = swap;
2694     }
2695
2696     return 0;
2697 }
2698
2699
2700 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2701  * Do inc before dec, in case old and new rex are the same */
2702 #define SET_reg_curpm(Re2) \
2703     if (PL_reg_state.re_state_eval_setup_done) {    \
2704         (void)ReREFCNT_inc(Re2);                    \
2705         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2706         PM_SETRE((PL_reg_curpm), (Re2));            \
2707     }
2708
2709
2710 /*
2711  - regtry - try match at specific point
2712  */
2713 STATIC I32                      /* 0 failure, 1 success */
2714 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2715 {
2716     dVAR;
2717     CHECKPOINT lastcp;
2718     REGEXP *const rx = reginfo->prog;
2719     regexp *const prog = (struct regexp *)SvANY(rx);
2720     RXi_GET_DECL(prog,progi);
2721     GET_RE_DEBUG_FLAGS_DECL;
2722
2723     PERL_ARGS_ASSERT_REGTRY;
2724
2725     reginfo->cutpoint=NULL;
2726
2727     if ((prog->extflags & RXf_EVAL_SEEN)
2728         && !PL_reg_state.re_state_eval_setup_done)
2729     {
2730         MAGIC *mg;
2731
2732         PL_reg_state.re_state_eval_setup_done = TRUE;
2733         if (reginfo->sv) {
2734             /* Make $_ available to executed code. */
2735             if (reginfo->sv != DEFSV) {
2736                 SAVE_DEFSV;
2737                 DEFSV_set(reginfo->sv);
2738             }
2739
2740             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2741                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2742                 /* prepare for quick setting of pos */
2743 #ifdef PERL_OLD_COPY_ON_WRITE
2744                 if (SvIsCOW(reginfo->sv))
2745                     sv_force_normal_flags(reginfo->sv, 0);
2746 #endif
2747                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2748                                  &PL_vtbl_mglob, NULL, 0);
2749                 mg->mg_len = -1;
2750             }
2751             PL_reg_magic    = mg;
2752             PL_reg_oldpos   = mg->mg_len;
2753             SAVEDESTRUCTOR_X(restore_pos, prog);
2754         }
2755         if (!PL_reg_curpm) {
2756             Newxz(PL_reg_curpm, 1, PMOP);
2757 #ifdef USE_ITHREADS
2758             {
2759                 SV* const repointer = &PL_sv_undef;
2760                 /* this regexp is also owned by the new PL_reg_curpm, which
2761                    will try to free it.  */
2762                 av_push(PL_regex_padav, repointer);
2763                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2764                 PL_regex_pad = AvARRAY(PL_regex_padav);
2765             }
2766 #endif
2767         }
2768         SET_reg_curpm(rx);
2769         PL_reg_oldcurpm = PL_curpm;
2770         PL_curpm = PL_reg_curpm;
2771         if (RXp_MATCH_COPIED(prog)) {
2772             /*  Here is a serious problem: we cannot rewrite subbeg,
2773                 since it may be needed if this match fails.  Thus
2774                 $` inside (?{}) could fail... */
2775             PL_reg_oldsaved = prog->subbeg;
2776             PL_reg_oldsavedlen = prog->sublen;
2777             PL_reg_oldsavedoffset = prog->suboffset;
2778             PL_reg_oldsavedcoffset = prog->suboffset;
2779 #ifdef PERL_OLD_COPY_ON_WRITE
2780             PL_nrs = prog->saved_copy;
2781 #endif
2782             RXp_MATCH_COPIED_off(prog);
2783         }
2784         else
2785             PL_reg_oldsaved = NULL;
2786         prog->subbeg = PL_bostr;
2787         prog->suboffset = 0;
2788         prog->subcoffset = 0;
2789         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2790     }
2791 #ifdef DEBUGGING
2792     PL_reg_starttry = *startpos;
2793 #endif
2794     prog->offs[0].start = *startpos - PL_bostr;
2795     PL_reginput = *startpos;
2796     prog->lastparen = 0;
2797     prog->lastcloseparen = 0;
2798     PL_regsize = 0;
2799
2800     /* XXXX What this code is doing here?!!!  There should be no need
2801        to do this again and again, prog->lastparen should take care of
2802        this!  --ilya*/
2803
2804     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2805      * Actually, the code in regcppop() (which Ilya may be meaning by
2806      * prog->lastparen), is not needed at all by the test suite
2807      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2808      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2809      * Meanwhile, this code *is* needed for the
2810      * above-mentioned test suite tests to succeed.  The common theme
2811      * on those tests seems to be returning null fields from matches.
2812      * --jhi updated by dapm */
2813 #if 1
2814     if (prog->nparens) {
2815         regexp_paren_pair *pp = prog->offs;
2816         I32 i;
2817         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2818             ++pp;
2819             pp->start = -1;
2820             pp->end = -1;
2821         }
2822     }
2823 #endif
2824     REGCP_SET(lastcp);
2825     if (regmatch(reginfo, progi->program + 1)) {
2826         prog->offs[0].end = PL_reginput - PL_bostr;
2827         return 1;
2828     }
2829     if (reginfo->cutpoint)
2830         *startpos= reginfo->cutpoint;
2831     REGCP_UNWIND(lastcp);
2832     return 0;
2833 }
2834
2835
2836 #define sayYES goto yes
2837 #define sayNO goto no
2838 #define sayNO_SILENT goto no_silent
2839
2840 /* we dont use STMT_START/END here because it leads to
2841    "unreachable code" warnings, which are bogus, but distracting. */
2842 #define CACHEsayNO \
2843     if (ST.cache_mask) \
2844        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2845     sayNO
2846
2847 /* this is used to determine how far from the left messages like
2848    'failed...' are printed. It should be set such that messages
2849    are inline with the regop output that created them.
2850 */
2851 #define REPORT_CODE_OFF 32
2852
2853
2854 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2855 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2856
2857 #define SLAB_FIRST(s) (&(s)->states[0])
2858 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2859
2860 /* grab a new slab and return the first slot in it */
2861
2862 STATIC regmatch_state *
2863 S_push_slab(pTHX)
2864 {
2865 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2866     dMY_CXT;
2867 #endif
2868     regmatch_slab *s = PL_regmatch_slab->next;
2869     if (!s) {
2870         Newx(s, 1, regmatch_slab);
2871         s->prev = PL_regmatch_slab;
2872         s->next = NULL;
2873         PL_regmatch_slab->next = s;
2874     }
2875     PL_regmatch_slab = s;
2876     return SLAB_FIRST(s);
2877 }
2878
2879
2880 /* push a new state then goto it */
2881
2882 #define PUSH_STATE_GOTO(state, node) \
2883     scan = node; \
2884     st->resume_state = state; \
2885     goto push_state;
2886
2887 /* push a new state with success backtracking, then goto it */
2888
2889 #define PUSH_YES_STATE_GOTO(state, node) \
2890     scan = node; \
2891     st->resume_state = state; \
2892     goto push_yes_state;
2893
2894
2895
2896 /*
2897
2898 regmatch() - main matching routine
2899
2900 This is basically one big switch statement in a loop. We execute an op,
2901 set 'next' to point the next op, and continue. If we come to a point which
2902 we may need to backtrack to on failure such as (A|B|C), we push a
2903 backtrack state onto the backtrack stack. On failure, we pop the top
2904 state, and re-enter the loop at the state indicated. If there are no more
2905 states to pop, we return failure.
2906
2907 Sometimes we also need to backtrack on success; for example /A+/, where
2908 after successfully matching one A, we need to go back and try to
2909 match another one; similarly for lookahead assertions: if the assertion
2910 completes successfully, we backtrack to the state just before the assertion
2911 and then carry on.  In these cases, the pushed state is marked as
2912 'backtrack on success too'. This marking is in fact done by a chain of
2913 pointers, each pointing to the previous 'yes' state. On success, we pop to
2914 the nearest yes state, discarding any intermediate failure-only states.
2915 Sometimes a yes state is pushed just to force some cleanup code to be
2916 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2917 it to free the inner regex.
2918
2919 Note that failure backtracking rewinds the cursor position, while
2920 success backtracking leaves it alone.
2921
2922 A pattern is complete when the END op is executed, while a subpattern
2923 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2924 ops trigger the "pop to last yes state if any, otherwise return true"
2925 behaviour.
2926
2927 A common convention in this function is to use A and B to refer to the two
2928 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2929 the subpattern to be matched possibly multiple times, while B is the entire
2930 rest of the pattern. Variable and state names reflect this convention.
2931
2932 The states in the main switch are the union of ops and failure/success of
2933 substates associated with with that op.  For example, IFMATCH is the op
2934 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2935 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2936 successfully matched A and IFMATCH_A_fail is a state saying that we have
2937 just failed to match A. Resume states always come in pairs. The backtrack
2938 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2939 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2940 on success or failure.
2941
2942 The struct that holds a backtracking state is actually a big union, with
2943 one variant for each major type of op. The variable st points to the
2944 top-most backtrack struct. To make the code clearer, within each
2945 block of code we #define ST to alias the relevant union.
2946
2947 Here's a concrete example of a (vastly oversimplified) IFMATCH
2948 implementation:
2949
2950     switch (state) {
2951     ....
2952
2953 #define ST st->u.ifmatch
2954
2955     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2956         ST.foo = ...; // some state we wish to save
2957         ...
2958         // push a yes backtrack state with a resume value of
2959         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2960         // first node of A:
2961         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2962         // NOTREACHED
2963
2964     case IFMATCH_A: // we have successfully executed A; now continue with B
2965         next = B;
2966         bar = ST.foo; // do something with the preserved value
2967         break;
2968
2969     case IFMATCH_A_fail: // A failed, so the assertion failed
2970         ...;   // do some housekeeping, then ...
2971         sayNO; // propagate the failure
2972
2973 #undef ST
2974
2975     ...
2976     }
2977
2978 For any old-timers reading this who are familiar with the old recursive
2979 approach, the code above is equivalent to:
2980
2981     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2982     {
2983         int foo = ...
2984         ...
2985         if (regmatch(A)) {
2986             next = B;
2987             bar = foo;
2988             break;
2989         }
2990         ...;   // do some housekeeping, then ...
2991         sayNO; // propagate the failure
2992     }
2993
2994 The topmost backtrack state, pointed to by st, is usually free. If you
2995 want to claim it, populate any ST.foo fields in it with values you wish to
2996 save, then do one of
2997
2998         PUSH_STATE_GOTO(resume_state, node);
2999         PUSH_YES_STATE_GOTO(resume_state, node);
3000
3001 which sets that backtrack state's resume value to 'resume_state', pushes a
3002 new free entry to the top of the backtrack stack, then goes to 'node'.
3003 On backtracking, the free slot is popped, and the saved state becomes the
3004 new free state. An ST.foo field in this new top state can be temporarily
3005 accessed to retrieve values, but once the main loop is re-entered, it
3006 becomes available for reuse.
3007
3008 Note that the depth of the backtrack stack constantly increases during the
3009 left-to-right execution of the pattern, rather than going up and down with
3010 the pattern nesting. For example the stack is at its maximum at Z at the
3011 end of the pattern, rather than at X in the following:
3012
3013     /(((X)+)+)+....(Y)+....Z/
3014
3015 The only exceptions to this are lookahead/behind assertions and the cut,
3016 (?>A), which pop all the backtrack states associated with A before
3017 continuing.
3018
3019 Backtrack state structs are allocated in slabs of about 4K in size.
3020 PL_regmatch_state and st always point to the currently active state,
3021 and PL_regmatch_slab points to the slab currently containing
3022 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3023 allocated, and is never freed until interpreter destruction. When the slab
3024 is full, a new one is allocated and chained to the end. At exit from
3025 regmatch(), slabs allocated since entry are freed.
3026
3027 */
3028
3029
3030 #define DEBUG_STATE_pp(pp)                                  \
3031     DEBUG_STATE_r({                                         \
3032         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3033         PerlIO_printf(Perl_debug_log,                       \
3034             "    %*s"pp" %s%s%s%s%s\n",                     \
3035             depth*2, "",                                    \
3036             PL_reg_name[st->resume_state],                     \
3037             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3038             ((st==yes_state) ? "Y" : ""),                   \
3039             ((st==mark_state) ? "M" : ""),                  \
3040             ((st==yes_state||st==mark_state) ? "]" : "")    \
3041         );                                                  \
3042     });
3043
3044
3045 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3046
3047 #ifdef DEBUGGING
3048
3049 STATIC void
3050 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3051     const char *start, const char *end, const char *blurb)
3052 {
3053     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3054
3055     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3056
3057     if (!PL_colorset)
3058             reginitcolors();
3059     {
3060         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3061             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3062
3063         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3064             start, end - start, 60);
3065
3066         PerlIO_printf(Perl_debug_log,
3067             "%s%s REx%s %s against %s\n",
3068                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3069
3070         if (utf8_target||utf8_pat)
3071             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3072                 utf8_pat ? "pattern" : "",
3073                 utf8_pat && utf8_target ? " and " : "",
3074                 utf8_target ? "string" : ""
3075             );
3076     }
3077 }
3078
3079 STATIC void
3080 S_dump_exec_pos(pTHX_ const char *locinput,
3081                       const regnode *scan,
3082                       const char *loc_regeol,
3083                       const char *loc_bostr,
3084                       const char *loc_reg_starttry,
3085                       const bool utf8_target)
3086 {
3087     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3088     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3089     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3090     /* The part of the string before starttry has one color
3091        (pref0_len chars), between starttry and current
3092        position another one (pref_len - pref0_len chars),
3093        after the current position the third one.
3094        We assume that pref0_len <= pref_len, otherwise we
3095        decrease pref0_len.  */
3096     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3097         ? (5 + taill) - l : locinput - loc_bostr;
3098     int pref0_len;
3099
3100     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3101
3102     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3103         pref_len++;
3104     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3105     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3106         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3107               ? (5 + taill) - pref_len : loc_regeol - locinput);
3108     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3109         l--;
3110     if (pref0_len < 0)
3111         pref0_len = 0;
3112     if (pref0_len > pref_len)
3113         pref0_len = pref_len;
3114     {
3115         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3116
3117         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3118             (locinput - pref_len),pref0_len, 60, 4, 5);
3119
3120         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3121                     (locinput - pref_len + pref0_len),
3122                     pref_len - pref0_len, 60, 2, 3);
3123
3124         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3125                     locinput, loc_regeol - locinput, 10, 0, 1);
3126
3127         const STRLEN tlen=len0+len1+len2;
3128         PerlIO_printf(Perl_debug_log,
3129                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3130                     (IV)(locinput - loc_bostr),
3131                     len0, s0,
3132                     len1, s1,
3133                     (docolor ? "" : "> <"),
3134                     len2, s2,
3135                     (int)(tlen > 19 ? 0 :  19 - tlen),
3136                     "");
3137     }
3138 }
3139
3140 #endif
3141
3142 /* reg_check_named_buff_matched()
3143  * Checks to see if a named buffer has matched. The data array of
3144  * buffer numbers corresponding to the buffer is expected to reside
3145  * in the regexp->data->data array in the slot stored in the ARG() of
3146  * node involved. Note that this routine doesn't actually care about the
3147  * name, that information is not preserved from compilation to execution.
3148  * Returns the index of the leftmost defined buffer with the given name
3149  * or 0 if non of the buffers matched.
3150  */
3151 STATIC I32
3152 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3153 {
3154     I32 n;
3155     RXi_GET_DECL(rex,rexi);
3156     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3157     I32 *nums=(I32*)SvPVX(sv_dat);
3158
3159     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3160
3161     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3162         if ((I32)rex->lastparen >= nums[n] &&
3163             rex->offs[nums[n]].end != -1)
3164         {
3165             return nums[n];
3166         }
3167     }
3168     return 0;
3169 }
3170
3171
3172 /* free all slabs above current one  - called during LEAVE_SCOPE */
3173
3174 STATIC void
3175 S_clear_backtrack_stack(pTHX_ void *p)
3176 {
3177     regmatch_slab *s = PL_regmatch_slab->next;
3178     PERL_UNUSED_ARG(p);
3179
3180     if (!s)
3181         return;
3182     PL_regmatch_slab->next = NULL;
3183     while (s) {
3184         regmatch_slab * const osl = s;
3185         s = s->next;
3186         Safefree(osl);
3187     }
3188 }
3189
3190
3191 STATIC I32                      /* 0 failure, 1 success */
3192 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3193 {
3194 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3195     dMY_CXT;
3196 #endif
3197     dVAR;
3198     const bool utf8_target = PL_reg_match_utf8;
3199     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3200     REGEXP *rex_sv = reginfo->prog;
3201     regexp *rex = (struct regexp *)SvANY(rex_sv);
3202     RXi_GET_DECL(rex,rexi);
3203     I32 oldsave;
3204     /* the current state. This is a cached copy of PL_regmatch_state */
3205     regmatch_state *st;
3206     /* cache heavy used fields of st in registers */
3207     regnode *scan;
3208     regnode *next;
3209     U32 n = 0;  /* general value; init to avoid compiler warning */
3210     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3211     char *locinput = PL_reginput;
3212     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3213
3214     bool result = 0;        /* return value of S_regmatch */
3215     int depth = 0;          /* depth of backtrack stack */
3216     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3217     const U32 max_nochange_depth =
3218         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3219         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3220     regmatch_state *yes_state = NULL; /* state to pop to on success of
3221                                                             subpattern */
3222     /* mark_state piggy backs on the yes_state logic so that when we unwind
3223        the stack on success we can update the mark_state as we go */
3224     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3225     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3226     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3227     U32 state_num;
3228     bool no_final = 0;      /* prevent failure from backtracking? */
3229     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3230     char *startpoint = PL_reginput;
3231     SV *popmark = NULL;     /* are we looking for a mark? */
3232     SV *sv_commit = NULL;   /* last mark name seen in failure */
3233     SV *sv_yes_mark = NULL; /* last mark name we have seen
3234                                during a successful match */
3235     U32 lastopen = 0;       /* last open we saw */
3236     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3237     SV* const oreplsv = GvSV(PL_replgv);
3238     /* these three flags are set by various ops to signal information to
3239      * the very next op. They have a useful lifetime of exactly one loop
3240      * iteration, and are not preserved or restored by state pushes/pops
3241      */
3242     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3243     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3244     int logical = 0;        /* the following EVAL is:
3245                                 0: (?{...})
3246                                 1: (?(?{...})X|Y)
3247                                 2: (??{...})
3248                                or the following IFMATCH/UNLESSM is:
3249                                 false: plain (?=foo)
3250                                 true:  used as a condition: (?(?=foo))
3251                             */
3252     PAD* last_pad = NULL;
3253     dMULTICALL;
3254     I32 gimme = G_SCALAR;
3255     CV *caller_cv = NULL;       /* who called us */
3256     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3257     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3258
3259 #ifdef DEBUGGING
3260     GET_RE_DEBUG_FLAGS_DECL;
3261 #endif
3262
3263     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3264     multicall_oldcatch = 0;
3265     multicall_cv = NULL;
3266     cx = NULL;
3267     PERL_UNUSED_VAR(multicall_cop);
3268     PERL_UNUSED_VAR(newsp);
3269
3270
3271     PERL_ARGS_ASSERT_REGMATCH;
3272
3273     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3274             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3275     }));
3276     /* on first ever call to regmatch, allocate first slab */
3277     if (!PL_regmatch_slab) {
3278         Newx(PL_regmatch_slab, 1, regmatch_slab);
3279         PL_regmatch_slab->prev = NULL;
3280         PL_regmatch_slab->next = NULL;
3281         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3282     }
3283
3284     oldsave = PL_savestack_ix;
3285     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3286     SAVEVPTR(PL_regmatch_slab);
3287     SAVEVPTR(PL_regmatch_state);
3288
3289     /* grab next free state slot */
3290     st = ++PL_regmatch_state;
3291     if (st >  SLAB_LAST(PL_regmatch_slab))
3292         st = PL_regmatch_state = S_push_slab(aTHX);
3293
3294     /* Note that nextchr is a byte even in UTF */
3295     nextchr = UCHARAT(locinput);
3296     scan = prog;
3297     while (scan != NULL) {
3298
3299         DEBUG_EXECUTE_r( {
3300             SV * const prop = sv_newmortal();
3301             regnode *rnext=regnext(scan);
3302             DUMP_EXEC_POS( locinput, scan, utf8_target );
3303             regprop(rex, prop, scan);
3304
3305             PerlIO_printf(Perl_debug_log,
3306                     "%3"IVdf":%*s%s(%"IVdf")\n",
3307                     (IV)(scan - rexi->program), depth*2, "",
3308                     SvPVX_const(prop),
3309                     (PL_regkind[OP(scan)] == END || !rnext) ?
3310                         0 : (IV)(rnext - rexi->program));
3311         });
3312
3313         next = scan + NEXT_OFF(scan);
3314         if (next == scan)
3315             next = NULL;
3316         state_num = OP(scan);
3317
3318       reenter_switch:
3319
3320         switch (state_num) {
3321         case BOL:
3322             if (locinput == PL_bostr)
3323             {
3324                 /* reginfo->till = reginfo->bol; */
3325                 break;
3326             }
3327             sayNO;
3328         case MBOL:
3329             if (locinput == PL_bostr ||
3330                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3331             {
3332                 break;
3333             }
3334             sayNO;
3335         case SBOL:
3336             if (locinput == PL_bostr)
3337                 break;
3338             sayNO;
3339         case GPOS:
3340             if (locinput == reginfo->ganch)
3341                 break;
3342             sayNO;
3343
3344         case KEEPS:
3345             /* update the startpoint */
3346             st->u.keeper.val = rex->offs[0].start;
3347             PL_reginput = locinput;
3348             rex->offs[0].start = locinput - PL_bostr;
3349             PUSH_STATE_GOTO(KEEPS_next, next);
3350             /*NOT-REACHED*/
3351         case KEEPS_next_fail:
3352             /* rollback the start point change */
3353             rex->offs[0].start = st->u.keeper.val;
3354             sayNO_SILENT;
3355             /*NOT-REACHED*/
3356         case EOL:
3357                 goto seol;
3358         case MEOL:
3359             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3360                 sayNO;
3361             break;
3362         case SEOL:
3363           seol:
3364             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3365                 sayNO;
3366             if (PL_regeol - locinput > 1)
3367                 sayNO;
3368             break;
3369         case EOS:
3370             if (PL_regeol != locinput)
3371                 sayNO;
3372             break;
3373         case SANY:
3374             if (!nextchr && locinput >= PL_regeol)
3375                 sayNO;
3376             if (utf8_target) {
3377                 locinput += PL_utf8skip[nextchr];
3378                 if (locinput > PL_regeol)
3379                     sayNO;
3380                 nextchr = UCHARAT(locinput);
3381             }
3382             else
3383                 nextchr = UCHARAT(++locinput);
3384             break;
3385         case CANY:
3386             if (!nextchr && locinput >= PL_regeol)
3387                 sayNO;
3388             nextchr = UCHARAT(++locinput);
3389             break;
3390         case REG_ANY:
3391             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3392                 sayNO;
3393             if (utf8_target) {
3394                 locinput += PL_utf8skip[nextchr];
3395                 if (locinput > PL_regeol)
3396                     sayNO;
3397                 nextchr = UCHARAT(locinput);
3398             }
3399             else
3400                 nextchr = UCHARAT(++locinput);
3401             break;
3402
3403 #undef  ST
3404 #define ST st->u.trie
3405         case TRIEC:
3406             /* In this case the charclass data is available inline so
3407                we can fail fast without a lot of extra overhead.
3408              */
3409             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3410                 DEBUG_EXECUTE_r(
3411                     PerlIO_printf(Perl_debug_log,
3412                               "%*s  %sfailed to match trie start class...%s\n",
3413                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3414                 );
3415                 sayNO_SILENT;
3416                 assert(0); /* NOTREACHED */
3417             }
3418             /* FALL THROUGH */
3419         case TRIE:
3420             /* the basic plan of execution of the trie is:
3421              * At the beginning, run though all the states, and
3422              * find the longest-matching word. Also remember the position
3423              * of the shortest matching word. For example, this pattern:
3424              *    1  2 3 4    5
3425              *    ab|a|x|abcd|abc
3426              * when matched against the string "abcde", will generate
3427              * accept states for all words except 3, with the longest
3428              * matching word being 4, and the shortest being 1 (with
3429              * the position being after char 1 of the string).
3430              *
3431              * Then for each matching word, in word order (i.e. 1,2,4,5),
3432              * we run the remainder of the pattern; on each try setting
3433              * the current position to the character following the word,
3434              * returning to try the next word on failure.
3435              *
3436              * We avoid having to build a list of words at runtime by
3437              * using a compile-time structure, wordinfo[].prev, which
3438              * gives, for each word, the previous accepting word (if any).
3439              * In the case above it would contain the mappings 1->2, 2->0,
3440              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3441              * the longest word (4 above), a list of all words, by
3442              * following the list of prev pointers; this gives us the
3443              * unordered list 4,5,1,2. Then given the current word we have
3444              * just tried, we can go through the list and find the
3445              * next-biggest word to try (so if we just failed on word 2,
3446              * the next in the list is 4).
3447              *
3448              * Since at runtime we don't record the matching position in
3449              * the string for each word, we have to work that out for
3450              * each word we're about to process. The wordinfo table holds
3451              * the character length of each word; given that we recorded
3452              * at the start: the position of the shortest word and its
3453              * length in chars, we just need to move the pointer the
3454              * difference between the two char lengths. Depending on
3455              * Unicode status and folding, that's cheap or expensive.
3456              *
3457              * This algorithm is optimised for the case where are only a
3458              * small number of accept states, i.e. 0,1, or maybe 2.
3459              * With lots of accepts states, and having to try all of them,
3460              * it becomes quadratic on number of accept states to find all
3461              * the next words.
3462              */
3463
3464             {
3465                 /* what type of TRIE am I? (utf8 makes this contextual) */
3466                 DECL_TRIE_TYPE(scan);
3467
3468                 /* what trie are we using right now */
3469                 reg_trie_data * const trie
3470                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3471                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3472                 U32 state = trie->startstate;
3473
3474                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3475                     if (trie->states[ state ].wordnum) {
3476                          DEBUG_EXECUTE_r(
3477                             PerlIO_printf(Perl_debug_log,
3478                                           "%*s  %smatched empty string...%s\n",
3479                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3480                         );
3481                         if (!trie->jump)
3482                             break;
3483                     } else {
3484                         DEBUG_EXECUTE_r(
3485                             PerlIO_printf(Perl_debug_log,
3486                                           "%*s  %sfailed to match trie start class...%s\n",
3487                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3488                         );
3489                         sayNO_SILENT;
3490                    }
3491                 }
3492
3493             {
3494                 U8 *uc = ( U8* )locinput;
3495
3496                 STRLEN len = 0;
3497                 STRLEN foldlen = 0;
3498                 U8 *uscan = (U8*)NULL;
3499                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3500                 U32 charcount = 0; /* how many input chars we have matched */
3501                 U32 accepted = 0; /* have we seen any accepting states? */
3502
3503                 ST.jump = trie->jump;
3504                 ST.me = scan;
3505                 ST.firstpos = NULL;
3506                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3507                 ST.nextword = 0;
3508
3509                 /* fully traverse the TRIE; note the position of the
3510                    shortest accept state and the wordnum of the longest
3511                    accept state */
3512
3513                 while ( state && uc <= (U8*)PL_regeol ) {
3514                     U32 base = trie->states[ state ].trans.base;
3515                     UV uvc = 0;
3516                     U16 charid = 0;
3517                     U16 wordnum;
3518                     wordnum = trie->states[ state ].wordnum;
3519
3520                     if (wordnum) { /* it's an accept state */
3521                         if (!accepted) {
3522                             accepted = 1;
3523                             /* record first match position */
3524                             if (ST.longfold) {
3525                                 ST.firstpos = (U8*)locinput;
3526                                 ST.firstchars = 0;
3527                             }
3528                             else {
3529                                 ST.firstpos = uc;
3530                                 ST.firstchars = charcount;
3531                             }
3532                         }
3533                         if (!ST.nextword || wordnum < ST.nextword)
3534                             ST.nextword = wordnum;
3535                         ST.topword = wordnum;
3536                     }
3537
3538                     DEBUG_TRIE_EXECUTE_r({
3539                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3540                                 PerlIO_printf( Perl_debug_log,
3541                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3542                                     2+depth * 2, "", PL_colors[4],
3543                                     (UV)state, (accepted ? 'Y' : 'N'));
3544                     });
3545
3546                     /* read a char and goto next state */
3547                     if ( base ) {
3548                         I32 offset;
3549                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3550                                              uscan, len, uvc, charid, foldlen,
3551                                              foldbuf, uniflags);
3552                         charcount++;
3553                         if (foldlen>0)
3554                             ST.longfold = TRUE;
3555                         if (charid &&
3556                              ( ((offset =
3557                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3558
3559                              && ((U32)offset < trie->lasttrans)
3560                              && trie->trans[offset].check == state)
3561                         {
3562                             state = trie->trans[offset].next;
3563                         }
3564                         else {
3565                             state = 0;
3566                         }
3567                         uc += len;
3568
3569                     }
3570                     else {
3571                         state = 0;
3572                     }
3573                     DEBUG_TRIE_EXECUTE_r(
3574                         PerlIO_printf( Perl_debug_log,
3575                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3576                             charid, uvc, (UV)state, PL_colors[5] );
3577                     );
3578                 }
3579                 if (!accepted)
3580                    sayNO;
3581
3582                 /* calculate total number of accept states */
3583                 {
3584                     U16 w = ST.topword;
3585                     accepted = 0;
3586                     while (w) {
3587                         w = trie->wordinfo[w].prev;
3588                         accepted++;
3589                     }
3590                     ST.accepted = accepted;
3591                 }
3592
3593                 DEBUG_EXECUTE_r(
3594                     PerlIO_printf( Perl_debug_log,
3595                         "%*s  %sgot %"IVdf" possible matches%s\n",
3596                         REPORT_CODE_OFF + depth * 2, "",
3597                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3598                 );
3599                 goto trie_first_try; /* jump into the fail handler */
3600             }}
3601             assert(0); /* NOTREACHED */
3602
3603         case TRIE_next_fail: /* we failed - try next alternative */
3604             if ( ST.jump) {
3605                 REGCP_UNWIND(ST.cp);
3606                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3607             }
3608             if (!--ST.accepted) {
3609                 DEBUG_EXECUTE_r({
3610                     PerlIO_printf( Perl_debug_log,
3611                         "%*s  %sTRIE failed...%s\n",
3612                         REPORT_CODE_OFF+depth*2, "",
3613                         PL_colors[4],
3614                         PL_colors[5] );
3615                 });
3616                 sayNO_SILENT;
3617             }
3618             {
3619                 /* Find next-highest word to process.  Note that this code
3620                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3621                 U16 min = 0;
3622                 U16 word;
3623                 U16 const nextword = ST.nextword;
3624                 reg_trie_wordinfo * const wordinfo
3625                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3626                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3627                     if (word > nextword && (!min || word < min))
3628                         min = word;
3629                 }
3630                 ST.nextword = min;
3631             }
3632
3633           trie_first_try:
3634             if (do_cutgroup) {
3635                 do_cutgroup = 0;
3636                 no_final = 0;
3637             }
3638
3639             if ( ST.jump) {
3640                 ST.lastparen = rex->lastparen;
3641                 ST.lastcloseparen = rex->lastcloseparen;
3642                 REGCP_SET(ST.cp);
3643             }
3644
3645             /* find start char of end of current word */
3646             {
3647                 U32 chars; /* how many chars to skip */
3648                 U8 *uc = ST.firstpos;
3649                 reg_trie_data * const trie
3650                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3651
3652                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3653                             >=  ST.firstchars);
3654                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3655                             - ST.firstchars;
3656
3657                 if (ST.longfold) {
3658                     /* the hard option - fold each char in turn and find
3659                      * its folded length (which may be different */
3660                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3661                     STRLEN foldlen;
3662                     STRLEN len;
3663                     UV uvc;
3664                     U8 *uscan;
3665
3666                     while (chars) {
3667                         if (utf8_target) {
3668                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3669                                                     uniflags);
3670                             uc += len;
3671                         }
3672                         else {
3673                             uvc = *uc;
3674                             uc++;
3675                         }
3676                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3677                         uscan = foldbuf;
3678                         while (foldlen) {
3679                             if (!--chars)
3680                                 break;
3681                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3682                                             uniflags);
3683                             uscan += len;
3684                             foldlen -= len;
3685                         }
3686                     }
3687                 }
3688                 else {
3689                     if (utf8_target)
3690                         while (chars--)
3691                             uc += UTF8SKIP(uc);
3692                     else
3693                         uc += chars;
3694                 }
3695                 PL_reginput = (char *)uc;
3696             }
3697
3698             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3699                             ? ST.jump[ST.nextword]
3700                             : NEXT_OFF(ST.me));
3701
3702             DEBUG_EXECUTE_r({
3703                 PerlIO_printf( Perl_debug_log,
3704                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3705                     REPORT_CODE_OFF+depth*2, "",
3706                     PL_colors[4],
3707                     ST.nextword,
3708                     PL_colors[5]
3709                     );
3710             });
3711
3712             if (ST.accepted > 1 || has_cutgroup) {
3713                 PUSH_STATE_GOTO(TRIE_next, scan);
3714                 assert(0); /* NOTREACHED */
3715             }
3716             /* only one choice left - just continue */
3717             DEBUG_EXECUTE_r({
3718                 AV *const trie_words
3719                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3720                 SV ** const tmp = av_fetch( trie_words,
3721                     ST.nextword-1, 0 );
3722                 SV *sv= tmp ? sv_newmortal() : NULL;
3723
3724                 PerlIO_printf( Perl_debug_log,
3725                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3726                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3727                     ST.nextword,
3728                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3729                             PL_colors[0], PL_colors[1],
3730                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3731                         )
3732                     : "not compiled under -Dr",
3733                     PL_colors[5] );
3734             });
3735
3736             locinput = PL_reginput;
3737             nextchr = UCHARAT(locinput);
3738             continue; /* execute rest of RE */
3739             assert(0); /* NOTREACHED */
3740 #undef  ST
3741
3742         case EXACT: {
3743             char *s = STRING(scan);
3744             ln = STR_LEN(scan);
3745             if (utf8_target != UTF_PATTERN) {
3746                 /* The target and the pattern have differing utf8ness. */
3747                 char *l = locinput;
3748                 const char * const e = s + ln;
3749
3750                 if (utf8_target) {
3751                     /* The target is utf8, the pattern is not utf8. */
3752                     while (s < e) {
3753                         STRLEN ulen;
3754                         if (l >= PL_regeol)
3755                              sayNO;
3756                         if (NATIVE_TO_UNI(*(U8*)s) !=
3757                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3758                                             uniflags))
3759                              sayNO;
3760                         l += ulen;
3761                         s ++;
3762                     }
3763                 }
3764                 else {
3765                     /* The target is not utf8, the pattern is utf8. */
3766                     while (s < e) {
3767                         STRLEN ulen;
3768                         if (l >= PL_regeol)
3769                             sayNO;
3770                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3771                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3772                                            uniflags))
3773                             sayNO;
3774                         s += ulen;
3775                         l ++;
3776                     }
3777                 }
3778                 locinput = l;
3779                 nextchr = UCHARAT(locinput);
3780                 break;
3781             }
3782             /* The target and the pattern have the same utf8ness. */
3783             /* Inline the first character, for speed. */
3784             if (UCHARAT(s) != nextchr)
3785                 sayNO;
3786             if (PL_regeol - locinput < ln)
3787                 sayNO;
3788             if (ln > 1 && memNE(s, locinput, ln))
3789                 sayNO;
3790             locinput += ln;
3791             nextchr = UCHARAT(locinput);
3792             break;
3793             }
3794         case EXACTFL: {
3795             re_fold_t folder;
3796             const U8 * fold_array;
3797             const char * s;
3798             U32 fold_utf8_flags;
3799
3800             PL_reg_flags |= RF_tainted;
3801             folder = foldEQ_locale;
3802             fold_array = PL_fold_locale;
3803             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3804             goto do_exactf;
3805
3806         case EXACTFU_SS:
3807         case EXACTFU_TRICKYFOLD:
3808         case EXACTFU:
3809             folder = foldEQ_latin1;
3810             fold_array = PL_fold_latin1;
3811             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3812             goto do_exactf;
3813
3814         case EXACTFA:
3815             folder = foldEQ_latin1;
3816             fold_array = PL_fold_latin1;
3817             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3818             goto do_exactf;
3819
3820         case EXACTF:
3821             folder = foldEQ;
3822             fold_array = PL_fold;
3823             fold_utf8_flags = 0;
3824
3825           do_exactf:
3826             s = STRING(scan);
3827             ln = STR_LEN(scan);
3828
3829             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3830               /* Either target or the pattern are utf8, or has the issue where
3831                * the fold lengths may differ. */
3832                 const char * const l = locinput;
3833                 char *e = PL_regeol;
3834
3835                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3836                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3837                 {
3838                     sayNO;
3839                 }
3840                 locinput = e;
3841                 nextchr = UCHARAT(locinput);
3842                 break;
3843             }
3844
3845             /* Neither the target nor the pattern are utf8 */
3846             if (UCHARAT(s) != nextchr &&
3847                 UCHARAT(s) != fold_array[nextchr])
3848             {
3849                 sayNO;
3850             }
3851             if (PL_regeol - locinput < ln)
3852                 sayNO;
3853             if (ln > 1 && ! folder(s, locinput, ln))
3854                 sayNO;
3855             locinput += ln;
3856             nextchr = UCHARAT(locinput);
3857             break;
3858         }
3859
3860         /* XXX Could improve efficiency by separating these all out using a
3861          * macro or in-line function.  At that point regcomp.c would no longer
3862          * have to set the FLAGS fields of these */
3863         case BOUNDL:
3864         case NBOUNDL:
3865             PL_reg_flags |= RF_tainted;
3866             /* FALL THROUGH */
3867         case BOUND:
3868         case BOUNDU:
3869         case BOUNDA:
3870         case NBOUND:
3871         case NBOUNDU:
3872         case NBOUNDA:
3873             /* was last char in word? */
3874             if (utf8_target
3875                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3876                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3877             {
3878                 if (locinput == PL_bostr)
3879                     ln = '\n';
3880                 else {
3881                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3882
3883                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3884                 }
3885                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3886                     ln = isALNUM_uni(ln);
3887                     LOAD_UTF8_CHARCLASS_ALNUM();
3888                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3889                 }
3890                 else {
3891                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3892                     n = isALNUM_LC_utf8((U8*)locinput);
3893                 }
3894             }
3895             else {
3896
3897                 /* Here the string isn't utf8, or is utf8 and only ascii
3898                  * characters are to match \w.  In the latter case looking at
3899                  * the byte just prior to the current one may be just the final
3900                  * byte of a multi-byte character.  This is ok.  There are two
3901                  * cases:
3902                  * 1) it is a single byte character, and then the test is doing
3903                  *      just what it's supposed to.
3904                  * 2) it is a multi-byte character, in which case the final
3905                  *      byte is never mistakable for ASCII, and so the test
3906                  *      will say it is not a word character, which is the
3907                  *      correct answer. */
3908                 ln = (locinput != PL_bostr) ?
3909                     UCHARAT(locinput - 1) : '\n';
3910                 switch (FLAGS(scan)) {
3911                     case REGEX_UNICODE_CHARSET:
3912                         ln = isWORDCHAR_L1(ln);
3913                         n = isWORDCHAR_L1(nextchr);
3914                         break;
3915                     case REGEX_LOCALE_CHARSET:
3916                         ln = isALNUM_LC(ln);
3917                         n = isALNUM_LC(nextchr);
3918                         break;
3919                     case REGEX_DEPENDS_CHARSET:
3920                         ln = isALNUM(ln);
3921                         n = isALNUM(nextchr);
3922                         break;
3923                     case REGEX_ASCII_RESTRICTED_CHARSET:
3924                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3925                         ln = isWORDCHAR_A(ln);
3926                         n = isWORDCHAR_A(nextchr);
3927                         break;
3928                     default:
3929                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3930                         break;
3931                 }
3932             }
3933             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3934              * regcomp.sym */
3935             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3936                     sayNO;
3937             break;
3938         case ANYOFV:
3939         case ANYOF:
3940             if (utf8_target || state_num == ANYOFV) {
3941                 STRLEN inclasslen = PL_regeol - locinput;
3942                 if (locinput >= PL_regeol)
3943                     sayNO;
3944
3945                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3946                     sayNO;
3947                 locinput += inclasslen;
3948                 nextchr = UCHARAT(locinput);
3949                 break;
3950             }
3951             else {
3952                 if (nextchr < 0)
3953                     nextchr = UCHARAT(locinput);
3954                 if (!nextchr && locinput >= PL_regeol)
3955                     sayNO;
3956                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3957                     sayNO;
3958                 nextchr = UCHARAT(++locinput);
3959                 break;
3960             }
3961             break;
3962         /* Special char classes - The defines start on line 129 or so */
3963         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3964                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3965                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3966                   ALNUMA, NALNUMA, isWORDCHAR_A,
3967                   alnum, "a");
3968
3969         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3970                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3971                   SPACEU, NSPACEU, isSPACE_L1,
3972                   SPACEA, NSPACEA, isSPACE_A,
3973                   space, " ");
3974
3975         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3976                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3977                 DIGITA, NDIGITA, isDIGIT_A,
3978                 digit, "0");
3979
3980         case POSIXA:
3981             if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
3982                 sayNO;
3983             }
3984             /* Matched a utf8-invariant, so don't have to worry about utf8 */
3985             nextchr = UCHARAT(++locinput);
3986             break;
3987         case NPOSIXA:
3988             if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
3989                 sayNO;
3990             }
3991             if (utf8_target) {
3992                 locinput += PL_utf8skip[nextchr];
3993                 nextchr = UCHARAT(locinput);
3994             }
3995             else {
3996                 nextchr = UCHARAT(++locinput);
3997             }
3998             break;
3999
4000         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4001                        a Unicode extended Grapheme Cluster */
4002             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4003               extended Grapheme Cluster is:
4004
4005                CR LF
4006                | Prepend* Begin Extend*
4007                | .
4008
4009                Begin is:           ( Special_Begin | ! Control )
4010                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4011                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4012                Control is:         [ GCB_Control  CR  LF ]
4013                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4014
4015                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4016                we can rewrite
4017
4018                    Begin is ( Regular_Begin + Special Begin )
4019
4020                It turns out that 98.4% of all Unicode code points match
4021                Regular_Begin.  Doing it this way eliminates a table match in
4022                the previous implementation for almost all Unicode code points.
4023
4024                There is a subtlety with Prepend* which showed up in testing.
4025                Note that the Begin, and only the Begin is required in:
4026                 | Prepend* Begin Extend*
4027                Also, Begin contains '! Control'.  A Prepend must be a
4028                '!  Control', which means it must also be a Begin.  What it
4029                comes down to is that if we match Prepend* and then find no
4030                suitable Begin afterwards, that if we backtrack the last
4031                Prepend, that one will be a suitable Begin.
4032             */
4033
4034             if (locinput >= PL_regeol)
4035                 sayNO;
4036             if  (! utf8_target) {
4037
4038                 /* Match either CR LF  or '.', as all the other possibilities
4039                  * require utf8 */
4040                 locinput++;         /* Match the . or CR */
4041                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4042                                        match the LF */
4043                     && locinput < PL_regeol
4044                     && UCHARAT(locinput) == '\n') locinput++;
4045             }
4046             else {
4047
4048                 /* Utf8: See if is ( CR LF ); already know that locinput <
4049                  * PL_regeol, so locinput+1 is in bounds */
4050                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
4051                     locinput += 2;
4052                 }
4053                 else {
4054                     STRLEN len;
4055
4056                     /* In case have to backtrack to beginning, then match '.' */
4057                     char *starting = locinput;
4058
4059                     /* In case have to backtrack the last prepend */
4060                     char *previous_prepend = 0;
4061
4062                     LOAD_UTF8_CHARCLASS_GCB();
4063
4064                     /* Match (prepend)*   */
4065                     while (locinput < PL_regeol
4066                            && (len = is_GCB_Prepend_utf8(locinput)))
4067                     {
4068                         previous_prepend = locinput;
4069                         locinput += len;
4070                     }
4071
4072                     /* As noted above, if we matched a prepend character, but
4073                      * the next thing won't match, back off the last prepend we
4074                      * matched, as it is guaranteed to match the begin */
4075                     if (previous_prepend
4076                         && (locinput >=  PL_regeol
4077                             || (! swash_fetch(PL_utf8_X_regular_begin,
4078                                              (U8*)locinput, utf8_target)
4079                                  && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4080                         )
4081                     {
4082                         locinput = previous_prepend;
4083                     }
4084
4085                     /* Note that here we know PL_regeol > locinput, as we
4086                      * tested that upon input to this switch case, and if we
4087                      * moved locinput forward, we tested the result just above
4088                      * and it either passed, or we backed off so that it will
4089                      * now pass */
4090                     if (swash_fetch(PL_utf8_X_regular_begin,
4091                                     (U8*)locinput, utf8_target)) {
4092                         locinput += UTF8SKIP(locinput);
4093                     }
4094                     else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4095
4096                         /* Here did not match the required 'Begin' in the
4097                          * second term.  So just match the very first
4098                          * character, the '.' of the final term of the regex */
4099                         locinput = starting + UTF8SKIP(starting);
4100                         goto exit_utf8;
4101                     } else {
4102
4103                         /* Here is a special begin.  It can be composed of
4104                          * several individual characters.  One possibility is
4105                          * RI+ */
4106                         if ((len = is_GCB_RI_utf8(locinput))) {
4107                             locinput += len;
4108                             while (locinput < PL_regeol
4109                                    && (len = is_GCB_RI_utf8(locinput)))
4110                             {
4111                                 locinput += len;
4112                             }
4113                         } else if ((len = is_GCB_T_utf8(locinput))) {
4114                             /* Another possibility is T+ */
4115                             locinput += len;
4116                             while (locinput < PL_regeol
4117                                 && (len = is_GCB_T_utf8(locinput)))
4118                             {
4119                                 locinput += len;
4120                             }
4121                         } else {
4122
4123                             /* Here, neither RI+ nor T+; must be some other
4124                              * Hangul.  That means it is one of the others: L,
4125                              * LV, LVT or V, and matches:
4126                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4127
4128                             /* Match L*           */
4129                             while (locinput < PL_regeol
4130                                    && (len = is_GCB_L_utf8(locinput)))
4131                             {
4132                                 locinput += len;
4133                             }
4134
4135                             /* Here, have exhausted L*.  If the next character
4136                              * is not an LV, LVT nor V, it means we had to have
4137                              * at least one L, so matches L+ in the original
4138                              * equation, we have a complete hangul syllable.
4139                              * Are done. */
4140
4141                             if (locinput < PL_regeol
4142                                 && is_GCB_LV_LVT_V_utf8(locinput))
4143                             {
4144
4145                                 /* Otherwise keep going.  Must be LV, LVT or V.
4146                                  * See if LVT */
4147                                 if (is_utf8_X_LVT((U8*)locinput)) {
4148                                     locinput += UTF8SKIP(locinput);
4149                                 } else {
4150
4151                                     /* Must be  V or LV.  Take it, then match
4152                                      * V*     */
4153                                     locinput += UTF8SKIP(locinput);
4154                                     while (locinput < PL_regeol
4155                                            && (len = is_GCB_V_utf8(locinput)))
4156                                     {
4157                                         locinput += len;
4158                                     }
4159                                 }
4160
4161                                 /* And any of LV, LVT, or V can be followed
4162                                  * by T*            */
4163                                 while (locinput < PL_regeol
4164                                        && (len = is_GCB_T_utf8(locinput)))
4165                                 {
4166                                     locinput += len;
4167                                 }
4168                             }
4169                         }
4170                     }
4171
4172                     /* Match any extender */
4173                     while (locinput < PL_regeol
4174                             && swash_fetch(PL_utf8_X_extend,
4175                                             (U8*)locinput, utf8_target))
4176                     {
4177                         locinput += UTF8SKIP(locinput);
4178                     }
4179                 }
4180             exit_utf8:
4181                 if (locinput > PL_regeol) sayNO;
4182             }
4183             nextchr = UCHARAT(locinput);
4184             break;
4185
4186         case NREFFL:
4187         {   /* The capture buffer cases.  The ones beginning with N for the
4188                named buffers just convert to the equivalent numbered and
4189                pretend they were called as the corresponding numbered buffer
4190                op.  */
4191             /* don't initialize these in the declaration, it makes C++
4192                unhappy */
4193             char *s;
4194             char type;
4195             re_fold_t folder;
4196             const U8 *fold_array;
4197             UV utf8_fold_flags;
4198
4199             PL_reg_flags |= RF_tainted;
4200             folder = foldEQ_locale;
4201             fold_array = PL_fold_locale;
4202             type = REFFL;
4203             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4204             goto do_nref;
4205
4206         case NREFFA:
4207             folder = foldEQ_latin1;
4208             fold_array = PL_fold_latin1;
4209             type = REFFA;
4210             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4211             goto do_nref;
4212
4213         case NREFFU:
4214             folder = foldEQ_latin1;
4215             fold_array = PL_fold_latin1;
4216             type = REFFU;
4217             utf8_fold_flags = 0;
4218             goto do_nref;
4219
4220         case NREFF:
4221             folder = foldEQ;
4222             fold_array = PL_fold;
4223             type = REFF;
4224             utf8_fold_flags = 0;
4225             goto do_nref;
4226
4227         case NREF:
4228             type = REF;
4229             folder = NULL;
4230             fold_array = NULL;
4231             utf8_fold_flags = 0;
4232           do_nref:
4233
4234             /* For the named back references, find the corresponding buffer
4235              * number */
4236             n = reg_check_named_buff_matched(rex,scan);
4237
4238             if ( ! n ) {
4239                 sayNO;
4240             }
4241             goto do_nref_ref_common;
4242
4243         case REFFL:
4244             PL_reg_flags |= RF_tainted;
4245             folder = foldEQ_locale;
4246             fold_array = PL_fold_locale;
4247             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4248             goto do_ref;
4249
4250         case REFFA:
4251             folder = foldEQ_latin1;
4252             fold_array = PL_fold_latin1;
4253             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4254             goto do_ref;
4255
4256         case REFFU:
4257             folder = foldEQ_latin1;
4258             fold_array = PL_fold_latin1;
4259             utf8_fold_flags = 0;
4260             goto do_ref;
4261
4262         case REFF:
4263             folder = foldEQ;
4264             fold_array = PL_fold;
4265             utf8_fold_flags = 0;
4266             goto do_ref;
4267
4268         case REF:
4269             folder = NULL;
4270             fold_array = NULL;
4271             utf8_fold_flags = 0;
4272
4273           do_ref:
4274             type = OP(scan);
4275             n = ARG(scan);  /* which paren pair */
4276
4277           do_nref_ref_common:
4278             ln = rex->offs[n].start;
4279             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4280             if (rex->lastparen < n || ln == -1)
4281                 sayNO;                  /* Do not match unless seen CLOSEn. */
4282             if (ln == rex->offs[n].end)
4283                 break;
4284
4285             s = PL_bostr + ln;
4286             if (type != REF     /* REF can do byte comparison */
4287                 && (utf8_target || type == REFFU))
4288             { /* XXX handle REFFL better */
4289                 char * limit = PL_regeol;
4290
4291                 /* This call case insensitively compares the entire buffer
4292                     * at s, with the current input starting at locinput, but
4293                     * not going off the end given by PL_regeol, and returns in
4294                     * limit upon success, how much of the current input was
4295                     * matched */
4296                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4297                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4298                 {
4299                     sayNO;
4300                 }
4301                 locinput = limit;
4302                 nextchr = UCHARAT(locinput);
4303                 break;
4304             }
4305
4306             /* Not utf8:  Inline the first character, for speed. */
4307             if (UCHARAT(s) != nextchr &&
4308                 (type == REF ||
4309                  UCHARAT(s) != fold_array[nextchr]))
4310                 sayNO;
4311             ln = rex->offs[n].end - ln;
4312             if (locinput + ln > PL_regeol)
4313                 sayNO;
4314             if (ln > 1 && (type == REF
4315                            ? memNE(s, locinput, ln)
4316                            : ! folder(s, locinput, ln)))
4317                 sayNO;
4318             locinput += ln;
4319             nextchr = UCHARAT(locinput);
4320             break;
4321         }
4322         case NOTHING:
4323         case TAIL:
4324             break;
4325         case BACK:
4326             break;
4327
4328 #undef  ST
4329 #define ST st->u.eval
4330         {
4331             SV *ret;
4332             REGEXP *re_sv;
4333             regexp *re;
4334             regexp_internal *rei;
4335             regnode *startpoint;
4336
4337         case GOSTART:
4338         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4339             if (cur_eval && cur_eval->locinput==locinput) {
4340                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4341                     Perl_croak(aTHX_ "Infinite recursion in regex");
4342                 if ( ++nochange_depth > max_nochange_depth )
4343                     Perl_croak(aTHX_
4344                         "Pattern subroutine nesting without pos change"
4345                         " exceeded limit in regex");
4346             } else {
4347                 nochange_depth = 0;
4348             }
4349             re_sv = rex_sv;
4350             re = rex;
4351             rei = rexi;
4352             if (OP(scan)==GOSUB) {
4353                 startpoint = scan + ARG2L(scan);
4354                 ST.close_paren = ARG(scan);
4355             } else {
4356                 startpoint = rei->program+1;
4357                 ST.close_paren = 0;
4358             }
4359             goto eval_recurse_doit;
4360             assert(0); /* NOTREACHED */
4361         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4362             if (cur_eval && cur_eval->locinput==locinput) {
4363                 if ( ++nochange_depth > max_nochange_depth )
4364                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4365             } else {
4366                 nochange_depth = 0;
4367             }
4368             {
4369                 /* execute the code in the {...} */
4370
4371                 dSP;
4372                 SV ** before;
4373                 OP * const oop = PL_op;
4374                 COP * const ocurcop = PL_curcop;
4375                 OP *nop;
4376                 char *saved_regeol = PL_regeol;
4377                 struct re_save_state saved_state;
4378                 CV *newcv;
4379
4380                 /* save *all* paren positions */
4381                 regcppush(rex, 0);
4382                 REGCP_SET(runops_cp);
4383
4384                 /* To not corrupt the existing regex state while executing the
4385                  * eval we would normally put it on the save stack, like with
4386                  * save_re_context. However, re-evals have a weird scoping so we
4387                  * can't just add ENTER/LEAVE here. With that, things like
4388                  *
4389                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4390                  *
4391                  * would break, as they expect the localisation to be unwound
4392                  * only when the re-engine backtracks through the bit that
4393                  * localised it.
4394                  *
4395                  * What we do instead is just saving the state in a local c
4396                  * variable.
4397                  */
4398                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4399
4400                 PL_reg_state.re_reparsing = FALSE;
4401
4402                 if (!caller_cv)
4403                     caller_cv = find_runcv(NULL);
4404
4405                 n = ARG(scan);
4406
4407                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4408                     newcv = ((struct regexp *)SvANY(
4409                                                 (REGEXP*)(rexi->data->data[n])
4410                                             ))->qr_anoncv
4411                                         ;
4412                     nop = (OP*)rexi->data->data[n+1];
4413                 }
4414                 else if (rexi->data->what[n] == 'l') { /* literal code */
4415                     newcv = caller_cv;
4416                     nop = (OP*)rexi->data->data[n];
4417                     assert(CvDEPTH(newcv));
4418                 }
4419                 else {
4420                     /* literal with own CV */
4421                     assert(rexi->data->what[n] == 'L');
4422                     newcv = rex->qr_anoncv;
4423                     nop = (OP*)rexi->data->data[n];
4424                 }
4425
4426                 /* normally if we're about to execute code from the same
4427                  * CV that we used previously, we just use the existing
4428                  * CX stack entry. However, its possible that in the
4429                  * meantime we may have backtracked, popped from the save
4430                  * stack, and undone the SAVECOMPPAD(s) associated with
4431                  * PUSH_MULTICALL; in which case PL_comppad no longer
4432                  * points to newcv's pad. */
4433                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4434                 {
4435                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4436                     if (last_pushed_cv) {
4437                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4438                     }
4439                     else {
4440                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4441                     }
4442                     last_pushed_cv = newcv;
4443                 }
4444                 last_pad = PL_comppad;
4445
4446                 /* the initial nextstate you would normally execute
4447                  * at the start of an eval (which would cause error
4448                  * messages to come from the eval), may be optimised
4449                  * away from the execution path in the regex code blocks;
4450                  * so manually set PL_curcop to it initially */
4451                 {
4452                     OP *o = cUNOPx(nop)->op_first;
4453                     assert(o->op_type == OP_NULL);
4454                     if (o->op_targ == OP_SCOPE) {
4455                         o = cUNOPo->op_first;
4456                     }
4457                     else {
4458                         assert(o->op_targ == OP_LEAVE);
4459                         o = cUNOPo->op_first;
4460                         assert(o->op_type == OP_ENTER);
4461                         o = o->op_sibling;
4462                     }
4463
4464                     if (o->op_type != OP_STUB) {
4465                         assert(    o->op_type == OP_NEXTSTATE
4466                                 || o->op_type == OP_DBSTATE
4467                                 || (o->op_type == OP_NULL
4468                                     &&  (  o->op_targ == OP_NEXTSTATE
4469                                         || o->op_targ == OP_DBSTATE
4470                                         )
4471                                     )
4472                         );
4473                         PL_curcop = (COP*)o;
4474                     }
4475                 }
4476                 nop = nop->op_next;
4477
4478                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4479                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4480
4481                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4482
4483                 if (sv_yes_mark) {
4484                     SV *sv_mrk = get_sv("REGMARK", 1);
4485                     sv_setsv(sv_mrk, sv_yes_mark);
4486                 }
4487
4488                 /* we don't use MULTICALL here as we want to call the
4489                  * first op of the block of interest, rather than the
4490                  * first op of the sub */
4491                 before = SP;
4492                 PL_op = nop;
4493                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4494                 SPAGAIN;
4495                 if (SP == before)
4496                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4497                 else {
4498                     ret = POPs;
4499                     PUTBACK;
4500                 }
4501
4502                 /* before restoring everything, evaluate the returned
4503                  * value, so that 'uninit' warnings don't use the wrong
4504                  * PL_op or pad. Also need to process any magic vars
4505                  * (e.g. $1) *before* parentheses are restored */
4506
4507                 PL_op = NULL;
4508
4509                 re_sv = NULL;
4510                 if (logical == 0)        /*   (?{})/   */
4511                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4512                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4513                     sw = cBOOL(SvTRUE(ret));
4514                     logical = 0;
4515                 }
4516                 else {                   /*  /(??{})  */
4517                     /*  if its overloaded, let the regex compiler handle
4518                      *  it; otherwise extract regex, or stringify  */
4519                     if (!SvAMAGIC(ret)) {
4520                         SV *sv = ret;
4521                         if (SvROK(sv))
4522                             sv = SvRV(sv);
4523                         if (SvTYPE(sv) == SVt_REGEXP)
4524                             re_sv = (REGEXP*) sv;
4525                         else if (SvSMAGICAL(sv)) {
4526                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4527                             if (mg)
4528                                 re_sv = (REGEXP *) mg->mg_obj;
4529                         }
4530
4531                         /* force any magic, undef warnings here */
4532                         if (!re_sv) {
4533                             ret = sv_mortalcopy(ret);
4534                             (void) SvPV_force_nolen(ret);
4535                         }
4536                     }
4537
4538                 }
4539
4540                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4541
4542                 /* *** Note that at this point we don't restore
4543                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4544                  * be used again soon. This is safe as long as nothing
4545                  * in the regexp code uses the pad ! */
4546                 PL_op = oop;
4547                 PL_curcop = ocurcop;
4548                 PL_regeol = saved_regeol;
4549                 S_regcp_restore(aTHX_ rex, runops_cp);
4550
4551                 if (logical != 2)
4552                     break;
4553             }
4554
4555                 /* only /(??{})/  from now on */
4556                 logical = 0;
4557                 {
4558                     /* extract RE object from returned value; compiling if
4559                      * necessary */
4560
4561                     if (re_sv) {
4562                         re_sv = reg_temp_copy(NULL, re_sv);
4563                     }
4564                     else {
4565                         U32 pm_flags = 0;
4566                         const I32 osize = PL_regsize;
4567
4568                         if (SvUTF8(ret) && IN_BYTES) {
4569                             /* In use 'bytes': make a copy of the octet
4570                              * sequence, but without the flag on */
4571                             STRLEN len;
4572                             const char *const p = SvPV(ret, len);
4573                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4574                         }
4575                         if (rex->intflags & PREGf_USE_RE_EVAL)
4576                             pm_flags |= PMf_USE_RE_EVAL;
4577
4578                         /* if we got here, it should be an engine which
4579                          * supports compiling code blocks and stuff */
4580                         assert(rex->engine && rex->engine->op_comp);
4581                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4582                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4583                                     rex->engine, NULL, NULL,
4584                                     /* copy /msix etc to inner pattern */
4585                                     scan->flags,
4586                                     pm_flags);
4587
4588                         if (!(SvFLAGS(ret)
4589                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4590                                  | SVs_GMG))) {
4591                             /* This isn't a first class regexp. Instead, it's
4592                                caching a regexp onto an existing, Perl visible
4593                                scalar.  */
4594                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4595                         }
4596                         PL_regsize = osize;
4597                         /* safe to do now that any $1 etc has been
4598                          * interpolated into the new pattern string and
4599                          * compiled */
4600                         S_regcp_restore(aTHX_ rex, runops_cp);
4601                     }
4602                     re = (struct regexp *)SvANY(re_sv);
4603                 }
4604                 RXp_MATCH_COPIED_off(re);
4605                 re->subbeg = rex->subbeg;
4606                 re->sublen = rex->sublen;
4607                 re->suboffset = rex->suboffset;
4608                 re->subcoffset = rex->subcoffset;
4609                 rei = RXi_GET(re);
4610                 DEBUG_EXECUTE_r(
4611                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4612                         "Matching embedded");
4613                 );
4614                 startpoint = rei->program + 1;
4615                 ST.close_paren = 0; /* only used for GOSUB */
4616
4617         eval_recurse_doit: /* Share code with GOSUB below this line */
4618                 /* run the pattern returned from (??{...}) */
4619                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4620                 REGCP_SET(ST.lastcp);
4621
4622                 re->lastparen = 0;
4623                 re->lastcloseparen = 0;
4624
4625                 PL_reginput = locinput;
4626                 PL_regsize = 0;
4627
4628                 /* XXXX This is too dramatic a measure... */
4629                 PL_reg_maxiter = 0;
4630
4631                 ST.toggle_reg_flags = PL_reg_flags;
4632                 if (RX_UTF8(re_sv))
4633                     PL_reg_flags |= RF_utf8;
4634                 else
4635                     PL_reg_flags &= ~RF_utf8;
4636                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4637
4638                 ST.prev_rex = rex_sv;
4639                 ST.prev_curlyx = cur_curlyx;
4640                 rex_sv = re_sv;
4641                 SET_reg_curpm(rex_sv);
4642                 rex = re;
4643                 rexi = rei;
4644                 cur_curlyx = NULL;
4645                 ST.B = next;
4646                 ST.prev_eval = cur_eval;
4647                 cur_eval = st;
4648                 /* now continue from first node in postoned RE */
4649                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4650                 assert(0); /* NOTREACHED */
4651         }
4652
4653         case EVAL_AB: /* cleanup after a successful (??{A})B */
4654             /* note: this is called twice; first after popping B, then A */
4655             PL_reg_flags ^= ST.toggle_reg_flags;
4656             rex_sv = ST.prev_rex;
4657             SET_reg_curpm(rex_sv);
4658             rex = (struct regexp *)SvANY(rex_sv);
4659             rexi = RXi_GET(rex);
4660             regcpblow(ST.cp);
4661             cur_eval = ST.prev_eval;
4662             cur_curlyx = ST.prev_curlyx;
4663
4664             /* XXXX This is too dramatic a measure... */
4665             PL_reg_maxiter = 0;
4666             if ( nochange_depth )
4667                 nochange_depth--;
4668             sayYES;
4669
4670
4671         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4672             /* note: this is called twice; first after popping B, then A */
4673             PL_reg_flags ^= ST.toggle_reg_flags;
4674             rex_sv = ST.prev_rex;
4675             SET_reg_curpm(rex_sv);
4676             rex = (struct regexp *)SvANY(rex_sv);
4677             rexi = RXi_GET(rex);
4678
4679             PL_reginput = locinput;
4680             REGCP_UNWIND(ST.lastcp);
4681             regcppop(rex);
4682             cur_eval = ST.prev_eval;
4683             cur_curlyx = ST.prev_curlyx;
4684             /* XXXX This is too dramatic a measure... */
4685             PL_reg_maxiter = 0;
4686             if ( nochange_depth )
4687                 nochange_depth--;
4688             sayNO_SILENT;
4689 #undef ST
4690
4691         case OPEN:
4692             n = ARG(scan);  /* which paren pair */
4693             rex->offs[n].start_tmp = locinput - PL_bostr;
4694             if (n > PL_regsize)
4695                 PL_regsize = n;
4696             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4697                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4698                 PTR2UV(rex),
4699                 PTR2UV(rex->offs),
4700                 (UV)n,
4701                 (IV)rex->offs[n].start_tmp,
4702                 (UV)PL_regsize
4703             ));
4704             lastopen = n;
4705             break;
4706
4707 /* XXX really need to log other places start/end are set too */
4708 #define CLOSE_CAPTURE \
4709     rex->offs[n].start = rex->offs[n].start_tmp; \
4710     rex->offs[n].end = locinput - PL_bostr; \
4711     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4712         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4713         PTR2UV(rex), \
4714         PTR2UV(rex->offs), \
4715         (UV)n, \
4716         (IV)rex->offs[n].start, \
4717         (IV)rex->offs[n].end \
4718     ))
4719
4720         case CLOSE:
4721             n = ARG(scan);  /* which paren pair */
4722             CLOSE_CAPTURE;
4723             /*if (n > PL_regsize)
4724                 PL_regsize = n;*/
4725             if (n > rex->lastparen)
4726                 rex->lastparen = n;
4727             rex->lastcloseparen = n;
4728             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4729                 goto fake_end;
4730             }
4731             break;
4732         case ACCEPT:
4733             if (ARG(scan)){
4734                 regnode *cursor;
4735                 for (cursor=scan;
4736                      cursor && OP(cursor)!=END;
4737                      cursor=regnext(cursor))
4738                 {
4739                     if ( OP(cursor)==CLOSE ){
4740                         n = ARG(cursor);
4741                         if ( n <= lastopen ) {
4742                             CLOSE_CAPTURE;
4743                             /*if (n > PL_regsize)
4744                             PL_regsize = n;*/
4745                             if (n > rex->lastparen)
4746                                 rex->lastparen = n;
4747                             rex->lastcloseparen = n;
4748                             if ( n == ARG(scan) || (cur_eval &&
4749                                 cur_eval->u.eval.close_paren == n))
4750                                 break;
4751                         }
4752                     }
4753                 }
4754             }
4755             goto fake_end;
4756             /*NOTREACHED*/
4757         case GROUPP:
4758             n = ARG(scan);  /* which paren pair */
4759             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4760             break;
4761         case NGROUPP:
4762             /* reg_check_named_buff_matched returns 0 for no match */
4763             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4764             break;
4765         case INSUBP:
4766             n = ARG(scan);
4767             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4768             break;
4769         case DEFINEP:
4770             sw = 0;
4771             break;
4772         case IFTHEN:
4773             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4774             if (sw)
4775                 next = NEXTOPER(NEXTOPER(scan));
4776             else {
4777                 next = scan + ARG(scan);
4778                 if (OP(next) == IFTHEN) /* Fake one. */
4779                     next = NEXTOPER(NEXTOPER(next));
4780             }
4781             break;
4782         case LOGICAL:
4783             logical = scan->flags;
4784             break;
4785
4786 /*******************************************************************
4787
4788 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4789 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4790 STAR/PLUS/CURLY/CURLYN are used instead.)
4791
4792 A*B is compiled as <CURLYX><A><WHILEM><B>
4793
4794 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4795 state, which contains the current count, initialised to -1. It also sets
4796 cur_curlyx to point to this state, with any previous value saved in the
4797 state block.
4798
4799 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4800 since the pattern may possibly match zero times (i.e. it's a while {} loop
4801 rather than a do {} while loop).
4802
4803 Each entry to WHILEM represents a successful match of A. The count in the
4804 CURLYX block is incremented, another WHILEM state is pushed, and execution
4805 passes to A or B depending on greediness and the current count.
4806
4807 For example, if matching against the string a1a2a3b (where the aN are
4808 substrings that match /A/), then the match progresses as follows: (the
4809 pushed states are interspersed with the bits of strings matched so far):
4810
4811     <CURLYX cnt=-1>
4812     <CURLYX cnt=0><WHILEM>
4813     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4814     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4815     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4816     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4817
4818 (Contrast this with something like CURLYM, which maintains only a single
4819 backtrack state:
4820
4821     <CURLYM cnt=0> a1
4822     a1 <CURLYM cnt=1> a2
4823     a1 a2 <CURLYM cnt=2> a3
4824     a1 a2 a3 <CURLYM cnt=3> b
4825 )
4826
4827 Each WHILEM state block marks a point to backtrack to upon partial failure
4828 of A or B, and also contains some minor state data related to that
4829 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4830 overall state, such as the count, and pointers to the A and B ops.
4831
4832 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4833 must always point to the *current* CURLYX block, the rules are:
4834
4835 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4836 and set cur_curlyx to point the new block.
4837
4838 When popping the CURLYX block after a successful or unsuccessful match,
4839 restore the previous cur_curlyx.
4840
4841 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4842 to the outer one saved in the CURLYX block.
4843
4844 When popping the WHILEM block after a successful or unsuccessful B match,
4845 restore the previous cur_curlyx.
4846
4847 Here's an example for the pattern (AI* BI)*BO
4848 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4849
4850 cur_
4851 curlyx backtrack stack
4852 ------ ---------------
4853 NULL
4854 CO     <CO prev=NULL> <WO>
4855 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4856 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4857 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4858
4859 At this point the pattern succeeds, and we work back down the stack to
4860 clean up, restoring as we go:
4861
4862 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4863 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4864 CO     <CO prev=NULL> <WO>
4865 NULL
4866
4867 *******************************************************************/
4868
4869 #define ST st->u.curlyx
4870
4871         case CURLYX:    /* start of /A*B/  (for complex A) */
4872         {
4873             /* No need to save/restore up to this paren */
4874             I32 parenfloor = scan->flags;
4875
4876             assert(next); /* keep Coverity happy */
4877             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4878                 next += ARG(next);
4879
4880             /* XXXX Probably it is better to teach regpush to support
4881                parenfloor > PL_regsize... */
4882             if (parenfloor > (I32)rex->lastparen)
4883                 parenfloor = rex->lastparen; /* Pessimization... */
4884
4885             ST.prev_curlyx= cur_curlyx;
4886             cur_curlyx = st;
4887             ST.cp = PL_savestack_ix;
4888
4889             /* these fields contain the state of the current curly.
4890              * they are accessed by subsequent WHILEMs */
4891             ST.parenfloor = parenfloor;
4892             ST.me = scan;
4893             ST.B = next;
4894             ST.minmod = minmod;
4895             minmod = 0;
4896             ST.count = -1;      /* this will be updated by WHILEM */
4897             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4898
4899             PL_reginput = locinput;
4900             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4901             assert(0); /* NOTREACHED */
4902         }
4903
4904         case CURLYX_end: /* just finished matching all of A*B */
4905             cur_curlyx = ST.prev_curlyx;
4906             sayYES;
4907             assert(0); /* NOTREACHED */
4908
4909         case CURLYX_end_fail: /* just failed to match all of A*B */
4910             regcpblow(ST.cp);
4911             cur_curlyx = ST.prev_curlyx;
4912             sayNO;
4913             assert(0); /* NOTREACHED */
4914
4915
4916 #undef ST
4917 #define ST st->u.whilem
4918
4919         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4920         {
4921             /* see the discussion above about CURLYX/WHILEM */
4922             I32 n;
4923             int min = ARG1(cur_curlyx->u.curlyx.me);
4924             int max = ARG2(cur_curlyx->u.curlyx.me);
4925             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4926
4927             assert(cur_curlyx); /* keep Coverity happy */
4928             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4929             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4930             ST.cache_offset = 0;
4931             ST.cache_mask = 0;
4932
4933             PL_reginput = locinput;
4934
4935             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4936                   "%*s  whilem: matched %ld out of %d..%d\n",
4937                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4938             );
4939
4940             /* First just match a string of min A's. */
4941
4942             if (n < min) {
4943                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4944                 cur_curlyx->u.curlyx.lastloc = locinput;
4945                 REGCP_SET(ST.lastcp);
4946
4947                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4948                 assert(0); /* NOTREACHED */
4949             }
4950
4951             /* If degenerate A matches "", assume A done. */
4952
4953             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4954                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4955                    "%*s  whilem: empty match detected, trying continuation...\n",
4956                    REPORT_CODE_OFF+depth*2, "")
4957                 );
4958                 goto do_whilem_B_max;
4959             }
4960
4961             /* super-linear cache processing */
4962
4963             if (scan->flags) {
4964
4965                 if (!PL_reg_maxiter) {
4966                     /* start the countdown: Postpone detection until we
4967                      * know the match is not *that* much linear. */
4968                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4969                     /* possible overflow for long strings and many CURLYX's */
4970                     if (PL_reg_maxiter < 0)
4971                         PL_reg_maxiter = I32_MAX;
4972                     PL_reg_leftiter = PL_reg_maxiter;
4973                 }
4974
4975                 if (PL_reg_leftiter-- == 0) {
4976                     /* initialise cache */
4977                     const I32 size = (PL_reg_maxiter + 7)/8;
4978                     if (PL_reg_poscache) {
4979                         if ((I32)PL_reg_poscache_size < size) {
4980                             Renew(PL_reg_poscache, size, char);
4981                             PL_reg_poscache_size = size;
4982                         }
4983                         Zero(PL_reg_poscache, size, char);
4984                     }
4985                     else {
4986                         PL_reg_poscache_size = size;
4987                         Newxz(PL_reg_poscache, size, char);
4988                     }
4989                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4990       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4991                               PL_colors[4], PL_colors[5])
4992                     );
4993                 }
4994
4995                 if (PL_reg_leftiter < 0) {
4996                     /* have we already failed at this position? */
4997                     I32 offset, mask;
4998                     offset  = (scan->flags & 0xf) - 1
4999                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5000                     mask    = 1 << (offset % 8);
5001                     offset /= 8;
5002                     if (PL_reg_poscache[offset] & mask) {
5003                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5004                             "%*s  whilem: (cache) already tried at this position...\n",
5005                             REPORT_CODE_OFF+depth*2, "")
5006                         );
5007                         sayNO; /* cache records failure */
5008                     }
5009                     ST.cache_offset = offset;
5010                     ST.cache_mask   = mask;
5011                 }
5012             }
5013
5014             /* Prefer B over A for minimal matching. */
5015
5016             if (cur_curlyx->u.curlyx.minmod) {
5017                 ST.save_curlyx = cur_curlyx;
5018                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5019                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5020                 REGCP_SET(ST.lastcp);
5021                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
5022                 assert(0); /* NOTREACHED */
5023             }
5024
5025             /* Prefer A over B for maximal matching. */
5026
5027             if (n < max) { /* More greed allowed? */
5028                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5029                 cur_curlyx->u.curlyx.lastloc = locinput;
5030                 REGCP_SET(ST.lastcp);
5031                 PUSH_STATE_GOTO(WHILEM_A_max, A);
5032                 assert(0); /* NOTREACHED */
5033             }
5034             goto do_whilem_B_max;
5035         }
5036         assert(0); /* NOTREACHED */
5037
5038         case WHILEM_B_min: /* just matched B in a minimal match */
5039         case WHILEM_B_max: /* just matched B in a maximal match */
5040             cur_curlyx = ST.save_curlyx;
5041             sayYES;
5042             assert(0); /* NOTREACHED */
5043
5044         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5045             cur_curlyx = ST.save_curlyx;
5046             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5047             cur_curlyx->u.curlyx.count--;
5048             CACHEsayNO;
5049             assert(0); /* NOTREACHED */
5050
5051         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5052             /* FALL THROUGH */
5053         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5054             REGCP_UNWIND(ST.lastcp);
5055             regcppop(rex);
5056             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5057             cur_curlyx->u.curlyx.count--;
5058             CACHEsayNO;
5059             assert(0); /* NOTREACHED */
5060
5061         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5062             REGCP_UNWIND(ST.lastcp);
5063             regcppop(rex);      /* Restore some previous $<digit>s? */
5064             PL_reginput = locinput;
5065             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5066                 "%*s  whilem: failed, trying continuation...\n",
5067                 REPORT_CODE_OFF+depth*2, "")
5068             );
5069           do_whilem_B_max:
5070             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5071                 && ckWARN(WARN_REGEXP)
5072                 && !(PL_reg_flags & RF_warned))
5073             {
5074                 PL_reg_flags |= RF_warned;
5075                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5076                      "Complex regular subexpression recursion limit (%d) "
5077                      "exceeded",
5078                      REG_INFTY - 1);
5079             }
5080
5081             /* now try B */
5082             ST.save_curlyx = cur_curlyx;
5083             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5084             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
5085             assert(0); /* NOTREACHED */
5086
5087         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5088             cur_curlyx = ST.save_curlyx;
5089             REGCP_UNWIND(ST.lastcp);
5090             regcppop(rex);
5091
5092             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5093                 /* Maximum greed exceeded */
5094                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5095                     && ckWARN(WARN_REGEXP)
5096                     && !(PL_reg_flags & RF_warned))
5097                 {
5098                     PL_reg_flags |= RF_warned;
5099                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5100                         "Complex regular subexpression recursion "
5101                         "limit (%d) exceeded",
5102                         REG_INFTY - 1);
5103                 }
5104                 cur_curlyx->u.curlyx.count--;
5105                 CACHEsayNO;
5106             }
5107
5108             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5109                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5110             );
5111             /* Try grabbing another A and see if it helps. */
5112             PL_reginput = locinput;
5113             cur_curlyx->u.curlyx.lastloc = locinput;
5114             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5115             REGCP_SET(ST.lastcp);
5116             PUSH_STATE_GOTO(WHILEM_A_min,
5117                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5118             assert(0); /* NOTREACHED */
5119
5120 #undef  ST
5121 #define ST st->u.branch
5122
5123         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5124             next = scan + ARG(scan);
5125             if (next == scan)
5126                 next = NULL;
5127             scan = NEXTOPER(scan);
5128             /* FALL THROUGH */
5129
5130         case BRANCH:        /*  /(...|A|...)/ */
5131             scan = NEXTOPER(scan); /* scan now points to inner node */
5132             ST.lastparen = rex->lastparen;
5133             ST.lastcloseparen = rex->lastcloseparen;
5134             ST.next_branch = next;
5135             REGCP_SET(ST.cp);
5136             PL_reginput = locinput;
5137
5138             /* Now go into the branch */
5139             if (has_cutgroup) {
5140                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5141             } else {
5142                 PUSH_STATE_GOTO(BRANCH_next, scan);
5143             }
5144             assert(0); /* NOTREACHED */
5145         case CUTGROUP:
5146             PL_reginput = locinput;
5147             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5148                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5149             PUSH_STATE_GOTO(CUTGROUP_next,next);
5150             assert(0); /* NOTREACHED */
5151         case CUTGROUP_next_fail:
5152             do_cutgroup = 1;
5153             no_final = 1;
5154             if (st->u.mark.mark_name)
5155                 sv_commit = st->u.mark.mark_name;
5156             sayNO;
5157             assert(0); /* NOTREACHED */
5158         case BRANCH_next:
5159             sayYES;
5160             assert(0); /* NOTREACHED */
5161         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5162             if (do_cutgroup) {
5163                 do_cutgroup = 0;
5164                 no_final = 0;
5165             }
5166             REGCP_UNWIND(ST.cp);
5167             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5168             scan = ST.next_branch;
5169             /* no more branches? */
5170             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5171                 DEBUG_EXECUTE_r({
5172                     PerlIO_printf( Perl_debug_log,
5173                         "%*s  %sBRANCH failed...%s\n",
5174                         REPORT_CODE_OFF+depth*2, "",
5175                         PL_colors[4],
5176                         PL_colors[5] );
5177                 });
5178                 sayNO_SILENT;
5179             }
5180             continue; /* execute next BRANCH[J] op */
5181             assert(0); /* NOTREACHED */
5182
5183         case MINMOD:
5184             minmod = 1;
5185             break;
5186
5187 #undef  ST
5188 #define ST st->u.curlym
5189
5190         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5191
5192             /* This is an optimisation of CURLYX that enables us to push
5193              * only a single backtracking state, no matter how many matches
5194              * there are in {m,n}. It relies on the pattern being constant
5195              * length, with no parens to influence future backrefs
5196              */
5197
5198             ST.me = scan;
5199             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5200
5201             ST.lastparen      = rex->lastparen;
5202             ST.lastcloseparen = rex->lastcloseparen;
5203
5204             /* if paren positive, emulate an OPEN/CLOSE around A */
5205             if (ST.me->flags) {
5206                 U32 paren = ST.me->flags;
5207                 if (paren > PL_regsize)
5208                     PL_regsize = paren;
5209                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5210             }
5211             ST.A = scan;
5212             ST.B = next;
5213             ST.alen = 0;
5214             ST.count = 0;
5215             ST.minmod = minmod;
5216             minmod = 0;
5217             ST.c1 = CHRTEST_UNINIT;
5218             REGCP_SET(ST.cp);
5219
5220             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5221                 goto curlym_do_B;
5222
5223           curlym_do_A: /* execute the A in /A{m,n}B/  */
5224             PL_reginput = locinput;
5225             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5226             assert(0); /* NOTREACHED */
5227
5228         case CURLYM_A: /* we've just matched an A */
5229             locinput = st->locinput;
5230             nextchr = UCHARAT(locinput);
5231
5232             ST.count++;
5233             /* after first match, determine A's length: u.curlym.alen */
5234             if (ST.count == 1) {
5235                 if (PL_reg_match_utf8) {
5236                     char *s = locinput;
5237                     while (s < PL_reginput) {
5238                         ST.alen++;
5239                         s += UTF8SKIP(s);
5240                     }
5241                 }
5242                 else {
5243                     ST.alen = PL_reginput - locinput;
5244                 }
5245                 if (ST.alen == 0)
5246                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5247             }
5248             DEBUG_EXECUTE_r(
5249                 PerlIO_printf(Perl_debug_log,
5250                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5251                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5252                           (IV) ST.count, (IV)ST.alen)
5253             );
5254
5255             locinput = PL_reginput;
5256
5257             if (cur_eval && cur_eval->u.eval.close_paren &&
5258                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5259                 goto fake_end;
5260
5261             {
5262                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5263                 if ( max == REG_INFTY || ST.count < max )
5264                     goto curlym_do_A; /* try to match another A */
5265             }
5266             goto curlym_do_B; /* try to match B */
5267
5268         case CURLYM_A_fail: /* just failed to match an A */
5269             REGCP_UNWIND(ST.cp);
5270
5271             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5272                 || (cur_eval && cur_eval->u.eval.close_paren &&
5273                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5274                 sayNO;
5275
5276           curlym_do_B: /* execute the B in /A{m,n}B/  */
5277             PL_reginput = locinput;
5278             if (ST.c1 == CHRTEST_UNINIT) {
5279                 /* calculate c1 and c2 for possible match of 1st char
5280                  * following curly */
5281                 ST.c1 = ST.c2 = CHRTEST_VOID;
5282                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5283                     regnode *text_node = ST.B;
5284                     if (! HAS_TEXT(text_node))
5285                         FIND_NEXT_IMPT(text_node);
5286                     /* this used to be
5287
5288                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5289
5290                         But the former is redundant in light of the latter.
5291
5292                         if this changes back then the macro for
5293                         IS_TEXT and friends need to change.
5294                      */
5295                     if (PL_regkind[OP(text_node)] == EXACT)
5296                     {
5297
5298                         ST.c1 = (U8)*STRING(text_node);
5299                         switch (OP(text_node)) {
5300                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5301                             case EXACTFA:
5302                             case EXACTFU_SS:
5303                             case EXACTFU_TRICKYFOLD:
5304                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5305                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5306                             default: ST.c2 = ST.c1;
5307                         }
5308                     }
5309                 }
5310             }
5311
5312             DEBUG_EXECUTE_r(
5313                 PerlIO_printf(Perl_debug_log,
5314                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5315                     (int)(REPORT_CODE_OFF+(depth*2)),
5316                     "", (IV)ST.count)
5317                 );
5318             if (ST.c1 != CHRTEST_VOID
5319                     && UCHARAT(PL_reginput) != ST.c1
5320                     && UCHARAT(PL_reginput) != ST.c2)
5321             {
5322                 /* simulate B failing */
5323                 DEBUG_OPTIMISE_r(
5324                     PerlIO_printf(Perl_debug_log,
5325                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5326                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5327                         (IV)ST.c1,(IV)ST.c2
5328                 ));
5329                 state_num = CURLYM_B_fail;
5330                 goto reenter_switch;
5331             }
5332
5333             if (ST.me->flags) {
5334                 /* emulate CLOSE: mark current A as captured */
5335                 I32 paren = ST.me->flags;
5336                 if (ST.count) {
5337                     rex->offs[paren].start
5338                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5339                     rex->offs[paren].end = PL_reginput - PL_bostr;
5340                     if ((U32)paren > rex->lastparen)
5341                         rex->lastparen = paren;
5342                     rex->lastcloseparen = paren;
5343                 }
5344                 else
5345                     rex->offs[paren].end = -1;
5346                 if (cur_eval && cur_eval->u.eval.close_paren &&
5347                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5348                 {
5349                     if (ST.count)
5350                         goto fake_end;
5351                     else
5352                         sayNO;
5353                 }
5354             }
5355
5356             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5357             assert(0); /* NOTREACHED */
5358
5359         case CURLYM_B_fail: /* just failed to match a B */
5360             REGCP_UNWIND(ST.cp);
5361             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5362             if (ST.minmod) {
5363                 I32 max = ARG2(ST.me);
5364                 if (max != REG_INFTY && ST.count == max)
5365                     sayNO;
5366                 goto curlym_do_A; /* try to match a further A */
5367             }
5368             /* backtrack one A */
5369             if (ST.count == ARG1(ST.me) /* min */)
5370                 sayNO;
5371             ST.count--;
5372             locinput = HOPc(locinput, -ST.alen);
5373             goto curlym_do_B; /* try to match B */
5374
5375 #undef ST
5376 #define ST st->u.curly
5377
5378 #define CURLY_SETPAREN(paren, success) \
5379     if (paren) { \
5380         if (success) { \
5381             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5382             rex->offs[paren].end = locinput - PL_bostr; \
5383             if (paren > rex->lastparen) \
5384                 rex->lastparen = paren; \
5385             rex->lastcloseparen = paren; \
5386         } \
5387         else { \
5388             rex->offs[paren].end = -1; \
5389             rex->lastparen      = ST.lastparen; \
5390             rex->lastcloseparen = ST.lastcloseparen; \
5391         } \
5392     }
5393
5394         case STAR:              /*  /A*B/ where A is width 1 */
5395             ST.paren = 0;
5396             ST.min = 0;
5397             ST.max = REG_INFTY;
5398             scan = NEXTOPER(scan);
5399             goto repeat;
5400         case PLUS:              /*  /A+B/ where A is width 1 */
5401             ST.paren = 0;
5402             ST.min = 1;
5403             ST.max = REG_INFTY;
5404             scan = NEXTOPER(scan);
5405             goto repeat;
5406         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5407             ST.paren = scan->flags;     /* Which paren to set */
5408             ST.lastparen      = rex->lastparen;
5409             ST.lastcloseparen = rex->lastcloseparen;
5410             if (ST.paren > PL_regsize)
5411                 PL_regsize = ST.paren;
5412             ST.min = ARG1(scan);  /* min to match */
5413             ST.max = ARG2(scan);  /* max to match */
5414             if (cur_eval && cur_eval->u.eval.close_paren &&
5415                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5416                 ST.min=1;
5417                 ST.max=1;
5418             }
5419             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5420             goto repeat;
5421         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5422             ST.paren = 0;
5423             ST.min = ARG1(scan);  /* min to match */
5424             ST.max = ARG2(scan);  /* max to match */
5425             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5426           repeat:
5427             /*
5428             * Lookahead to avoid useless match attempts
5429             * when we know what character comes next.
5430             *
5431             * Used to only do .*x and .*?x, but now it allows
5432             * for )'s, ('s and (?{ ... })'s to be in the way
5433             * of the quantifier and the EXACT-like node.  -- japhy
5434             */
5435
5436             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5437                 sayNO;
5438             if (HAS_TEXT(next) || JUMPABLE(next)) {
5439                 U8 *s;
5440                 regnode *text_node = next;
5441
5442                 if (! HAS_TEXT(text_node))
5443                     FIND_NEXT_IMPT(text_node);
5444
5445                 if (! HAS_TEXT(text_node))
5446                     ST.c1 = ST.c2 = CHRTEST_VOID;
5447                 else {
5448                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5449                         ST.c1 = ST.c2 = CHRTEST_VOID;
5450                         goto assume_ok_easy;
5451                     }
5452                     else
5453                         s = (U8*)STRING(text_node);
5454
5455                     /*  Currently we only get here when
5456
5457                         PL_rekind[OP(text_node)] == EXACT
5458
5459                         if this changes back then the macro for IS_TEXT and
5460                         friends need to change. */
5461                     if (!UTF_PATTERN) {
5462                         ST.c1 = *s;
5463                         switch (OP(text_node)) {
5464                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5465                             case EXACTFA:
5466                             case EXACTFU_SS:
5467                             case EXACTFU_TRICKYFOLD:
5468                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5469                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5470                             default: ST.c2 = ST.c1; break;
5471                         }
5472                     }
5473                     else { /* UTF_PATTERN */
5474                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5475                              STRLEN ulen;
5476                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5477
5478                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5479                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5480                                                     uniflags);
5481                         }
5482                         else {
5483                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5484                                                      uniflags);
5485                         }
5486                     }
5487                 }
5488             }
5489             else
5490                 ST.c1 = ST.c2 = CHRTEST_VOID;
5491         assume_ok_easy:
5492
5493             ST.A = scan;
5494             ST.B = next;
5495             PL_reginput = locinput;
5496             if (minmod) {
5497                 minmod = 0;
5498                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5499                     sayNO;
5500                 ST.count = ST.min;
5501                 locinput = PL_reginput;
5502                 REGCP_SET(ST.cp);
5503                 if (ST.c1 == CHRTEST_VOID)
5504                     goto curly_try_B_min;
5505
5506                 ST.oldloc = locinput;
5507
5508                 /* set ST.maxpos to the furthest point along the
5509                  * string that could possibly match */
5510                 if  (ST.max == REG_INFTY) {
5511                     ST.maxpos = PL_regeol - 1;
5512                     if (utf8_target)
5513                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5514                             ST.maxpos--;
5515                 }
5516                 else if (utf8_target) {
5517                     int m = ST.max - ST.min;
5518                     for (ST.maxpos = locinput;
5519                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5520                         ST.maxpos += UTF8SKIP(ST.maxpos);
5521                 }
5522                 else {
5523                     ST.maxpos = locinput + ST.max - ST.min;
5524                     if (ST.maxpos >= PL_regeol)
5525                         ST.maxpos = PL_regeol - 1;
5526                 }
5527                 goto curly_try_B_min_known;
5528
5529             }
5530             else {
5531                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5532                 locinput = PL_reginput;
5533                 if (ST.count < ST.min)
5534                     sayNO;
5535                 if ((ST.count > ST.min)
5536                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5537                 {
5538                     /* A{m,n} must come at the end of the string, there's
5539                      * no point in backing off ... */
5540                     ST.min = ST.count;
5541                     /* ...except that $ and \Z can match before *and* after
5542                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5543                        We may back off by one in this case. */
5544                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5545                         ST.min--;
5546                 }
5547                 REGCP_SET(ST.cp);
5548                 goto curly_try_B_max;
5549             }
5550             assert(0); /* NOTREACHED */
5551
5552
5553         case CURLY_B_min_known_fail:
5554             /* failed to find B in a non-greedy match where c1,c2 valid */
5555
5556             PL_reginput = locinput;     /* Could be reset... */
5557             REGCP_UNWIND(ST.cp);
5558             if (ST.paren) {
5559                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5560             }
5561             /* Couldn't or didn't -- move forward. */
5562             ST.oldloc = locinput;
5563             if (utf8_target)
5564                 locinput += UTF8SKIP(locinput);
5565             else
5566                 locinput++;
5567             ST.count++;
5568           curly_try_B_min_known:
5569              /* find the next place where 'B' could work, then call B */
5570             {
5571                 int n;
5572                 if (utf8_target) {
5573                     n = (ST.oldloc == locinput) ? 0 : 1;
5574                     if (ST.c1 == ST.c2) {
5575                         STRLEN len;
5576                         /* set n to utf8_distance(oldloc, locinput) */
5577                         while (locinput <= ST.maxpos &&
5578                                utf8n_to_uvchr((U8*)locinput,
5579                                               UTF8_MAXBYTES, &len,
5580                                               uniflags) != (UV)ST.c1) {
5581                             locinput += len;
5582                             n++;
5583                         }
5584                     }
5585                     else {
5586                         /* set n to utf8_distance(oldloc, locinput) */
5587                         while (locinput <= ST.maxpos) {
5588                             STRLEN len;
5589                             const UV c = utf8n_to_uvchr((U8*)locinput,
5590                                                   UTF8_MAXBYTES, &len,
5591                                                   uniflags);
5592                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5593                                 break;
5594                             locinput += len;
5595                             n++;
5596                         }
5597                     }
5598                 }
5599                 else {
5600                     if (ST.c1 == ST.c2) {
5601                         while (locinput <= ST.maxpos &&
5602                                UCHARAT(locinput) != ST.c1)
5603                             locinput++;
5604                     }
5605                     else {
5606                         while (locinput <= ST.maxpos
5607                                && UCHARAT(locinput) != ST.c1
5608                                && UCHARAT(locinput) != ST.c2)
5609                             locinput++;
5610                     }
5611                     n = locinput - ST.oldloc;
5612                 }
5613                 if (locinput > ST.maxpos)
5614                     sayNO;
5615                 /* PL_reginput == oldloc now */
5616                 if (n) {
5617                     ST.count += n;
5618                     if (regrepeat(rex, ST.A, n, depth) < n)
5619                         sayNO;
5620                 }
5621                 PL_reginput = locinput;
5622                 CURLY_SETPAREN(ST.paren, ST.count);
5623                 if (cur_eval && cur_eval->u.eval.close_paren &&
5624                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5625                     goto fake_end;
5626                 }
5627                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5628             }
5629             assert(0); /* NOTREACHED */
5630
5631
5632         case CURLY_B_min_fail:
5633             /* failed to find B in a non-greedy match where c1,c2 invalid */
5634
5635             REGCP_UNWIND(ST.cp);
5636             if (ST.paren) {
5637                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5638             }
5639             /* failed -- move forward one */
5640             PL_reginput = locinput;
5641             if (regrepeat(rex, ST.A, 1, depth)) {
5642                 ST.count++;
5643                 locinput = PL_reginput;
5644                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5645                         ST.count > 0)) /* count overflow ? */
5646                 {
5647                   curly_try_B_min:
5648                     CURLY_SETPAREN(ST.paren, ST.count);
5649                     if (cur_eval && cur_eval->u.eval.close_paren &&
5650                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5651                         goto fake_end;
5652                     }
5653                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5654                 }
5655             }
5656             sayNO;
5657             assert(0); /* NOTREACHED */
5658
5659
5660         curly_try_B_max:
5661             /* a successful greedy match: now try to match B */
5662             if (cur_eval && cur_eval->u.eval.close_paren &&
5663                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5664                 goto fake_end;
5665             }
5666             {
5667                 UV c = 0;
5668                 if (ST.c1 != CHRTEST_VOID)
5669                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5670                                            UTF8_MAXBYTES, 0, uniflags)
5671                                 : (UV) UCHARAT(PL_reginput);
5672                 /* If it could work, try it. */
5673                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5674                     CURLY_SETPAREN(ST.paren, ST.count);
5675                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5676                     assert(0); /* NOTREACHED */
5677                 }
5678             }
5679             /* FALL THROUGH */
5680         case CURLY_B_max_fail:
5681             /* failed to find B in a greedy match */
5682
5683             REGCP_UNWIND(ST.cp);
5684             if (ST.paren) {
5685                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5686             }
5687             /*  back up. */
5688             if (--ST.count < ST.min)
5689                 sayNO;
5690             PL_reginput = locinput = HOPc(locinput, -1);
5691             goto curly_try_B_max;
5692
5693 #undef ST
5694
5695         case END:
5696             fake_end:
5697             if (cur_eval) {
5698                 /* we've just finished A in /(??{A})B/; now continue with B */
5699                 st->u.eval.toggle_reg_flags
5700                             = cur_eval->u.eval.toggle_reg_flags;
5701                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5702
5703                 st->u.eval.prev_rex = rex_sv;           /* inner */
5704                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5705                 rex_sv = cur_eval->u.eval.prev_rex;
5706                 SET_reg_curpm(rex_sv);
5707                 rex = (struct regexp *)SvANY(rex_sv);
5708                 rexi = RXi_GET(rex);
5709                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5710
5711                 REGCP_SET(st->u.eval.lastcp);
5712                 PL_reginput = locinput;
5713
5714                 /* Restore parens of the outer rex without popping the
5715                  * savestack */
5716                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5717
5718                 st->u.eval.prev_eval = cur_eval;
5719                 cur_eval = cur_eval->u.eval.prev_eval;
5720                 DEBUG_EXECUTE_r(
5721                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5722                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5723                 if ( nochange_depth )
5724                     nochange_depth--;
5725
5726                 PUSH_YES_STATE_GOTO(EVAL_AB,
5727                         st->u.eval.prev_eval->u.eval.B); /* match B */
5728             }
5729
5730             if (locinput < reginfo->till) {
5731                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5732                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5733                                       PL_colors[4],
5734                                       (long)(locinput - PL_reg_starttry),
5735                                       (long)(reginfo->till - PL_reg_starttry),
5736                                       PL_colors[5]));
5737
5738                 sayNO_SILENT;           /* Cannot match: too short. */
5739             }
5740             PL_reginput = locinput;     /* put where regtry can find it */
5741             sayYES;                     /* Success! */
5742
5743         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5744             DEBUG_EXECUTE_r(
5745             PerlIO_printf(Perl_debug_log,
5746                 "%*s  %ssubpattern success...%s\n",
5747                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5748             PL_reginput = locinput;     /* put where regtry can find it */
5749             sayYES;                     /* Success! */
5750
5751 #undef  ST
5752 #define ST st->u.ifmatch
5753
5754         case SUSPEND:   /* (?>A) */
5755             ST.wanted = 1;
5756             PL_reginput = locinput;
5757             goto do_ifmatch;
5758
5759         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5760             ST.wanted = 0;
5761             goto ifmatch_trivial_fail_test;
5762
5763         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5764             ST.wanted = 1;
5765           ifmatch_trivial_fail_test:
5766             if (scan->flags) {
5767                 char * const s = HOPBACKc(locinput, scan->flags);
5768                 if (!s) {
5769                     /* trivial fail */
5770                     if (logical) {
5771                         logical = 0;
5772                         sw = 1 - cBOOL(ST.wanted);
5773                     }
5774                     else if (ST.wanted)
5775                         sayNO;
5776                     next = scan + ARG(scan);
5777                     if (next == scan)
5778                         next = NULL;
5779                     break;
5780                 }
5781                 PL_reginput = s;
5782             }
5783             else
5784                 PL_reginput = locinput;
5785
5786           do_ifmatch:
5787             ST.me = scan;
5788             ST.logical = logical;
5789             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5790
5791             /* execute body of (?...A) */
5792             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5793             assert(0); /* NOTREACHED */
5794
5795         case IFMATCH_A_fail: /* body of (?...A) failed */
5796             ST.wanted = !ST.wanted;
5797             /* FALL THROUGH */
5798
5799         case IFMATCH_A: /* body of (?...A) succeeded */
5800             if (ST.logical) {
5801                 sw = cBOOL(ST.wanted);
5802             }
5803             else if (!ST.wanted)
5804                 sayNO;
5805
5806             if (OP(ST.me) == SUSPEND)
5807                 locinput = PL_reginput;
5808             else {
5809                 locinput = PL_reginput = st->locinput;
5810                 nextchr = UCHARAT(locinput);
5811             }
5812             scan = ST.me + ARG(ST.me);
5813             if (scan == ST.me)
5814                 scan = NULL;
5815             continue; /* execute B */
5816
5817 #undef ST
5818
5819         case LONGJMP:
5820             next = scan + ARG(scan);
5821             if (next == scan)
5822                 next = NULL;
5823             break;
5824         case COMMIT:
5825             reginfo->cutpoint = PL_regeol;
5826             /* FALLTHROUGH */
5827         case PRUNE:
5828             PL_reginput = locinput;
5829             if (!scan->flags)
5830                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5831             PUSH_STATE_GOTO(COMMIT_next,next);
5832             assert(0); /* NOTREACHED */
5833         case COMMIT_next_fail:
5834             no_final = 1;
5835             /* FALLTHROUGH */
5836         case OPFAIL:
5837             sayNO;
5838             assert(0); /* NOTREACHED */
5839
5840 #define ST st->u.mark
5841         case MARKPOINT:
5842             ST.prev_mark = mark_state;
5843             ST.mark_name = sv_commit = sv_yes_mark
5844                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5845             mark_state = st;
5846             ST.mark_loc = PL_reginput = locinput;
5847             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5848             assert(0); /* NOTREACHED */
5849         case MARKPOINT_next:
5850             mark_state = ST.prev_mark;
5851             sayYES;
5852             assert(0); /* NOTREACHED */
5853         case MARKPOINT_next_fail:
5854             if (popmark && sv_eq(ST.mark_name,popmark))
5855             {
5856                 if (ST.mark_loc > startpoint)
5857                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5858                 popmark = NULL; /* we found our mark */
5859                 sv_commit = ST.mark_name;
5860
5861                 DEBUG_EXECUTE_r({
5862                         PerlIO_printf(Perl_debug_log,
5863                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5864                             REPORT_CODE_OFF+depth*2, "",
5865                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5866                 });
5867             }
5868             mark_state = ST.prev_mark;
5869             sv_yes_mark = mark_state ?
5870                 mark_state->u.mark.mark_name : NULL;
5871             sayNO;
5872             assert(0); /* NOTREACHED */
5873         case SKIP:
5874             PL_reginput = locinput;
5875             if (scan->flags) {
5876                 /* (*SKIP) : if we fail we cut here*/
5877                 ST.mark_name = NULL;
5878                 ST.mark_loc = locinput;
5879                 PUSH_STATE_GOTO(SKIP_next,next);
5880             } else {
5881                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5882                    otherwise do nothing.  Meaning we need to scan
5883                  */
5884                 regmatch_state *cur = mark_state;
5885                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5886
5887                 while (cur) {
5888                     if ( sv_eq( cur->u.mark.mark_name,
5889                                 find ) )
5890                     {
5891                         ST.mark_name = find;
5892                         PUSH_STATE_GOTO( SKIP_next, next );
5893                     }
5894                     cur = cur->u.mark.prev_mark;
5895                 }
5896             }
5897             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5898             break;
5899         case SKIP_next_fail:
5900             if (ST.mark_name) {
5901                 /* (*CUT:NAME) - Set up to search for the name as we
5902                    collapse the stack*/
5903                 popmark = ST.mark_name;
5904             } else {
5905                 /* (*CUT) - No name, we cut here.*/
5906                 if (ST.mark_loc > startpoint)
5907                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5908                 /* but we set sv_commit to latest mark_name if there
5909                    is one so they can test to see how things lead to this
5910                    cut */
5911                 if (mark_state)
5912                     sv_commit=mark_state->u.mark.mark_name;
5913             }
5914             no_final = 1;
5915             sayNO;
5916             assert(0); /* NOTREACHED */
5917 #undef ST
5918         case LNBREAK:
5919             if ((n=is_LNBREAK(locinput,utf8_target))) {
5920                 locinput += n;
5921                 nextchr = UCHARAT(locinput);
5922             } else
5923                 sayNO;
5924             break;
5925
5926 #define CASE_CLASS(nAmE)                              \
5927         case nAmE:                                    \
5928             if (locinput >= PL_regeol)                \
5929                 sayNO;                                \
5930             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5931                 locinput += n;                        \
5932                 nextchr = UCHARAT(locinput);          \
5933             } else                                    \
5934                 sayNO;                                \
5935             break;                                    \
5936         case N##nAmE:                                 \
5937             if (locinput >= PL_regeol)                \
5938                 sayNO;                                \
5939             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5940                 sayNO;                                \
5941             } else {                                  \
5942                 locinput += UTF8SKIP(locinput);       \
5943                 nextchr = UCHARAT(locinput);          \
5944             }                                         \
5945             break
5946
5947         CASE_CLASS(VERTWS);
5948         CASE_CLASS(HORIZWS);
5949 #undef CASE_CLASS
5950
5951         default:
5952             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5953                           PTR2UV(scan), OP(scan));
5954             Perl_croak(aTHX_ "regexp memory corruption");
5955
5956         } /* end switch */
5957
5958         /* switch break jumps here */
5959         scan = next; /* prepare to execute the next op and ... */
5960         continue;    /* ... jump back to the top, reusing st */
5961         assert(0); /* NOTREACHED */
5962
5963       push_yes_state:
5964         /* push a state that backtracks on success */
5965         st->u.yes.prev_yes_state = yes_state;
5966         yes_state = st;
5967         /* FALL THROUGH */
5968       push_state:
5969         /* push a new regex state, then continue at scan  */
5970         {
5971             regmatch_state *newst;
5972
5973             DEBUG_STACK_r({
5974                 regmatch_state *cur = st;
5975                 regmatch_state *curyes = yes_state;
5976                 int curd = depth;
5977                 regmatch_slab *slab = PL_regmatch_slab;
5978                 for (;curd > -1;cur--,curd--) {
5979                     if (cur < SLAB_FIRST(slab)) {
5980                         slab = slab->prev;
5981                         cur = SLAB_LAST(slab);
5982                     }
5983                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5984                         REPORT_CODE_OFF + 2 + depth * 2,"",
5985                         curd, PL_reg_name[cur->resume_state],
5986                         (curyes == cur) ? "yes" : ""
5987                     );
5988                     if (curyes == cur)
5989                         curyes = cur->u.yes.prev_yes_state;
5990                 }
5991             } else
5992                 DEBUG_STATE_pp("push")
5993             );
5994             depth++;
5995             st->locinput = locinput;
5996             newst = st+1;
5997             if (newst >  SLAB_LAST(PL_regmatch_slab))
5998                 newst = S_push_slab(aTHX);
5999             PL_regmatch_state = newst;
6000
6001             locinput = PL_reginput;
6002             nextchr = UCHARAT(locinput);
6003             st = newst;
6004             continue;
6005             assert(0); /* NOTREACHED */
6006         }
6007     }
6008
6009     /*
6010     * We get here only if there's trouble -- normally "case END" is
6011     * the terminating point.
6012     */
6013     Perl_croak(aTHX_ "corrupted regexp pointers");
6014     /*NOTREACHED*/
6015     sayNO;
6016
6017 yes:
6018     if (yes_state) {
6019         /* we have successfully completed a subexpression, but we must now
6020          * pop to the state marked by yes_state and continue from there */
6021         assert(st != yes_state);
6022 #ifdef DEBUGGING
6023         while (st != yes_state) {
6024             st--;
6025             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6026                 PL_regmatch_slab = PL_regmatch_slab->prev;
6027                 st = SLAB_LAST(PL_regmatch_slab);
6028             }
6029             DEBUG_STATE_r({
6030                 if (no_final) {
6031                     DEBUG_STATE_pp("pop (no final)");
6032                 } else {
6033                     DEBUG_STATE_pp("pop (yes)");
6034                 }
6035             });
6036             depth--;
6037         }
6038 #else
6039         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6040             || yes_state > SLAB_LAST(PL_regmatch_slab))
6041         {
6042             /* not in this slab, pop slab */
6043             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6044             PL_regmatch_slab = PL_regmatch_slab->prev;
6045             st = SLAB_LAST(PL_regmatch_slab);
6046         }
6047         depth -= (st - yes_state);
6048 #endif
6049         st = yes_state;
6050         yes_state = st->u.yes.prev_yes_state;
6051         PL_regmatch_state = st;
6052
6053         if (no_final) {
6054             locinput= st->locinput;
6055             nextchr = UCHARAT(locinput);
6056         }
6057         state_num = st->resume_state + no_final;
6058         goto reenter_switch;
6059     }
6060
6061     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6062                           PL_colors[4], PL_colors[5]));
6063
6064     if (PL_reg_state.re_state_eval_setup_done) {
6065         /* each successfully executed (?{...}) block does the equivalent of
6066          *   local $^R = do {...}
6067          * When popping the save stack, all these locals would be undone;
6068          * bypass this by setting the outermost saved $^R to the latest
6069          * value */
6070         if (oreplsv != GvSV(PL_replgv))
6071             sv_setsv(oreplsv, GvSV(PL_replgv));
6072     }
6073     result = 1;
6074     goto final_exit;
6075
6076 no:
6077     DEBUG_EXECUTE_r(
6078         PerlIO_printf(Perl_debug_log,
6079             "%*s  %sfailed...%s\n",
6080             REPORT_CODE_OFF+depth*2, "",
6081             PL_colors[4], PL_colors[5])
6082         );
6083
6084 no_silent:
6085     if (no_final) {
6086         if (yes_state) {
6087             goto yes;
6088         } else {
6089             goto final_exit;
6090         }
6091     }
6092     if (depth) {
6093         /* there's a previous state to backtrack to */
6094         st--;
6095         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6096             PL_regmatch_slab = PL_regmatch_slab->prev;
6097             st = SLAB_LAST(PL_regmatch_slab);
6098         }
6099         PL_regmatch_state = st;
6100         locinput= st->locinput;
6101         nextchr = UCHARAT(locinput);
6102
6103         DEBUG_STATE_pp("pop");
6104         depth--;
6105         if (yes_state == st)
6106             yes_state = st->u.yes.prev_yes_state;
6107
6108         state_num = st->resume_state + 1; /* failure = success + 1 */
6109         goto reenter_switch;
6110     }
6111     result = 0;
6112
6113   final_exit:
6114     if (rex->intflags & PREGf_VERBARG_SEEN) {
6115         SV *sv_err = get_sv("REGERROR", 1);
6116         SV *sv_mrk = get_sv("REGMARK", 1);
6117         if (result) {
6118             sv_commit = &PL_sv_no;
6119             if (!sv_yes_mark)
6120                 sv_yes_mark = &PL_sv_yes;
6121         } else {
6122             if (!sv_commit)
6123                 sv_commit = &PL_sv_yes;
6124             sv_yes_mark = &PL_sv_no;
6125         }
6126         sv_setsv(sv_err, sv_commit);
6127         sv_setsv(sv_mrk, sv_yes_mark);
6128     }
6129
6130
6131     if (last_pushed_cv) {
6132         dSP;
6133         POP_MULTICALL;
6134         PERL_UNUSED_VAR(SP);
6135     }
6136
6137     /* clean up; in particular, free all slabs above current one */
6138     LEAVE_SCOPE(oldsave);
6139
6140     return result;
6141 }
6142
6143 /*
6144  - regrepeat - repeatedly match something simple, report how many
6145  */
6146 /*
6147  * [This routine now assumes that it will only match on things of length 1.
6148  * That was true before, but now we assume scan - reginput is the count,
6149  * rather than incrementing count on every character.  [Er, except utf8.]]
6150  */
6151 STATIC I32
6152 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6153 {
6154     dVAR;
6155     char *scan;
6156     I32 c;
6157     char *loceol = PL_regeol;
6158     I32 hardcount = 0;
6159     bool utf8_target = PL_reg_match_utf8;
6160     UV utf8_flags;
6161 #ifndef DEBUGGING
6162     PERL_UNUSED_ARG(depth);
6163 #endif
6164
6165     PERL_ARGS_ASSERT_REGREPEAT;
6166
6167     scan = PL_reginput;
6168     if (max == REG_INFTY)
6169         max = I32_MAX;
6170     else if (max < loceol - scan)
6171         loceol = scan + max;
6172     switch (OP(p)) {
6173     case REG_ANY:
6174         if (utf8_target) {
6175             loceol = PL_regeol;
6176             while (scan < loceol && hardcount < max && *scan != '\n') {
6177                 scan += UTF8SKIP(scan);
6178                 hardcount++;
6179             }
6180         } else {
6181             while (scan < loceol && *scan != '\n')
6182                 scan++;
6183         }
6184         break;
6185     case SANY:
6186         if (utf8_target) {
6187             loceol = PL_regeol;
6188             while (scan < loceol && hardcount < max) {
6189                 scan += UTF8SKIP(scan);
6190                 hardcount++;
6191             }
6192         }
6193         else
6194             scan = loceol;
6195         break;
6196     case CANY:
6197         scan = loceol;
6198         break;
6199     case EXACT:
6200         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6201          * means they match only characters in the string that can be expressed
6202          * as a single byte.  For non-utf8 strings, that means a simple match.
6203          * For utf8 strings, the character matched must be an invariant, or
6204          * downgradable to a single byte.  The pattern's utf8ness is
6205          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6206          * it is, it's an invariant */
6207
6208         c = (U8)*STRING(p);
6209         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6210
6211         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6212             while (scan < loceol && UCHARAT(scan) == c) {
6213                 scan++;
6214             }
6215         }
6216         else {
6217
6218             /* Here, the string is utf8, and the pattern char is different
6219              * in utf8 than not, so can't compare them directly.  Outside the
6220              * loop, find the two utf8 bytes that represent c, and then
6221              * look for those in sequence in the utf8 string */
6222             U8 high = UTF8_TWO_BYTE_HI(c);
6223             U8 low = UTF8_TWO_BYTE_LO(c);
6224             loceol = PL_regeol;
6225
6226             while (hardcount < max
6227                     && scan + 1 < loceol
6228                     && UCHARAT(scan) == high
6229                     && UCHARAT(scan + 1) == low)
6230             {
6231                 scan += 2;
6232                 hardcount++;
6233             }
6234         }
6235         break;
6236     case EXACTFA:
6237         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6238         goto do_exactf;
6239
6240     case EXACTFL:
6241         PL_reg_flags |= RF_tainted;
6242         utf8_flags = FOLDEQ_UTF8_LOCALE;
6243         goto do_exactf;
6244
6245     case EXACTF:
6246             utf8_flags = 0;
6247             goto do_exactf;
6248
6249     case EXACTFU_SS:
6250     case EXACTFU_TRICKYFOLD:
6251     case EXACTFU:
6252         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6253
6254         /* The comments for the EXACT case above apply as well to these fold
6255          * ones */
6256
6257     do_exactf:
6258         c = (U8)*STRING(p);
6259         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6260
6261         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6262             char *tmpeol = loceol;
6263             while (hardcount < max
6264                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6265                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6266             {
6267                 scan = tmpeol;
6268                 tmpeol = loceol;
6269                 hardcount++;
6270             }
6271
6272             /* XXX Note that the above handles properly the German sharp s in
6273              * the pattern matching ss in the string.  But it doesn't handle
6274              * properly cases where the string contains say 'LIGATURE ff' and
6275              * the pattern is 'f+'.  This would require, say, a new function or
6276              * revised interface to foldEQ_utf8(), in which the maximum number
6277              * of characters to match could be passed and it would return how
6278              * many actually did.  This is just one of many cases where
6279              * multi-char folds don't work properly, and so the fix is being
6280              * deferred */
6281         }
6282         else {
6283             U8 folded;
6284
6285             /* Here, the string isn't utf8 and c is a single byte; and either
6286              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6287              * doesn't affect c.  Can just do simple comparisons for exact or
6288              * fold matching. */
6289             switch (OP(p)) {
6290                 case EXACTF: folded = PL_fold[c]; break;
6291                 case EXACTFA:
6292                 case EXACTFU_TRICKYFOLD:
6293                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6294                 case EXACTFL: folded = PL_fold_locale[c]; break;
6295                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6296             }
6297             while (scan < loceol &&
6298                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6299             {
6300                 scan++;
6301             }
6302         }
6303         break;
6304     case ANYOFV:
6305     case ANYOF:
6306         if (utf8_target || OP(p) == ANYOFV) {
6307             STRLEN inclasslen;
6308             loceol = PL_regeol;
6309             inclasslen = loceol - scan;
6310             while (hardcount < max
6311                    && ((inclasslen = loceol - scan) > 0)
6312                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6313             {
6314                 scan += inclasslen;
6315                 hardcount++;
6316             }
6317         } else {
6318             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6319                 scan++;
6320         }
6321         break;
6322     case ALNUMU:
6323         if (utf8_target) {
6324     utf8_wordchar:
6325             loceol = PL_regeol;
6326             LOAD_UTF8_CHARCLASS_ALNUM();
6327             while (hardcount < max && scan < loceol &&
6328                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6329             {
6330                 scan += UTF8SKIP(scan);
6331                 hardcount++;
6332             }
6333         } else {
6334             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6335                 scan++;
6336             }
6337         }
6338         break;
6339     case ALNUM:
6340         if (utf8_target)
6341             goto utf8_wordchar;
6342         while (scan < loceol && isALNUM((U8) *scan)) {
6343             scan++;
6344         }
6345         break;
6346     case ALNUMA:
6347         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6348             scan++;
6349         }
6350         break;
6351     case ALNUML:
6352         PL_reg_flags |= RF_tainted;
6353         if (utf8_target) {
6354             loceol = PL_regeol;
6355             while (hardcount < max && scan < loceol &&
6356                    isALNUM_LC_utf8((U8*)scan)) {
6357                 scan += UTF8SKIP(scan);
6358                 hardcount++;
6359             }
6360         } else {
6361             while (scan < loceol && isALNUM_LC(*scan))
6362                 scan++;
6363         }
6364         break;
6365     case NALNUMU:
6366         if (utf8_target) {
6367
6368     utf8_Nwordchar:
6369
6370             loceol = PL_regeol;
6371             LOAD_UTF8_CHARCLASS_ALNUM();
6372             while (hardcount < max && scan < loceol &&
6373                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6374             {
6375                 scan += UTF8SKIP(scan);
6376                 hardcount++;
6377             }
6378         } else {
6379             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6380                 scan++;
6381             }
6382         }
6383         break;
6384     case NALNUM:
6385         if (utf8_target)
6386             goto utf8_Nwordchar;
6387         while (scan < loceol && ! isALNUM((U8) *scan)) {
6388             scan++;
6389         }
6390         break;
6391
6392     case POSIXA:
6393        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6394             scan++;
6395         }
6396         break;
6397     case NPOSIXA:
6398         if (utf8_target) {
6399             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6400                 scan += UTF8SKIP(scan);
6401             }
6402         }
6403         else {
6404             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6405                 scan++;
6406             }
6407         }
6408         break;
6409     case NALNUMA:
6410         if (utf8_target) {
6411             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6412                 scan += UTF8SKIP(scan);
6413             }
6414         }
6415         else {
6416             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6417                 scan++;
6418             }
6419         }
6420         break;
6421     case NALNUML:
6422         PL_reg_flags |= RF_tainted;
6423         if (utf8_target) {
6424             loceol = PL_regeol;
6425             while (hardcount < max && scan < loceol &&
6426                    !isALNUM_LC_utf8((U8*)scan)) {
6427                 scan += UTF8SKIP(scan);
6428                 hardcount++;
6429             }
6430         } else {
6431             while (scan < loceol && !isALNUM_LC(*scan))
6432                 scan++;
6433         }
6434         break;
6435     case SPACEU:
6436         if (utf8_target) {
6437
6438     utf8_space:
6439
6440             loceol = PL_regeol;
6441             LOAD_UTF8_CHARCLASS_SPACE();
6442             while (hardcount < max && scan < loceol &&
6443                    (*scan == ' ' ||
6444                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6445             {
6446                 scan += UTF8SKIP(scan);
6447                 hardcount++;
6448             }
6449             break;
6450         }
6451         else {
6452             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6453                 scan++;
6454             }
6455             break;
6456         }
6457     case SPACE:
6458         if (utf8_target)
6459             goto utf8_space;
6460
6461         while (scan < loceol && isSPACE((U8) *scan)) {
6462             scan++;
6463         }
6464         break;
6465     case SPACEA:
6466         while (scan < loceol && isSPACE_A((U8) *scan)) {
6467             scan++;
6468         }
6469         break;
6470     case SPACEL:
6471         PL_reg_flags |= RF_tainted;
6472         if (utf8_target) {
6473             loceol = PL_regeol;
6474             while (hardcount < max && scan < loceol &&
6475                    isSPACE_LC_utf8((U8*)scan)) {
6476                 scan += UTF8SKIP(scan);
6477                 hardcount++;
6478             }
6479         } else {
6480             while (scan < loceol && isSPACE_LC(*scan))
6481                 scan++;
6482         }
6483         break;
6484     case NSPACEU:
6485         if (utf8_target) {
6486
6487     utf8_Nspace:
6488
6489             loceol = PL_regeol;
6490             LOAD_UTF8_CHARCLASS_SPACE();
6491             while (hardcount < max && scan < loceol &&
6492                    ! (*scan == ' ' ||
6493                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6494             {
6495                 scan += UTF8SKIP(scan);
6496                 hardcount++;
6497             }
6498             break;
6499         }
6500         else {
6501             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6502                 scan++;
6503             }
6504         }
6505         break;
6506     case NSPACE:
6507         if (utf8_target)
6508             goto utf8_Nspace;
6509
6510         while (scan < loceol && ! isSPACE((U8) *scan)) {
6511             scan++;
6512         }
6513         break;
6514     case NSPACEA:
6515         if (utf8_target) {
6516             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6517                 scan += UTF8SKIP(scan);
6518             }
6519         }
6520         else {
6521             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6522                 scan++;
6523             }
6524         }
6525         break;
6526     case NSPACEL:
6527         PL_reg_flags |= RF_tainted;
6528         if (utf8_target) {
6529             loceol = PL_regeol;
6530             while (hardcount < max && scan < loceol &&
6531                    !isSPACE_LC_utf8((U8*)scan)) {
6532                 scan += UTF8SKIP(scan);
6533                 hardcount++;
6534             }
6535         } else {
6536             while (scan < loceol && !isSPACE_LC(*scan))
6537                 scan++;
6538         }
6539         break;
6540     case DIGIT:
6541         if (utf8_target) {
6542             loceol = PL_regeol;
6543             LOAD_UTF8_CHARCLASS_DIGIT();
6544             while (hardcount < max && scan < loceol &&
6545                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6546                 scan += UTF8SKIP(scan);
6547                 hardcount++;
6548             }
6549         } else {
6550             while (scan < loceol && isDIGIT(*scan))
6551                 scan++;
6552         }
6553         break;
6554     case DIGITA:
6555         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6556             scan++;
6557         }
6558         break;
6559     case DIGITL:
6560         PL_reg_flags |= RF_tainted;
6561         if (utf8_target) {
6562             loceol = PL_regeol;
6563             while (hardcount < max && scan < loceol &&
6564                    isDIGIT_LC_utf8((U8*)scan)) {
6565                 scan += UTF8SKIP(scan);
6566                 hardcount++;
6567             }
6568         } else {
6569             while (scan < loceol && isDIGIT_LC(*scan))
6570                 scan++;
6571         }
6572         break;
6573     case NDIGIT:
6574         if (utf8_target) {
6575             loceol = PL_regeol;
6576             LOAD_UTF8_CHARCLASS_DIGIT();
6577             while (hardcount < max && scan < loceol &&
6578                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6579                 scan += UTF8SKIP(scan);
6580                 hardcount++;
6581             }
6582         } else {
6583             while (scan < loceol && !isDIGIT(*scan))
6584                 scan++;
6585         }
6586         break;
6587     case NDIGITA:
6588         if (utf8_target) {
6589             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6590                 scan += UTF8SKIP(scan);
6591             }
6592         }
6593         else {
6594             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6595                 scan++;
6596             }
6597         }
6598         break;
6599     case NDIGITL:
6600         PL_reg_flags |= RF_tainted;
6601         if (utf8_target) {
6602             loceol = PL_regeol;
6603             while (hardcount < max && scan < loceol &&
6604                    !isDIGIT_LC_utf8((U8*)scan)) {
6605                 scan += UTF8SKIP(scan);
6606                 hardcount++;
6607             }
6608         } else {
6609             while (scan < loceol && !isDIGIT_LC(*scan))
6610                 scan++;
6611         }
6612         break;
6613     case LNBREAK:
6614         if (utf8_target) {
6615             loceol = PL_regeol;
6616             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6617                 scan += c;
6618                 hardcount++;
6619             }
6620         } else {
6621             /*
6622               LNBREAK can match two latin chars, which is ok,
6623               because we have a null terminated string, but we
6624               have to use hardcount in this situation
6625             */
6626             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6627                 scan+=c;
6628                 hardcount++;
6629             }
6630         }
6631         break;
6632     case HORIZWS:
6633         if (utf8_target) {
6634             loceol = PL_regeol;
6635             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6636                 scan += c;
6637                 hardcount++;
6638             }
6639         } else {
6640             while (scan < loceol && is_HORIZWS_latin1(scan))
6641                 scan++;
6642         }
6643         break;
6644     case NHORIZWS:
6645         if (utf8_target) {
6646             loceol = PL_regeol;
6647             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6648                 scan += UTF8SKIP(scan);
6649                 hardcount++;
6650             }
6651         } else {
6652             while (scan < loceol && !is_HORIZWS_latin1(scan))
6653                 scan++;
6654
6655         }
6656         break;
6657     case VERTWS:
6658         if (utf8_target) {
6659             loceol = PL_regeol;
6660             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6661                 scan += c;
6662                 hardcount++;
6663             }
6664         } else {
6665             while (scan < loceol && is_VERTWS_latin1(scan))
6666                 scan++;
6667
6668         }
6669         break;
6670     case NVERTWS:
6671         if (utf8_target) {
6672             loceol = PL_regeol;
6673             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6674                 scan += UTF8SKIP(scan);
6675                 hardcount++;
6676             }
6677         } else {
6678             while (scan < loceol && !is_VERTWS_latin1(scan))
6679                 scan++;
6680
6681         }
6682         break;
6683
6684     default:            /* Called on something of 0 width. */
6685         break;          /* So match right here or not at all. */
6686     }
6687
6688     if (hardcount)
6689         c = hardcount;
6690     else
6691         c = scan - PL_reginput;
6692     PL_reginput = scan;
6693
6694     DEBUG_r({
6695         GET_RE_DEBUG_FLAGS_DECL;
6696         DEBUG_EXECUTE_r({
6697             SV * const prop = sv_newmortal();
6698             regprop(prog, prop, p);
6699             PerlIO_printf(Perl_debug_log,
6700                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6701                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6702         });
6703     });
6704
6705     return(c);
6706 }
6707
6708
6709 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6710 /*
6711 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6712 create a copy so that changes the caller makes won't change the shared one
6713  */
6714 SV *
6715 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6716 {
6717     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6718     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6719 }
6720 #endif
6721
6722 STATIC SV *
6723 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6724 {
6725     /* Returns the swash for the input 'node' in the regex 'prog'.
6726      * If <doinit> is true, will attempt to create the swash if not already
6727      *    done.
6728      * If <listsvp> is non-null, will return the swash initialization string in
6729      *    it.
6730      * If <altsvp> is non-null, will return the alternates to the regular swash
6731      *    in it
6732      * Tied intimately to how regcomp.c sets up the data structure */
6733
6734     dVAR;
6735     SV *sw  = NULL;
6736     SV *si  = NULL;
6737     SV *alt = NULL;
6738     SV*  invlist = NULL;
6739
6740     RXi_GET_DECL(prog,progi);
6741     const struct reg_data * const data = prog ? progi->data : NULL;
6742
6743     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6744
6745     assert(ANYOF_NONBITMAP(node));
6746
6747     if (data && data->count) {
6748         const U32 n = ARG(node);
6749
6750         if (data->what[n] == 's') {
6751             SV * const rv = MUTABLE_SV(data->data[n]);
6752             AV * const av = MUTABLE_AV(SvRV(rv));
6753             SV **const ary = AvARRAY(av);
6754             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6755
6756             si = *ary;  /* ary[0] = the string to initialize the swash with */
6757
6758             /* Elements 3 and 4 are either both present or both absent. [3] is
6759              * any inversion list generated at compile time; [4] indicates if
6760              * that inversion list has any user-defined properties in it. */
6761             if (av_len(av) >= 3) {
6762                 invlist = ary[3];
6763                 if (SvUV(ary[4])) {
6764                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6765                 }
6766             }
6767             else {
6768                 invlist = NULL;
6769             }
6770
6771             /* Element [1] is reserved for the set-up swash.  If already there,
6772              * return it; if not, create it and store it there */
6773             if (SvROK(ary[1])) {
6774                 sw = ary[1];
6775             }
6776             else if (si && doinit) {
6777
6778                 sw = _core_swash_init("utf8", /* the utf8 package */
6779                                       "", /* nameless */
6780                                       si,
6781                                       1, /* binary */
6782                                       0, /* not from tr/// */
6783                                       invlist,
6784                                       &swash_init_flags);
6785                 (void)av_store(av, 1, sw);
6786             }
6787
6788             /* Element [2] is for any multi-char folds.  Note that is a
6789              * fundamentally flawed design, because can't backtrack and try
6790              * again.  See [perl #89774] */
6791             if (SvTYPE(ary[2]) == SVt_PVAV) {
6792                 alt = ary[2];
6793             }
6794         }
6795     }
6796
6797     if (listsvp) {
6798         SV* matches_string = newSVpvn("", 0);
6799
6800         /* Use the swash, if any, which has to have incorporated into it all
6801          * possibilities */
6802         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6803             && (si && si != &PL_sv_undef))
6804         {
6805
6806             /* If no swash, use the input initialization string, if available */
6807             sv_catsv(matches_string, si);
6808         }
6809
6810         /* Add the inversion list to whatever we have.  This may have come from
6811          * the swash, or from an input parameter */
6812         if (invlist) {
6813             sv_catsv(matches_string, _invlist_contents(invlist));
6814         }
6815         *listsvp = matches_string;
6816     }
6817
6818     if (altsvp)
6819         *altsvp  = alt;
6820
6821     return sw;
6822 }
6823
6824 /*
6825  - reginclass - determine if a character falls into a character class
6826
6827   n is the ANYOF regnode
6828   p is the target string
6829   lenp is pointer to the maximum number of bytes of how far to go in p
6830     (This is assumed wthout checking to always be at least the current
6831     character's size)
6832   utf8_target tells whether p is in UTF-8.
6833
6834   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6835   from a successful match, the value it points to will be updated to how many
6836   bytes in p were matched.  If there was no match, the value is undefined,
6837   possibly changed from the input.
6838
6839   Note that this can be a synthetic start class, a combination of various
6840   nodes, so things you think might be mutually exclusive, such as locale,
6841   aren't.  It can match both locale and non-locale
6842
6843  */
6844
6845 STATIC bool
6846 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6847 {
6848     dVAR;
6849     const char flags = ANYOF_FLAGS(n);
6850     bool match = FALSE;
6851     UV c = *p;
6852     STRLEN c_len = 0;
6853     STRLEN maxlen;
6854
6855     PERL_ARGS_ASSERT_REGINCLASS;
6856
6857     /* If c is not already the code point, get it */
6858     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6859         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6860                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6861                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6862                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6863                  * UTF8_ALLOW_FFFF */
6864         if (c_len == (STRLEN)-1)
6865             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6866     }
6867     else {
6868         c_len = 1;
6869     }
6870
6871     /* Use passed in max length, or one character if none passed in or less
6872      * than one character.  And assume will match just one character.  This is
6873      * overwritten later if matched more. */
6874     if (lenp) {
6875         maxlen = (*lenp > c_len) ? *lenp : c_len;
6876         *lenp = c_len;
6877
6878     }
6879     else {
6880         maxlen = c_len;
6881     }
6882
6883     /* If this character is potentially in the bitmap, check it */
6884     if (c < 256) {
6885         if (ANYOF_BITMAP_TEST(n, c))
6886             match = TRUE;
6887         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6888                 && ! utf8_target
6889                 && ! isASCII(c))
6890         {
6891             match = TRUE;
6892         }
6893
6894         else if (flags & ANYOF_LOCALE) {
6895             PL_reg_flags |= RF_tainted;
6896
6897             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6898                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6899             {
6900                 match = TRUE;
6901             }
6902             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6903                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6904                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6905                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6906                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6907                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6908                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6909                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6910                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6911                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6912                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6913                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6914                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6915                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6916                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6917                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6918                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6919                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6920                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6921                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6922                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6923                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6924                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6925                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6926                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6927                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6928                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6929                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6930                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6931                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6932                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6933                      ) /* How's that for a conditional? */
6934             ) {
6935                 match = TRUE;
6936             }
6937         }
6938     }
6939
6940     /* If the bitmap didn't (or couldn't) match, and something outside the
6941      * bitmap could match, try that.  Locale nodes specifiy completely the
6942      * behavior of code points in the bit map (otherwise, a utf8 target would
6943      * cause them to be treated as Unicode and not locale), except in
6944      * the very unlikely event when this node is a synthetic start class, which
6945      * could be a combination of locale and non-locale nodes.  So allow locale
6946      * to match for the synthetic start class, which will give a false
6947      * positive that will be resolved when the match is done again as not part
6948      * of the synthetic start class */
6949     if (!match) {
6950         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6951             match = TRUE;       /* Everything above 255 matches */
6952         }
6953         else if (ANYOF_NONBITMAP(n)
6954                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6955                      || (utf8_target
6956                          && (c >=256
6957                              || (! (flags & ANYOF_LOCALE))
6958                              || (flags & ANYOF_IS_SYNTHETIC)))))
6959         {
6960             AV *av;
6961             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6962
6963             if (sw) {
6964                 U8 * utf8_p;
6965                 if (utf8_target) {
6966                     utf8_p = (U8 *) p;
6967                 } else {
6968
6969                     /* Not utf8.  Convert as much of the string as available up
6970                      * to the limit of how far the (single) character in the
6971                      * pattern can possibly match (no need to go further).  If
6972                      * the node is a straight ANYOF or not folding, it can't
6973                      * match more than one.  Otherwise, It can match up to how
6974                      * far a single char can fold to.  Since not utf8, each
6975                      * character is a single byte, so the max it can be in
6976                      * bytes is the same as the max it can be in characters */
6977                     STRLEN len = (OP(n) == ANYOF
6978                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6979                                   ? 1
6980                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6981                                     ? maxlen
6982                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6983                     utf8_p = bytes_to_utf8(p, &len);
6984                 }
6985
6986                 if (swash_fetch(sw, utf8_p, TRUE))
6987                     match = TRUE;
6988                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6989
6990                     /* Here, we need to test if the fold of the target string
6991                      * matches.  The non-multi char folds have all been moved to
6992                      * the compilation phase, and the multi-char folds have
6993                      * been stored by regcomp into 'av'; we linearly check to
6994                      * see if any match the target string (folded).   We know
6995                      * that the originals were each one character, but we don't
6996                      * currently know how many characters/bytes each folded to,
6997                      * except we do know that there are small limits imposed by
6998                      * Unicode.  XXX A performance enhancement would be to have
6999                      * regcomp.c store the max number of chars/bytes that are
7000                      * in an av entry, as, say the 0th element.  Even better
7001                      * would be to have a hash of the few characters that can
7002                      * start a multi-char fold to the max number of chars of
7003                      * those folds.
7004                      *
7005                      * If there is a match, we will need to advance (if lenp is
7006                      * specified) the match pointer in the target string.  But
7007                      * what we are comparing here isn't that string directly,
7008                      * but its fold, whose length may differ from the original.
7009                      * As we go along in constructing the fold, therefore, we
7010                      * create a map so that we know how many bytes in the
7011                      * source to advance given that we have matched a certain
7012                      * number of bytes in the fold.  This map is stored in
7013                      * 'map_fold_len_back'.  Let n mean the number of bytes in
7014                      * the fold of the first character that we are folding.
7015                      * Then map_fold_len_back[n] is set to the number of bytes
7016                      * in that first character.  Similarly let m be the
7017                      * corresponding number for the second character to be
7018                      * folded.  Then map_fold_len_back[n+m] is set to the
7019                      * number of bytes occupied by the first two source
7020                      * characters. ... */
7021                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
7022                     U8 folded[UTF8_MAXBYTES_CASE+1];
7023                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
7024                     STRLEN total_foldlen = 0; /* num bytes in fold of all
7025                                                   chars */
7026
7027                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7028
7029                         /* Here, only need to fold the first char of the target
7030                          * string.  It the source wasn't utf8, is 1 byte long */
7031                         to_utf8_fold(utf8_p, folded, &foldlen);
7032                         total_foldlen = foldlen;
7033                         map_fold_len_back[foldlen] = (utf8_target)
7034                                                      ? UTF8SKIP(utf8_p)
7035                                                      : 1;
7036                     }
7037                     else {
7038
7039                         /* Here, need to fold more than the first char.  Do so
7040                          * up to the limits */
7041                         U8* source_ptr = utf8_p;    /* The source for the fold
7042                                                        is the regex target
7043                                                        string */
7044                         U8* folded_ptr = folded;
7045                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
7046                                                        available byte in the
7047                                                        target string */
7048                         U8 i;
7049                         for (i = 0;
7050                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7051                              i++)
7052                         {
7053
7054                             /* Fold the next character */
7055                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7056                             STRLEN this_char_foldlen;
7057                             to_utf8_fold(source_ptr,
7058                                          this_char_folded,
7059                                          &this_char_foldlen);
7060
7061                             /* Bail if it would exceed the byte limit for
7062                              * folding a single char. */
7063                             if (this_char_foldlen + folded_ptr - folded >
7064                                                             UTF8_MAXBYTES_CASE)
7065                             {
7066                                 break;
7067                             }
7068
7069                             /* Add the fold of this character */
7070                             Copy(this_char_folded,
7071                                  folded_ptr,
7072                                  this_char_foldlen,
7073                                  U8);
7074                             source_ptr += UTF8SKIP(source_ptr);
7075                             folded_ptr += this_char_foldlen;
7076                             total_foldlen = folded_ptr - folded;
7077
7078                             /* Create map from the number of bytes in the fold
7079                              * back to the number of bytes in the source.  If
7080                              * the source isn't utf8, the byte count is just
7081                              * the number of characters so far */
7082                             map_fold_len_back[total_foldlen]
7083                                                       = (utf8_target)
7084                                                         ? source_ptr - utf8_p
7085                                                         : i + 1;
7086                         }
7087                         *folded_ptr = '\0';
7088                     }
7089
7090
7091                     /* Do the linear search to see if the fold is in the list
7092                      * of multi-char folds. */
7093                     if (av) {
7094                         I32 i;
7095                         for (i = 0; i <= av_len(av); i++) {
7096                             SV* const sv = *av_fetch(av, i, FALSE);
7097                             STRLEN len;
7098                             const char * const s = SvPV_const(sv, len);
7099
7100                             if (len <= total_foldlen
7101                                 && memEQ(s, (char*)folded, len)
7102
7103                                    /* If 0, means matched a partial char. See
7104                                     * [perl #90536] */
7105                                 && map_fold_len_back[len])
7106                             {
7107
7108                                 /* Advance the target string ptr to account for
7109                                  * this fold, but have to translate from the
7110                                  * folded length to the corresponding source
7111                                  * length. */
7112                                 if (lenp) {
7113                                     *lenp = map_fold_len_back[len];
7114                                 }
7115                                 match = TRUE;
7116                                 break;
7117                             }
7118                         }
7119                     }
7120                 }
7121
7122                 /* If we allocated a string above, free it */
7123                 if (! utf8_target) Safefree(utf8_p);
7124             }
7125         }
7126
7127         if (UNICODE_IS_SUPER(c)
7128             && (flags & ANYOF_WARN_SUPER)
7129             && ckWARN_d(WARN_NON_UNICODE))
7130         {
7131             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7132                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7133         }
7134     }
7135
7136     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7137     return cBOOL(flags & ANYOF_INVERT) ^ match;
7138 }
7139
7140 STATIC U8 *
7141 S_reghop3(U8 *s, I32 off, const U8* lim)
7142 {
7143     /* return the position 'off' UTF-8 characters away from 's', forward if
7144      * 'off' >= 0, backwards if negative.  But don't go outside of position
7145      * 'lim', which better be < s  if off < 0 */
7146
7147     dVAR;
7148
7149     PERL_ARGS_ASSERT_REGHOP3;
7150
7151     if (off >= 0) {
7152         while (off-- && s < lim) {
7153             /* XXX could check well-formedness here */
7154             s += UTF8SKIP(s);
7155         }
7156     }
7157     else {
7158         while (off++ && s > lim) {
7159             s--;
7160             if (UTF8_IS_CONTINUED(*s)) {
7161                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7162                     s--;
7163             }
7164             /* XXX could check well-formedness here */
7165         }
7166     }
7167     return s;
7168 }
7169
7170 #ifdef XXX_dmq
7171 /* there are a bunch of places where we use two reghop3's that should
7172    be replaced with this routine. but since thats not done yet
7173    we ifdef it out - dmq
7174 */
7175 STATIC U8 *
7176 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7177 {
7178     dVAR;
7179
7180     PERL_ARGS_ASSERT_REGHOP4;
7181
7182     if (off >= 0) {
7183         while (off-- && s < rlim) {
7184             /* XXX could check well-formedness here */
7185             s += UTF8SKIP(s);
7186         }
7187     }
7188     else {
7189         while (off++ && s > llim) {
7190             s--;
7191             if (UTF8_IS_CONTINUED(*s)) {
7192                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7193                     s--;
7194             }
7195             /* XXX could check well-formedness here */
7196         }
7197     }
7198     return s;
7199 }
7200 #endif
7201
7202 STATIC U8 *
7203 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7204 {
7205     dVAR;
7206
7207     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7208
7209     if (off >= 0) {
7210         while (off-- && s < lim) {
7211             /* XXX could check well-formedness here */
7212             s += UTF8SKIP(s);
7213         }
7214         if (off >= 0)
7215             return NULL;
7216     }
7217     else {
7218         while (off++ && s > lim) {
7219             s--;
7220             if (UTF8_IS_CONTINUED(*s)) {
7221                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7222                     s--;
7223             }
7224             /* XXX could check well-formedness here */
7225         }
7226         if (off <= 0)
7227             return NULL;
7228     }
7229     return s;
7230 }
7231
7232 static void
7233 restore_pos(pTHX_ void *arg)
7234 {
7235     dVAR;
7236     regexp * const rex = (regexp *)arg;
7237     if (PL_reg_state.re_state_eval_setup_done) {
7238         if (PL_reg_oldsaved) {
7239             rex->subbeg = PL_reg_oldsaved;
7240             rex->sublen = PL_reg_oldsavedlen;
7241             rex->suboffset = PL_reg_oldsavedoffset;
7242             rex->subcoffset = PL_reg_oldsavedcoffset;
7243 #ifdef PERL_OLD_COPY_ON_WRITE
7244             rex->saved_copy = PL_nrs;
7245 #endif
7246             RXp_MATCH_COPIED_on(rex);
7247         }
7248         PL_reg_magic->mg_len = PL_reg_oldpos;
7249         PL_reg_state.re_state_eval_setup_done = FALSE;
7250         PL_curpm = PL_reg_oldcurpm;
7251     }
7252 }
7253
7254 STATIC void
7255 S_to_utf8_substr(pTHX_ register regexp *prog)
7256 {
7257     int i = 1;
7258
7259     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7260
7261     do {
7262         if (prog->substrs->data[i].substr
7263             && !prog->substrs->data[i].utf8_substr) {
7264             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7265             prog->substrs->data[i].utf8_substr = sv;
7266             sv_utf8_upgrade(sv);
7267             if (SvVALID(prog->substrs->data[i].substr)) {
7268                 if (SvTAIL(prog->substrs->data[i].substr)) {
7269                     /* Trim the trailing \n that fbm_compile added last
7270                        time.  */
7271                     SvCUR_set(sv, SvCUR(sv) - 1);
7272                     /* Whilst this makes the SV technically "invalid" (as its
7273                        buffer is no longer followed by "\0") when fbm_compile()
7274                        adds the "\n" back, a "\0" is restored.  */
7275                     fbm_compile(sv, FBMcf_TAIL);
7276                 } else
7277                     fbm_compile(sv, 0);
7278             }
7279             if (prog->substrs->data[i].substr == prog->check_substr)
7280                 prog->check_utf8 = sv;
7281         }
7282     } while (i--);
7283 }
7284
7285 STATIC void
7286 S_to_byte_substr(pTHX_ register regexp *prog)
7287 {
7288     dVAR;
7289     int i = 1;
7290
7291     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7292
7293     do {
7294         if (prog->substrs->data[i].utf8_substr
7295             && !prog->substrs->data[i].substr) {
7296             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7297             if (sv_utf8_downgrade(sv, TRUE)) {
7298                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7299                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7300                         /* Trim the trailing \n that fbm_compile added last
7301                            time.  */
7302                         SvCUR_set(sv, SvCUR(sv) - 1);
7303                         fbm_compile(sv, FBMcf_TAIL);
7304                     } else
7305                         fbm_compile(sv, 0);
7306                 }
7307             } else {
7308                 SvREFCNT_dec(sv);
7309                 sv = &PL_sv_undef;
7310             }
7311             prog->substrs->data[i].substr = sv;
7312             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7313                 prog->check_substr = sv;
7314         }
7315     } while (i--);
7316 }
7317
7318 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
7319  * pre-composed Hangul syllables, which are all in a contiguous block and
7320  * arranged there in such a way so as to facilitate alorithmic determination of
7321  * their characteristics.  As such, they don't need a swash, but can be
7322  * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
7323  * is a GCB=LV */
7324 #define SBASE 0xAC00    /* Start of block */
7325 #define SCount 11172    /* Length of block */
7326 #define TCount 28
7327
7328 #if 0   /* This routine is not currently used */
7329 PERL_STATIC_INLINE bool
7330 S_is_utf8_X_LV(pTHX_ const U8 *p)
7331 {
7332     /* Unlike most other similarly named routines here, this does not create a
7333      * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7334
7335     dVAR;
7336
7337     UV cp = valid_utf8_to_uvchr(p, NULL);
7338
7339     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7340
7341     /* The earliest Unicode releases did not have these precomposed Hangul
7342      * syllables.  Set to point to undef in that case, so will return false on
7343      * every call */
7344     if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
7345         PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7346         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7347             SvREFCNT_dec(PL_utf8_X_LV);
7348             PL_utf8_X_LV = &PL_sv_undef;
7349         }
7350     }
7351
7352     return (PL_utf8_X_LV != &PL_sv_undef
7353             && cp >= SBASE && cp < SBASE + SCount
7354             && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7355 }
7356 #endif
7357
7358 PERL_STATIC_INLINE bool
7359 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7360 {
7361     /* Unlike most other similarly named routines here, this does not create a
7362      * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7363
7364     dVAR;
7365
7366     UV cp = valid_utf8_to_uvchr(p, NULL);
7367
7368     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7369
7370     /* The earliest Unicode releases did not have these precomposed Hangul
7371      * syllables.  Set to point to undef in that case, so will return false on
7372      * every call */
7373     if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
7374         PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7375         if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7376             SvREFCNT_dec(PL_utf8_X_LVT);
7377             PL_utf8_X_LVT = &PL_sv_undef;
7378         }
7379     }
7380
7381     return (PL_utf8_X_LVT != &PL_sv_undef
7382             && cp >= SBASE && cp < SBASE + SCount
7383             && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7384 }
7385
7386 /*
7387  * Local variables:
7388  * c-indentation-style: bsd
7389  * c-basic-offset: 4
7390  * indent-tabs-mode: nil
7391  * End:
7392  *
7393  * ex: set ts=8 sts=4 sw=4 et:
7394  */