regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #include "inline_invlist.c"
  84 #include "utf8_strings.h"
  85
  86 #define RF_tainted      1       /* tainted information used? e.g. locale */
  87 #define RF_warned       2               /* warned about big count? */
  88
  89 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  90
  91 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { \
 127         bool ok; \
 128         ENTER; save_re_context(); \
 129         ok=CAT2(is_utf8_,class)((const U8*)str); \
 130         PERL_UNUSED_VAR(ok); \
 131         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 132 /* Doesn't do an assert to verify that is correct */
 133 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 134     if (!CAT2(PL_utf8_,class)) { \
 135         bool throw_away PERL_UNUSED_DECL; \
 136         ENTER; save_re_context(); \
 137         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 138         LEAVE; } } STMT_END
 139
 140 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 141 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 142 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 143
 144 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 145         /* No asserts are done for some of these, in case called on a   */  \
 146         /* Unicode version in which they map to nothing */                  \
 147         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
 149         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
 151         LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8);              \
 152         LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8);      \
 153         LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI);    /* empty in many releases */ \
 154         LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8);             \
 155         LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
 156
 157 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 158
 159 /* The actual code for CCC_TRY, which uses several variables from the routine
 160  * it's callable from.  It is designed to be the bulk of a case statement.
 161  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 162  *      nextchr matches the class.
 163  * UTF8_TEST is the whole test string to use for utf8 targets
 164  * LOAD is what to use to test, and if not present to load in the swash for the
 165  *      class
 166  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 167  *      UTF8_TEST test.
 168  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 169  * utf8 and a variant, load the swash if necessary and test using the utf8
 170  * test.  Advance to the next character if test is ok, otherwise fail; If not
 171  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 172  * fails, or advance to the next character */
 173
 174 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 175     if (locinput >= PL_regeol) {                                              \
 176         sayNO;                                                                \
 177     }                                                                         \
 178     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 179         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 180         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 181             sayNO;                                                            \
 182         }                                                                     \
 183         locinput += PL_utf8skip[nextchr];                                     \
 184         nextchr = UCHARAT(locinput);                                          \
 185         break;                                                                \
 186     }                                                                         \
 187     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 188         sayNO;                                                                \
 189     }                                                                         \
 190     nextchr = UCHARAT(++locinput);                                            \
 191     break;
 192
 193 /* Handle the non-locale cases for a character class and its complement.  It
 194  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 195  * This is because that code fails when the test succeeds, so we want to have
 196  * the test fail so that the code succeeds.  The swash is stored in a
 197  * predictable PL_ place */
 198 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 199                            CLASS, STR)                                        \
 200     case NAME:                                                                \
 201         _CCC_TRY_CODE( !, FUNC,                                               \
 202                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 203                                             (U8*)locinput, TRUE)),            \
 204                           CLASS, STR)                                         \
 205     case NNAME:                                                               \
 206         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 207                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 208                                             (U8*)locinput, TRUE)),            \
 209                           CLASS, STR)                                         \
 210
 211 /* Generate the case statements for both locale and non-locale character
 212  * classes in regmatch for classes that don't have special unicode semantics.
 213  * Locales don't use an immediate swash, but an intermediary special locale
 214  * function that is called on the pointer to the current place in the input
 215  * string.  That function will resolve to needing the same swash.  One might
 216  * think that because we don't know what the locale will match, we shouldn't
 217  * check with the swash loading function that it loaded properly; ie, that we
 218  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 219  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 220  * irrelevant here */
 221 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 222                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 223                 NAMEA, NNAMEA, FUNCA,                                         \
 224                 CLASS, STR)                                                   \
 225     case NAMEL:                                                               \
 226         PL_reg_flags |= RF_tainted;                                           \
 227         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 228     case NNAMEL:                                                              \
 229         PL_reg_flags |= RF_tainted;                                           \
 230         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 231                        CLASS, STR)                                            \
 232     case NAMEA:                                                               \
 233         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 234             sayNO;                                                            \
 235         }                                                                     \
 236         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 237         nextchr = UCHARAT(++locinput);                                        \
 238         break;                                                                \
 239     case NNAMEA:                                                              \
 240         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 241             sayNO;                                                            \
 242         }                                                                     \
 243         if (utf8_target) {                                                    \
 244             locinput += PL_utf8skip[nextchr];                                 \
 245             nextchr = UCHARAT(locinput);                                      \
 246         }                                                                     \
 247         else {                                                                \
 248             nextchr = UCHARAT(++locinput);                                    \
 249         }                                                                     \
 250         break;                                                                \
 251     /* Generate the non-locale cases */                                       \
 252     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 253
 254 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 255  * statements to handle separate Unicode semantics nodes */
 256 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 257                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 258                   NAMEU, NNAMEU, FUNCU,                                        \
 259                   NAMEA, NNAMEA, FUNCA,                                        \
 260                   CLASS, STR)                                                  \
 261     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 262             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 263             NAMEA, NNAMEA, FUNCA,                                              \
 264             CLASS, STR)                                                        \
 265     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 266
 267 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 268
 269 /* for use after a quantifier and before an EXACT-like node -- japhy */
 270 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 271  *
 272  * NOTE that *nothing* that affects backtracking should be in here, specifically
 273  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 274  * node that is in between two EXACT like nodes when ascertaining what the required
 275  * "follow" character is. This should probably be moved to regex compile time
 276  * although it may be done at run time beause of the REF possibility - more
 277  * investigation required. -- demerphq
 278 */
 279 #define JUMPABLE(rn) (      \
 280     OP(rn) == OPEN ||       \
 281     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 282     OP(rn) == EVAL ||   \
 283     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 284     OP(rn) == PLUS || OP(rn) == MINMOD || \
 285     OP(rn) == KEEPS || \
 286     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 287 )
 288 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 289
 290 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 291
 292 #if 0
 293 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 294    we don't need this definition. */
 295 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 296 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 297 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 298
 299 #else
 300 /* ... so we use this as its faster. */
 301 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 302 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 305
 306 #endif
 307
 308 /*
 309   Search for mandatory following text node; for lookahead, the text must
 310   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 311 */
 312 #define FIND_NEXT_IMPT(rn) STMT_START { \
 313     while (JUMPABLE(rn)) { \
 314         const OPCODE type = OP(rn); \
 315         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 316             rn = NEXTOPER(NEXTOPER(rn)); \
 317         else if (type == PLUS) \
 318             rn = NEXTOPER(rn); \
 319         else if (type == IFMATCH) \
 320             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 321         else rn += NEXT_OFF(rn); \
 322     } \
 323 } STMT_END
 324
 325
 326 static void restore_pos(pTHX_ void *arg);
 327
 328 #define REGCP_PAREN_ELEMS 3
 329 #define REGCP_OTHER_ELEMS 3
 330 #define REGCP_FRAME_ELEMS 1
 331 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 332  * are needed for the regexp context stack bookkeeping. */
 333
 334 STATIC CHECKPOINT
 335 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
 336 {
 337     dVAR;
 338     const int retval = PL_savestack_ix;
 339     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 340     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 341     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 342     I32 p;
 343     GET_RE_DEBUG_FLAGS_DECL;
 344
 345     PERL_ARGS_ASSERT_REGCPPUSH;
 346
 347     if (paren_elems_to_push < 0)
 348         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 349                    paren_elems_to_push);
 350
 351     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 352         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 353                    " out of range (%lu-%ld)",
 354                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 355
 356     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 357
 358     DEBUG_BUFFERS_r(
 359         if ((int)PL_regsize > (int)parenfloor)
 360             PerlIO_printf(Perl_debug_log,
 361                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 362                 PTR2UV(rex),
 363                 PTR2UV(rex->offs)
 364             );
 365     );
 366     for (p = parenfloor+1; p <= (I32)PL_regsize;  p++) {
 367 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 368         SSPUSHINT(rex->offs[p].end);
 369         SSPUSHINT(rex->offs[p].start);
 370         SSPUSHINT(rex->offs[p].start_tmp);
 371         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 372             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 373             (UV)p,
 374             (IV)rex->offs[p].start,
 375             (IV)rex->offs[p].start_tmp,
 376             (IV)rex->offs[p].end
 377         ));
 378     }
 379 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 380     SSPUSHINT(PL_regsize);
 381     SSPUSHINT(rex->lastparen);
 382     SSPUSHINT(rex->lastcloseparen);
 383     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 384
 385     return retval;
 386 }
 387
 388 /* These are needed since we do not localize EVAL nodes: */
 389 #define REGCP_SET(cp)                                           \
 390     DEBUG_STATE_r(                                              \
 391             PerlIO_printf(Perl_debug_log,                       \
 392                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 393                 (IV)PL_savestack_ix));                          \
 394     cp = PL_savestack_ix
 395
 396 #define REGCP_UNWIND(cp)                                        \
 397     DEBUG_STATE_r(                                              \
 398         if (cp != PL_savestack_ix)                              \
 399             PerlIO_printf(Perl_debug_log,                       \
 400                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 401                 (IV)(cp), (IV)PL_savestack_ix));                \
 402     regcpblow(cp)
 403
 404 #define UNWIND_PAREN(lp, lcp)               \
 405     for (n = rex->lastparen; n > lp; n--)   \
 406         rex->offs[n].end = -1;              \
 407     rex->lastparen = n;                     \
 408     rex->lastcloseparen = lcp;
 409
 410
 411 STATIC void
 412 S_regcppop(pTHX_ regexp *rex)
 413 {
 414     dVAR;
 415     UV i;
 416     U32 paren;
 417     GET_RE_DEBUG_FLAGS_DECL;
 418
 419     PERL_ARGS_ASSERT_REGCPPOP;
 420
 421     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 422     i = SSPOPUV;
 423     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 424     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 425     rex->lastcloseparen = SSPOPINT;
 426     rex->lastparen = SSPOPINT;
 427     PL_regsize = SSPOPINT;
 428
 429     i -= REGCP_OTHER_ELEMS;
 430     /* Now restore the parentheses context. */
 431     DEBUG_BUFFERS_r(
 432         if (i || rex->lastparen + 1 <= rex->nparens)
 433             PerlIO_printf(Perl_debug_log,
 434                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 435                 PTR2UV(rex),
 436                 PTR2UV(rex->offs)
 437             );
 438     );
 439     paren = PL_regsize;
 440     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 441         I32 tmps;
 442         rex->offs[paren].start_tmp = SSPOPINT;
 443         rex->offs[paren].start = SSPOPINT;
 444         tmps = SSPOPINT;
 445         if (paren <= rex->lastparen)
 446             rex->offs[paren].end = tmps;
 447         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 448             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 449             (UV)paren,
 450             (IV)rex->offs[paren].start,
 451             (IV)rex->offs[paren].start_tmp,
 452             (IV)rex->offs[paren].end,
 453             (paren > rex->lastparen ? "(skipped)" : ""));
 454         );
 455         paren--;
 456     }
 457 #if 1
 458     /* It would seem that the similar code in regtry()
 459      * already takes care of this, and in fact it is in
 460      * a better location to since this code can #if 0-ed out
 461      * but the code in regtry() is needed or otherwise tests
 462      * requiring null fields (pat.t#187 and split.t#{13,14}
 463      * (as of patchlevel 7877)  will fail.  Then again,
 464      * this code seems to be necessary or otherwise
 465      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 466      * --jhi updated by dapm */
 467     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 468         if (i > PL_regsize)
 469             rex->offs[i].start = -1;
 470         rex->offs[i].end = -1;
 471         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 472             "    \\%"UVuf": %s   ..-1 undeffing\n",
 473             (UV)i,
 474             (i > PL_regsize) ? "-1" : "  "
 475         ));
 476     }
 477 #endif
 478 }
 479
 480 /* restore the parens and associated vars at savestack position ix,
 481  * but without popping the stack */
 482
 483 STATIC void
 484 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
 485 {
 486     I32 tmpix = PL_savestack_ix;
 487     PL_savestack_ix = ix;
 488     regcppop(rex);
 489     PL_savestack_ix = tmpix;
 490 }
 491
 492 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 493
 494 /*
 495  * pregexec and friends
 496  */
 497
 498 #ifndef PERL_IN_XSUB_RE
 499 /*
 500  - pregexec - match a regexp against a string
 501  */
 502 I32
 503 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 504          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 505 /* strend: pointer to null at end of string */
 506 /* strbeg: real beginning of string */
 507 /* minend: end of match must be >=minend after stringarg. */
 508 /* nosave: For optimizations. */
 509 {
 510     PERL_ARGS_ASSERT_PREGEXEC;
 511
 512     return
 513         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 514                       nosave ? 0 : REXEC_COPY_STR);
 515 }
 516 #endif
 517
 518 /*
 519  * Need to implement the following flags for reg_anch:
 520  *
 521  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 522  * USE_INTUIT_ML
 523  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 524  * INTUIT_AUTORITATIVE_ML
 525  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 526  * INTUIT_ONCE_ML
 527  *
 528  * Another flag for this function: SECOND_TIME (so that float substrs
 529  * with giant delta may be not rechecked).
 530  */
 531
 532 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 533
 534 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 535    Otherwise, only SvCUR(sv) is used to get strbeg. */
 536
 537 /* XXXX We assume that strpos is strbeg unless sv. */
 538
 539 /* XXXX Some places assume that there is a fixed substring.
 540         An update may be needed if optimizer marks as "INTUITable"
 541         RExen without fixed substrings.  Similarly, it is assumed that
 542         lengths of all the strings are no more than minlen, thus they
 543         cannot come from lookahead.
 544         (Or minlen should take into account lookahead.)
 545   NOTE: Some of this comment is not correct. minlen does now take account
 546   of lookahead/behind. Further research is required. -- demerphq
 547
 548 */
 549
 550 /* A failure to find a constant substring means that there is no need to make
 551    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 552    finding a substring too deep into the string means that less calls to
 553    regtry() should be needed.
 554
 555    REx compiler's optimizer found 4 possible hints:
 556         a) Anchored substring;
 557         b) Fixed substring;
 558         c) Whether we are anchored (beginning-of-line or \G);
 559         d) First node (of those at offset 0) which may distinguish positions;
 560    We use a)b)d) and multiline-part of c), and try to find a position in the
 561    string which does not contradict any of them.
 562  */
 563
 564 /* Most of decisions we do here should have been done at compile time.
 565    The nodes of the REx which we used for the search should have been
 566    deleted from the finite automaton. */
 567
 568 char *
 569 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 570                      char *strend, const U32 flags, re_scream_pos_data *data)
 571 {
 572     dVAR;
 573     struct regexp *const prog = (struct regexp *)SvANY(rx);
 574     I32 start_shift = 0;
 575     /* Should be nonnegative! */
 576     I32 end_shift   = 0;
 577     char *s;
 578     SV *check;
 579     char *strbeg;
 580     char *t;
 581     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 582     I32 ml_anch;
 583     char *other_last = NULL;    /* other substr checked before this */
 584     char *check_at = NULL;              /* check substr found at this pos */
 585     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 586     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 587     RXi_GET_DECL(prog,progi);
 588 #ifdef DEBUGGING
 589     const char * const i_strpos = strpos;
 590 #endif
 591     GET_RE_DEBUG_FLAGS_DECL;
 592
 593     PERL_ARGS_ASSERT_RE_INTUIT_START;
 594     PERL_UNUSED_ARG(flags);
 595     PERL_UNUSED_ARG(data);
 596
 597     RX_MATCH_UTF8_set(rx,utf8_target);
 598
 599     if (RX_UTF8(rx)) {
 600         PL_reg_flags |= RF_utf8;
 601     }
 602     DEBUG_EXECUTE_r(
 603         debug_start_match(rx, utf8_target, strpos, strend,
 604             sv ? "Guessing start of match in sv for"
 605                : "Guessing start of match in string for");
 606               );
 607
 608     /* CHR_DIST() would be more correct here but it makes things slow. */
 609     if (prog->minlen > strend - strpos) {
 610         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 611                               "String too short... [re_intuit_start]\n"));
 612         goto fail;
 613     }
 614
 615     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 616     PL_regeol = strend;
 617     if (utf8_target) {
 618         if (!prog->check_utf8 && prog->check_substr)
 619             to_utf8_substr(prog);
 620         check = prog->check_utf8;
 621     } else {
 622         if (!prog->check_substr && prog->check_utf8)
 623             to_byte_substr(prog);
 624         check = prog->check_substr;
 625     }
 626     if (check == &PL_sv_undef) {
 627         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 628                 "Non-utf8 string cannot match utf8 check string\n"));
 629         goto fail;
 630     }
 631     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 632         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 633                      || ( (prog->extflags & RXf_ANCH_BOL)
 634                           && !multiline ) );    /* Check after \n? */
 635
 636         if (!ml_anch) {
 637           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 638                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 639                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 640                && sv && !SvROK(sv)
 641                && (strpos != strbeg)) {
 642               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 643               goto fail;
 644           }
 645           if (prog->check_offset_min == prog->check_offset_max &&
 646               !(prog->extflags & RXf_CANY_SEEN)) {
 647             /* Substring at constant offset from beg-of-str... */
 648             I32 slen;
 649
 650             s = HOP3c(strpos, prog->check_offset_min, strend);
 651
 652             if (SvTAIL(check)) {
 653                 slen = SvCUR(check);    /* >= 1 */
 654
 655                 if ( strend - s > slen || strend - s < slen - 1
 656                      || (strend - s == slen && strend[-1] != '\n')) {
 657                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 658                     goto fail_finish;
 659                 }
 660                 /* Now should match s[0..slen-2] */
 661                 slen--;
 662                 if (slen && (*SvPVX_const(check) != *s
 663                              || (slen > 1
 664                                  && memNE(SvPVX_const(check), s, slen)))) {
 665                   report_neq:
 666                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 667                     goto fail_finish;
 668                 }
 669             }
 670             else if (*SvPVX_const(check) != *s
 671                      || ((slen = SvCUR(check)) > 1
 672                          && memNE(SvPVX_const(check), s, slen)))
 673                 goto report_neq;
 674             check_at = s;
 675             goto success_at_start;
 676           }
 677         }
 678         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 679         s = strpos;
 680         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 681         end_shift = prog->check_end_shift;
 682
 683         if (!ml_anch) {
 684             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 685                                          - (SvTAIL(check) != 0);
 686             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 687
 688             if (end_shift < eshift)
 689                 end_shift = eshift;
 690         }
 691     }
 692     else {                              /* Can match at random position */
 693         ml_anch = 0;
 694         s = strpos;
 695         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 696         end_shift = prog->check_end_shift;
 697
 698         /* end shift should be non negative here */
 699     }
 700
 701 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 702     if (end_shift < 0)
 703         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 704                    (IV)end_shift, RX_PRECOMP(prog));
 705 #endif
 706
 707   restart:
 708     /* Find a possible match in the region s..strend by looking for
 709        the "check" substring in the region corrected by start/end_shift. */
 710
 711     {
 712         I32 srch_start_shift = start_shift;
 713         I32 srch_end_shift = end_shift;
 714         U8* start_point;
 715         U8* end_point;
 716         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 717             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 718             srch_start_shift = strbeg - s;
 719         }
 720     DEBUG_OPTIMISE_MORE_r({
 721         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 722             (IV)prog->check_offset_min,
 723             (IV)srch_start_shift,
 724             (IV)srch_end_shift,
 725             (IV)prog->check_end_shift);
 726     });
 727
 728         if (prog->extflags & RXf_CANY_SEEN) {
 729             start_point= (U8*)(s + srch_start_shift);
 730             end_point= (U8*)(strend - srch_end_shift);
 731         } else {
 732             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 733             end_point= HOP3(strend, -srch_end_shift, strbeg);
 734         }
 735         DEBUG_OPTIMISE_MORE_r({
 736             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 737                 (int)(end_point - start_point),
 738                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 739                 start_point);
 740         });
 741
 742         s = fbm_instr( start_point, end_point,
 743                       check, multiline ? FBMrf_MULTILINE : 0);
 744     }
 745     /* Update the count-of-usability, remove useless subpatterns,
 746         unshift s.  */
 747
 748     DEBUG_EXECUTE_r({
 749         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 750             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 751         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 752                           (s ? "Found" : "Did not find"),
 753             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 754                 ? "anchored" : "floating"),
 755             quoted,
 756             RE_SV_TAIL(check),
 757             (s ? " at offset " : "...\n") );
 758     });
 759
 760     if (!s)
 761         goto fail_finish;
 762     /* Finish the diagnostic message */
 763     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 764
 765     /* XXX dmq: first branch is for positive lookbehind...
 766        Our check string is offset from the beginning of the pattern.
 767        So we need to do any stclass tests offset forward from that
 768        point. I think. :-(
 769      */
 770
 771
 772
 773     check_at=s;
 774
 775
 776     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 777        Start with the other substr.
 778        XXXX no SCREAM optimization yet - and a very coarse implementation
 779        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 780                 *always* match.  Probably should be marked during compile...
 781        Probably it is right to do no SCREAM here...
 782      */
 783
 784     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 785                 : (prog->float_substr && prog->anchored_substr))
 786     {
 787         /* Take into account the "other" substring. */
 788         /* XXXX May be hopelessly wrong for UTF... */
 789         if (!other_last)
 790             other_last = strpos;
 791         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 792           do_other_anchored:
 793             {
 794                 char * const last = HOP3c(s, -start_shift, strbeg);
 795                 char *last1, *last2;
 796                 char * const saved_s = s;
 797                 SV* must;
 798
 799                 t = s - prog->check_offset_max;
 800                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 801                     && (!utf8_target
 802                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 803                             && t > strpos)))
 804                     NOOP;
 805                 else
 806                     t = strpos;
 807                 t = HOP3c(t, prog->anchored_offset, strend);
 808                 if (t < other_last)     /* These positions already checked */
 809                     t = other_last;
 810                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 811                 if (last < last1)
 812                     last1 = last;
 813                 /* XXXX It is not documented what units *_offsets are in.
 814                    We assume bytes, but this is clearly wrong.
 815                    Meaning this code needs to be carefully reviewed for errors.
 816                    dmq.
 817                   */
 818
 819                 /* On end-of-str: see comment below. */
 820                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 821                 if (must == &PL_sv_undef) {
 822                     s = (char*)NULL;
 823                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 824                 }
 825                 else
 826                     s = fbm_instr(
 827                         (unsigned char*)t,
 828                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 829                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 830                         must,
 831                         multiline ? FBMrf_MULTILINE : 0
 832                     );
 833                 DEBUG_EXECUTE_r({
 834                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 835                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 836                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 837                         (s ? "Found" : "Contradicts"),
 838                         quoted, RE_SV_TAIL(must));
 839                 });
 840
 841
 842                 if (!s) {
 843                     if (last1 >= last2) {
 844                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 845                                                 ", giving up...\n"));
 846                         goto fail_finish;
 847                     }
 848                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 849                         ", trying floating at offset %ld...\n",
 850                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 851                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 852                     s = HOP3c(last, 1, strend);
 853                     goto restart;
 854                 }
 855                 else {
 856                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 857                           (long)(s - i_strpos)));
 858                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 859                     other_last = HOP3c(s, 1, strend);
 860                     s = saved_s;
 861                     if (t == strpos)
 862                         goto try_at_start;
 863                     goto try_at_offset;
 864                 }
 865             }
 866         }
 867         else {          /* Take into account the floating substring. */
 868             char *last, *last1;
 869             char * const saved_s = s;
 870             SV* must;
 871
 872             t = HOP3c(s, -start_shift, strbeg);
 873             last1 = last =
 874                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 875             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 876                 last = HOP3c(t, prog->float_max_offset, strend);
 877             s = HOP3c(t, prog->float_min_offset, strend);
 878             if (s < other_last)
 879                 s = other_last;
 880  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 881             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 882             /* fbm_instr() takes into account exact value of end-of-str
 883                if the check is SvTAIL(ed).  Since false positives are OK,
 884                and end-of-str is not later than strend we are OK. */
 885             if (must == &PL_sv_undef) {
 886                 s = (char*)NULL;
 887                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 888             }
 889             else
 890                 s = fbm_instr((unsigned char*)s,
 891                               (unsigned char*)last + SvCUR(must)
 892                                   - (SvTAIL(must)!=0),
 893                               must, multiline ? FBMrf_MULTILINE : 0);
 894             DEBUG_EXECUTE_r({
 895                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 896                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 897                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 898                     (s ? "Found" : "Contradicts"),
 899                     quoted, RE_SV_TAIL(must));
 900             });
 901             if (!s) {
 902                 if (last1 == last) {
 903                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 904                                             ", giving up...\n"));
 905                     goto fail_finish;
 906                 }
 907                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 908                     ", trying anchored starting at offset %ld...\n",
 909                     (long)(saved_s + 1 - i_strpos)));
 910                 other_last = last;
 911                 s = HOP3c(t, 1, strend);
 912                 goto restart;
 913             }
 914             else {
 915                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 916                       (long)(s - i_strpos)));
 917                 other_last = s; /* Fix this later. --Hugo */
 918                 s = saved_s;
 919                 if (t == strpos)
 920                     goto try_at_start;
 921                 goto try_at_offset;
 922             }
 923         }
 924     }
 925
 926
 927     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 928
 929     DEBUG_OPTIMISE_MORE_r(
 930         PerlIO_printf(Perl_debug_log,
 931             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 932             (IV)prog->check_offset_min,
 933             (IV)prog->check_offset_max,
 934             (IV)(s-strpos),
 935             (IV)(t-strpos),
 936             (IV)(t-s),
 937             (IV)(strend-strpos)
 938         )
 939     );
 940
 941     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 942         && (!utf8_target
 943             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 944                  && t > strpos)))
 945     {
 946         /* Fixed substring is found far enough so that the match
 947            cannot start at strpos. */
 948       try_at_offset:
 949         if (ml_anch && t[-1] != '\n') {
 950             /* Eventually fbm_*() should handle this, but often
 951                anchored_offset is not 0, so this check will not be wasted. */
 952             /* XXXX In the code below we prefer to look for "^" even in
 953                presence of anchored substrings.  And we search even
 954                beyond the found float position.  These pessimizations
 955                are historical artefacts only.  */
 956           find_anchor:
 957             while (t < strend - prog->minlen) {
 958                 if (*t == '\n') {
 959                     if (t < check_at - prog->check_offset_min) {
 960                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 961                             /* Since we moved from the found position,
 962                                we definitely contradict the found anchored
 963                                substr.  Due to the above check we do not
 964                                contradict "check" substr.
 965                                Thus we can arrive here only if check substr
 966                                is float.  Redo checking for "other"=="fixed".
 967                              */
 968                             strpos = t + 1;
 969                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 970                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 971                             goto do_other_anchored;
 972                         }
 973                         /* We don't contradict the found floating substring. */
 974                         /* XXXX Why not check for STCLASS? */
 975                         s = t + 1;
 976                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 977                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 978                         goto set_useful;
 979                     }
 980                     /* Position contradicts check-string */
 981                     /* XXXX probably better to look for check-string
 982                        than for "\n", so one should lower the limit for t? */
 983                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 984                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 985                     other_last = strpos = s = t + 1;
 986                     goto restart;
 987                 }
 988                 t++;
 989             }
 990             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 991                         PL_colors[0], PL_colors[1]));
 992             goto fail_finish;
 993         }
 994         else {
 995             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 996                         PL_colors[0], PL_colors[1]));
 997         }
 998         s = t;
 999       set_useful:
1000         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1001     }
1002     else {
1003         /* The found string does not prohibit matching at strpos,
1004            - no optimization of calling REx engine can be performed,
1005            unless it was an MBOL and we are not after MBOL,
1006            or a future STCLASS check will fail this. */
1007       try_at_start:
1008         /* Even in this situation we may use MBOL flag if strpos is offset
1009            wrt the start of the string. */
1010         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1011             && (strpos != strbeg) && strpos[-1] != '\n'
1012             /* May be due to an implicit anchor of m{.*foo}  */
1013             && !(prog->intflags & PREGf_IMPLICIT))
1014         {
1015             t = strpos;
1016             goto find_anchor;
1017         }
1018         DEBUG_EXECUTE_r( if (ml_anch)
1019             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1020                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1021         );
1022       success_at_start:
1023         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1024             && (utf8_target ? (
1025                 prog->check_utf8                /* Could be deleted already */
1026                 && --BmUSEFUL(prog->check_utf8) < 0
1027                 && (prog->check_utf8 == prog->float_utf8)
1028             ) : (
1029                 prog->check_substr              /* Could be deleted already */
1030                 && --BmUSEFUL(prog->check_substr) < 0
1031                 && (prog->check_substr == prog->float_substr)
1032             )))
1033         {
1034             /* If flags & SOMETHING - do not do it many times on the same match */
1035             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1036             /* XXX Does the destruction order has to change with utf8_target? */
1037             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1038             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1039             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1040             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1041             check = NULL;                       /* abort */
1042             s = strpos;
1043             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1044                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1045             if (prog->intflags & PREGf_IMPLICIT)
1046                 prog->extflags &= ~RXf_ANCH_MBOL;
1047             /* XXXX This is a remnant of the old implementation.  It
1048                     looks wasteful, since now INTUIT can use many
1049                     other heuristics. */
1050             prog->extflags &= ~RXf_USE_INTUIT;
1051             /* XXXX What other flags might need to be cleared in this branch? */
1052         }
1053         else
1054             s = strpos;
1055     }
1056
1057     /* Last resort... */
1058     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1059     /* trie stclasses are too expensive to use here, we are better off to
1060        leave it to regmatch itself */
1061     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1062         /* minlen == 0 is possible if regstclass is \b or \B,
1063            and the fixed substr is ''$.
1064            Since minlen is already taken into account, s+1 is before strend;
1065            accidentally, minlen >= 1 guaranties no false positives at s + 1
1066            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1067            regstclass does not come from lookahead...  */
1068         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1069            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1070         const U8* const str = (U8*)STRING(progi->regstclass);
1071         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1072                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1073                     : 1);
1074         char * endpos;
1075         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1076             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1077         else if (prog->float_substr || prog->float_utf8)
1078             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1079         else
1080             endpos= strend;
1081
1082         if (checked_upto < s)
1083            checked_upto = s;
1084         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1085                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1086
1087         t = s;
1088         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1089         if (s) {
1090             checked_upto = s;
1091         } else {
1092 #ifdef DEBUGGING
1093             const char *what = NULL;
1094 #endif
1095             if (endpos == strend) {
1096                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097                                 "Could not match STCLASS...\n") );
1098                 goto fail;
1099             }
1100             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1101                                    "This position contradicts STCLASS...\n") );
1102             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1103                 goto fail;
1104             checked_upto = HOPBACKc(endpos, start_shift);
1105             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1106                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1107             /* Contradict one of substrings */
1108             if (prog->anchored_substr || prog->anchored_utf8) {
1109                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1110                     DEBUG_EXECUTE_r( what = "anchored" );
1111                   hop_and_restart:
1112                     s = HOP3c(t, 1, strend);
1113                     if (s + start_shift + end_shift > strend) {
1114                         /* XXXX Should be taken into account earlier? */
1115                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                                                "Could not match STCLASS...\n") );
1117                         goto fail;
1118                     }
1119                     if (!check)
1120                         goto giveup;
1121                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122                                 "Looking for %s substr starting at offset %ld...\n",
1123                                  what, (long)(s + start_shift - i_strpos)) );
1124                     goto restart;
1125                 }
1126                 /* Have both, check_string is floating */
1127                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1128                     goto retry_floating_check;
1129                 /* Recheck anchored substring, but not floating... */
1130                 s = check_at;
1131                 if (!check)
1132                     goto giveup;
1133                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1134                           "Looking for anchored substr starting at offset %ld...\n",
1135                           (long)(other_last - i_strpos)) );
1136                 goto do_other_anchored;
1137             }
1138             /* Another way we could have checked stclass at the
1139                current position only: */
1140             if (ml_anch) {
1141                 s = t = t + 1;
1142                 if (!check)
1143                     goto giveup;
1144                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1145                           "Looking for /%s^%s/m starting at offset %ld...\n",
1146                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1147                 goto try_at_offset;
1148             }
1149             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1150                 goto fail;
1151             /* Check is floating substring. */
1152           retry_floating_check:
1153             t = check_at - start_shift;
1154             DEBUG_EXECUTE_r( what = "floating" );
1155             goto hop_and_restart;
1156         }
1157         if (t != s) {
1158             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1159                         "By STCLASS: moving %ld --> %ld\n",
1160                                   (long)(t - i_strpos), (long)(s - i_strpos))
1161                    );
1162         }
1163         else {
1164             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1165                                   "Does not contradict STCLASS...\n");
1166                    );
1167         }
1168     }
1169   giveup:
1170     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1171                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1172                           PL_colors[5], (long)(s - i_strpos)) );
1173     return s;
1174
1175   fail_finish:                          /* Substring not found */
1176     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1177         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1178   fail:
1179     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1180                           PL_colors[4], PL_colors[5]));
1181     return NULL;
1182 }
1183
1184 #define DECL_TRIE_TYPE(scan) \
1185     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1186                     trie_type = ((scan->flags == EXACT) \
1187                               ? (utf8_target ? trie_utf8 : trie_plain) \
1188                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1189
1190 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1191 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1192     STRLEN skiplen;                                                                 \
1193     switch (trie_type) {                                                            \
1194     case trie_utf8_fold:                                                            \
1195         if ( foldlen>0 ) {                                                          \
1196             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1197             foldlen -= len;                                                         \
1198             uscan += len;                                                           \
1199             len=0;                                                                  \
1200         } else {                                                                    \
1201             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1202             len = UTF8SKIP(uc);                                                     \
1203             skiplen = UNISKIP( uvc );                                               \
1204             foldlen -= skiplen;                                                     \
1205             uscan = foldbuf + skiplen;                                              \
1206         }                                                                           \
1207         break;                                                                      \
1208     case trie_latin_utf8_fold:                                                      \
1209         if ( foldlen>0 ) {                                                          \
1210             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1211             foldlen -= len;                                                         \
1212             uscan += len;                                                           \
1213             len=0;                                                                  \
1214         } else {                                                                    \
1215             len = 1;                                                                \
1216             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1217             skiplen = UNISKIP( uvc );                                               \
1218             foldlen -= skiplen;                                                     \
1219             uscan = foldbuf + skiplen;                                              \
1220         }                                                                           \
1221         break;                                                                      \
1222     case trie_utf8:                                                                 \
1223         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1224         break;                                                                      \
1225     case trie_plain:                                                                \
1226         uvc = (UV)*uc;                                                              \
1227         len = 1;                                                                    \
1228     }                                                                               \
1229     if (uvc < 256) {                                                                \
1230         charid = trie->charmap[ uvc ];                                              \
1231     }                                                                               \
1232     else {                                                                          \
1233         charid = 0;                                                                 \
1234         if (widecharmap) {                                                          \
1235             SV** const svpp = hv_fetch(widecharmap,                                 \
1236                         (char*)&uvc, sizeof(UV), 0);                                \
1237             if (svpp)                                                               \
1238                 charid = (U16)SvIV(*svpp);                                          \
1239         }                                                                           \
1240     }                                                                               \
1241 } STMT_END
1242
1243 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1244 STMT_START {                                              \
1245     while (s <= e) {                                      \
1246         if ( (CoNd)                                       \
1247              && (ln == 1 || folder(s, pat_string, ln))    \
1248              && (!reginfo || regtry(reginfo, &s)) )       \
1249             goto got_it;                                  \
1250         s++;                                              \
1251     }                                                     \
1252 } STMT_END
1253
1254 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1255 STMT_START {                                          \
1256     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1257         CoDe                                          \
1258         s += uskip;                                   \
1259     }                                                 \
1260 } STMT_END
1261
1262 #define REXEC_FBC_SCAN(CoDe)                          \
1263 STMT_START {                                          \
1264     while (s < strend) {                              \
1265         CoDe                                          \
1266         s++;                                          \
1267     }                                                 \
1268 } STMT_END
1269
1270 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1271 REXEC_FBC_UTF8_SCAN(                                  \
1272     if (CoNd) {                                       \
1273         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1274             goto got_it;                              \
1275         else                                          \
1276             tmp = doevery;                            \
1277     }                                                 \
1278     else                                              \
1279         tmp = 1;                                      \
1280 )
1281
1282 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1283 REXEC_FBC_SCAN(                                       \
1284     if (CoNd) {                                       \
1285         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1286             goto got_it;                              \
1287         else                                          \
1288             tmp = doevery;                            \
1289     }                                                 \
1290     else                                              \
1291         tmp = 1;                                      \
1292 )
1293
1294 #define REXEC_FBC_TRYIT               \
1295 if ((!reginfo || regtry(reginfo, &s))) \
1296     goto got_it
1297
1298 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1299     if (utf8_target) {                                             \
1300         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1301     }                                                          \
1302     else {                                                     \
1303         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1304     }
1305
1306 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1307     if (utf8_target) {                                             \
1308         UtFpReLoAd;                                            \
1309         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1310     }                                                          \
1311     else {                                                     \
1312         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1313     }
1314
1315 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1316     PL_reg_flags |= RF_tainted;                                \
1317     if (utf8_target) {                                             \
1318         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1319     }                                                          \
1320     else {                                                     \
1321         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1322     }
1323
1324 #define DUMP_EXEC_POS(li,s,doutf8) \
1325     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
1327
1328 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1330         tmp = TEST_NON_UTF8(tmp);                                              \
1331         REXEC_FBC_UTF8_SCAN(                                                   \
1332             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333                 tmp = !tmp;                                                    \
1334                 IF_SUCCESS;                                                    \
1335             }                                                                  \
1336             else {                                                             \
1337                 IF_FAIL;                                                       \
1338             }                                                                  \
1339         );                                                                     \
1340
1341 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342         if (s == PL_bostr) {                                                   \
1343             tmp = '\n';                                                        \
1344         }                                                                      \
1345         else {                                                                 \
1346             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1347             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1348         }                                                                      \
1349         tmp = TeSt1_UtF8;                                                      \
1350         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1351         REXEC_FBC_UTF8_SCAN(                                                   \
1352             if (tmp == ! (TeSt2_UtF8)) { \
1353                 tmp = !tmp;                                                    \
1354                 IF_SUCCESS;                                                    \
1355             }                                                                  \
1356             else {                                                             \
1357                 IF_FAIL;                                                       \
1358             }                                                                  \
1359         );                                                                     \
1360
1361 /* The only difference between the BOUND and NBOUND cases is that
1362  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1364  * with the other one being empty */
1365 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1366     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1367
1368 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1369     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1370
1371 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1372     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1373
1374 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1375     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1376
1377
1378 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1379  * be passed in completely with the variable name being tested, which isn't
1380  * such a clean interface, but this is easier to read than it was before.  We
1381  * are looking for the boundary (or non-boundary between a word and non-word
1382  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1383  * must be different.  Find the "wordness" of the character just prior to this
1384  * one, and compare it with the wordness of this one.  If they differ, we have
1385  * a boundary.  At the beginning of the string, pretend that the previous
1386  * character was a new-line */
1387 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1388     if (utf8_target) {                                                         \
1389                 UTF8_CODE \
1390     }                                                                          \
1391     else {  /* Not utf8 */                                                     \
1392         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1393         tmp = TEST_NON_UTF8(tmp);                                              \
1394         REXEC_FBC_SCAN(                                                        \
1395             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1396                 tmp = !tmp;                                                    \
1397                 IF_SUCCESS;                                                    \
1398             }                                                                  \
1399             else {                                                             \
1400                 IF_FAIL;                                                       \
1401             }                                                                  \
1402         );                                                                     \
1403     }                                                                          \
1404     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1405         goto got_it;
1406
1407 /* We know what class REx starts with.  Try to find this position... */
1408 /* if reginfo is NULL, its a dryrun */
1409 /* annoyingly all the vars in this routine have different names from their counterparts
1410    in regmatch. /grrr */
1411
1412 STATIC char *
1413 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1414     const char *strend, regmatch_info *reginfo)
1415 {
1416         dVAR;
1417         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1418         char *pat_string;   /* The pattern's exactish string */
1419         char *pat_end;      /* ptr to end char of pat_string */
1420         re_fold_t folder;       /* Function for computing non-utf8 folds */
1421         const U8 *fold_array;   /* array for folding ords < 256 */
1422         STRLEN ln;
1423         STRLEN lnc;
1424         STRLEN uskip;
1425         U8 c1;
1426         U8 c2;
1427         char *e;
1428         I32 tmp = 1;    /* Scratch variable? */
1429         const bool utf8_target = PL_reg_match_utf8;
1430         UV utf8_fold_flags = 0;
1431         RXi_GET_DECL(prog,progi);
1432
1433         PERL_ARGS_ASSERT_FIND_BYCLASS;
1434
1435         /* We know what class it must start with. */
1436         switch (OP(c)) {
1437         case ANYOFV:
1438         case ANYOF:
1439             if (utf8_target || OP(c) == ANYOFV) {
1440                 STRLEN inclasslen = strend - s;
1441                 REXEC_FBC_UTF8_CLASS_SCAN(
1442                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1443             }
1444             else {
1445                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1446             }
1447             break;
1448         case CANY:
1449             REXEC_FBC_SCAN(
1450                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1451                     goto got_it;
1452                 else
1453                     tmp = doevery;
1454             );
1455             break;
1456
1457         case EXACTFA:
1458             if (UTF_PATTERN || utf8_target) {
1459                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1460                 goto do_exactf_utf8;
1461             }
1462             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1463             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1464             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1465
1466         case EXACTF:
1467             if (utf8_target) {
1468
1469                 /* regcomp.c already folded this if pattern is in UTF-8 */
1470                 utf8_fold_flags = 0;
1471                 goto do_exactf_utf8;
1472             }
1473             fold_array = PL_fold;
1474             folder = foldEQ;
1475             goto do_exactf_non_utf8;
1476
1477         case EXACTFL:
1478             if (UTF_PATTERN || utf8_target) {
1479                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1480                 goto do_exactf_utf8;
1481             }
1482             fold_array = PL_fold_locale;
1483             folder = foldEQ_locale;
1484             goto do_exactf_non_utf8;
1485
1486         case EXACTFU_SS:
1487             if (UTF_PATTERN) {
1488                 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1489             }
1490             goto do_exactf_utf8;
1491
1492         case EXACTFU_TRICKYFOLD:
1493         case EXACTFU:
1494             if (UTF_PATTERN || utf8_target) {
1495                 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1496                 goto do_exactf_utf8;
1497             }
1498
1499             /* Any 'ss' in the pattern should have been replaced by regcomp,
1500              * so we don't have to worry here about this single special case
1501              * in the Latin1 range */
1502             fold_array = PL_fold_latin1;
1503             folder = foldEQ_latin1;
1504
1505             /* FALL THROUGH */
1506
1507         do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1508                                are no glitches with fold-length differences
1509                                between the target string and pattern */
1510
1511             /* The idea in the non-utf8 EXACTF* cases is to first find the
1512              * first character of the EXACTF* node and then, if necessary,
1513              * case-insensitively compare the full text of the node.  c1 is the
1514              * first character.  c2 is its fold.  This logic will not work for
1515              * Unicode semantics and the german sharp ss, which hence should
1516              * not be compiled into a node that gets here. */
1517             pat_string = STRING(c);
1518             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1519
1520             /* We know that we have to match at least 'ln' bytes (which is the
1521              * same as characters, since not utf8).  If we have to match 3
1522              * characters, and there are only 2 availabe, we know without
1523              * trying that it will fail; so don't start a match past the
1524              * required minimum number from the far end */
1525             e = HOP3c(strend, -((I32)ln), s);
1526
1527             if (!reginfo && e < s) {
1528                 e = s;                  /* Due to minlen logic of intuit() */
1529             }
1530
1531             c1 = *pat_string;
1532             c2 = fold_array[c1];
1533             if (c1 == c2) { /* If char and fold are the same */
1534                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1535             }
1536             else {
1537                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1538             }
1539             break;
1540
1541         do_exactf_utf8:
1542         {
1543             unsigned expansion;
1544
1545
1546             /* If one of the operands is in utf8, we can't use the simpler
1547              * folding above, due to the fact that many different characters
1548              * can have the same fold, or portion of a fold, or different-
1549              * length fold */
1550             pat_string = STRING(c);
1551             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1552             pat_end = pat_string + ln;
1553             lnc = (UTF_PATTERN) /* length to match in characters */
1554                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1555                     : ln;
1556
1557             /* We have 'lnc' characters to match in the pattern, but because of
1558              * multi-character folding, each character in the target can match
1559              * up to 3 characters (Unicode guarantees it will never exceed
1560              * this) if it is utf8-encoded; and up to 2 if not (based on the
1561              * fact that the Latin 1 folds are already determined, and the
1562              * only multi-char fold in that range is the sharp-s folding to
1563              * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1564              * string character.  Adjust lnc accordingly, rounding up, so that
1565              * if we need to match at least 4+1/3 chars, that really is 5. */
1566             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1567             lnc = (lnc + expansion - 1) / expansion;
1568
1569             /* As in the non-UTF8 case, if we have to match 3 characters, and
1570              * only 2 are left, it's guaranteed to fail, so don't start a
1571              * match that would require us to go beyond the end of the string
1572              */
1573             e = HOP3c(strend, -((I32)lnc), s);
1574
1575             if (!reginfo && e < s) {
1576                 e = s;                  /* Due to minlen logic of intuit() */
1577             }
1578
1579             /* XXX Note that we could recalculate e to stop the loop earlier,
1580              * as the worst case expansion above will rarely be met, and as we
1581              * go along we would usually find that e moves further to the left.
1582              * This would happen only after we reached the point in the loop
1583              * where if there were no expansion we should fail.  Unclear if
1584              * worth the expense */
1585
1586             while (s <= e) {
1587                 char *my_strend= (char *)strend;
1588                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1589                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1590                     && (!reginfo || regtry(reginfo, &s)) )
1591                 {
1592                     goto got_it;
1593                 }
1594                 s += (utf8_target) ? UTF8SKIP(s) : 1;
1595             }
1596             break;
1597         }
1598         case BOUNDL:
1599             PL_reg_flags |= RF_tainted;
1600             FBC_BOUND(isALNUM_LC,
1601                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1602                       isALNUM_LC_utf8((U8*)s));
1603             break;
1604         case NBOUNDL:
1605             PL_reg_flags |= RF_tainted;
1606             FBC_NBOUND(isALNUM_LC,
1607                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1608                        isALNUM_LC_utf8((U8*)s));
1609             break;
1610         case BOUND:
1611             FBC_BOUND(isWORDCHAR,
1612                       isALNUM_uni(tmp),
1613                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614             break;
1615         case BOUNDA:
1616             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1617                              isWORDCHAR_A(tmp),
1618                              isWORDCHAR_A((U8*)s));
1619             break;
1620         case NBOUND:
1621             FBC_NBOUND(isWORDCHAR,
1622                        isALNUM_uni(tmp),
1623                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1624             break;
1625         case NBOUNDA:
1626             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1627                               isWORDCHAR_A(tmp),
1628                               isWORDCHAR_A((U8*)s));
1629             break;
1630         case BOUNDU:
1631             FBC_BOUND(isWORDCHAR_L1,
1632                       isALNUM_uni(tmp),
1633                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1634             break;
1635         case NBOUNDU:
1636             FBC_NBOUND(isWORDCHAR_L1,
1637                        isALNUM_uni(tmp),
1638                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1639             break;
1640         case ALNUML:
1641             REXEC_FBC_CSCAN_TAINT(
1642                 isALNUM_LC_utf8((U8*)s),
1643                 isALNUM_LC(*s)
1644             );
1645             break;
1646         case ALNUMU:
1647             REXEC_FBC_CSCAN_PRELOAD(
1648                 LOAD_UTF8_CHARCLASS_ALNUM(),
1649                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1650                 isWORDCHAR_L1((U8) *s)
1651             );
1652             break;
1653         case ALNUM:
1654             REXEC_FBC_CSCAN_PRELOAD(
1655                 LOAD_UTF8_CHARCLASS_ALNUM(),
1656                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1657                 isWORDCHAR((U8) *s)
1658             );
1659             break;
1660         case ALNUMA:
1661             /* Don't need to worry about utf8, as it can match only a single
1662              * byte invariant character */
1663             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1664             break;
1665         case NALNUMU:
1666             REXEC_FBC_CSCAN_PRELOAD(
1667                 LOAD_UTF8_CHARCLASS_ALNUM(),
1668                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1669                 ! isWORDCHAR_L1((U8) *s)
1670             );
1671             break;
1672         case NALNUM:
1673             REXEC_FBC_CSCAN_PRELOAD(
1674                 LOAD_UTF8_CHARCLASS_ALNUM(),
1675                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1676                 ! isALNUM(*s)
1677             );
1678             break;
1679         case NALNUMA:
1680             REXEC_FBC_CSCAN(
1681                 !isWORDCHAR_A(*s),
1682                 !isWORDCHAR_A(*s)
1683             );
1684             break;
1685         case NALNUML:
1686             REXEC_FBC_CSCAN_TAINT(
1687                 !isALNUM_LC_utf8((U8*)s),
1688                 !isALNUM_LC(*s)
1689             );
1690             break;
1691         case SPACEU:
1692             REXEC_FBC_CSCAN_PRELOAD(
1693                 LOAD_UTF8_CHARCLASS_SPACE(),
1694                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1695                 isSPACE_L1((U8) *s)
1696             );
1697             break;
1698         case SPACE:
1699             REXEC_FBC_CSCAN_PRELOAD(
1700                 LOAD_UTF8_CHARCLASS_SPACE(),
1701                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1702                 isSPACE((U8) *s)
1703             );
1704             break;
1705         case SPACEA:
1706             /* Don't need to worry about utf8, as it can match only a single
1707              * byte invariant character */
1708             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1709             break;
1710         case SPACEL:
1711             REXEC_FBC_CSCAN_TAINT(
1712                 isSPACE_LC_utf8((U8*)s),
1713                 isSPACE_LC(*s)
1714             );
1715             break;
1716         case NSPACEU:
1717             REXEC_FBC_CSCAN_PRELOAD(
1718                 LOAD_UTF8_CHARCLASS_SPACE(),
1719                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1720                 ! isSPACE_L1((U8) *s)
1721             );
1722             break;
1723         case NSPACE:
1724             REXEC_FBC_CSCAN_PRELOAD(
1725                 LOAD_UTF8_CHARCLASS_SPACE(),
1726                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1727                 ! isSPACE((U8) *s)
1728             );
1729             break;
1730         case NSPACEA:
1731             REXEC_FBC_CSCAN(
1732                 !isSPACE_A(*s),
1733                 !isSPACE_A(*s)
1734             );
1735             break;
1736         case NSPACEL:
1737             REXEC_FBC_CSCAN_TAINT(
1738                 !isSPACE_LC_utf8((U8*)s),
1739                 !isSPACE_LC(*s)
1740             );
1741             break;
1742         case DIGIT:
1743             REXEC_FBC_CSCAN_PRELOAD(
1744                 LOAD_UTF8_CHARCLASS_DIGIT(),
1745                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1746                 isDIGIT(*s)
1747             );
1748             break;
1749         case DIGITA:
1750             /* Don't need to worry about utf8, as it can match only a single
1751              * byte invariant character */
1752             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1753             break;
1754         case DIGITL:
1755             REXEC_FBC_CSCAN_TAINT(
1756                 isDIGIT_LC_utf8((U8*)s),
1757                 isDIGIT_LC(*s)
1758             );
1759             break;
1760         case NDIGIT:
1761             REXEC_FBC_CSCAN_PRELOAD(
1762                 LOAD_UTF8_CHARCLASS_DIGIT(),
1763                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1764                 !isDIGIT(*s)
1765             );
1766             break;
1767         case NDIGITA:
1768             REXEC_FBC_CSCAN(
1769                 !isDIGIT_A(*s),
1770                 !isDIGIT_A(*s)
1771             );
1772             break;
1773         case NDIGITL:
1774             REXEC_FBC_CSCAN_TAINT(
1775                 !isDIGIT_LC_utf8((U8*)s),
1776                 !isDIGIT_LC(*s)
1777             );
1778             break;
1779         case LNBREAK:
1780             REXEC_FBC_CSCAN(
1781                 is_LNBREAK_utf8(s),
1782                 is_LNBREAK_latin1(s)
1783             );
1784             break;
1785         case VERTWS:
1786             REXEC_FBC_CSCAN(
1787                 is_VERTWS_utf8(s),
1788                 is_VERTWS_latin1(s)
1789             );
1790             break;
1791         case NVERTWS:
1792             REXEC_FBC_CSCAN(
1793                 !is_VERTWS_utf8(s),
1794                 !is_VERTWS_latin1(s)
1795             );
1796             break;
1797         case HORIZWS:
1798             REXEC_FBC_CSCAN(
1799                 is_HORIZWS_utf8(s),
1800                 is_HORIZWS_latin1(s)
1801             );
1802             break;
1803         case NHORIZWS:
1804             REXEC_FBC_CSCAN(
1805                 !is_HORIZWS_utf8(s),
1806                 !is_HORIZWS_latin1(s)
1807             );
1808             break;
1809         case POSIXA:
1810             /* Don't need to worry about utf8, as it can match only a single
1811             * byte invariant character.  The flag in this node type is the
1812             * class number to pass to _generic_isCC() to build a mask for
1813             * searching in PL_charclass[] */
1814             REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1815             break;
1816         case NPOSIXA:
1817             REXEC_FBC_CSCAN(
1818                 !_generic_isCC_A(*s, FLAGS(c)),
1819                 !_generic_isCC_A(*s, FLAGS(c))
1820             );
1821             break;
1822
1823         case AHOCORASICKC:
1824         case AHOCORASICK:
1825             {
1826                 DECL_TRIE_TYPE(c);
1827                 /* what trie are we using right now */
1828                 reg_ac_data *aho
1829                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1830                 reg_trie_data *trie
1831                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1832                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1833
1834                 const char *last_start = strend - trie->minlen;
1835 #ifdef DEBUGGING
1836                 const char *real_start = s;
1837 #endif
1838                 STRLEN maxlen = trie->maxlen;
1839                 SV *sv_points;
1840                 U8 **points; /* map of where we were in the input string
1841                                 when reading a given char. For ASCII this
1842                                 is unnecessary overhead as the relationship
1843                                 is always 1:1, but for Unicode, especially
1844                                 case folded Unicode this is not true. */
1845                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1846                 U8 *bitmap=NULL;
1847
1848
1849                 GET_RE_DEBUG_FLAGS_DECL;
1850
1851                 /* We can't just allocate points here. We need to wrap it in
1852                  * an SV so it gets freed properly if there is a croak while
1853                  * running the match */
1854                 ENTER;
1855                 SAVETMPS;
1856                 sv_points=newSV(maxlen * sizeof(U8 *));
1857                 SvCUR_set(sv_points,
1858                     maxlen * sizeof(U8 *));
1859                 SvPOK_on(sv_points);
1860                 sv_2mortal(sv_points);
1861                 points=(U8**)SvPV_nolen(sv_points );
1862                 if ( trie_type != trie_utf8_fold
1863                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1864                 {
1865                     if (trie->bitmap)
1866                         bitmap=(U8*)trie->bitmap;
1867                     else
1868                         bitmap=(U8*)ANYOF_BITMAP(c);
1869                 }
1870                 /* this is the Aho-Corasick algorithm modified a touch
1871                    to include special handling for long "unknown char"
1872                    sequences. The basic idea being that we use AC as long
1873                    as we are dealing with a possible matching char, when
1874                    we encounter an unknown char (and we have not encountered
1875                    an accepting state) we scan forward until we find a legal
1876                    starting char.
1877                    AC matching is basically that of trie matching, except
1878                    that when we encounter a failing transition, we fall back
1879                    to the current states "fail state", and try the current char
1880                    again, a process we repeat until we reach the root state,
1881                    state 1, or a legal transition. If we fail on the root state
1882                    then we can either terminate if we have reached an accepting
1883                    state previously, or restart the entire process from the beginning
1884                    if we have not.
1885
1886                  */
1887                 while (s <= last_start) {
1888                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1889                     U8 *uc = (U8*)s;
1890                     U16 charid = 0;
1891                     U32 base = 1;
1892                     U32 state = 1;
1893                     UV uvc = 0;
1894                     STRLEN len = 0;
1895                     STRLEN foldlen = 0;
1896                     U8 *uscan = (U8*)NULL;
1897                     U8 *leftmost = NULL;
1898 #ifdef DEBUGGING
1899                     U32 accepted_word= 0;
1900 #endif
1901                     U32 pointpos = 0;
1902
1903                     while ( state && uc <= (U8*)strend ) {
1904                         int failed=0;
1905                         U32 word = aho->states[ state ].wordnum;
1906
1907                         if( state==1 ) {
1908                             if ( bitmap ) {
1909                                 DEBUG_TRIE_EXECUTE_r(
1910                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1911                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1912                                             (char *)uc, utf8_target );
1913                                         PerlIO_printf( Perl_debug_log,
1914                                             " Scanning for legal start char...\n");
1915                                     }
1916                                 );
1917                                 if (utf8_target) {
1918                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919                                         uc += UTF8SKIP(uc);
1920                                     }
1921                                 } else {
1922                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1923                                         uc++;
1924                                     }
1925                                 }
1926                                 s= (char *)uc;
1927                             }
1928                             if (uc >(U8*)last_start) break;
1929                         }
1930
1931                         if ( word ) {
1932                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1933                             if (!leftmost || lpos < leftmost) {
1934                                 DEBUG_r(accepted_word=word);
1935                                 leftmost= lpos;
1936                             }
1937                             if (base==0) break;
1938
1939                         }
1940                         points[pointpos++ % maxlen]= uc;
1941                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1942                                              uscan, len, uvc, charid, foldlen,
1943                                              foldbuf, uniflags);
1944                         DEBUG_TRIE_EXECUTE_r({
1945                             dump_exec_pos( (char *)uc, c, strend, real_start,
1946                                 s,   utf8_target );
1947                             PerlIO_printf(Perl_debug_log,
1948                                 " Charid:%3u CP:%4"UVxf" ",
1949                                  charid, uvc);
1950                         });
1951
1952                         do {
1953 #ifdef DEBUGGING
1954                             word = aho->states[ state ].wordnum;
1955 #endif
1956                             base = aho->states[ state ].trans.base;
1957
1958                             DEBUG_TRIE_EXECUTE_r({
1959                                 if (failed)
1960                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1961                                         s,   utf8_target );
1962                                 PerlIO_printf( Perl_debug_log,
1963                                     "%sState: %4"UVxf", word=%"UVxf,
1964                                     failed ? " Fail transition to " : "",
1965                                     (UV)state, (UV)word);
1966                             });
1967                             if ( base ) {
1968                                 U32 tmp;
1969                                 I32 offset;
1970                                 if (charid &&
1971                                      ( ((offset = base + charid
1972                                         - 1 - trie->uniquecharcount)) >= 0)
1973                                      && ((U32)offset < trie->lasttrans)
1974                                      && trie->trans[offset].check == state
1975                                      && (tmp=trie->trans[offset].next))
1976                                 {
1977                                     DEBUG_TRIE_EXECUTE_r(
1978                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1979                                     state = tmp;
1980                                     break;
1981                                 }
1982                                 else {
1983                                     DEBUG_TRIE_EXECUTE_r(
1984                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1985                                     failed = 1;
1986                                     state = aho->fail[state];
1987                                 }
1988                             }
1989                             else {
1990                                 /* we must be accepting here */
1991                                 DEBUG_TRIE_EXECUTE_r(
1992                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1993                                 failed = 1;
1994                                 break;
1995                             }
1996                         } while(state);
1997                         uc += len;
1998                         if (failed) {
1999                             if (leftmost)
2000                                 break;
2001                             if (!state) state = 1;
2002                         }
2003                     }
2004                     if ( aho->states[ state ].wordnum ) {
2005                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2006                         if (!leftmost || lpos < leftmost) {
2007                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2008                             leftmost = lpos;
2009                         }
2010                     }
2011                     if (leftmost) {
2012                         s = (char*)leftmost;
2013                         DEBUG_TRIE_EXECUTE_r({
2014                             PerlIO_printf(
2015                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2016                                 (UV)accepted_word, (IV)(s - real_start)
2017                             );
2018                         });
2019                         if (!reginfo || regtry(reginfo, &s)) {
2020                             FREETMPS;
2021                             LEAVE;
2022                             goto got_it;
2023                         }
2024                         s = HOPc(s,1);
2025                         DEBUG_TRIE_EXECUTE_r({
2026                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2027                         });
2028                     } else {
2029                         DEBUG_TRIE_EXECUTE_r(
2030                             PerlIO_printf( Perl_debug_log,"No match.\n"));
2031                         break;
2032                     }
2033                 }
2034                 FREETMPS;
2035                 LEAVE;
2036             }
2037             break;
2038         default:
2039             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2040             break;
2041         }
2042         return 0;
2043       got_it:
2044         return s;
2045 }
2046
2047
2048 /*
2049  - regexec_flags - match a regexp against a string
2050  */
2051 I32
2052 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2053               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2054 /* strend: pointer to null at end of string */
2055 /* strbeg: real beginning of string */
2056 /* minend: end of match must be >=minend after stringarg. */
2057 /* data: May be used for some additional optimizations.
2058          Currently its only used, with a U32 cast, for transmitting
2059          the ganch offset when doing a /g match. This will change */
2060 /* nosave: For optimizations. */
2061 {
2062     dVAR;
2063     struct regexp *const prog = (struct regexp *)SvANY(rx);
2064     /*register*/ char *s;
2065     regnode *c;
2066     /*register*/ char *startpos = stringarg;
2067     I32 minlen;         /* must match at least this many chars */
2068     I32 dontbother = 0; /* how many characters not to try at end */
2069     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2070     I32 scream_pos = -1;                /* Internal iterator of scream. */
2071     char *scream_olds = NULL;
2072     const bool utf8_target = cBOOL(DO_UTF8(sv));
2073     I32 multiline;
2074     RXi_GET_DECL(prog,progi);
2075     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2076     regexp_paren_pair *swap = NULL;
2077     GET_RE_DEBUG_FLAGS_DECL;
2078
2079     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2080     PERL_UNUSED_ARG(data);
2081
2082     /* Be paranoid... */
2083     if (prog == NULL || startpos == NULL) {
2084         Perl_croak(aTHX_ "NULL regexp parameter");
2085         return 0;
2086     }
2087
2088     multiline = prog->extflags & RXf_PMf_MULTILINE;
2089     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2090
2091     RX_MATCH_UTF8_set(rx, utf8_target);
2092     DEBUG_EXECUTE_r(
2093         debug_start_match(rx, utf8_target, startpos, strend,
2094         "Matching");
2095     );
2096
2097     minlen = prog->minlen;
2098
2099     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2100         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2101                               "String too short [regexec_flags]...\n"));
2102         goto phooey;
2103     }
2104
2105
2106     /* Check validity of program. */
2107     if (UCHARAT(progi->program) != REG_MAGIC) {
2108         Perl_croak(aTHX_ "corrupted regexp program");
2109     }
2110
2111     PL_reg_flags = 0;
2112     PL_reg_state.re_state_eval_setup_done = FALSE;
2113     PL_reg_maxiter = 0;
2114
2115     if (RX_UTF8(rx))
2116         PL_reg_flags |= RF_utf8;
2117
2118     /* Mark beginning of line for ^ and lookbehind. */
2119     reginfo.bol = startpos; /* XXX not used ??? */
2120     PL_bostr  = strbeg;
2121     reginfo.sv = sv;
2122
2123     /* Mark end of line for $ (and such) */
2124     PL_regeol = strend;
2125
2126     /* see how far we have to get to not match where we matched before */
2127     reginfo.till = startpos+minend;
2128
2129     /* If there is a "must appear" string, look for it. */
2130     s = startpos;
2131
2132     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2133         MAGIC *mg;
2134         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2135             reginfo.ganch = startpos + prog->gofs;
2136             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2137               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2138         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2139                   && SvMAGIC(sv)
2140                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141                   && mg->mg_len >= 0) {
2142             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2143             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2144                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2145
2146             if (prog->extflags & RXf_ANCH_GPOS) {
2147                 if (s > reginfo.ganch)
2148                     goto phooey;
2149                 s = reginfo.ganch - prog->gofs;
2150                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2151                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2152                 if (s < strbeg)
2153                     goto phooey;
2154             }
2155         }
2156         else if (data) {
2157             reginfo.ganch = strbeg + PTR2UV(data);
2158             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161         } else {                                /* pos() not defined */
2162             reginfo.ganch = strbeg;
2163             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164                  "GPOS: reginfo.ganch = strbeg\n"));
2165         }
2166     }
2167     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2168         /* We have to be careful. If the previous successful match
2169            was from this regex we don't want a subsequent partially
2170            successful match to clobber the old results.
2171            So when we detect this possibility we add a swap buffer
2172            to the re, and switch the buffer each match. If we fail
2173            we switch it back, otherwise we leave it swapped.
2174         */
2175         swap = prog->offs;
2176         /* do we need a save destructor here for eval dies? */
2177         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2178         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180             PTR2UV(prog),
2181             PTR2UV(swap),
2182             PTR2UV(prog->offs)
2183         ));
2184     }
2185     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2186         re_scream_pos_data d;
2187
2188         d.scream_olds = &scream_olds;
2189         d.scream_pos = &scream_pos;
2190         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2191         if (!s) {
2192             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2193             goto phooey;        /* not present */
2194         }
2195     }
2196
2197
2198
2199     /* Simplest case:  anchored match need be tried only once. */
2200     /*  [unless only anchor is BOL and multiline is set] */
2201     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2202         if (s == startpos && regtry(&reginfo, &startpos))
2203             goto got_it;
2204         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2206         {
2207             char *end;
2208
2209             if (minlen)
2210                 dontbother = minlen - 1;
2211             end = HOP3c(strend, -dontbother, strbeg) - 1;
2212             /* for multiline we only have to try after newlines */
2213             if (prog->check_substr || prog->check_utf8) {
2214                 /* because of the goto we can not easily reuse the macros for bifurcating the
2215                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216                 if (utf8_target) {
2217                     if (s == startpos)
2218                         goto after_try_utf8;
2219                     while (1) {
2220                         if (regtry(&reginfo, &s)) {
2221                             goto got_it;
2222                         }
2223                       after_try_utf8:
2224                         if (s > end) {
2225                             goto phooey;
2226                         }
2227                         if (prog->extflags & RXf_USE_INTUIT) {
2228                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229                             if (!s) {
2230                                 goto phooey;
2231                             }
2232                         }
2233                         else {
2234                             s += UTF8SKIP(s);
2235                         }
2236                     }
2237                 } /* end search for check string in unicode */
2238                 else {
2239                     if (s == startpos) {
2240                         goto after_try_latin;
2241                     }
2242                     while (1) {
2243                         if (regtry(&reginfo, &s)) {
2244                             goto got_it;
2245                         }
2246                       after_try_latin:
2247                         if (s > end) {
2248                             goto phooey;
2249                         }
2250                         if (prog->extflags & RXf_USE_INTUIT) {
2251                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252                             if (!s) {
2253                                 goto phooey;
2254                             }
2255                         }
2256                         else {
2257                             s++;
2258                         }
2259                     }
2260                 } /* end search for check string in latin*/
2261             } /* end search for check string */
2262             else { /* search for newline */
2263                 if (s > startpos) {
2264                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2265                     s--;
2266                 }
2267                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268                 while (s <= end) { /* note it could be possible to match at the end of the string */
2269                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2270                         if (regtry(&reginfo, &s))
2271                             goto got_it;
2272                     }
2273                 }
2274             } /* end search for newline */
2275         } /* end anchored/multiline check string search */
2276         goto phooey;
2277     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2278     {
2279         /* the warning about reginfo.ganch being used without initialization
2280            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2281            and we only enter this block when the same bit is set. */
2282         char *tmp_s = reginfo.ganch - prog->gofs;
2283
2284         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2285             goto got_it;
2286         goto phooey;
2287     }
2288
2289     /* Messy cases:  unanchored match. */
2290     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2291         /* we have /x+whatever/ */
2292         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2293         char ch;
2294 #ifdef DEBUGGING
2295         int did_match = 0;
2296 #endif
2297         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2300
2301         if (utf8_target) {
2302             REXEC_FBC_SCAN(
2303                 if (*s == ch) {
2304                     DEBUG_EXECUTE_r( did_match = 1 );
2305                     if (regtry(&reginfo, &s)) goto got_it;
2306                     s += UTF8SKIP(s);
2307                     while (s < strend && *s == ch)
2308                         s += UTF8SKIP(s);
2309                 }
2310             );
2311         }
2312         else {
2313             REXEC_FBC_SCAN(
2314                 if (*s == ch) {
2315                     DEBUG_EXECUTE_r( did_match = 1 );
2316                     if (regtry(&reginfo, &s)) goto got_it;
2317                     s++;
2318                     while (s < strend && *s == ch)
2319                         s++;
2320                 }
2321             );
2322         }
2323         DEBUG_EXECUTE_r(if (!did_match)
2324                 PerlIO_printf(Perl_debug_log,
2325                                   "Did not find anchored character...\n")
2326                );
2327     }
2328     else if (prog->anchored_substr != NULL
2329               || prog->anchored_utf8 != NULL
2330               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2331                   && prog->float_max_offset < strend - s)) {
2332         SV *must;
2333         I32 back_max;
2334         I32 back_min;
2335         char *last;
2336         char *last1;            /* Last position checked before */
2337 #ifdef DEBUGGING
2338         int did_match = 0;
2339 #endif
2340         if (prog->anchored_substr || prog->anchored_utf8) {
2341             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2344             back_max = back_min = prog->anchored_offset;
2345         } else {
2346             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2349             back_max = prog->float_max_offset;
2350             back_min = prog->float_min_offset;
2351         }
2352
2353
2354         if (must == &PL_sv_undef)
2355             /* could not downgrade utf8 check substring, so must fail */
2356             goto phooey;
2357
2358         if (back_min<0) {
2359             last = strend;
2360         } else {
2361             last = HOP3c(strend,        /* Cannot start after this */
2362                   -(I32)(CHR_SVLEN(must)
2363                          - (SvTAIL(must) != 0) + back_min), strbeg);
2364         }
2365         if (s > PL_bostr)
2366             last1 = HOPc(s, -1);
2367         else
2368             last1 = s - 1;      /* bogus */
2369
2370         /* XXXX check_substr already used to find "s", can optimize if
2371            check_substr==must. */
2372         scream_pos = -1;
2373         dontbother = end_shift;
2374         strend = HOPc(strend, -dontbother);
2375         while ( (s <= last) &&
2376                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2377                                   (unsigned char*)strend, must,
2378                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2379             DEBUG_EXECUTE_r( did_match = 1 );
2380             if (HOPc(s, -back_max) > last1) {
2381                 last1 = HOPc(s, -back_min);
2382                 s = HOPc(s, -back_max);
2383             }
2384             else {
2385                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2386
2387                 last1 = HOPc(s, -back_min);
2388                 s = t;
2389             }
2390             if (utf8_target) {
2391                 while (s <= last1) {
2392                     if (regtry(&reginfo, &s))
2393                         goto got_it;
2394                     s += UTF8SKIP(s);
2395                 }
2396             }
2397             else {
2398                 while (s <= last1) {
2399                     if (regtry(&reginfo, &s))
2400                         goto got_it;
2401                     s++;
2402                 }
2403             }
2404         }
2405         DEBUG_EXECUTE_r(if (!did_match) {
2406             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2407                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2409                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2410                                ? "anchored" : "floating"),
2411                 quoted, RE_SV_TAIL(must));
2412         });
2413         goto phooey;
2414     }
2415     else if ( (c = progi->regstclass) ) {
2416         if (minlen) {
2417             const OPCODE op = OP(progi->regstclass);
2418             /* don't bother with what can't match */
2419             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2420                 strend = HOPc(strend, -(minlen - 1));
2421         }
2422         DEBUG_EXECUTE_r({
2423             SV * const prop = sv_newmortal();
2424             regprop(prog, prop, c);
2425             {
2426                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2427                     s,strend-s,60);
2428                 PerlIO_printf(Perl_debug_log,
2429                     "Matching stclass %.*s against %s (%d bytes)\n",
2430                     (int)SvCUR(prop), SvPVX_const(prop),
2431                      quoted, (int)(strend - s));
2432             }
2433         });
2434         if (find_byclass(prog, c, s, strend, &reginfo))
2435             goto got_it;
2436         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2437     }
2438     else {
2439         dontbother = 0;
2440         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2441             /* Trim the end. */
2442             char *last= NULL;
2443             SV* float_real;
2444             STRLEN len;
2445             const char *little;
2446
2447             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2450
2451             little = SvPV_const(float_real, len);
2452             if (SvTAIL(float_real)) {
2453                     /* This means that float_real contains an artificial \n on the end
2454                      * due to the presence of something like this: /foo$/
2455                      * where we can match both "foo" and "foo\n" at the end of the string.
2456                      * So we have to compare the end of the string first against the float_real
2457                      * without the \n and then against the full float_real with the string.
2458                      * We have to watch out for cases where the string might be smaller
2459                      * than the float_real or the float_real without the \n.
2460                      */
2461                     char *checkpos= strend - len;
2462                     DEBUG_OPTIMISE_r(
2463                         PerlIO_printf(Perl_debug_log,
2464                             "%sChecking for float_real.%s\n",
2465                             PL_colors[4], PL_colors[5]));
2466                     if (checkpos + 1 < strbeg) {
2467                         /* can't match, even if we remove the trailing \n string is too short to match */
2468                         DEBUG_EXECUTE_r(
2469                             PerlIO_printf(Perl_debug_log,
2470                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2471                                 PL_colors[4], PL_colors[5]));
2472                         goto phooey;
2473                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2474                         /* can match, the end of the string matches without the "\n" */
2475                         last = checkpos + 1;
2476                     } else if (checkpos < strbeg) {
2477                         /* cant match, string is too short when the "\n" is included */
2478                         DEBUG_EXECUTE_r(
2479                             PerlIO_printf(Perl_debug_log,
2480                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2481                                 PL_colors[4], PL_colors[5]));
2482                         goto phooey;
2483                     } else if (!multiline) {
2484                         /* non multiline match, so compare with the "\n" at the end of the string */
2485                         if (memEQ(checkpos, little, len)) {
2486                             last= checkpos;
2487                         } else {
2488                             DEBUG_EXECUTE_r(
2489                                 PerlIO_printf(Perl_debug_log,
2490                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2491                                     PL_colors[4], PL_colors[5]));
2492                             goto phooey;
2493                         }
2494                     } else {
2495                         /* multiline match, so we have to search for a place where the full string is located */
2496                         goto find_last;
2497                     }
2498             } else {
2499                   find_last:
2500                     if (len)
2501                         last = rninstr(s, strend, little, little + len);
2502                     else
2503                         last = strend;  /* matching "$" */
2504             }
2505             if (!last) {
2506                 /* at one point this block contained a comment which was probably
2507                  * incorrect, which said that this was a "should not happen" case.
2508                  * Even if it was true when it was written I am pretty sure it is
2509                  * not anymore, so I have removed the comment and replaced it with
2510                  * this one. Yves */
2511                 DEBUG_EXECUTE_r(
2512                     PerlIO_printf(Perl_debug_log,
2513                         "String does not contain required substring, cannot match.\n"
2514                     ));
2515                 goto phooey;
2516             }
2517             dontbother = strend - last + prog->float_min_offset;
2518         }
2519         if (minlen && (dontbother < minlen))
2520             dontbother = minlen - 1;
2521         strend -= dontbother;              /* this one's always in bytes! */
2522         /* We don't know much -- general case. */
2523         if (utf8_target) {
2524             for (;;) {
2525                 if (regtry(&reginfo, &s))
2526                     goto got_it;
2527                 if (s >= strend)
2528                     break;
2529                 s += UTF8SKIP(s);
2530             };
2531         }
2532         else {
2533             do {
2534                 if (regtry(&reginfo, &s))
2535                     goto got_it;
2536             } while (s++ < strend);
2537         }
2538     }
2539
2540     /* Failure. */
2541     goto phooey;
2542
2543 got_it:
2544     DEBUG_BUFFERS_r(
2545         if (swap)
2546             PerlIO_printf(Perl_debug_log,
2547                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548                 PTR2UV(prog),
2549                 PTR2UV(swap)
2550             );
2551     );
2552     Safefree(swap);
2553     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2554
2555     if (PL_reg_state.re_state_eval_setup_done)
2556         restore_pos(aTHX_ prog);
2557     if (RXp_PAREN_NAMES(prog))
2558         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2559
2560     /* make sure $`, $&, $', and $digit will work later */
2561     if ( !(flags & REXEC_NOT_FIRST) ) {
2562         RX_MATCH_COPY_FREE(rx);
2563         if (flags & REXEC_COPY_STR) {
2564             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2565 #ifdef PERL_OLD_COPY_ON_WRITE
2566             if ((SvIsCOW(sv)
2567                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2568                 if (DEBUG_C_TEST) {
2569                     PerlIO_printf(Perl_debug_log,
2570                                   "Copy on write: regexp capture, type %d\n",
2571                                   (int) SvTYPE(sv));
2572                 }
2573                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2574                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2575                 assert (SvPOKp(prog->saved_copy));
2576             } else
2577 #endif
2578             {
2579                 RX_MATCH_COPIED_on(rx);
2580                 s = savepvn(strbeg, i);
2581                 prog->subbeg = s;
2582             }
2583             prog->sublen = i;
2584         }
2585         else {
2586             prog->subbeg = strbeg;
2587             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2588         }
2589     }
2590
2591     return 1;
2592
2593 phooey:
2594     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2595                           PL_colors[4], PL_colors[5]));
2596     if (PL_reg_state.re_state_eval_setup_done)
2597         restore_pos(aTHX_ prog);
2598     if (swap) {
2599         /* we failed :-( roll it back */
2600         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2601             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2602             PTR2UV(prog),
2603             PTR2UV(prog->offs),
2604             PTR2UV(swap)
2605         ));
2606         Safefree(prog->offs);
2607         prog->offs = swap;
2608     }
2609
2610     return 0;
2611 }
2612
2613
2614 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2615  * Do inc before dec, in case old and new rex are the same */
2616 #define SET_reg_curpm(Re2) \
2617     if (PL_reg_state.re_state_eval_setup_done) {    \
2618         (void)ReREFCNT_inc(Re2);                    \
2619         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2620         PM_SETRE((PL_reg_curpm), (Re2));            \
2621     }
2622
2623
2624 /*
2625  - regtry - try match at specific point
2626  */
2627 STATIC I32                      /* 0 failure, 1 success */
2628 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2629 {
2630     dVAR;
2631     CHECKPOINT lastcp;
2632     REGEXP *const rx = reginfo->prog;
2633     regexp *const prog = (struct regexp *)SvANY(rx);
2634     RXi_GET_DECL(prog,progi);
2635     GET_RE_DEBUG_FLAGS_DECL;
2636
2637     PERL_ARGS_ASSERT_REGTRY;
2638
2639     reginfo->cutpoint=NULL;
2640
2641     if ((prog->extflags & RXf_EVAL_SEEN)
2642         && !PL_reg_state.re_state_eval_setup_done)
2643     {
2644         MAGIC *mg;
2645
2646         PL_reg_state.re_state_eval_setup_done = TRUE;
2647         if (reginfo->sv) {
2648             /* Make $_ available to executed code. */
2649             if (reginfo->sv != DEFSV) {
2650                 SAVE_DEFSV;
2651                 DEFSV_set(reginfo->sv);
2652             }
2653
2654             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2655                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2656                 /* prepare for quick setting of pos */
2657 #ifdef PERL_OLD_COPY_ON_WRITE
2658                 if (SvIsCOW(reginfo->sv))
2659                     sv_force_normal_flags(reginfo->sv, 0);
2660 #endif
2661                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2662                                  &PL_vtbl_mglob, NULL, 0);
2663                 mg->mg_len = -1;
2664             }
2665             PL_reg_magic    = mg;
2666             PL_reg_oldpos   = mg->mg_len;
2667             SAVEDESTRUCTOR_X(restore_pos, prog);
2668         }
2669         if (!PL_reg_curpm) {
2670             Newxz(PL_reg_curpm, 1, PMOP);
2671 #ifdef USE_ITHREADS
2672             {
2673                 SV* const repointer = &PL_sv_undef;
2674                 /* this regexp is also owned by the new PL_reg_curpm, which
2675                    will try to free it.  */
2676                 av_push(PL_regex_padav, repointer);
2677                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2678                 PL_regex_pad = AvARRAY(PL_regex_padav);
2679             }
2680 #endif
2681         }
2682         SET_reg_curpm(rx);
2683         PL_reg_oldcurpm = PL_curpm;
2684         PL_curpm = PL_reg_curpm;
2685         if (RXp_MATCH_COPIED(prog)) {
2686             /*  Here is a serious problem: we cannot rewrite subbeg,
2687                 since it may be needed if this match fails.  Thus
2688                 $` inside (?{}) could fail... */
2689             PL_reg_oldsaved = prog->subbeg;
2690             PL_reg_oldsavedlen = prog->sublen;
2691 #ifdef PERL_OLD_COPY_ON_WRITE
2692             PL_nrs = prog->saved_copy;
2693 #endif
2694             RXp_MATCH_COPIED_off(prog);
2695         }
2696         else
2697             PL_reg_oldsaved = NULL;
2698         prog->subbeg = PL_bostr;
2699         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2700     }
2701 #ifdef DEBUGGING
2702     PL_reg_starttry = *startpos;
2703 #endif
2704     prog->offs[0].start = *startpos - PL_bostr;
2705     PL_reginput = *startpos;
2706     prog->lastparen = 0;
2707     prog->lastcloseparen = 0;
2708     PL_regsize = 0;
2709
2710     /* XXXX What this code is doing here?!!!  There should be no need
2711        to do this again and again, prog->lastparen should take care of
2712        this!  --ilya*/
2713
2714     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2715      * Actually, the code in regcppop() (which Ilya may be meaning by
2716      * prog->lastparen), is not needed at all by the test suite
2717      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2718      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2719      * Meanwhile, this code *is* needed for the
2720      * above-mentioned test suite tests to succeed.  The common theme
2721      * on those tests seems to be returning null fields from matches.
2722      * --jhi updated by dapm */
2723 #if 1
2724     if (prog->nparens) {
2725         regexp_paren_pair *pp = prog->offs;
2726         I32 i;
2727         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2728             ++pp;
2729             pp->start = -1;
2730             pp->end = -1;
2731         }
2732     }
2733 #endif
2734     REGCP_SET(lastcp);
2735     if (regmatch(reginfo, progi->program + 1)) {
2736         prog->offs[0].end = PL_reginput - PL_bostr;
2737         return 1;
2738     }
2739     if (reginfo->cutpoint)
2740         *startpos= reginfo->cutpoint;
2741     REGCP_UNWIND(lastcp);
2742     return 0;
2743 }
2744
2745
2746 #define sayYES goto yes
2747 #define sayNO goto no
2748 #define sayNO_SILENT goto no_silent
2749
2750 /* we dont use STMT_START/END here because it leads to
2751    "unreachable code" warnings, which are bogus, but distracting. */
2752 #define CACHEsayNO \
2753     if (ST.cache_mask) \
2754        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2755     sayNO
2756
2757 /* this is used to determine how far from the left messages like
2758    'failed...' are printed. It should be set such that messages
2759    are inline with the regop output that created them.
2760 */
2761 #define REPORT_CODE_OFF 32
2762
2763
2764 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2765 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2766
2767 #define SLAB_FIRST(s) (&(s)->states[0])
2768 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2769
2770 /* grab a new slab and return the first slot in it */
2771
2772 STATIC regmatch_state *
2773 S_push_slab(pTHX)
2774 {
2775 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2776     dMY_CXT;
2777 #endif
2778     regmatch_slab *s = PL_regmatch_slab->next;
2779     if (!s) {
2780         Newx(s, 1, regmatch_slab);
2781         s->prev = PL_regmatch_slab;
2782         s->next = NULL;
2783         PL_regmatch_slab->next = s;
2784     }
2785     PL_regmatch_slab = s;
2786     return SLAB_FIRST(s);
2787 }
2788
2789
2790 /* push a new state then goto it */
2791
2792 #define PUSH_STATE_GOTO(state, node) \
2793     scan = node; \
2794     st->resume_state = state; \
2795     goto push_state;
2796
2797 /* push a new state with success backtracking, then goto it */
2798
2799 #define PUSH_YES_STATE_GOTO(state, node) \
2800     scan = node; \
2801     st->resume_state = state; \
2802     goto push_yes_state;
2803
2804
2805
2806 /*
2807
2808 regmatch() - main matching routine
2809
2810 This is basically one big switch statement in a loop. We execute an op,
2811 set 'next' to point the next op, and continue. If we come to a point which
2812 we may need to backtrack to on failure such as (A|B|C), we push a
2813 backtrack state onto the backtrack stack. On failure, we pop the top
2814 state, and re-enter the loop at the state indicated. If there are no more
2815 states to pop, we return failure.
2816
2817 Sometimes we also need to backtrack on success; for example /A+/, where
2818 after successfully matching one A, we need to go back and try to
2819 match another one; similarly for lookahead assertions: if the assertion
2820 completes successfully, we backtrack to the state just before the assertion
2821 and then carry on.  In these cases, the pushed state is marked as
2822 'backtrack on success too'. This marking is in fact done by a chain of
2823 pointers, each pointing to the previous 'yes' state. On success, we pop to
2824 the nearest yes state, discarding any intermediate failure-only states.
2825 Sometimes a yes state is pushed just to force some cleanup code to be
2826 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2827 it to free the inner regex.
2828
2829 Note that failure backtracking rewinds the cursor position, while
2830 success backtracking leaves it alone.
2831
2832 A pattern is complete when the END op is executed, while a subpattern
2833 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2834 ops trigger the "pop to last yes state if any, otherwise return true"
2835 behaviour.
2836
2837 A common convention in this function is to use A and B to refer to the two
2838 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2839 the subpattern to be matched possibly multiple times, while B is the entire
2840 rest of the pattern. Variable and state names reflect this convention.
2841
2842 The states in the main switch are the union of ops and failure/success of
2843 substates associated with with that op.  For example, IFMATCH is the op
2844 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2845 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2846 successfully matched A and IFMATCH_A_fail is a state saying that we have
2847 just failed to match A. Resume states always come in pairs. The backtrack
2848 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2849 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2850 on success or failure.
2851
2852 The struct that holds a backtracking state is actually a big union, with
2853 one variant for each major type of op. The variable st points to the
2854 top-most backtrack struct. To make the code clearer, within each
2855 block of code we #define ST to alias the relevant union.
2856
2857 Here's a concrete example of a (vastly oversimplified) IFMATCH
2858 implementation:
2859
2860     switch (state) {
2861     ....
2862
2863 #define ST st->u.ifmatch
2864
2865     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2866         ST.foo = ...; // some state we wish to save
2867         ...
2868         // push a yes backtrack state with a resume value of
2869         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2870         // first node of A:
2871         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2872         // NOTREACHED
2873
2874     case IFMATCH_A: // we have successfully executed A; now continue with B
2875         next = B;
2876         bar = ST.foo; // do something with the preserved value
2877         break;
2878
2879     case IFMATCH_A_fail: // A failed, so the assertion failed
2880         ...;   // do some housekeeping, then ...
2881         sayNO; // propagate the failure
2882
2883 #undef ST
2884
2885     ...
2886     }
2887
2888 For any old-timers reading this who are familiar with the old recursive
2889 approach, the code above is equivalent to:
2890
2891     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2892     {
2893         int foo = ...
2894         ...
2895         if (regmatch(A)) {
2896             next = B;
2897             bar = foo;
2898             break;
2899         }
2900         ...;   // do some housekeeping, then ...
2901         sayNO; // propagate the failure
2902     }
2903
2904 The topmost backtrack state, pointed to by st, is usually free. If you
2905 want to claim it, populate any ST.foo fields in it with values you wish to
2906 save, then do one of
2907
2908         PUSH_STATE_GOTO(resume_state, node);
2909         PUSH_YES_STATE_GOTO(resume_state, node);
2910
2911 which sets that backtrack state's resume value to 'resume_state', pushes a
2912 new free entry to the top of the backtrack stack, then goes to 'node'.
2913 On backtracking, the free slot is popped, and the saved state becomes the
2914 new free state. An ST.foo field in this new top state can be temporarily
2915 accessed to retrieve values, but once the main loop is re-entered, it
2916 becomes available for reuse.
2917
2918 Note that the depth of the backtrack stack constantly increases during the
2919 left-to-right execution of the pattern, rather than going up and down with
2920 the pattern nesting. For example the stack is at its maximum at Z at the
2921 end of the pattern, rather than at X in the following:
2922
2923     /(((X)+)+)+....(Y)+....Z/
2924
2925 The only exceptions to this are lookahead/behind assertions and the cut,
2926 (?>A), which pop all the backtrack states associated with A before
2927 continuing.
2928
2929 Backtrack state structs are allocated in slabs of about 4K in size.
2930 PL_regmatch_state and st always point to the currently active state,
2931 and PL_regmatch_slab points to the slab currently containing
2932 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2933 allocated, and is never freed until interpreter destruction. When the slab
2934 is full, a new one is allocated and chained to the end. At exit from
2935 regmatch(), slabs allocated since entry are freed.
2936
2937 */
2938
2939
2940 #define DEBUG_STATE_pp(pp)                                  \
2941     DEBUG_STATE_r({                                         \
2942         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2943         PerlIO_printf(Perl_debug_log,                       \
2944             "    %*s"pp" %s%s%s%s%s\n",                     \
2945             depth*2, "",                                    \
2946             PL_reg_name[st->resume_state],                     \
2947             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2948             ((st==yes_state) ? "Y" : ""),                   \
2949             ((st==mark_state) ? "M" : ""),                  \
2950             ((st==yes_state||st==mark_state) ? "]" : "")    \
2951         );                                                  \
2952     });
2953
2954
2955 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2956
2957 #ifdef DEBUGGING
2958
2959 STATIC void
2960 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2961     const char *start, const char *end, const char *blurb)
2962 {
2963     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2964
2965     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2966
2967     if (!PL_colorset)
2968             reginitcolors();
2969     {
2970         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2971             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2972
2973         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2974             start, end - start, 60);
2975
2976         PerlIO_printf(Perl_debug_log,
2977             "%s%s REx%s %s against %s\n",
2978                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2979
2980         if (utf8_target||utf8_pat)
2981             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2982                 utf8_pat ? "pattern" : "",
2983                 utf8_pat && utf8_target ? " and " : "",
2984                 utf8_target ? "string" : ""
2985             );
2986     }
2987 }
2988
2989 STATIC void
2990 S_dump_exec_pos(pTHX_ const char *locinput,
2991                       const regnode *scan,
2992                       const char *loc_regeol,
2993                       const char *loc_bostr,
2994                       const char *loc_reg_starttry,
2995                       const bool utf8_target)
2996 {
2997     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2998     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2999     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3000     /* The part of the string before starttry has one color
3001        (pref0_len chars), between starttry and current
3002        position another one (pref_len - pref0_len chars),
3003        after the current position the third one.
3004        We assume that pref0_len <= pref_len, otherwise we
3005        decrease pref0_len.  */
3006     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3007         ? (5 + taill) - l : locinput - loc_bostr;
3008     int pref0_len;
3009
3010     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3011
3012     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3013         pref_len++;
3014     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3015     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3016         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3017               ? (5 + taill) - pref_len : loc_regeol - locinput);
3018     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3019         l--;
3020     if (pref0_len < 0)
3021         pref0_len = 0;
3022     if (pref0_len > pref_len)
3023         pref0_len = pref_len;
3024     {
3025         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3026
3027         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3028             (locinput - pref_len),pref0_len, 60, 4, 5);
3029
3030         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3031                     (locinput - pref_len + pref0_len),
3032                     pref_len - pref0_len, 60, 2, 3);
3033
3034         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3035                     locinput, loc_regeol - locinput, 10, 0, 1);
3036
3037         const STRLEN tlen=len0+len1+len2;
3038         PerlIO_printf(Perl_debug_log,
3039                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3040                     (IV)(locinput - loc_bostr),
3041                     len0, s0,
3042                     len1, s1,
3043                     (docolor ? "" : "> <"),
3044                     len2, s2,
3045                     (int)(tlen > 19 ? 0 :  19 - tlen),
3046                     "");
3047     }
3048 }
3049
3050 #endif
3051
3052 /* reg_check_named_buff_matched()
3053  * Checks to see if a named buffer has matched. The data array of
3054  * buffer numbers corresponding to the buffer is expected to reside
3055  * in the regexp->data->data array in the slot stored in the ARG() of
3056  * node involved. Note that this routine doesn't actually care about the
3057  * name, that information is not preserved from compilation to execution.
3058  * Returns the index of the leftmost defined buffer with the given name
3059  * or 0 if non of the buffers matched.
3060  */
3061 STATIC I32
3062 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3063 {
3064     I32 n;
3065     RXi_GET_DECL(rex,rexi);
3066     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3067     I32 *nums=(I32*)SvPVX(sv_dat);
3068
3069     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3070
3071     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3072         if ((I32)rex->lastparen >= nums[n] &&
3073             rex->offs[nums[n]].end != -1)
3074         {
3075             return nums[n];
3076         }
3077     }
3078     return 0;
3079 }
3080
3081
3082 /* free all slabs above current one  - called during LEAVE_SCOPE */
3083
3084 STATIC void
3085 S_clear_backtrack_stack(pTHX_ void *p)
3086 {
3087     regmatch_slab *s = PL_regmatch_slab->next;
3088     PERL_UNUSED_ARG(p);
3089
3090     if (!s)
3091         return;
3092     PL_regmatch_slab->next = NULL;
3093     while (s) {
3094         regmatch_slab * const osl = s;
3095         s = s->next;
3096         Safefree(osl);
3097     }
3098 }
3099
3100
3101 STATIC I32                      /* 0 failure, 1 success */
3102 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3103 {
3104 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3105     dMY_CXT;
3106 #endif
3107     dVAR;
3108     const bool utf8_target = PL_reg_match_utf8;
3109     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3110     REGEXP *rex_sv = reginfo->prog;
3111     regexp *rex = (struct regexp *)SvANY(rex_sv);
3112     RXi_GET_DECL(rex,rexi);
3113     I32 oldsave;
3114     /* the current state. This is a cached copy of PL_regmatch_state */
3115     regmatch_state *st;
3116     /* cache heavy used fields of st in registers */
3117     regnode *scan;
3118     regnode *next;
3119     U32 n = 0;  /* general value; init to avoid compiler warning */
3120     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3121     char *locinput = PL_reginput;
3122     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3123
3124     bool result = 0;        /* return value of S_regmatch */
3125     int depth = 0;          /* depth of backtrack stack */
3126     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3127     const U32 max_nochange_depth =
3128         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3129         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3130     regmatch_state *yes_state = NULL; /* state to pop to on success of
3131                                                             subpattern */
3132     /* mark_state piggy backs on the yes_state logic so that when we unwind
3133        the stack on success we can update the mark_state as we go */
3134     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3135     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3136     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3137     U32 state_num;
3138     bool no_final = 0;      /* prevent failure from backtracking? */
3139     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3140     char *startpoint = PL_reginput;
3141     SV *popmark = NULL;     /* are we looking for a mark? */
3142     SV *sv_commit = NULL;   /* last mark name seen in failure */
3143     SV *sv_yes_mark = NULL; /* last mark name we have seen
3144                                during a successful match */
3145     U32 lastopen = 0;       /* last open we saw */
3146     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3147     SV* const oreplsv = GvSV(PL_replgv);
3148     /* these three flags are set by various ops to signal information to
3149      * the very next op. They have a useful lifetime of exactly one loop
3150      * iteration, and are not preserved or restored by state pushes/pops
3151      */
3152     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3153     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3154     int logical = 0;        /* the following EVAL is:
3155                                 0: (?{...})
3156                                 1: (?(?{...})X|Y)
3157                                 2: (??{...})
3158                                or the following IFMATCH/UNLESSM is:
3159                                 false: plain (?=foo)
3160                                 true:  used as a condition: (?(?=foo))
3161                             */
3162     PAD* last_pad = NULL;
3163     dMULTICALL;
3164     I32 gimme = G_SCALAR;
3165     CV *caller_cv = NULL;       /* who called us */
3166     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3167     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3168
3169 #ifdef DEBUGGING
3170     GET_RE_DEBUG_FLAGS_DECL;
3171 #endif
3172
3173     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3174     multicall_oldcatch = 0;
3175     multicall_cv = NULL;
3176     cx = NULL;
3177     PERL_UNUSED_VAR(multicall_cop);
3178     PERL_UNUSED_VAR(newsp);
3179
3180
3181     PERL_ARGS_ASSERT_REGMATCH;
3182
3183     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3184             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3185     }));
3186     /* on first ever call to regmatch, allocate first slab */
3187     if (!PL_regmatch_slab) {
3188         Newx(PL_regmatch_slab, 1, regmatch_slab);
3189         PL_regmatch_slab->prev = NULL;
3190         PL_regmatch_slab->next = NULL;
3191         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3192     }
3193
3194     oldsave = PL_savestack_ix;
3195     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3196     SAVEVPTR(PL_regmatch_slab);
3197     SAVEVPTR(PL_regmatch_state);
3198
3199     /* grab next free state slot */
3200     st = ++PL_regmatch_state;
3201     if (st >  SLAB_LAST(PL_regmatch_slab))
3202         st = PL_regmatch_state = S_push_slab(aTHX);
3203
3204     /* Note that nextchr is a byte even in UTF */
3205     nextchr = UCHARAT(locinput);
3206     scan = prog;
3207     while (scan != NULL) {
3208
3209         DEBUG_EXECUTE_r( {
3210             SV * const prop = sv_newmortal();
3211             regnode *rnext=regnext(scan);
3212             DUMP_EXEC_POS( locinput, scan, utf8_target );
3213             regprop(rex, prop, scan);
3214
3215             PerlIO_printf(Perl_debug_log,
3216                     "%3"IVdf":%*s%s(%"IVdf")\n",
3217                     (IV)(scan - rexi->program), depth*2, "",
3218                     SvPVX_const(prop),
3219                     (PL_regkind[OP(scan)] == END || !rnext) ?
3220                         0 : (IV)(rnext - rexi->program));
3221         });
3222
3223         next = scan + NEXT_OFF(scan);
3224         if (next == scan)
3225             next = NULL;
3226         state_num = OP(scan);
3227
3228       reenter_switch:
3229
3230         switch (state_num) {
3231         case BOL:
3232             if (locinput == PL_bostr)
3233             {
3234                 /* reginfo->till = reginfo->bol; */
3235                 break;
3236             }
3237             sayNO;
3238         case MBOL:
3239             if (locinput == PL_bostr ||
3240                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3241             {
3242                 break;
3243             }
3244             sayNO;
3245         case SBOL:
3246             if (locinput == PL_bostr)
3247                 break;
3248             sayNO;
3249         case GPOS:
3250             if (locinput == reginfo->ganch)
3251                 break;
3252             sayNO;
3253
3254         case KEEPS:
3255             /* update the startpoint */
3256             st->u.keeper.val = rex->offs[0].start;
3257             PL_reginput = locinput;
3258             rex->offs[0].start = locinput - PL_bostr;
3259             PUSH_STATE_GOTO(KEEPS_next, next);
3260             /*NOT-REACHED*/
3261         case KEEPS_next_fail:
3262             /* rollback the start point change */
3263             rex->offs[0].start = st->u.keeper.val;
3264             sayNO_SILENT;
3265             /*NOT-REACHED*/
3266         case EOL:
3267                 goto seol;
3268         case MEOL:
3269             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3270                 sayNO;
3271             break;
3272         case SEOL:
3273           seol:
3274             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3275                 sayNO;
3276             if (PL_regeol - locinput > 1)
3277                 sayNO;
3278             break;
3279         case EOS:
3280             if (PL_regeol != locinput)
3281                 sayNO;
3282             break;
3283         case SANY:
3284             if (!nextchr && locinput >= PL_regeol)
3285                 sayNO;
3286             if (utf8_target) {
3287                 locinput += PL_utf8skip[nextchr];
3288                 if (locinput > PL_regeol)
3289                     sayNO;
3290                 nextchr = UCHARAT(locinput);
3291             }
3292             else
3293                 nextchr = UCHARAT(++locinput);
3294             break;
3295         case CANY:
3296             if (!nextchr && locinput >= PL_regeol)
3297                 sayNO;
3298             nextchr = UCHARAT(++locinput);
3299             break;
3300         case REG_ANY:
3301             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3302                 sayNO;
3303             if (utf8_target) {
3304                 locinput += PL_utf8skip[nextchr];
3305                 if (locinput > PL_regeol)
3306                     sayNO;
3307                 nextchr = UCHARAT(locinput);
3308             }
3309             else
3310                 nextchr = UCHARAT(++locinput);
3311             break;
3312
3313 #undef  ST
3314 #define ST st->u.trie
3315         case TRIEC:
3316             /* In this case the charclass data is available inline so
3317                we can fail fast without a lot of extra overhead.
3318              */
3319             if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3320                 DEBUG_EXECUTE_r(
3321                     PerlIO_printf(Perl_debug_log,
3322                               "%*s  %sfailed to match trie start class...%s\n",
3323                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3324                 );
3325                 sayNO_SILENT;
3326                 assert(0); /* NOTREACHED */
3327             }
3328             /* FALL THROUGH */
3329         case TRIE:
3330             /* the basic plan of execution of the trie is:
3331              * At the beginning, run though all the states, and
3332              * find the longest-matching word. Also remember the position
3333              * of the shortest matching word. For example, this pattern:
3334              *    1  2 3 4    5
3335              *    ab|a|x|abcd|abc
3336              * when matched against the string "abcde", will generate
3337              * accept states for all words except 3, with the longest
3338              * matching word being 4, and the shortest being 1 (with
3339              * the position being after char 1 of the string).
3340              *
3341              * Then for each matching word, in word order (i.e. 1,2,4,5),
3342              * we run the remainder of the pattern; on each try setting
3343              * the current position to the character following the word,
3344              * returning to try the next word on failure.
3345              *
3346              * We avoid having to build a list of words at runtime by
3347              * using a compile-time structure, wordinfo[].prev, which
3348              * gives, for each word, the previous accepting word (if any).
3349              * In the case above it would contain the mappings 1->2, 2->0,
3350              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3351              * the longest word (4 above), a list of all words, by
3352              * following the list of prev pointers; this gives us the
3353              * unordered list 4,5,1,2. Then given the current word we have
3354              * just tried, we can go through the list and find the
3355              * next-biggest word to try (so if we just failed on word 2,
3356              * the next in the list is 4).
3357              *
3358              * Since at runtime we don't record the matching position in
3359              * the string for each word, we have to work that out for
3360              * each word we're about to process. The wordinfo table holds
3361              * the character length of each word; given that we recorded
3362              * at the start: the position of the shortest word and its
3363              * length in chars, we just need to move the pointer the
3364              * difference between the two char lengths. Depending on
3365              * Unicode status and folding, that's cheap or expensive.
3366              *
3367              * This algorithm is optimised for the case where are only a
3368              * small number of accept states, i.e. 0,1, or maybe 2.
3369              * With lots of accepts states, and having to try all of them,
3370              * it becomes quadratic on number of accept states to find all
3371              * the next words.
3372              */
3373
3374             {
3375                 /* what type of TRIE am I? (utf8 makes this contextual) */
3376                 DECL_TRIE_TYPE(scan);
3377
3378                 /* what trie are we using right now */
3379                 reg_trie_data * const trie
3380                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3381                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3382                 U32 state = trie->startstate;
3383
3384                 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3385                     if (trie->states[ state ].wordnum) {
3386                          DEBUG_EXECUTE_r(
3387                             PerlIO_printf(Perl_debug_log,
3388                                           "%*s  %smatched empty string...%s\n",
3389                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3390                         );
3391                         if (!trie->jump)
3392                             break;
3393                     } else {
3394                         DEBUG_EXECUTE_r(
3395                             PerlIO_printf(Perl_debug_log,
3396                                           "%*s  %sfailed to match trie start class...%s\n",
3397                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3398                         );
3399                         sayNO_SILENT;
3400                    }
3401                 }
3402
3403             {
3404                 U8 *uc = ( U8* )locinput;
3405
3406                 STRLEN len = 0;
3407                 STRLEN foldlen = 0;
3408                 U8 *uscan = (U8*)NULL;
3409                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3410                 U32 charcount = 0; /* how many input chars we have matched */
3411                 U32 accepted = 0; /* have we seen any accepting states? */
3412
3413                 ST.jump = trie->jump;
3414                 ST.me = scan;
3415                 ST.firstpos = NULL;
3416                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3417                 ST.nextword = 0;
3418
3419                 /* fully traverse the TRIE; note the position of the
3420                    shortest accept state and the wordnum of the longest
3421                    accept state */
3422
3423                 while ( state && uc <= (U8*)PL_regeol ) {
3424                     U32 base = trie->states[ state ].trans.base;
3425                     UV uvc = 0;
3426                     U16 charid = 0;
3427                     U16 wordnum;
3428                     wordnum = trie->states[ state ].wordnum;
3429
3430                     if (wordnum) { /* it's an accept state */
3431                         if (!accepted) {
3432                             accepted = 1;
3433                             /* record first match position */
3434                             if (ST.longfold) {
3435                                 ST.firstpos = (U8*)locinput;
3436                                 ST.firstchars = 0;
3437                             }
3438                             else {
3439                                 ST.firstpos = uc;
3440                                 ST.firstchars = charcount;
3441                             }
3442                         }
3443                         if (!ST.nextword || wordnum < ST.nextword)
3444                             ST.nextword = wordnum;
3445                         ST.topword = wordnum;
3446                     }
3447
3448                     DEBUG_TRIE_EXECUTE_r({
3449                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3450                                 PerlIO_printf( Perl_debug_log,
3451                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3452                                     2+depth * 2, "", PL_colors[4],
3453                                     (UV)state, (accepted ? 'Y' : 'N'));
3454                     });
3455
3456                     /* read a char and goto next state */
3457                     if ( base ) {
3458                         I32 offset;
3459                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3460                                              uscan, len, uvc, charid, foldlen,
3461                                              foldbuf, uniflags);
3462                         charcount++;
3463                         if (foldlen>0)
3464                             ST.longfold = TRUE;
3465                         if (charid &&
3466                              ( ((offset =
3467                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3468
3469                              && ((U32)offset < trie->lasttrans)
3470                              && trie->trans[offset].check == state)
3471                         {
3472                             state = trie->trans[offset].next;
3473                         }
3474                         else {
3475                             state = 0;
3476                         }
3477                         uc += len;
3478
3479                     }
3480                     else {
3481                         state = 0;
3482                     }
3483                     DEBUG_TRIE_EXECUTE_r(
3484                         PerlIO_printf( Perl_debug_log,
3485                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3486                             charid, uvc, (UV)state, PL_colors[5] );
3487                     );
3488                 }
3489                 if (!accepted)
3490                    sayNO;
3491
3492                 /* calculate total number of accept states */
3493                 {
3494                     U16 w = ST.topword;
3495                     accepted = 0;
3496                     while (w) {
3497                         w = trie->wordinfo[w].prev;
3498                         accepted++;
3499                     }
3500                     ST.accepted = accepted;
3501                 }
3502
3503                 DEBUG_EXECUTE_r(
3504                     PerlIO_printf( Perl_debug_log,
3505                         "%*s  %sgot %"IVdf" possible matches%s\n",
3506                         REPORT_CODE_OFF + depth * 2, "",
3507                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3508                 );
3509                 goto trie_first_try; /* jump into the fail handler */
3510             }}
3511             assert(0); /* NOTREACHED */
3512
3513         case TRIE_next_fail: /* we failed - try next alternative */
3514             if ( ST.jump) {
3515                 REGCP_UNWIND(ST.cp);
3516                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3517             }
3518             if (!--ST.accepted) {
3519                 DEBUG_EXECUTE_r({
3520                     PerlIO_printf( Perl_debug_log,
3521                         "%*s  %sTRIE failed...%s\n",
3522                         REPORT_CODE_OFF+depth*2, "",
3523                         PL_colors[4],
3524                         PL_colors[5] );
3525                 });
3526                 sayNO_SILENT;
3527             }
3528             {
3529                 /* Find next-highest word to process.  Note that this code
3530                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3531                 U16 min = 0;
3532                 U16 word;
3533                 U16 const nextword = ST.nextword;
3534                 reg_trie_wordinfo * const wordinfo
3535                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3536                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3537                     if (word > nextword && (!min || word < min))
3538                         min = word;
3539                 }
3540                 ST.nextword = min;
3541             }
3542
3543           trie_first_try:
3544             if (do_cutgroup) {
3545                 do_cutgroup = 0;
3546                 no_final = 0;
3547             }
3548
3549             if ( ST.jump) {
3550                 ST.lastparen = rex->lastparen;
3551                 ST.lastcloseparen = rex->lastcloseparen;
3552                 REGCP_SET(ST.cp);
3553             }
3554
3555             /* find start char of end of current word */
3556             {
3557                 U32 chars; /* how many chars to skip */
3558                 U8 *uc = ST.firstpos;
3559                 reg_trie_data * const trie
3560                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3561
3562                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3563                             >=  ST.firstchars);
3564                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3565                             - ST.firstchars;
3566
3567                 if (ST.longfold) {
3568                     /* the hard option - fold each char in turn and find
3569                      * its folded length (which may be different */
3570                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3571                     STRLEN foldlen;
3572                     STRLEN len;
3573                     UV uvc;
3574                     U8 *uscan;
3575
3576                     while (chars) {
3577                         if (utf8_target) {
3578                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3579                                                     uniflags);
3580                             uc += len;
3581                         }
3582                         else {
3583                             uvc = *uc;
3584                             uc++;
3585                         }
3586                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3587                         uscan = foldbuf;
3588                         while (foldlen) {
3589                             if (!--chars)
3590                                 break;
3591                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3592                                             uniflags);
3593                             uscan += len;
3594                             foldlen -= len;
3595                         }
3596                     }
3597                 }
3598                 else {
3599                     if (utf8_target)
3600                         while (chars--)
3601                             uc += UTF8SKIP(uc);
3602                     else
3603                         uc += chars;
3604                 }
3605                 PL_reginput = (char *)uc;
3606             }
3607
3608             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3609                             ? ST.jump[ST.nextword]
3610                             : NEXT_OFF(ST.me));
3611
3612             DEBUG_EXECUTE_r({
3613                 PerlIO_printf( Perl_debug_log,
3614                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3615                     REPORT_CODE_OFF+depth*2, "",
3616                     PL_colors[4],
3617                     ST.nextword,
3618                     PL_colors[5]
3619                     );
3620             });
3621
3622             if (ST.accepted > 1 || has_cutgroup) {
3623                 PUSH_STATE_GOTO(TRIE_next, scan);
3624                 assert(0); /* NOTREACHED */
3625             }
3626             /* only one choice left - just continue */
3627             DEBUG_EXECUTE_r({
3628                 AV *const trie_words
3629                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3630                 SV ** const tmp = av_fetch( trie_words,
3631                     ST.nextword-1, 0 );
3632                 SV *sv= tmp ? sv_newmortal() : NULL;
3633
3634                 PerlIO_printf( Perl_debug_log,
3635                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3636                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3637                     ST.nextword,
3638                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3639                             PL_colors[0], PL_colors[1],
3640                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3641                         )
3642                     : "not compiled under -Dr",
3643                     PL_colors[5] );
3644             });
3645
3646             locinput = PL_reginput;
3647             nextchr = UCHARAT(locinput);
3648             continue; /* execute rest of RE */
3649             assert(0); /* NOTREACHED */
3650 #undef  ST
3651
3652         case EXACT: {
3653             char *s = STRING(scan);
3654             ln = STR_LEN(scan);
3655             if (utf8_target != UTF_PATTERN) {
3656                 /* The target and the pattern have differing utf8ness. */
3657                 char *l = locinput;
3658                 const char * const e = s + ln;
3659
3660                 if (utf8_target) {
3661                     /* The target is utf8, the pattern is not utf8. */
3662                     while (s < e) {
3663                         STRLEN ulen;
3664                         if (l >= PL_regeol)
3665                              sayNO;
3666                         if (NATIVE_TO_UNI(*(U8*)s) !=
3667                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3668                                             uniflags))
3669                              sayNO;
3670                         l += ulen;
3671                         s ++;
3672                     }
3673                 }
3674                 else {
3675                     /* The target is not utf8, the pattern is utf8. */
3676                     while (s < e) {
3677                         STRLEN ulen;
3678                         if (l >= PL_regeol)
3679                             sayNO;
3680                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3681                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3682                                            uniflags))
3683                             sayNO;
3684                         s += ulen;
3685                         l ++;
3686                     }
3687                 }
3688                 locinput = l;
3689                 nextchr = UCHARAT(locinput);
3690                 break;
3691             }
3692             /* The target and the pattern have the same utf8ness. */
3693             /* Inline the first character, for speed. */
3694             if (UCHARAT(s) != nextchr)
3695                 sayNO;
3696             if (PL_regeol - locinput < ln)
3697                 sayNO;
3698             if (ln > 1 && memNE(s, locinput, ln))
3699                 sayNO;
3700             locinput += ln;
3701             nextchr = UCHARAT(locinput);
3702             break;
3703             }
3704         case EXACTFL: {
3705             re_fold_t folder;
3706             const U8 * fold_array;
3707             const char * s;
3708             U32 fold_utf8_flags;
3709
3710             PL_reg_flags |= RF_tainted;
3711             folder = foldEQ_locale;
3712             fold_array = PL_fold_locale;
3713             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3714             goto do_exactf;
3715
3716         case EXACTFU_SS:
3717         case EXACTFU_TRICKYFOLD:
3718         case EXACTFU:
3719             folder = foldEQ_latin1;
3720             fold_array = PL_fold_latin1;
3721             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3722             goto do_exactf;
3723
3724         case EXACTFA:
3725             folder = foldEQ_latin1;
3726             fold_array = PL_fold_latin1;
3727             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3728             goto do_exactf;
3729
3730         case EXACTF:
3731             folder = foldEQ;
3732             fold_array = PL_fold;
3733             fold_utf8_flags = 0;
3734
3735           do_exactf:
3736             s = STRING(scan);
3737             ln = STR_LEN(scan);
3738
3739             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3740               /* Either target or the pattern are utf8, or has the issue where
3741                * the fold lengths may differ. */
3742                 const char * const l = locinput;
3743                 char *e = PL_regeol;
3744
3745                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3746                                         l, &e, 0,  utf8_target, fold_utf8_flags))
3747                 {
3748                     sayNO;
3749                 }
3750                 locinput = e;
3751                 nextchr = UCHARAT(locinput);
3752                 break;
3753             }
3754
3755             /* Neither the target nor the pattern are utf8 */
3756             if (UCHARAT(s) != nextchr &&
3757                 UCHARAT(s) != fold_array[nextchr])
3758             {
3759                 sayNO;
3760             }
3761             if (PL_regeol - locinput < ln)
3762                 sayNO;
3763             if (ln > 1 && ! folder(s, locinput, ln))
3764                 sayNO;
3765             locinput += ln;
3766             nextchr = UCHARAT(locinput);
3767             break;
3768         }
3769
3770         /* XXX Could improve efficiency by separating these all out using a
3771          * macro or in-line function.  At that point regcomp.c would no longer
3772          * have to set the FLAGS fields of these */
3773         case BOUNDL:
3774         case NBOUNDL:
3775             PL_reg_flags |= RF_tainted;
3776             /* FALL THROUGH */
3777         case BOUND:
3778         case BOUNDU:
3779         case BOUNDA:
3780         case NBOUND:
3781         case NBOUNDU:
3782         case NBOUNDA:
3783             /* was last char in word? */
3784             if (utf8_target
3785                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3786                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3787             {
3788                 if (locinput == PL_bostr)
3789                     ln = '\n';
3790                 else {
3791                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3792
3793                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3794                 }
3795                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3796                     ln = isALNUM_uni(ln);
3797                     LOAD_UTF8_CHARCLASS_ALNUM();
3798                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3799                 }
3800                 else {
3801                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3802                     n = isALNUM_LC_utf8((U8*)locinput);
3803                 }
3804             }
3805             else {
3806
3807                 /* Here the string isn't utf8, or is utf8 and only ascii
3808                  * characters are to match \w.  In the latter case looking at
3809                  * the byte just prior to the current one may be just the final
3810                  * byte of a multi-byte character.  This is ok.  There are two
3811                  * cases:
3812                  * 1) it is a single byte character, and then the test is doing
3813                  *      just what it's supposed to.
3814                  * 2) it is a multi-byte character, in which case the final
3815                  *      byte is never mistakable for ASCII, and so the test
3816                  *      will say it is not a word character, which is the
3817                  *      correct answer. */
3818                 ln = (locinput != PL_bostr) ?
3819                     UCHARAT(locinput - 1) : '\n';
3820                 switch (FLAGS(scan)) {
3821                     case REGEX_UNICODE_CHARSET:
3822                         ln = isWORDCHAR_L1(ln);
3823                         n = isWORDCHAR_L1(nextchr);
3824                         break;
3825                     case REGEX_LOCALE_CHARSET:
3826                         ln = isALNUM_LC(ln);
3827                         n = isALNUM_LC(nextchr);
3828                         break;
3829                     case REGEX_DEPENDS_CHARSET:
3830                         ln = isALNUM(ln);
3831                         n = isALNUM(nextchr);
3832                         break;
3833                     case REGEX_ASCII_RESTRICTED_CHARSET:
3834                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3835                         ln = isWORDCHAR_A(ln);
3836                         n = isWORDCHAR_A(nextchr);
3837                         break;
3838                     default:
3839                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3840                         break;
3841                 }
3842             }
3843             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3844              * regcomp.sym */
3845             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3846                     sayNO;
3847             break;
3848         case ANYOFV:
3849         case ANYOF:
3850             if (utf8_target || state_num == ANYOFV) {
3851                 STRLEN inclasslen = PL_regeol - locinput;
3852                 if (locinput >= PL_regeol)
3853                     sayNO;
3854
3855                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3856                     sayNO;
3857                 locinput += inclasslen;
3858                 nextchr = UCHARAT(locinput);
3859                 break;
3860             }
3861             else {
3862                 if (nextchr < 0)
3863                     nextchr = UCHARAT(locinput);
3864                 if (!nextchr && locinput >= PL_regeol)
3865                     sayNO;
3866                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3867                     sayNO;
3868                 nextchr = UCHARAT(++locinput);
3869                 break;
3870             }
3871             break;
3872         /* Special char classes - The defines start on line 129 or so */
3873         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3874                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3875                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3876                   ALNUMA, NALNUMA, isWORDCHAR_A,
3877                   alnum, "a");
3878
3879         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3880                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3881                   SPACEU, NSPACEU, isSPACE_L1,
3882                   SPACEA, NSPACEA, isSPACE_A,
3883                   space, " ");
3884
3885         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3886                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3887                 DIGITA, NDIGITA, isDIGIT_A,
3888                 digit, "0");
3889
3890         case POSIXA:
3891             if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
3892                 sayNO;
3893             }
3894             /* Matched a utf8-invariant, so don't have to worry about utf8 */
3895             nextchr = UCHARAT(++locinput);
3896             break;
3897         case NPOSIXA:
3898             if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
3899                 sayNO;
3900             }
3901             if (utf8_target) {
3902                 locinput += PL_utf8skip[nextchr];
3903                 nextchr = UCHARAT(locinput);
3904             }
3905             else {
3906                 nextchr = UCHARAT(++locinput);
3907             }
3908             break;
3909
3910         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3911                        a Unicode extended Grapheme Cluster */
3912             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3913               extended Grapheme Cluster is:
3914
3915                CR LF
3916                | Prepend* Begin Extend*
3917                | .
3918
3919                Begin is:           ( Special_Begin | ! Control )
3920                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
3921                Extend is:          ( Grapheme_Extend | Spacing_Mark )
3922                Control is:         [ GCB_Control  CR  LF ]
3923                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
3924
3925                If we create a 'Regular_Begin' = Begin - Special_Begin, then
3926                we can rewrite
3927
3928                    Begin is ( Regular_Begin + Special Begin )
3929
3930                It turns out that 98.4% of all Unicode code points match
3931                Regular_Begin.  Doing it this way eliminates a table match in
3932                the previouls implementation for almost all Unicode code points.
3933
3934                There is a subtlety with Prepend* which showed up in testing.
3935                Note that the Begin, and only the Begin is required in:
3936                 | Prepend* Begin Extend*
3937                Also, Begin contains '! Control'.  A Prepend must be a
3938                '!  Control', which means it must also be a Begin.  What it
3939                comes down to is that if we match Prepend* and then find no
3940                suitable Begin afterwards, that if we backtrack the last
3941                Prepend, that one will be a suitable Begin.
3942             */
3943
3944             if (locinput >= PL_regeol)
3945                 sayNO;
3946             if  (! utf8_target) {
3947
3948                 /* Match either CR LF  or '.', as all the other possibilities
3949                  * require utf8 */
3950                 locinput++;         /* Match the . or CR */
3951                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3952                                        match the LF */
3953                     && locinput < PL_regeol
3954                     && UCHARAT(locinput) == '\n') locinput++;
3955             }
3956             else {
3957
3958                 /* Utf8: See if is ( CR LF ); already know that locinput <
3959                  * PL_regeol, so locinput+1 is in bounds */
3960                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3961                     locinput += 2;
3962                 }
3963                 else {
3964                     /* In case have to backtrack to beginning, then match '.' */
3965                     char *starting = locinput;
3966
3967                     /* In case have to backtrack the last prepend */
3968                     char *previous_prepend = 0;
3969
3970                     LOAD_UTF8_CHARCLASS_GCB();
3971
3972                     /* Match (prepend)*, but don't bother trying if empty (as
3973                      * being set to _undef indicates) */
3974                     if (PL_utf8_X_prepend != &PL_sv_undef) {
3975                         while (locinput < PL_regeol
3976                                && swash_fetch(PL_utf8_X_prepend,
3977                                               (U8*)locinput, utf8_target))
3978                         {
3979                             previous_prepend = locinput;
3980                             locinput += UTF8SKIP(locinput);
3981                         }
3982                     }
3983
3984                     /* As noted above, if we matched a prepend character, but
3985                      * the next thing won't match, back off the last prepend we
3986                      * matched, as it is guaranteed to match the begin */
3987                     if (previous_prepend
3988                         && (locinput >=  PL_regeol
3989                             || ! swash_fetch(PL_utf8_X_regular_begin,
3990                                              (U8*)locinput, utf8_target)))
3991                     {
3992                         locinput = previous_prepend;
3993                     }
3994
3995                     /* Note that here we know PL_regeol > locinput, as we
3996                      * tested that upon input to this switch case, and if we
3997                      * moved locinput forward, we tested the result just above
3998                      * and it either passed, or we backed off so that it will
3999                      * now pass */
4000                     if (swash_fetch(PL_utf8_X_regular_begin,
4001                                     (U8*)locinput, utf8_target)) {
4002                         locinput += UTF8SKIP(locinput);
4003                     }
4004                     else if (! swash_fetch(PL_utf8_X_special_begin,
4005                                         (U8*)locinput, utf8_target))
4006                         {
4007
4008                         /* Here did not match the required 'Begin' in the
4009                          * second term.  So just match the very first
4010                          * character, the '.' of the final term of the regex */
4011                         locinput = starting + UTF8SKIP(starting);
4012                         goto exit_utf8;
4013                     } else {
4014
4015                         /* Here is a special begin.  It can be composed of
4016                          * several individual characters.  One possibility is
4017                          * RI+ */
4018                         if (swash_fetch(PL_utf8_X_RI,
4019                                         (U8*)locinput, utf8_target))
4020                         {
4021                             locinput += UTF8SKIP(locinput);
4022                             while (locinput < PL_regeol
4023                                     && swash_fetch(PL_utf8_X_RI,
4024                                                     (U8*)locinput, utf8_target))
4025                             {
4026                                 locinput += UTF8SKIP(locinput);
4027                             }
4028                         } else /* Another possibility is T+ */
4029                                if (swash_fetch(PL_utf8_X_T,
4030                                                (U8*)locinput, utf8_target))
4031                         {
4032                             locinput += UTF8SKIP(locinput);
4033                             while (locinput < PL_regeol
4034                                     && swash_fetch(PL_utf8_X_T,
4035                                                     (U8*)locinput, utf8_target))
4036                             {
4037                                 locinput += UTF8SKIP(locinput);
4038                             }
4039                         } else {
4040
4041                             /* Here, neither RI+ nor T+; must be some other
4042                              * Hangul.  That means it is one of the others: L,
4043                              * LV, LVT or V, and matches:
4044                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4045
4046                             /* Match L*           */
4047                             while (locinput < PL_regeol
4048                                     && swash_fetch(PL_utf8_X_L,
4049                                                     (U8*)locinput, utf8_target))
4050                             {
4051                                 locinput += UTF8SKIP(locinput);
4052                             }
4053
4054                             /* Here, have exhausted L*.  If the next character
4055                              * is not an LV, LVT nor V, it means we had to have
4056                              * at least one L, so matches L+ in the original
4057                              * equation, we have a complete hangul syllable.
4058                              * Are done. */
4059
4060                             if (locinput < PL_regeol
4061                                 && swash_fetch(PL_utf8_X_LV_LVT_V,
4062                                                 (U8*)locinput, utf8_target))
4063                             {
4064
4065                                 /* Otherwise keep going.  Must be LV, LVT or V.
4066                                  * See if LVT */
4067                                 if (is_utf8_X_LVT((U8*)locinput)) {
4068                                     locinput += UTF8SKIP(locinput);
4069                                 } else {
4070
4071                                     /* Must be  V or LV.  Take it, then match
4072                                      * V*     */
4073                                     locinput += UTF8SKIP(locinput);
4074                                     while (locinput < PL_regeol
4075                                             && swash_fetch(PL_utf8_X_V,
4076                                                            (U8*)locinput,
4077                                                            utf8_target))
4078                                     {
4079                                         locinput += UTF8SKIP(locinput);
4080                                     }
4081                                 }
4082
4083                                 /* And any of LV, LVT, or V can be followed
4084                                     * by T*            */
4085                                 while (locinput < PL_regeol
4086                                         && swash_fetch(PL_utf8_X_T,
4087                                                         (U8*)locinput,
4088                                                         utf8_target))
4089                                 {
4090                                     locinput += UTF8SKIP(locinput);
4091                                 }
4092                             }
4093                         }
4094                     }
4095
4096                     /* Match any extender */
4097                     while (locinput < PL_regeol
4098                             && swash_fetch(PL_utf8_X_extend,
4099                                             (U8*)locinput, utf8_target))
4100                     {
4101                         locinput += UTF8SKIP(locinput);
4102                     }
4103                 }
4104             exit_utf8:
4105                 if (locinput > PL_regeol) sayNO;
4106             }
4107             nextchr = UCHARAT(locinput);
4108             break;
4109
4110         case NREFFL:
4111         {   /* The capture buffer cases.  The ones beginning with N for the
4112                named buffers just convert to the equivalent numbered and
4113                pretend they were called as the corresponding numbered buffer
4114                op.  */
4115             /* don't initialize these in the declaration, it makes C++
4116                unhappy */
4117             char *s;
4118             char type;
4119             re_fold_t folder;
4120             const U8 *fold_array;
4121             UV utf8_fold_flags;
4122
4123             PL_reg_flags |= RF_tainted;
4124             folder = foldEQ_locale;
4125             fold_array = PL_fold_locale;
4126             type = REFFL;
4127             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4128             goto do_nref;
4129
4130         case NREFFA:
4131             folder = foldEQ_latin1;
4132             fold_array = PL_fold_latin1;
4133             type = REFFA;
4134             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4135             goto do_nref;
4136
4137         case NREFFU:
4138             folder = foldEQ_latin1;
4139             fold_array = PL_fold_latin1;
4140             type = REFFU;
4141             utf8_fold_flags = 0;
4142             goto do_nref;
4143
4144         case NREFF:
4145             folder = foldEQ;
4146             fold_array = PL_fold;
4147             type = REFF;
4148             utf8_fold_flags = 0;
4149             goto do_nref;
4150
4151         case NREF:
4152             type = REF;
4153             folder = NULL;
4154             fold_array = NULL;
4155             utf8_fold_flags = 0;
4156           do_nref:
4157
4158             /* For the named back references, find the corresponding buffer
4159              * number */
4160             n = reg_check_named_buff_matched(rex,scan);
4161
4162             if ( ! n ) {
4163                 sayNO;
4164             }
4165             goto do_nref_ref_common;
4166
4167         case REFFL:
4168             PL_reg_flags |= RF_tainted;
4169             folder = foldEQ_locale;
4170             fold_array = PL_fold_locale;
4171             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4172             goto do_ref;
4173
4174         case REFFA:
4175             folder = foldEQ_latin1;
4176             fold_array = PL_fold_latin1;
4177             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4178             goto do_ref;
4179
4180         case REFFU:
4181             folder = foldEQ_latin1;
4182             fold_array = PL_fold_latin1;
4183             utf8_fold_flags = 0;
4184             goto do_ref;
4185
4186         case REFF:
4187             folder = foldEQ;
4188             fold_array = PL_fold;
4189             utf8_fold_flags = 0;
4190             goto do_ref;
4191
4192         case REF:
4193             folder = NULL;
4194             fold_array = NULL;
4195             utf8_fold_flags = 0;
4196
4197           do_ref:
4198             type = OP(scan);
4199             n = ARG(scan);  /* which paren pair */
4200
4201           do_nref_ref_common:
4202             ln = rex->offs[n].start;
4203             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4204             if (rex->lastparen < n || ln == -1)
4205                 sayNO;                  /* Do not match unless seen CLOSEn. */
4206             if (ln == rex->offs[n].end)
4207                 break;
4208
4209             s = PL_bostr + ln;
4210             if (type != REF     /* REF can do byte comparison */
4211                 && (utf8_target || type == REFFU))
4212             { /* XXX handle REFFL better */
4213                 char * limit = PL_regeol;
4214
4215                 /* This call case insensitively compares the entire buffer
4216                     * at s, with the current input starting at locinput, but
4217                     * not going off the end given by PL_regeol, and returns in
4218                     * limit upon success, how much of the current input was
4219                     * matched */
4220                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4221                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4222                 {
4223                     sayNO;
4224                 }
4225                 locinput = limit;
4226                 nextchr = UCHARAT(locinput);
4227                 break;
4228             }
4229
4230             /* Not utf8:  Inline the first character, for speed. */
4231             if (UCHARAT(s) != nextchr &&
4232                 (type == REF ||
4233                  UCHARAT(s) != fold_array[nextchr]))
4234                 sayNO;
4235             ln = rex->offs[n].end - ln;
4236             if (locinput + ln > PL_regeol)
4237                 sayNO;
4238             if (ln > 1 && (type == REF
4239                            ? memNE(s, locinput, ln)
4240                            : ! folder(s, locinput, ln)))
4241                 sayNO;
4242             locinput += ln;
4243             nextchr = UCHARAT(locinput);
4244             break;
4245         }
4246         case NOTHING:
4247         case TAIL:
4248             break;
4249         case BACK:
4250             break;
4251
4252 #undef  ST
4253 #define ST st->u.eval
4254         {
4255             SV *ret;
4256             REGEXP *re_sv;
4257             regexp *re;
4258             regexp_internal *rei;
4259             regnode *startpoint;
4260
4261         case GOSTART:
4262         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4263             if (cur_eval && cur_eval->locinput==locinput) {
4264                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4265                     Perl_croak(aTHX_ "Infinite recursion in regex");
4266                 if ( ++nochange_depth > max_nochange_depth )
4267                     Perl_croak(aTHX_
4268                         "Pattern subroutine nesting without pos change"
4269                         " exceeded limit in regex");
4270             } else {
4271                 nochange_depth = 0;
4272             }
4273             re_sv = rex_sv;
4274             re = rex;
4275             rei = rexi;
4276             if (OP(scan)==GOSUB) {
4277                 startpoint = scan + ARG2L(scan);
4278                 ST.close_paren = ARG(scan);
4279             } else {
4280                 startpoint = rei->program+1;
4281                 ST.close_paren = 0;
4282             }
4283             goto eval_recurse_doit;
4284             assert(0); /* NOTREACHED */
4285         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4286             if (cur_eval && cur_eval->locinput==locinput) {
4287                 if ( ++nochange_depth > max_nochange_depth )
4288                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4289             } else {
4290                 nochange_depth = 0;
4291             }
4292             {
4293                 /* execute the code in the {...} */
4294
4295                 dSP;
4296                 SV ** before;
4297                 OP * const oop = PL_op;
4298                 COP * const ocurcop = PL_curcop;
4299                 OP *nop;
4300                 char *saved_regeol = PL_regeol;
4301                 struct re_save_state saved_state;
4302                 CV *newcv;
4303
4304                 /* save *all* paren positions */
4305                 regcppush(rex, 0);
4306                 REGCP_SET(runops_cp);
4307
4308                 /* To not corrupt the existing regex state while executing the
4309                  * eval we would normally put it on the save stack, like with
4310                  * save_re_context. However, re-evals have a weird scoping so we
4311                  * can't just add ENTER/LEAVE here. With that, things like
4312                  *
4313                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4314                  *
4315                  * would break, as they expect the localisation to be unwound
4316                  * only when the re-engine backtracks through the bit that
4317                  * localised it.
4318                  *
4319                  * What we do instead is just saving the state in a local c
4320                  * variable.
4321                  */
4322                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4323
4324                 PL_reg_state.re_reparsing = FALSE;
4325
4326                 if (!caller_cv)
4327                     caller_cv = find_runcv(NULL);
4328
4329                 n = ARG(scan);
4330
4331                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4332                     newcv = ((struct regexp *)SvANY(
4333                                                 (REGEXP*)(rexi->data->data[n])
4334                                             ))->qr_anoncv
4335                                         ;
4336                     nop = (OP*)rexi->data->data[n+1];
4337                 }
4338                 else if (rexi->data->what[n] == 'l') { /* literal code */
4339                     newcv = caller_cv;
4340                     nop = (OP*)rexi->data->data[n];
4341                     assert(CvDEPTH(newcv));
4342                 }
4343                 else {
4344                     /* literal with own CV */
4345                     assert(rexi->data->what[n] == 'L');
4346                     newcv = rex->qr_anoncv;
4347                     nop = (OP*)rexi->data->data[n];
4348                 }
4349
4350                 /* normally if we're about to execute code from the same
4351                  * CV that we used previously, we just use the existing
4352                  * CX stack entry. However, its possible that in the
4353                  * meantime we may have backtracked, popped from the save
4354                  * stack, and undone the SAVECOMPPAD(s) associated with
4355                  * PUSH_MULTICALL; in which case PL_comppad no longer
4356                  * points to newcv's pad. */
4357                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4358                 {
4359                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4360                     if (last_pushed_cv) {
4361                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4362                     }
4363                     else {
4364                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4365                     }
4366                     last_pushed_cv = newcv;
4367                 }
4368                 last_pad = PL_comppad;
4369
4370                 /* the initial nextstate you would normally execute
4371                  * at the start of an eval (which would cause error
4372                  * messages to come from the eval), may be optimised
4373                  * away from the execution path in the regex code blocks;
4374                  * so manually set PL_curcop to it initially */
4375                 {
4376                     OP *o = cUNOPx(nop)->op_first;
4377                     assert(o->op_type == OP_NULL);
4378                     if (o->op_targ == OP_SCOPE) {
4379                         o = cUNOPo->op_first;
4380                     }
4381                     else {
4382                         assert(o->op_targ == OP_LEAVE);
4383                         o = cUNOPo->op_first;
4384                         assert(o->op_type == OP_ENTER);
4385                         o = o->op_sibling;
4386                     }
4387
4388                     if (o->op_type != OP_STUB) {
4389                         assert(    o->op_type == OP_NEXTSTATE
4390                                 || o->op_type == OP_DBSTATE
4391                                 || (o->op_type == OP_NULL
4392                                     &&  (  o->op_targ == OP_NEXTSTATE
4393                                         || o->op_targ == OP_DBSTATE
4394                                         )
4395                                     )
4396                         );
4397                         PL_curcop = (COP*)o;
4398                     }
4399                 }
4400                 nop = nop->op_next;
4401
4402                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4403                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4404
4405                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4406
4407                 if (sv_yes_mark) {
4408                     SV *sv_mrk = get_sv("REGMARK", 1);
4409                     sv_setsv(sv_mrk, sv_yes_mark);
4410                 }
4411
4412                 /* we don't use MULTICALL here as we want to call the
4413                  * first op of the block of interest, rather than the
4414                  * first op of the sub */
4415                 before = SP;
4416                 PL_op = nop;
4417                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4418                 SPAGAIN;
4419                 if (SP == before)
4420                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4421                 else {
4422                     ret = POPs;
4423                     PUTBACK;
4424                 }
4425
4426                 /* before restoring everything, evaluate the returned
4427                  * value, so that 'uninit' warnings don't use the wrong
4428                  * PL_op or pad. Also need to process any magic vars
4429                  * (e.g. $1) *before* parentheses are restored */
4430
4431                 PL_op = NULL;
4432
4433                 re_sv = NULL;
4434                 if (logical == 0)        /*   (?{})/   */
4435                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4436                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4437                     sw = cBOOL(SvTRUE(ret));
4438                     logical = 0;
4439                 }
4440                 else {                   /*  /(??{})  */
4441                     /*  if its overloaded, let the regex compiler handle
4442                      *  it; otherwise extract regex, or stringify  */
4443                     if (!SvAMAGIC(ret)) {
4444                         SV *sv = ret;
4445                         if (SvROK(sv))
4446                             sv = SvRV(sv);
4447                         if (SvTYPE(sv) == SVt_REGEXP)
4448                             re_sv = (REGEXP*) sv;
4449                         else if (SvSMAGICAL(sv)) {
4450                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4451                             if (mg)
4452                                 re_sv = (REGEXP *) mg->mg_obj;
4453                         }
4454
4455                         /* force any magic, undef warnings here */
4456                         if (!re_sv) {
4457                             ret = sv_mortalcopy(ret);
4458                             (void) SvPV_force_nolen(ret);
4459                         }
4460                     }
4461
4462                 }
4463
4464                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4465
4466                 /* *** Note that at this point we don't restore
4467                  * PL_comppad, (or pop the CxSUB) on the assumption it may
4468                  * be used again soon. This is safe as long as nothing
4469                  * in the regexp code uses the pad ! */
4470                 PL_op = oop;
4471                 PL_curcop = ocurcop;
4472                 PL_regeol = saved_regeol;
4473                 S_regcp_restore(aTHX_ rex, runops_cp);
4474
4475                 if (logical != 2)
4476                     break;
4477             }
4478
4479                 /* only /(??{})/  from now on */
4480                 logical = 0;
4481                 {
4482                     /* extract RE object from returned value; compiling if
4483                      * necessary */
4484
4485                     if (re_sv) {
4486                         re_sv = reg_temp_copy(NULL, re_sv);
4487                     }
4488                     else {
4489                         U32 pm_flags = 0;
4490                         const I32 osize = PL_regsize;
4491
4492                         if (SvUTF8(ret) && IN_BYTES) {
4493                             /* In use 'bytes': make a copy of the octet
4494                              * sequence, but without the flag on */
4495                             STRLEN len;
4496                             const char *const p = SvPV(ret, len);
4497                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4498                         }
4499                         if (rex->intflags & PREGf_USE_RE_EVAL)
4500                             pm_flags |= PMf_USE_RE_EVAL;
4501
4502                         /* if we got here, it should be an engine which
4503                          * supports compiling code blocks and stuff */
4504                         assert(rex->engine && rex->engine->op_comp);
4505                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4506                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4507                                     rex->engine, NULL, NULL,
4508                                     /* copy /msix etc to inner pattern */
4509                                     scan->flags,
4510                                     pm_flags);
4511
4512                         if (!(SvFLAGS(ret)
4513                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4514                                  | SVs_GMG))) {
4515                             /* This isn't a first class regexp. Instead, it's
4516                                caching a regexp onto an existing, Perl visible
4517                                scalar.  */
4518                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4519                         }
4520                         PL_regsize = osize;
4521                         /* safe to do now that any $1 etc has been
4522                          * interpolated into the new pattern string and
4523                          * compiled */
4524                         S_regcp_restore(aTHX_ rex, runops_cp);
4525                     }
4526                     re = (struct regexp *)SvANY(re_sv);
4527                 }
4528                 RXp_MATCH_COPIED_off(re);
4529                 re->subbeg = rex->subbeg;
4530                 re->sublen = rex->sublen;
4531                 rei = RXi_GET(re);
4532                 DEBUG_EXECUTE_r(
4533                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4534                         "Matching embedded");
4535                 );
4536                 startpoint = rei->program + 1;
4537                 ST.close_paren = 0; /* only used for GOSUB */
4538
4539         eval_recurse_doit: /* Share code with GOSUB below this line */
4540                 /* run the pattern returned from (??{...}) */
4541                 ST.cp = regcppush(rex, 0);      /* Save *all* the positions. */
4542                 REGCP_SET(ST.lastcp);
4543
4544                 re->lastparen = 0;
4545                 re->lastcloseparen = 0;
4546
4547                 PL_reginput = locinput;
4548                 PL_regsize = 0;
4549
4550                 /* XXXX This is too dramatic a measure... */
4551                 PL_reg_maxiter = 0;
4552
4553                 ST.toggle_reg_flags = PL_reg_flags;
4554                 if (RX_UTF8(re_sv))
4555                     PL_reg_flags |= RF_utf8;
4556                 else
4557                     PL_reg_flags &= ~RF_utf8;
4558                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4559
4560                 ST.prev_rex = rex_sv;
4561                 ST.prev_curlyx = cur_curlyx;
4562                 rex_sv = re_sv;
4563                 SET_reg_curpm(rex_sv);
4564                 rex = re;
4565                 rexi = rei;
4566                 cur_curlyx = NULL;
4567                 ST.B = next;
4568                 ST.prev_eval = cur_eval;
4569                 cur_eval = st;
4570                 /* now continue from first node in postoned RE */
4571                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4572                 assert(0); /* NOTREACHED */
4573         }
4574
4575         case EVAL_AB: /* cleanup after a successful (??{A})B */
4576             /* note: this is called twice; first after popping B, then A */
4577             PL_reg_flags ^= ST.toggle_reg_flags;
4578             rex_sv = ST.prev_rex;
4579             SET_reg_curpm(rex_sv);
4580             rex = (struct regexp *)SvANY(rex_sv);
4581             rexi = RXi_GET(rex);
4582             regcpblow(ST.cp);
4583             cur_eval = ST.prev_eval;
4584             cur_curlyx = ST.prev_curlyx;
4585
4586             /* XXXX This is too dramatic a measure... */
4587             PL_reg_maxiter = 0;
4588             if ( nochange_depth )
4589                 nochange_depth--;
4590             sayYES;
4591
4592
4593         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4594             /* note: this is called twice; first after popping B, then A */
4595             PL_reg_flags ^= ST.toggle_reg_flags;
4596             rex_sv = ST.prev_rex;
4597             SET_reg_curpm(rex_sv);
4598             rex = (struct regexp *)SvANY(rex_sv);
4599             rexi = RXi_GET(rex);
4600
4601             PL_reginput = locinput;
4602             REGCP_UNWIND(ST.lastcp);
4603             regcppop(rex);
4604             cur_eval = ST.prev_eval;
4605             cur_curlyx = ST.prev_curlyx;
4606             /* XXXX This is too dramatic a measure... */
4607             PL_reg_maxiter = 0;
4608             if ( nochange_depth )
4609                 nochange_depth--;
4610             sayNO_SILENT;
4611 #undef ST
4612
4613         case OPEN:
4614             n = ARG(scan);  /* which paren pair */
4615             rex->offs[n].start_tmp = locinput - PL_bostr;
4616             if (n > PL_regsize)
4617                 PL_regsize = n;
4618             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4619                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4620                 PTR2UV(rex),
4621                 PTR2UV(rex->offs),
4622                 (UV)n,
4623                 (IV)rex->offs[n].start_tmp,
4624                 (UV)PL_regsize
4625             ));
4626             lastopen = n;
4627             break;
4628
4629 /* XXX really need to log other places start/end are set too */
4630 #define CLOSE_CAPTURE \
4631     rex->offs[n].start = rex->offs[n].start_tmp; \
4632     rex->offs[n].end = locinput - PL_bostr; \
4633     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4634         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4635         PTR2UV(rex), \
4636         PTR2UV(rex->offs), \
4637         (UV)n, \
4638         (IV)rex->offs[n].start, \
4639         (IV)rex->offs[n].end \
4640     ))
4641
4642         case CLOSE:
4643             n = ARG(scan);  /* which paren pair */
4644             CLOSE_CAPTURE;
4645             /*if (n > PL_regsize)
4646                 PL_regsize = n;*/
4647             if (n > rex->lastparen)
4648                 rex->lastparen = n;
4649             rex->lastcloseparen = n;
4650             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4651                 goto fake_end;
4652             }
4653             break;
4654         case ACCEPT:
4655             if (ARG(scan)){
4656                 regnode *cursor;
4657                 for (cursor=scan;
4658                      cursor && OP(cursor)!=END;
4659                      cursor=regnext(cursor))
4660                 {
4661                     if ( OP(cursor)==CLOSE ){
4662                         n = ARG(cursor);
4663                         if ( n <= lastopen ) {
4664                             CLOSE_CAPTURE;
4665                             /*if (n > PL_regsize)
4666                             PL_regsize = n;*/
4667                             if (n > rex->lastparen)
4668                                 rex->lastparen = n;
4669                             rex->lastcloseparen = n;
4670                             if ( n == ARG(scan) || (cur_eval &&
4671                                 cur_eval->u.eval.close_paren == n))
4672                                 break;
4673                         }
4674                     }
4675                 }
4676             }
4677             goto fake_end;
4678             /*NOTREACHED*/
4679         case GROUPP:
4680             n = ARG(scan);  /* which paren pair */
4681             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4682             break;
4683         case NGROUPP:
4684             /* reg_check_named_buff_matched returns 0 for no match */
4685             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4686             break;
4687         case INSUBP:
4688             n = ARG(scan);
4689             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4690             break;
4691         case DEFINEP:
4692             sw = 0;
4693             break;
4694         case IFTHEN:
4695             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4696             if (sw)
4697                 next = NEXTOPER(NEXTOPER(scan));
4698             else {
4699                 next = scan + ARG(scan);
4700                 if (OP(next) == IFTHEN) /* Fake one. */
4701                     next = NEXTOPER(NEXTOPER(next));
4702             }
4703             break;
4704         case LOGICAL:
4705             logical = scan->flags;
4706             break;
4707
4708 /*******************************************************************
4709
4710 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4711 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4712 STAR/PLUS/CURLY/CURLYN are used instead.)
4713
4714 A*B is compiled as <CURLYX><A><WHILEM><B>
4715
4716 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4717 state, which contains the current count, initialised to -1. It also sets
4718 cur_curlyx to point to this state, with any previous value saved in the
4719 state block.
4720
4721 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4722 since the pattern may possibly match zero times (i.e. it's a while {} loop
4723 rather than a do {} while loop).
4724
4725 Each entry to WHILEM represents a successful match of A. The count in the
4726 CURLYX block is incremented, another WHILEM state is pushed, and execution
4727 passes to A or B depending on greediness and the current count.
4728
4729 For example, if matching against the string a1a2a3b (where the aN are
4730 substrings that match /A/), then the match progresses as follows: (the
4731 pushed states are interspersed with the bits of strings matched so far):
4732
4733     <CURLYX cnt=-1>
4734     <CURLYX cnt=0><WHILEM>
4735     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4736     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4737     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4738     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4739
4740 (Contrast this with something like CURLYM, which maintains only a single
4741 backtrack state:
4742
4743     <CURLYM cnt=0> a1
4744     a1 <CURLYM cnt=1> a2
4745     a1 a2 <CURLYM cnt=2> a3
4746     a1 a2 a3 <CURLYM cnt=3> b
4747 )
4748
4749 Each WHILEM state block marks a point to backtrack to upon partial failure
4750 of A or B, and also contains some minor state data related to that
4751 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4752 overall state, such as the count, and pointers to the A and B ops.
4753
4754 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4755 must always point to the *current* CURLYX block, the rules are:
4756
4757 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4758 and set cur_curlyx to point the new block.
4759
4760 When popping the CURLYX block after a successful or unsuccessful match,
4761 restore the previous cur_curlyx.
4762
4763 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4764 to the outer one saved in the CURLYX block.
4765
4766 When popping the WHILEM block after a successful or unsuccessful B match,
4767 restore the previous cur_curlyx.
4768
4769 Here's an example for the pattern (AI* BI)*BO
4770 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4771
4772 cur_
4773 curlyx backtrack stack
4774 ------ ---------------
4775 NULL
4776 CO     <CO prev=NULL> <WO>
4777 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4778 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4779 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4780
4781 At this point the pattern succeeds, and we work back down the stack to
4782 clean up, restoring as we go:
4783
4784 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4785 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4786 CO     <CO prev=NULL> <WO>
4787 NULL
4788
4789 *******************************************************************/
4790
4791 #define ST st->u.curlyx
4792
4793         case CURLYX:    /* start of /A*B/  (for complex A) */
4794         {
4795             /* No need to save/restore up to this paren */
4796             I32 parenfloor = scan->flags;
4797
4798             assert(next); /* keep Coverity happy */
4799             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4800                 next += ARG(next);
4801
4802             /* XXXX Probably it is better to teach regpush to support
4803                parenfloor > PL_regsize... */
4804             if (parenfloor > (I32)rex->lastparen)
4805                 parenfloor = rex->lastparen; /* Pessimization... */
4806
4807             ST.prev_curlyx= cur_curlyx;
4808             cur_curlyx = st;
4809             ST.cp = PL_savestack_ix;
4810
4811             /* these fields contain the state of the current curly.
4812              * they are accessed by subsequent WHILEMs */
4813             ST.parenfloor = parenfloor;
4814             ST.me = scan;
4815             ST.B = next;
4816             ST.minmod = minmod;
4817             minmod = 0;
4818             ST.count = -1;      /* this will be updated by WHILEM */
4819             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4820
4821             PL_reginput = locinput;
4822             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4823             assert(0); /* NOTREACHED */
4824         }
4825
4826         case CURLYX_end: /* just finished matching all of A*B */
4827             cur_curlyx = ST.prev_curlyx;
4828             sayYES;
4829             assert(0); /* NOTREACHED */
4830
4831         case CURLYX_end_fail: /* just failed to match all of A*B */
4832             regcpblow(ST.cp);
4833             cur_curlyx = ST.prev_curlyx;
4834             sayNO;
4835             assert(0); /* NOTREACHED */
4836
4837
4838 #undef ST
4839 #define ST st->u.whilem
4840
4841         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4842         {
4843             /* see the discussion above about CURLYX/WHILEM */
4844             I32 n;
4845             int min = ARG1(cur_curlyx->u.curlyx.me);
4846             int max = ARG2(cur_curlyx->u.curlyx.me);
4847             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4848
4849             assert(cur_curlyx); /* keep Coverity happy */
4850             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4851             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4852             ST.cache_offset = 0;
4853             ST.cache_mask = 0;
4854
4855             PL_reginput = locinput;
4856
4857             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4858                   "%*s  whilem: matched %ld out of %d..%d\n",
4859                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4860             );
4861
4862             /* First just match a string of min A's. */
4863
4864             if (n < min) {
4865                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4866                 cur_curlyx->u.curlyx.lastloc = locinput;
4867                 REGCP_SET(ST.lastcp);
4868
4869                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4870                 assert(0); /* NOTREACHED */
4871             }
4872
4873             /* If degenerate A matches "", assume A done. */
4874
4875             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4876                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4877                    "%*s  whilem: empty match detected, trying continuation...\n",
4878                    REPORT_CODE_OFF+depth*2, "")
4879                 );
4880                 goto do_whilem_B_max;
4881             }
4882
4883             /* super-linear cache processing */
4884
4885             if (scan->flags) {
4886
4887                 if (!PL_reg_maxiter) {
4888                     /* start the countdown: Postpone detection until we
4889                      * know the match is not *that* much linear. */
4890                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4891                     /* possible overflow for long strings and many CURLYX's */
4892                     if (PL_reg_maxiter < 0)
4893                         PL_reg_maxiter = I32_MAX;
4894                     PL_reg_leftiter = PL_reg_maxiter;
4895                 }
4896
4897                 if (PL_reg_leftiter-- == 0) {
4898                     /* initialise cache */
4899                     const I32 size = (PL_reg_maxiter + 7)/8;
4900                     if (PL_reg_poscache) {
4901                         if ((I32)PL_reg_poscache_size < size) {
4902                             Renew(PL_reg_poscache, size, char);
4903                             PL_reg_poscache_size = size;
4904                         }
4905                         Zero(PL_reg_poscache, size, char);
4906                     }
4907                     else {
4908                         PL_reg_poscache_size = size;
4909                         Newxz(PL_reg_poscache, size, char);
4910                     }
4911                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4912       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4913                               PL_colors[4], PL_colors[5])
4914                     );
4915                 }
4916
4917                 if (PL_reg_leftiter < 0) {
4918                     /* have we already failed at this position? */
4919                     I32 offset, mask;
4920                     offset  = (scan->flags & 0xf) - 1
4921                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4922                     mask    = 1 << (offset % 8);
4923                     offset /= 8;
4924                     if (PL_reg_poscache[offset] & mask) {
4925                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4926                             "%*s  whilem: (cache) already tried at this position...\n",
4927                             REPORT_CODE_OFF+depth*2, "")
4928                         );
4929                         sayNO; /* cache records failure */
4930                     }
4931                     ST.cache_offset = offset;
4932                     ST.cache_mask   = mask;
4933                 }
4934             }
4935
4936             /* Prefer B over A for minimal matching. */
4937
4938             if (cur_curlyx->u.curlyx.minmod) {
4939                 ST.save_curlyx = cur_curlyx;
4940                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4941                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
4942                 REGCP_SET(ST.lastcp);
4943                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4944                 assert(0); /* NOTREACHED */
4945             }
4946
4947             /* Prefer A over B for maximal matching. */
4948
4949             if (n < max) { /* More greed allowed? */
4950                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4951                 cur_curlyx->u.curlyx.lastloc = locinput;
4952                 REGCP_SET(ST.lastcp);
4953                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4954                 assert(0); /* NOTREACHED */
4955             }
4956             goto do_whilem_B_max;
4957         }
4958         assert(0); /* NOTREACHED */
4959
4960         case WHILEM_B_min: /* just matched B in a minimal match */
4961         case WHILEM_B_max: /* just matched B in a maximal match */
4962             cur_curlyx = ST.save_curlyx;
4963             sayYES;
4964             assert(0); /* NOTREACHED */
4965
4966         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4967             cur_curlyx = ST.save_curlyx;
4968             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4969             cur_curlyx->u.curlyx.count--;
4970             CACHEsayNO;
4971             assert(0); /* NOTREACHED */
4972
4973         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4974             /* FALL THROUGH */
4975         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4976             REGCP_UNWIND(ST.lastcp);
4977             regcppop(rex);
4978             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4979             cur_curlyx->u.curlyx.count--;
4980             CACHEsayNO;
4981             assert(0); /* NOTREACHED */
4982
4983         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4984             REGCP_UNWIND(ST.lastcp);
4985             regcppop(rex);      /* Restore some previous $<digit>s? */
4986             PL_reginput = locinput;
4987             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4988                 "%*s  whilem: failed, trying continuation...\n",
4989                 REPORT_CODE_OFF+depth*2, "")
4990             );
4991           do_whilem_B_max:
4992             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4993                 && ckWARN(WARN_REGEXP)
4994                 && !(PL_reg_flags & RF_warned))
4995             {
4996                 PL_reg_flags |= RF_warned;
4997                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4998                      "Complex regular subexpression recursion limit (%d) "
4999                      "exceeded",
5000                      REG_INFTY - 1);
5001             }
5002
5003             /* now try B */
5004             ST.save_curlyx = cur_curlyx;
5005             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5006             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
5007             assert(0); /* NOTREACHED */
5008
5009         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5010             cur_curlyx = ST.save_curlyx;
5011             REGCP_UNWIND(ST.lastcp);
5012             regcppop(rex);
5013
5014             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5015                 /* Maximum greed exceeded */
5016                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5017                     && ckWARN(WARN_REGEXP)
5018                     && !(PL_reg_flags & RF_warned))
5019                 {
5020                     PL_reg_flags |= RF_warned;
5021                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5022                         "Complex regular subexpression recursion "
5023                         "limit (%d) exceeded",
5024                         REG_INFTY - 1);
5025                 }
5026                 cur_curlyx->u.curlyx.count--;
5027                 CACHEsayNO;
5028             }
5029
5030             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5031                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5032             );
5033             /* Try grabbing another A and see if it helps. */
5034             PL_reginput = locinput;
5035             cur_curlyx->u.curlyx.lastloc = locinput;
5036             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5037             REGCP_SET(ST.lastcp);
5038             PUSH_STATE_GOTO(WHILEM_A_min,
5039                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5040             assert(0); /* NOTREACHED */
5041
5042 #undef  ST
5043 #define ST st->u.branch
5044
5045         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5046             next = scan + ARG(scan);
5047             if (next == scan)
5048                 next = NULL;
5049             scan = NEXTOPER(scan);
5050             /* FALL THROUGH */
5051
5052         case BRANCH:        /*  /(...|A|...)/ */
5053             scan = NEXTOPER(scan); /* scan now points to inner node */
5054             ST.lastparen = rex->lastparen;
5055             ST.lastcloseparen = rex->lastcloseparen;
5056             ST.next_branch = next;
5057             REGCP_SET(ST.cp);
5058             PL_reginput = locinput;
5059
5060             /* Now go into the branch */
5061             if (has_cutgroup) {
5062                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5063             } else {
5064                 PUSH_STATE_GOTO(BRANCH_next, scan);
5065             }
5066             assert(0); /* NOTREACHED */
5067         case CUTGROUP:
5068             PL_reginput = locinput;
5069             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5070                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5071             PUSH_STATE_GOTO(CUTGROUP_next,next);
5072             assert(0); /* NOTREACHED */
5073         case CUTGROUP_next_fail:
5074             do_cutgroup = 1;
5075             no_final = 1;
5076             if (st->u.mark.mark_name)
5077                 sv_commit = st->u.mark.mark_name;
5078             sayNO;
5079             assert(0); /* NOTREACHED */
5080         case BRANCH_next:
5081             sayYES;
5082             assert(0); /* NOTREACHED */
5083         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5084             if (do_cutgroup) {
5085                 do_cutgroup = 0;
5086                 no_final = 0;
5087             }
5088             REGCP_UNWIND(ST.cp);
5089             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5090             scan = ST.next_branch;
5091             /* no more branches? */
5092             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5093                 DEBUG_EXECUTE_r({
5094                     PerlIO_printf( Perl_debug_log,
5095                         "%*s  %sBRANCH failed...%s\n",
5096                         REPORT_CODE_OFF+depth*2, "",
5097                         PL_colors[4],
5098                         PL_colors[5] );
5099                 });
5100                 sayNO_SILENT;
5101             }
5102             continue; /* execute next BRANCH[J] op */
5103             assert(0); /* NOTREACHED */
5104
5105         case MINMOD:
5106             minmod = 1;
5107             break;
5108
5109 #undef  ST
5110 #define ST st->u.curlym
5111
5112         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5113
5114             /* This is an optimisation of CURLYX that enables us to push
5115              * only a single backtracking state, no matter how many matches
5116              * there are in {m,n}. It relies on the pattern being constant
5117              * length, with no parens to influence future backrefs
5118              */
5119
5120             ST.me = scan;
5121             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5122
5123             ST.lastparen      = rex->lastparen;
5124             ST.lastcloseparen = rex->lastcloseparen;
5125
5126             /* if paren positive, emulate an OPEN/CLOSE around A */
5127             if (ST.me->flags) {
5128                 U32 paren = ST.me->flags;
5129                 if (paren > PL_regsize)
5130                     PL_regsize = paren;
5131                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5132             }
5133             ST.A = scan;
5134             ST.B = next;
5135             ST.alen = 0;
5136             ST.count = 0;
5137             ST.minmod = minmod;
5138             minmod = 0;
5139             ST.c1 = CHRTEST_UNINIT;
5140             REGCP_SET(ST.cp);
5141
5142             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5143                 goto curlym_do_B;
5144
5145           curlym_do_A: /* execute the A in /A{m,n}B/  */
5146             PL_reginput = locinput;
5147             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5148             assert(0); /* NOTREACHED */
5149
5150         case CURLYM_A: /* we've just matched an A */
5151             locinput = st->locinput;
5152             nextchr = UCHARAT(locinput);
5153
5154             ST.count++;
5155             /* after first match, determine A's length: u.curlym.alen */
5156             if (ST.count == 1) {
5157                 if (PL_reg_match_utf8) {
5158                     char *s = locinput;
5159                     while (s < PL_reginput) {
5160                         ST.alen++;
5161                         s += UTF8SKIP(s);
5162                     }
5163                 }
5164                 else {
5165                     ST.alen = PL_reginput - locinput;
5166                 }
5167                 if (ST.alen == 0)
5168                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5169             }
5170             DEBUG_EXECUTE_r(
5171                 PerlIO_printf(Perl_debug_log,
5172                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5173                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5174                           (IV) ST.count, (IV)ST.alen)
5175             );
5176
5177             locinput = PL_reginput;
5178
5179             if (cur_eval && cur_eval->u.eval.close_paren &&
5180                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5181                 goto fake_end;
5182
5183             {
5184                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5185                 if ( max == REG_INFTY || ST.count < max )
5186                     goto curlym_do_A; /* try to match another A */
5187             }
5188             goto curlym_do_B; /* try to match B */
5189
5190         case CURLYM_A_fail: /* just failed to match an A */
5191             REGCP_UNWIND(ST.cp);
5192
5193             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5194                 || (cur_eval && cur_eval->u.eval.close_paren &&
5195                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5196                 sayNO;
5197
5198           curlym_do_B: /* execute the B in /A{m,n}B/  */
5199             PL_reginput = locinput;
5200             if (ST.c1 == CHRTEST_UNINIT) {
5201                 /* calculate c1 and c2 for possible match of 1st char
5202                  * following curly */
5203                 ST.c1 = ST.c2 = CHRTEST_VOID;
5204                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5205                     regnode *text_node = ST.B;
5206                     if (! HAS_TEXT(text_node))
5207                         FIND_NEXT_IMPT(text_node);
5208                     /* this used to be
5209
5210                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5211
5212                         But the former is redundant in light of the latter.
5213
5214                         if this changes back then the macro for
5215                         IS_TEXT and friends need to change.
5216                      */
5217                     if (PL_regkind[OP(text_node)] == EXACT)
5218                     {
5219
5220                         ST.c1 = (U8)*STRING(text_node);
5221                         switch (OP(text_node)) {
5222                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5223                             case EXACTFA:
5224                             case EXACTFU_SS:
5225                             case EXACTFU_TRICKYFOLD:
5226                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5227                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5228                             default: ST.c2 = ST.c1;
5229                         }
5230                     }
5231                 }
5232             }
5233
5234             DEBUG_EXECUTE_r(
5235                 PerlIO_printf(Perl_debug_log,
5236                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5237                     (int)(REPORT_CODE_OFF+(depth*2)),
5238                     "", (IV)ST.count)
5239                 );
5240             if (ST.c1 != CHRTEST_VOID
5241                     && UCHARAT(PL_reginput) != ST.c1
5242                     && UCHARAT(PL_reginput) != ST.c2)
5243             {
5244                 /* simulate B failing */
5245                 DEBUG_OPTIMISE_r(
5246                     PerlIO_printf(Perl_debug_log,
5247                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5248                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5249                         (IV)ST.c1,(IV)ST.c2
5250                 ));
5251                 state_num = CURLYM_B_fail;
5252                 goto reenter_switch;
5253             }
5254
5255             if (ST.me->flags) {
5256                 /* emulate CLOSE: mark current A as captured */
5257                 I32 paren = ST.me->flags;
5258                 if (ST.count) {
5259                     rex->offs[paren].start
5260                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5261                     rex->offs[paren].end = PL_reginput - PL_bostr;
5262                     if ((U32)paren > rex->lastparen)
5263                         rex->lastparen = paren;
5264                     rex->lastcloseparen = paren;
5265                 }
5266                 else
5267                     rex->offs[paren].end = -1;
5268                 if (cur_eval && cur_eval->u.eval.close_paren &&
5269                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5270                 {
5271                     if (ST.count)
5272                         goto fake_end;
5273                     else
5274                         sayNO;
5275                 }
5276             }
5277
5278             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5279             assert(0); /* NOTREACHED */
5280
5281         case CURLYM_B_fail: /* just failed to match a B */
5282             REGCP_UNWIND(ST.cp);
5283             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5284             if (ST.minmod) {
5285                 I32 max = ARG2(ST.me);
5286                 if (max != REG_INFTY && ST.count == max)
5287                     sayNO;
5288                 goto curlym_do_A; /* try to match a further A */
5289             }
5290             /* backtrack one A */
5291             if (ST.count == ARG1(ST.me) /* min */)
5292                 sayNO;
5293             ST.count--;
5294             locinput = HOPc(locinput, -ST.alen);
5295             goto curlym_do_B; /* try to match B */
5296
5297 #undef ST
5298 #define ST st->u.curly
5299
5300 #define CURLY_SETPAREN(paren, success) \
5301     if (paren) { \
5302         if (success) { \
5303             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5304             rex->offs[paren].end = locinput - PL_bostr; \
5305             if (paren > rex->lastparen) \
5306                 rex->lastparen = paren; \
5307             rex->lastcloseparen = paren; \
5308         } \
5309         else { \
5310             rex->offs[paren].end = -1; \
5311             rex->lastparen      = ST.lastparen; \
5312             rex->lastcloseparen = ST.lastcloseparen; \
5313         } \
5314     }
5315
5316         case STAR:              /*  /A*B/ where A is width 1 */
5317             ST.paren = 0;
5318             ST.min = 0;
5319             ST.max = REG_INFTY;
5320             scan = NEXTOPER(scan);
5321             goto repeat;
5322         case PLUS:              /*  /A+B/ where A is width 1 */
5323             ST.paren = 0;
5324             ST.min = 1;
5325             ST.max = REG_INFTY;
5326             scan = NEXTOPER(scan);
5327             goto repeat;
5328         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5329             ST.paren = scan->flags;     /* Which paren to set */
5330             ST.lastparen      = rex->lastparen;
5331             ST.lastcloseparen = rex->lastcloseparen;
5332             if (ST.paren > PL_regsize)
5333                 PL_regsize = ST.paren;
5334             ST.min = ARG1(scan);  /* min to match */
5335             ST.max = ARG2(scan);  /* max to match */
5336             if (cur_eval && cur_eval->u.eval.close_paren &&
5337                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5338                 ST.min=1;
5339                 ST.max=1;
5340             }
5341             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5342             goto repeat;
5343         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5344             ST.paren = 0;
5345             ST.min = ARG1(scan);  /* min to match */
5346             ST.max = ARG2(scan);  /* max to match */
5347             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5348           repeat:
5349             /*
5350             * Lookahead to avoid useless match attempts
5351             * when we know what character comes next.
5352             *
5353             * Used to only do .*x and .*?x, but now it allows
5354             * for )'s, ('s and (?{ ... })'s to be in the way
5355             * of the quantifier and the EXACT-like node.  -- japhy
5356             */
5357
5358             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5359                 sayNO;
5360             if (HAS_TEXT(next) || JUMPABLE(next)) {
5361                 U8 *s;
5362                 regnode *text_node = next;
5363
5364                 if (! HAS_TEXT(text_node))
5365                     FIND_NEXT_IMPT(text_node);
5366
5367                 if (! HAS_TEXT(text_node))
5368                     ST.c1 = ST.c2 = CHRTEST_VOID;
5369                 else {
5370                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5371                         ST.c1 = ST.c2 = CHRTEST_VOID;
5372                         goto assume_ok_easy;
5373                     }
5374                     else
5375                         s = (U8*)STRING(text_node);
5376
5377                     /*  Currently we only get here when
5378
5379                         PL_rekind[OP(text_node)] == EXACT
5380
5381                         if this changes back then the macro for IS_TEXT and
5382                         friends need to change. */
5383                     if (!UTF_PATTERN) {
5384                         ST.c1 = *s;
5385                         switch (OP(text_node)) {
5386                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5387                             case EXACTFA:
5388                             case EXACTFU_SS:
5389                             case EXACTFU_TRICKYFOLD:
5390                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5391                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5392                             default: ST.c2 = ST.c1; break;
5393                         }
5394                     }
5395                     else { /* UTF_PATTERN */
5396                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5397                              STRLEN ulen;
5398                              U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5399
5400                              to_utf8_fold((U8*)s, tmpbuf, &ulen);
5401                              ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5402                                                     uniflags);
5403                         }
5404                         else {
5405                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5406                                                      uniflags);
5407                         }
5408                     }
5409                 }
5410             }
5411             else
5412                 ST.c1 = ST.c2 = CHRTEST_VOID;
5413         assume_ok_easy:
5414
5415             ST.A = scan;
5416             ST.B = next;
5417             PL_reginput = locinput;
5418             if (minmod) {
5419                 minmod = 0;
5420                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5421                     sayNO;
5422                 ST.count = ST.min;
5423                 locinput = PL_reginput;
5424                 REGCP_SET(ST.cp);
5425                 if (ST.c1 == CHRTEST_VOID)
5426                     goto curly_try_B_min;
5427
5428                 ST.oldloc = locinput;
5429
5430                 /* set ST.maxpos to the furthest point along the
5431                  * string that could possibly match */
5432                 if  (ST.max == REG_INFTY) {
5433                     ST.maxpos = PL_regeol - 1;
5434                     if (utf8_target)
5435                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5436                             ST.maxpos--;
5437                 }
5438                 else if (utf8_target) {
5439                     int m = ST.max - ST.min;
5440                     for (ST.maxpos = locinput;
5441                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5442                         ST.maxpos += UTF8SKIP(ST.maxpos);
5443                 }
5444                 else {
5445                     ST.maxpos = locinput + ST.max - ST.min;
5446                     if (ST.maxpos >= PL_regeol)
5447                         ST.maxpos = PL_regeol - 1;
5448                 }
5449                 goto curly_try_B_min_known;
5450
5451             }
5452             else {
5453                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5454                 locinput = PL_reginput;
5455                 if (ST.count < ST.min)
5456                     sayNO;
5457                 if ((ST.count > ST.min)
5458                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5459                 {
5460                     /* A{m,n} must come at the end of the string, there's
5461                      * no point in backing off ... */
5462                     ST.min = ST.count;
5463                     /* ...except that $ and \Z can match before *and* after
5464                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5465                        We may back off by one in this case. */
5466                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5467                         ST.min--;
5468                 }
5469                 REGCP_SET(ST.cp);
5470                 goto curly_try_B_max;
5471             }
5472             assert(0); /* NOTREACHED */
5473
5474
5475         case CURLY_B_min_known_fail:
5476             /* failed to find B in a non-greedy match where c1,c2 valid */
5477
5478             PL_reginput = locinput;     /* Could be reset... */
5479             REGCP_UNWIND(ST.cp);
5480             if (ST.paren) {
5481                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5482             }
5483             /* Couldn't or didn't -- move forward. */
5484             ST.oldloc = locinput;
5485             if (utf8_target)
5486                 locinput += UTF8SKIP(locinput);
5487             else
5488                 locinput++;
5489             ST.count++;
5490           curly_try_B_min_known:
5491              /* find the next place where 'B' could work, then call B */
5492             {
5493                 int n;
5494                 if (utf8_target) {
5495                     n = (ST.oldloc == locinput) ? 0 : 1;
5496                     if (ST.c1 == ST.c2) {
5497                         STRLEN len;
5498                         /* set n to utf8_distance(oldloc, locinput) */
5499                         while (locinput <= ST.maxpos &&
5500                                utf8n_to_uvchr((U8*)locinput,
5501                                               UTF8_MAXBYTES, &len,
5502                                               uniflags) != (UV)ST.c1) {
5503                             locinput += len;
5504                             n++;
5505                         }
5506                     }
5507                     else {
5508                         /* set n to utf8_distance(oldloc, locinput) */
5509                         while (locinput <= ST.maxpos) {
5510                             STRLEN len;
5511                             const UV c = utf8n_to_uvchr((U8*)locinput,
5512                                                   UTF8_MAXBYTES, &len,
5513                                                   uniflags);
5514                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5515                                 break;
5516                             locinput += len;
5517                             n++;
5518                         }
5519                     }
5520                 }
5521                 else {
5522                     if (ST.c1 == ST.c2) {
5523                         while (locinput <= ST.maxpos &&
5524                                UCHARAT(locinput) != ST.c1)
5525                             locinput++;
5526                     }
5527                     else {
5528                         while (locinput <= ST.maxpos
5529                                && UCHARAT(locinput) != ST.c1
5530                                && UCHARAT(locinput) != ST.c2)
5531                             locinput++;
5532                     }
5533                     n = locinput - ST.oldloc;
5534                 }
5535                 if (locinput > ST.maxpos)
5536                     sayNO;
5537                 /* PL_reginput == oldloc now */
5538                 if (n) {
5539                     ST.count += n;
5540                     if (regrepeat(rex, ST.A, n, depth) < n)
5541                         sayNO;
5542                 }
5543                 PL_reginput = locinput;
5544                 CURLY_SETPAREN(ST.paren, ST.count);
5545                 if (cur_eval && cur_eval->u.eval.close_paren &&
5546                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5547                     goto fake_end;
5548                 }
5549                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5550             }
5551             assert(0); /* NOTREACHED */
5552
5553
5554         case CURLY_B_min_fail:
5555             /* failed to find B in a non-greedy match where c1,c2 invalid */
5556
5557             REGCP_UNWIND(ST.cp);
5558             if (ST.paren) {
5559                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5560             }
5561             /* failed -- move forward one */
5562             PL_reginput = locinput;
5563             if (regrepeat(rex, ST.A, 1, depth)) {
5564                 ST.count++;
5565                 locinput = PL_reginput;
5566                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5567                         ST.count > 0)) /* count overflow ? */
5568                 {
5569                   curly_try_B_min:
5570                     CURLY_SETPAREN(ST.paren, ST.count);
5571                     if (cur_eval && cur_eval->u.eval.close_paren &&
5572                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5573                         goto fake_end;
5574                     }
5575                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5576                 }
5577             }
5578             sayNO;
5579             assert(0); /* NOTREACHED */
5580
5581
5582         curly_try_B_max:
5583             /* a successful greedy match: now try to match B */
5584             if (cur_eval && cur_eval->u.eval.close_paren &&
5585                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5586                 goto fake_end;
5587             }
5588             {
5589                 UV c = 0;
5590                 if (ST.c1 != CHRTEST_VOID)
5591                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5592                                            UTF8_MAXBYTES, 0, uniflags)
5593                                 : (UV) UCHARAT(PL_reginput);
5594                 /* If it could work, try it. */
5595                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5596                     CURLY_SETPAREN(ST.paren, ST.count);
5597                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5598                     assert(0); /* NOTREACHED */
5599                 }
5600             }
5601             /* FALL THROUGH */
5602         case CURLY_B_max_fail:
5603             /* failed to find B in a greedy match */
5604
5605             REGCP_UNWIND(ST.cp);
5606             if (ST.paren) {
5607                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5608             }
5609             /*  back up. */
5610             if (--ST.count < ST.min)
5611                 sayNO;
5612             PL_reginput = locinput = HOPc(locinput, -1);
5613             goto curly_try_B_max;
5614
5615 #undef ST
5616
5617         case END:
5618             fake_end:
5619             if (cur_eval) {
5620                 /* we've just finished A in /(??{A})B/; now continue with B */
5621                 st->u.eval.toggle_reg_flags
5622                             = cur_eval->u.eval.toggle_reg_flags;
5623                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5624
5625                 st->u.eval.prev_rex = rex_sv;           /* inner */
5626                 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5627                 rex_sv = cur_eval->u.eval.prev_rex;
5628                 SET_reg_curpm(rex_sv);
5629                 rex = (struct regexp *)SvANY(rex_sv);
5630                 rexi = RXi_GET(rex);
5631                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5632
5633                 REGCP_SET(st->u.eval.lastcp);
5634                 PL_reginput = locinput;
5635
5636                 /* Restore parens of the outer rex without popping the
5637                  * savestack */
5638                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5639
5640                 st->u.eval.prev_eval = cur_eval;
5641                 cur_eval = cur_eval->u.eval.prev_eval;
5642                 DEBUG_EXECUTE_r(
5643                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5644                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5645                 if ( nochange_depth )
5646                     nochange_depth--;
5647
5648                 PUSH_YES_STATE_GOTO(EVAL_AB,
5649                         st->u.eval.prev_eval->u.eval.B); /* match B */
5650             }
5651
5652             if (locinput < reginfo->till) {
5653                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5654                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5655                                       PL_colors[4],
5656                                       (long)(locinput - PL_reg_starttry),
5657                                       (long)(reginfo->till - PL_reg_starttry),
5658                                       PL_colors[5]));
5659
5660                 sayNO_SILENT;           /* Cannot match: too short. */
5661             }
5662             PL_reginput = locinput;     /* put where regtry can find it */
5663             sayYES;                     /* Success! */
5664
5665         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5666             DEBUG_EXECUTE_r(
5667             PerlIO_printf(Perl_debug_log,
5668                 "%*s  %ssubpattern success...%s\n",
5669                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5670             PL_reginput = locinput;     /* put where regtry can find it */
5671             sayYES;                     /* Success! */
5672
5673 #undef  ST
5674 #define ST st->u.ifmatch
5675
5676         case SUSPEND:   /* (?>A) */
5677             ST.wanted = 1;
5678             PL_reginput = locinput;
5679             goto do_ifmatch;
5680
5681         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5682             ST.wanted = 0;
5683             goto ifmatch_trivial_fail_test;
5684
5685         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5686             ST.wanted = 1;
5687           ifmatch_trivial_fail_test:
5688             if (scan->flags) {
5689                 char * const s = HOPBACKc(locinput, scan->flags);
5690                 if (!s) {
5691                     /* trivial fail */
5692                     if (logical) {
5693                         logical = 0;
5694                         sw = 1 - cBOOL(ST.wanted);
5695                     }
5696                     else if (ST.wanted)
5697                         sayNO;
5698                     next = scan + ARG(scan);
5699                     if (next == scan)
5700                         next = NULL;
5701                     break;
5702                 }
5703                 PL_reginput = s;
5704             }
5705             else
5706                 PL_reginput = locinput;
5707
5708           do_ifmatch:
5709             ST.me = scan;
5710             ST.logical = logical;
5711             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5712
5713             /* execute body of (?...A) */
5714             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5715             assert(0); /* NOTREACHED */
5716
5717         case IFMATCH_A_fail: /* body of (?...A) failed */
5718             ST.wanted = !ST.wanted;
5719             /* FALL THROUGH */
5720
5721         case IFMATCH_A: /* body of (?...A) succeeded */
5722             if (ST.logical) {
5723                 sw = cBOOL(ST.wanted);
5724             }
5725             else if (!ST.wanted)
5726                 sayNO;
5727
5728             if (OP(ST.me) == SUSPEND)
5729                 locinput = PL_reginput;
5730             else {
5731                 locinput = PL_reginput = st->locinput;
5732                 nextchr = UCHARAT(locinput);
5733             }
5734             scan = ST.me + ARG(ST.me);
5735             if (scan == ST.me)
5736                 scan = NULL;
5737             continue; /* execute B */
5738
5739 #undef ST
5740
5741         case LONGJMP:
5742             next = scan + ARG(scan);
5743             if (next == scan)
5744                 next = NULL;
5745             break;
5746         case COMMIT:
5747             reginfo->cutpoint = PL_regeol;
5748             /* FALLTHROUGH */
5749         case PRUNE:
5750             PL_reginput = locinput;
5751             if (!scan->flags)
5752                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5753             PUSH_STATE_GOTO(COMMIT_next,next);
5754             assert(0); /* NOTREACHED */
5755         case COMMIT_next_fail:
5756             no_final = 1;
5757             /* FALLTHROUGH */
5758         case OPFAIL:
5759             sayNO;
5760             assert(0); /* NOTREACHED */
5761
5762 #define ST st->u.mark
5763         case MARKPOINT:
5764             ST.prev_mark = mark_state;
5765             ST.mark_name = sv_commit = sv_yes_mark
5766                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5767             mark_state = st;
5768             ST.mark_loc = PL_reginput = locinput;
5769             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5770             assert(0); /* NOTREACHED */
5771         case MARKPOINT_next:
5772             mark_state = ST.prev_mark;
5773             sayYES;
5774             assert(0); /* NOTREACHED */
5775         case MARKPOINT_next_fail:
5776             if (popmark && sv_eq(ST.mark_name,popmark))
5777             {
5778                 if (ST.mark_loc > startpoint)
5779                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5780                 popmark = NULL; /* we found our mark */
5781                 sv_commit = ST.mark_name;
5782
5783                 DEBUG_EXECUTE_r({
5784                         PerlIO_printf(Perl_debug_log,
5785                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5786                             REPORT_CODE_OFF+depth*2, "",
5787                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5788                 });
5789             }
5790             mark_state = ST.prev_mark;
5791             sv_yes_mark = mark_state ?
5792                 mark_state->u.mark.mark_name : NULL;
5793             sayNO;
5794             assert(0); /* NOTREACHED */
5795         case SKIP:
5796             PL_reginput = locinput;
5797             if (scan->flags) {
5798                 /* (*SKIP) : if we fail we cut here*/
5799                 ST.mark_name = NULL;
5800                 ST.mark_loc = locinput;
5801                 PUSH_STATE_GOTO(SKIP_next,next);
5802             } else {
5803                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5804                    otherwise do nothing.  Meaning we need to scan
5805                  */
5806                 regmatch_state *cur = mark_state;
5807                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5808
5809                 while (cur) {
5810                     if ( sv_eq( cur->u.mark.mark_name,
5811                                 find ) )
5812                     {
5813                         ST.mark_name = find;
5814                         PUSH_STATE_GOTO( SKIP_next, next );
5815                     }
5816                     cur = cur->u.mark.prev_mark;
5817                 }
5818             }
5819             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5820             break;
5821         case SKIP_next_fail:
5822             if (ST.mark_name) {
5823                 /* (*CUT:NAME) - Set up to search for the name as we
5824                    collapse the stack*/
5825                 popmark = ST.mark_name;
5826             } else {
5827                 /* (*CUT) - No name, we cut here.*/
5828                 if (ST.mark_loc > startpoint)
5829                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5830                 /* but we set sv_commit to latest mark_name if there
5831                    is one so they can test to see how things lead to this
5832                    cut */
5833                 if (mark_state)
5834                     sv_commit=mark_state->u.mark.mark_name;
5835             }
5836             no_final = 1;
5837             sayNO;
5838             assert(0); /* NOTREACHED */
5839 #undef ST
5840         case LNBREAK:
5841             if ((n=is_LNBREAK(locinput,utf8_target))) {
5842                 locinput += n;
5843                 nextchr = UCHARAT(locinput);
5844             } else
5845                 sayNO;
5846             break;
5847
5848 #define CASE_CLASS(nAmE)                              \
5849         case nAmE:                                    \
5850             if (locinput >= PL_regeol)                \
5851                 sayNO;                                \
5852             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5853                 locinput += n;                        \
5854                 nextchr = UCHARAT(locinput);          \
5855             } else                                    \
5856                 sayNO;                                \
5857             break;                                    \
5858         case N##nAmE:                                 \
5859             if (locinput >= PL_regeol)                \
5860                 sayNO;                                \
5861             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5862                 sayNO;                                \
5863             } else {                                  \
5864                 locinput += UTF8SKIP(locinput);       \
5865                 nextchr = UCHARAT(locinput);          \
5866             }                                         \
5867             break
5868
5869         CASE_CLASS(VERTWS);
5870         CASE_CLASS(HORIZWS);
5871 #undef CASE_CLASS
5872
5873         default:
5874             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5875                           PTR2UV(scan), OP(scan));
5876             Perl_croak(aTHX_ "regexp memory corruption");
5877
5878         } /* end switch */
5879
5880         /* switch break jumps here */
5881         scan = next; /* prepare to execute the next op and ... */
5882         continue;    /* ... jump back to the top, reusing st */
5883         assert(0); /* NOTREACHED */
5884
5885       push_yes_state:
5886         /* push a state that backtracks on success */
5887         st->u.yes.prev_yes_state = yes_state;
5888         yes_state = st;
5889         /* FALL THROUGH */
5890       push_state:
5891         /* push a new regex state, then continue at scan  */
5892         {
5893             regmatch_state *newst;
5894
5895             DEBUG_STACK_r({
5896                 regmatch_state *cur = st;
5897                 regmatch_state *curyes = yes_state;
5898                 int curd = depth;
5899                 regmatch_slab *slab = PL_regmatch_slab;
5900                 for (;curd > -1;cur--,curd--) {
5901                     if (cur < SLAB_FIRST(slab)) {
5902                         slab = slab->prev;
5903                         cur = SLAB_LAST(slab);
5904                     }
5905                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5906                         REPORT_CODE_OFF + 2 + depth * 2,"",
5907                         curd, PL_reg_name[cur->resume_state],
5908                         (curyes == cur) ? "yes" : ""
5909                     );
5910                     if (curyes == cur)
5911                         curyes = cur->u.yes.prev_yes_state;
5912                 }
5913             } else
5914                 DEBUG_STATE_pp("push")
5915             );
5916             depth++;
5917             st->locinput = locinput;
5918             newst = st+1;
5919             if (newst >  SLAB_LAST(PL_regmatch_slab))
5920                 newst = S_push_slab(aTHX);
5921             PL_regmatch_state = newst;
5922
5923             locinput = PL_reginput;
5924             nextchr = UCHARAT(locinput);
5925             st = newst;
5926             continue;
5927             assert(0); /* NOTREACHED */
5928         }
5929     }
5930
5931     /*
5932     * We get here only if there's trouble -- normally "case END" is
5933     * the terminating point.
5934     */
5935     Perl_croak(aTHX_ "corrupted regexp pointers");
5936     /*NOTREACHED*/
5937     sayNO;
5938
5939 yes:
5940     if (yes_state) {
5941         /* we have successfully completed a subexpression, but we must now
5942          * pop to the state marked by yes_state and continue from there */
5943         assert(st != yes_state);
5944 #ifdef DEBUGGING
5945         while (st != yes_state) {
5946             st--;
5947             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5948                 PL_regmatch_slab = PL_regmatch_slab->prev;
5949                 st = SLAB_LAST(PL_regmatch_slab);
5950             }
5951             DEBUG_STATE_r({
5952                 if (no_final) {
5953                     DEBUG_STATE_pp("pop (no final)");
5954                 } else {
5955                     DEBUG_STATE_pp("pop (yes)");
5956                 }
5957             });
5958             depth--;
5959         }
5960 #else
5961         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5962             || yes_state > SLAB_LAST(PL_regmatch_slab))
5963         {
5964             /* not in this slab, pop slab */
5965             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5966             PL_regmatch_slab = PL_regmatch_slab->prev;
5967             st = SLAB_LAST(PL_regmatch_slab);
5968         }
5969         depth -= (st - yes_state);
5970 #endif
5971         st = yes_state;
5972         yes_state = st->u.yes.prev_yes_state;
5973         PL_regmatch_state = st;
5974
5975         if (no_final) {
5976             locinput= st->locinput;
5977             nextchr = UCHARAT(locinput);
5978         }
5979         state_num = st->resume_state + no_final;
5980         goto reenter_switch;
5981     }
5982
5983     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5984                           PL_colors[4], PL_colors[5]));
5985
5986     if (PL_reg_state.re_state_eval_setup_done) {
5987         /* each successfully executed (?{...}) block does the equivalent of
5988          *   local $^R = do {...}
5989          * When popping the save stack, all these locals would be undone;
5990          * bypass this by setting the outermost saved $^R to the latest
5991          * value */
5992         if (oreplsv != GvSV(PL_replgv))
5993             sv_setsv(oreplsv, GvSV(PL_replgv));
5994     }
5995     result = 1;
5996     goto final_exit;
5997
5998 no:
5999     DEBUG_EXECUTE_r(
6000         PerlIO_printf(Perl_debug_log,
6001             "%*s  %sfailed...%s\n",
6002             REPORT_CODE_OFF+depth*2, "",
6003             PL_colors[4], PL_colors[5])
6004         );
6005
6006 no_silent:
6007     if (no_final) {
6008         if (yes_state) {
6009             goto yes;
6010         } else {
6011             goto final_exit;
6012         }
6013     }
6014     if (depth) {
6015         /* there's a previous state to backtrack to */
6016         st--;
6017         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6018             PL_regmatch_slab = PL_regmatch_slab->prev;
6019             st = SLAB_LAST(PL_regmatch_slab);
6020         }
6021         PL_regmatch_state = st;
6022         locinput= st->locinput;
6023         nextchr = UCHARAT(locinput);
6024
6025         DEBUG_STATE_pp("pop");
6026         depth--;
6027         if (yes_state == st)
6028             yes_state = st->u.yes.prev_yes_state;
6029
6030         state_num = st->resume_state + 1; /* failure = success + 1 */
6031         goto reenter_switch;
6032     }
6033     result = 0;
6034
6035   final_exit:
6036     if (rex->intflags & PREGf_VERBARG_SEEN) {
6037         SV *sv_err = get_sv("REGERROR", 1);
6038         SV *sv_mrk = get_sv("REGMARK", 1);
6039         if (result) {
6040             sv_commit = &PL_sv_no;
6041             if (!sv_yes_mark)
6042                 sv_yes_mark = &PL_sv_yes;
6043         } else {
6044             if (!sv_commit)
6045                 sv_commit = &PL_sv_yes;
6046             sv_yes_mark = &PL_sv_no;
6047         }
6048         sv_setsv(sv_err, sv_commit);
6049         sv_setsv(sv_mrk, sv_yes_mark);
6050     }
6051
6052
6053     if (last_pushed_cv) {
6054         dSP;
6055         POP_MULTICALL;
6056         PERL_UNUSED_VAR(SP);
6057     }
6058
6059     /* clean up; in particular, free all slabs above current one */
6060     LEAVE_SCOPE(oldsave);
6061
6062     return result;
6063 }
6064
6065 /*
6066  - regrepeat - repeatedly match something simple, report how many
6067  */
6068 /*
6069  * [This routine now assumes that it will only match on things of length 1.
6070  * That was true before, but now we assume scan - reginput is the count,
6071  * rather than incrementing count on every character.  [Er, except utf8.]]
6072  */
6073 STATIC I32
6074 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
6075 {
6076     dVAR;
6077     char *scan;
6078     I32 c;
6079     char *loceol = PL_regeol;
6080     I32 hardcount = 0;
6081     bool utf8_target = PL_reg_match_utf8;
6082     UV utf8_flags;
6083 #ifndef DEBUGGING
6084     PERL_UNUSED_ARG(depth);
6085 #endif
6086
6087     PERL_ARGS_ASSERT_REGREPEAT;
6088
6089     scan = PL_reginput;
6090     if (max == REG_INFTY)
6091         max = I32_MAX;
6092     else if (max < loceol - scan)
6093         loceol = scan + max;
6094     switch (OP(p)) {
6095     case REG_ANY:
6096         if (utf8_target) {
6097             loceol = PL_regeol;
6098             while (scan < loceol && hardcount < max && *scan != '\n') {
6099                 scan += UTF8SKIP(scan);
6100                 hardcount++;
6101             }
6102         } else {
6103             while (scan < loceol && *scan != '\n')
6104                 scan++;
6105         }
6106         break;
6107     case SANY:
6108         if (utf8_target) {
6109             loceol = PL_regeol;
6110             while (scan < loceol && hardcount < max) {
6111                 scan += UTF8SKIP(scan);
6112                 hardcount++;
6113             }
6114         }
6115         else
6116             scan = loceol;
6117         break;
6118     case CANY:
6119         scan = loceol;
6120         break;
6121     case EXACT:
6122         /* To get here, EXACTish nodes must have *byte* length == 1.  That
6123          * means they match only characters in the string that can be expressed
6124          * as a single byte.  For non-utf8 strings, that means a simple match.
6125          * For utf8 strings, the character matched must be an invariant, or
6126          * downgradable to a single byte.  The pattern's utf8ness is
6127          * irrelevant, as since it's a single byte, it either isn't utf8, or if
6128          * it is, it's an invariant */
6129
6130         c = (U8)*STRING(p);
6131         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6132
6133         if (! utf8_target || UNI_IS_INVARIANT(c)) {
6134             while (scan < loceol && UCHARAT(scan) == c) {
6135                 scan++;
6136             }
6137         }
6138         else {
6139
6140             /* Here, the string is utf8, and the pattern char is different
6141              * in utf8 than not, so can't compare them directly.  Outside the
6142              * loop, find the two utf8 bytes that represent c, and then
6143              * look for those in sequence in the utf8 string */
6144             U8 high = UTF8_TWO_BYTE_HI(c);
6145             U8 low = UTF8_TWO_BYTE_LO(c);
6146             loceol = PL_regeol;
6147
6148             while (hardcount < max
6149                     && scan + 1 < loceol
6150                     && UCHARAT(scan) == high
6151                     && UCHARAT(scan + 1) == low)
6152             {
6153                 scan += 2;
6154                 hardcount++;
6155             }
6156         }
6157         break;
6158     case EXACTFA:
6159         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6160         goto do_exactf;
6161
6162     case EXACTFL:
6163         PL_reg_flags |= RF_tainted;
6164         utf8_flags = FOLDEQ_UTF8_LOCALE;
6165         goto do_exactf;
6166
6167     case EXACTF:
6168             utf8_flags = 0;
6169             goto do_exactf;
6170
6171     case EXACTFU_SS:
6172     case EXACTFU_TRICKYFOLD:
6173     case EXACTFU:
6174         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6175
6176         /* The comments for the EXACT case above apply as well to these fold
6177          * ones */
6178
6179     do_exactf:
6180         c = (U8)*STRING(p);
6181         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6182
6183         if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6184             char *tmpeol = loceol;
6185             while (hardcount < max
6186                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6187                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6188             {
6189                 scan = tmpeol;
6190                 tmpeol = loceol;
6191                 hardcount++;
6192             }
6193
6194             /* XXX Note that the above handles properly the German sharp s in
6195              * the pattern matching ss in the string.  But it doesn't handle
6196              * properly cases where the string contains say 'LIGATURE ff' and
6197              * the pattern is 'f+'.  This would require, say, a new function or
6198              * revised interface to foldEQ_utf8(), in which the maximum number
6199              * of characters to match could be passed and it would return how
6200              * many actually did.  This is just one of many cases where
6201              * multi-char folds don't work properly, and so the fix is being
6202              * deferred */
6203         }
6204         else {
6205             U8 folded;
6206
6207             /* Here, the string isn't utf8 and c is a single byte; and either
6208              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6209              * doesn't affect c.  Can just do simple comparisons for exact or
6210              * fold matching. */
6211             switch (OP(p)) {
6212                 case EXACTF: folded = PL_fold[c]; break;
6213                 case EXACTFA:
6214                 case EXACTFU_TRICKYFOLD:
6215                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6216                 case EXACTFL: folded = PL_fold_locale[c]; break;
6217                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6218             }
6219             while (scan < loceol &&
6220                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6221             {
6222                 scan++;
6223             }
6224         }
6225         break;
6226     case ANYOFV:
6227     case ANYOF:
6228         if (utf8_target || OP(p) == ANYOFV) {
6229             STRLEN inclasslen;
6230             loceol = PL_regeol;
6231             inclasslen = loceol - scan;
6232             while (hardcount < max
6233                    && ((inclasslen = loceol - scan) > 0)
6234                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6235             {
6236                 scan += inclasslen;
6237                 hardcount++;
6238             }
6239         } else {
6240             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6241                 scan++;
6242         }
6243         break;
6244     case ALNUMU:
6245         if (utf8_target) {
6246     utf8_wordchar:
6247             loceol = PL_regeol;
6248             LOAD_UTF8_CHARCLASS_ALNUM();
6249             while (hardcount < max && scan < loceol &&
6250                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6251             {
6252                 scan += UTF8SKIP(scan);
6253                 hardcount++;
6254             }
6255         } else {
6256             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6257                 scan++;
6258             }
6259         }
6260         break;
6261     case ALNUM:
6262         if (utf8_target)
6263             goto utf8_wordchar;
6264         while (scan < loceol && isALNUM((U8) *scan)) {
6265             scan++;
6266         }
6267         break;
6268     case ALNUMA:
6269         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6270             scan++;
6271         }
6272         break;
6273     case ALNUML:
6274         PL_reg_flags |= RF_tainted;
6275         if (utf8_target) {
6276             loceol = PL_regeol;
6277             while (hardcount < max && scan < loceol &&
6278                    isALNUM_LC_utf8((U8*)scan)) {
6279                 scan += UTF8SKIP(scan);
6280                 hardcount++;
6281             }
6282         } else {
6283             while (scan < loceol && isALNUM_LC(*scan))
6284                 scan++;
6285         }
6286         break;
6287     case NALNUMU:
6288         if (utf8_target) {
6289
6290     utf8_Nwordchar:
6291
6292             loceol = PL_regeol;
6293             LOAD_UTF8_CHARCLASS_ALNUM();
6294             while (hardcount < max && scan < loceol &&
6295                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6296             {
6297                 scan += UTF8SKIP(scan);
6298                 hardcount++;
6299             }
6300         } else {
6301             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6302                 scan++;
6303             }
6304         }
6305         break;
6306     case NALNUM:
6307         if (utf8_target)
6308             goto utf8_Nwordchar;
6309         while (scan < loceol && ! isALNUM((U8) *scan)) {
6310             scan++;
6311         }
6312         break;
6313
6314     case POSIXA:
6315        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6316             scan++;
6317         }
6318         break;
6319     case NPOSIXA:
6320         if (utf8_target) {
6321             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6322                 scan += UTF8SKIP(scan);
6323             }
6324         }
6325         else {
6326             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6327                 scan++;
6328             }
6329         }
6330         break;
6331     case NALNUMA:
6332         if (utf8_target) {
6333             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6334                 scan += UTF8SKIP(scan);
6335             }
6336         }
6337         else {
6338             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6339                 scan++;
6340             }
6341         }
6342         break;
6343     case NALNUML:
6344         PL_reg_flags |= RF_tainted;
6345         if (utf8_target) {
6346             loceol = PL_regeol;
6347             while (hardcount < max && scan < loceol &&
6348                    !isALNUM_LC_utf8((U8*)scan)) {
6349                 scan += UTF8SKIP(scan);
6350                 hardcount++;
6351             }
6352         } else {
6353             while (scan < loceol && !isALNUM_LC(*scan))
6354                 scan++;
6355         }
6356         break;
6357     case SPACEU:
6358         if (utf8_target) {
6359
6360     utf8_space:
6361
6362             loceol = PL_regeol;
6363             LOAD_UTF8_CHARCLASS_SPACE();
6364             while (hardcount < max && scan < loceol &&
6365                    (*scan == ' ' ||
6366                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6367             {
6368                 scan += UTF8SKIP(scan);
6369                 hardcount++;
6370             }
6371             break;
6372         }
6373         else {
6374             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6375                 scan++;
6376             }
6377             break;
6378         }
6379     case SPACE:
6380         if (utf8_target)
6381             goto utf8_space;
6382
6383         while (scan < loceol && isSPACE((U8) *scan)) {
6384             scan++;
6385         }
6386         break;
6387     case SPACEA:
6388         while (scan < loceol && isSPACE_A((U8) *scan)) {
6389             scan++;
6390         }
6391         break;
6392     case SPACEL:
6393         PL_reg_flags |= RF_tainted;
6394         if (utf8_target) {
6395             loceol = PL_regeol;
6396             while (hardcount < max && scan < loceol &&
6397                    isSPACE_LC_utf8((U8*)scan)) {
6398                 scan += UTF8SKIP(scan);
6399                 hardcount++;
6400             }
6401         } else {
6402             while (scan < loceol && isSPACE_LC(*scan))
6403                 scan++;
6404         }
6405         break;
6406     case NSPACEU:
6407         if (utf8_target) {
6408
6409     utf8_Nspace:
6410
6411             loceol = PL_regeol;
6412             LOAD_UTF8_CHARCLASS_SPACE();
6413             while (hardcount < max && scan < loceol &&
6414                    ! (*scan == ' ' ||
6415                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6416             {
6417                 scan += UTF8SKIP(scan);
6418                 hardcount++;
6419             }
6420             break;
6421         }
6422         else {
6423             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6424                 scan++;
6425             }
6426         }
6427         break;
6428     case NSPACE:
6429         if (utf8_target)
6430             goto utf8_Nspace;
6431
6432         while (scan < loceol && ! isSPACE((U8) *scan)) {
6433             scan++;
6434         }
6435         break;
6436     case NSPACEA:
6437         if (utf8_target) {
6438             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6439                 scan += UTF8SKIP(scan);
6440             }
6441         }
6442         else {
6443             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6444                 scan++;
6445             }
6446         }
6447         break;
6448     case NSPACEL:
6449         PL_reg_flags |= RF_tainted;
6450         if (utf8_target) {
6451             loceol = PL_regeol;
6452             while (hardcount < max && scan < loceol &&
6453                    !isSPACE_LC_utf8((U8*)scan)) {
6454                 scan += UTF8SKIP(scan);
6455                 hardcount++;
6456             }
6457         } else {
6458             while (scan < loceol && !isSPACE_LC(*scan))
6459                 scan++;
6460         }
6461         break;
6462     case DIGIT:
6463         if (utf8_target) {
6464             loceol = PL_regeol;
6465             LOAD_UTF8_CHARCLASS_DIGIT();
6466             while (hardcount < max && scan < loceol &&
6467                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6468                 scan += UTF8SKIP(scan);
6469                 hardcount++;
6470             }
6471         } else {
6472             while (scan < loceol && isDIGIT(*scan))
6473                 scan++;
6474         }
6475         break;
6476     case DIGITA:
6477         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6478             scan++;
6479         }
6480         break;
6481     case DIGITL:
6482         PL_reg_flags |= RF_tainted;
6483         if (utf8_target) {
6484             loceol = PL_regeol;
6485             while (hardcount < max && scan < loceol &&
6486                    isDIGIT_LC_utf8((U8*)scan)) {
6487                 scan += UTF8SKIP(scan);
6488                 hardcount++;
6489             }
6490         } else {
6491             while (scan < loceol && isDIGIT_LC(*scan))
6492                 scan++;
6493         }
6494         break;
6495     case NDIGIT:
6496         if (utf8_target) {
6497             loceol = PL_regeol;
6498             LOAD_UTF8_CHARCLASS_DIGIT();
6499             while (hardcount < max && scan < loceol &&
6500                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6501                 scan += UTF8SKIP(scan);
6502                 hardcount++;
6503             }
6504         } else {
6505             while (scan < loceol && !isDIGIT(*scan))
6506                 scan++;
6507         }
6508         break;
6509     case NDIGITA:
6510         if (utf8_target) {
6511             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6512                 scan += UTF8SKIP(scan);
6513             }
6514         }
6515         else {
6516             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6517                 scan++;
6518             }
6519         }
6520         break;
6521     case NDIGITL:
6522         PL_reg_flags |= RF_tainted;
6523         if (utf8_target) {
6524             loceol = PL_regeol;
6525             while (hardcount < max && scan < loceol &&
6526                    !isDIGIT_LC_utf8((U8*)scan)) {
6527                 scan += UTF8SKIP(scan);
6528                 hardcount++;
6529             }
6530         } else {
6531             while (scan < loceol && !isDIGIT_LC(*scan))
6532                 scan++;
6533         }
6534         break;
6535     case LNBREAK:
6536         if (utf8_target) {
6537             loceol = PL_regeol;
6538             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6539                 scan += c;
6540                 hardcount++;
6541             }
6542         } else {
6543             /*
6544               LNBREAK can match two latin chars, which is ok,
6545               because we have a null terminated string, but we
6546               have to use hardcount in this situation
6547             */
6548             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6549                 scan+=c;
6550                 hardcount++;
6551             }
6552         }
6553         break;
6554     case HORIZWS:
6555         if (utf8_target) {
6556             loceol = PL_regeol;
6557             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6558                 scan += c;
6559                 hardcount++;
6560             }
6561         } else {
6562             while (scan < loceol && is_HORIZWS_latin1(scan))
6563                 scan++;
6564         }
6565         break;
6566     case NHORIZWS:
6567         if (utf8_target) {
6568             loceol = PL_regeol;
6569             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6570                 scan += UTF8SKIP(scan);
6571                 hardcount++;
6572             }
6573         } else {
6574             while (scan < loceol && !is_HORIZWS_latin1(scan))
6575                 scan++;
6576
6577         }
6578         break;
6579     case VERTWS:
6580         if (utf8_target) {
6581             loceol = PL_regeol;
6582             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6583                 scan += c;
6584                 hardcount++;
6585             }
6586         } else {
6587             while (scan < loceol && is_VERTWS_latin1(scan))
6588                 scan++;
6589
6590         }
6591         break;
6592     case NVERTWS:
6593         if (utf8_target) {
6594             loceol = PL_regeol;
6595             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6596                 scan += UTF8SKIP(scan);
6597                 hardcount++;
6598             }
6599         } else {
6600             while (scan < loceol && !is_VERTWS_latin1(scan))
6601                 scan++;
6602
6603         }
6604         break;
6605
6606     default:            /* Called on something of 0 width. */
6607         break;          /* So match right here or not at all. */
6608     }
6609
6610     if (hardcount)
6611         c = hardcount;
6612     else
6613         c = scan - PL_reginput;
6614     PL_reginput = scan;
6615
6616     DEBUG_r({
6617         GET_RE_DEBUG_FLAGS_DECL;
6618         DEBUG_EXECUTE_r({
6619             SV * const prop = sv_newmortal();
6620             regprop(prog, prop, p);
6621             PerlIO_printf(Perl_debug_log,
6622                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6623                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6624         });
6625     });
6626
6627     return(c);
6628 }
6629
6630
6631 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6632 /*
6633 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
6634 create a copy so that changes the caller makes won't change the shared one
6635  */
6636 SV *
6637 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6638 {
6639     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6640     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6641 }
6642 #endif
6643
6644 STATIC SV *
6645 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6646 {
6647     /* Returns the swash for the input 'node' in the regex 'prog'.
6648      * If <doinit> is true, will attempt to create the swash if not already
6649      *    done.
6650      * If <listsvp> is non-null, will return the swash initialization string in
6651      *    it.
6652      * If <altsvp> is non-null, will return the alternates to the regular swash
6653      *    in it
6654      * Tied intimately to how regcomp.c sets up the data structure */
6655
6656     dVAR;
6657     SV *sw  = NULL;
6658     SV *si  = NULL;
6659     SV *alt = NULL;
6660     SV*  invlist = NULL;
6661
6662     RXi_GET_DECL(prog,progi);
6663     const struct reg_data * const data = prog ? progi->data : NULL;
6664
6665     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6666
6667     assert(ANYOF_NONBITMAP(node));
6668
6669     if (data && data->count) {
6670         const U32 n = ARG(node);
6671
6672         if (data->what[n] == 's') {
6673             SV * const rv = MUTABLE_SV(data->data[n]);
6674             AV * const av = MUTABLE_AV(SvRV(rv));
6675             SV **const ary = AvARRAY(av);
6676             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6677
6678             si = *ary;  /* ary[0] = the string to initialize the swash with */
6679
6680             /* Elements 3 and 4 are either both present or both absent. [3] is
6681              * any inversion list generated at compile time; [4] indicates if
6682              * that inversion list has any user-defined properties in it. */
6683             if (av_len(av) >= 3) {
6684                 invlist = ary[3];
6685                 if (SvUV(ary[4])) {
6686                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6687                 }
6688             }
6689             else {
6690                 invlist = NULL;
6691             }
6692
6693             /* Element [1] is reserved for the set-up swash.  If already there,
6694              * return it; if not, create it and store it there */
6695             if (SvROK(ary[1])) {
6696                 sw = ary[1];
6697             }
6698             else if (si && doinit) {
6699
6700                 sw = _core_swash_init("utf8", /* the utf8 package */
6701                                       "", /* nameless */
6702                                       si,
6703                                       1, /* binary */
6704                                       0, /* not from tr/// */
6705                                       invlist,
6706                                       &swash_init_flags);
6707                 (void)av_store(av, 1, sw);
6708             }
6709
6710             /* Element [2] is for any multi-char folds.  Note that is a
6711              * fundamentally flawed design, because can't backtrack and try
6712              * again.  See [perl #89774] */
6713             if (SvTYPE(ary[2]) == SVt_PVAV) {
6714                 alt = ary[2];
6715             }
6716         }
6717     }
6718
6719     if (listsvp) {
6720         SV* matches_string = newSVpvn("", 0);
6721
6722         /* Use the swash, if any, which has to have incorporated into it all
6723          * possibilities */
6724         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6725             && (si && si != &PL_sv_undef))
6726         {
6727
6728             /* If no swash, use the input initialization string, if available */
6729             sv_catsv(matches_string, si);
6730         }
6731
6732         /* Add the inversion list to whatever we have.  This may have come from
6733          * the swash, or from an input parameter */
6734         if (invlist) {
6735             sv_catsv(matches_string, _invlist_contents(invlist));
6736         }
6737         *listsvp = matches_string;
6738     }
6739
6740     if (altsvp)
6741         *altsvp  = alt;
6742
6743     return sw;
6744 }
6745
6746 /*
6747  - reginclass - determine if a character falls into a character class
6748
6749   n is the ANYOF regnode
6750   p is the target string
6751   lenp is pointer to the maximum number of bytes of how far to go in p
6752     (This is assumed wthout checking to always be at least the current
6753     character's size)
6754   utf8_target tells whether p is in UTF-8.
6755
6756   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6757   from a successful match, the value it points to will be updated to how many
6758   bytes in p were matched.  If there was no match, the value is undefined,
6759   possibly changed from the input.
6760
6761   Note that this can be a synthetic start class, a combination of various
6762   nodes, so things you think might be mutually exclusive, such as locale,
6763   aren't.  It can match both locale and non-locale
6764
6765  */
6766
6767 STATIC bool
6768 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6769 {
6770     dVAR;
6771     const char flags = ANYOF_FLAGS(n);
6772     bool match = FALSE;
6773     UV c = *p;
6774     STRLEN c_len = 0;
6775     STRLEN maxlen;
6776
6777     PERL_ARGS_ASSERT_REGINCLASS;
6778
6779     /* If c is not already the code point, get it */
6780     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6781         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6782                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6783                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6784                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6785                  * UTF8_ALLOW_FFFF */
6786         if (c_len == (STRLEN)-1)
6787             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6788     }
6789     else {
6790         c_len = 1;
6791     }
6792
6793     /* Use passed in max length, or one character if none passed in or less
6794      * than one character.  And assume will match just one character.  This is
6795      * overwritten later if matched more. */
6796     if (lenp) {
6797         maxlen = (*lenp > c_len) ? *lenp : c_len;
6798         *lenp = c_len;
6799
6800     }
6801     else {
6802         maxlen = c_len;
6803     }
6804
6805     /* If this character is potentially in the bitmap, check it */
6806     if (c < 256) {
6807         if (ANYOF_BITMAP_TEST(n, c))
6808             match = TRUE;
6809         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6810                 && ! utf8_target
6811                 && ! isASCII(c))
6812         {
6813             match = TRUE;
6814         }
6815
6816         else if (flags & ANYOF_LOCALE) {
6817             PL_reg_flags |= RF_tainted;
6818
6819             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6820                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6821             {
6822                 match = TRUE;
6823             }
6824             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6825                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6826                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6827                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6828                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6829                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6830                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6831                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6832                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6833                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6834                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6835                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII_LC(c))  ||
6836                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII_LC(c))  ||
6837                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6838                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6839                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6840                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6841                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6842                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6843                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6844                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6845                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6846                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6847                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6848                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6849                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6850                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6851                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6852                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6853                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK_LC(c))  ||
6854                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK_LC(c))
6855                      ) /* How's that for a conditional? */
6856             ) {
6857                 match = TRUE;
6858             }
6859         }
6860     }
6861
6862     /* If the bitmap didn't (or couldn't) match, and something outside the
6863      * bitmap could match, try that.  Locale nodes specifiy completely the
6864      * behavior of code points in the bit map (otherwise, a utf8 target would
6865      * cause them to be treated as Unicode and not locale), except in
6866      * the very unlikely event when this node is a synthetic start class, which
6867      * could be a combination of locale and non-locale nodes.  So allow locale
6868      * to match for the synthetic start class, which will give a false
6869      * positive that will be resolved when the match is done again as not part
6870      * of the synthetic start class */
6871     if (!match) {
6872         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6873             match = TRUE;       /* Everything above 255 matches */
6874         }
6875         else if (ANYOF_NONBITMAP(n)
6876                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6877                      || (utf8_target
6878                          && (c >=256
6879                              || (! (flags & ANYOF_LOCALE))
6880                              || (flags & ANYOF_IS_SYNTHETIC)))))
6881         {
6882             AV *av;
6883             SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6884
6885             if (sw) {
6886                 U8 * utf8_p;
6887                 if (utf8_target) {
6888                     utf8_p = (U8 *) p;
6889                 } else {
6890
6891                     /* Not utf8.  Convert as much of the string as available up
6892                      * to the limit of how far the (single) character in the
6893                      * pattern can possibly match (no need to go further).  If
6894                      * the node is a straight ANYOF or not folding, it can't
6895                      * match more than one.  Otherwise, It can match up to how
6896                      * far a single char can fold to.  Since not utf8, each
6897                      * character is a single byte, so the max it can be in
6898                      * bytes is the same as the max it can be in characters */
6899                     STRLEN len = (OP(n) == ANYOF
6900                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6901                                   ? 1
6902                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6903                                     ? maxlen
6904                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6905                     utf8_p = bytes_to_utf8(p, &len);
6906                 }
6907
6908                 if (swash_fetch(sw, utf8_p, TRUE))
6909                     match = TRUE;
6910                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6911
6912                     /* Here, we need to test if the fold of the target string
6913                      * matches.  The non-multi char folds have all been moved to
6914                      * the compilation phase, and the multi-char folds have
6915                      * been stored by regcomp into 'av'; we linearly check to
6916                      * see if any match the target string (folded).   We know
6917                      * that the originals were each one character, but we don't
6918                      * currently know how many characters/bytes each folded to,
6919                      * except we do know that there are small limits imposed by
6920                      * Unicode.  XXX A performance enhancement would be to have
6921                      * regcomp.c store the max number of chars/bytes that are
6922                      * in an av entry, as, say the 0th element.  Even better
6923                      * would be to have a hash of the few characters that can
6924                      * start a multi-char fold to the max number of chars of
6925                      * those folds.
6926                      *
6927                      * If there is a match, we will need to advance (if lenp is
6928                      * specified) the match pointer in the target string.  But
6929                      * what we are comparing here isn't that string directly,
6930                      * but its fold, whose length may differ from the original.
6931                      * As we go along in constructing the fold, therefore, we
6932                      * create a map so that we know how many bytes in the
6933                      * source to advance given that we have matched a certain
6934                      * number of bytes in the fold.  This map is stored in
6935                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6936                      * the fold of the first character that we are folding.
6937                      * Then map_fold_len_back[n] is set to the number of bytes
6938                      * in that first character.  Similarly let m be the
6939                      * corresponding number for the second character to be
6940                      * folded.  Then map_fold_len_back[n+m] is set to the
6941                      * number of bytes occupied by the first two source
6942                      * characters. ... */
6943                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6944                     U8 folded[UTF8_MAXBYTES_CASE+1];
6945                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6946                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6947                                                   chars */
6948
6949                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6950
6951                         /* Here, only need to fold the first char of the target
6952                          * string.  It the source wasn't utf8, is 1 byte long */
6953                         to_utf8_fold(utf8_p, folded, &foldlen);
6954                         total_foldlen = foldlen;
6955                         map_fold_len_back[foldlen] = (utf8_target)
6956                                                      ? UTF8SKIP(utf8_p)
6957                                                      : 1;
6958                     }
6959                     else {
6960
6961                         /* Here, need to fold more than the first char.  Do so
6962                          * up to the limits */
6963                         U8* source_ptr = utf8_p;    /* The source for the fold
6964                                                        is the regex target
6965                                                        string */
6966                         U8* folded_ptr = folded;
6967                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6968                                                        available byte in the
6969                                                        target string */
6970                         U8 i;
6971                         for (i = 0;
6972                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6973                              i++)
6974                         {
6975
6976                             /* Fold the next character */
6977                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6978                             STRLEN this_char_foldlen;
6979                             to_utf8_fold(source_ptr,
6980                                          this_char_folded,
6981                                          &this_char_foldlen);
6982
6983                             /* Bail if it would exceed the byte limit for
6984                              * folding a single char. */
6985                             if (this_char_foldlen + folded_ptr - folded >
6986                                                             UTF8_MAXBYTES_CASE)
6987                             {
6988                                 break;
6989                             }
6990
6991                             /* Add the fold of this character */
6992                             Copy(this_char_folded,
6993                                  folded_ptr,
6994                                  this_char_foldlen,
6995                                  U8);
6996                             source_ptr += UTF8SKIP(source_ptr);
6997                             folded_ptr += this_char_foldlen;
6998                             total_foldlen = folded_ptr - folded;
6999
7000                             /* Create map from the number of bytes in the fold
7001                              * back to the number of bytes in the source.  If
7002                              * the source isn't utf8, the byte count is just
7003                              * the number of characters so far */
7004                             map_fold_len_back[total_foldlen]
7005                                                       = (utf8_target)
7006                                                         ? source_ptr - utf8_p
7007                                                         : i + 1;
7008                         }
7009                         *folded_ptr = '\0';
7010                     }
7011
7012
7013                     /* Do the linear search to see if the fold is in the list
7014                      * of multi-char folds. */
7015                     if (av) {
7016                         I32 i;
7017                         for (i = 0; i <= av_len(av); i++) {
7018                             SV* const sv = *av_fetch(av, i, FALSE);
7019                             STRLEN len;
7020                             const char * const s = SvPV_const(sv, len);
7021
7022                             if (len <= total_foldlen
7023                                 && memEQ(s, (char*)folded, len)
7024
7025                                    /* If 0, means matched a partial char. See
7026                                     * [perl #90536] */
7027                                 && map_fold_len_back[len])
7028                             {
7029
7030                                 /* Advance the target string ptr to account for
7031                                  * this fold, but have to translate from the
7032                                  * folded length to the corresponding source
7033                                  * length. */
7034                                 if (lenp) {
7035                                     *lenp = map_fold_len_back[len];
7036                                 }
7037                                 match = TRUE;
7038                                 break;
7039                             }
7040                         }
7041                     }
7042                 }
7043
7044                 /* If we allocated a string above, free it */
7045                 if (! utf8_target) Safefree(utf8_p);
7046             }
7047         }
7048
7049         if (UNICODE_IS_SUPER(c)
7050             && (flags & ANYOF_WARN_SUPER)
7051             && ckWARN_d(WARN_NON_UNICODE))
7052         {
7053             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7054                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7055         }
7056     }
7057
7058     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7059     return cBOOL(flags & ANYOF_INVERT) ^ match;
7060 }
7061
7062 STATIC U8 *
7063 S_reghop3(U8 *s, I32 off, const U8* lim)
7064 {
7065     /* return the position 'off' UTF-8 characters away from 's', forward if
7066      * 'off' >= 0, backwards if negative.  But don't go outside of position
7067      * 'lim', which better be < s  if off < 0 */
7068
7069     dVAR;
7070
7071     PERL_ARGS_ASSERT_REGHOP3;
7072
7073     if (off >= 0) {
7074         while (off-- && s < lim) {
7075             /* XXX could check well-formedness here */
7076             s += UTF8SKIP(s);
7077         }
7078     }
7079     else {
7080         while (off++ && s > lim) {
7081             s--;
7082             if (UTF8_IS_CONTINUED(*s)) {
7083                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7084                     s--;
7085             }
7086             /* XXX could check well-formedness here */
7087         }
7088     }
7089     return s;
7090 }
7091
7092 #ifdef XXX_dmq
7093 /* there are a bunch of places where we use two reghop3's that should
7094    be replaced with this routine. but since thats not done yet
7095    we ifdef it out - dmq
7096 */
7097 STATIC U8 *
7098 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7099 {
7100     dVAR;
7101
7102     PERL_ARGS_ASSERT_REGHOP4;
7103
7104     if (off >= 0) {
7105         while (off-- && s < rlim) {
7106             /* XXX could check well-formedness here */
7107             s += UTF8SKIP(s);
7108         }
7109     }
7110     else {
7111         while (off++ && s > llim) {
7112             s--;
7113             if (UTF8_IS_CONTINUED(*s)) {
7114                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7115                     s--;
7116             }
7117             /* XXX could check well-formedness here */
7118         }
7119     }
7120     return s;
7121 }
7122 #endif
7123
7124 STATIC U8 *
7125 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7126 {
7127     dVAR;
7128
7129     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7130
7131     if (off >= 0) {
7132         while (off-- && s < lim) {
7133             /* XXX could check well-formedness here */
7134             s += UTF8SKIP(s);
7135         }
7136         if (off >= 0)
7137             return NULL;
7138     }
7139     else {
7140         while (off++ && s > lim) {
7141             s--;
7142             if (UTF8_IS_CONTINUED(*s)) {
7143                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7144                     s--;
7145             }
7146             /* XXX could check well-formedness here */
7147         }
7148         if (off <= 0)
7149             return NULL;
7150     }
7151     return s;
7152 }
7153
7154 static void
7155 restore_pos(pTHX_ void *arg)
7156 {
7157     dVAR;
7158     regexp * const rex = (regexp *)arg;
7159     if (PL_reg_state.re_state_eval_setup_done) {
7160         if (PL_reg_oldsaved) {
7161             rex->subbeg = PL_reg_oldsaved;
7162             rex->sublen = PL_reg_oldsavedlen;
7163 #ifdef PERL_OLD_COPY_ON_WRITE
7164             rex->saved_copy = PL_nrs;
7165 #endif
7166             RXp_MATCH_COPIED_on(rex);
7167         }
7168         PL_reg_magic->mg_len = PL_reg_oldpos;
7169         PL_reg_state.re_state_eval_setup_done = FALSE;
7170         PL_curpm = PL_reg_oldcurpm;
7171     }
7172 }
7173
7174 STATIC void
7175 S_to_utf8_substr(pTHX_ register regexp *prog)
7176 {
7177     int i = 1;
7178
7179     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7180
7181     do {
7182         if (prog->substrs->data[i].substr
7183             && !prog->substrs->data[i].utf8_substr) {
7184             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7185             prog->substrs->data[i].utf8_substr = sv;
7186             sv_utf8_upgrade(sv);
7187             if (SvVALID(prog->substrs->data[i].substr)) {
7188                 if (SvTAIL(prog->substrs->data[i].substr)) {
7189                     /* Trim the trailing \n that fbm_compile added last
7190                        time.  */
7191                     SvCUR_set(sv, SvCUR(sv) - 1);
7192                     /* Whilst this makes the SV technically "invalid" (as its
7193                        buffer is no longer followed by "\0") when fbm_compile()
7194                        adds the "\n" back, a "\0" is restored.  */
7195                     fbm_compile(sv, FBMcf_TAIL);
7196                 } else
7197                     fbm_compile(sv, 0);
7198             }
7199             if (prog->substrs->data[i].substr == prog->check_substr)
7200                 prog->check_utf8 = sv;
7201         }
7202     } while (i--);
7203 }
7204
7205 STATIC void
7206 S_to_byte_substr(pTHX_ register regexp *prog)
7207 {
7208     dVAR;
7209     int i = 1;
7210
7211     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7212
7213     do {
7214         if (prog->substrs->data[i].utf8_substr
7215             && !prog->substrs->data[i].substr) {
7216             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7217             if (sv_utf8_downgrade(sv, TRUE)) {
7218                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7219                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7220                         /* Trim the trailing \n that fbm_compile added last
7221                            time.  */
7222                         SvCUR_set(sv, SvCUR(sv) - 1);
7223                         fbm_compile(sv, FBMcf_TAIL);
7224                     } else
7225                         fbm_compile(sv, 0);
7226                 }
7227             } else {
7228                 SvREFCNT_dec(sv);
7229                 sv = &PL_sv_undef;
7230             }
7231             prog->substrs->data[i].substr = sv;
7232             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7233                 prog->check_substr = sv;
7234         }
7235     } while (i--);
7236 }
7237
7238 /*
7239  * Local variables:
7240  * c-indentation-style: bsd
7241  * c-basic-offset: 4
7242  * indent-tabs-mode: nil
7243  * End:
7244  *
7245  * ex: set ts=8 sts=4 sw=4 et:
7246  */