regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
    6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
   22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
   23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
      /* Flag bits.  RF_utf8 is tested in PL_reg_flags by UTF_PATTERN below, and
       * the locale cases generated by CCC_TRY OR RF_tainted into PL_reg_flags.
       * NOTE(review): RF_warned is presumably kept in PL_reg_flags too --
       * confirm against its users, which are not visible here. */
   83 #define RF_tainted      1       /* tainted information used? e.g. locale */
   84 #define RF_warned       2               /* warned about big count? */
   85
   86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
   87
      /* True when the RF_utf8 bit is set in PL_reg_flags. */
   88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
   89
   90 #define RS_init         1               /* eval environment created */
   91 #define RS_set          2               /* replsv value is set */
   92
      /* Default STATIC to plain "static" unless it has already been defined. */
   93 #ifndef STATIC
   94 #define STATIC  static
   95 #endif
  96
   97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
   98  * call if there are no complications: i.e., if everything matchable is
   99  * straightforward in the bitmap.  When ANYOF_FLAGS(p) is zero, c is
       * dereferenced and that byte is looked up with ANYOF_BITMAP_TEST; otherwise
       * reginclass(prog,p,c,0,0) is called.  Note that p is evaluated twice. */
  100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
  101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
  124 /* If PL_utf8_<class> is not yet set, call is_utf8_<class>() on the sample
       * string str inside an ENTER/save_re_context()/LEAVE bracket and assert
       * that the call returned true.  NOTE(review): the call presumably
       * populates PL_utf8_<class> as a side effect -- confirm.  Used by
       * _CCC_TRY_CODE below. */
  125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
  126     if (!CAT2(PL_utf8_,class)) { \
  127         bool ok; \
  128         ENTER; save_re_context(); \
  129         ok=CAT2(is_utf8_,class)((const U8*)str); \
  130         assert(ok); LEAVE; } } STMT_END
  131
  132 /* Same as LOAD_UTF8_CHARCLASS, except that the probe string is always " "
       * and the result of the call is discarded rather than asserted on */
  133 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
  134     if (!CAT2(PL_utf8_,class)) { \
  135         bool throw_away __attribute__unused__; \
  136         ENTER; save_re_context(); \
  137         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
  138         LEAVE; } } STMT_END
  139
  140 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
  141 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
  142 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
  143
      /* NOTE(review): unlike the macros above, this expands to a bare sequence
       * of statements (no STMT_START/STMT_END), so it is only safe where a
       * braced block or a plain statement list is expected. */
  144 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
  145         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
  146         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
  147         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
  148             * assert should likely and hopefully fail on an EBCDIC machine */ \
  149         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
  150                                                                             \
  151         /* No asserts are done for these, in case called on an early        \
  152             * Unicode version in which they map to nothing */               \
  153         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
  154         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
  155         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
  156         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
  157         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
  158         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
  159         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 160
  161 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
  162
  163 /* The actual code for CCC_TRY, which uses several variables from the routine
  164  * it's callable from (locinput, nextchr, utf8_target, PL_regeol) as well as
       * sayNO.  It is designed to be the bulk of a case statement and ends in
       * "break;".
  165  * FUNC is the macro or function to call on non-utf8 targets that indicates
  166  *      if nextchr matches the class.
  167  * UTF8_TEST is the whole test expression to use for utf8 targets
  168  * CLASS, STR are handed to LOAD_UTF8_CHARCLASS, to load in the swash for the
  169  *      class if it is not already present
  170  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
  171  *      UTF8_TEST test.
  172  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
  173  * utf8 and a variant, load the swash if necessary and test using the utf8
  174  * test.  Advance to the next character if test is ok, otherwise fail; If not
  175  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
  176  * fails, or advance to the next character */
  177
  178 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
  179     if (locinput >= PL_regeol) {                                              \
  180         sayNO;                                                                \
  181     }                                                                         \
  182     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
  183         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
  184         if (POS_OR_NEG (UTF8_TEST)) {                                         \
  185             sayNO;                                                            \
  186         }                                                                     \
  187         locinput += PL_utf8skip[nextchr];                                     \
  188         nextchr = UCHARAT(locinput);                                          \
  189         break;                                                                \
  190     }                                                                         \
  191     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
  192         sayNO;                                                                \
  193     }                                                                         \
  194     nextchr = UCHARAT(++locinput);                                            \
  195     break;
 196
  197 /* Handle the non-locale cases for a character class and its complement.  It
  198  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
  199  * This is because that code fails when the test succeeds, so we want to have
  200  * the test fail so that the code succeeds.  The swash is stored in a
  201  * predictable PL_ place, namely PL_utf8_<CLASS>, which is what the utf8 test
       * passes to swash_fetch() together with locinput.
       * NAME is the case label for the class, NNAME the label for its complement;
       * FUNC, CLASS and STR are forwarded unchanged to _CCC_TRY_CODE. */
  202 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
  203                            CLASS, STR)                                        \
  204     case NAME:                                                                \
  205         _CCC_TRY_CODE( !, FUNC,                                               \
  206                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
  207                                             (U8*)locinput, TRUE)),            \
  208                           CLASS, STR)                                         \
  209     case NNAME:                                                               \
  210         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
  211                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
  212                                             (U8*)locinput, TRUE)),            \
  213                           CLASS, STR)                                         \
  214
 214
  215 /* Generate the case statements for both locale and non-locale character
  216  * classes in regmatch for classes that don't have special unicode semantics.
  217  * Locales don't use an immediate swash, but an intermediary special locale
  218  * function that is called on the pointer to the current place in the input
  219  * string.  That function will resolve to needing the same swash.  One might
  220  * think that because we don't know what the locale will match, we shouldn't
  221  * check with the swash loading function that it loaded properly; ie, that we
  222  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
  223  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
  224  * irrelevant here.
       *
       * NAMEL/NNAMEL are the locale case labels; they set RF_tainted in
       * PL_reg_flags and test with LCFUNC (non-utf8) or LCFUNC_utf8 (utf8).
       * NAMEA/NNAMEA test only FUNCA(nextchr); NOTE(review): judging by the
       * suffix and the "utf8-invariant" remark these are the ASCII-restricted
       * variants -- confirm.  NAME/NNAME/FUNC go to _CCC_TRY_NONLOCALE. */
  225 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
  226                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
  227                 NAMEA, NNAMEA, FUNCA,                                         \
  228                 CLASS, STR)                                                   \
  229     case NAMEL:                                                               \
  230         PL_reg_flags |= RF_tainted;                                           \
  231         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
  232     case NNAMEL:                                                              \
  233         PL_reg_flags |= RF_tainted;                                           \
  234         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
  235                        CLASS, STR)                                            \
  236     case NAMEA:                                                               \
  237         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
  238             sayNO;                                                            \
  239         }                                                                     \
  240         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
  241         nextchr = UCHARAT(++locinput);                                        \
  242         break;                                                                \
  243     case NNAMEA:                                                              \
  244         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
  245             sayNO;                                                            \
  246         }                                                                     \
  247         if (utf8_target) {                                                    \
  248             locinput += PL_utf8skip[nextchr];                                 \
  249             nextchr = UCHARAT(locinput);                                      \
  250         }                                                                     \
  251         else {                                                                \
  252             nextchr = UCHARAT(++locinput);                                    \
  253         }                                                                     \
  254         break;                                                                \
  255     /* Generate the non-locale cases */                                       \
  256     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 257
  258 /* This is like CCC_TRY, but has an extra set of parameters for generating case
  259  * statements to handle separate Unicode semantics nodes: NAMEU/NNAMEU are the
       * extra case labels and FUNCU the non-utf8 test used for them.  It expands
       * to CCC_TRY followed by a second _CCC_TRY_NONLOCALE for those labels. */
  260 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
  261                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
  262                   NAMEU, NNAMEU, FUNCU,                                        \
  263                   NAMEA, NNAMEA, FUNCA,                                        \
  264                   CLASS, STR)                                                  \
  265     CCC_TRY(NAME, NNAME, FUNC,                                                 \
  266             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
  267             NAMEA, NNAMEA, FUNCA,                                              \
  268             CLASS, STR)                                                        \
  269     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 270
  271 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
  272
  273 /* for use after a quantifier and before an EXACT-like node -- japhy */
  274 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
  275  *
  276  * NOTE that *nothing* that affects backtracking should be in here, specifically
  277  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
  278  * node that is in between two EXACT like nodes when ascertaining what the required
  279  * "follow" character is. This should probably be moved to regex compile time
  280  * although it may be done at run time because of the REF possibility - more
  281  * investigation required. -- demerphq
       *
       * The CLOSE test reads cur_eval, so a variable of that name must be in scope
       * wherever JUMPABLE is expanded.  rn is evaluated many times.
  282 */
  283 #define JUMPABLE(rn) (      \
  284     OP(rn) == OPEN ||       \
  285     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
  286     OP(rn) == EVAL ||   \
  287     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
  288     OP(rn) == PLUS || OP(rn) == MINMOD || \
  289     OP(rn) == KEEPS || \
  290     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
  291 )
      /* True when the node's kind, per PL_regkind, is EXACT. */
  292 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
  293
      /* True when the node's kind is EXACT or REF. */
  294 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 295
  296 #if 0
  297 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
  298    we don't need this definition.
         NOTE(review): this disabled branch defines no IS_TEXTFU, unlike the
         live branch below; it would need one before being re-enabled. */
  299 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
  300 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
  301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
  302
  303 #else
  304 /* ... so we use this as it's faster.  Each macro tests OP(rn) against the
         exact-match opcode(s) named in its body and nothing else. */
  305 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
  306 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
  307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
  308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
  309
  310 #endif
 311
  312 /*
  313   Search for mandatory following text node; for lookahead, the text must
  314   follow but for lookbehind (rn->flags != 0) we skip to the next step.

        rn is modified in place: while JUMPABLE(rn) holds it is advanced past
        the current node, so it must be an assignable node pointer.  On exit rn
        is the first node for which JUMPABLE is false.
  315 */
  316 #define FIND_NEXT_IMPT(rn) STMT_START { \
  317     while (JUMPABLE(rn)) { \
  318         const OPCODE type = OP(rn); \
  319         if (type == SUSPEND || PL_regkind[type] == CURLY) \
  320             rn = NEXTOPER(NEXTOPER(rn)); \
  321         else if (type == PLUS) \
  322             rn = NEXTOPER(rn); \
  323         else if (type == IFMATCH) \
  324             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
  325         else rn += NEXT_OFF(rn); \
  326     } \
  327 } STMT_END
 328
 329
  330 static void restore_pos(pTHX_ void *arg);
  331
      /* Savestack slot counts used by S_regcppush():
       *   REGCP_PAREN_ELEMS - slots pushed per saved group (end, start,
       *                       start_tmp pointer, group number);
       *   REGCP_OTHER_ELEMS - slots always pushed (PL_regoffs, PL_regsize,
       *                       *PL_reglastparen, *PL_reglastcloseparen,
       *                       PL_reginput);
       *   REGCP_FRAME_ELEMS - the trailing SAVEt_REGCONTEXT cookie. */
  332 #define REGCP_PAREN_ELEMS 4
  333 #define REGCP_OTHER_ELEMS 5
  334 #define REGCP_FRAME_ELEMS 1
  335 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
  336  * are needed for the regexp context stack bookkeeping. */
 337
 338 STATIC CHECKPOINT
 339 S_regcppush(pTHX_ I32 parenfloor)
 340 {
 341     dVAR;
 342     const int retval = PL_savestack_ix;
 343     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 344     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 345     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 346     int p;
 347     GET_RE_DEBUG_FLAGS_DECL;
 348
 349     if (paren_elems_to_push < 0)
 350         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 351
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     for (p = PL_regsize; p > parenfloor; p--) {
 360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 361         SSPUSHINT(PL_regoffs[p].end);
 362         SSPUSHINT(PL_regoffs[p].start);
 363         SSPUSHPTR(PL_reg_start_tmp[p]);
 364         SSPUSHINT(p);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 367                       (UV)p, (IV)PL_regoffs[p].start,
 368                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 369                       (IV)PL_regoffs[p].end
 370         ));
 371     }
 372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 373     SSPUSHPTR(PL_regoffs);
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(*PL_reglastparen);
 376     SSPUSHINT(*PL_reglastcloseparen);
 377     SSPUSHPTR(PL_reginput);
 378     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 379
 380     return retval;
 381 }
 382
  383 /* These are needed since we do not localize EVAL nodes: */
      /* REGCP_SET(cp): store the current PL_savestack_ix in cp (after an
       * optional debug trace).
       * NOTE(review): expands to two statements and has no trailing semicolon,
       * so it must be used as a full statement inside a braced block. */
  384 #define REGCP_SET(cp)                                           \
  385     DEBUG_STATE_r(                                              \
  386             PerlIO_printf(Perl_debug_log,                       \
  387                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
  388                 (IV)PL_savestack_ix));                          \
  389     cp = PL_savestack_ix
  390
      /* REGCP_UNWIND(cp): regcpblow() back to cp; the debug trace is printed
       * only when cp differs from the current PL_savestack_ix.  The same
       * two-statement caveat as for REGCP_SET applies. */
  391 #define REGCP_UNWIND(cp)                                        \
  392     DEBUG_STATE_r(                                              \
  393         if (cp != PL_savestack_ix)                              \
  394             PerlIO_printf(Perl_debug_log,                       \
  395                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
  396                 (IV)(cp), (IV)PL_savestack_ix));                \
  397     regcpblow(cp)
 398
 399 STATIC char *
 400 S_regcppop(pTHX_ const regexp *rex)
 401 {
 402     dVAR;
 403     UV i;
 404     char *input;
 405     GET_RE_DEBUG_FLAGS_DECL;
 406
 407     PERL_ARGS_ASSERT_REGCPPOP;
 408
 409     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 410     i = SSPOPUV;
 411     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 412     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 413     input = (char *) SSPOPPTR;
 414     *PL_reglastcloseparen = SSPOPINT;
 415     *PL_reglastparen = SSPOPINT;
 416     PL_regsize = SSPOPINT;
 417     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 418
 419     i -= REGCP_OTHER_ELEMS;
 420     /* Now restore the parentheses context. */
 421     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 422         I32 tmps;
 423         U32 paren = (U32)SSPOPINT;
 424         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 425         PL_regoffs[paren].start = SSPOPINT;
 426         tmps = SSPOPINT;
 427         if (paren <= *PL_reglastparen)
 428             PL_regoffs[paren].end = tmps;
 429         DEBUG_BUFFERS_r(
 430             PerlIO_printf(Perl_debug_log,
 431                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 432                           (UV)paren, (IV)PL_regoffs[paren].start,
 433                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 434                           (IV)PL_regoffs[paren].end,
 435                           (paren > *PL_reglastparen ? "(no)" : ""));
 436         );
 437     }
 438     DEBUG_BUFFERS_r(
 439         if (*PL_reglastparen + 1 <= rex->nparens) {
 440             PerlIO_printf(Perl_debug_log,
 441                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 442                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 443         }
 444     );
 445 #if 1
 446     /* It would seem that the similar code in regtry()
 447      * already takes care of this, and in fact it is in
 448      * a better location to since this code can #if 0-ed out
 449      * but the code in regtry() is needed or otherwise tests
 450      * requiring null fields (pat.t#187 and split.t#{13,14}
 451      * (as of patchlevel 7877)  will fail.  Then again,
 452      * this code seems to be necessary or otherwise
 453      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 454      * --jhi updated by dapm */
 455     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 456         if (i > PL_regsize)
 457             PL_regoffs[i].start = -1;
 458         PL_regoffs[i].end = -1;
 459     }
 460 #endif
 461     return input;
 462 }
 463
      /* Hand checkpoint cp to LEAVE_SCOPE, as used by REGCP_UNWIND above. */
  464 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 465
 466 /*
 467  * pregexec and friends
 468  */
 469
 470 #ifndef PERL_IN_XSUB_RE
 471 /*
 472  - pregexec - match a regexp against a string
 473  */
 474 I32
 475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 476          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 477 /* strend: pointer to null at end of string */
 478 /* strbeg: real beginning of string */
 479 /* minend: end of match must be >=minend after stringarg. */
 480 /* nosave: For optimizations. */
 481 {
 482     PERL_ARGS_ASSERT_PREGEXEC;
 483
 484     return
 485         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 486                       nosave ? 0 : REXEC_COPY_STR);
 487 }
 488 #endif
 489
 490 /*
 491  * Need to implement the following flags for reg_anch:
 492  *
 493  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 494  * USE_INTUIT_ML
 495  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 496  * INTUIT_AUTORITATIVE_ML
 497  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 498  * INTUIT_ONCE_ML
 499  *
 500  * Another flag for this function: SECOND_TIME (so that float substrs
 501  * with giant delta may be not rechecked).
 502  */
 503
 504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 505
 506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 507    Otherwise, only SvCUR(sv) is used to get strbeg. */
 508
 509 /* XXXX We assume that strpos is strbeg unless sv. */
 510
 511 /* XXXX Some places assume that there is a fixed substring.
 512         An update may be needed if optimizer marks as "INTUITable"
 513         RExen without fixed substrings.  Similarly, it is assumed that
 514         lengths of all the strings are no more than minlen, thus they
 515         cannot come from lookahead.
 516         (Or minlen should take into account lookahead.)
 517   NOTE: Some of this comment is not correct. minlen does now take account
 518   of lookahead/behind. Further research is required. -- demerphq
 519
 520 */
 521
  522 /* A failure to find a constant substring means that there is no need to make
  523    an expensive call to the REx engine, thus we celebrate a failure.  Similarly,
  524    finding a substring too deep into the string means that fewer calls to
  525    regtry() should be needed.
 526
 527    REx compiler's optimizer found 4 possible hints:
 528         a) Anchored substring;
 529         b) Fixed substring;
 530         c) Whether we are anchored (beginning-of-line or \G);
 531         d) First node (of those at offset 0) which may distinguish positions;
 532    We use a)b)d) and multiline-part of c), and try to find a position in the
 533    string which does not contradict any of them.
 534  */
 535
  536 /* Most of the decisions we make here should have been made at compile time.
  537    The nodes of the REx which we used for the search should have been
  538    deleted from the finite automaton. */
 539
 540 char *
 541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 542                      char *strend, const U32 flags, re_scream_pos_data *data)
 543 {
 544     dVAR;
 545     struct regexp *const prog = (struct regexp *)SvANY(rx);
 546     register I32 start_shift = 0;
 547     /* Should be nonnegative! */
 548     register I32 end_shift   = 0;
 549     register char *s;
 550     register SV *check;
 551     char *strbeg;
 552     char *t;
 553     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 554     I32 ml_anch;
 555     register char *other_last = NULL;   /* other substr checked before this */
 556     char *check_at = NULL;              /* check substr found at this pos */
 557     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 558     RXi_GET_DECL(prog,progi);
 559 #ifdef DEBUGGING
 560     const char * const i_strpos = strpos;
 561 #endif
 562     GET_RE_DEBUG_FLAGS_DECL;
 563
 564     PERL_ARGS_ASSERT_RE_INTUIT_START;
 565
 566     RX_MATCH_UTF8_set(rx,utf8_target);
 567
 568     if (RX_UTF8(rx)) {
 569         PL_reg_flags |= RF_utf8;
 570     }
 571     DEBUG_EXECUTE_r(
 572         debug_start_match(rx, utf8_target, strpos, strend,
 573             sv ? "Guessing start of match in sv for"
 574                : "Guessing start of match in string for");
 575               );
 576
 577     /* CHR_DIST() would be more correct here but it makes things slow. */
 578     if (prog->minlen > strend - strpos) {
 579         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 580                               "String too short... [re_intuit_start]\n"));
 581         goto fail;
 582     }
 583
 584     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 585     PL_regeol = strend;
 586     if (utf8_target) {
 587         if (!prog->check_utf8 && prog->check_substr)
 588             to_utf8_substr(prog);
 589         check = prog->check_utf8;
 590     } else {
 591         if (!prog->check_substr && prog->check_utf8)
 592             to_byte_substr(prog);
 593         check = prog->check_substr;
 594     }
 595     if (check == &PL_sv_undef) {
 596         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 597                 "Non-utf8 string cannot match utf8 check string\n"));
 598         goto fail;
 599     }
 600     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 601         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 602                      || ( (prog->extflags & RXf_ANCH_BOL)
 603                           && !multiline ) );    /* Check after \n? */
 604
 605         if (!ml_anch) {
 606           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 607                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 608                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 609                && sv && !SvROK(sv)
 610                && (strpos != strbeg)) {
 611               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 612               goto fail;
 613           }
 614           if (prog->check_offset_min == prog->check_offset_max &&
 615               !(prog->extflags & RXf_CANY_SEEN)) {
 616             /* Substring at constant offset from beg-of-str... */
 617             I32 slen;
 618
 619             s = HOP3c(strpos, prog->check_offset_min, strend);
 620
 621             if (SvTAIL(check)) {
 622                 slen = SvCUR(check);    /* >= 1 */
 623
 624                 if ( strend - s > slen || strend - s < slen - 1
 625                      || (strend - s == slen && strend[-1] != '\n')) {
 626                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 627                     goto fail_finish;
 628                 }
 629                 /* Now should match s[0..slen-2] */
 630                 slen--;
 631                 if (slen && (*SvPVX_const(check) != *s
 632                              || (slen > 1
 633                                  && memNE(SvPVX_const(check), s, slen)))) {
 634                   report_neq:
 635                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 636                     goto fail_finish;
 637                 }
 638             }
 639             else if (*SvPVX_const(check) != *s
 640                      || ((slen = SvCUR(check)) > 1
 641                          && memNE(SvPVX_const(check), s, slen)))
 642                 goto report_neq;
 643             check_at = s;
 644             goto success_at_start;
 645           }
 646         }
 647         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 648         s = strpos;
 649         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 650         end_shift = prog->check_end_shift;
 651
 652         if (!ml_anch) {
 653             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 654                                          - (SvTAIL(check) != 0);
 655             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 656
 657             if (end_shift < eshift)
 658                 end_shift = eshift;
 659         }
 660     }
 661     else {                              /* Can match at random position */
 662         ml_anch = 0;
 663         s = strpos;
 664         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 665         end_shift = prog->check_end_shift;
 666
 667         /* end shift should be non negative here */
 668     }
 669
 670 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 671     if (end_shift < 0)
 672         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 673                    (IV)end_shift, RX_PRECOMP(prog));
 674 #endif
 675
 676   restart:
 677     /* Find a possible match in the region s..strend by looking for
 678        the "check" substring in the region corrected by start/end_shift. */
 679
 680     {
 681         I32 srch_start_shift = start_shift;
 682         I32 srch_end_shift = end_shift;
 683         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 684             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 685             srch_start_shift = strbeg - s;
 686         }
 687     DEBUG_OPTIMISE_MORE_r({
 688         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 689             (IV)prog->check_offset_min,
 690             (IV)srch_start_shift,
 691             (IV)srch_end_shift,
 692             (IV)prog->check_end_shift);
 693     });
 694
 695     if (flags & REXEC_SCREAM) {
 696         I32 p = -1;                     /* Internal iterator of scream. */
 697         I32 * const pp = data ? data->scream_pos : &p;
 698
 699         if (PL_screamfirst[BmRARE(check)] >= 0
 700             || ( BmRARE(check) == '\n'
 701                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 702                  && SvTAIL(check) ))
 703             s = screaminstr(sv, check,
 704                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 705         else
 706             goto fail_finish;
 707         /* we may be pointing at the wrong string */
 708         if (s && RXp_MATCH_COPIED(prog))
 709             s = strbeg + (s - SvPVX_const(sv));
 710         if (data)
 711             *data->scream_olds = s;
 712     }
 713     else {
 714         U8* start_point;
 715         U8* end_point;
 716         if (prog->extflags & RXf_CANY_SEEN) {
 717             start_point= (U8*)(s + srch_start_shift);
 718             end_point= (U8*)(strend - srch_end_shift);
 719         } else {
 720             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 721             end_point= HOP3(strend, -srch_end_shift, strbeg);
 722         }
 723         DEBUG_OPTIMISE_MORE_r({
 724             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 725                 (int)(end_point - start_point),
 726                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 727                 start_point);
 728         });
 729
 730         s = fbm_instr( start_point, end_point,
 731                       check, multiline ? FBMrf_MULTILINE : 0);
 732     }
 733     }
 734     /* Update the count-of-usability, remove useless subpatterns,
 735         unshift s.  */
 736
 737     DEBUG_EXECUTE_r({
 738         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 739             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 740         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 741                           (s ? "Found" : "Did not find"),
 742             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 743                 ? "anchored" : "floating"),
 744             quoted,
 745             RE_SV_TAIL(check),
 746             (s ? " at offset " : "...\n") );
 747     });
 748
 749     if (!s)
 750         goto fail_finish;
 751     /* Finish the diagnostic message */
 752     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 753
 754     /* XXX dmq: first branch is for positive lookbehind...
 755        Our check string is offset from the beginning of the pattern.
 756        So we need to do any stclass tests offset forward from that
 757        point. I think. :-(
 758      */
 759
 760
 761
 762     check_at=s;
 763
 764
 765     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 766        Start with the other substr.
 767        XXXX no SCREAM optimization yet - and a very coarse implementation
 768        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 769                 *always* match.  Probably should be marked during compile...
 770        Probably it is right to do no SCREAM here...
 771      */
 772
 773     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 774                 : (prog->float_substr && prog->anchored_substr))
 775     {
 776         /* Take into account the "other" substring. */
 777         /* XXXX May be hopelessly wrong for UTF... */
 778         if (!other_last)
 779             other_last = strpos;
 780         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 781           do_other_anchored:
 782             {
 783                 char * const last = HOP3c(s, -start_shift, strbeg);
 784                 char *last1, *last2;
 785                 char * const saved_s = s;
 786                 SV* must;
 787
 788                 t = s - prog->check_offset_max;
 789                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 790                     && (!utf8_target
 791                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 792                             && t > strpos)))
 793                     NOOP;
 794                 else
 795                     t = strpos;
 796                 t = HOP3c(t, prog->anchored_offset, strend);
 797                 if (t < other_last)     /* These positions already checked */
 798                     t = other_last;
 799                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 800                 if (last < last1)
 801                     last1 = last;
 802                 /* XXXX It is not documented what units *_offsets are in.
 803                    We assume bytes, but this is clearly wrong.
 804                    Meaning this code needs to be carefully reviewed for errors.
 805                    dmq.
 806                   */
 807
 808                 /* On end-of-str: see comment below. */
 809                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 810                 if (must == &PL_sv_undef) {
 811                     s = (char*)NULL;
 812                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 813                 }
 814                 else
 815                     s = fbm_instr(
 816                         (unsigned char*)t,
 817                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 818                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 819                         must,
 820                         multiline ? FBMrf_MULTILINE : 0
 821                     );
 822                 DEBUG_EXECUTE_r({
 823                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 824                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 825                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 826                         (s ? "Found" : "Contradicts"),
 827                         quoted, RE_SV_TAIL(must));
 828                 });
 829
 830
 831                 if (!s) {
 832                     if (last1 >= last2) {
 833                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 834                                                 ", giving up...\n"));
 835                         goto fail_finish;
 836                     }
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 838                         ", trying floating at offset %ld...\n",
 839                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 840                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 841                     s = HOP3c(last, 1, strend);
 842                     goto restart;
 843                 }
 844                 else {
 845                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 846                           (long)(s - i_strpos)));
 847                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 848                     other_last = HOP3c(s, 1, strend);
 849                     s = saved_s;
 850                     if (t == strpos)
 851                         goto try_at_start;
 852                     goto try_at_offset;
 853                 }
 854             }
 855         }
 856         else {          /* Take into account the floating substring. */
 857             char *last, *last1;
 858             char * const saved_s = s;
 859             SV* must;
 860
 861             t = HOP3c(s, -start_shift, strbeg);
 862             last1 = last =
 863                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 864             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 865                 last = HOP3c(t, prog->float_max_offset, strend);
 866             s = HOP3c(t, prog->float_min_offset, strend);
 867             if (s < other_last)
 868                 s = other_last;
 869  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 870             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 871             /* fbm_instr() takes into account exact value of end-of-str
 872                if the check is SvTAIL(ed).  Since false positives are OK,
 873                and end-of-str is not later than strend we are OK. */
 874             if (must == &PL_sv_undef) {
 875                 s = (char*)NULL;
 876                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 877             }
 878             else
 879                 s = fbm_instr((unsigned char*)s,
 880                               (unsigned char*)last + SvCUR(must)
 881                                   - (SvTAIL(must)!=0),
 882                               must, multiline ? FBMrf_MULTILINE : 0);
 883             DEBUG_EXECUTE_r({
 884                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 885                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 886                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 887                     (s ? "Found" : "Contradicts"),
 888                     quoted, RE_SV_TAIL(must));
 889             });
 890             if (!s) {
 891                 if (last1 == last) {
 892                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 893                                             ", giving up...\n"));
 894                     goto fail_finish;
 895                 }
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 897                     ", trying anchored starting at offset %ld...\n",
 898                     (long)(saved_s + 1 - i_strpos)));
 899                 other_last = last;
 900                 s = HOP3c(t, 1, strend);
 901                 goto restart;
 902             }
 903             else {
 904                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 905                       (long)(s - i_strpos)));
 906                 other_last = s; /* Fix this later. --Hugo */
 907                 s = saved_s;
 908                 if (t == strpos)
 909                     goto try_at_start;
 910                 goto try_at_offset;
 911             }
 912         }
 913     }
 914
 915
 916     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 917
 918     DEBUG_OPTIMISE_MORE_r(
 919         PerlIO_printf(Perl_debug_log,
 920             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 921             (IV)prog->check_offset_min,
 922             (IV)prog->check_offset_max,
 923             (IV)(s-strpos),
 924             (IV)(t-strpos),
 925             (IV)(t-s),
 926             (IV)(strend-strpos)
 927         )
 928     );
 929
 930     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 931         && (!utf8_target
 932             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 933                  && t > strpos)))
 934     {
 935         /* Fixed substring is found far enough so that the match
 936            cannot start at strpos. */
 937       try_at_offset:
 938         if (ml_anch && t[-1] != '\n') {
 939             /* Eventually fbm_*() should handle this, but often
 940                anchored_offset is not 0, so this check will not be wasted. */
 941             /* XXXX In the code below we prefer to look for "^" even in
 942                presence of anchored substrings.  And we search even
 943                beyond the found float position.  These pessimizations
 944                are historical artefacts only.  */
 945           find_anchor:
 946             while (t < strend - prog->minlen) {
 947                 if (*t == '\n') {
 948                     if (t < check_at - prog->check_offset_min) {
 949                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 950                             /* Since we moved from the found position,
 951                                we definitely contradict the found anchored
 952                                substr.  Due to the above check we do not
 953                                contradict "check" substr.
 954                                Thus we can arrive here only if check substr
 955                                is float.  Redo checking for "other"=="fixed".
 956                              */
 957                             strpos = t + 1;
 958                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 959                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 960                             goto do_other_anchored;
 961                         }
 962                         /* We don't contradict the found floating substring. */
 963                         /* XXXX Why not check for STCLASS? */
 964                         s = t + 1;
 965                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 966                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 967                         goto set_useful;
 968                     }
 969                     /* Position contradicts check-string */
 970                     /* XXXX probably better to look for check-string
 971                        than for "\n", so one should lower the limit for t? */
 972                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 973                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 974                     other_last = strpos = s = t + 1;
 975                     goto restart;
 976                 }
 977                 t++;
 978             }
 979             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 980                         PL_colors[0], PL_colors[1]));
 981             goto fail_finish;
 982         }
 983         else {
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 985                         PL_colors[0], PL_colors[1]));
 986         }
 987         s = t;
 988       set_useful:
 989         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 990     }
 991     else {
 992         /* The found string does not prohibit matching at strpos,
 993            - no optimization of calling REx engine can be performed,
 994            unless it was an MBOL and we are not after MBOL,
 995            or a future STCLASS check will fail this. */
 996       try_at_start:
 997         /* Even in this situation we may use MBOL flag if strpos is offset
 998            wrt the start of the string. */
 999         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000             && (strpos != strbeg) && strpos[-1] != '\n'
1001             /* May be due to an implicit anchor of m{.*foo}  */
1002             && !(prog->intflags & PREGf_IMPLICIT))
1003         {
1004             t = strpos;
1005             goto find_anchor;
1006         }
1007         DEBUG_EXECUTE_r( if (ml_anch)
1008             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1010         );
1011       success_at_start:
1012         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1013             && (utf8_target ? (
1014                 prog->check_utf8                /* Could be deleted already */
1015                 && --BmUSEFUL(prog->check_utf8) < 0
1016                 && (prog->check_utf8 == prog->float_utf8)
1017             ) : (
1018                 prog->check_substr              /* Could be deleted already */
1019                 && --BmUSEFUL(prog->check_substr) < 0
1020                 && (prog->check_substr == prog->float_substr)
1021             )))
1022         {
1023             /* If flags & SOMETHING - do not do it many times on the same match */
1024             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025             /* XXX Does the destruction order have to change with utf8_target? */
1026             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1029             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1030             check = NULL;                       /* abort */
1031             s = strpos;
1032             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1033                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034             if (prog->intflags & PREGf_IMPLICIT)
1035                 prog->extflags &= ~RXf_ANCH_MBOL;
1036             /* XXXX This is a remnant of the old implementation.  It
1037                     looks wasteful, since now INTUIT can use many
1038                     other heuristics. */
1039             prog->extflags &= ~RXf_USE_INTUIT;
1040             /* XXXX What other flags might need to be cleared in this branch? */
1041         }
1042         else
1043             s = strpos;
1044     }
1045
1046     /* Last resort... */
1047     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048     /* trie stclasses are too expensive to use here, we are better off to
1049        leave it to regmatch itself */
1050     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051         /* minlen == 0 is possible if regstclass is \b or \B,
1052            and the fixed substr is ''$.
1053            Since minlen is already taken into account, s+1 is before strend;
1054            accidentally, minlen >= 1 guarantees no false positives at s + 1
1055            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1056            regstclass does not come from lookahead...  */
1057         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1059         const U8* const str = (U8*)STRING(progi->regstclass);
1060         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1062                     : 1);
1063         char * endpos;
1064         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066         else if (prog->float_substr || prog->float_utf8)
1067             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1068         else
1069             endpos= strend;
1070
1071         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1073
1074         t = s;
1075         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1076         if (!s) {
1077 #ifdef DEBUGGING
1078             const char *what = NULL;
1079 #endif
1080             if (endpos == strend) {
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                                 "Could not match STCLASS...\n") );
1083                 goto fail;
1084             }
1085             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086                                    "This position contradicts STCLASS...\n") );
1087             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1088                 goto fail;
1089             /* Contradict one of substrings */
1090             if (prog->anchored_substr || prog->anchored_utf8) {
1091                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092                     DEBUG_EXECUTE_r( what = "anchored" );
1093                   hop_and_restart:
1094                     s = HOP3c(t, 1, strend);
1095                     if (s + start_shift + end_shift > strend) {
1096                         /* XXXX Should be taken into account earlier? */
1097                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                                "Could not match STCLASS...\n") );
1099                         goto fail;
1100                     }
1101                     if (!check)
1102                         goto giveup;
1103                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Looking for %s substr starting at offset %ld...\n",
1105                                  what, (long)(s + start_shift - i_strpos)) );
1106                     goto restart;
1107                 }
1108                 /* Have both, check_string is floating */
1109                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110                     goto retry_floating_check;
1111                 /* Recheck anchored substring, but not floating... */
1112                 s = check_at;
1113                 if (!check)
1114                     goto giveup;
1115                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                           "Looking for anchored substr starting at offset %ld...\n",
1117                           (long)(other_last - i_strpos)) );
1118                 goto do_other_anchored;
1119             }
1120             /* Another way we could have checked stclass at the
1121                current position only: */
1122             if (ml_anch) {
1123                 s = t = t + 1;
1124                 if (!check)
1125                     goto giveup;
1126                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                           "Looking for /%s^%s/m starting at offset %ld...\n",
1128                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1129                 goto try_at_offset;
1130             }
1131             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1132                 goto fail;
1133             /* Check is floating substring. */
1134           retry_floating_check:
1135             t = check_at - start_shift;
1136             DEBUG_EXECUTE_r( what = "floating" );
1137             goto hop_and_restart;
1138         }
1139         if (t != s) {
1140             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141                         "By STCLASS: moving %ld --> %ld\n",
1142                                   (long)(t - i_strpos), (long)(s - i_strpos))
1143                    );
1144         }
1145         else {
1146             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147                                   "Does not contradict STCLASS...\n");
1148                    );
1149         }
1150     }
1151   giveup:
1152     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1154                           PL_colors[5], (long)(s - i_strpos)) );
1155     return s;
1156
1157   fail_finish:                          /* Substring not found */
1158     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1159         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1160   fail:
1161     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162                           PL_colors[4], PL_colors[5]));
1163     return NULL;
1164 }
1165
1166 #define DECL_TRIE_TYPE(scan) /* declares a const local `trie_type`; reads utf8_target and UTF_PATTERN at the expansion site */ \
1167     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1168                     trie_type = ((scan)->flags != EXACT) /* argument parenthesized so any pointer expression is safe */ \
1169                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) /* flags != EXACT: a *_fold variant unless neither side is UTF-8 */ \
1170                               : (utf8_target ? trie_utf8 : trie_plain) /* flags == EXACT: never a *_fold variant */
1171
1172 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1173 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1174     switch (trie_type) {                                                    \
1175     case trie_utf8_fold:                                                    \
1176         if ( foldlen>0 ) {                                                  \
1177             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1178             foldlen -= len;                                                 \
1179             uscan += len;                                                   \
1180             len=0;                                                          \
1181         } else {                                                            \
1182             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1183             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1184             foldlen -= UNISKIP( uvc );                                      \
1185             uscan = foldbuf + UNISKIP( uvc );                               \
1186         }                                                                   \
1187         break;                                                              \
1188     case trie_latin_utf8_fold:                                              \
1189         if ( foldlen>0 ) {                                                  \
1190             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1191             foldlen -= len;                                                 \
1192             uscan += len;                                                   \
1193             len=0;                                                          \
1194         } else {                                                            \
1195             len = 1;                                                        \
1196             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1197             foldlen -= UNISKIP( uvc );                                      \
1198             uscan = foldbuf + UNISKIP( uvc );                               \
1199         }                                                                   \
1200         break;                                                              \
1201     case trie_utf8:                                                         \
1202         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1203         break;                                                              \
1204     case trie_plain:                                                        \
1205         uvc = (UV)*uc;                                                      \
1206         len = 1;                                                            \
1207     }                                                                       \
1208     if (uvc < 256) {                                                        \
1209         charid = trie->charmap[ uvc ];                                      \
1210     }                                                                       \
1211     else {                                                                  \
1212         charid = 0;                                                         \
1213         if (widecharmap) {                                                  \
1214             SV** const svpp = hv_fetch(widecharmap,                         \
1215                         (char*)&uvc, sizeof(UV), 0);                        \
1216             if (svpp)                                                       \
1217                 charid = (U16)SvIV(*svpp);                                  \
1218         }                                                                   \
1219     }                                                                       \
1220 } STMT_END
1221
1222 #define REXEC_FBC_EXACTISH_SCAN(CoNd) /* walk s up to e; jump to got_it at the first s where CoNd, folder() (skipped when ln == 1) and regtry() (skipped when reginfo is NULL) all hold */ \
1223 STMT_START { /* locals and got_it come from caller */     \
1224     while (s <= e) { /* e is an inclusive bound */       \
1225         if ( (CoNd) /* caller's test runs first */       \
1226              && (ln == 1 || folder(s, pat_string, ln))    \
1227              && (!reginfo || regtry(reginfo, &s)) )       \
1228             goto got_it; /* label at the use site */      \
1229         s++; /* try the next position */                 \
1230     }                                                     \
1231 } STMT_END
1232
1233 #define REXEC_FBC_UTF8_SCAN(CoDe) /* run CoDe at s, then advance s by UTF8SKIP(s); loops while s + UTF8SKIP(s) does not pass strend */ \
1234 STMT_START { /* s, uskip, strend: caller's */         \
1235     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1236         CoDe /* body, pasted verbatim */              \
1237         s += uskip; /* length taken before CoDe */    \
1238     }                                                 \
1239 } STMT_END
1240
1241 #define REXEC_FBC_SCAN(CoDe) /* run CoDe at every s from the current s up to, but not including, strend; s and strend come from the expansion site */ \
1242 STMT_START {                                          \
1243     while (s < strend) { /* strend is exclusive */    \
1244         CoDe /* body, pasted verbatim */              \
1245         s++; /* a `continue` in CoDe skips it */      \
1246     }                                                 \
1247 } STMT_END
1248
1249 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) /* REXEC_FBC_UTF8_SCAN loop: where CoNd holds and tmp is nonzero, go to got_it if reginfo is NULL or regtry() succeeds; otherwise tmp = doevery; where CoNd fails, tmp = 1 */ \
1250 REXEC_FBC_UTF8_SCAN( /* tmp gates the attempts */     \
1251     if (CoNd) { /* position satisfies CoNd */         \
1252         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1253             goto got_it; /* accepted at s */          \
1254         else                                          \
1255             tmp = doevery; /* gate next hit in run */ \
1256     }                                                 \
1257     else                                              \
1258         tmp = 1; /* miss re-arms the gate */          \
1259 )
1260
1261 #define REXEC_FBC_CLASS_SCAN(CoNd) /* REXEC_FBC_SCAN (byte steps) that attempts a match wherever CoNd holds, gated by tmp */ \
1262 REXEC_FBC_SCAN(                                       \
1263     if (CoNd) {                                       \
1264         if (tmp && (!reginfo || regtry(reginfo, &s))) /* tmp == 0 suppresses the attempt */ \
1265             goto got_it;                              \
1266         else                                          \
1267             tmp = doevery; /* after a hit that did not succeed, the next adjacent hit is tried only if doevery is set */ \
1268     }                                                 \
1269     else                                              \
1270         tmp = 1;          /* a position failing CoNd re-enables attempts */ \
1271 )
1272
1273 #define REXEC_FBC_TRYIT /* Jump to got_it if reginfo is NULL or regtry() succeeds at s; caller supplies the ';' */ \
1274 if ((!reginfo || regtry(reginfo, &s))) /* NOTE(review): bare `if`, so an `else` placed right after an expansion would bind to it */ \
1275     goto got_it
1276
1277 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) /* Class scan: CoNdUtF8 for a UTF-8 target, CoNd otherwise */ \
1278     if (utf8_target) {     /* NOTE(review): expands to a bare if/else, not wrapped in STMT_START/STMT_END */ \
1279         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1280     }                                                          \
1281     else {                                                     \
1282         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1283     }
1284
1285 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) /* As REXEC_FBC_CSCAN, but runs UtFpReLoAd first on the UTF-8 path */ \
1286     if (utf8_target) {                                             \
1287         UtFpReLoAd;        /* executed once, before the scan, and only for a UTF-8 target */ \
1288         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1289     }                                                          \
1290     else {                                                     \
1291         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1292     }
1293
1294 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) /* As REXEC_FBC_CSCAN, but first sets RF_tainted in PL_reg_flags */ \
1295     PL_reg_flags |= RF_tainted; /* NOTE(review): two unwrapped statements; unsafe as the unbraced body of an if/else */ \
1296     if (utf8_target) {                                             \
1297         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1298     }                                                          \
1299     else {                                                     \
1300         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1301     }
1302
1303 #define DUMP_EXEC_POS(li,s,doutf8) /* dump_exec_pos() with PL_regeol, PL_bostr and PL_reg_starttry supplied as the middle three arguments */ \
1304     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1305
1306
1307 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* UTF-8-target boundary scan using only the single-byte test; runs IF_SUCCESS where the result changes, IF_FAIL elsewhere */ \
1308         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; /* byte before s, or '\n' at the start of the string */ \
1309         tmp = TEST_NON_UTF8(tmp);  /* tmp now holds the test result for that preceding byte */ \
1310         REXEC_FBC_UTF8_SCAN(                                                   \
1311             if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* previous and current results differ (assuming the test yields 0 or 1) */ \
1312                 tmp = !tmp;        /* carry the current result into the next iteration */ \
1313                 IF_SUCCESS;                                                    \
1314             }                                                                  \
1315             else {                                                             \
1316                 IF_FAIL;                                                       \
1317             }                                                                  \
1318         );
1319
1320 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) /* UTF-8-target boundary scan using caller-supplied tests; runs IF_SUCCESS where the result changes, IF_FAIL elsewhere */ \
1321         if (s == PL_bostr) {                                                   \
1322             tmp = '\n';            /* no preceding character: use a newline */ \
1323         }                                                                      \
1324         else {                                                                 \
1325             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); /* presumably the start of the previous character -- TODO confirm */ \
1326             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1327         }                                                                      \
1328         tmp = TeSt1_UtF8;          /* caller's expression; the visible call sites pass a test of tmp */ \
1329         LOAD_UTF8_CHARCLASS_ALNUM();                                           \
1330         REXEC_FBC_UTF8_SCAN(                                                   \
1331             if (tmp == ! (TeSt2_UtF8)) { /* caller's test of the character at s disagrees with the previous result */ \
1332                 tmp = !tmp;        /* carry the current result into the next iteration */ \
1333                 IF_SUCCESS;                                                    \
1334             }                                                                  \
1335             else {                                                             \
1336                 IF_FAIL;                                                       \
1337             }                                                                  \
1338         );
1339
1340 /* The only difference between the BOUND and NBOUND cases is that
1341  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1342  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1343  * with the other one being empty */
1344 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* REXEC_FBC_TRYIT where the test result changes; UTF-8 side built with UTF8_LOAD */ \
1345     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1346
1347 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* As FBC_BOUND, but UTF-8 side built with UTF8_NOLOAD; TEST1_UTF8 and TEST2_UTF8 are accepted but unused */ \
1348     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1349
1350 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* REXEC_FBC_TRYIT where the test result does NOT change; UTF-8 side built with UTF8_LOAD */ \
1351     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1352
1353 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) /* As FBC_NBOUND, but UTF-8 side built with UTF8_NOLOAD; TEST1_UTF8 and TEST2_UTF8 are accepted but unused */ \
1354     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1355
1356
1357 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1358  * be passed in completely with the variable name being tested, which isn't
1359  * such a clean interface, but this is easier to read than it was before.  We
1360  * are looking for the boundary (or non-boundary) between a word and non-word
1361  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1362  * must be different.  Find the "wordness" of the character just prior to this
1363  * one, and compare it with the wordness of this one.  If they differ, we have
1364  * a boundary.  At the beginning of the string, pretend that the previous
1365  * character was a new-line */
1366 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* UTF8_CODE is the whole scan for a UTF-8 target; the other arguments drive the byte scan */ \
1367     if (utf8_target) {                                                         \
1368                 UTF8_CODE \
1369     }                                                                          \
1370     else {  /* Not utf8 */                                                     \
1371         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; /* byte before s, or '\n' at the start of the string */ \
1372         tmp = TEST_NON_UTF8(tmp);  /* tmp now holds the test result for that preceding byte */ \
1373         REXEC_FBC_SCAN(                                                        \
1374             if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* previous and current results differ (assuming the test yields 0 or 1) */ \
1375                 tmp = !tmp;        /* carry the current result into the next iteration */ \
1376                 IF_SUCCESS;                                                    \
1377             }                                                                  \
1378             else {                                                             \
1379                 IF_FAIL;                                                       \
1380             }                                                                  \
1381         );                                                                     \
1382     }                                                                          \
1383     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) /* after the scan: one more attempt at s when minlen is 0 and tmp is set */ \
1384         goto got_it; /* NOTE(review): the same tmp condition serves BOUND and NBOUND callers -- confirm intended for NBOUND */
1385
1386 /* We know what class REx starts with.  Try to find this position... */
1387 /* If reginfo is NULL, it's a dry run */
1388 /* annoyingly all the vars in this routine have different names from their counterparts
1389    in regmatch. /grrr */
1390
1391 STATIC char *
1392 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1393     const char *strend, regmatch_info *reginfo)
1394 {
1395         dVAR;
1396         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1397         char *pat_string;   /* The pattern's exactish string */
1398         char *pat_end;      /* ptr to end char of pat_string */
1399         re_fold_t folder;       /* Function for computing non-utf8 folds */
1400         const U8 *fold_array;   /* array for folding ords < 256 */
1401         STRLEN ln;
1402         STRLEN lnc;
1403         register STRLEN uskip;
1404         U8 c1;
1405         U8 c2;
1406         char *e;
1407         register I32 tmp = 1;   /* Scratch variable? */
1408         register const bool utf8_target = PL_reg_match_utf8;
1409         UV utf8_fold_flags = 0;
1410         RXi_GET_DECL(prog,progi);
1411
1412         PERL_ARGS_ASSERT_FIND_BYCLASS;
1413
1414         /* We know what class it must start with. */
1415         switch (OP(c)) {
1416         case ANYOFV:
1417         case ANYOF:
1418             if (utf8_target || OP(c) == ANYOFV) {
1419                 STRLEN inclasslen = strend - s;
1420                 REXEC_FBC_UTF8_CLASS_SCAN(
1421                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1422             }
1423             else {
1424                 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1425             }
1426             break;
1427         case CANY:
1428             REXEC_FBC_SCAN(
1429                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1430                     goto got_it;
1431                 else
1432                     tmp = doevery;
1433             );
1434             break;
1435
1436         case EXACTFA:
1437             if (UTF_PATTERN || utf8_target) {
1438                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1439                 goto do_exactf_utf8;
1440             }
1441             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1442             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1443             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1444
1445         case EXACTFU:
1446             if (UTF_PATTERN || utf8_target) {
1447                 utf8_fold_flags = 0;
1448                 goto do_exactf_utf8;
1449             }
1450
1451             /* Any 'ss' in the pattern should have been replaced by regcomp,
1452              * so we don't have to worry here about this single special case
1453              * in the Latin1 range */
1454             fold_array = PL_fold_latin1;
1455             folder = foldEQ_latin1;
1456             goto do_exactf_non_utf8;
1457
1458         case EXACTF:
1459             if (UTF_PATTERN || utf8_target) {
1460                 utf8_fold_flags = 0;
1461                 goto do_exactf_utf8;
1462             }
1463             fold_array = PL_fold;
1464             folder = foldEQ;
1465             goto do_exactf_non_utf8;
1466
1467         case EXACTFL:
1468             if (UTF_PATTERN || utf8_target) {
1469                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1470                 goto do_exactf_utf8;
1471             }
1472             fold_array = PL_fold_locale;
1473             folder = foldEQ_locale;
1474
1475             /* FALL THROUGH */
1476
1477         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1478
1479             /* The idea in the non-utf8 EXACTF* cases is to first find the
1480              * first character of the EXACTF* node and then, if necessary,
1481              * case-insensitively compare the full text of the node.  c1 is the
1482              * first character.  c2 is its fold.  This logic will not work for
1483              * Unicode semantics and the german sharp ss, which hence should
1484              * not be compiled into a node that gets here. */
1485             pat_string = STRING(c);
1486             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1487
1488             e = HOP3c(strend, -((I32)ln), s);
1489
1490             if (!reginfo && e < s) {
1491                 e = s;                  /* Due to minlen logic of intuit() */
1492             }
1493
1494             c1 = *pat_string;
1495             c2 = fold_array[c1];
1496             if (c1 == c2) { /* If char and fold are the same */
1497                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1498             }
1499             else {
1500                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1501             }
1502             break;
1503
1504         do_exactf_utf8:
1505
1506             /* If one of the operands is in utf8, we can't use the simpler
1507              * folding above, due to the fact that many different characters
1508              * can have the same fold, or portion of a fold, or different-
1509              * length fold */
1510             pat_string = STRING(c);
1511             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1512             pat_end = pat_string + ln;
1513             lnc = (UTF_PATTERN) /* length to match in characters */
1514                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1515                     : ln;
1516
1517             e = HOP3c(strend, -((I32)lnc), s);
1518
1519             if (!reginfo && e < s) {
1520                 e = s;                  /* Due to minlen logic of intuit() */
1521             }
1522
1523             while (s <= e) {
1524                 char *my_strend= (char *)strend;
1525                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1526                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1527                     && (!reginfo || regtry(reginfo, &s)) )
1528                 {
1529                     goto got_it;
1530                 }
1531                 s += UTF8SKIP(s);
1532             }
1533             break;
1534         case BOUNDL:
1535             PL_reg_flags |= RF_tainted;
1536             FBC_BOUND(isALNUM_LC,
1537                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1538                       isALNUM_LC_utf8((U8*)s));
1539             break;
1540         case NBOUNDL:
1541             PL_reg_flags |= RF_tainted;
1542             FBC_NBOUND(isALNUM_LC,
1543                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1544                        isALNUM_LC_utf8((U8*)s));
1545             break;
1546         case BOUND:
1547             FBC_BOUND(isWORDCHAR,
1548                       isALNUM_uni(tmp),
1549                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1550             break;
1551         case BOUNDA:
1552             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1553                              isWORDCHAR_A(tmp),
1554                              isWORDCHAR_A((U8*)s));
1555             break;
1556         case NBOUND:
1557             FBC_NBOUND(isWORDCHAR,
1558                        isALNUM_uni(tmp),
1559                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1560             break;
1561         case NBOUNDA:
1562             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1563                               isWORDCHAR_A(tmp),
1564                               isWORDCHAR_A((U8*)s));
1565             break;
1566         case BOUNDU:
1567             FBC_BOUND(isWORDCHAR_L1,
1568                       isALNUM_uni(tmp),
1569                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1570             break;
1571         case NBOUNDU:
1572             FBC_NBOUND(isWORDCHAR_L1,
1573                        isALNUM_uni(tmp),
1574                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1575             break;
1576         case ALNUML:
1577             REXEC_FBC_CSCAN_TAINT(
1578                 isALNUM_LC_utf8((U8*)s),
1579                 isALNUM_LC(*s)
1580             );
1581             break;
1582         case ALNUMU:
1583             REXEC_FBC_CSCAN_PRELOAD(
1584                 LOAD_UTF8_CHARCLASS_ALNUM(),
1585                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1586                 isWORDCHAR_L1((U8) *s)
1587             );
1588             break;
1589         case ALNUM:
1590             REXEC_FBC_CSCAN_PRELOAD(
1591                 LOAD_UTF8_CHARCLASS_ALNUM(),
1592                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1593                 isWORDCHAR((U8) *s)
1594             );
1595             break;
1596         case ALNUMA:
1597             /* Don't need to worry about utf8, as it can match only a single
1598              * byte invariant character */
1599             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1600             break;
1601         case NALNUMU:
1602             REXEC_FBC_CSCAN_PRELOAD(
1603                 LOAD_UTF8_CHARCLASS_ALNUM(),
1604                 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1605                 ! isWORDCHAR_L1((U8) *s)
1606             );
1607             break;
1608         case NALNUM:
1609             REXEC_FBC_CSCAN_PRELOAD(
1610                 LOAD_UTF8_CHARCLASS_ALNUM(),
1611                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1612                 ! isALNUM(*s)
1613             );
1614             break;
1615         case NALNUMA:
1616             REXEC_FBC_CSCAN(
1617                 !isWORDCHAR_A(*s),
1618                 !isWORDCHAR_A(*s)
1619             );
1620             break;
1621         case NALNUML:
1622             REXEC_FBC_CSCAN_TAINT(
1623                 !isALNUM_LC_utf8((U8*)s),
1624                 !isALNUM_LC(*s)
1625             );
1626             break;
1627         case SPACEU:
1628             REXEC_FBC_CSCAN_PRELOAD(
1629                 LOAD_UTF8_CHARCLASS_SPACE(),
1630                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1631                 isSPACE_L1((U8) *s)
1632             );
1633             break;
1634         case SPACE:
1635             REXEC_FBC_CSCAN_PRELOAD(
1636                 LOAD_UTF8_CHARCLASS_SPACE(),
1637                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1638                 isSPACE((U8) *s)
1639             );
1640             break;
1641         case SPACEA:
1642             /* Don't need to worry about utf8, as it can match only a single
1643              * byte invariant character */
1644             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1645             break;
1646         case SPACEL:
1647             REXEC_FBC_CSCAN_TAINT(
1648                 isSPACE_LC_utf8((U8*)s),
1649                 isSPACE_LC(*s)
1650             );
1651             break;
1652         case NSPACEU:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_SPACE(),
1655                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1656                 ! isSPACE_L1((U8) *s)
1657             );
1658             break;
1659         case NSPACE:
1660             REXEC_FBC_CSCAN_PRELOAD(
1661                 LOAD_UTF8_CHARCLASS_SPACE(),
1662                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1663                 ! isSPACE((U8) *s)
1664             );
1665             break;
1666         case NSPACEA:
1667             REXEC_FBC_CSCAN(
1668                 !isSPACE_A(*s),
1669                 !isSPACE_A(*s)
1670             );
1671             break;
1672         case NSPACEL:
1673             REXEC_FBC_CSCAN_TAINT(
1674                 !isSPACE_LC_utf8((U8*)s),
1675                 !isSPACE_LC(*s)
1676             );
1677             break;
1678         case DIGIT:
1679             REXEC_FBC_CSCAN_PRELOAD(
1680                 LOAD_UTF8_CHARCLASS_DIGIT(),
1681                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1682                 isDIGIT(*s)
1683             );
1684             break;
1685         case DIGITA:
1686             /* Don't need to worry about utf8, as it can match only a single
1687              * byte invariant character */
1688             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1689             break;
1690         case DIGITL:
1691             REXEC_FBC_CSCAN_TAINT(
1692                 isDIGIT_LC_utf8((U8*)s),
1693                 isDIGIT_LC(*s)
1694             );
1695             break;
1696         case NDIGIT:
1697             REXEC_FBC_CSCAN_PRELOAD(
1698                 LOAD_UTF8_CHARCLASS_DIGIT(),
1699                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1700                 !isDIGIT(*s)
1701             );
1702             break;
1703         case NDIGITA:
1704             REXEC_FBC_CSCAN(
1705                 !isDIGIT_A(*s),
1706                 !isDIGIT_A(*s)
1707             );
1708             break;
1709         case NDIGITL:
1710             REXEC_FBC_CSCAN_TAINT(
1711                 !isDIGIT_LC_utf8((U8*)s),
1712                 !isDIGIT_LC(*s)
1713             );
1714             break;
1715         case LNBREAK:
1716             REXEC_FBC_CSCAN(
1717                 is_LNBREAK_utf8(s),
1718                 is_LNBREAK_latin1(s)
1719             );
1720             break;
1721         case VERTWS:
1722             REXEC_FBC_CSCAN(
1723                 is_VERTWS_utf8(s),
1724                 is_VERTWS_latin1(s)
1725             );
1726             break;
1727         case NVERTWS:
1728             REXEC_FBC_CSCAN(
1729                 !is_VERTWS_utf8(s),
1730                 !is_VERTWS_latin1(s)
1731             );
1732             break;
1733         case HORIZWS:
1734             REXEC_FBC_CSCAN(
1735                 is_HORIZWS_utf8(s),
1736                 is_HORIZWS_latin1(s)
1737             );
1738             break;
1739         case NHORIZWS:
1740             REXEC_FBC_CSCAN(
1741                 !is_HORIZWS_utf8(s),
1742                 !is_HORIZWS_latin1(s)
1743             );
1744             break;
1745         case AHOCORASICKC:
1746         case AHOCORASICK:
1747             {
1748                 DECL_TRIE_TYPE(c);
1749                 /* what trie are we using right now */
1750                 reg_ac_data *aho
1751                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1752                 reg_trie_data *trie
1753                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1754                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1755
1756                 const char *last_start = strend - trie->minlen;
1757 #ifdef DEBUGGING
1758                 const char *real_start = s;
1759 #endif
1760                 STRLEN maxlen = trie->maxlen;
1761                 SV *sv_points;
1762                 U8 **points; /* map of where we were in the input string
1763                                 when reading a given char. For ASCII this
1764                                 is unnecessary overhead as the relationship
1765                                 is always 1:1, but for Unicode, especially
1766                                 case folded Unicode this is not true. */
1767                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1768                 U8 *bitmap=NULL;
1769
1770
1771                 GET_RE_DEBUG_FLAGS_DECL;
1772
1773                 /* We can't just allocate points here. We need to wrap it in
1774                  * an SV so it gets freed properly if there is a croak while
1775                  * running the match */
1776                 ENTER;
1777                 SAVETMPS;
1778                 sv_points=newSV(maxlen * sizeof(U8 *));
1779                 SvCUR_set(sv_points,
1780                     maxlen * sizeof(U8 *));
1781                 SvPOK_on(sv_points);
1782                 sv_2mortal(sv_points);
1783                 points=(U8**)SvPV_nolen(sv_points );
1784                 if ( trie_type != trie_utf8_fold
1785                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1786                 {
1787                     if (trie->bitmap)
1788                         bitmap=(U8*)trie->bitmap;
1789                     else
1790                         bitmap=(U8*)ANYOF_BITMAP(c);
1791                 }
1792                 /* this is the Aho-Corasick algorithm modified a touch
1793                    to include special handling for long "unknown char"
1794                    sequences. The basic idea being that we use AC as long
1795                    as we are dealing with a possible matching char, when
1796                    we encounter an unknown char (and we have not encountered
1797                    an accepting state) we scan forward until we find a legal
1798                    starting char.
1799                    AC matching is basically that of trie matching, except
1800                    that when we encounter a failing transition, we fall back
1801                    to the current states "fail state", and try the current char
1802                    again, a process we repeat until we reach the root state,
1803                    state 1, or a legal transition. If we fail on the root state
1804                    then we can either terminate if we have reached an accepting
1805                    state previously, or restart the entire process from the beginning
1806                    if we have not.
1807
1808                  */
1809                 while (s <= last_start) {
1810                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1811                     U8 *uc = (U8*)s;
1812                     U16 charid = 0;
1813                     U32 base = 1;
1814                     U32 state = 1;
1815                     UV uvc = 0;
1816                     STRLEN len = 0;
1817                     STRLEN foldlen = 0;
1818                     U8 *uscan = (U8*)NULL;
1819                     U8 *leftmost = NULL;
1820 #ifdef DEBUGGING
1821                     U32 accepted_word= 0;
1822 #endif
1823                     U32 pointpos = 0;
1824
1825                     while ( state && uc <= (U8*)strend ) {
1826                         int failed=0;
1827                         U32 word = aho->states[ state ].wordnum;
1828
1829                         if( state==1 ) {
1830                             if ( bitmap ) {
1831                                 DEBUG_TRIE_EXECUTE_r(
1832                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1833                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1834                                             (char *)uc, utf8_target );
1835                                         PerlIO_printf( Perl_debug_log,
1836                                             " Scanning for legal start char...\n");
1837                                     }
1838                                 );
1839                                 if (utf8_target) {
1840                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1841                                         uc += UTF8SKIP(uc);
1842                                     }
1843                                 } else {
1844                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1845                                         uc++;
1846                                     }
1847                                 }
1848                                 s= (char *)uc;
1849                             }
1850                             if (uc >(U8*)last_start) break;
1851                         }
1852
1853                         if ( word ) {
1854                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1855                             if (!leftmost || lpos < leftmost) {
1856                                 DEBUG_r(accepted_word=word);
1857                                 leftmost= lpos;
1858                             }
1859                             if (base==0) break;
1860
1861                         }
1862                         points[pointpos++ % maxlen]= uc;
1863                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1864                                              uscan, len, uvc, charid, foldlen,
1865                                              foldbuf, uniflags);
1866                         DEBUG_TRIE_EXECUTE_r({
1867                             dump_exec_pos( (char *)uc, c, strend, real_start,
1868                                 s,   utf8_target );
1869                             PerlIO_printf(Perl_debug_log,
1870                                 " Charid:%3u CP:%4"UVxf" ",
1871                                  charid, uvc);
1872                         });
1873
1874                         do {
1875 #ifdef DEBUGGING
1876                             word = aho->states[ state ].wordnum;
1877 #endif
1878                             base = aho->states[ state ].trans.base;
1879
1880                             DEBUG_TRIE_EXECUTE_r({
1881                                 if (failed)
1882                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1883                                         s,   utf8_target );
1884                                 PerlIO_printf( Perl_debug_log,
1885                                     "%sState: %4"UVxf", word=%"UVxf,
1886                                     failed ? " Fail transition to " : "",
1887                                     (UV)state, (UV)word);
1888                             });
1889                             if ( base ) {
1890                                 U32 tmp;
1891                                 I32 offset;
1892                                 if (charid &&
1893                                      ( ((offset = base + charid
1894                                         - 1 - trie->uniquecharcount)) >= 0)
1895                                      && ((U32)offset < trie->lasttrans)
1896                                      && trie->trans[offset].check == state
1897                                      && (tmp=trie->trans[offset].next))
1898                                 {
1899                                     DEBUG_TRIE_EXECUTE_r(
1900                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1901                                     state = tmp;
1902                                     break;
1903                                 }
1904                                 else {
1905                                     DEBUG_TRIE_EXECUTE_r(
1906                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1907                                     failed = 1;
1908                                     state = aho->fail[state];
1909                                 }
1910                             }
1911                             else {
1912                                 /* we must be accepting here */
1913                                 DEBUG_TRIE_EXECUTE_r(
1914                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1915                                 failed = 1;
1916                                 break;
1917                             }
1918                         } while(state);
1919                         uc += len;
1920                         if (failed) {
1921                             if (leftmost)
1922                                 break;
1923                             if (!state) state = 1;
1924                         }
1925                     }
1926                     if ( aho->states[ state ].wordnum ) {
1927                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1928                         if (!leftmost || lpos < leftmost) {
1929                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1930                             leftmost = lpos;
1931                         }
1932                     }
1933                     if (leftmost) {
1934                         s = (char*)leftmost;
1935                         DEBUG_TRIE_EXECUTE_r({
1936                             PerlIO_printf(
1937                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1938                                 (UV)accepted_word, (IV)(s - real_start)
1939                             );
1940                         });
1941                         if (!reginfo || regtry(reginfo, &s)) {
1942                             FREETMPS;
1943                             LEAVE;
1944                             goto got_it;
1945                         }
1946                         s = HOPc(s,1);
1947                         DEBUG_TRIE_EXECUTE_r({
1948                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1949                         });
1950                     } else {
1951                         DEBUG_TRIE_EXECUTE_r(
1952                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1953                         break;
1954                     }
1955                 }
1956                 FREETMPS;
1957                 LEAVE;
1958             }
1959             break;
1960         default:
1961             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1962             break;
1963         }
1964         return 0;
1965       got_it:
1966         return s;
1967 }
1968
1969
1970 /*
1971  - regexec_flags - match a regexp against a string
      *
      * Initialises the per-match globals (PL_bostr, PL_regeol, PL_reg_flags,
      * PL_reg_eval_set, PL_reg_maxiter) and a local regmatch_info, then picks
      * a search strategy from the compiled program's flags and known
      * substrings and calls regtry() -- directly, or through find_byclass() --
      * at each candidate start position.
      *
      * Returns 1 if a match was found (exit through the got_it label) and 0
      * otherwise (exit through the phooey label).  Croaks if the program or
      * stringarg is NULL, or if the program does not begin with REG_MAGIC.
      *
      * Flags examined here:
      *   REXEC_IGNOREPOS  - \G anchor is taken as startpos + prog->gofs
      *   REXEC_CHECKED    - skip the re_intuit_start() pre-check
      *   REXEC_SCREAM     - use screaminstr() instead of fbm_instr()/rninstr()
      *   REXEC_COPY_STR   - on success, keep a copy of the string in prog->subbeg
      *   REXEC_NOT_FIRST  - on success, leave prog->subbeg/sublen untouched
1972  */
1973 I32
1974 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1975               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1976 /* strend: pointer to null at end of string */
1977 /* strbeg: real beginning of string */
1978 /* minend: end of match must be >=minend after stringarg. */
1979 /* data: May be used for some additional optimizations.
1980          Currently it's only used, with a U32 cast, for transmitting
1981          the ganch offset when doing a /g match. This will change */
1982 /* nosave: For optimizations. (NOTE(review): the signature above has no
              parameter of this name; this line looks stale.) */
1983 {
1984     dVAR;
1985     struct regexp *const prog = (struct regexp *)SvANY(rx);
1986     /*register*/ char *s;
1987     register regnode *c;
1988     /*register*/ char *startpos = stringarg;
1989     I32 minlen;         /* must match at least this many chars */
1990     I32 dontbother = 0; /* how many characters not to try at end */
1991     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1992     I32 scream_pos = -1;                /* Internal iterator of scream. */
1993     char *scream_olds = NULL;
1994     const bool utf8_target = cBOOL(DO_UTF8(sv));
1995     I32 multiline;
1996     RXi_GET_DECL(prog,progi);
1997     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1998     regexp_paren_pair *swap = NULL;
1999     GET_RE_DEBUG_FLAGS_DECL;
2000
2001     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2002     PERL_UNUSED_ARG(data);
         /* NOTE: despite the line above, data is read further down, in the
            RXf_GPOS_SEEN handling, as a fallback source for reginfo.ganch. */
2003
2004     /* Be paranoid... */
2005     if (prog == NULL || startpos == NULL) {
2006         Perl_croak(aTHX_ "NULL regexp parameter");
2007         return 0;
2008     }
2009
2010     multiline = prog->extflags & RXf_PMf_MULTILINE;
2011     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2012
2013     RX_MATCH_UTF8_set(rx, utf8_target);
2014     DEBUG_EXECUTE_r(
2015         debug_start_match(rx, utf8_target, startpos, strend,
2016         "Matching");
2017     );
2018
2019     minlen = prog->minlen;
2020
         /* Give up at once if fewer bytes remain than the pattern's minimum
            length (relaxed by check_offset_min when that is negative). */
2021     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2022         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2023                               "String too short [regexec_flags]...\n"));
2024         goto phooey;
2025     }
2026
2027
2028     /* Check validity of program. */
2029     if (UCHARAT(progi->program) != REG_MAGIC) {
2030         Perl_croak(aTHX_ "corrupted regexp program");
2031     }
2032
2033     PL_reg_flags = 0;
2034     PL_reg_eval_set = 0;
2035     PL_reg_maxiter = 0;
2036
2037     if (RX_UTF8(rx))
2038         PL_reg_flags |= RF_utf8;
2039
2040     /* Mark beginning of line for ^ and lookbehind. */
2041     reginfo.bol = startpos; /* XXX not used ??? */
2042     PL_bostr  = strbeg;
2043     reginfo.sv = sv;
2044
2045     /* Mark end of line for $ (and such) */
2046     PL_regeol = strend;
2047
2048     /* see how far we have to get to not match where we matched before */
2049     reginfo.till = startpos+minend;
2050
2051     /* If there is a "must appear" string, look for it. */
2052     s = startpos;
2053
2054     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2055         MAGIC *mg;
2056         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2057             reginfo.ganch = startpos + prog->gofs;
2058             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2059               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2060         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2061                   && SvMAGIC(sv)
2062                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2063                   && mg->mg_len >= 0) {
2064             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2065             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2066                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2067
2068             if (prog->extflags & RXf_ANCH_GPOS) {
2069                 if (s > reginfo.ganch)
2070                     goto phooey;
2071                 s = reginfo.ganch - prog->gofs;
2072                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2073                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2074                 if (s < strbeg)
2075                     goto phooey;
2076             }
2077         }
2078         else if (data) {
2079             reginfo.ganch = strbeg + PTR2UV(data);
2080             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2081                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2082
2083         } else {                                /* pos() not defined */
2084             reginfo.ganch = strbeg;
2085             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2086                  "GPOS: reginfo.ganch = strbeg\n"));
2087         }
2088     }
2089     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2090         /* We have to be careful. If the previous successful match
2091            was from this regex we don't want a subsequent partially
2092            successful match to clobber the old results.
2093            So when we detect this possibility we add a swap buffer
2094            to the re, and switch the buffer each match. If we fail
2095            we switch it back, otherwise we leave it swapped.
2096         */
2097         swap = prog->offs;
2098         /* do we need a save destructor here for eval dies? */
2099         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2100     }
         /* Unless the caller says the check was already done (REXEC_CHECKED),
            run re_intuit_start() when a check substring exists; a NULL result
            is treated as "cannot match". */
2101     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2102         re_scream_pos_data d;
2103
2104         d.scream_olds = &scream_olds;
2105         d.scream_pos = &scream_pos;
2106         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2107         if (!s) {
2108             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2109             goto phooey;        /* not present */
2110         }
2111     }
2112
2113
2114
2115     /* Simplest case:  anchored match need be tried only once. */
2116     /*  [unless only anchor is BOL and multiline is set] */
2117     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2118         if (s == startpos && regtry(&reginfo, &startpos))
2119             goto got_it;
2120         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2121                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2122         {
2123             char *end;
2124
2125             if (minlen)
2126                 dontbother = minlen - 1;
2127             end = HOP3c(strend, -dontbother, strbeg) - 1;
2128             /* for multiline we only have to try after newlines */
2129             if (prog->check_substr || prog->check_utf8) {
2130                 /* because of the goto we can not easily reuse the macros for bifurcating the
2131                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2132                 if (utf8_target) {
2133                     if (s == startpos)
2134                         goto after_try_utf8;
2135                     while (1) {
2136                         if (regtry(&reginfo, &s)) {
2137                             goto got_it;
2138                         }
2139                       after_try_utf8:
2140                         if (s > end) {
2141                             goto phooey;
2142                         }
2143                         if (prog->extflags & RXf_USE_INTUIT) {
2144                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2145                             if (!s) {
2146                                 goto phooey;
2147                             }
2148                         }
2149                         else {
2150                             s += UTF8SKIP(s);
2151                         }
2152                     }
2153                 } /* end search for check string in unicode */
2154                 else {
2155                     if (s == startpos) {
2156                         goto after_try_latin;
2157                     }
2158                     while (1) {
2159                         if (regtry(&reginfo, &s)) {
2160                             goto got_it;
2161                         }
2162                       after_try_latin:
2163                         if (s > end) {
2164                             goto phooey;
2165                         }
2166                         if (prog->extflags & RXf_USE_INTUIT) {
2167                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2168                             if (!s) {
2169                                 goto phooey;
2170                             }
2171                         }
2172                         else {
2173                             s++;
2174                         }
2175                     }
2176                 } /* end search for check string in latin*/
2177             } /* end search for check string */
2178             else { /* search for newline */
2179                 if (s > startpos) {
2180                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2181                     s--;
2182                 }
2183                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2184                 while (s < end) {
2185                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2186                         if (regtry(&reginfo, &s))
2187                             goto got_it;
2188                     }
2189                 }
2190             } /* end search for newline */
2191         } /* end anchored/multiline check string search */
2192         goto phooey;
2193     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2194     {
2195         /* the warning about reginfo.ganch being used without initialization
2196            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2197            and we only enter this block when the same bit is set. */
2198         char *tmp_s = reginfo.ganch - prog->gofs;
2199
2200         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2201             goto got_it;
2202         goto phooey;
2203     }
2204
2205     /* Messy cases:  unanchored match. */
2206     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2207         /* we have /x+whatever/ */
2208         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2209         char ch;
2210 #ifdef DEBUGGING
2211         int did_match = 0;
2212 #endif
2213         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2214             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2215         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2216
2217         if (utf8_target) {
2218             REXEC_FBC_SCAN(
2219                 if (*s == ch) {
2220                     DEBUG_EXECUTE_r( did_match = 1 );
2221                     if (regtry(&reginfo, &s)) goto got_it;
2222                     s += UTF8SKIP(s);
2223                     while (s < strend && *s == ch)
2224                         s += UTF8SKIP(s);
2225                 }
2226             );
2227         }
2228         else {
2229             REXEC_FBC_SCAN(
2230                 if (*s == ch) {
2231                     DEBUG_EXECUTE_r( did_match = 1 );
2232                     if (regtry(&reginfo, &s)) goto got_it;
2233                     s++;
2234                     while (s < strend && *s == ch)
2235                         s++;
2236                 }
2237             );
2238         }
2239         DEBUG_EXECUTE_r(if (!did_match)
2240                 PerlIO_printf(Perl_debug_log,
2241                                   "Did not find anchored character...\n")
2242                );
2243     }
2244     else if (prog->anchored_substr != NULL
2245               || prog->anchored_utf8 != NULL
2246               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2247                   && prog->float_max_offset < strend - s)) {
             /* A substring that must appear is known (anchored, or floating
                with float_max_offset smaller than the remaining length).
                Each occurrence is located with screaminstr()/fbm_instr(), and
                regtry() is called only on start positions from back_max to
                back_min characters before the occurrence, skipping anything
                up to last1 (already covered on an earlier iteration). */
2248         SV *must;
2249         I32 back_max;
2250         I32 back_min;
2251         char *last;
2252         char *last1;            /* Last position checked before */
2253 #ifdef DEBUGGING
2254         int did_match = 0;
2255 #endif
2256         if (prog->anchored_substr || prog->anchored_utf8) {
2257             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2258                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2259             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2260             back_max = back_min = prog->anchored_offset;
2261         } else {
2262             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2263                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2264             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2265             back_max = prog->float_max_offset;
2266             back_min = prog->float_min_offset;
2267         }
2268
2269
2270         if (must == &PL_sv_undef)
2271             /* could not downgrade utf8 check substring, so must fail */
2272             goto phooey;
2273
2274         if (back_min<0) {
2275             last = strend;
2276         } else {
2277             last = HOP3c(strend,        /* Cannot start after this */
2278                   -(I32)(CHR_SVLEN(must)
2279                          - (SvTAIL(must) != 0) + back_min), strbeg);
2280         }
2281         if (s > PL_bostr)
2282             last1 = HOPc(s, -1);
2283         else
2284             last1 = s - 1;      /* bogus */
2285
2286         /* XXXX check_substr already used to find "s", can optimize if
2287            check_substr==must. */
2288         scream_pos = -1;
2289         dontbother = end_shift;
2290         strend = HOPc(strend, -dontbother);
2291         while ( (s <= last) &&
2292                 ((flags & REXEC_SCREAM)
2293                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2294                                     end_shift, &scream_pos, 0))
2295                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2296                                   (unsigned char*)strend, must,
2297                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2298             /* we may be pointing at the wrong string */
2299             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2300                 s = strbeg + (s - SvPVX_const(sv));
2301             DEBUG_EXECUTE_r( did_match = 1 );
2302             if (HOPc(s, -back_max) > last1) {
2303                 last1 = HOPc(s, -back_min);
2304                 s = HOPc(s, -back_max);
2305             }
2306             else {
2307                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2308
2309                 last1 = HOPc(s, -back_min);
2310                 s = t;
2311             }
2312             if (utf8_target) {
2313                 while (s <= last1) {
2314                     if (regtry(&reginfo, &s))
2315                         goto got_it;
2316                     s += UTF8SKIP(s);
2317                 }
2318             }
2319             else {
2320                 while (s <= last1) {
2321                     if (regtry(&reginfo, &s))
2322                         goto got_it;
2323                     s++;
2324                 }
2325             }
2326         }
2327         DEBUG_EXECUTE_r(if (!did_match) {
2328             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2329                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2330             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2331                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2332                                ? "anchored" : "floating"),
2333                 quoted, RE_SV_TAIL(must));
2334         });
2335         goto phooey;
2336     }
2337     else if ( (c = progi->regstclass) ) {
             /* The program has a start-class node: find_byclass() does the
                scanning and calls regtry() itself, returning non-NULL on a
                successful match. */
2338         if (minlen) {
2339             const OPCODE op = OP(progi->regstclass);
2340             /* don't bother with what can't match */
2341             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2342                 strend = HOPc(strend, -(minlen - 1));
2343         }
2344         DEBUG_EXECUTE_r({
2345             SV * const prop = sv_newmortal();
2346             regprop(prog, prop, c);
2347             {
2348                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2349                     s,strend-s,60);
2350                 PerlIO_printf(Perl_debug_log,
2351                     "Matching stclass %.*s against %s (%d bytes)\n",
2352                     (int)SvCUR(prop), SvPVX_const(prop),
2353                      quoted, (int)(strend - s));
2354             }
2355         });
2356         if (find_byclass(prog, c, s, strend, &reginfo))
2357             goto got_it;
2358         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2359     }
2360     else {
2361         dontbother = 0;
2362         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2363             /* Trim the end. */
2364             char *last;
2365             SV* float_real;
2366
2367             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2368                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2369             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2370
2371             if (flags & REXEC_SCREAM) {
2372                 last = screaminstr(sv, float_real, s - strbeg,
2373                                    end_shift, &scream_pos, 1); /* last one */
2374                 if (!last)
2375                     last = scream_olds; /* Only one occurrence. */
2376                 /* we may be pointing at the wrong string */
2377                 else if (RXp_MATCH_COPIED(prog))
2378                     s = strbeg + (s - SvPVX_const(sv));
2379             }
2380             else {
2381                 STRLEN len;
2382                 const char * const little = SvPV_const(float_real, len);
2383
2384                 if (SvTAIL(float_real)) {
2385                     if (memEQ(strend - len + 1, little, len - 1))
2386                         last = strend - len + 1;
2387                     else if (!multiline)
2388                         last = memEQ(strend - len, little, len)
2389                             ? strend - len : NULL;
2390                     else
2391                         goto find_last;
2392                 } else {
2393                   find_last:
2394                     if (len)
2395                         last = rninstr(s, strend, little, little + len);
2396                     else
2397                         last = strend;  /* matching "$" */
2398                 }
2399             }
2400             if (last == NULL) {
2401                 DEBUG_EXECUTE_r(
2402                     PerlIO_printf(Perl_debug_log,
2403                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2404                         PL_colors[4], PL_colors[5]));
2405                 goto phooey; /* Should not happen! */
2406             }
2407             dontbother = strend - last + prog->float_min_offset;
2408         }
2409         if (minlen && (dontbother < minlen))
2410             dontbother = minlen - 1;
2411         strend -= dontbother;              /* this one's always in bytes! */
2412         /* We don't know much -- general case. */
2413         if (utf8_target) {
2414             for (;;) {
2415                 if (regtry(&reginfo, &s))
2416                     goto got_it;
2417                 if (s >= strend)
2418                     break;
2419                 s += UTF8SKIP(s);
2420             };
2421         }
2422         else {
2423             do {
2424                 if (regtry(&reginfo, &s))
2425                     goto got_it;
2426             } while (s++ < strend);
2427         }
2428     }
2429
2430     /* Failure. */
2431     goto phooey;
2432
2433 got_it:
         /* Match found: the offsets saved in swap (if any) are no longer
            needed, so the freshly filled prog->offs stays in place. */
2434     Safefree(swap);
2435     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2436
2437     if (PL_reg_eval_set)
2438         restore_pos(aTHX_ prog);
2439     if (RXp_PAREN_NAMES(prog))
2440         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2441
2442     /* make sure $`, $&, $', and $digit will work later */
2443     if ( !(flags & REXEC_NOT_FIRST) ) {
2444         RX_MATCH_COPY_FREE(rx);
2445         if (flags & REXEC_COPY_STR) {
2446             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2447 #ifdef PERL_OLD_COPY_ON_WRITE
2448             if ((SvIsCOW(sv)
2449                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2450                 if (DEBUG_C_TEST) {
2451                     PerlIO_printf(Perl_debug_log,
2452                                   "Copy on write: regexp capture, type %d\n",
2453                                   (int) SvTYPE(sv));
2454                 }
2455                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2456                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2457                 assert (SvPOKp(prog->saved_copy));
2458             } else
2459 #endif
2460             {
2461                 RX_MATCH_COPIED_on(rx);
2462                 s = savepvn(strbeg, i);
2463                 prog->subbeg = s;
2464             }
2465             prog->sublen = i;
2466         }
2467         else {
2468             prog->subbeg = strbeg;
2469             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2470         }
2471     }
2472
2473     return 1;
2474
2475 phooey:
2476     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2477                           PL_colors[4], PL_colors[5]));
2478     if (PL_reg_eval_set)
2479         restore_pos(aTHX_ prog);
2480     if (swap) {
2481         /* we failed :-( roll it back */
2482         Safefree(prog->offs);
2483         prog->offs = swap;
2484     }
2485
2486     return 0;
2487 }
2488
2489
2490 /*
2491  - regtry - try match at specific point
      *
      * Attempts to match reginfo->prog starting exactly at *startpos by
      * running regmatch() on the node after the program's first one.
      *
      * Returns 1 on success, after storing the match end in PL_regoffs[0].end.
      * Returns 0 on failure; in that case, if reginfo->cutpoint is non-NULL
      * once regmatch() has returned, *startpos is overwritten with it.
      *
      * The first call per match for a program with RXf_EVAL_SEEN (detected
      * through PL_reg_eval_set being clear) also performs the one-off setup
      * in the large block below: $_ aliasing, pos() magic and PL_curpm.
2492  */
2493 STATIC I32                      /* 0 failure, 1 success */
2494 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2495 {
2496     dVAR;
2497     CHECKPOINT lastcp;
2498     REGEXP *const rx = reginfo->prog;
2499     regexp *const prog = (struct regexp *)SvANY(rx);
2500     RXi_GET_DECL(prog,progi);
2501     GET_RE_DEBUG_FLAGS_DECL;
2502
2503     PERL_ARGS_ASSERT_REGTRY;
2504
2505     reginfo->cutpoint=NULL;
2506
2507     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2508         MAGIC *mg;
2509
2510         PL_reg_eval_set = RS_init;
2511         DEBUG_EXECUTE_r(DEBUG_s(
2512             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2513                           (IV)(PL_stack_sp - PL_stack_base));
2514             ));
2515         SAVESTACK_CXPOS();
2516         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2517         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2518         SAVETMPS;
2519         /* Apparently this is not needed, judging by wantarray. */
2520         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2521            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2522
2523         if (reginfo->sv) {
2524             /* Make $_ available to executed code. */
2525             if (reginfo->sv != DEFSV) {
2526                 SAVE_DEFSV;
2527                 DEFSV_set(reginfo->sv);
2528             }
2529
                 /* Reuse the target's existing regex_global magic if it has
                    one; otherwise attach it with mg_len == -1. */
2530             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2531                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2532                 /* prepare for quick setting of pos */
2533 #ifdef PERL_OLD_COPY_ON_WRITE
2534                 if (SvIsCOW(reginfo->sv))
2535                     sv_force_normal_flags(reginfo->sv, 0);
2536 #endif
2537                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2538                                  &PL_vtbl_mglob, NULL, 0);
2539                 mg->mg_len = -1;
2540             }
                 /* Remember the magic and its current mg_len, and register
                    restore_pos as a save-stack destructor. */
2541             PL_reg_magic    = mg;
2542             PL_reg_oldpos   = mg->mg_len;
2543             SAVEDESTRUCTOR_X(restore_pos, prog);
2544         }
2545         if (!PL_reg_curpm) {
2546             Newxz(PL_reg_curpm, 1, PMOP);
2547 #ifdef USE_ITHREADS
2548             {
2549                 SV* const repointer = &PL_sv_undef;
2550                 /* this regexp is also owned by the new PL_reg_curpm, which
2551                    will try to free it.  */
2552                 av_push(PL_regex_padav, repointer);
2553                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2554                 PL_regex_pad = AvARRAY(PL_regex_padav);
2555             }
2556 #endif
2557         }
2558 #ifdef USE_ITHREADS
2559         /* It seems that non-ithreads works both with and without this code.
2560            So for efficiency reasons it seems best not to have the code
2561            compiled when it is not needed.  */
2562         /* This is safe against NULLs: */
2563         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2564         /* PM_reg_curpm owns a reference to this regexp.  */
2565         (void)ReREFCNT_inc(rx);
2566 #endif
             /* Point PL_curpm at the private PMOP for this regexp, keeping
                the previous value in PL_reg_oldcurpm. */
2567         PM_SETRE(PL_reg_curpm, rx);
2568         PL_reg_oldcurpm = PL_curpm;
2569         PL_curpm = PL_reg_curpm;
2570         if (RXp_MATCH_COPIED(prog)) {
2571             /*  Here is a serious problem: we cannot rewrite subbeg,
2572                 since it may be needed if this match fails.  Thus
2573                 $` inside (?{}) could fail... */
2574             PL_reg_oldsaved = prog->subbeg;
2575             PL_reg_oldsavedlen = prog->sublen;
2576 #ifdef PERL_OLD_COPY_ON_WRITE
2577             PL_nrs = prog->saved_copy;
2578 #endif
2579             RXp_MATCH_COPIED_off(prog);
2580         }
2581         else
2582             PL_reg_oldsaved = NULL;
2583         prog->subbeg = PL_bostr;
2584         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2585     }
         /* Per-attempt state: record the start offset, reset the paren
            bookkeeping, and publish the pointers that regmatch() works
            through (PL_reginput, PL_reglastparen, PL_regoffs, ...). */
2586     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2587     prog->offs[0].start = *startpos - PL_bostr;
2588     PL_reginput = *startpos;
2589     PL_reglastparen = &prog->lastparen;
2590     PL_reglastcloseparen = &prog->lastcloseparen;
2591     prog->lastparen = 0;
2592     prog->lastcloseparen = 0;
2593     PL_regsize = 0;
2594     PL_regoffs = prog->offs;
         /* Grow PL_reg_start_tmp when it has no more than nparens slots. */
2595     if (PL_reg_start_tmpl <= prog->nparens) {
2596         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2597         if(PL_reg_start_tmp)
2598             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2599         else
2600             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2601     }
2602
2603     /* XXXX What this code is doing here?!!!  There should be no need
2604        to do this again and again, PL_reglastparen should take care of
2605        this!  --ilya*/
2606
2607     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2608      * Actually, the code in regcppop() (which Ilya may be meaning by
2609      * PL_reglastparen), is not needed at all by the test suite
2610      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2611      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2612      * Meanwhile, this code *is* needed for the
2613      * above-mentioned test suite tests to succeed.  The common theme
2614      * on those tests seems to be returning null fields from matches.
2615      * --jhi updated by dapm */
2616 #if 1
2617     if (prog->nparens) {
             /* pp is pre-incremented, and *PL_reglastparen was set to 0 just
                above, so this marks offs[1] .. offs[nparens] as unset (-1). */
2618         regexp_paren_pair *pp = PL_regoffs;
2619         register I32 i;
2620         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2621             ++pp;
2622             pp->start = -1;
2623             pp->end = -1;
2624         }
2625     }
2626 #endif
2627     REGCP_SET(lastcp);
2628     if (regmatch(reginfo, progi->program + 1)) {
2629         PL_regoffs[0].end = PL_reginput - PL_bostr;
2630         return 1;
2631     }
         /* Failed: hand any cutpoint back to the caller through *startpos,
            then unwind to the checkpoint taken before regmatch(). */
2632     if (reginfo->cutpoint)
2633         *startpos= reginfo->cutpoint;
2634     REGCP_UNWIND(lastcp);
2635     return 0;
2636 }
2637
2638
     /* Shorthand jumps to the labels "yes", "no" and "no_silent"; those labels
        must exist in whichever function uses these macros. */
2639 #define sayYES goto yes
2640 #define sayNO goto no
2641 #define sayNO_SILENT goto no_silent
2642
2643 /* we don't use STMT_START/END here because it leads to
2644    "unreachable code" warnings, which are bogus, but distracting. */
     /* CACHEsayNO: when ST.cache_mask is non-zero, OR it into
        PL_reg_poscache[ST.cache_offset], then sayNO.  Because the expansion is
        an unwrapped "if" followed by a goto, only use it where a
        multi-statement expansion is safe (e.g. inside braces). */
2645 #define CACHEsayNO \
2646     if (ST.cache_mask) \
2647        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2648     sayNO
2649
2650 /* this is used to determine how far from the left messages like
2651    'failed...' are printed. It should be set such that messages
2652    are inline with the regop output that created them.
2653 */
2654 #define REPORT_CODE_OFF 32
2655
2656
2657 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2658 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2659
     /* Address of the first / last element of a slab's states[] array, which
        is indexed 0 .. PERL_REGMATCH_SLAB_SLOTS-1. */
2660 #define SLAB_FIRST(s) (&(s)->states[0])
2661 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2662
2663 /* Step PL_regmatch_slab forward to the next slab in the chain, allocating and
2664  * linking a new one when the chain ends here; returns that slab's first slot. */
2665 STATIC regmatch_state *
2666 S_push_slab(pTHX)
2667 {
2668 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2669     dMY_CXT;
2670 #endif
2671     regmatch_slab *slab = PL_regmatch_slab->next;
2672     if (slab == NULL) {         /* chain exhausted: append a fresh slab */
2673         Newx(slab, 1, regmatch_slab);
2674         slab->next = NULL;
2675         slab->prev = PL_regmatch_slab;
2676         PL_regmatch_slab->next = slab;
2677     }
2678     PL_regmatch_slab = slab;
2679     return &slab->states[0];
2680 }
2681
2682
2683 /* PUSH_STATE_GOTO(state, node): push a new state then goto it.
      *
      * Sets scan to node, records state as the resume value of the current
      * backtrack state (st->resume_state) and jumps to the push_state label.
      * scan, st and the label are taken from the expansion site.
      *
      * The expansion is three separate statements and already ends in a
      * semicolon, so only use it as a complete statement inside braces,
      * never as the unbraced body of an if/else/loop. */
2684
2685 #define PUSH_STATE_GOTO(state, node) \
2686     scan = node; \
2687     st->resume_state = state; \
2688     goto push_state;
2689
2690 /* PUSH_YES_STATE_GOTO(state, node): push a new state with success
      * backtracking, then goto it.
      *
      * Same as PUSH_STATE_GOTO except that it jumps to the push_yes_state
      * label; the same caveat about the multi-statement expansion applies. */
2691
2692 #define PUSH_YES_STATE_GOTO(state, node) \
2693     scan = node; \
2694     st->resume_state = state; \
2695     goto push_yes_state;
2696
2697
2698
2699 /*
2700
2701 regmatch() - main matching routine
2702
2703 This is basically one big switch statement in a loop. We execute an op,
2704 set 'next' to point to the next op, and continue. If we come to a point which
2705 we may need to backtrack to on failure such as (A|B|C), we push a
2706 backtrack state onto the backtrack stack. On failure, we pop the top
2707 state, and re-enter the loop at the state indicated. If there are no more
2708 states to pop, we return failure.
2709
2710 Sometimes we also need to backtrack on success; for example /A+/, where
2711 after successfully matching one A, we need to go back and try to
2712 match another one; similarly for lookahead assertions: if the assertion
2713 completes successfully, we backtrack to the state just before the assertion
2714 and then carry on.  In these cases, the pushed state is marked as
2715 'backtrack on success too'. This marking is in fact done by a chain of
2716 pointers, each pointing to the previous 'yes' state. On success, we pop to
2717 the nearest yes state, discarding any intermediate failure-only states.
2718 Sometimes a yes state is pushed just to force some cleanup code to be
2719 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2720 it to free the inner regex.
2721
2722 Note that failure backtracking rewinds the cursor position, while
2723 success backtracking leaves it alone.
2724
2725 A pattern is complete when the END op is executed, while a subpattern
2726 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2727 ops trigger the "pop to last yes state if any, otherwise return true"
2728 behaviour.
2729
2730 A common convention in this function is to use A and B to refer to the two
2731 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2732 the subpattern to be matched possibly multiple times, while B is the entire
2733 rest of the pattern. Variable and state names reflect this convention.
2734
2735 The states in the main switch are the union of ops and failure/success of
2736 substates associated with that op.  For example, IFMATCH is the op
2737 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2738 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2739 successfully matched A and IFMATCH_A_fail is a state saying that we have
2740 just failed to match A. Resume states always come in pairs. The backtrack
2741 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2742 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2743 on success or failure.
2744
2745 The struct that holds a backtracking state is actually a big union, with
2746 one variant for each major type of op. The variable st points to the
2747 top-most backtrack struct. To make the code clearer, within each
2748 block of code we #define ST to alias the relevant union.
2749
2750 Here's a concrete example of a (vastly oversimplified) IFMATCH
2751 implementation:
2752
2753     switch (state) {
2754     ....
2755
2756 #define ST st->u.ifmatch
2757
2758     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2759         ST.foo = ...; // some state we wish to save
2760         ...
2761         // push a yes backtrack state with a resume value of
2762         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2763         // first node of A:
2764         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2765         // NOTREACHED
2766
2767     case IFMATCH_A: // we have successfully executed A; now continue with B
2768         next = B;
2769         bar = ST.foo; // do something with the preserved value
2770         break;
2771
2772     case IFMATCH_A_fail: // A failed, so the assertion failed
2773         ...;   // do some housekeeping, then ...
2774         sayNO; // propagate the failure
2775
2776 #undef ST
2777
2778     ...
2779     }
2780
2781 For any old-timers reading this who are familiar with the old recursive
2782 approach, the code above is equivalent to:
2783
2784     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2785     {
2786         int foo = ...
2787         ...
2788         if (regmatch(A)) {
2789             next = B;
2790             bar = foo;
2791             break;
2792         }
2793         ...;   // do some housekeeping, then ...
2794         sayNO; // propagate the failure
2795     }
2796
2797 The topmost backtrack state, pointed to by st, is usually free. If you
2798 want to claim it, populate any ST.foo fields in it with values you wish to
2799 save, then do one of
2800
2801         PUSH_STATE_GOTO(resume_state, node);
2802         PUSH_YES_STATE_GOTO(resume_state, node);
2803
2804 which sets that backtrack state's resume value to 'resume_state', pushes a
2805 new free entry to the top of the backtrack stack, then goes to 'node'.
2806 On backtracking, the free slot is popped, and the saved state becomes the
2807 new free state. An ST.foo field in this new top state can be temporarily
2808 accessed to retrieve values, but once the main loop is re-entered, it
2809 becomes available for reuse.
2810
2811 Note that the depth of the backtrack stack constantly increases during the
2812 left-to-right execution of the pattern, rather than going up and down with
2813 the pattern nesting. For example the stack is at its maximum at Z at the
2814 end of the pattern, rather than at X in the following:
2815
2816     /(((X)+)+)+....(Y)+....Z/
2817
2818 The only exceptions to this are lookahead/behind assertions and the cut,
2819 (?>A), which pop all the backtrack states associated with A before
2820 continuing.
2821
2822 Backtrack state structs are allocated in slabs of about 4K in size.
2823 PL_regmatch_state and st always point to the currently active state,
2824 and PL_regmatch_slab points to the slab currently containing
2825 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2826 allocated, and is never freed until interpreter destruction. When the slab
2827 is full, a new one is allocated and chained to the end. At exit from
2828 regmatch(), slabs allocated since entry are freed.
2829
2830 */
2831
2832 /* DEBUG_STATE_pp(pp): debug trace helper, active only inside
      * DEBUG_STATE_r.  Dumps the current exec position with DUMP_EXEC_POS and
      * then prints one line, indented by depth*2 columns, made of the prefix
      * pp, the PL_reg_name[] entry for st->resume_state, and a bracketed tag:
      * [Y] if st is the current yes_state, [M] if it is the current
      * mark_state, [YM] if it is both, nothing otherwise.
      *
      * pp has to be a string literal, since it is pasted into the middle of
      * the format string.  locinput, scan, utf8_target, depth, st, yes_state
      * and mark_state are all taken from the expansion site.  The expansion
      * already ends in a semicolon. */
2833 #define DEBUG_STATE_pp(pp)                                  \
2834     DEBUG_STATE_r({                                         \
2835         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2836         PerlIO_printf(Perl_debug_log,                       \
2837             "    %*s"pp" %s%s%s%s%s\n",                     \
2838             depth*2, "",                                    \
2839             PL_reg_name[st->resume_state],                     \
2840             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2841             ((st==yes_state) ? "Y" : ""),                   \
2842             ((st==mark_state) ? "M" : ""),                  \
2843             ((st==yes_state||st==mark_state) ? "]" : "")    \
2844         );                                                  \
2845     });
2846
2847 /* REG_NODE_NUM(x): offset of x from prog (pointer difference, cast to
      * int), or -1 if x is NULL.  Relies on a variable named prog being in
      * scope at the expansion site. */
2848 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2849
2850 #ifdef DEBUGGING
2851 /* S_debug_start_match(): print a debugging banner of the form
      *
      *     <blurb> REx <pattern> against <string>
      *
      * to Perl_debug_log, followed by a "UTF-8 ..." line when the pattern
      * and/or the target string is UTF-8.
      *
      *   prog         the regexp whose precompiled text is shown
      *   utf8_target  true if the target string is UTF-8
      *   start, end   the span of the target string to show (end - start
      *                bytes are handed to the quoting macro)
      *   blurb        free text printed in front of "REx"
      *
      * Both strings are rendered through RE_PV_QUOTED_DECL with a limit
      * argument of 60 (presumably a cap on the displayed length; see that
      * macro for the exact meaning).
      */
2852 STATIC void
2853 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2854     const char *start, const char *end, const char *blurb)
2855 {
2856     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2857
2858     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2859
          /* PL_colors[] is used below, so set the colours up on first use */
2860     if (!PL_colorset)
2861             reginitcolors();
2862     {
              /* s0: the pattern text, s1: the target string */
2863         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2864             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2865
2866         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2867             start, end - start, 60);
2868
2869         PerlIO_printf(Perl_debug_log,
2870             "%s%s REx%s %s against %s\n",
2871                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2872
              /* say which of the two (or both) is UTF-8 */
2873         if (utf8_target||utf8_pat)
2874             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2875                 utf8_pat ? "pattern" : "",
2876                 utf8_pat && utf8_target ? " and " : "",
2877                 utf8_target ? "string" : ""
2878             );
2879     }
2880 }
2881 /* S_dump_exec_pos(): print the left-hand part of an execution trace
      * line to Perl_debug_log: the offset of locinput from loc_bostr, then a
      * short window of the target string around locinput enclosed in <...>,
      * padding, and a closing "|".  No newline is printed.
      *
      *   locinput          current position in the target string
      *   scan              current regnode; only consulted to see whether it
      *                     is CANY, in which case the text is not treated as
      *                     UTF-8 for display
      *   loc_regeol        end of the target string
      *   loc_bostr         start of the target string
      *   loc_reg_starttry  where the current match attempt started
      *   utf8_target       true if the target string is UTF-8
      *
      * The window is split into three pieces: the text before
      * loc_reg_starttry, the text between loc_reg_starttry and locinput, and
      * the text from locinput onwards.  Each piece is rendered with a
      * different pair of PL_colors[] entries; when no colours are set, the
      * current position is marked with a literal "> <" instead.
      */
2882 STATIC void
2883 S_dump_exec_pos(pTHX_ const char *locinput,
2884                       const regnode *scan,
2885                       const char *loc_regeol,
2886                       const char *loc_bostr,
2887                       const char *loc_reg_starttry,
2888                       const bool utf8_target)
2889 {
2890     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2891     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
          /* l: number of bytes after locinput to show, at most taill for now */
2892     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2893     /* The part of the string before starttry has one color
2894        (pref0_len chars), between starttry and current
2895        position another one (pref_len - pref0_len chars),
2896        after the current position the third one.
2897        We assume that pref0_len <= pref_len, otherwise we
2898        decrease pref0_len.  */
          /* pref_len: number of bytes before locinput to show; prefix and
             tail together share a budget of 5 + taill bytes */
2899     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2900         ? (5 + taill) - l : locinput - loc_bostr;
2901     int pref0_len;
2902
2903     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2904
          /* lengthen the prefix until it no longer starts on a UTF-8
             continuation byte.
             NOTE(review): there is no check against loc_bostr here, so this
             appears to rely on the target being well-formed UTF-8 -- confirm */
2905     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2906         pref_len++;
2907     pref0_len = pref_len  - (locinput - loc_reg_starttry);
          /* if the prefix did not use up its share of the budget and there is
             more text after locinput, give the spare room to the tail */
2908     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2909         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2910               ? (5 + taill) - pref_len : loc_regeol - locinput);
          /* shorten the tail while the byte just past it is a UTF-8
             continuation byte */
2911     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2912         l--;
          /* clamp pref0_len into the range 0..pref_len */
2913     if (pref0_len < 0)
2914         pref0_len = 0;
2915     if (pref0_len > pref_len)
2916         pref0_len = pref_len;
2917     {
2918         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2919
              /* s0: text before starttry (colours 4/5) */
2920         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2921             (locinput - pref_len),pref0_len, 60, 4, 5);
2922
              /* s1: text between starttry and locinput (colours 2/3) */
2923         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2924                     (locinput - pref_len + pref0_len),
2925                     pref_len - pref0_len, 60, 2, 3);
2926
              /* s2: text from locinput onwards (colours 0/1) */
2927         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2928                     locinput, loc_regeol - locinput, 10, 0, 1);
2929
              /* pad with spaces so that short windows still fill 19 columns */
2930         const STRLEN tlen=len0+len1+len2;
2931         PerlIO_printf(Perl_debug_log,
2932                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2933                     (IV)(locinput - loc_bostr),
2934                     len0, s0,
2935                     len1, s1,
2936                     (docolor ? "" : "> <"),
2937                     len2, s2,
2938                     (int)(tlen > 19 ? 0 :  19 - tlen),
2939                     "");
2940     }
2941 }
2942
2943 #endif
2944
2945 /* reg_check_named_buff_matched()
2946  * Checks to see if a named buffer has matched. The data array of
2947  * buffer numbers corresponding to the buffer is expected to reside
2948  * in the regexp->data->data array in the slot stored in the ARG() of
2949  * node involved. Note that this routine doesn't actually care about the
2950  * name, that information is not preserved from compilation to execution.
2951  * Returns the index of the leftmost defined buffer with the given name
2952  * or 0 if none of the buffers matched.
      *
      * The slot holds an SV: its string buffer (SvPVX) is read as an array
      * of I32 buffer numbers and its IV (SvIVX) as the number of entries.
      * The entries are tried in array order; a buffer counts as matched when
      * its number does not exceed *PL_reglastparen and its PL_regoffs[].end
      * is not -1.
2953  */
2954 STATIC I32
2955 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2956 {
2957     I32 n;
2958     RXi_GET_DECL(rex,rexi);
2959     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2960     I32 *nums=(I32*)SvPVX(sv_dat);
2961
2962     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2963
2964     for ( n=0; n<SvIVX(sv_dat); n++ ) {
              /* the first candidate that has actually been set wins */
2965         if ((I32)*PL_reglastparen >= nums[n] &&
2966             PL_regoffs[nums[n]].end != -1)
2967         {
2968             return nums[n];
2969         }
2970     }
2971     return 0;
2972 }
2973
2974
2975 /* S_clear_backtrack_stack(): free all slabs above current one  - called
      * during LEAVE_SCOPE.
      *
      * Detaches the chain hanging off PL_regmatch_slab->next and releases
      * every slab in it with Safefree(); the current slab itself is kept.
      * Does nothing if no slab follows the current one.
      *
      * The signature is that of a save-stack destructor (regmatch()
      * registers it with SAVEDESTRUCTOR_X and a NULL argument); p is
      * ignored. */
2976
2977 STATIC void
2978 S_clear_backtrack_stack(pTHX_ void *p)
2979 {
2980     regmatch_slab *s = PL_regmatch_slab->next;
2981     PERL_UNUSED_ARG(p);
2982
2983     if (!s)
2984         return;
          /* cut the chain first, then walk and free the detached part */
2985     PL_regmatch_slab->next = NULL;
2986     while (s) {
2987         regmatch_slab * const osl = s;
2988         s = s->next;
2989         Safefree(osl);
2990     }
2991 }
2992
2993 /* SETREX(Re1,Re2): assign Re2 to Re1; when PL_reg_eval_set is true, first
      * also install Re2 on PL_reg_curpm via PM_SETRE().
      *
      * Caveats: Re2 is evaluated twice when PL_reg_eval_set is true, and the
      * expansion is an unbraced if followed by a separate assignment with no
      * trailing semicolon, so use it only as a complete statement inside
      * braces. */
2994 #define SETREX(Re1,Re2) \
2995     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2996     Re1 = (Re2)
2997
2998 STATIC I32                      /* 0 failure, 1 success */
2999 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3000 {
3001 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3002     dMY_CXT;
3003 #endif
3004     dVAR;
3005     register const bool utf8_target = PL_reg_match_utf8;
3006     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3007     REGEXP *rex_sv = reginfo->prog;
3008     regexp *rex = (struct regexp *)SvANY(rex_sv);
3009     RXi_GET_DECL(rex,rexi);
3010     I32 oldsave;
3011     /* the current state. This is a cached copy of PL_regmatch_state */
3012     register regmatch_state *st;
3013     /* cache heavily used fields of st in registers */
3014     register regnode *scan;
3015     register regnode *next;
3016     register U32 n = 0; /* general value; init to avoid compiler warning */
3017     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3018     register char *locinput = PL_reginput;
3019     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3020
3021     bool result = 0;        /* return value of S_regmatch */
3022     int depth = 0;          /* depth of backtrack stack */
3023     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3024     const U32 max_nochange_depth =
3025         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3026         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3027     regmatch_state *yes_state = NULL; /* state to pop to on success of
3028                                                             subpattern */
3029     /* mark_state piggy backs on the yes_state logic so that when we unwind
3030        the stack on success we can update the mark_state as we go */
3031     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3032     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3033     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3034     U32 state_num;
3035     bool no_final = 0;      /* prevent failure from backtracking? */
3036     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3037     char *startpoint = PL_reginput;
3038     SV *popmark = NULL;     /* are we looking for a mark? */
3039     SV *sv_commit = NULL;   /* last mark name seen in failure */
3040     SV *sv_yes_mark = NULL; /* last mark name we have seen
3041                                during a successful match */
3042     U32 lastopen = 0;       /* last open we saw */
3043     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3044     SV* const oreplsv = GvSV(PL_replgv);
3045     /* these three flags are set by various ops to signal information to
3046      * the very next op. They have a useful lifetime of exactly one loop
3047      * iteration, and are not preserved or restored by state pushes/pops
3048      */
3049     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3050     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3051     int logical = 0;        /* the following EVAL is:
3052                                 0: (?{...})
3053                                 1: (?(?{...})X|Y)
3054                                 2: (??{...})
3055                                or the following IFMATCH/UNLESSM is:
3056                                 false: plain (?=foo)
3057                                 true:  used as a condition: (?(?=foo))
3058                             */
3059 #ifdef DEBUGGING
3060     GET_RE_DEBUG_FLAGS_DECL;
3061 #endif
3062
3063     PERL_ARGS_ASSERT_REGMATCH;
3064
3065     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3066             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3067     }));
3068     /* on first ever call to regmatch, allocate first slab */
3069     if (!PL_regmatch_slab) {
3070         Newx(PL_regmatch_slab, 1, regmatch_slab);
3071         PL_regmatch_slab->prev = NULL;
3072         PL_regmatch_slab->next = NULL;
3073         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3074     }
3075
3076     oldsave = PL_savestack_ix;
3077     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3078     SAVEVPTR(PL_regmatch_slab);
3079     SAVEVPTR(PL_regmatch_state);
3080
3081     /* grab next free state slot */
3082     st = ++PL_regmatch_state;
3083     if (st >  SLAB_LAST(PL_regmatch_slab))
3084         st = PL_regmatch_state = S_push_slab(aTHX);
3085
3086     /* Note that nextchr is a byte even in UTF */
3087     nextchr = UCHARAT(locinput);
3088     scan = prog;
3089     while (scan != NULL) {
3090
3091         DEBUG_EXECUTE_r( {
3092             SV * const prop = sv_newmortal();
3093             regnode *rnext=regnext(scan);
3094             DUMP_EXEC_POS( locinput, scan, utf8_target );
3095             regprop(rex, prop, scan);
3096
3097             PerlIO_printf(Perl_debug_log,
3098                     "%3"IVdf":%*s%s(%"IVdf")\n",
3099                     (IV)(scan - rexi->program), depth*2, "",
3100                     SvPVX_const(prop),
3101                     (PL_regkind[OP(scan)] == END || !rnext) ?
3102                         0 : (IV)(rnext - rexi->program));
3103         });
3104
3105         next = scan + NEXT_OFF(scan);
3106         if (next == scan)
3107             next = NULL;
3108         state_num = OP(scan);
3109
3110       reenter_switch:
3111
3112         assert(PL_reglastparen == &rex->lastparen);
3113         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3114         assert(PL_regoffs == rex->offs);
3115
3116         switch (state_num) {
3117         case BOL:
3118             if (locinput == PL_bostr)
3119             {
3120                 /* reginfo->till = reginfo->bol; */
3121                 break;
3122             }
3123             sayNO;
3124         case MBOL:
3125             if (locinput == PL_bostr ||
3126                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3127             {
3128                 break;
3129             }
3130             sayNO;
3131         case SBOL:
3132             if (locinput == PL_bostr)
3133                 break;
3134             sayNO;
3135         case GPOS:
3136             if (locinput == reginfo->ganch)
3137                 break;
3138             sayNO;
3139
3140         case KEEPS:
3141             /* update the startpoint */
3142             st->u.keeper.val = PL_regoffs[0].start;
3143             PL_reginput = locinput;
3144             PL_regoffs[0].start = locinput - PL_bostr;
3145             PUSH_STATE_GOTO(KEEPS_next, next);
3146             /*NOT-REACHED*/
3147         case KEEPS_next_fail:
3148             /* rollback the start point change */
3149             PL_regoffs[0].start = st->u.keeper.val;
3150             sayNO_SILENT;
3151             /*NOT-REACHED*/
3152         case EOL:
3153                 goto seol;
3154         case MEOL:
3155             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3156                 sayNO;
3157             break;
3158         case SEOL:
3159           seol:
3160             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3161                 sayNO;
3162             if (PL_regeol - locinput > 1)
3163                 sayNO;
3164             break;
3165         case EOS:
3166             if (PL_regeol != locinput)
3167                 sayNO;
3168             break;
3169         case SANY:
3170             if (!nextchr && locinput >= PL_regeol)
3171                 sayNO;
3172             if (utf8_target) {
3173                 locinput += PL_utf8skip[nextchr];
3174                 if (locinput > PL_regeol)
3175                     sayNO;
3176                 nextchr = UCHARAT(locinput);
3177             }
3178             else
3179                 nextchr = UCHARAT(++locinput);
3180             break;
3181         case CANY:
3182             if (!nextchr && locinput >= PL_regeol)
3183                 sayNO;
3184             nextchr = UCHARAT(++locinput);
3185             break;
3186         case REG_ANY:
3187             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3188                 sayNO;
3189             if (utf8_target) {
3190                 locinput += PL_utf8skip[nextchr];
3191                 if (locinput > PL_regeol)
3192                     sayNO;
3193                 nextchr = UCHARAT(locinput);
3194             }
3195             else
3196                 nextchr = UCHARAT(++locinput);
3197             break;
3198
3199 #undef  ST
3200 #define ST st->u.trie
3201         case TRIEC:
3202             /* In this case the charclass data is available inline so
3203                we can fail fast without a lot of extra overhead.
3204              */
3205             if (scan->flags == EXACT || !utf8_target) {
3206                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3207                     DEBUG_EXECUTE_r(
3208                         PerlIO_printf(Perl_debug_log,
3209                                   "%*s  %sfailed to match trie start class...%s\n",
3210                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3211                     );
3212                     sayNO_SILENT;
3213                     /* NOTREACHED */
3214                 }
3215             }
3216             /* FALL THROUGH */
3217         case TRIE:
3218             /* the basic plan of execution of the trie is:
3219              * At the beginning, run though all the states, and
3220              * find the longest-matching word. Also remember the position
3221              * of the shortest matching word. For example, this pattern:
3222              *    1  2 3 4    5
3223              *    ab|a|x|abcd|abc
3224              * when matched against the string "abcde", will generate
3225              * accept states for all words except 3, with the longest
3226              * matching word being 4, and the shortest being 1 (with
3227              * the position being after char 1 of the string).
3228              *
3229              * Then for each matching word, in word order (i.e. 1,2,4,5),
3230              * we run the remainder of the pattern; on each try setting
3231              * the current position to the character following the word,
3232              * returning to try the next word on failure.
3233              *
3234              * We avoid having to build a list of words at runtime by
3235              * using a compile-time structure, wordinfo[].prev, which
3236              * gives, for each word, the previous accepting word (if any).
3237              * In the case above it would contain the mappings 1->2, 2->0,
3238              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3239              * the longest word (4 above), a list of all words, by
3240              * following the list of prev pointers; this gives us the
3241              * unordered list 4,5,1,2. Then given the current word we have
3242              * just tried, we can go through the list and find the
3243              * next-biggest word to try (so if we just failed on word 2,
3244              * the next in the list is 4).
3245              *
3246              * Since at runtime we don't record the matching position in
3247              * the string for each word, we have to work that out for
3248              * each word we're about to process. The wordinfo table holds
3249              * the character length of each word; given that we recorded
3250              * at the start: the position of the shortest word and its
3251              * length in chars, we just need to move the pointer the
3252              * difference between the two char lengths. Depending on
3253              * Unicode status and folding, that's cheap or expensive.
3254              *
3255              * This algorithm is optimised for the case where there are only a
3256              * small number of accept states, i.e. 0,1, or maybe 2.
3257              * With lots of accepts states, and having to try all of them,
3258              * it becomes quadratic on number of accept states to find all
3259              * the next words.
3260              */
3261
3262             {
3263                 /* what type of TRIE am I? (utf8 makes this contextual) */
3264                 DECL_TRIE_TYPE(scan);
3265
3266                 /* what trie are we using right now */
3267                 reg_trie_data * const trie
3268                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3269                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3270                 U32 state = trie->startstate;
3271
3272                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3273                     !TRIE_BITMAP_TEST(trie,*locinput)
3274                 ) {
3275                     if (trie->states[ state ].wordnum) {
3276                          DEBUG_EXECUTE_r(
3277                             PerlIO_printf(Perl_debug_log,
3278                                           "%*s  %smatched empty string...%s\n",
3279                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3280                         );
3281                         if (!trie->jump)
3282                             break;
3283                     } else {
3284                         DEBUG_EXECUTE_r(
3285                             PerlIO_printf(Perl_debug_log,
3286                                           "%*s  %sfailed to match trie start class...%s\n",
3287                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3288                         );
3289                         sayNO_SILENT;
3290                    }
3291                 }
3292
3293             {
3294                 U8 *uc = ( U8* )locinput;
3295
3296                 STRLEN len = 0;
3297                 STRLEN foldlen = 0;
3298                 U8 *uscan = (U8*)NULL;
3299                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3300                 U32 charcount = 0; /* how many input chars we have matched */
3301                 U32 accepted = 0; /* have we seen any accepting states? */
3302
3303                 ST.B = next;
3304                 ST.jump = trie->jump;
3305                 ST.me = scan;
3306                 ST.firstpos = NULL;
3307                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3308                 ST.nextword = 0;
3309
3310                 /* fully traverse the TRIE; note the position of the
3311                    shortest accept state and the wordnum of the longest
3312                    accept state */
3313
3314                 while ( state && uc <= (U8*)PL_regeol ) {
3315                     U32 base = trie->states[ state ].trans.base;
3316                     UV uvc = 0;
3317                     U16 charid = 0;
3318                     U16 wordnum;
3319                     wordnum = trie->states[ state ].wordnum;
3320
3321                     if (wordnum) { /* it's an accept state */
3322                         if (!accepted) {
3323                             accepted = 1;
3324                             /* record first match position */
3325                             if (ST.longfold) {
3326                                 ST.firstpos = (U8*)locinput;
3327                                 ST.firstchars = 0;
3328                             }
3329                             else {
3330                                 ST.firstpos = uc;
3331                                 ST.firstchars = charcount;
3332                             }
3333                         }
3334                         if (!ST.nextword || wordnum < ST.nextword)
3335                             ST.nextword = wordnum;
3336                         ST.topword = wordnum;
3337                     }
3338
3339                     DEBUG_TRIE_EXECUTE_r({
3340                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3341                                 PerlIO_printf( Perl_debug_log,
3342                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3343                                     2+depth * 2, "", PL_colors[4],
3344                                     (UV)state, (accepted ? 'Y' : 'N'));
3345                     });
3346
3347                     /* read a char and goto next state */
3348                     if ( base ) {
3349                         I32 offset;
3350                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3351                                              uscan, len, uvc, charid, foldlen,
3352                                              foldbuf, uniflags);
3353                         charcount++;
3354                         if (foldlen>0)
3355                             ST.longfold = TRUE;
3356                         if (charid &&
3357                              ( ((offset =
3358                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3359
3360                              && ((U32)offset < trie->lasttrans)
3361                              && trie->trans[offset].check == state)
3362                         {
3363                             state = trie->trans[offset].next;
3364                         }
3365                         else {
3366                             state = 0;
3367                         }
3368                         uc += len;
3369
3370                     }
3371                     else {
3372                         state = 0;
3373                     }
3374                     DEBUG_TRIE_EXECUTE_r(
3375                         PerlIO_printf( Perl_debug_log,
3376                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3377                             charid, uvc, (UV)state, PL_colors[5] );
3378                     );
3379                 }
3380                 if (!accepted)
3381                    sayNO;
3382
3383                 /* calculate total number of accept states */
3384                 {
3385                     U16 w = ST.topword;
3386                     accepted = 0;
3387                     while (w) {
3388                         w = trie->wordinfo[w].prev;
3389                         accepted++;
3390                     }
3391                     ST.accepted = accepted;
3392                 }
3393
3394                 DEBUG_EXECUTE_r(
3395                     PerlIO_printf( Perl_debug_log,
3396                         "%*s  %sgot %"IVdf" possible matches%s\n",
3397                         REPORT_CODE_OFF + depth * 2, "",
3398                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3399                 );
3400                 goto trie_first_try; /* jump into the fail handler */
3401             }}
3402             /* NOTREACHED */
3403
3404         case TRIE_next_fail: /* we failed - try next alternative */
3405             if ( ST.jump) {
3406                 REGCP_UNWIND(ST.cp);
3407                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3408                     PL_regoffs[n].end = -1;
3409                 *PL_reglastparen = n;
3410             }
3411             if (!--ST.accepted) {
3412                 DEBUG_EXECUTE_r({
3413                     PerlIO_printf( Perl_debug_log,
3414                         "%*s  %sTRIE failed...%s\n",
3415                         REPORT_CODE_OFF+depth*2, "",
3416                         PL_colors[4],
3417                         PL_colors[5] );
3418                 });
3419                 sayNO_SILENT;
3420             }
3421             {
3422                 /* Find next-highest word to process.  Note that this code
3423                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3424                 register U16 min = 0;
3425                 register U16 word;
3426                 register U16 const nextword = ST.nextword;
3427                 register reg_trie_wordinfo * const wordinfo
3428                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3429                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3430                     if (word > nextword && (!min || word < min))
3431                         min = word;
3432                 }
3433                 ST.nextword = min;
3434             }
3435
3436           trie_first_try:
3437             if (do_cutgroup) {
3438                 do_cutgroup = 0;
3439                 no_final = 0;
3440             }
3441
3442             if ( ST.jump) {
3443                 ST.lastparen = *PL_reglastparen;
3444                 REGCP_SET(ST.cp);
3445             }
3446
3447             /* find start char of end of current word */
3448             {
3449                 U32 chars; /* how many chars to skip */
3450                 U8 *uc = ST.firstpos;
3451                 reg_trie_data * const trie
3452                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3453
3454                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3455                             >=  ST.firstchars);
3456                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3457                             - ST.firstchars;
3458
3459                 if (ST.longfold) {
3460                     /* the hard option - fold each char in turn and find
3461                      * its folded length (which may be different) */
3462                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3463                     STRLEN foldlen;
3464                     STRLEN len;
3465                     UV uvc;
3466                     U8 *uscan;
3467
3468                     while (chars) {
3469                         if (utf8_target) {
3470                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3471                                                     uniflags);
3472                             uc += len;
3473                         }
3474                         else {
3475                             uvc = *uc;
3476                             uc++;
3477                         }
3478                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3479                         uscan = foldbuf;
3480                         while (foldlen) {
3481                             if (!--chars)
3482                                 break;
3483                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3484                                             uniflags);
3485                             uscan += len;
3486                             foldlen -= len;
3487                         }
3488                     }
3489                 }
3490                 else {
3491                     if (utf8_target)
3492                         while (chars--)
3493                             uc += UTF8SKIP(uc);
3494                     else
3495                         uc += chars;
3496                 }
3497                 PL_reginput = (char *)uc;
3498             }
3499
3500             scan = (ST.jump && ST.jump[ST.nextword])
3501                         ? ST.me + ST.jump[ST.nextword]
3502                         : ST.B;
3503
3504             DEBUG_EXECUTE_r({
3505                 PerlIO_printf( Perl_debug_log,
3506                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3507                     REPORT_CODE_OFF+depth*2, "",
3508                     PL_colors[4],
3509                     ST.nextword,
3510                     PL_colors[5]
3511                     );
3512             });
3513
3514             if (ST.accepted > 1 || has_cutgroup) {
3515                 PUSH_STATE_GOTO(TRIE_next, scan);
3516                 /* NOTREACHED */
3517             }
3518             /* only one choice left - just continue */
3519             DEBUG_EXECUTE_r({
3520                 AV *const trie_words
3521                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3522                 SV ** const tmp = av_fetch( trie_words,
3523                     ST.nextword-1, 0 );
3524                 SV *sv= tmp ? sv_newmortal() : NULL;
3525
3526                 PerlIO_printf( Perl_debug_log,
3527                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3528                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3529                     ST.nextword,
3530                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3531                             PL_colors[0], PL_colors[1],
3532                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3533                         )
3534                     : "not compiled under -Dr",
3535                     PL_colors[5] );
3536             });
3537
3538             locinput = PL_reginput;
3539             nextchr = UCHARAT(locinput);
3540             continue; /* execute rest of RE */
3541             /* NOTREACHED */
3542 #undef  ST
3543
3544         case EXACT: {
3545             char *s = STRING(scan);
3546             ln = STR_LEN(scan);
3547             if (utf8_target != UTF_PATTERN) {
3548                 /* The target and the pattern have differing utf8ness. */
3549                 char *l = locinput;
3550                 const char * const e = s + ln;
3551
3552                 if (utf8_target) {
3553                     /* The target is utf8, the pattern is not utf8. */
3554                     while (s < e) {
3555                         STRLEN ulen;
3556                         if (l >= PL_regeol)
3557                              sayNO;
3558                         if (NATIVE_TO_UNI(*(U8*)s) !=
3559                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3560                                             uniflags))
3561                              sayNO;
3562                         l += ulen;
3563                         s ++;
3564                     }
3565                 }
3566                 else {
3567                     /* The target is not utf8, the pattern is utf8. */
3568                     while (s < e) {
3569                         STRLEN ulen;
3570                         if (l >= PL_regeol)
3571                             sayNO;
3572                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3573                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3574                                            uniflags))
3575                             sayNO;
3576                         s += ulen;
3577                         l ++;
3578                     }
3579                 }
3580                 locinput = l;
3581                 nextchr = UCHARAT(locinput);
3582                 break;
3583             }
3584             /* The target and the pattern have the same utf8ness. */
3585             /* Inline the first character, for speed. */
3586             if (UCHARAT(s) != nextchr)
3587                 sayNO;
3588             if (PL_regeol - locinput < ln)
3589                 sayNO;
3590             if (ln > 1 && memNE(s, locinput, ln))
3591                 sayNO;
3592             locinput += ln;
3593             nextchr = UCHARAT(locinput);
3594             break;
3595             }
3596         case EXACTFL: {
3597             re_fold_t folder;
3598             const U8 * fold_array;
3599             const char * s;
3600             U32 fold_utf8_flags;
3601
3602             PL_reg_flags |= RF_tainted;
3603             folder = foldEQ_locale;
3604             fold_array = PL_fold_locale;
3605             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3606             goto do_exactf;
3607
3608         case EXACTFU:
3609             folder = foldEQ_latin1;
3610             fold_array = PL_fold_latin1;
3611             fold_utf8_flags = 0;
3612             goto do_exactf;
3613
3614         case EXACTFA:
3615             folder = foldEQ_latin1;
3616             fold_array = PL_fold_latin1;
3617             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3618             goto do_exactf;
3619
3620         case EXACTF:
3621             folder = foldEQ;
3622             fold_array = PL_fold;
3623             fold_utf8_flags = 0;
3624
3625           do_exactf:
3626             s = STRING(scan);
3627             ln = STR_LEN(scan);
3628
3629             if (utf8_target || UTF_PATTERN) {
3630               /* Either target or the pattern are utf8. */
3631                 const char * const l = locinput;
3632                 char *e = PL_regeol;
3633
3634                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3635                                l, &e, 0,  utf8_target, fold_utf8_flags))
3636                 {
3637                     sayNO;
3638                 }
3639                 locinput = e;
3640                 nextchr = UCHARAT(locinput);
3641                 break;
3642             }
3643
3644             /* Neither the target nor the pattern are utf8 */
3645             if (UCHARAT(s) != nextchr &&
3646                 UCHARAT(s) != fold_array[nextchr])
3647             {
3648                 sayNO;
3649             }
3650             if (PL_regeol - locinput < ln)
3651                 sayNO;
3652             if (ln > 1 && ! folder(s, locinput, ln))
3653                 sayNO;
3654             locinput += ln;
3655             nextchr = UCHARAT(locinput);
3656             break;
3657         }
3658
3659         /* XXX Could improve efficiency by separating these all out using a
3660          * macro or in-line function.  At that point regcomp.c would no longer
3661          * have to set the FLAGS fields of these */
3662         case BOUNDL:
3663         case NBOUNDL:
3664             PL_reg_flags |= RF_tainted;
3665             /* FALL THROUGH */
3666         case BOUND:
3667         case BOUNDU:
3668         case BOUNDA:
3669         case NBOUND:
3670         case NBOUNDU:
3671         case NBOUNDA:
3672             /* was last char in word? */
3673             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3674                 if (locinput == PL_bostr)
3675                     ln = '\n';
3676                 else {
3677                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3678
3679                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3680                 }
3681                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3682                     ln = isALNUM_uni(ln);
3683                     LOAD_UTF8_CHARCLASS_ALNUM();
3684                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3685                 }
3686                 else {
3687                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3688                     n = isALNUM_LC_utf8((U8*)locinput);
3689                 }
3690             }
3691             else {
3692
3693                 /* Here the string isn't utf8, or is utf8 and only ascii
3694                  * characters are to match \w.  In the latter case looking at
3695                  * the byte just prior to the current one may be just the final
3696                  * byte of a multi-byte character.  This is ok.  There are two
3697                  * cases:
3698                  * 1) it is a single byte character, and then the test is doing
3699                  *      just what it's supposed to.
3700                  * 2) it is a multi-byte character, in which case the final
3701                  *      byte is never mistakable for ASCII, and so the test
3702                  *      will say it is not a word character, which is the
3703                  *      correct answer. */
3704                 ln = (locinput != PL_bostr) ?
3705                     UCHARAT(locinput - 1) : '\n';
3706                 switch (FLAGS(scan)) {
3707                     case REGEX_UNICODE_CHARSET:
3708                         ln = isWORDCHAR_L1(ln);
3709                         n = isWORDCHAR_L1(nextchr);
3710                         break;
3711                     case REGEX_LOCALE_CHARSET:
3712                         ln = isALNUM_LC(ln);
3713                         n = isALNUM_LC(nextchr);
3714                         break;
3715                     case REGEX_DEPENDS_CHARSET:
3716                         ln = isALNUM(ln);
3717                         n = isALNUM(nextchr);
3718                         break;
3719                     case REGEX_ASCII_RESTRICTED_CHARSET:
3720                         ln = isWORDCHAR_A(ln);
3721                         n = isWORDCHAR_A(nextchr);
3722                         break;
3723                     default:
3724                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3725                         break;
3726                 }
3727             }
3728             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3729              * regcomp.sym */
3730             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3731                     sayNO;
3732             break;
3733         case ANYOFV:
3734         case ANYOF:
3735             if (utf8_target || state_num == ANYOFV) {
3736                 STRLEN inclasslen = PL_regeol - locinput;
3737                 if (locinput >= PL_regeol)
3738                     sayNO;
3739
3740                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3741                     sayNO;
3742                 locinput += inclasslen;
3743                 nextchr = UCHARAT(locinput);
3744                 break;
3745             }
3746             else {
3747                 if (nextchr < 0)
3748                     nextchr = UCHARAT(locinput);
3749                 if (!nextchr && locinput >= PL_regeol)
3750                     sayNO;
3751                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3752                     sayNO;
3753                 nextchr = UCHARAT(++locinput);
3754                 break;
3755             }
3756             break;
3757         /* Special char classes - The defines start on line 129 or so */
3758         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3759                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3760                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3761                   ALNUMA, NALNUMA, isWORDCHAR_A,
3762                   alnum, "a");
3763
3764         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3765                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3766                   SPACEU, NSPACEU, isSPACE_L1,
3767                   SPACEA, NSPACEA, isSPACE_A,
3768                   space, " ");
3769
3770         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3771                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3772                 DIGITA, NDIGITA, isDIGIT_A,
3773                 digit, "0");
3774
3775         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3776                        a Unicode extended Grapheme Cluster */
3777             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3778               extended Grapheme Cluster is:
3779
3780                CR LF
3781                | Prepend* Begin Extend*
3782                | .
3783
3784                Begin is (Hangul-syllable | ! Control)
3785                Extend is (Grapheme_Extend | Spacing_Mark)
3786                Control is [ GCB_Control CR LF ]
3787
3788                The discussion below shows how the code for CLUMP is derived
3789                from this regex.  Note that most of these concepts are from
3790                property values of the Grapheme Cluster Boundary (GCB) property.
3791                No code point can have multiple property values for a given
3792                property.  Thus a code point in Prepend can't be in Control, but
3793                it must be in !Control.  This is why Control above includes
3794                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3795                property separately, and so can't be in GCB_Control, even though
3796                they logically are controls.  Control is not the same as gc=cc,
3797                but includes format and other characters as well.
3798
3799                The Unicode definition of Hangul-syllable is:
3800                   (L+
3801                    | (L* ( ( V | LV ) V* | LVT ) T*)
3802                    | T+
3803                   )
3804                Each of these is a value for the GCB property, and hence must be
3805                disjoint, so the order they are tested is immaterial, so the
3806                above can safely be changed to
3807                    T+
3808                    | L+
3809                    | (L* ( LVT | ( V | LV ) V*) T*)
3810
3811                The last two terms can be combined like this:
3812                    L* ( L
3813                         | (( LVT | ( V | LV ) V*) T*))
3814
3815                And refactored into this:
3816                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3817
3818                That means that if we have seen any L's at all we can quit
3819                there, but if the next character is a LVT, a V or an LV we
3820                should keep going.
3821
3822                There is a subtlety with Prepend* which showed up in testing.
3823                Note that the Begin, and only the Begin is required in:
3824                 | Prepend* Begin Extend*
3825                Also, Begin contains '! Control'.  A Prepend must be a '!
3826                Control', which means it must be a Begin.  What it comes down to
3827                is that if we match Prepend* and then find no suitable Begin
3828                afterwards, then if we backtrack the last Prepend, that one will
3829                be a suitable Begin.
3830             */
3831
3832             if (locinput >= PL_regeol)
3833                 sayNO;
3834             if  (! utf8_target) {
3835
3836                 /* Match either CR LF  or '.', as all the other possibilities
3837                  * require utf8 */
3838                 locinput++;         /* Match the . or CR */
3839                 if (nextchr == '\r'
3840                     && locinput < PL_regeol
3841                     && UCHARAT(locinput) == '\n') locinput++;
3842             }
3843             else {
3844
3845                 /* Utf8: See if is ( CR LF ); already know that locinput <
3846                  * PL_regeol, so locinput+1 is in bounds */
3847                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3848                     locinput += 2;
3849                 }
3850                 else {
3851                     /* In case have to backtrack to beginning, then match '.' */
3852                     char *starting = locinput;
3853
3854                     /* In case have to backtrack the last prepend */
3855                     char *previous_prepend = 0;
3856
3857                     LOAD_UTF8_CHARCLASS_GCB();
3858
3859                     /* Match (prepend)* */
3860                     while (locinput < PL_regeol
3861                            && swash_fetch(PL_utf8_X_prepend,
3862                                           (U8*)locinput, utf8_target))
3863                     {
3864                         previous_prepend = locinput;
3865                         locinput += UTF8SKIP(locinput);
3866                     }
3867
3868                     /* As noted above, if we matched a prepend character, but
3869                      * the next thing won't match, back off the last prepend we
3870                      * matched, as it is guaranteed to match the begin */
3871                     if (previous_prepend
3872                         && (locinput >=  PL_regeol
3873                             || ! swash_fetch(PL_utf8_X_begin,
3874                                              (U8*)locinput, utf8_target)))
3875                     {
3876                         locinput = previous_prepend;
3877                     }
3878
3879                     /* Note that here we know PL_regeol > locinput, as we
3880                      * tested that upon input to this switch case, and if we
3881                      * moved locinput forward, we tested the result just above
3882                      * and it either passed, or we backed off so that it will
3883                      * now pass */
3884                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3885
3886                         /* Here did not match the required 'Begin' in the
3887                          * second term.  So just match the very first
3888                          * character, the '.' of the final term of the regex */
3889                         locinput = starting + UTF8SKIP(starting);
3890                     } else {
3891
3892                         /* Here is the beginning of a character that can have
3893                          * an extender.  It is either a hangul syllable, or a
3894                          * non-control */
3895                         if (swash_fetch(PL_utf8_X_non_hangul,
3896                                         (U8*)locinput, utf8_target))
3897                         {
3898
3899                             /* Here not a Hangul syllable, must be a
3900                              * ('! Control') */
3901                             locinput += UTF8SKIP(locinput);
3902                         } else {
3903
3904                             /* Here is a Hangul syllable.  It can be composed
3905                              * of several individual characters.  One
3906                              * possibility is T+ */
3907                             if (swash_fetch(PL_utf8_X_T,
3908                                             (U8*)locinput, utf8_target))
3909                             {
3910                                 while (locinput < PL_regeol
3911                                         && swash_fetch(PL_utf8_X_T,
3912                                                         (U8*)locinput, utf8_target))
3913                                 {
3914                                     locinput += UTF8SKIP(locinput);
3915                                 }
3916                             } else {
3917
3918                                 /* Here, not T+, but is a Hangul.  That means
3919                                  * it is one of the others: L, LV, LVT or V,
3920                                  * and matches:
3921                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3922
3923                                 /* Match L*           */
3924                                 while (locinput < PL_regeol
3925                                         && swash_fetch(PL_utf8_X_L,
3926                                                         (U8*)locinput, utf8_target))
3927                                 {
3928                                     locinput += UTF8SKIP(locinput);
3929                                 }
3930
3931                                 /* Here, have exhausted L*.  If the next
3932                                  * character is not an LV, LVT nor V, it means
3933                                  * we had to have at least one L, so matches L+
3934                                  * in the original equation, we have a complete
3935                                  * hangul syllable.  Are done. */
3936
3937                                 if (locinput < PL_regeol
3938                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3939                                                     (U8*)locinput, utf8_target))
3940                                 {
3941
3942                                     /* Otherwise keep going.  Must be LV, LVT
3943                                      * or V.  See if LVT */
3944                                     if (swash_fetch(PL_utf8_X_LVT,
3945                                                     (U8*)locinput, utf8_target))
3946                                     {
3947                                         locinput += UTF8SKIP(locinput);
3948                                     } else {
3949
3950                                         /* Must be  V or LV.  Take it, then
3951                                          * match V*     */
3952                                         locinput += UTF8SKIP(locinput);
3953                                         while (locinput < PL_regeol
3954                                                 && swash_fetch(PL_utf8_X_V,
3955                                                          (U8*)locinput, utf8_target))
3956                                         {
3957                                             locinput += UTF8SKIP(locinput);
3958                                         }
3959                                     }
3960
3961                                     /* And any of LV, LVT, or V can be followed
3962                                      * by T*            */
3963                                     while (locinput < PL_regeol
3964                                            && swash_fetch(PL_utf8_X_T,
3965                                                            (U8*)locinput,
3966                                                            utf8_target))
3967                                     {
3968                                         locinput += UTF8SKIP(locinput);
3969                                     }
3970                                 }
3971                             }
3972                         }
3973
3974                         /* Match any extender */
3975                         while (locinput < PL_regeol
3976                                 && swash_fetch(PL_utf8_X_extend,
3977                                                 (U8*)locinput, utf8_target))
3978                         {
3979                             locinput += UTF8SKIP(locinput);
3980                         }
3981                     }
3982                 }
3983                 if (locinput > PL_regeol) sayNO;
3984             }
3985             nextchr = UCHARAT(locinput);
3986             break;
3987
3988         case NREFFL:
3989         {   /* The capture buffer cases.  The ones beginning with N for the
3990                named buffers just convert to the equivalent numbered and
3991                pretend they were called as the corresponding numbered buffer
3992                op.  */
3993             /* don't initialize these in the declaration, it makes C++
3994                unhappy */
3995             char *s;
3996             char type;
3997             re_fold_t folder;
3998             const U8 *fold_array;
3999             UV utf8_fold_flags;
4000
4001             PL_reg_flags |= RF_tainted;
4002             folder = foldEQ_locale;
4003             fold_array = PL_fold_locale;
4004             type = REFFL;
4005             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4006             goto do_nref;
4007
4008         case NREFFA:
4009             folder = foldEQ_latin1;
4010             fold_array = PL_fold_latin1;
4011             type = REFFA;
4012             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4013             goto do_nref;
4014
4015         case NREFFU:
4016             folder = foldEQ_latin1;
4017             fold_array = PL_fold_latin1;
4018             type = REFFU;
4019             utf8_fold_flags = 0;
4020             goto do_nref;
4021
4022         case NREFF:
4023             folder = foldEQ;
4024             fold_array = PL_fold;
4025             type = REFF;
4026             utf8_fold_flags = 0;
4027             goto do_nref;
4028
4029         case NREF:
4030             type = REF;
4031             folder = NULL;
4032             fold_array = NULL;
4033             utf8_fold_flags = 0;
4034           do_nref:
4035
4036             /* For the named back references, find the corresponding buffer
4037              * number */
4038             n = reg_check_named_buff_matched(rex,scan);
4039
4040             if ( ! n ) {
4041                 sayNO;
4042             }
4043             goto do_nref_ref_common;
4044
4045         case REFFL:
4046             PL_reg_flags |= RF_tainted;
4047             folder = foldEQ_locale;
4048             fold_array = PL_fold_locale;
4049             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4050             goto do_ref;
4051
4052         case REFFA:
4053             folder = foldEQ_latin1;
4054             fold_array = PL_fold_latin1;
4055             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4056             goto do_ref;
4057
4058         case REFFU:
4059             folder = foldEQ_latin1;
4060             fold_array = PL_fold_latin1;
4061             utf8_fold_flags = 0;
4062             goto do_ref;
4063
4064         case REFF:
4065             folder = foldEQ;
4066             fold_array = PL_fold;
4067             utf8_fold_flags = 0;
4068             goto do_ref;
4069
4070         case REF:
4071             folder = NULL;
4072             fold_array = NULL;
4073             utf8_fold_flags = 0;
4074
4075           do_ref:
4076             type = OP(scan);
4077             n = ARG(scan);  /* which paren pair */
4078
4079           do_nref_ref_common:
4080             ln = PL_regoffs[n].start;
4081             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4082             if (*PL_reglastparen < n || ln == -1)
4083                 sayNO;                  /* Do not match unless seen CLOSEn. */
4084             if (ln == PL_regoffs[n].end)
4085                 break;
4086
4087             s = PL_bostr + ln;
4088             if (type != REF     /* REF can do byte comparison */
4089                 && (utf8_target || type == REFFU))
4090             { /* XXX handle REFFL better */
4091                 char * limit = PL_regeol;
4092
4093                 /* This call case insensitively compares the entire buffer
4094                     * at s, with the current input starting at locinput, but
4095                     * not going off the end given by PL_regeol, and returns in
4096                     * limit upon success, how much of the current input was
4097                     * matched */
4098                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4099                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4100                 {
4101                     sayNO;
4102                 }
4103                 locinput = limit;
4104                 nextchr = UCHARAT(locinput);
4105                 break;
4106             }
4107
4108             /* Not utf8:  Inline the first character, for speed. */
4109             if (UCHARAT(s) != nextchr &&
4110                 (type == REF ||
4111                  UCHARAT(s) != fold_array[nextchr]))
4112                 sayNO;
4113             ln = PL_regoffs[n].end - ln;
4114             if (locinput + ln > PL_regeol)
4115                 sayNO;
4116             if (ln > 1 && (type == REF
4117                            ? memNE(s, locinput, ln)
4118                            : ! folder(s, locinput, ln)))
4119                 sayNO;
4120             locinput += ln;
4121             nextchr = UCHARAT(locinput);
4122             break;
4123         }
4124         case NOTHING:
4125         case TAIL:
4126             break;
4127         case BACK:
4128             break;
4129
4130 #undef  ST
4131 #define ST st->u.eval
4132         {
4133             SV *ret;
4134             REGEXP *re_sv;
4135             regexp *re;
4136             regexp_internal *rei;
4137             regnode *startpoint;
4138
4139         case GOSTART:
4140         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4141             if (cur_eval && cur_eval->locinput==locinput) {
4142                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4143                     Perl_croak(aTHX_ "Infinite recursion in regex");
4144                 if ( ++nochange_depth > max_nochange_depth )
4145                     Perl_croak(aTHX_
4146                         "Pattern subroutine nesting without pos change"
4147                         " exceeded limit in regex");
4148             } else {
4149                 nochange_depth = 0;
4150             }
4151             re_sv = rex_sv;
4152             re = rex;
4153             rei = rexi;
4154             (void)ReREFCNT_inc(rex_sv);
4155             if (OP(scan)==GOSUB) {
4156                 startpoint = scan + ARG2L(scan);
4157                 ST.close_paren = ARG(scan);
4158             } else {
4159                 startpoint = rei->program+1;
4160                 ST.close_paren = 0;
4161             }
4162             goto eval_recurse_doit;
4163             /* NOTREACHED */
4164         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4165             if (cur_eval && cur_eval->locinput==locinput) {
4166                 if ( ++nochange_depth > max_nochange_depth )
4167                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4168             } else {
4169                 nochange_depth = 0;
4170             }
4171             {
4172                 /* execute the code in the {...} */
4173                 dSP;
4174                 SV ** const before = SP;
4175                 OP_4tree * const oop = PL_op;
4176                 COP * const ocurcop = PL_curcop;
4177                 PAD *old_comppad;
4178                 char *saved_regeol = PL_regeol;
4179                 struct re_save_state saved_state;
4180
4181                 /* To not corrupt the existing regex state while executing the
4182                  * eval we would normally put it on the save stack, like with
4183                  * save_re_context. However, re-evals have a weird scoping so we
4184                  * can't just add ENTER/LEAVE here. With that, things like
4185                  *
4186                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4187                  *
4188                  * would break, as they expect the localisation to be unwound
4189                  * only when the re-engine backtracks through the bit that
4190                  * localised it.
4191                  *
4192                  * What we do instead is just to save the state in a local C
4193                  * variable.
4194                  */
4195                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4196
4197                 n = ARG(scan);
4198                 PL_op = (OP_4tree*)rexi->data->data[n];
4199                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4200                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4201                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4202                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4203
4204                 if (sv_yes_mark) {
4205                     SV *sv_mrk = get_sv("REGMARK", 1);
4206                     sv_setsv(sv_mrk, sv_yes_mark);
4207                 }
4208
4209                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4210                 SPAGAIN;
4211                 if (SP == before)
4212                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4213                 else {
4214                     ret = POPs;
4215                     PUTBACK;
4216                 }
4217
4218                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4219
4220                 PL_op = oop;
4221                 PAD_RESTORE_LOCAL(old_comppad);
4222                 PL_curcop = ocurcop;
4223                 PL_regeol = saved_regeol;
4224                 if (!logical) {
4225                     /* /(?{...})/ */
4226                     sv_setsv(save_scalar(PL_replgv), ret);
4227                     break;
4228                 }
4229             }
4230             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4231                 logical = 0;
4232                 {
4233                     /* extract RE object from returned value; compiling if
4234                      * necessary */
4235                     MAGIC *mg = NULL;
4236                     REGEXP *rx = NULL;
4237
4238                     if (SvROK(ret)) {
4239                         SV *const sv = SvRV(ret);
4240
4241                         if (SvTYPE(sv) == SVt_REGEXP) {
4242                             rx = (REGEXP*) sv;
4243                         } else if (SvSMAGICAL(sv)) {
4244                             mg = mg_find(sv, PERL_MAGIC_qr);
4245                             assert(mg);
4246                         }
4247                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4248                         rx = (REGEXP*) ret;
4249                     } else if (SvSMAGICAL(ret)) {
4250                         if (SvGMAGICAL(ret)) {
4251                             /* I don't believe that there is ever qr magic
4252                                here.  */
4253                             assert(!mg_find(ret, PERL_MAGIC_qr));
4254                             sv_unmagic(ret, PERL_MAGIC_qr);
4255                         }
4256                         else {
4257                             mg = mg_find(ret, PERL_MAGIC_qr);
4258                             /* testing suggests mg only ends up non-NULL for
4259                                scalars that were upgraded and compiled in the
4260                                else block below. In turn, this is only
4261                                triggered in the "postponed utf8 string" tests
4262                                in t/op/pat.t  */
4263                         }
4264                     }
4265
4266                     if (mg) {
4267                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4268                         assert(rx);
4269                     }
4270                     if (rx) {
4271                         rx = reg_temp_copy(NULL, rx);
4272                     }
4273                     else {
4274                         U32 pm_flags = 0;
4275                         const I32 osize = PL_regsize;
4276
4277                         if (DO_UTF8(ret)) {
4278                             assert (SvUTF8(ret));
4279                         } else if (SvUTF8(ret)) {
4280                             /* Not doing UTF-8, despite what the SV says. Is
4281                                this only if we're trapped in use 'bytes'?  */
4282                             /* Make a copy of the octet sequence, but without
4283                                the flag on, as the compiler now honours the
4284                                SvUTF8 flag on ret.  */
4285                             STRLEN len;
4286                             const char *const p = SvPV(ret, len);
4287                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4288                         }
4289                         rx = CALLREGCOMP(ret, pm_flags);
4290                         if (!(SvFLAGS(ret)
4291                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4292                                  | SVs_GMG))) {
4293                             /* This isn't a first class regexp. Instead, it's
4294                                caching a regexp onto an existing, Perl visible
4295                                scalar.  */
4296                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4297                         }
4298                         PL_regsize = osize;
4299                     }
4300                     re_sv = rx;
4301                     re = (struct regexp *)SvANY(rx);
4302                 }
4303                 RXp_MATCH_COPIED_off(re);
4304                 re->subbeg = rex->subbeg;
4305                 re->sublen = rex->sublen;
4306                 rei = RXi_GET(re);
4307                 DEBUG_EXECUTE_r(
4308                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4309                         "Matching embedded");
4310                 );
4311                 startpoint = rei->program + 1;
4312                 ST.close_paren = 0; /* only used for GOSUB */
4313                 /* borrowed from regtry */
4314                 if (PL_reg_start_tmpl <= re->nparens) {
4315                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4316                     if(PL_reg_start_tmp)
4317                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4318                     else
4319                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4320                 }
4321
4322         eval_recurse_doit: /* Share code with GOSUB below this line */
4323                 /* run the pattern returned from (??{...}) */
4324                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4325                 REGCP_SET(ST.lastcp);
4326
4327                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4328
4329                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4330                 PL_reglastparen = &re->lastparen;
4331                 PL_reglastcloseparen = &re->lastcloseparen;
4332                 re->lastparen = 0;
4333                 re->lastcloseparen = 0;
4334
4335                 PL_reginput = locinput;
4336                 PL_regsize = 0;
4337
4338                 /* XXXX This is too dramatic a measure... */
4339                 PL_reg_maxiter = 0;
4340
4341                 ST.toggle_reg_flags = PL_reg_flags;
4342                 if (RX_UTF8(re_sv))
4343                     PL_reg_flags |= RF_utf8;
4344                 else
4345                     PL_reg_flags &= ~RF_utf8;
4346                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4347
4348                 ST.prev_rex = rex_sv;
4349                 ST.prev_curlyx = cur_curlyx;
4350                 SETREX(rex_sv,re_sv);
4351                 rex = re;
4352                 rexi = rei;
4353                 cur_curlyx = NULL;
4354                 ST.B = next;
4355                 ST.prev_eval = cur_eval;
4356                 cur_eval = st;
4357                 /* now continue from first node in postponed RE */
4358                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4359                 /* NOTREACHED */
4360             }
4361             /* logical is 1,   /(?(?{...})X|Y)/ */
4362             sw = cBOOL(SvTRUE(ret));
4363             logical = 0;
4364             break;
4365         }
4366
4367         case EVAL_AB: /* cleanup after a successful (??{A})B */
4368             /* note: this is called twice; first after popping B, then A */
4369             PL_reg_flags ^= ST.toggle_reg_flags;
4370             ReREFCNT_dec(rex_sv);
4371             SETREX(rex_sv,ST.prev_rex);
4372             rex = (struct regexp *)SvANY(rex_sv);
4373             rexi = RXi_GET(rex);
4374             regcpblow(ST.cp);
4375             cur_eval = ST.prev_eval;
4376             cur_curlyx = ST.prev_curlyx;
4377
4378             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4379             PL_reglastparen = &rex->lastparen;
4380             PL_reglastcloseparen = &rex->lastcloseparen;
4381             /* also update PL_regoffs */
4382             PL_regoffs = rex->offs;
4383
4384             /* XXXX This is too dramatic a measure... */
4385             PL_reg_maxiter = 0;
4386             if ( nochange_depth )
4387                 nochange_depth--;
4388             sayYES;
4389
4390
4391         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4392             /* note: this is called twice; first after popping B, then A */
4393             PL_reg_flags ^= ST.toggle_reg_flags;
4394             ReREFCNT_dec(rex_sv);
4395             SETREX(rex_sv,ST.prev_rex);
4396             rex = (struct regexp *)SvANY(rex_sv);
4397             rexi = RXi_GET(rex);
4398             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4399             PL_reglastparen = &rex->lastparen;
4400             PL_reglastcloseparen = &rex->lastcloseparen;
4401
4402             PL_reginput = locinput;
4403             REGCP_UNWIND(ST.lastcp);
4404             regcppop(rex);
4405             cur_eval = ST.prev_eval;
4406             cur_curlyx = ST.prev_curlyx;
4407             /* XXXX This is too dramatic a measure... */
4408             PL_reg_maxiter = 0;
4409             if ( nochange_depth )
4410                 nochange_depth--;
4411             sayNO_SILENT;
4412 #undef ST
4413
4414         case OPEN:
4415             n = ARG(scan);  /* which paren pair */
4416             PL_reg_start_tmp[n] = locinput;
4417             if (n > PL_regsize)
4418                 PL_regsize = n;
4419             lastopen = n;
4420             break;
4421         case CLOSE:
4422             n = ARG(scan);  /* which paren pair */
4423             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4424             PL_regoffs[n].end = locinput - PL_bostr;
4425             /*if (n > PL_regsize)
4426                 PL_regsize = n;*/
4427             if (n > *PL_reglastparen)
4428                 *PL_reglastparen = n;
4429             *PL_reglastcloseparen = n;
4430             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4431                 goto fake_end;
4432             }
4433             break;
4434         case ACCEPT:
4435             if (ARG(scan)){
4436                 regnode *cursor;
4437                 for (cursor=scan;
4438                      cursor && OP(cursor)!=END;
4439                      cursor=regnext(cursor))
4440                 {
4441                     if ( OP(cursor)==CLOSE ){
4442                         n = ARG(cursor);
4443                         if ( n <= lastopen ) {
4444                             PL_regoffs[n].start
4445                                 = PL_reg_start_tmp[n] - PL_bostr;
4446                             PL_regoffs[n].end = locinput - PL_bostr;
4447                             /*if (n > PL_regsize)
4448                             PL_regsize = n;*/
4449                             if (n > *PL_reglastparen)
4450                                 *PL_reglastparen = n;
4451                             *PL_reglastcloseparen = n;
4452                             if ( n == ARG(scan) || (cur_eval &&
4453                                 cur_eval->u.eval.close_paren == n))
4454                                 break;
4455                         }
4456                     }
4457                 }
4458             }
4459             goto fake_end;
4460             /*NOTREACHED*/
4461         case GROUPP:
4462             n = ARG(scan);  /* which paren pair */
4463             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4464             break;
4465         case NGROUPP:
4466             /* reg_check_named_buff_matched returns 0 for no match */
4467             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4468             break;
4469         case INSUBP:
4470             n = ARG(scan);
4471             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4472             break;
4473         case DEFINEP:
4474             sw = 0;
4475             break;
4476         case IFTHEN:
4477             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4478             if (sw)
4479                 next = NEXTOPER(NEXTOPER(scan));
4480             else {
4481                 next = scan + ARG(scan);
4482                 if (OP(next) == IFTHEN) /* Fake one. */
4483                     next = NEXTOPER(NEXTOPER(next));
4484             }
4485             break;
4486         case LOGICAL:
4487             logical = scan->flags;
4488             break;
4489
4490 /*******************************************************************
4491
4492 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4493 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4494 STAR/PLUS/CURLY/CURLYN are used instead.)
4495
4496 A*B is compiled as <CURLYX><A><WHILEM><B>
4497
4498 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4499 state, which contains the current count, initialised to -1. It also sets
4500 cur_curlyx to point to this state, with any previous value saved in the
4501 state block.
4502
4503 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4504 since the pattern may possibly match zero times (i.e. it's a while {} loop
4505 rather than a do {} while loop).
4506
4507 Each entry to WHILEM represents a successful match of A. The count in the
4508 CURLYX block is incremented, another WHILEM state is pushed, and execution
4509 passes to A or B depending on greediness and the current count.
4510
4511 For example, if matching against the string a1a2a3b (where the aN are
4512 substrings that match /A/), then the match progresses as follows: (the
4513 pushed states are interspersed with the bits of strings matched so far):
4514
4515     <CURLYX cnt=-1>
4516     <CURLYX cnt=0><WHILEM>
4517     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4518     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4519     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4520     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4521
4522 (Contrast this with something like CURLYM, which maintains only a single
4523 backtrack state:
4524
4525     <CURLYM cnt=0> a1
4526     a1 <CURLYM cnt=1> a2
4527     a1 a2 <CURLYM cnt=2> a3
4528     a1 a2 a3 <CURLYM cnt=3> b
4529 )
4530
4531 Each WHILEM state block marks a point to backtrack to upon partial failure
4532 of A or B, and also contains some minor state data related to that
4533 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4534 overall state, such as the count, and pointers to the A and B ops.
4535
4536 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4537 must always point to the *current* CURLYX block, the rules are:
4538
4539 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4540 and set cur_curlyx to point to the new block.
4541
4542 When popping the CURLYX block after a successful or unsuccessful match,
4543 restore the previous cur_curlyx.
4544
4545 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4546 to the outer one saved in the CURLYX block.
4547
4548 When popping the WHILEM block after a successful or unsuccessful B match,
4549 restore the previous cur_curlyx.
4550
4551 Here's an example for the pattern (AI* BI)*BO
4552 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4553
4554 cur_
4555 curlyx backtrack stack
4556 ------ ---------------
4557 NULL
4558 CO     <CO prev=NULL> <WO>
4559 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4560 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4561 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4562
4563 At this point the pattern succeeds, and we work back down the stack to
4564 clean up, restoring as we go:
4565
4566 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4567 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4568 CO     <CO prev=NULL> <WO>
4569 NULL
4570
4571 *******************************************************************/
4572
4573 #define ST st->u.curlyx
4574
4575         case CURLYX:    /* start of /A*B/  (for complex A) */
4576         {
4577             /* No need to save/restore up to this paren */
4578             I32 parenfloor = scan->flags;
4579
4580             assert(next); /* keep Coverity happy */
4581             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4582                 next += ARG(next);
4583
4584             /* XXXX Probably it is better to teach regpush to support
4585                parenfloor > PL_regsize... */
4586             if (parenfloor > (I32)*PL_reglastparen)
4587                 parenfloor = *PL_reglastparen; /* Pessimization... */
4588
4589             ST.prev_curlyx= cur_curlyx;
4590             cur_curlyx = st;
4591             ST.cp = PL_savestack_ix;
4592
4593             /* these fields contain the state of the current curly.
4594              * they are accessed by subsequent WHILEMs */
4595             ST.parenfloor = parenfloor;
4596             ST.me = scan;
4597             ST.B = next;
4598             ST.minmod = minmod;
4599             minmod = 0;
4600             ST.count = -1;      /* this will be updated by WHILEM */
4601             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4602
4603             PL_reginput = locinput;
4604             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4605             /* NOTREACHED */
4606         }
4607
4608         case CURLYX_end: /* just finished matching all of A*B */
4609             cur_curlyx = ST.prev_curlyx;
4610             sayYES;
4611             /* NOTREACHED */
4612
4613         case CURLYX_end_fail: /* just failed to match all of A*B */
4614             regcpblow(ST.cp);
4615             cur_curlyx = ST.prev_curlyx;
4616             sayNO;
4617             /* NOTREACHED */
4618
4619
4620 #undef ST
4621 #define ST st->u.whilem
4622
4623         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4624         {
4625             /* see the discussion above about CURLYX/WHILEM */
4626             I32 n;
4627             int min = ARG1(cur_curlyx->u.curlyx.me);
4628             int max = ARG2(cur_curlyx->u.curlyx.me);
4629             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4630
4631             assert(cur_curlyx); /* keep Coverity happy */
4632             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4633             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4634             ST.cache_offset = 0;
4635             ST.cache_mask = 0;
4636
4637             PL_reginput = locinput;
4638
4639             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4640                   "%*s  whilem: matched %ld out of %d..%d\n",
4641                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4642             );
4643
4644             /* First just match a string of min A's. */
4645
4646             if (n < min) {
4647                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4648                 cur_curlyx->u.curlyx.lastloc = locinput;
4649                 REGCP_SET(ST.lastcp);
4650
4651                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4652                 /* NOTREACHED */
4653             }
4654
4655             /* If degenerate A matches "", assume A done. */
4656
4657             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4658                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4659                    "%*s  whilem: empty match detected, trying continuation...\n",
4660                    REPORT_CODE_OFF+depth*2, "")
4661                 );
4662                 goto do_whilem_B_max;
4663             }
4664
4665             /* super-linear cache processing */
4666
4667             if (scan->flags) {
4668
4669                 if (!PL_reg_maxiter) {
4670                     /* start the countdown: Postpone detection until we
4671                      * know the match is not *that* much linear. */
4672                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4673                     /* possible overflow for long strings and many CURLYX's */
4674                     if (PL_reg_maxiter < 0)
4675                         PL_reg_maxiter = I32_MAX;
4676                     PL_reg_leftiter = PL_reg_maxiter;
4677                 }
4678
4679                 if (PL_reg_leftiter-- == 0) {
4680                     /* initialise cache */
4681                     const I32 size = (PL_reg_maxiter + 7)/8;
4682                     if (PL_reg_poscache) {
4683                         if ((I32)PL_reg_poscache_size < size) {
4684                             Renew(PL_reg_poscache, size, char);
4685                             PL_reg_poscache_size = size;
4686                         }
4687                         Zero(PL_reg_poscache, size, char);
4688                     }
4689                     else {
4690                         PL_reg_poscache_size = size;
4691                         Newxz(PL_reg_poscache, size, char);
4692                     }
4693                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4694       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4695                               PL_colors[4], PL_colors[5])
4696                     );
4697                 }
4698
4699                 if (PL_reg_leftiter < 0) {
4700                     /* have we already failed at this position? */
4701                     I32 offset, mask;
4702                     offset  = (scan->flags & 0xf) - 1
4703                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4704                     mask    = 1 << (offset % 8);
4705                     offset /= 8;
4706                     if (PL_reg_poscache[offset] & mask) {
4707                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4708                             "%*s  whilem: (cache) already tried at this position...\n",
4709                             REPORT_CODE_OFF+depth*2, "")
4710                         );
4711                         sayNO; /* cache records failure */
4712                     }
4713                     ST.cache_offset = offset;
4714                     ST.cache_mask   = mask;
4715                 }
4716             }
4717
4718             /* Prefer B over A for minimal matching. */
4719
4720             if (cur_curlyx->u.curlyx.minmod) {
4721                 ST.save_curlyx = cur_curlyx;
4722                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4723                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4724                 REGCP_SET(ST.lastcp);
4725                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4726                 /* NOTREACHED */
4727             }
4728
4729             /* Prefer A over B for maximal matching. */
4730
4731             if (n < max) { /* More greed allowed? */
4732                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4733                 cur_curlyx->u.curlyx.lastloc = locinput;
4734                 REGCP_SET(ST.lastcp);
4735                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4736                 /* NOTREACHED */
4737             }
4738             goto do_whilem_B_max;
4739         }
4740         /* NOTREACHED */
4741
4742         case WHILEM_B_min: /* just matched B in a minimal match */
4743         case WHILEM_B_max: /* just matched B in a maximal match */
4744             cur_curlyx = ST.save_curlyx;
4745             sayYES;
4746             /* NOTREACHED */
4747
4748         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4749             cur_curlyx = ST.save_curlyx;
4750             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4751             cur_curlyx->u.curlyx.count--;
4752             CACHEsayNO;
4753             /* NOTREACHED */
4754
4755         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4756             /* FALL THROUGH */
4757         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4758             REGCP_UNWIND(ST.lastcp);
4759             regcppop(rex);
4760             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4761             cur_curlyx->u.curlyx.count--;
4762             CACHEsayNO;
4763             /* NOTREACHED */
4764
4765         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4766             REGCP_UNWIND(ST.lastcp);
4767             regcppop(rex);      /* Restore some previous $<digit>s? */
4768             PL_reginput = locinput;
4769             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4770                 "%*s  whilem: failed, trying continuation...\n",
4771                 REPORT_CODE_OFF+depth*2, "")
4772             );
4773           do_whilem_B_max:
4774             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4775                 && ckWARN(WARN_REGEXP)
4776                 && !(PL_reg_flags & RF_warned))
4777             {
4778                 PL_reg_flags |= RF_warned;
4779                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4780                      "Complex regular subexpression recursion",
4781                      REG_INFTY - 1);
4782             }
4783
4784             /* now try B */
4785             ST.save_curlyx = cur_curlyx;
4786             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4787             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4788             /* NOTREACHED */
4789
4790         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4791             cur_curlyx = ST.save_curlyx;
4792             REGCP_UNWIND(ST.lastcp);
4793             regcppop(rex);
4794
4795             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4796                 /* Maximum greed exceeded */
4797                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4798                     && ckWARN(WARN_REGEXP)
4799                     && !(PL_reg_flags & RF_warned))
4800                 {
4801                     PL_reg_flags |= RF_warned;
4802                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4803                         "%s limit (%d) exceeded",
4804                         "Complex regular subexpression recursion",
4805                         REG_INFTY - 1);
4806                 }
4807                 cur_curlyx->u.curlyx.count--;
4808                 CACHEsayNO;
4809             }
4810
4811             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4812                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4813             );
4814             /* Try grabbing another A and see if it helps. */
4815             PL_reginput = locinput;
4816             cur_curlyx->u.curlyx.lastloc = locinput;
4817             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4818             REGCP_SET(ST.lastcp);
4819             PUSH_STATE_GOTO(WHILEM_A_min,
4820                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4821             /* NOTREACHED */
4822
4823 #undef  ST
4824 #define ST st->u.branch
4825
4826         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4827             next = scan + ARG(scan);
4828             if (next == scan)
4829                 next = NULL;
4830             scan = NEXTOPER(scan);
4831             /* FALL THROUGH */
4832
4833         case BRANCH:        /*  /(...|A|...)/ */
4834             scan = NEXTOPER(scan); /* scan now points to inner node */
4835             ST.lastparen = *PL_reglastparen;
4836             ST.next_branch = next;
4837             REGCP_SET(ST.cp);
4838             PL_reginput = locinput;
4839
4840             /* Now go into the branch */
4841             if (has_cutgroup) {
4842                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4843             } else {
4844                 PUSH_STATE_GOTO(BRANCH_next, scan);
4845             }
4846             /* NOTREACHED */
4847         case CUTGROUP:
4848             PL_reginput = locinput;
4849             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4850                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4851             PUSH_STATE_GOTO(CUTGROUP_next,next);
4852             /* NOTREACHED */
4853         case CUTGROUP_next_fail:
4854             do_cutgroup = 1;
4855             no_final = 1;
4856             if (st->u.mark.mark_name)
4857                 sv_commit = st->u.mark.mark_name;
4858             sayNO;
4859             /* NOTREACHED */
4860         case BRANCH_next:
4861             sayYES;
4862             /* NOTREACHED */
4863         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4864             if (do_cutgroup) {
4865                 do_cutgroup = 0;
4866                 no_final = 0;
4867             }
4868             REGCP_UNWIND(ST.cp);
4869             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4870                 PL_regoffs[n].end = -1;
4871             *PL_reglastparen = n;
4872             /*dmq: *PL_reglastcloseparen = n; */
4873             scan = ST.next_branch;
4874             /* no more branches? */
4875             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4876                 DEBUG_EXECUTE_r({
4877                     PerlIO_printf( Perl_debug_log,
4878                         "%*s  %sBRANCH failed...%s\n",
4879                         REPORT_CODE_OFF+depth*2, "",
4880                         PL_colors[4],
4881                         PL_colors[5] );
4882                 });
4883                 sayNO_SILENT;
4884             }
4885             continue; /* execute next BRANCH[J] op */
4886             /* NOTREACHED */
4887
4888         case MINMOD:
4889             minmod = 1;
4890             break;
4891
4892 #undef  ST
4893 #define ST st->u.curlym
4894
4895         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4896
4897             /* This is an optimisation of CURLYX that enables us to push
4898              * only a single backtracking state, no matter how many matches
4899              * there are in {m,n}. It relies on the pattern being constant
4900              * length, with no parens to influence future backrefs
4901              */
4902
4903             ST.me = scan;
4904             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4905
4906             /* if paren positive, emulate an OPEN/CLOSE around A */
4907             if (ST.me->flags) {
4908                 U32 paren = ST.me->flags;
4909                 if (paren > PL_regsize)
4910                     PL_regsize = paren;
4911                 if (paren > *PL_reglastparen)
4912                     *PL_reglastparen = paren;
4913                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4914             }
4915             ST.A = scan;
4916             ST.B = next;
4917             ST.alen = 0;
4918             ST.count = 0;
4919             ST.minmod = minmod;
4920             minmod = 0;
4921             ST.c1 = CHRTEST_UNINIT;
4922             REGCP_SET(ST.cp);
4923
4924             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4925                 goto curlym_do_B;
4926
4927           curlym_do_A: /* execute the A in /A{m,n}B/  */
4928             PL_reginput = locinput;
4929             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4930             /* NOTREACHED */
4931
4932         case CURLYM_A: /* we've just matched an A */
4933             locinput = st->locinput;
4934             nextchr = UCHARAT(locinput);
4935
4936             ST.count++;
4937             /* after first match, determine A's length: u.curlym.alen */
4938             if (ST.count == 1) {
4939                 if (PL_reg_match_utf8) {
4940                     char *s = locinput;
4941                     while (s < PL_reginput) {
4942                         ST.alen++;
4943                         s += UTF8SKIP(s);
4944                     }
4945                 }
4946                 else {
4947                     ST.alen = PL_reginput - locinput;
4948                 }
4949                 if (ST.alen == 0)
4950                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4951             }
4952             DEBUG_EXECUTE_r(
4953                 PerlIO_printf(Perl_debug_log,
4954                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4955                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4956                           (IV) ST.count, (IV)ST.alen)
4957             );
4958
4959             locinput = PL_reginput;
4960
4961             if (cur_eval && cur_eval->u.eval.close_paren &&
4962                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4963                 goto fake_end;
4964
4965             {
4966                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4967                 if ( max == REG_INFTY || ST.count < max )
4968                     goto curlym_do_A; /* try to match another A */
4969             }
4970             goto curlym_do_B; /* try to match B */
4971
4972         case CURLYM_A_fail: /* just failed to match an A */
4973             REGCP_UNWIND(ST.cp);
4974
4975             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4976                 || (cur_eval && cur_eval->u.eval.close_paren &&
4977                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4978                 sayNO;
4979
4980           curlym_do_B: /* execute the B in /A{m,n}B/  */
4981             PL_reginput = locinput;
4982             if (ST.c1 == CHRTEST_UNINIT) {
4983                 /* calculate c1 and c2 for possible match of 1st char
4984                  * following curly */
4985                 ST.c1 = ST.c2 = CHRTEST_VOID;
4986                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4987                     regnode *text_node = ST.B;
4988                     if (! HAS_TEXT(text_node))
4989                         FIND_NEXT_IMPT(text_node);
4990                     /* this used to be
4991
4992                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4993
4994                         But the former is redundant in light of the latter.
4995
4996                         if this changes back then the macros for
4997                         IS_TEXT and friends need to change.
4998                      */
4999                     if (PL_regkind[OP(text_node)] == EXACT)
5000                     {
5001
5002                         ST.c1 = (U8)*STRING(text_node);
5003                         switch (OP(text_node)) {
5004                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5005                             case EXACTFA:
5006                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5007                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5008                             default: ST.c2 = ST.c1;
5009                         }
5010                     }
5011                 }
5012             }
5013
5014             DEBUG_EXECUTE_r(
5015                 PerlIO_printf(Perl_debug_log,
5016                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5017                     (int)(REPORT_CODE_OFF+(depth*2)),
5018                     "", (IV)ST.count)
5019                 );
5020             if (ST.c1 != CHRTEST_VOID
5021                     && UCHARAT(PL_reginput) != ST.c1
5022                     && UCHARAT(PL_reginput) != ST.c2)
5023             {
5024                 /* simulate B failing */
5025                 DEBUG_OPTIMISE_r(
5026                     PerlIO_printf(Perl_debug_log,
5027                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5028                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5029                         (IV)ST.c1,(IV)ST.c2
5030                 ));
5031                 state_num = CURLYM_B_fail;
5032                 goto reenter_switch;
5033             }
5034
5035             if (ST.me->flags) {
5036                 /* mark current A as captured */
5037                 I32 paren = ST.me->flags;
5038                 if (ST.count) {
5039                     PL_regoffs[paren].start
5040                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5041                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5042                     /*dmq: *PL_reglastcloseparen = paren; */
5043                 }
5044                 else
5045                     PL_regoffs[paren].end = -1;
5046                 if (cur_eval && cur_eval->u.eval.close_paren &&
5047                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5048                 {
5049                     if (ST.count)
5050                         goto fake_end;
5051                     else
5052                         sayNO;
5053                 }
5054             }
5055
5056             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5057             /* NOTREACHED */
5058
5059         case CURLYM_B_fail: /* just failed to match a B */
5060             REGCP_UNWIND(ST.cp);
5061             if (ST.minmod) {
5062                 I32 max = ARG2(ST.me);
5063                 if (max != REG_INFTY && ST.count == max)
5064                     sayNO;
5065                 goto curlym_do_A; /* try to match a further A */
5066             }
5067             /* backtrack one A */
5068             if (ST.count == ARG1(ST.me) /* min */)
5069                 sayNO;
5070             ST.count--;
5071             locinput = HOPc(locinput, -ST.alen);
5072             goto curlym_do_B; /* try to match B */
5073
5074 #undef ST
5075 #define ST st->u.curly
5076
5077 #define CURLY_SETPAREN(paren, success) \
5078     if (paren) { \
5079         if (success) { \
5080             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5081             PL_regoffs[paren].end = locinput - PL_bostr; \
5082             *PL_reglastcloseparen = paren; \
5083         } \
5084         else \
5085             PL_regoffs[paren].end = -1; \
5086     }
5087
5088         case STAR:              /*  /A*B/ where A is width 1 */
5089             ST.paren = 0;
5090             ST.min = 0;
5091             ST.max = REG_INFTY;
5092             scan = NEXTOPER(scan);
5093             goto repeat;
5094         case PLUS:              /*  /A+B/ where A is width 1 */
5095             ST.paren = 0;
5096             ST.min = 1;
5097             ST.max = REG_INFTY;
5098             scan = NEXTOPER(scan);
5099             goto repeat;
5100         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5101             ST.paren = scan->flags;     /* Which paren to set */
5102             if (ST.paren > PL_regsize)
5103                 PL_regsize = ST.paren;
5104             if (ST.paren > *PL_reglastparen)
5105                 *PL_reglastparen = ST.paren;
5106             ST.min = ARG1(scan);  /* min to match */
5107             ST.max = ARG2(scan);  /* max to match */
5108             if (cur_eval && cur_eval->u.eval.close_paren &&
5109                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5110                 ST.min=1;
5111                 ST.max=1;
5112             }
5113             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5114             goto repeat;
5115         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5116             ST.paren = 0;
5117             ST.min = ARG1(scan);  /* min to match */
5118             ST.max = ARG2(scan);  /* max to match */
5119             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5120           repeat:
5121             /*
5122             * Lookahead to avoid useless match attempts
5123             * when we know what character comes next.
5124             *
5125             * Used to only do .*x and .*?x, but now it allows
5126             * for )'s, ('s and (?{ ... })'s to be in the way
5127             * of the quantifier and the EXACT-like node.  -- japhy
5128             */
5129
5130             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5131                 sayNO;
5132             if (HAS_TEXT(next) || JUMPABLE(next)) {
5133                 U8 *s;
5134                 regnode *text_node = next;
5135
5136                 if (! HAS_TEXT(text_node))
5137                     FIND_NEXT_IMPT(text_node);
5138
5139                 if (! HAS_TEXT(text_node))
5140                     ST.c1 = ST.c2 = CHRTEST_VOID;
5141                 else {
5142                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5143                         ST.c1 = ST.c2 = CHRTEST_VOID;
5144                         goto assume_ok_easy;
5145                     }
5146                     else
5147                         s = (U8*)STRING(text_node);
5148
5149                     /*  Currently we only get here when
5150
5151                         PL_regkind[OP(text_node)] == EXACT
5152
5153                         if this changes back then the macros for IS_TEXT and
5154                         friends need to change. */
5155                     if (!UTF_PATTERN) {
5156                         ST.c1 = *s;
5157                         switch (OP(text_node)) {
5158                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5159                             case EXACTFA:
5160                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5161                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5162                             default: ST.c2 = ST.c1; break;
5163                         }
5164                     }
5165                     else { /* UTF_PATTERN */
5166                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5167                              STRLEN ulen1, ulen2;
5168                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5169                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5170
5171                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5172                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5173 #ifdef EBCDIC
5174                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5175                                                     ckWARN(WARN_UTF8) ?
5176                                                     0 : UTF8_ALLOW_ANY);
5177                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5178                                                     ckWARN(WARN_UTF8) ?
5179                                                     0 : UTF8_ALLOW_ANY);
5180 #else
5181                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5182                                                     uniflags);
5183                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5184                                                     uniflags);
5185 #endif
5186                         }
5187                         else {
5188                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5189                                                      uniflags);
5190                         }
5191                     }
5192                 }
5193             }
5194             else
5195                 ST.c1 = ST.c2 = CHRTEST_VOID;
5196         assume_ok_easy:
5197
5198             ST.A = scan;
5199             ST.B = next;
5200             PL_reginput = locinput;
5201             if (minmod) {
5202                 minmod = 0;
5203                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5204                     sayNO;
5205                 ST.count = ST.min;
5206                 locinput = PL_reginput;
5207                 REGCP_SET(ST.cp);
5208                 if (ST.c1 == CHRTEST_VOID)
5209                     goto curly_try_B_min;
5210
5211                 ST.oldloc = locinput;
5212
5213                 /* set ST.maxpos to the furthest point along the
5214                  * string that could possibly match */
5215                 if  (ST.max == REG_INFTY) {
5216                     ST.maxpos = PL_regeol - 1;
5217                     if (utf8_target)
5218                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5219                             ST.maxpos--;
5220                 }
5221                 else if (utf8_target) {
5222                     int m = ST.max - ST.min;
5223                     for (ST.maxpos = locinput;
5224                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5225                         ST.maxpos += UTF8SKIP(ST.maxpos);
5226                 }
5227                 else {
5228                     ST.maxpos = locinput + ST.max - ST.min;
5229                     if (ST.maxpos >= PL_regeol)
5230                         ST.maxpos = PL_regeol - 1;
5231                 }
5232                 goto curly_try_B_min_known;
5233
5234             }
5235             else {
5236                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5237                 locinput = PL_reginput;
5238                 if (ST.count < ST.min)
5239                     sayNO;
5240                 if ((ST.count > ST.min)
5241                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5242                 {
5243                     /* A{m,n} must come at the end of the string, there's
5244                      * no point in backing off ... */
5245                     ST.min = ST.count;
5246                     /* ...except that $ and \Z can match before *and* after
5247                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5248                        We may back off by one in this case. */
5249                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5250                         ST.min--;
5251                 }
5252                 REGCP_SET(ST.cp);
5253                 goto curly_try_B_max;
5254             }
5255             /* NOTREACHED */
5256
5257
5258         case CURLY_B_min_known_fail:
5259             /* failed to find B in a non-greedy match where c1,c2 valid */
5260             if (ST.paren && ST.count)
5261                 PL_regoffs[ST.paren].end = -1;
5262
5263             PL_reginput = locinput;     /* Could be reset... */
5264             REGCP_UNWIND(ST.cp);
5265             /* Couldn't or didn't -- move forward. */
5266             ST.oldloc = locinput;
5267             if (utf8_target)
5268                 locinput += UTF8SKIP(locinput);
5269             else
5270                 locinput++;
5271             ST.count++;
5272           curly_try_B_min_known:
5273              /* find the next place where 'B' could work, then call B */
5274             {
5275                 int n;
5276                 if (utf8_target) {
5277                     n = (ST.oldloc == locinput) ? 0 : 1;
5278                     if (ST.c1 == ST.c2) {
5279                         STRLEN len;
5280                         /* set n to utf8_distance(oldloc, locinput) */
5281                         while (locinput <= ST.maxpos &&
5282                                utf8n_to_uvchr((U8*)locinput,
5283                                               UTF8_MAXBYTES, &len,
5284                                               uniflags) != (UV)ST.c1) {
5285                             locinput += len;
5286                             n++;
5287                         }
5288                     }
5289                     else {
5290                         /* set n to utf8_distance(oldloc, locinput) */
5291                         while (locinput <= ST.maxpos) {
5292                             STRLEN len;
5293                             const UV c = utf8n_to_uvchr((U8*)locinput,
5294                                                   UTF8_MAXBYTES, &len,
5295                                                   uniflags);
5296                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5297                                 break;
5298                             locinput += len;
5299                             n++;
5300                         }
5301                     }
5302                 }
5303                 else {
5304                     if (ST.c1 == ST.c2) {
5305                         while (locinput <= ST.maxpos &&
5306                                UCHARAT(locinput) != ST.c1)
5307                             locinput++;
5308                     }
5309                     else {
5310                         while (locinput <= ST.maxpos
5311                                && UCHARAT(locinput) != ST.c1
5312                                && UCHARAT(locinput) != ST.c2)
5313                             locinput++;
5314                     }
5315                     n = locinput - ST.oldloc;
5316                 }
5317                 if (locinput > ST.maxpos)
5318                     sayNO;
5319                 /* PL_reginput == oldloc now */
5320                 if (n) {
5321                     ST.count += n;
5322                     if (regrepeat(rex, ST.A, n, depth) < n)
5323                         sayNO;
5324                 }
5325                 PL_reginput = locinput;
5326                 CURLY_SETPAREN(ST.paren, ST.count);
5327                 if (cur_eval && cur_eval->u.eval.close_paren &&
5328                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5329                     goto fake_end;
5330                 }
5331                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5332             }
5333             /* NOTREACHED */
5334
5335
5336         case CURLY_B_min_fail:
5337             /* failed to find B in a non-greedy match where c1,c2 invalid */
5338             if (ST.paren && ST.count)
5339                 PL_regoffs[ST.paren].end = -1;
5340
5341             REGCP_UNWIND(ST.cp);
5342             /* failed -- move forward one */
5343             PL_reginput = locinput;
5344             if (regrepeat(rex, ST.A, 1, depth)) {
5345                 ST.count++;
5346                 locinput = PL_reginput;
5347                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5348                         ST.count > 0)) /* count overflow ? */
5349                 {
5350                   curly_try_B_min:
5351                     CURLY_SETPAREN(ST.paren, ST.count);
5352                     if (cur_eval && cur_eval->u.eval.close_paren &&
5353                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5354                         goto fake_end;
5355                     }
5356                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5357                 }
5358             }
5359             sayNO;
5360             /* NOTREACHED */
5361
5362
5363         curly_try_B_max:
5364             /* a successful greedy match: now try to match B */
5365             if (cur_eval && cur_eval->u.eval.close_paren &&
5366                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5367                 goto fake_end;
5368             }
5369             {
5370                 UV c = 0;
5371                 if (ST.c1 != CHRTEST_VOID)
5372                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5373                                            UTF8_MAXBYTES, 0, uniflags)
5374                                 : (UV) UCHARAT(PL_reginput);
5375                 /* If it could work, try it. */
5376                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5377                     CURLY_SETPAREN(ST.paren, ST.count);
5378                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5379                     /* NOTREACHED */
5380                 }
5381             }
5382             /* FALL THROUGH */
5383         case CURLY_B_max_fail:
5384             /* failed to find B in a greedy match */
5385             if (ST.paren && ST.count)
5386                 PL_regoffs[ST.paren].end = -1;
5387
5388             REGCP_UNWIND(ST.cp);
5389             /*  back up. */
5390             if (--ST.count < ST.min)
5391                 sayNO;
5392             PL_reginput = locinput = HOPc(locinput, -1);
5393             goto curly_try_B_max;
5394
5395 #undef ST
5396
5397         case END:
5398             fake_end:
5399             if (cur_eval) {
5400                 /* we've just finished A in /(??{A})B/; now continue with B */
5401                 I32 tmpix;
5402                 st->u.eval.toggle_reg_flags
5403                             = cur_eval->u.eval.toggle_reg_flags;
5404                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5405
5406                 st->u.eval.prev_rex = rex_sv;           /* inner */
5407                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5408                 rex = (struct regexp *)SvANY(rex_sv);
5409                 rexi = RXi_GET(rex);
5410                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5411                 (void)ReREFCNT_inc(rex_sv);
5412                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5413
5414                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5415                 PL_reglastparen = &rex->lastparen;
5416                 PL_reglastcloseparen = &rex->lastcloseparen;
5417
5418                 REGCP_SET(st->u.eval.lastcp);
5419                 PL_reginput = locinput;
5420
5421                 /* Restore parens of the outer rex without popping the
5422                  * savestack */
5423                 tmpix = PL_savestack_ix;
5424                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5425                 regcppop(rex);
5426                 PL_savestack_ix = tmpix;
5427
5428                 st->u.eval.prev_eval = cur_eval;
5429                 cur_eval = cur_eval->u.eval.prev_eval;
5430                 DEBUG_EXECUTE_r(
5431                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5432                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5433                 if ( nochange_depth )
5434                     nochange_depth--;
5435
5436                 PUSH_YES_STATE_GOTO(EVAL_AB,
5437                         st->u.eval.prev_eval->u.eval.B); /* match B */
5438             }
5439
5440             if (locinput < reginfo->till) {
5441                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5442                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5443                                       PL_colors[4],
5444                                       (long)(locinput - PL_reg_starttry),
5445                                       (long)(reginfo->till - PL_reg_starttry),
5446                                       PL_colors[5]));
5447
5448                 sayNO_SILENT;           /* Cannot match: too short. */
5449             }
5450             PL_reginput = locinput;     /* put where regtry can find it */
5451             sayYES;                     /* Success! */
5452
5453         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5454             DEBUG_EXECUTE_r(
5455             PerlIO_printf(Perl_debug_log,
5456                 "%*s  %ssubpattern success...%s\n",
5457                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5458             PL_reginput = locinput;     /* put where regtry can find it */
5459             sayYES;                     /* Success! */
5460
5461 #undef  ST
5462 #define ST st->u.ifmatch
5463
5464         case SUSPEND:   /* (?>A) */
5465             ST.wanted = 1;
5466             PL_reginput = locinput;
5467             goto do_ifmatch;
5468
5469         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5470             ST.wanted = 0;
5471             goto ifmatch_trivial_fail_test;
5472
5473         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5474             ST.wanted = 1;
5475           ifmatch_trivial_fail_test:
5476             if (scan->flags) {
5477                 char * const s = HOPBACKc(locinput, scan->flags);
5478                 if (!s) {
5479                     /* trivial fail */
5480                     if (logical) {
5481                         logical = 0;
5482                         sw = 1 - cBOOL(ST.wanted);
5483                     }
5484                     else if (ST.wanted)
5485                         sayNO;
5486                     next = scan + ARG(scan);
5487                     if (next == scan)
5488                         next = NULL;
5489                     break;
5490                 }
5491                 PL_reginput = s;
5492             }
5493             else
5494                 PL_reginput = locinput;
5495
5496           do_ifmatch:
5497             ST.me = scan;
5498             ST.logical = logical;
5499             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5500
5501             /* execute body of (?...A) */
5502             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5503             /* NOTREACHED */
5504
5505         case IFMATCH_A_fail: /* body of (?...A) failed */
5506             ST.wanted = !ST.wanted;
5507             /* FALL THROUGH */
5508
5509         case IFMATCH_A: /* body of (?...A) succeeded */
5510             if (ST.logical) {
5511                 sw = cBOOL(ST.wanted);
5512             }
5513             else if (!ST.wanted)
5514                 sayNO;
5515
5516             if (OP(ST.me) == SUSPEND)
5517                 locinput = PL_reginput;
5518             else {
5519                 locinput = PL_reginput = st->locinput;
5520                 nextchr = UCHARAT(locinput);
5521             }
5522             scan = ST.me + ARG(ST.me);
5523             if (scan == ST.me)
5524                 scan = NULL;
5525             continue; /* execute B */
5526
5527 #undef ST
5528
5529         case LONGJMP:
5530             next = scan + ARG(scan);
5531             if (next == scan)
5532                 next = NULL;
5533             break;
5534         case COMMIT:
5535             reginfo->cutpoint = PL_regeol;
5536             /* FALLTHROUGH */
5537         case PRUNE:
5538             PL_reginput = locinput;
5539             if (!scan->flags)
5540                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5541             PUSH_STATE_GOTO(COMMIT_next,next);
5542             /* NOTREACHED */
5543         case COMMIT_next_fail:
5544             no_final = 1;
5545             /* FALLTHROUGH */
5546         case OPFAIL:
5547             sayNO;
5548             /* NOTREACHED */
5549
5550 #define ST st->u.mark
5551         case MARKPOINT:
5552             ST.prev_mark = mark_state;
5553             ST.mark_name = sv_commit = sv_yes_mark
5554                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5555             mark_state = st;
5556             ST.mark_loc = PL_reginput = locinput;
5557             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5558             /* NOTREACHED */
5559         case MARKPOINT_next:
5560             mark_state = ST.prev_mark;
5561             sayYES;
5562             /* NOTREACHED */
5563         case MARKPOINT_next_fail:
5564             if (popmark && sv_eq(ST.mark_name,popmark))
5565             {
5566                 if (ST.mark_loc > startpoint)
5567                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5568                 popmark = NULL; /* we found our mark */
5569                 sv_commit = ST.mark_name;
5570
5571                 DEBUG_EXECUTE_r({
5572                         PerlIO_printf(Perl_debug_log,
5573                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5574                             REPORT_CODE_OFF+depth*2, "",
5575                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5576                 });
5577             }
5578             mark_state = ST.prev_mark;
5579             sv_yes_mark = mark_state ?
5580                 mark_state->u.mark.mark_name : NULL;
5581             sayNO;
5582             /* NOTREACHED */
5583         case SKIP:
5584             PL_reginput = locinput;
5585             if (scan->flags) {
5586                 /* (*SKIP) : if we fail we cut here*/
5587                 ST.mark_name = NULL;
5588                 ST.mark_loc = locinput;
5589                 PUSH_STATE_GOTO(SKIP_next,next);
5590             } else {
5591                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5592                    otherwise do nothing.  Meaning we need to scan
5593                  */
5594                 regmatch_state *cur = mark_state;
5595                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5596
5597                 while (cur) {
5598                     if ( sv_eq( cur->u.mark.mark_name,
5599                                 find ) )
5600                     {
5601                         ST.mark_name = find;
5602                         PUSH_STATE_GOTO( SKIP_next, next );
5603                     }
5604                     cur = cur->u.mark.prev_mark;
5605                 }
5606             }
5607             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5608             break;
5609         case SKIP_next_fail:
5610             if (ST.mark_name) {
5611                 /* (*CUT:NAME) - Set up to search for the name as we
5612                    collapse the stack*/
5613                 popmark = ST.mark_name;
5614             } else {
5615                 /* (*CUT) - No name, we cut here.*/
5616                 if (ST.mark_loc > startpoint)
5617                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5618                 /* but we set sv_commit to latest mark_name if there
5619                    is one so they can test to see how things lead to this
5620                    cut */
5621                 if (mark_state)
5622                     sv_commit=mark_state->u.mark.mark_name;
5623             }
5624             no_final = 1;
5625             sayNO;
5626             /* NOTREACHED */
5627 #undef ST
5628         case FOLDCHAR:
5629             n = ARG(scan);
5630             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5631                 locinput += ln;
5632             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5633                 sayNO;
5634             } else  {
5635                 U8 folded[UTF8_MAXBYTES_CASE+1];
5636                 STRLEN foldlen;
5637                 const char * const l = locinput;
5638                 char *e = PL_regeol;
5639                 to_uni_fold(n, folded, &foldlen);
5640
5641                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5642                                l, &e, 0,  utf8_target)) {
5643                         sayNO;
5644                 }
5645                 locinput = e;
5646             }
5647             nextchr = UCHARAT(locinput);
5648             break;
5649         case LNBREAK:
5650             if ((n=is_LNBREAK(locinput,utf8_target))) {
5651                 locinput += n;
5652                 nextchr = UCHARAT(locinput);
5653             } else
5654                 sayNO;
5655             break;
5656
5657 #define CASE_CLASS(nAmE)                              \
5658         case nAmE:                                    \
5659             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5660                 locinput += n;                        \
5661                 nextchr = UCHARAT(locinput);          \
5662             } else                                    \
5663                 sayNO;                                \
5664             break;                                    \
5665         case N##nAmE:                                 \
5666             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5667                 sayNO;                                \
5668             } else {                                  \
5669                 locinput += UTF8SKIP(locinput);       \
5670                 nextchr = UCHARAT(locinput);          \
5671             }                                         \
5672             break
5673
5674         CASE_CLASS(VERTWS);
5675         CASE_CLASS(HORIZWS);
5676 #undef CASE_CLASS
5677
5678         default:
5679             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5680                           PTR2UV(scan), OP(scan));
5681             Perl_croak(aTHX_ "regexp memory corruption");
5682
5683         } /* end switch */
5684
5685         /* switch break jumps here */
5686         scan = next; /* prepare to execute the next op and ... */
5687         continue;    /* ... jump back to the top, reusing st */
5688         /* NOTREACHED */
5689
5690       push_yes_state:
5691         /* push a state that backtracks on success */
5692         st->u.yes.prev_yes_state = yes_state;
5693         yes_state = st;
5694         /* FALL THROUGH */
5695       push_state:
5696         /* push a new regex state, then continue at scan  */
5697         {
5698             regmatch_state *newst;
5699
5700             DEBUG_STACK_r({
5701                 regmatch_state *cur = st;
5702                 regmatch_state *curyes = yes_state;
5703                 int curd = depth;
5704                 regmatch_slab *slab = PL_regmatch_slab;
5705                 for (;curd > -1;cur--,curd--) {
5706                     if (cur < SLAB_FIRST(slab)) {
5707                         slab = slab->prev;
5708                         cur = SLAB_LAST(slab);
5709                     }
5710                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5711                         REPORT_CODE_OFF + 2 + depth * 2,"",
5712                         curd, PL_reg_name[cur->resume_state],
5713                         (curyes == cur) ? "yes" : ""
5714                     );
5715                     if (curyes == cur)
5716                         curyes = cur->u.yes.prev_yes_state;
5717                 }
5718             } else
5719                 DEBUG_STATE_pp("push")
5720             );
5721             depth++;
5722             st->locinput = locinput;
5723             newst = st+1;
5724             if (newst >  SLAB_LAST(PL_regmatch_slab))
5725                 newst = S_push_slab(aTHX);
5726             PL_regmatch_state = newst;
5727
5728             locinput = PL_reginput;
5729             nextchr = UCHARAT(locinput);
5730             st = newst;
5731             continue;
5732             /* NOTREACHED */
5733         }
5734     }
5735
5736     /*
5737     * We get here only if there's trouble -- normally "case END" is
5738     * the terminating point.
5739     */
5740     Perl_croak(aTHX_ "corrupted regexp pointers");
5741     /*NOTREACHED*/
5742     sayNO;
5743
5744 yes:
5745     if (yes_state) {
5746         /* we have successfully completed a subexpression, but we must now
5747          * pop to the state marked by yes_state and continue from there */
5748         assert(st != yes_state);
5749 #ifdef DEBUGGING
5750         while (st != yes_state) {
5751             st--;
5752             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5753                 PL_regmatch_slab = PL_regmatch_slab->prev;
5754                 st = SLAB_LAST(PL_regmatch_slab);
5755             }
5756             DEBUG_STATE_r({
5757                 if (no_final) {
5758                     DEBUG_STATE_pp("pop (no final)");
5759                 } else {
5760                     DEBUG_STATE_pp("pop (yes)");
5761                 }
5762             });
5763             depth--;
5764         }
5765 #else
5766         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5767             || yes_state > SLAB_LAST(PL_regmatch_slab))
5768         {
5769             /* not in this slab, pop slab */
5770             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5771             PL_regmatch_slab = PL_regmatch_slab->prev;
5772             st = SLAB_LAST(PL_regmatch_slab);
5773         }
5774         depth -= (st - yes_state);
5775 #endif
5776         st = yes_state;
5777         yes_state = st->u.yes.prev_yes_state;
5778         PL_regmatch_state = st;
5779
5780         if (no_final) {
5781             locinput= st->locinput;
5782             nextchr = UCHARAT(locinput);
5783         }
5784         state_num = st->resume_state + no_final;
5785         goto reenter_switch;
5786     }
5787
5788     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5789                           PL_colors[4], PL_colors[5]));
5790
5791     if (PL_reg_eval_set) {
5792         /* each successfully executed (?{...}) block does the equivalent of
5793          *   local $^R = do {...}
5794          * When popping the save stack, all these locals would be undone;
5795          * bypass this by setting the outermost saved $^R to the latest
5796          * value */
5797         if (oreplsv != GvSV(PL_replgv))
5798             sv_setsv(oreplsv, GvSV(PL_replgv));
5799     }
5800     result = 1;
5801     goto final_exit;
5802
5803 no:
5804     DEBUG_EXECUTE_r(
5805         PerlIO_printf(Perl_debug_log,
5806             "%*s  %sfailed...%s\n",
5807             REPORT_CODE_OFF+depth*2, "",
5808             PL_colors[4], PL_colors[5])
5809         );
5810
5811 no_silent:
5812     if (no_final) {
5813         if (yes_state) {
5814             goto yes;
5815         } else {
5816             goto final_exit;
5817         }
5818     }
5819     if (depth) {
5820         /* there's a previous state to backtrack to */
5821         st--;
5822         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5823             PL_regmatch_slab = PL_regmatch_slab->prev;
5824             st = SLAB_LAST(PL_regmatch_slab);
5825         }
5826         PL_regmatch_state = st;
5827         locinput= st->locinput;
5828         nextchr = UCHARAT(locinput);
5829
5830         DEBUG_STATE_pp("pop");
5831         depth--;
5832         if (yes_state == st)
5833             yes_state = st->u.yes.prev_yes_state;
5834
5835         state_num = st->resume_state + 1; /* failure = success + 1 */
5836         goto reenter_switch;
5837     }
5838     result = 0;
5839
5840   final_exit:
5841     if (rex->intflags & PREGf_VERBARG_SEEN) {
5842         SV *sv_err = get_sv("REGERROR", 1);
5843         SV *sv_mrk = get_sv("REGMARK", 1);
5844         if (result) {
5845             sv_commit = &PL_sv_no;
5846             if (!sv_yes_mark)
5847                 sv_yes_mark = &PL_sv_yes;
5848         } else {
5849             if (!sv_commit)
5850                 sv_commit = &PL_sv_yes;
5851             sv_yes_mark = &PL_sv_no;
5852         }
5853         sv_setsv(sv_err, sv_commit);
5854         sv_setsv(sv_mrk, sv_yes_mark);
5855     }
5856
5857     /* clean up; in particular, free all slabs above current one */
5858     LEAVE_SCOPE(oldsave);
5859
5860     return result;
5861 }
5862
5863 /*
5864  - regrepeat - repeatedly match something simple, report how many
5865  */
5866 /*
5867  * [This routine now assumes that it will only match on things of length 1.
5868  * That was true before, but now we assume scan - reginput is the count,
5869  * rather than incrementing count on every character.  [Er, except utf8.]]
5870  */
5871 STATIC I32
5872 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5873 {
5874     dVAR;
5875     register char *scan;
5876     register I32 c;
5877     register char *loceol = PL_regeol;
5878     register I32 hardcount = 0;
5879     register bool utf8_target = PL_reg_match_utf8;
5880     UV utf8_flags;
5881 #ifndef DEBUGGING
5882     PERL_UNUSED_ARG(depth);
5883 #endif
5884
5885     PERL_ARGS_ASSERT_REGREPEAT;
5886
5887     scan = PL_reginput;
5888     if (max == REG_INFTY)
5889         max = I32_MAX;
5890     else if (max < loceol - scan)
5891         loceol = scan + max;
5892     switch (OP(p)) {
5893     case REG_ANY:
5894         if (utf8_target) {
5895             loceol = PL_regeol;
5896             while (scan < loceol && hardcount < max && *scan != '\n') {
5897                 scan += UTF8SKIP(scan);
5898                 hardcount++;
5899             }
5900         } else {
5901             while (scan < loceol && *scan != '\n')
5902                 scan++;
5903         }
5904         break;
5905     case SANY:
5906         if (utf8_target) {
5907             loceol = PL_regeol;
5908             while (scan < loceol && hardcount < max) {
5909                 scan += UTF8SKIP(scan);
5910                 hardcount++;
5911             }
5912         }
5913         else
5914             scan = loceol;
5915         break;
5916     case CANY:
5917         scan = loceol;
5918         break;
5919     case EXACT:
5920         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5921          * means they match only characters in the string that can be expressed
5922          * as a single byte.  For non-utf8 strings, that means a simple match.
5923          * For utf8 strings, the character matched must be an invariant, or
5924          * downgradable to a single byte.  The pattern's utf8ness is
5925          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5926          * it is, it's an invariant */
5927
5928         c = (U8)*STRING(p);
5929         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5930
5931         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5932             while (scan < loceol && UCHARAT(scan) == c) {
5933                 scan++;
5934             }
5935         }
5936         else {
5937
5938             /* Here, the string is utf8, and the pattern char is different
5939              * in utf8 than not, so can't compare them directly.  Outside the
5940              * loop, find find the two utf8 bytes that represent c, and then
5941              * look for those in sequence in the utf8 string */
5942             U8 high = UTF8_TWO_BYTE_HI(c);
5943             U8 low = UTF8_TWO_BYTE_LO(c);
5944             loceol = PL_regeol;
5945
5946             while (hardcount < max
5947                     && scan + 1 < loceol
5948                     && UCHARAT(scan) == high
5949                     && UCHARAT(scan + 1) == low)
5950             {
5951                 scan += 2;
5952                 hardcount++;
5953             }
5954         }
5955         break;
5956     case EXACTFA:
5957         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5958         goto do_exactf;
5959
5960     case EXACTFL:
5961         PL_reg_flags |= RF_tainted;
5962         utf8_flags = FOLDEQ_UTF8_LOCALE;
5963         goto do_exactf;
5964
5965     case EXACTF:
5966     case EXACTFU:
5967         utf8_flags = 0;
5968
5969         /* The comments for the EXACT case above apply as well to these fold
5970          * ones */
5971
5972     do_exactf:
5973         c = (U8)*STRING(p);
5974         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5975
5976         if (utf8_target) { /* Use full Unicode fold matching */
5977             char *tmpeol = loceol;
5978             while (hardcount < max
5979                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5980                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5981             {
5982                 scan = tmpeol;
5983                 tmpeol = loceol;
5984                 hardcount++;
5985             }
5986
5987             /* XXX Note that the above handles properly the German sharp s in
5988              * the pattern matching ss in the string.  But it doesn't handle
5989              * properly cases where the string contains say 'LIGATURE ff' and
5990              * the pattern is 'f+'.  This would require, say, a new function or
5991              * revised interface to foldEQ_utf8(), in which the maximum number
5992              * of characters to match could be passed and it would return how
5993              * many actually did.  This is just one of many cases where
5994              * multi-char folds don't work properly, and so the fix is being
5995              * deferred */
5996         }
5997         else {
5998             U8 folded;
5999
6000             /* Here, the string isn't utf8 and c is a single byte; and either
6001              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6002              * doesn't affect c.  Can just do simple comparisons for exact or
6003              * fold matching. */
6004             switch (OP(p)) {
6005                 case EXACTF: folded = PL_fold[c]; break;
6006                 case EXACTFA:
6007                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6008                 case EXACTFL: folded = PL_fold_locale[c]; break;
6009                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6010             }
6011             while (scan < loceol &&
6012                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6013             {
6014                 scan++;
6015             }
6016         }
6017         break;
6018     case ANYOFV:
6019     case ANYOF:
6020         if (utf8_target || OP(p) == ANYOFV) {
6021             STRLEN inclasslen;
6022             loceol = PL_regeol;
6023             inclasslen = loceol - scan;
6024             while (hardcount < max
6025                    && ((inclasslen = loceol - scan) > 0)
6026                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6027             {
6028                 scan += inclasslen;
6029                 hardcount++;
6030             }
6031         } else {
6032             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6033                 scan++;
6034         }
6035         break;
6036     case ALNUMU:
6037         if (utf8_target) {
6038     utf8_wordchar:
6039             loceol = PL_regeol;
6040             LOAD_UTF8_CHARCLASS_ALNUM();
6041             while (hardcount < max && scan < loceol &&
6042                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6043             {
6044                 scan += UTF8SKIP(scan);
6045                 hardcount++;
6046             }
6047         } else {
6048             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6049                 scan++;
6050             }
6051         }
6052         break;
6053     case ALNUM:
6054         if (utf8_target)
6055             goto utf8_wordchar;
6056         while (scan < loceol && isALNUM((U8) *scan)) {
6057             scan++;
6058         }
6059         break;
6060     case ALNUMA:
6061         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6062             scan++;
6063         }
6064         break;
6065     case ALNUML:
6066         PL_reg_flags |= RF_tainted;
6067         if (utf8_target) {
6068             loceol = PL_regeol;
6069             while (hardcount < max && scan < loceol &&
6070                    isALNUM_LC_utf8((U8*)scan)) {
6071                 scan += UTF8SKIP(scan);
6072                 hardcount++;
6073             }
6074         } else {
6075             while (scan < loceol && isALNUM_LC(*scan))
6076                 scan++;
6077         }
6078         break;
6079     case NALNUMU:
6080         if (utf8_target) {
6081
6082     utf8_Nwordchar:
6083
6084             loceol = PL_regeol;
6085             LOAD_UTF8_CHARCLASS_ALNUM();
6086             while (hardcount < max && scan < loceol &&
6087                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6088             {
6089                 scan += UTF8SKIP(scan);
6090                 hardcount++;
6091             }
6092         } else {
6093             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6094                 scan++;
6095             }
6096         }
6097         break;
6098     case NALNUM:
6099         if (utf8_target)
6100             goto utf8_Nwordchar;
6101         while (scan < loceol && ! isALNUM((U8) *scan)) {
6102             scan++;
6103         }
6104         break;
6105     case NALNUMA:
6106         if (utf8_target) {
6107             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6108                 scan += UTF8SKIP(scan);
6109             }
6110         }
6111         else {
6112             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6113                 scan++;
6114             }
6115         }
6116         break;
6117     case NALNUML:
6118         PL_reg_flags |= RF_tainted;
6119         if (utf8_target) {
6120             loceol = PL_regeol;
6121             while (hardcount < max && scan < loceol &&
6122                    !isALNUM_LC_utf8((U8*)scan)) {
6123                 scan += UTF8SKIP(scan);
6124                 hardcount++;
6125             }
6126         } else {
6127             while (scan < loceol && !isALNUM_LC(*scan))
6128                 scan++;
6129         }
6130         break;
6131     case SPACEU:
6132         if (utf8_target) {
6133
6134     utf8_space:
6135
6136             loceol = PL_regeol;
6137             LOAD_UTF8_CHARCLASS_SPACE();
6138             while (hardcount < max && scan < loceol &&
6139                    (*scan == ' ' ||
6140                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6141             {
6142                 scan += UTF8SKIP(scan);
6143                 hardcount++;
6144             }
6145             break;
6146         }
6147         else {
6148             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6149                 scan++;
6150             }
6151             break;
6152         }
6153     case SPACE:
6154         if (utf8_target)
6155             goto utf8_space;
6156
6157         while (scan < loceol && isSPACE((U8) *scan)) {
6158             scan++;
6159         }
6160         break;
6161     case SPACEA:
6162         while (scan < loceol && isSPACE_A((U8) *scan)) {
6163             scan++;
6164         }
6165         break;
6166     case SPACEL:
6167         PL_reg_flags |= RF_tainted;
6168         if (utf8_target) {
6169             loceol = PL_regeol;
6170             while (hardcount < max && scan < loceol &&
6171                    isSPACE_LC_utf8((U8*)scan)) {
6172                 scan += UTF8SKIP(scan);
6173                 hardcount++;
6174             }
6175         } else {
6176             while (scan < loceol && isSPACE_LC(*scan))
6177                 scan++;
6178         }
6179         break;
6180     case NSPACEU:
6181         if (utf8_target) {
6182
6183     utf8_Nspace:
6184
6185             loceol = PL_regeol;
6186             LOAD_UTF8_CHARCLASS_SPACE();
6187             while (hardcount < max && scan < loceol &&
6188                    ! (*scan == ' ' ||
6189                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6190             {
6191                 scan += UTF8SKIP(scan);
6192                 hardcount++;
6193             }
6194             break;
6195         }
6196         else {
6197             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6198                 scan++;
6199             }
6200         }
6201         break;
6202     case NSPACE:
6203         if (utf8_target)
6204             goto utf8_Nspace;
6205
6206         while (scan < loceol && ! isSPACE((U8) *scan)) {
6207             scan++;
6208         }
6209         break;
6210     case NSPACEA:
6211         if (utf8_target) {
6212             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6213                 scan += UTF8SKIP(scan);
6214             }
6215         }
6216         else {
6217             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6218                 scan++;
6219             }
6220         }
6221         break;
6222     case NSPACEL:
6223         PL_reg_flags |= RF_tainted;
6224         if (utf8_target) {
6225             loceol = PL_regeol;
6226             while (hardcount < max && scan < loceol &&
6227                    !isSPACE_LC_utf8((U8*)scan)) {
6228                 scan += UTF8SKIP(scan);
6229                 hardcount++;
6230             }
6231         } else {
6232             while (scan < loceol && !isSPACE_LC(*scan))
6233                 scan++;
6234         }
6235         break;
6236     case DIGIT:
6237         if (utf8_target) {
6238             loceol = PL_regeol;
6239             LOAD_UTF8_CHARCLASS_DIGIT();
6240             while (hardcount < max && scan < loceol &&
6241                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6242                 scan += UTF8SKIP(scan);
6243                 hardcount++;
6244             }
6245         } else {
6246             while (scan < loceol && isDIGIT(*scan))
6247                 scan++;
6248         }
6249         break;
6250     case DIGITA:
6251         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6252             scan++;
6253         }
6254         break;
6255     case DIGITL:
6256         PL_reg_flags |= RF_tainted;
6257         if (utf8_target) {
6258             loceol = PL_regeol;
6259             while (hardcount < max && scan < loceol &&
6260                    isDIGIT_LC_utf8((U8*)scan)) {
6261                 scan += UTF8SKIP(scan);
6262                 hardcount++;
6263             }
6264         } else {
6265             while (scan < loceol && isDIGIT_LC(*scan))
6266                 scan++;
6267         }
6268         break;
6269     case NDIGIT:
6270         if (utf8_target) {
6271             loceol = PL_regeol;
6272             LOAD_UTF8_CHARCLASS_DIGIT();
6273             while (hardcount < max && scan < loceol &&
6274                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6275                 scan += UTF8SKIP(scan);
6276                 hardcount++;
6277             }
6278         } else {
6279             while (scan < loceol && !isDIGIT(*scan))
6280                 scan++;
6281         }
6282         break;
6283     case NDIGITA:
6284         if (utf8_target) {
6285             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6286                 scan += UTF8SKIP(scan);
6287             }
6288         }
6289         else {
6290             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6291                 scan++;
6292             }
6293         }
6294         break;
6295     case NDIGITL:
6296         PL_reg_flags |= RF_tainted;
6297         if (utf8_target) {
6298             loceol = PL_regeol;
6299             while (hardcount < max && scan < loceol &&
6300                    !isDIGIT_LC_utf8((U8*)scan)) {
6301                 scan += UTF8SKIP(scan);
6302                 hardcount++;
6303             }
6304         } else {
6305             while (scan < loceol && !isDIGIT_LC(*scan))
6306                 scan++;
6307         }
6308         break;
6309     case LNBREAK:
6310         if (utf8_target) {
6311             loceol = PL_regeol;
6312             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6313                 scan += c;
6314                 hardcount++;
6315             }
6316         } else {
6317             /*
6318               LNBREAK can match two latin chars, which is ok,
6319               because we have a null terminated string, but we
6320               have to use hardcount in this situation
6321             */
6322             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6323                 scan+=c;
6324                 hardcount++;
6325             }
6326         }
6327         break;
6328     case HORIZWS:
6329         if (utf8_target) {
6330             loceol = PL_regeol;
6331             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6332                 scan += c;
6333                 hardcount++;
6334             }
6335         } else {
6336             while (scan < loceol && is_HORIZWS_latin1(scan))
6337                 scan++;
6338         }
6339         break;
6340     case NHORIZWS:
6341         if (utf8_target) {
6342             loceol = PL_regeol;
6343             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6344                 scan += UTF8SKIP(scan);
6345                 hardcount++;
6346             }
6347         } else {
6348             while (scan < loceol && !is_HORIZWS_latin1(scan))
6349                 scan++;
6350
6351         }
6352         break;
6353     case VERTWS:
6354         if (utf8_target) {
6355             loceol = PL_regeol;
6356             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6357                 scan += c;
6358                 hardcount++;
6359             }
6360         } else {
6361             while (scan < loceol && is_VERTWS_latin1(scan))
6362                 scan++;
6363
6364         }
6365         break;
6366     case NVERTWS:
6367         if (utf8_target) {
6368             loceol = PL_regeol;
6369             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6370                 scan += UTF8SKIP(scan);
6371                 hardcount++;
6372             }
6373         } else {
6374             while (scan < loceol && !is_VERTWS_latin1(scan))
6375                 scan++;
6376
6377         }
6378         break;
6379
6380     default:            /* Called on something of 0 width. */
6381         break;          /* So match right here or not at all. */
6382     }
6383
6384     if (hardcount)
6385         c = hardcount;
6386     else
6387         c = scan - PL_reginput;
6388     PL_reginput = scan;
6389
6390     DEBUG_r({
6391         GET_RE_DEBUG_FLAGS_DECL;
6392         DEBUG_EXECUTE_r({
6393             SV * const prop = sv_newmortal();
6394             regprop(prog, prop, p);
6395             PerlIO_printf(Perl_debug_log,
6396                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6397                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6398         });
6399     });
6400
6401     return(c);
6402 }
6403
6404
6405 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6406 /*
6407 - regclass_swash - prepare the utf8 swash
6408 */
6409
6410 SV *
6411 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6412 {
6413     dVAR;
6414     SV *sw  = NULL;
6415     SV *si  = NULL;
6416     SV *alt = NULL;
6417     RXi_GET_DECL(prog,progi);
6418     const struct reg_data * const data = prog ? progi->data : NULL;
6419
6420     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6421
6422     assert(ANYOF_NONBITMAP(node));
6423
6424     if (data && data->count) {
6425         const U32 n = ARG(node);
6426
6427         if (data->what[n] == 's') {
6428             SV * const rv = MUTABLE_SV(data->data[n]);
6429             AV * const av = MUTABLE_AV(SvRV(rv));
6430             SV **const ary = AvARRAY(av);
6431             SV **a, **b;
6432
6433             /* See the end of regcomp.c:S_regclass() for
6434              * documentation of these array elements. */
6435
6436             si = *ary;
6437             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6438             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6439
6440             if (a)
6441                 sw = *a;
6442             else if (si && doinit) {
6443                 sw = swash_init("utf8", "", si, 1, 0);
6444                 (void)av_store(av, 1, sw);
6445             }
6446             if (b)
6447                 alt = *b;
6448         }
6449     }
6450
6451     if (listsvp)
6452         *listsvp = si;
6453     if (altsvp)
6454         *altsvp  = alt;
6455
6456     return sw;
6457 }
6458 #endif
6459
6460 /*
6461  - reginclass - determine if a character falls into a character class
6462
6463   n is the ANYOF regnode
6464   p is the target string
6465   lenp is pointer to the maximum number of bytes of how far to go in p
6466     (This is assumed without checking to always be at least the current
6467     character's size)
6468   utf8_target tells whether p is in UTF-8.
6469
6470   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6471   from a successful match, the value it points to will be updated to how many
6472   bytes in p were matched.  If there was no match, the value is undefined,
6473   possibly changed from the input.
6474
6475   Note that this can be a synthetic start class, a combination of various
6476   nodes, so things you think might be mutually exclusive, such as locale,
6477   aren't.  It can match both locale and non-locale
6478
6479  */
6480
6481 STATIC bool
6482 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6483 {
6484     dVAR;
6485     const char flags = ANYOF_FLAGS(n);
6486     bool match = FALSE;
6487     UV c = *p;
6488     STRLEN c_len = 0;
6489     STRLEN maxlen;
6490
6491     PERL_ARGS_ASSERT_REGINCLASS;
6492
6493     /* If c is not already the code point, get it */
6494     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6495         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6496                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6497                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6498                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6499                  * UTF8_ALLOW_FFFF */
6500         if (c_len == (STRLEN)-1)
6501             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6502     }
6503     else {
6504         c_len = 1;
6505     }
6506
6507     /* Use passed in max length, or one character if none passed in or less
6508      * than one character.  And assume will match just one character.  This is
6509      * overwritten later if matched more. */
6510     if (lenp) {
6511         maxlen = (*lenp > c_len) ? *lenp : c_len;
6512         *lenp = c_len;
6513
6514     }
6515     else {
6516         maxlen = c_len;
6517     }
6518
6519     /* If this character is potentially in the bitmap, check it */
6520     if (c < 256) {
6521         if (ANYOF_BITMAP_TEST(n, c))
6522             match = TRUE;
6523         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6524                 && ! utf8_target
6525                 && ! isASCII(c))
6526         {
6527             match = TRUE;
6528         }
6529
6530         else if (flags & ANYOF_LOCALE) {
6531             PL_reg_flags |= RF_tainted;
6532
6533             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6534                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6535             {
6536                 match = TRUE;
6537             }
6538             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6539                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6540                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6541                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6542                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6543                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6544                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6545                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6546                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6547                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6548                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6549                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6550                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6566                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6567                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6568                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6569                      ) /* How's that for a conditional? */
6570             ) {
6571                 match = TRUE;
6572             }
6573         }
6574     }
6575
6576     /* If the bitmap didn't (or couldn't) match, and something outside the
6577      * bitmap could match, try that.  Locale nodes specifiy completely the
6578      * behavior of code points in the bit map (otherwise, a utf8 target would
6579      * cause them to be treated as Unicode and not locale), except in
6580      * the very unlikely event when this node is a synthetic start class, which
6581      * could be a combination of locale and non-locale nodes.  So allow locale
6582      * to match for the synthetic start class, which will give a false
6583      * positive that will be resolved when the match is done again as not part
6584      * of the synthetic start class */
6585     if (!match) {
6586         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6587             match = TRUE;       /* Everything above 255 matches */
6588         }
6589         else if (ANYOF_NONBITMAP(n)
6590                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6591                      || (utf8_target
6592                          && (c >=256
6593                              || (! (flags & ANYOF_LOCALE))
6594                              || (flags & ANYOF_IS_SYNTHETIC)))))
6595         {
6596             AV *av;
6597             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6598
6599             if (sw) {
6600                 U8 * utf8_p;
6601                 if (utf8_target) {
6602                     utf8_p = (U8 *) p;
6603                 } else {
6604
6605                     /* Not utf8.  Convert as much of the string as available up
6606                      * to the limit of how far the (single) character in the
6607                      * pattern can possibly match (no need to go further).  If
6608                      * the node is a straight ANYOF or not folding, it can't
6609                      * match more than one.  Otherwise, It can match up to how
6610                      * far a single char can fold to.  Since not utf8, each
6611                      * character is a single byte, so the max it can be in
6612                      * bytes is the same as the max it can be in characters */
6613                     STRLEN len = (OP(n) == ANYOF
6614                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6615                                   ? 1
6616                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6617                                     ? maxlen
6618                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6619                     utf8_p = bytes_to_utf8(p, &len);
6620                 }
6621
6622                 if (swash_fetch(sw, utf8_p, TRUE))
6623                     match = TRUE;
6624                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6625
6626                     /* Here, we need to test if the fold of the target string
6627                      * matches.  The non-multi char folds have all been moved to
6628                      * the compilation phase, and the multi-char folds have
6629                      * been stored by regcomp into 'av'; we linearly check to
6630                      * see if any match the target string (folded).   We know
6631                      * that the originals were each one character, but we don't
6632                      * currently know how many characters/bytes each folded to,
6633                      * except we do know that there are small limits imposed by
6634                      * Unicode.  XXX A performance enhancement would be to have
6635                      * regcomp.c store the max number of chars/bytes that are
6636                      * in an av entry, as, say the 0th element.  Even better
6637                      * would be to have a hash of the few characters that can
6638                      * start a multi-char fold to the max number of chars of
6639                      * those folds.
6640                      *
6641                      * If there is a match, we will need to advance (if lenp is
6642                      * specified) the match pointer in the target string.  But
6643                      * what we are comparing here isn't that string directly,
6644                      * but its fold, whose length may differ from the original.
6645                      * As we go along in constructing the fold, therefore, we
6646                      * create a map so that we know how many bytes in the
6647                      * source to advance given that we have matched a certain
6648                      * number of bytes in the fold.  This map is stored in
6649                      * 'map_fold_len_back'.  Let n mean the number of bytes in
6650                      * the fold of the first character that we are folding.
6651                      * Then map_fold_len_back[n] is set to the number of bytes
6652                      * in that first character.  Similarly let m be the
6653                      * corresponding number for the second character to be
6654                      * folded.  Then map_fold_len_back[n+m] is set to the
6655                      * number of bytes occupied by the first two source
6656                      * characters. ... */
6657                     U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6658                     U8 folded[UTF8_MAXBYTES_CASE+1];
6659                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6660                     STRLEN total_foldlen = 0; /* num bytes in fold of all
6661                                                   chars */
6662
6663                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6664
6665                         /* Here, only need to fold the first char of the target
6666                          * string.  It the source wasn't utf8, is 1 byte long */
6667                         to_utf8_fold(utf8_p, folded, &foldlen);
6668                         total_foldlen = foldlen;
6669                         map_fold_len_back[foldlen] = (utf8_target)
6670                                                      ? UTF8SKIP(utf8_p)
6671                                                      : 1;
6672                     }
6673                     else {
6674
6675                         /* Here, need to fold more than the first char.  Do so
6676                          * up to the limits */
6677                         U8* source_ptr = utf8_p;    /* The source for the fold
6678                                                        is the regex target
6679                                                        string */
6680                         U8* folded_ptr = folded;
6681                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6682                                                        available byte in the
6683                                                        target string */
6684                         U8 i;
6685                         for (i = 0;
6686                              i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6687                              i++)
6688                         {
6689
6690                             /* Fold the next character */
6691                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6692                             STRLEN this_char_foldlen;
6693                             to_utf8_fold(source_ptr,
6694                                          this_char_folded,
6695                                          &this_char_foldlen);
6696
6697                             /* Bail if it would exceed the byte limit for
6698                              * folding a single char. */
6699                             if (this_char_foldlen + folded_ptr - folded >
6700                                                             UTF8_MAXBYTES_CASE)
6701                             {
6702                                 break;
6703                             }
6704
6705                             /* Add the fold of this character */
6706                             Copy(this_char_folded,
6707                                  folded_ptr,
6708                                  this_char_foldlen,
6709                                  U8);
6710                             source_ptr += UTF8SKIP(source_ptr);
6711                             folded_ptr += this_char_foldlen;
6712                             total_foldlen = folded_ptr - folded;
6713
6714                             /* Create map from the number of bytes in the fold
6715                              * back to the number of bytes in the source.  If
6716                              * the source isn't utf8, the byte count is just
6717                              * the number of characters so far */
6718                             map_fold_len_back[total_foldlen]
6719                                                       = (utf8_target)
6720                                                         ? source_ptr - utf8_p
6721                                                         : i + 1;
6722                         }
6723                         *folded_ptr = '\0';
6724                     }
6725
6726
6727                     /* Do the linear search to see if the fold is in the list
6728                      * of multi-char folds. */
6729                     if (av) {
6730                         I32 i;
6731                         for (i = 0; i <= av_len(av); i++) {
6732                             SV* const sv = *av_fetch(av, i, FALSE);
6733                             STRLEN len;
6734                             const char * const s = SvPV_const(sv, len);
6735
6736                             if (len <= total_foldlen
6737                                 && memEQ(s, (char*)folded, len)
6738
6739                                    /* If 0, means matched a partial char. See
6740                                     * [perl #90536] */
6741                                 && map_fold_len_back[len])
6742                             {
6743
6744                                 /* Advance the target string ptr to account for
6745                                  * this fold, but have to translate from the
6746                                  * folded length to the corresponding source
6747                                  * length. */
6748                                 if (lenp) {
6749                                     *lenp = map_fold_len_back[len];
6750                                 }
6751                                 match = TRUE;
6752                                 break;
6753                             }
6754                         }
6755                     }
6756                 }
6757
6758                 /* If we allocated a string above, free it */
6759                 if (! utf8_target) Safefree(utf8_p);
6760             }
6761         }
6762     }
6763
6764     return (flags & ANYOF_INVERT) ? !match : match;
6765 }
6766
6767 STATIC U8 *
6768 S_reghop3(U8 *s, I32 off, const U8* lim)
6769 {
6770     dVAR;
6771
6772     PERL_ARGS_ASSERT_REGHOP3;
6773
6774     if (off >= 0) {
6775         while (off-- && s < lim) {
6776             /* XXX could check well-formedness here */
6777             s += UTF8SKIP(s);
6778         }
6779     }
6780     else {
6781         while (off++ && s > lim) {
6782             s--;
6783             if (UTF8_IS_CONTINUED(*s)) {
6784                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6785                     s--;
6786             }
6787             /* XXX could check well-formedness here */
6788         }
6789     }
6790     return s;
6791 }
6792
#ifdef XXX_dmq
/* There are a bunch of places where two reghop3 calls are used that should
   be replaced with this routine, but since that hasn't been done yet it is
   ifdef'd out - dmq

   Moves s by up to |off| UTF-8 characters, forwards bounded by rlim and
   backwards bounded by llim, and returns the resulting pointer.
*/
STATIC U8 *
S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
{
    dVAR;

    PERL_ARGS_ASSERT_REGHOP4;

    if (off < 0) {
        /* Backwards, no further than the left limit */
        for (; off < 0 && s > llim; off++) {
            s--;
            if (UTF8_IS_CONTINUED(*s)) {
                while (s > llim && UTF8_IS_CONTINUATION(*s))
                    s--;
            }
            /* XXX could check well-formedness here */
        }
        return s;
    }

    /* Forwards, no further than the right limit */
    for (; off > 0 && s < rlim; off--) {
        /* XXX could check well-formedness here */
        s += UTF8SKIP(s);
    }
    return s;
}
#endif
6824
6825 STATIC U8 *
6826 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6827 {
6828     dVAR;
6829
6830     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6831
6832     if (off >= 0) {
6833         while (off-- && s < lim) {
6834             /* XXX could check well-formedness here */
6835             s += UTF8SKIP(s);
6836         }
6837         if (off >= 0)
6838             return NULL;
6839     }
6840     else {
6841         while (off++ && s > lim) {
6842             s--;
6843             if (UTF8_IS_CONTINUED(*s)) {
6844                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6845                     s--;
6846             }
6847             /* XXX could check well-formedness here */
6848         }
6849         if (off <= 0)
6850             return NULL;
6851     }
6852     return s;
6853 }
6854
6855 static void
6856 restore_pos(pTHX_ void *arg)
6857 {
6858     dVAR;
6859     regexp * const rex = (regexp *)arg;
6860     if (PL_reg_eval_set) {
6861         if (PL_reg_oldsaved) {
6862             rex->subbeg = PL_reg_oldsaved;
6863             rex->sublen = PL_reg_oldsavedlen;
6864 #ifdef PERL_OLD_COPY_ON_WRITE
6865             rex->saved_copy = PL_nrs;
6866 #endif
6867             RXp_MATCH_COPIED_on(rex);
6868         }
6869         PL_reg_magic->mg_len = PL_reg_oldpos;
6870         PL_reg_eval_set = 0;
6871         PL_curpm = PL_reg_oldcurpm;
6872     }
6873 }
6874
6875 STATIC void
6876 S_to_utf8_substr(pTHX_ register regexp *prog)
6877 {
6878     int i = 1;
6879
6880     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6881
6882     do {
6883         if (prog->substrs->data[i].substr
6884             && !prog->substrs->data[i].utf8_substr) {
6885             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6886             prog->substrs->data[i].utf8_substr = sv;
6887             sv_utf8_upgrade(sv);
6888             if (SvVALID(prog->substrs->data[i].substr)) {
6889                 if (SvTAIL(prog->substrs->data[i].substr)) {
6890                     /* Trim the trailing \n that fbm_compile added last
6891                        time.  */
6892                     SvCUR_set(sv, SvCUR(sv) - 1);
6893                     /* Whilst this makes the SV technically "invalid" (as its
6894                        buffer is no longer followed by "\0") when fbm_compile()
6895                        adds the "\n" back, a "\0" is restored.  */
6896                     fbm_compile(sv, FBMcf_TAIL);
6897                 } else
6898                     fbm_compile(sv, 0);
6899             }
6900             if (prog->substrs->data[i].substr == prog->check_substr)
6901                 prog->check_utf8 = sv;
6902         }
6903     } while (i--);
6904 }
6905
6906 STATIC void
6907 S_to_byte_substr(pTHX_ register regexp *prog)
6908 {
6909     dVAR;
6910     int i = 1;
6911
6912     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6913
6914     do {
6915         if (prog->substrs->data[i].utf8_substr
6916             && !prog->substrs->data[i].substr) {
6917             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6918             if (sv_utf8_downgrade(sv, TRUE)) {
6919                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6920                     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
6921                         /* Trim the trailing \n that fbm_compile added last
6922                            time.  */
6923                         SvCUR_set(sv, SvCUR(sv) - 1);
6924                         fbm_compile(sv, FBMcf_TAIL);
6925                     } else
6926                         fbm_compile(sv, 0);
6927                 }
6928             } else {
6929                 SvREFCNT_dec(sv);
6930                 sv = &PL_sv_undef;
6931             }
6932             prog->substrs->data[i].substr = sv;
6933             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6934                 prog->check_substr = sv;
6935         }
6936     } while (i--);
6937 }
6938
6939 /*
6940  * Local variables:
6941  * c-indentation-style: bsd
6942  * c-basic-offset: 4
6943  * indent-tabs-mode: t
6944  * End:
6945  *
6946  * ex: set ts=8 sts=4 sw=4 noet:
6947  */