regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1       /* tainted information used? e.g. locale */
  84 #define RF_warned       2               /* warned about big count? */
  85 /* The RF_* values are bit flags that get OR-ed into / tested against PL_reg_flags. */
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87 /* UTF_PATTERN: true when the RF_utf8 bit is set in PL_reg_flags. */
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  98  * call if there are no complications: i.e., if everything matchable is
  99  * straight forward in the bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) /* NOTE: a and b are not parenthesized in the byte branch */
 109 /* HOPc / HOPBACKc: step pos by off.  utf8 targets go through reghop3() / reghopmaybe3() (defined elsewhere); byte targets use plain pointer arithmetic. */
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120 /* HOP3 takes an explicit limit, but the non-utf8 branch does not consult lim.  HOPBACKc (above) yields NULL if the byte step would land before PL_bostr. */
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defines */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that the result is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 154
 155 /* The actual code for CCC_TRY, which uses several variables from the routine
 156  * it's callable from.  It is designed to be the bulk of a case statement.
 157  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 158  *      nextchr matches the class.
 159  * UTF8_TEST is the whole test string to use for utf8 targets
 160  * CLASS and STR are handed to LOAD_UTF8_CHARCLASS, which loads the swash for
 161  *      the class if it is not already present
 162  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 163  *      UTF8_TEST test.
 164  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 165  * utf8 and a variant, load the swash if necessary and test using the utf8
 166  * test.  Advance to the next character if test is ok, otherwise fail; If not
 167  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 168  * fails, or advance to the next character */
 169
 170 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 171     if (locinput >= PL_regeol) {                                              \
 172         sayNO;                                                                \
 173     }                                                                         \
 174     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 175         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 176         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 177             sayNO;                                                            \
 178         }                                                                     \
 179         locinput += PL_utf8skip[nextchr];                                     \
 180         nextchr = UCHARAT(locinput);                                          \
 181         break;                                                                \
 182     }                                                                         \
 183     if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 184         sayNO;                                                                \
 185     }                                                                         \
 186     nextchr = UCHARAT(++locinput);                                            \
 187     break;
 188
 189 /* Handle the non-locale cases for a character class and its complement.  It
 190  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 191  * This is because that code fails when the test succeeds, so we want to have
 192  * the test fail so that the code succeeds.  The swash is stored in a
 193  * predictable PL_ place */
 194 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 195                            CLASS, STR)                                        \
 196     case NAME:                                                                \
 197         _CCC_TRY_CODE( !, FUNC,                                               \
 198                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 199                                             (U8*)locinput, TRUE)),            \
 200                           CLASS, STR)                                         \
 201     case NNAME:                                                               \
 202         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 203                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 204                                             (U8*)locinput, TRUE)),            \
 205                           CLASS, STR)                                         \
 206
 207 /* Generate the case statements for both locale and non-locale character
 208  * classes in regmatch for classes that don't have special unicode semantics.
 209  * Locales don't use an immediate swash, but an intermediary special locale
 210  * function that is called on the pointer to the current place in the input
 211  * string.  That function will resolve to needing the same swash.  One might
 212  * think that because we don't know what the locale will match, we shouldn't
 213  * check with the swash loading function that it loaded properly; ie, that we
 214  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 215  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 216  * irrelevant here */
 217 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 218                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 219                 NAMEA, NNAMEA, FUNCA,                                         \
 220                 CLASS, STR)                                                   \
 221     case NAMEL:                                                               \
 222         PL_reg_flags |= RF_tainted;                                           \
 223         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 224     case NNAMEL:                                                              \
 225         PL_reg_flags |= RF_tainted;                                           \
 226         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 227                        CLASS, STR)                                            \
 228     case NAMEA:                                                               \
 229         if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 230             sayNO;                                                            \
 231         }                                                                     \
 232         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 233         nextchr = UCHARAT(++locinput);                                        \
 234         break;                                                                \
 235     case NNAMEA:                                                              \
 236         if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 237             sayNO;                                                            \
 238         }                                                                     \
 239         if (utf8_target) {                                                    \
 240             locinput += PL_utf8skip[nextchr];                                 \
 241             nextchr = UCHARAT(locinput);                                      \
 242         }                                                                     \
 243         else {                                                                \
 244             nextchr = UCHARAT(++locinput);                                    \
 245         }                                                                     \
 246         break;                                                                \
 247     /* Generate the non-locale cases */                                       \
 248     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 249
 250 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 251  * statements to handle separate Unicode semantics nodes */
 252 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 253                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 254                   NAMEU, NNAMEU, FUNCU,                                        \
 255                   NAMEA, NNAMEA, FUNCA,                                        \
 256                   CLASS, STR)                                                  \
 257     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 258             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 259             NAMEA, NNAMEA, FUNCA,                                              \
 260             CLASS, STR)                                                        \
 261     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 262
 263 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 264
 265 /* for use after a quantifier and before an EXACT-like node -- japhy */
 266 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 267  *
 268  * NOTE that *nothing* that affects backtracking should be in here, specifically
 269  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 270  * node that is in between two EXACT like nodes when ascertaining what the required
 271  * "follow" character is. This should probably be moved to regex compile time
 272  * although it may be done at run time because of the REF possibility - more
 273  * investigation required. -- demerphq
 274 */
 275 #define JUMPABLE(rn) (      \
 276     OP(rn) == OPEN ||       \
 277     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 278     OP(rn) == EVAL ||   \
 279     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 280     OP(rn) == PLUS || OP(rn) == MINMOD || \
 281     OP(rn) == KEEPS || \
 282     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 283 )
 284 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 285
 286 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 287
 288 #if 0
 289 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 290    we don't need this definition. */
 291 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 292 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 293 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 294
 295 #else
 296 /* ... so we use this as it's faster. */
 297 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 298 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 299 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 300 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 301
 302 #endif
 303
 304 /*
 305   Search for mandatory following text node; for lookahead, the text must
 306   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 307 */
 308 #define FIND_NEXT_IMPT(rn) STMT_START { \
 309     while (JUMPABLE(rn)) { \
 310         const OPCODE type = OP(rn); \
 311         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 312             rn = NEXTOPER(NEXTOPER(rn)); \
 313         else if (type == PLUS) \
 314             rn = NEXTOPER(rn); \
 315         else if (type == IFMATCH) \
 316             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 317         else rn += NEXT_OFF(rn); \
 318     } \
 319 } STMT_END
 320
 321
 322 static void restore_pos(pTHX_ void *arg);
 323
 324 #define REGCP_PAREN_ELEMS 4 /* slots pushed per paren group: end, start, start_tmp pointer, group index */
 325 #define REGCP_OTHER_ELEMS 5 /* slots always pushed: regoffs, regsize, lastparen, lastcloseparen, reginput */
 326 #define REGCP_FRAME_ELEMS 1 /* the trailing SAVEt_REGCONTEXT cookie word */
 327 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 328  * are needed for the regexp context stack bookkeeping. */
 329 /* S_regcppush: push the state of paren groups above parenfloor, plus the fixed regex context, onto the savestack; returns the savestack index as it was on entry. */
 330 STATIC CHECKPOINT
 331 S_regcppush(pTHX_ I32 parenfloor)
 332 {
 333     dVAR;
 334     const int retval = PL_savestack_ix; /* pre-push index, handed back to the caller */
 335     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 336     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 337     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; /* element count as it will be packed into the cookie */
 338     int p;
 339     GET_RE_DEBUG_FLAGS_DECL;
 340     /* A negative count means parenfloor is above PL_regsize. */
 341     if (paren_elems_to_push < 0)
 342         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 343     /* Croak if the element count would not survive the round trip through the shift. */
 344     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 346                    " out of range (%lu-%ld)",
 347                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 348     /* SSGROW presumably reserves savestack room for everything pushed below, cookie included -- confirm against scope.h. */
 349     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 350     /* Walk the groups from PL_regsize down to parenfloor + 1. */
 351     for (p = PL_regsize; p > parenfloor; p--) {
 352 /* REGCP_PAREN_ELEMS are pushed per pair of parentheses. */
 353         SSPUSHINT(PL_regoffs[p].end);
 354         SSPUSHINT(PL_regoffs[p].start);
 355         SSPUSHPTR(PL_reg_start_tmp[p]);
 356         SSPUSHINT(p);
 357         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 358           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 359                       (UV)p, (IV)PL_regoffs[p].start,
 360                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 361                       (IV)PL_regoffs[p].end
 362         ));
 363     }
 364 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 365     SSPUSHPTR(PL_regoffs);
 366     SSPUSHINT(PL_regsize);
 367     SSPUSHINT(*PL_reglastparen);
 368     SSPUSHINT(*PL_reglastcloseparen);
 369     SSPUSHPTR(PL_reginput);
 370     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 371     /* The value returned is the savestack index from before any of the pushes above. */
 372     return retval;
 373 }
 374
 375 /* These are needed since we do not localize EVAL nodes: */
 376 #define REGCP_SET(cp)                                           \
 377     DEBUG_STATE_r(                                              \
 378             PerlIO_printf(Perl_debug_log,                       \
 379                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 380                 (IV)PL_savestack_ix));                          \
 381     cp = PL_savestack_ix
 382
 383 #define REGCP_UNWIND(cp)                                        \
 384     DEBUG_STATE_r(                                              \
 385         if (cp != PL_savestack_ix)                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 388                 (IV)(cp), (IV)PL_savestack_ix));                \
 389     regcpblow(cp)
 390 /* S_regcppop: pop one frame in the layout written by S_regcppush, restoring the paren state; returns the saved input pointer (it is not stored back into PL_reginput here). */
 391 STATIC char *
 392 S_regcppop(pTHX_ const regexp *rex)
 393 {
 394     dVAR;
 395     UV i;
 396     char *input;
 397     GET_RE_DEBUG_FLAGS_DECL;
 398
 399     PERL_ARGS_ASSERT_REGCPPOP;
 400     /* Items come off in the reverse of the order S_regcppush pushed them: cookie first. */
 401     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 402     i = SSPOPUV;
 403     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 404     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 405     input = (char *) SSPOPPTR;
 406     *PL_reglastcloseparen = SSPOPINT;
 407     *PL_reglastparen = SSPOPINT;
 408     PL_regsize = SSPOPINT;
 409     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 410     /* What remains of the count is the per-paren slots only. */
 411     i -= REGCP_OTHER_ELEMS;
 412     /* Now restore the parentheses context. */
 413     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 414         I32 tmps;
 415         U32 paren = (U32)SSPOPINT;
 416         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 417         PL_regoffs[paren].start = SSPOPINT;
 418         tmps = SSPOPINT;
 419         if (paren <= *PL_reglastparen) /* .end is restored only for groups at or below the just-restored lastparen */
 420             PL_regoffs[paren].end = tmps;
 421         DEBUG_BUFFERS_r(
 422             PerlIO_printf(Perl_debug_log,
 423                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 424                           (UV)paren, (IV)PL_regoffs[paren].start,
 425                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 426                           (IV)PL_regoffs[paren].end,
 427                           (paren > *PL_reglastparen ? "(no)" : ""));
 428         );
 429     }
 430     DEBUG_BUFFERS_r(
 431         if (*PL_reglastparen + 1 <= rex->nparens) {
 432             PerlIO_printf(Perl_debug_log,
 433                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 434                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 435         }
 436     );
 437 #if 1
 438     /* It would seem that the similar code in regtry()
 439      * already takes care of this, and in fact it is in
 440      * a better location too, since this code can be #if 0-ed out
 441      * but the code in regtry() is needed or otherwise tests
 442      * requiring null fields (pat.t#187 and split.t#{13,14}
 443      * (as of patchlevel 7877)  will fail.  Then again,
 444      * this code seems to be necessary or otherwise
 445      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 446      * --jhi updated by dapm */
 447     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { /* groups above lastparen: .end is always reset, .start only when i > PL_regsize */
 448         if (i > PL_regsize)
 449             PL_regoffs[i].start = -1;
 450         PL_regoffs[i].end = -1;
 451     }
 452 #endif
 453     return input;
 454 }
 455
 456 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 457
 458 /*
 459  * pregexec and friends
 460  */
 461
 462 #ifndef PERL_IN_XSUB_RE
 463 /*
 464  - pregexec - match a regexp against a string; forwards to regexec_flags() and returns its result
 465  */
 466 I32
 467 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 468          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 469 /* strend: pointer to null at end of string */
 470 /* strbeg: real beginning of string */
 471 /* minend: end of match must be >=minend after stringarg. */
 472 /* nosave: For optimizations; when nonzero the flags passed on are 0 instead of REXEC_COPY_STR. */
 473 {
 474     PERL_ARGS_ASSERT_PREGEXEC;
 475     /* Thin wrapper: the seventh regexec_flags() argument is always NULL here. */
 476     return
 477         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 478                       nosave ? 0 : REXEC_COPY_STR);
 479 }
 480 #endif
 481
 482 /*
 483  * Need to implement the following flags for reg_anch:
 484  *
 485  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 486  * USE_INTUIT_ML
 487  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 488  * INTUIT_AUTORITATIVE_ML
 489  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 490  * INTUIT_ONCE_ML
 491  *
 492  * Another flag for this function: SECOND_TIME (so that float substrs
 493  * with giant delta may be not rechecked).
 494  */
 495
 496 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 497
 498 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 499    Otherwise, only SvCUR(sv) is used to get strbeg. */
 500
 501 /* XXXX We assume that strpos is strbeg unless sv. */
 502
 503 /* XXXX Some places assume that there is a fixed substring.
 504         An update may be needed if optimizer marks as "INTUITable"
 505         RExen without fixed substrings.  Similarly, it is assumed that
 506         lengths of all the strings are no more than minlen, thus they
 507         cannot come from lookahead.
 508         (Or minlen should take into account lookahead.)
 509   NOTE: Some of this comment is not correct. minlen does now take account
 510   of lookahead/behind. Further research is required. -- demerphq
 511
 512 */
 513
 514 /* A failure to find a constant substring means that there is no need to make
 515    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 516    finding a substring too deep into the string means that fewer calls to
 517    regtry() should be needed.
 518
 519    REx compiler's optimizer found 4 possible hints:
 520         a) Anchored substring;
 521         b) Fixed substring;
 522         c) Whether we are anchored (beginning-of-line or \G);
 523         d) First node (of those at offset 0) which may distinguish positions;
 524    We use a)b)d) and multiline-part of c), and try to find a position in the
 525    string which does not contradict any of them.
 526  */
 527
 528 /* Most of the decisions we make here should have been made at compile time.
 529    The nodes of the REx which we used for the search should have been
 530    deleted from the finite automaton. */
 531
 532 char *
 533 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 534                      char *strend, const U32 flags, re_scream_pos_data *data)
 535 {
 536     dVAR;
 537     struct regexp *const prog = (struct regexp *)SvANY(rx);
 538     register I32 start_shift = 0;
 539     /* Should be nonnegative! */
 540     register I32 end_shift   = 0;
 541     register char *s;
 542     register SV *check;
 543     char *strbeg;
 544     char *t;
 545     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 546     I32 ml_anch;
 547     register char *other_last = NULL;   /* other substr checked before this */
 548     char *check_at = NULL;              /* check substr found at this pos */
 549     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 550     RXi_GET_DECL(prog,progi);
 551 #ifdef DEBUGGING
 552     const char * const i_strpos = strpos;
 553 #endif
 554     GET_RE_DEBUG_FLAGS_DECL;
 555
 556     PERL_ARGS_ASSERT_RE_INTUIT_START;
 557
 558     RX_MATCH_UTF8_set(rx,utf8_target);
 559
 560     if (RX_UTF8(rx)) {
 561         PL_reg_flags |= RF_utf8;
 562     }
 563     DEBUG_EXECUTE_r(
 564         debug_start_match(rx, utf8_target, strpos, strend,
 565             sv ? "Guessing start of match in sv for"
 566                : "Guessing start of match in string for");
 567               );
 568
 569     /* CHR_DIST() would be more correct here but it makes things slow. */
 570     if (prog->minlen > strend - strpos) {
 571         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 572                               "String too short... [re_intuit_start]\n"));
 573         goto fail;
 574     }
 575
 576     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 577     PL_regeol = strend;
 578     if (utf8_target) {
 579         if (!prog->check_utf8 && prog->check_substr)
 580             to_utf8_substr(prog);
 581         check = prog->check_utf8;
 582     } else {
 583         if (!prog->check_substr && prog->check_utf8)
 584             to_byte_substr(prog);
 585         check = prog->check_substr;
 586     }
 587     if (check == &PL_sv_undef) {
 588         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 589                 "Non-utf8 string cannot match utf8 check string\n"));
 590         goto fail;
 591     }
 592     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 593         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 594                      || ( (prog->extflags & RXf_ANCH_BOL)
 595                           && !multiline ) );    /* Check after \n? */
 596
 597         if (!ml_anch) {
 598           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 599                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 600                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 601                && sv && !SvROK(sv)
 602                && (strpos != strbeg)) {
 603               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 604               goto fail;
 605           }
 606           if (prog->check_offset_min == prog->check_offset_max &&
 607               !(prog->extflags & RXf_CANY_SEEN)) {
 608             /* Substring at constant offset from beg-of-str... */
 609             I32 slen;
 610
 611             s = HOP3c(strpos, prog->check_offset_min, strend);
 612
 613             if (SvTAIL(check)) {
 614                 slen = SvCUR(check);    /* >= 1 */
 615
 616                 if ( strend - s > slen || strend - s < slen - 1
 617                      || (strend - s == slen && strend[-1] != '\n')) {
 618                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 619                     goto fail_finish;
 620                 }
 621                 /* Now should match s[0..slen-2] */
 622                 slen--;
 623                 if (slen && (*SvPVX_const(check) != *s
 624                              || (slen > 1
 625                                  && memNE(SvPVX_const(check), s, slen)))) {
 626                   report_neq:
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 628                     goto fail_finish;
 629                 }
 630             }
 631             else if (*SvPVX_const(check) != *s
 632                      || ((slen = SvCUR(check)) > 1
 633                          && memNE(SvPVX_const(check), s, slen)))
 634                 goto report_neq;
 635             check_at = s;
 636             goto success_at_start;
 637           }
 638         }
 639         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 640         s = strpos;
 641         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 642         end_shift = prog->check_end_shift;
 643
 644         if (!ml_anch) {
 645             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 646                                          - (SvTAIL(check) != 0);
 647             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 648
 649             if (end_shift < eshift)
 650                 end_shift = eshift;
 651         }
 652     }
 653     else {                              /* Can match at random position */
 654         ml_anch = 0;
 655         s = strpos;
 656         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 657         end_shift = prog->check_end_shift;
 658
 659         /* end shift should be non negative here */
 660     }
 661
 662 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 663     if (end_shift < 0)
 664         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 665                    (IV)end_shift, RX_PRECOMP(prog));
 666 #endif
 667
 668   restart:
 669     /* Find a possible match in the region s..strend by looking for
 670        the "check" substring in the region corrected by start/end_shift. */
 671
 672     {
 673         I32 srch_start_shift = start_shift;
 674         I32 srch_end_shift = end_shift;
 675         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 676             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 677             srch_start_shift = strbeg - s;
 678         }
 679     DEBUG_OPTIMISE_MORE_r({
 680         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 681             (IV)prog->check_offset_min,
 682             (IV)srch_start_shift,
 683             (IV)srch_end_shift,
 684             (IV)prog->check_end_shift);
 685     });
 686
 687     if (flags & REXEC_SCREAM) {
 688         I32 p = -1;                     /* Internal iterator of scream. */
 689         I32 * const pp = data ? data->scream_pos : &p;
 690
 691         if (PL_screamfirst[BmRARE(check)] >= 0
 692             || ( BmRARE(check) == '\n'
 693                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 694                  && SvTAIL(check) ))
 695             s = screaminstr(sv, check,
 696                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 697         else
 698             goto fail_finish;
 699         /* we may be pointing at the wrong string */
 700         if (s && RXp_MATCH_COPIED(prog))
 701             s = strbeg + (s - SvPVX_const(sv));
 702         if (data)
 703             *data->scream_olds = s;
 704     }
 705     else {
 706         U8* start_point;
 707         U8* end_point;
 708         if (prog->extflags & RXf_CANY_SEEN) {
 709             start_point= (U8*)(s + srch_start_shift);
 710             end_point= (U8*)(strend - srch_end_shift);
 711         } else {
 712             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 713             end_point= HOP3(strend, -srch_end_shift, strbeg);
 714         }
 715         DEBUG_OPTIMISE_MORE_r({
 716             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 717                 (int)(end_point - start_point),
 718                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 719                 start_point);
 720         });
 721
 722         s = fbm_instr( start_point, end_point,
 723                       check, multiline ? FBMrf_MULTILINE : 0);
 724     }
 725     }
 726     /* Update the count-of-usability, remove useless subpatterns,
 727         unshift s.  */
 728
 729     DEBUG_EXECUTE_r({
 730         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 731             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 732         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 733                           (s ? "Found" : "Did not find"),
 734             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 735                 ? "anchored" : "floating"),
 736             quoted,
 737             RE_SV_TAIL(check),
 738             (s ? " at offset " : "...\n") );
 739     });
 740
 741     if (!s)
 742         goto fail_finish;
 743     /* Finish the diagnostic message */
 744     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 745
 746     /* XXX dmq: first branch is for positive lookbehind...
 747        Our check string is offset from the beginning of the pattern.
 748        So we need to do any stclass tests offset forward from that
 749        point. I think. :-(
 750      */
 751
 752
 753
 754     check_at=s;
 755
 756
 757     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 758        Start with the other substr.
 759        XXXX no SCREAM optimization yet - and a very coarse implementation
 760        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 761                 *always* match.  Probably should be marked during compile...
 762        Probably it is right to do no SCREAM here...
 763      */
 764
 765     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 766                 : (prog->float_substr && prog->anchored_substr))
 767     {
 768         /* Take into account the "other" substring. */
 769         /* XXXX May be hopelessly wrong for UTF... */
 770         if (!other_last)
 771             other_last = strpos;
 772         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 773           do_other_anchored:
 774             {
 775                 char * const last = HOP3c(s, -start_shift, strbeg);
 776                 char *last1, *last2;
 777                 char * const saved_s = s;
 778                 SV* must;
 779
 780                 t = s - prog->check_offset_max;
 781                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 782                     && (!utf8_target
 783                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 784                             && t > strpos)))
 785                     NOOP;
 786                 else
 787                     t = strpos;
 788                 t = HOP3c(t, prog->anchored_offset, strend);
 789                 if (t < other_last)     /* These positions already checked */
 790                     t = other_last;
 791                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 792                 if (last < last1)
 793                     last1 = last;
 794                 /* XXXX It is not documented what units *_offsets are in.
 795                    We assume bytes, but this is clearly wrong.
 796                    Meaning this code needs to be carefully reviewed for errors.
 797                    dmq.
 798                   */
 799
 800                 /* On end-of-str: see comment below. */
 801                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 802                 if (must == &PL_sv_undef) {
 803                     s = (char*)NULL;
 804                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 805                 }
 806                 else
 807                     s = fbm_instr(
 808                         (unsigned char*)t,
 809                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 810                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 811                         must,
 812                         multiline ? FBMrf_MULTILINE : 0
 813                     );
 814                 DEBUG_EXECUTE_r({
 815                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 816                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 817                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 818                         (s ? "Found" : "Contradicts"),
 819                         quoted, RE_SV_TAIL(must));
 820                 });
 821
 822
 823                 if (!s) {
 824                     if (last1 >= last2) {
 825                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 826                                                 ", giving up...\n"));
 827                         goto fail_finish;
 828                     }
 829                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                         ", trying floating at offset %ld...\n",
 831                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 832                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 833                     s = HOP3c(last, 1, strend);
 834                     goto restart;
 835                 }
 836                 else {
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 838                           (long)(s - i_strpos)));
 839                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 840                     other_last = HOP3c(s, 1, strend);
 841                     s = saved_s;
 842                     if (t == strpos)
 843                         goto try_at_start;
 844                     goto try_at_offset;
 845                 }
 846             }
 847         }
 848         else {          /* Take into account the floating substring. */
 849             char *last, *last1;
 850             char * const saved_s = s;
 851             SV* must;
 852
 853             t = HOP3c(s, -start_shift, strbeg);
 854             last1 = last =
 855                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 856             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 857                 last = HOP3c(t, prog->float_max_offset, strend);
 858             s = HOP3c(t, prog->float_min_offset, strend);
 859             if (s < other_last)
 860                 s = other_last;
 861  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 862             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 863             /* fbm_instr() takes into account exact value of end-of-str
 864                if the check is SvTAIL(ed).  Since false positives are OK,
 865                and end-of-str is not later than strend we are OK. */
 866             if (must == &PL_sv_undef) {
 867                 s = (char*)NULL;
 868                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 869             }
 870             else
 871                 s = fbm_instr((unsigned char*)s,
 872                               (unsigned char*)last + SvCUR(must)
 873                                   - (SvTAIL(must)!=0),
 874                               must, multiline ? FBMrf_MULTILINE : 0);
 875             DEBUG_EXECUTE_r({
 876                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 877                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 878                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 879                     (s ? "Found" : "Contradicts"),
 880                     quoted, RE_SV_TAIL(must));
 881             });
 882             if (!s) {
 883                 if (last1 == last) {
 884                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 885                                             ", giving up...\n"));
 886                     goto fail_finish;
 887                 }
 888                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                     ", trying anchored starting at offset %ld...\n",
 890                     (long)(saved_s + 1 - i_strpos)));
 891                 other_last = last;
 892                 s = HOP3c(t, 1, strend);
 893                 goto restart;
 894             }
 895             else {
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 897                       (long)(s - i_strpos)));
 898                 other_last = s; /* Fix this later. --Hugo */
 899                 s = saved_s;
 900                 if (t == strpos)
 901                     goto try_at_start;
 902                 goto try_at_offset;
 903             }
 904         }
 905     }
 906
 907
 908     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 909
 910     DEBUG_OPTIMISE_MORE_r(
 911         PerlIO_printf(Perl_debug_log,
 912             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 913             (IV)prog->check_offset_min,
 914             (IV)prog->check_offset_max,
 915             (IV)(s-strpos),
 916             (IV)(t-strpos),
 917             (IV)(t-s),
 918             (IV)(strend-strpos)
 919         )
 920     );
 921
 922     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 923         && (!utf8_target
 924             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 925                  && t > strpos)))
 926     {
 927         /* Fixed substring is found far enough so that the match
 928            cannot start at strpos. */
 929       try_at_offset:
 930         if (ml_anch && t[-1] != '\n') {
 931             /* Eventually fbm_*() should handle this, but often
 932                anchored_offset is not 0, so this check will not be wasted. */
 933             /* XXXX In the code below we prefer to look for "^" even in
 934                presence of anchored substrings.  And we search even
 935                beyond the found float position.  These pessimizations
 936                are historical artefacts only.  */
 937           find_anchor:
 938             while (t < strend - prog->minlen) {
 939                 if (*t == '\n') {
 940                     if (t < check_at - prog->check_offset_min) {
 941                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 942                             /* Since we moved from the found position,
 943                                we definitely contradict the found anchored
 944                                substr.  Due to the above check we do not
 945                                contradict "check" substr.
 946                                Thus we can arrive here only if check substr
 947                                is float.  Redo checking for "other"=="fixed".
 948                              */
 949                             strpos = t + 1;
 950                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 951                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 952                             goto do_other_anchored;
 953                         }
 954                         /* We don't contradict the found floating substring. */
 955                         /* XXXX Why not check for STCLASS? */
 956                         s = t + 1;
 957                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 958                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 959                         goto set_useful;
 960                     }
 961                     /* Position contradicts check-string */
 962                     /* XXXX probably better to look for check-string
 963                        than for "\n", so one should lower the limit for t? */
 964                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 965                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 966                     other_last = strpos = s = t + 1;
 967                     goto restart;
 968                 }
 969                 t++;
 970             }
 971             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 972                         PL_colors[0], PL_colors[1]));
 973             goto fail_finish;
 974         }
 975         else {
 976             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 977                         PL_colors[0], PL_colors[1]));
 978         }
 979         s = t;
 980       set_useful:
 981         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 982     }
 983     else {
 984         /* The found string does not prohibit matching at strpos,
 985            - no optimization of calling REx engine can be performed,
 986            unless it was an MBOL and we are not after MBOL,
 987            or a future STCLASS check will fail this. */
 988       try_at_start:
 989         /* Even in this situation we may use MBOL flag if strpos is offset
 990            wrt the start of the string. */
 991         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 992             && (strpos != strbeg) && strpos[-1] != '\n'
 993             /* May be due to an implicit anchor of m{.*foo}  */
 994             && !(prog->intflags & PREGf_IMPLICIT))
 995         {
 996             t = strpos;
 997             goto find_anchor;
 998         }
 999         DEBUG_EXECUTE_r( if (ml_anch)
1000             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1001                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1002         );
1003       success_at_start:
1004         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1005             && (utf8_target ? (
1006                 prog->check_utf8                /* Could be deleted already */
1007                 && --BmUSEFUL(prog->check_utf8) < 0
1008                 && (prog->check_utf8 == prog->float_utf8)
1009             ) : (
1010                 prog->check_substr              /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_substr) < 0
1012                 && (prog->check_substr == prog->float_substr)
1013             )))
1014         {
1015             /* If flags & SOMETHING - do not do it many times on the same match */
1016             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1017             /* XXX Does the destruction order have to change with utf8_target? */
1018             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1019             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1020             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1021             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1022             check = NULL;                       /* abort */
1023             s = strpos;
1024             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1025                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1026             if (prog->intflags & PREGf_IMPLICIT)
1027                 prog->extflags &= ~RXf_ANCH_MBOL;
1028             /* XXXX This is a remnant of the old implementation.  It
1029                     looks wasteful, since now INTUIT can use many
1030                     other heuristics. */
1031             prog->extflags &= ~RXf_USE_INTUIT;
1032             /* XXXX What other flags might need to be cleared in this branch? */
1033         }
1034         else
1035             s = strpos;
1036     }
1037
1038     /* Last resort... */
1039     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1040     /* trie stclasses are too expensive to use here, we are better off to
1041        leave it to regmatch itself */
1042     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1043         /* minlen == 0 is possible if regstclass is \b or \B,
1044            and the fixed substr is ''$.
1045            Since minlen is already taken into account, s+1 is before strend;
1046            accidentally, minlen >= 1 guarantees no false positives at s + 1
1047            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1048            regstclass does not come from lookahead...  */
1049         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1050            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1051         const U8* const str = (U8*)STRING(progi->regstclass);
1052         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1053                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1054                     : 1);
1055         char * endpos;
1056         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1057             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1058         else if (prog->float_substr || prog->float_utf8)
1059             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1060         else
1061             endpos= strend;
1062
1063         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1064                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1065
1066         t = s;
1067         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1068         if (!s) {
1069 #ifdef DEBUGGING
1070             const char *what = NULL;
1071 #endif
1072             if (endpos == strend) {
1073                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074                                 "Could not match STCLASS...\n") );
1075                 goto fail;
1076             }
1077             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                    "This position contradicts STCLASS...\n") );
1079             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1080                 goto fail;
1081             /* Contradict one of substrings */
1082             if (prog->anchored_substr || prog->anchored_utf8) {
1083                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1084                     DEBUG_EXECUTE_r( what = "anchored" );
1085                   hop_and_restart:
1086                     s = HOP3c(t, 1, strend);
1087                     if (s + start_shift + end_shift > strend) {
1088                         /* XXXX Should be taken into account earlier? */
1089                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1090                                                "Could not match STCLASS...\n") );
1091                         goto fail;
1092                     }
1093                     if (!check)
1094                         goto giveup;
1095                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1096                                 "Looking for %s substr starting at offset %ld...\n",
1097                                  what, (long)(s + start_shift - i_strpos)) );
1098                     goto restart;
1099                 }
1100                 /* Have both, check_string is floating */
1101                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1102                     goto retry_floating_check;
1103                 /* Recheck anchored substring, but not floating... */
1104                 s = check_at;
1105                 if (!check)
1106                     goto giveup;
1107                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108                           "Looking for anchored substr starting at offset %ld...\n",
1109                           (long)(other_last - i_strpos)) );
1110                 goto do_other_anchored;
1111             }
1112             /* Another way we could have checked stclass at the
1113                current position only: */
1114             if (ml_anch) {
1115                 s = t = t + 1;
1116                 if (!check)
1117                     goto giveup;
1118                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119                           "Looking for /%s^%s/m starting at offset %ld...\n",
1120                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1121                 goto try_at_offset;
1122             }
1123             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1124                 goto fail;
1125             /* Check is floating substring. */
1126           retry_floating_check:
1127             t = check_at - start_shift;
1128             DEBUG_EXECUTE_r( what = "floating" );
1129             goto hop_and_restart;
1130         }
1131         if (t != s) {
1132             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1133                         "By STCLASS: moving %ld --> %ld\n",
1134                                   (long)(t - i_strpos), (long)(s - i_strpos))
1135                    );
1136         }
1137         else {
1138             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139                                   "Does not contradict STCLASS...\n");
1140                    );
1141         }
1142     }
1143   giveup:
1144     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1145                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1146                           PL_colors[5], (long)(s - i_strpos)) );
1147     return s;
1148
1149   fail_finish:                          /* Substring not found */
1150     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1151         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1152   fail:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1154                           PL_colors[4], PL_colors[5]));
1155     return NULL;
1156 }
1157
/* DECL_TRIE_TYPE(scan): declare a const local named trie_type, of an
 * anonymous enum type, that records how characters are to be read for a trie.
 *
 * The value is picked from scan->flags and from utf8_target / UTF_PATTERN,
 * both of which must be visible where the macro is expanded:
 *
 *   scan->flags != EXACT:  utf8_target  -> trie_utf8_fold
 *                          UTF_PATTERN  -> trie_latin_utf8_fold
 *                          otherwise    -> trie_plain
 *   scan->flags == EXACT:  utf8_target  -> trie_utf8
 *                          otherwise    -> trie_plain
 *
 * The expansion is a declaration with no trailing semicolon; the caller
 * writes it.  `scan` is pasted unparenthesized in front of `->`, so pass a
 * plain pointer expression.
 */
1158 #define DECL_TRIE_TYPE(scan) \
1159     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1160                     trie_type = (scan->flags != EXACT) \
1161                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1162                               : (utf8_target ? trie_utf8 : trie_plain)
1163
/* REXEC_TRIE_READ_CHAR: read one character for a trie lookup.
 *
 * Stores the character's code in uvc, a byte count in len, and the trie's
 * character id in charid.  How the character is obtained depends on
 * trie_type:
 *
 *   trie_utf8_fold        If foldlen > 0, a code point is decoded from uscan;
 *                         foldlen is reduced by the decoded length, uscan is
 *                         advanced by it, and len is then set to 0.
 *                         Otherwise a code point is decoded from uc and
 *                         handed to to_uni_fold() together with foldbuf and
 *                         &foldlen; uvc becomes that call's return value,
 *                         foldlen is reduced by UNISKIP(uvc) and uscan is
 *                         set to foldbuf + UNISKIP(uvc).
 *   trie_latin_utf8_fold  Same as above when foldlen > 0.  Otherwise len is
 *                         set to 1 and the single byte at uc is folded with
 *                         to_uni_fold() in the same way.
 *   trie_utf8             Decodes one code point from uc; no folding.
 *   trie_plain            Uses the byte at uc unchanged; len is 1.
 *
 * The switch has no default: for any other trie_type value uvc and len are
 * left as the caller set them.
 *
 * charid is trie->charmap[uvc] when uvc < 256.  For larger values it is 0
 * unless widecharmap is non-NULL and hv_fetch() finds an entry keyed by the
 * sizeof(UV) raw bytes of uvc, in which case it is SvIV of that entry cast
 * to U16.
 *
 * NOTE(review): len == 0 after reading from the fold buffer presumably tells
 * the caller not to advance its input pointer -- confirm at the call sites.
 * NOTE(review): uscan = foldbuf + UNISKIP(uvc) presumably skips the first
 * folded character, assuming to_uni_fold() returns it -- confirm.
 *
 * Arguments are pasted without extra parentheses and several are both read
 * and assigned, so they must be simple lvalues.
 */
1164 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1165 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1166     switch (trie_type) {                                                    \
1167     case trie_utf8_fold:                                                    \
1168         if ( foldlen>0 ) {                                                  \
1169             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1170             foldlen -= len;                                                 \
1171             uscan += len;                                                   \
1172             len=0;                                                          \
1173         } else {                                                            \
1174             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1175             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1176             foldlen -= UNISKIP( uvc );                                      \
1177             uscan = foldbuf + UNISKIP( uvc );                               \
1178         }                                                                   \
1179         break;                                                              \
1180     case trie_latin_utf8_fold:                                              \
1181         if ( foldlen>0 ) {                                                  \
1182             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1183             foldlen -= len;                                                 \
1184             uscan += len;                                                   \
1185             len=0;                                                          \
1186         } else {                                                            \
1187             len = 1;                                                        \
1188             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1189             foldlen -= UNISKIP( uvc );                                      \
1190             uscan = foldbuf + UNISKIP( uvc );                               \
1191         }                                                                   \
1192         break;                                                              \
1193     case trie_utf8:                                                         \
1194         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1195         break;                                                              \
1196     case trie_plain:                                                        \
1197         uvc = (UV)*uc;                                                      \
1198         len = 1;                                                            \
1199     }                                                                       \
1200     if (uvc < 256) {                                                        \
1201         charid = trie->charmap[ uvc ];                                      \
1202     }                                                                       \
1203     else {                                                                  \
1204         charid = 0;                                                         \
1205         if (widecharmap) {                                                  \
1206             SV** const svpp = hv_fetch(widecharmap,                         \
1207                         (char*)&uvc, sizeof(UV), 0);                        \
1208             if (svpp)                                                       \
1209                 charid = (U16)SvIV(*svpp);                                  \
1210         }                                                                   \
1211     }                                                                       \
1212 } STMT_END
1213
/* REXEC_FBC_EXACTISH_SCAN(CoNd): step s one byte at a time while s <= e
 * (e itself is tested) and jump to got_it at the first position where all of
 * the following hold, evaluated in this order with short-circuiting:
 *
 *   1. CoNd is true;
 *   2. ln == 1, or folder(s, pat_string, ln) is true;
 *   3. reginfo is NULL, or regtry(reginfo, &s) is true.
 *
 * If no position qualifies the loop ends with s == e + 1 and control falls
 * through.  Uses s, e, ln, folder, pat_string, reginfo and the label got_it
 * from the scope where it is expanded.
 */
1214 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1215 STMT_START {                                              \
1216     while (s <= e) {                                      \
1217         if ( (CoNd)                                       \
1218              && (ln == 1 || folder(s, pat_string, ln))    \
1219              && (!reginfo || regtry(reginfo, &s)) )       \
1220             goto got_it;                                  \
1221         s++;                                              \
1222     }                                                     \
1223 } STMT_END
1224
/* REXEC_FBC_UTF8_SCAN(CoDe): run the statement(s) CoDe at each position s,
 * advancing by UTF8SKIP(s) bytes per step instead of one byte.
 *
 * The loop continues only while the whole step still fits, i.e. while
 * s + UTF8SKIP(s) <= strend.  uskip is assigned inside the loop test, before
 * CoDe runs, so the step applied after CoDe is the one computed for the
 * position s had when the test was evaluated.
 *
 * Uses s, strend and uskip from the scope where it is expanded.  CoDe is
 * pasted as-is and must be complete statement(s); it may leave the loop with
 * goto.
 */
1225 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1226 STMT_START {                                          \
1227     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1228         CoDe                                          \
1229         s += uskip;                                   \
1230     }                                                 \
1231 } STMT_END
1232
/* REXEC_FBC_SCAN(CoDe): run the statement(s) CoDe at each byte position s
 * while s < strend, incrementing s by one after each pass.
 *
 * Uses s and strend from the scope where it is expanded.  CoDe is pasted
 * as-is and must be complete statement(s); it may leave the loop with goto.
 */
1233 #define REXEC_FBC_SCAN(CoDe)                          \
1234 STMT_START {                                          \
1235     while (s < strend) {                              \
1236         CoDe                                          \
1237         s++;                                          \
1238     }                                                 \
1239 } STMT_END
1240
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): scan with REXEC_FBC_UTF8_SCAN, looking
 * for a position where CoNd holds and the match attempt succeeds.
 *
 * At each position:
 *   - CoNd true and tmp true: jump to got_it if reginfo is NULL or
 *     regtry(reginfo, &s) succeeds; otherwise set tmp = doevery.
 *   - CoNd true and tmp false: no attempt is made; tmp = doevery.
 *   - CoNd false: tmp = 1.
 *
 * So tmp gates the attempts: when doevery is 0, after one position with CoNd
 * true has been visited, later positions are skipped until CoNd has been
 * false at least once.
 *
 * Uses tmp, doevery, reginfo, s and the label got_it from the scope where it
 * is expanded, plus whatever REXEC_FBC_UTF8_SCAN needs.
 */
1241 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1242 REXEC_FBC_UTF8_SCAN(                                  \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
/* REXEC_FBC_CLASS_SCAN(CoNd): byte-wise scan for a position satisfying CoNd;
 * the loop body is the same as in REXEC_FBC_UTF8_CLASS_SCAN but s advances one
 * byte at a time via REXEC_FBC_SCAN.  When CoNd holds and tmp is non-zero the
 * position is tried (got_it is taken if reginfo is NULL or regtry succeeds);
 * any matching position that does not reach got_it sets tmp = doevery, and a
 * non-matching position resets tmp to 1. */
1253 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1254 REXEC_FBC_SCAN(                                       \
1255     if (CoNd) {                                       \
1256         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1257             goto got_it;                              \
1258         else                                          \
1259             tmp = doevery;                            \
1260     }                                                 \
1261     else                                              \
1262         tmp = 1;                                      \
1263 )
1264
/* REXEC_FBC_TRYIT: jump to got_it if this is a dry run (reginfo is NULL) or
 * regtry(reginfo, &s) succeeds at the current s.  The expansion is a bare
 * "if" with no trailing semicolon and no STMT_START/STMT_END wrapper: the
 * user supplies the ';' and must not follow the macro with an "else". */
1265 #define REXEC_FBC_TRYIT               \
1266 if ((!reginfo || regtry(reginfo, &s))) \
1267     goto got_it
1268
/* REXEC_FBC_CSCAN(CoNdUtF8, CoNd): class scan that picks its variant from
 * utf8_target: REXEC_FBC_UTF8_CLASS_SCAN with CoNdUtF8 when the target is
 * UTF-8, otherwise REXEC_FBC_CLASS_SCAN with CoNd.  Expands to a bare
 * if/else (not wrapped in STMT_START/STMT_END). */
1269 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1270     if (utf8_target) {                                             \
1271         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1272     }                                                          \
1273     else {                                                     \
1274         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1275     }
1276
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd, CoNdUtF8, CoNd): same dispatch as
 * REXEC_FBC_CSCAN, except that the statement UtFpReLoAd is executed once,
 * before the scan, and only on the UTF-8 path.  The users in this file pass
 * a LOAD_UTF8_CHARCLASS_*() call for the class that CoNdUtF8 then looks up. */
1277 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1278     if (utf8_target) {                                             \
1279         UtFpReLoAd;                                            \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }
1285
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8, CoNd): sets RF_tainted in PL_reg_flags and
 * then performs the same dispatch as REXEC_FBC_CSCAN.  Used in this file for
 * the locale-dependent (*L) classes.  Note that the expansion is two
 * statements (the flag update followed by the if/else) and is not wrapped in
 * STMT_START/STMT_END, so it must not be used as the body of an unbraced
 * "if" or loop. */
1286 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1287     PL_reg_flags |= RF_tainted;                                \
1288     if (utf8_target) {                                             \
1289         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1290     }                                                          \
1291     else {                                                     \
1292         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1293     }
1294
/* DUMP_EXEC_POS(li, s, doutf8): shorthand for dump_exec_pos() that passes
 * PL_regeol, PL_bostr and PL_reg_starttry as the third through fifth
 * arguments, so the user supplies only li, s and doutf8. */
1295 #define DUMP_EXEC_POS(li,s,doutf8) \
1296     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1297
1298
/* UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL): UTF-8 branch of the
 * boundary search for tests that only need to look at a single byte (used by
 * the FBC_*BOUND_NOLOAD macros).
 *   - tmp is seeded with TEST_NON_UTF8 applied to the byte before s, or to
 *     '\n' when s is at PL_bostr.
 *   - The string is then walked with REXEC_FBC_UTF8_SCAN.  At each position,
 *     if tmp equals the negation of TEST_NON_UTF8(first byte at s), tmp is
 *     flipped and IF_SUCCESS runs; otherwise IF_FAIL runs.
 * TEST_NON_UTF8 must be the name of a one-argument test; IF_SUCCESS and
 * IF_FAIL are statements (without the trailing ';').
 * NOTE(review): the last line ends with a continuation backslash, so the
 * definition formally extends onto the following line, which must stay
 * blank. */
1299 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1300         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1301         tmp = TEST_NON_UTF8(tmp);                                              \
1302         REXEC_FBC_UTF8_SCAN(                                                   \
1303             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1304                 tmp = !tmp;                                                    \
1305                 IF_SUCCESS;                                                    \
1306             }                                                                  \
1307             else {                                                             \
1308                 IF_FAIL;                                                       \
1309             }                                                                  \
1310         );                                                                     \
1311
/* UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL): UTF-8 branch of the
 * boundary search for tests that work on whole characters (used by FBC_BOUND
 * and FBC_NBOUND).
 *   - tmp is first set to the previous character: '\n' when s is at PL_bostr,
 *     otherwise the code point decoded by utf8n_to_uvchr() at the position
 *     reghop3() finds one character before s.
 *   - tmp is then replaced by TeSt1_UtF8, a complete expression that the
 *     users in this file write in terms of tmp.
 *   - LOAD_UTF8_CHARCLASS_ALNUM() is invoked before the scan.
 *   - The string is walked with REXEC_FBC_UTF8_SCAN.  At each position, if
 *     tmp equals the negation of TeSt2_UtF8 (a complete expression that the
 *     users write in terms of s), tmp is flipped and IF_SUCCESS runs;
 *     otherwise IF_FAIL runs.
 * NOTE(review): the last line ends with a continuation backslash, so the
 * definition formally extends onto the following line, which must stay
 * blank. */
1312 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1313         if (s == PL_bostr) {                                                   \
1314             tmp = '\n';                                                        \
1315         }                                                                      \
1316         else {                                                                 \
1317             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1318             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1319         }                                                                      \
1320         tmp = TeSt1_UtF8;                                                      \
1321         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1322         REXEC_FBC_UTF8_SCAN(                                                   \
1323             if (tmp == ! (TeSt2_UtF8)) { \
1324                 tmp = !tmp;                                                    \
1325                 IF_SUCCESS;                                                    \
1326             }                                                                  \
1327             else {                                                             \
1328                 IF_FAIL;                                                       \
1329             }                                                                  \
1330         );                                                                     \
1331
1332 /* The only difference between the BOUND and NBOUND cases is that
1333  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1334  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1335  * with the other one being empty */
/* FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8): boundary search whose
 * UTF-8 branch is UTF8_LOAD.  TEST_NON_UTF8 is the name of a one-argument
 * byte test; TEST1_UTF8 and TEST2_UTF8 are complete expressions forwarded to
 * UTF8_LOAD.  REXEC_FBC_TRYIT is placed in the IF_SUCCESS slot and
 * PLACEHOLDER (defined outside this chunk) in the IF_FAIL slot, for both
 * the UTF-8 and the non-UTF-8 branch. */
1336 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1337     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1338
/* FBC_BOUND_NOLOAD: like FBC_BOUND, but the UTF-8 branch is UTF8_NOLOAD, which
 * applies TEST_NON_UTF8 to single bytes.  TEST1_UTF8 and TEST2_UTF8 are
 * accepted but do not appear in the expansion. */
1339 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1340     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1341
/* FBC_NBOUND: non-boundary search whose UTF-8 branch is UTF8_LOAD.  Takes the
 * same arguments as FBC_BOUND but swaps the slots: PLACEHOLDER goes in
 * IF_SUCCESS and REXEC_FBC_TRYIT in IF_FAIL, so a try is made at positions
 * where the boundary comparison does not fire. */
1342 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1343     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1344
/* FBC_NBOUND_NOLOAD: like FBC_NBOUND, but the UTF-8 branch is UTF8_NOLOAD,
 * which applies TEST_NON_UTF8 to single bytes.  TEST1_UTF8 and TEST2_UTF8 are
 * accepted but do not appear in the expansion. */
1345 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1346     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1347
1348
1349 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1350  * be passed in completely with the variable name being tested, which isn't
1351  * such a clean interface, but this is easier to read than it was before.  We
1352  * are looking for the boundary (or non-boundary) between a word and non-word
1353  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1354  * must be different.  Find the "wordness" of the character just prior to this
1355  * one, and compare it with the wordness of this one.  If they differ, we have
1356  * a boundary.  At the beginning of the string, pretend that the previous
1357  * character was a new-line */
/* Parameters:
 *   UTF8_CODE      - statements run verbatim when utf8_target is true (the
 *                    users above pass a UTF8_LOAD or UTF8_NOLOAD expansion).
 *   TEST_NON_UTF8  - name of a one-argument byte test used on the non-UTF-8
 *                    path.
 *   IF_SUCCESS     - statement run where the wordness changes (non-UTF-8
 *                    path); tmp has already been flipped at that point.
 *   IF_FAIL        - statement run where it does not change.
 * After either scan finishes without reaching got_it, one more attempt is
 * made at the final s, but only when prog->minlen is 0 and tmp is non-zero;
 * this tail does not depend on IF_SUCCESS or IF_FAIL.
 * NOTE(review): the comparison "tmp == ! TEST(...)" detects a change only if
 * the seed value of tmp is exactly 0 or 1 - confirm that every test passed
 * as TEST_NON_UTF8 yields 0 or 1. */
1358 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1359     if (utf8_target) {                                                         \
1360                 UTF8_CODE \
1361     }                                                                          \
1362     else {  /* Not utf8 */                                                     \
1363         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1364         tmp = TEST_NON_UTF8(tmp);                                              \
1365         REXEC_FBC_SCAN(                                                        \
1366             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1367                 tmp = !tmp;                                                    \
1368                 IF_SUCCESS;                                                    \
1369             }                                                                  \
1370             else {                                                             \
1371                 IF_FAIL;                                                       \
1372             }                                                                  \
1373         );                                                                     \
1374     }                                                                          \
1375     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1376         goto got_it;
1377
1378 /* We know what class REx starts with.  Try to find this position... */
1379 /* if reginfo is NULL, it's a dry run */
1380 /* annoyingly all the vars in this routine have different names from their counterparts
1381    in regmatch. /grrr */
1382
1383 STATIC char *
1384 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1385     const char *strend, regmatch_info *reginfo)
1386 {
1387         dVAR;
1388         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1389         char *pat_string;   /* The pattern's exactish string */
1390         char *pat_end;      /* ptr to end char of pat_string */
1391         re_fold_t folder;       /* Function for computing non-utf8 folds */
1392         const U8 *fold_array;   /* array for folding ords < 256 */
1393         STRLEN ln;
1394         STRLEN lnc;
1395         register STRLEN uskip;
1396         U8 c1;
1397         U8 c2;
1398         char *e;
1399         register I32 tmp = 1;   /* Scratch variable? */
1400         register const bool utf8_target = PL_reg_match_utf8;
1401         UV utf8_fold_flags = 0;
1402         RXi_GET_DECL(prog,progi);
1403
1404         PERL_ARGS_ASSERT_FIND_BYCLASS;
1405
1406         /* We know what class it must start with. */
1407         switch (OP(c)) {
1408         case ANYOFV:
1409         case ANYOF:
1410             if (utf8_target || OP(c) == ANYOFV) {
1411                 STRLEN inclasslen = strend - s;
1412                 REXEC_FBC_UTF8_CLASS_SCAN(
1413                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1414             }
1415             else {
1416                  while (s < strend) {
1417                       STRLEN skip = 1;
1418
1419                       if (REGINCLASS(prog, c, (U8*)s)) {
1420                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1421                                 goto got_it;
1422                            else
1423                                 tmp = doevery;
1424                       }
1425                       else
1426                            tmp = 1;
1427                       s += skip;
1428                  }
1429             }
1430             break;
1431         case CANY:
1432             REXEC_FBC_SCAN(
1433                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1434                     goto got_it;
1435                 else
1436                     tmp = doevery;
1437             );
1438             break;
1439
1440         case EXACTFA:
1441             if (UTF_PATTERN || utf8_target) {
1442                 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1443                 goto do_exactf_utf8;
1444             }
1445             fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1446             folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1447             goto do_exactf_non_utf8;        /* isn't dealt with by these */
1448
1449         case EXACTFU:
1450             if (UTF_PATTERN || utf8_target) {
1451                 utf8_fold_flags = 0;
1452                 goto do_exactf_utf8;
1453             }
1454             fold_array = PL_fold_latin1;
1455             folder = foldEQ_latin1;
1456             /* XXX This uses the full utf8 fold because if the pattern contains
1457              * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1458              * There could be a new node type, say EXACTFU_SS, which is
1459              * generated by regcomp only if there is an 'ss', and then every
1460              * other case could goto do_exactf_non_utf8;*/
1461             goto do_exactf_utf8;
1462
1463         case EXACTF:
1464             if (UTF_PATTERN || utf8_target) {
1465                 utf8_fold_flags = 0;
1466                 goto do_exactf_utf8;
1467             }
1468             fold_array = PL_fold;
1469             folder = foldEQ;
1470             goto do_exactf_non_utf8;
1471
1472         case EXACTFL:
1473             if (UTF_PATTERN || utf8_target) {
1474                 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1475                 goto do_exactf_utf8;
1476             }
1477             fold_array = PL_fold_locale;
1478             folder = foldEQ_locale;
1479
1480             /* FALL THROUGH */
1481
1482         do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1483
1484             /* The idea in the non-utf8 EXACTF* cases is to first find the
1485              * first character of the EXACTF* node and then, if necessary,
1486              * case-insensitively compare the full text of the node.  c1 is the
1487              * first character.  c2 is its fold.  This logic will not work for
1488              * Unicode semantics and the german sharp ss, which hence should
1489              * not be compiled into a node that gets here. */
1490             pat_string = STRING(c);
1491             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1492
1493             e = HOP3c(strend, -((I32)ln), s);
1494
1495             if (!reginfo && e < s) {
1496                 e = s;                  /* Due to minlen logic of intuit() */
1497             }
1498
1499             c1 = *pat_string;
1500             c2 = fold_array[c1];
1501             if (c1 == c2) { /* If char and fold are the same */
1502                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1503             }
1504             else {
1505                 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1506             }
1507             break;
1508
1509         do_exactf_utf8:
1510
1511             /* If one of the operands is in utf8, we can't use the simpler
1512              * folding above, due to the fact that many different characters
1513              * can have the same fold, or portion of a fold, or different-
1514              * length fold */
1515             pat_string = STRING(c);
1516             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1517             pat_end = pat_string + ln;
1518             lnc = (UTF_PATTERN) /* length to match in characters */
1519                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1520                     : ln;
1521
1522             e = HOP3c(strend, -((I32)lnc), s);
1523
1524             if (!reginfo && e < s) {
1525                 e = s;                  /* Due to minlen logic of intuit() */
1526             }
1527
1528             while (s <= e) {
1529                 char *my_strend= (char *)strend;
1530                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1531                       pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1532                     && (!reginfo || regtry(reginfo, &s)) )
1533                 {
1534                     goto got_it;
1535                 }
1536                 s += UTF8SKIP(s);
1537             }
1538             break;
1539         case BOUNDL:
1540             PL_reg_flags |= RF_tainted;
1541             FBC_BOUND(isALNUM_LC,
1542                       isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1543                       isALNUM_LC_utf8((U8*)s));
1544             break;
1545         case NBOUNDL:
1546             PL_reg_flags |= RF_tainted;
1547             FBC_NBOUND(isALNUM_LC,
1548                        isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1549                        isALNUM_LC_utf8((U8*)s));
1550             break;
1551         case BOUND:
1552             FBC_BOUND(isWORDCHAR,
1553                       isALNUM_uni(tmp),
1554                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1555             break;
1556         case BOUNDA:
1557             FBC_BOUND_NOLOAD(isWORDCHAR_A,
1558                              isWORDCHAR_A(tmp),
1559                              isWORDCHAR_A((U8*)s));
1560             break;
1561         case NBOUND:
1562             FBC_NBOUND(isWORDCHAR,
1563                        isALNUM_uni(tmp),
1564                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1565             break;
1566         case NBOUNDA:
1567             FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1568                               isWORDCHAR_A(tmp),
1569                               isWORDCHAR_A((U8*)s));
1570             break;
1571         case BOUNDU:
1572             FBC_BOUND(isWORDCHAR_L1,
1573                       isALNUM_uni(tmp),
1574                       cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1575             break;
1576         case NBOUNDU:
1577             FBC_NBOUND(isWORDCHAR_L1,
1578                        isALNUM_uni(tmp),
1579                        cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1580             break;
1581         case ALNUML:
1582             REXEC_FBC_CSCAN_TAINT(
1583                 isALNUM_LC_utf8((U8*)s),
1584                 isALNUM_LC(*s)
1585             );
1586             break;
1587         case ALNUMU:
1588             REXEC_FBC_CSCAN_PRELOAD(
1589                 LOAD_UTF8_CHARCLASS_ALNUM(),
1590                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1591                 isWORDCHAR_L1((U8) *s)
1592             );
1593             break;
1594         case ALNUM:
1595             REXEC_FBC_CSCAN_PRELOAD(
1596                 LOAD_UTF8_CHARCLASS_ALNUM(),
1597                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1598                 isWORDCHAR((U8) *s)
1599             );
1600             break;
1601         case ALNUMA:
1602             /* Don't need to worry about utf8, as it can match only a single
1603              * byte invariant character */
1604             REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1605             break;
1606         case NALNUMU:
1607             REXEC_FBC_CSCAN_PRELOAD(
1608                 LOAD_UTF8_CHARCLASS_ALNUM(),
1609                 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1610                 ! isWORDCHAR_L1((U8) *s)
1611             );
1612             break;
1613         case NALNUM:
1614             REXEC_FBC_CSCAN_PRELOAD(
1615                 LOAD_UTF8_CHARCLASS_ALNUM(),
1616                 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1617                 ! isALNUM(*s)
1618             );
1619             break;
1620         case NALNUMA:
1621             REXEC_FBC_CSCAN(
1622                 !isWORDCHAR_A(*s),
1623                 !isWORDCHAR_A(*s)
1624             );
1625             break;
1626         case NALNUML:
1627             REXEC_FBC_CSCAN_TAINT(
1628                 !isALNUM_LC_utf8((U8*)s),
1629                 !isALNUM_LC(*s)
1630             );
1631             break;
1632         case SPACEU:
1633             REXEC_FBC_CSCAN_PRELOAD(
1634                 LOAD_UTF8_CHARCLASS_SPACE(),
1635                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1636                 isSPACE_L1((U8) *s)
1637             );
1638             break;
1639         case SPACE:
1640             REXEC_FBC_CSCAN_PRELOAD(
1641                 LOAD_UTF8_CHARCLASS_SPACE(),
1642                 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1643                 isSPACE((U8) *s)
1644             );
1645             break;
1646         case SPACEA:
1647             /* Don't need to worry about utf8, as it can match only a single
1648              * byte invariant character */
1649             REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1650             break;
1651         case SPACEL:
1652             REXEC_FBC_CSCAN_TAINT(
1653                 isSPACE_LC_utf8((U8*)s),
1654                 isSPACE_LC(*s)
1655             );
1656             break;
1657         case NSPACEU:
1658             REXEC_FBC_CSCAN_PRELOAD(
1659                 LOAD_UTF8_CHARCLASS_SPACE(),
1660                 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1661                 ! isSPACE_L1((U8) *s)
1662             );
1663             break;
1664         case NSPACE:
1665             REXEC_FBC_CSCAN_PRELOAD(
1666                 LOAD_UTF8_CHARCLASS_SPACE(),
1667                 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1668                 ! isSPACE((U8) *s)
1669             );
1670             break;
1671         case NSPACEA:
1672             REXEC_FBC_CSCAN(
1673                 !isSPACE_A(*s),
1674                 !isSPACE_A(*s)
1675             );
1676             break;
1677         case NSPACEL:
1678             REXEC_FBC_CSCAN_TAINT(
1679                 !isSPACE_LC_utf8((U8*)s),
1680                 !isSPACE_LC(*s)
1681             );
1682             break;
1683         case DIGIT:
1684             REXEC_FBC_CSCAN_PRELOAD(
1685                 LOAD_UTF8_CHARCLASS_DIGIT(),
1686                 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1687                 isDIGIT(*s)
1688             );
1689             break;
1690         case DIGITA:
1691             /* Don't need to worry about utf8, as it can match only a single
1692              * byte invariant character */
1693             REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1694             break;
1695         case DIGITL:
1696             REXEC_FBC_CSCAN_TAINT(
1697                 isDIGIT_LC_utf8((U8*)s),
1698                 isDIGIT_LC(*s)
1699             );
1700             break;
1701         case NDIGIT:
1702             REXEC_FBC_CSCAN_PRELOAD(
1703                 LOAD_UTF8_CHARCLASS_DIGIT(),
1704                 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1705                 !isDIGIT(*s)
1706             );
1707             break;
1708         case NDIGITA:
1709             REXEC_FBC_CSCAN(
1710                 !isDIGIT_A(*s),
1711                 !isDIGIT_A(*s)
1712             );
1713             break;
1714         case NDIGITL:
1715             REXEC_FBC_CSCAN_TAINT(
1716                 !isDIGIT_LC_utf8((U8*)s),
1717                 !isDIGIT_LC(*s)
1718             );
1719             break;
1720         case LNBREAK:
1721             REXEC_FBC_CSCAN(
1722                 is_LNBREAK_utf8(s),
1723                 is_LNBREAK_latin1(s)
1724             );
1725             break;
1726         case VERTWS:
1727             REXEC_FBC_CSCAN(
1728                 is_VERTWS_utf8(s),
1729                 is_VERTWS_latin1(s)
1730             );
1731             break;
1732         case NVERTWS:
1733             REXEC_FBC_CSCAN(
1734                 !is_VERTWS_utf8(s),
1735                 !is_VERTWS_latin1(s)
1736             );
1737             break;
1738         case HORIZWS:
1739             REXEC_FBC_CSCAN(
1740                 is_HORIZWS_utf8(s),
1741                 is_HORIZWS_latin1(s)
1742             );
1743             break;
1744         case NHORIZWS:
1745             REXEC_FBC_CSCAN(
1746                 !is_HORIZWS_utf8(s),
1747                 !is_HORIZWS_latin1(s)
1748             );
1749             break;
1750         case AHOCORASICKC:
1751         case AHOCORASICK:
1752             {
1753                 DECL_TRIE_TYPE(c);
1754                 /* what trie are we using right now */
1755                 reg_ac_data *aho
1756                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1757                 reg_trie_data *trie
1758                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1759                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1760
1761                 const char *last_start = strend - trie->minlen;
1762 #ifdef DEBUGGING
1763                 const char *real_start = s;
1764 #endif
1765                 STRLEN maxlen = trie->maxlen;
1766                 SV *sv_points;
1767                 U8 **points; /* map of where we were in the input string
1768                                 when reading a given char. For ASCII this
1769                                 is unnecessary overhead as the relationship
1770                                 is always 1:1, but for Unicode, especially
1771                                 case folded Unicode this is not true. */
1772                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1773                 U8 *bitmap=NULL;
1774
1775
1776                 GET_RE_DEBUG_FLAGS_DECL;
1777
1778                 /* We can't just allocate points here. We need to wrap it in
1779                  * an SV so it gets freed properly if there is a croak while
1780                  * running the match */
1781                 ENTER;
1782                 SAVETMPS;
1783                 sv_points=newSV(maxlen * sizeof(U8 *));
1784                 SvCUR_set(sv_points,
1785                     maxlen * sizeof(U8 *));
1786                 SvPOK_on(sv_points);
1787                 sv_2mortal(sv_points);
1788                 points=(U8**)SvPV_nolen(sv_points );
1789                 if ( trie_type != trie_utf8_fold
1790                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1791                 {
1792                     if (trie->bitmap)
1793                         bitmap=(U8*)trie->bitmap;
1794                     else
1795                         bitmap=(U8*)ANYOF_BITMAP(c);
1796                 }
1797                 /* this is the Aho-Corasick algorithm modified a touch
1798                    to include special handling for long "unknown char"
1799                    sequences. The basic idea being that we use AC as long
1800                    as we are dealing with a possible matching char, when
1801                    we encounter an unknown char (and we have not encountered
1802                    an accepting state) we scan forward until we find a legal
1803                    starting char.
1804                    AC matching is basically that of trie matching, except
1805                    that when we encounter a failing transition, we fall back
1806                    to the current states "fail state", and try the current char
1807                    again, a process we repeat until we reach the root state,
1808                    state 1, or a legal transition. If we fail on the root state
1809                    then we can either terminate if we have reached an accepting
1810                    state previously, or restart the entire process from the beginning
1811                    if we have not.
1812
1813                  */
1814                 while (s <= last_start) {
1815                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1816                     U8 *uc = (U8*)s;
1817                     U16 charid = 0;
1818                     U32 base = 1;
1819                     U32 state = 1;
1820                     UV uvc = 0;
1821                     STRLEN len = 0;
1822                     STRLEN foldlen = 0;
1823                     U8 *uscan = (U8*)NULL;
1824                     U8 *leftmost = NULL;
1825 #ifdef DEBUGGING
1826                     U32 accepted_word= 0;
1827 #endif
1828                     U32 pointpos = 0;
1829
1830                     while ( state && uc <= (U8*)strend ) {
1831                         int failed=0;
1832                         U32 word = aho->states[ state ].wordnum;
1833
1834                         if( state==1 ) {
1835                             if ( bitmap ) {
1836                                 DEBUG_TRIE_EXECUTE_r(
1837                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1838                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1839                                             (char *)uc, utf8_target );
1840                                         PerlIO_printf( Perl_debug_log,
1841                                             " Scanning for legal start char...\n");
1842                                     }
1843                                 );
1844                                 if (utf8_target) {
1845                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1846                                         uc += UTF8SKIP(uc);
1847                                     }
1848                                 } else {
1849                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1850                                         uc++;
1851                                     }
1852                                 }
1853                                 s= (char *)uc;
1854                             }
1855                             if (uc >(U8*)last_start) break;
1856                         }
1857
1858                         if ( word ) {
1859                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1860                             if (!leftmost || lpos < leftmost) {
1861                                 DEBUG_r(accepted_word=word);
1862                                 leftmost= lpos;
1863                             }
1864                             if (base==0) break;
1865
1866                         }
1867                         points[pointpos++ % maxlen]= uc;
1868                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1869                                              uscan, len, uvc, charid, foldlen,
1870                                              foldbuf, uniflags);
1871                         DEBUG_TRIE_EXECUTE_r({
1872                             dump_exec_pos( (char *)uc, c, strend, real_start,
1873                                 s,   utf8_target );
1874                             PerlIO_printf(Perl_debug_log,
1875                                 " Charid:%3u CP:%4"UVxf" ",
1876                                  charid, uvc);
1877                         });
1878
1879                         do {
1880 #ifdef DEBUGGING
1881                             word = aho->states[ state ].wordnum;
1882 #endif
1883                             base = aho->states[ state ].trans.base;
1884
1885                             DEBUG_TRIE_EXECUTE_r({
1886                                 if (failed)
1887                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1888                                         s,   utf8_target );
1889                                 PerlIO_printf( Perl_debug_log,
1890                                     "%sState: %4"UVxf", word=%"UVxf,
1891                                     failed ? " Fail transition to " : "",
1892                                     (UV)state, (UV)word);
1893                             });
1894                             if ( base ) {
1895                                 U32 tmp;
1896                                 I32 offset;
1897                                 if (charid &&
1898                                      ( ((offset = base + charid
1899                                         - 1 - trie->uniquecharcount)) >= 0)
1900                                      && ((U32)offset < trie->lasttrans)
1901                                      && trie->trans[offset].check == state
1902                                      && (tmp=trie->trans[offset].next))
1903                                 {
1904                                     DEBUG_TRIE_EXECUTE_r(
1905                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1906                                     state = tmp;
1907                                     break;
1908                                 }
1909                                 else {
1910                                     DEBUG_TRIE_EXECUTE_r(
1911                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1912                                     failed = 1;
1913                                     state = aho->fail[state];
1914                                 }
1915                             }
1916                             else {
1917                                 /* we must be accepting here */
1918                                 DEBUG_TRIE_EXECUTE_r(
1919                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1920                                 failed = 1;
1921                                 break;
1922                             }
1923                         } while(state);
1924                         uc += len;
1925                         if (failed) {
1926                             if (leftmost)
1927                                 break;
1928                             if (!state) state = 1;
1929                         }
1930                     }
1931                     if ( aho->states[ state ].wordnum ) {
1932                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1933                         if (!leftmost || lpos < leftmost) {
1934                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1935                             leftmost = lpos;
1936                         }
1937                     }
1938                     if (leftmost) {
1939                         s = (char*)leftmost;
1940                         DEBUG_TRIE_EXECUTE_r({
1941                             PerlIO_printf(
1942                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1943                                 (UV)accepted_word, (IV)(s - real_start)
1944                             );
1945                         });
1946                         if (!reginfo || regtry(reginfo, &s)) {
1947                             FREETMPS;
1948                             LEAVE;
1949                             goto got_it;
1950                         }
1951                         s = HOPc(s,1);
1952                         DEBUG_TRIE_EXECUTE_r({
1953                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1954                         });
1955                     } else {
1956                         DEBUG_TRIE_EXECUTE_r(
1957                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1958                         break;
1959                     }
1960                 }
1961                 FREETMPS;
1962                 LEAVE;
1963             }
1964             break;
1965         default:
1966             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1967             break;
1968         }
1969         return 0;
1970       got_it:
1971         return s;
1972 }
1973
1974
1975 /*
1976  - regexec_flags - match a regexp against a string
      *
      * Tries the compiled pattern rx against the text running from stringarg
      * up to strend, choosing a search strategy from what the compiled
      * program offers:
      *   - patterns with an anchor flag other than RXf_ANCH_GPOS are tried at
      *     the start position and, for the multiline/implicit/MBOL cases, at
      *     later candidate positions;
      *   - patterns with RXf_GPOS_CHECK set are tried once, at
      *     reginfo.ganch - prog->gofs;
      *   - everything else is scanned using a known anchored/floating
      *     substring, a start class (find_byclass), or, failing all of
      *     those, one regtry() per position.
      *
      * Returns 1 on a match and 0 on failure.  Croaks if the program or the
      * start position is NULL, or if the program does not begin with
      * REG_MAGIC.
      *
      * On success (unless REXEC_NOT_FIRST is set) prog->subbeg/prog->sublen
      * are set up, copying the string when REXEC_COPY_STR is set.  On
      * failure, if a swap buffer was installed for prog->offs, the original
      * buffer is put back.
      *
      * flags bits consulted here: REXEC_IGNOREPOS, REXEC_CHECKED,
      * REXEC_SCREAM, REXEC_NOT_FIRST and REXEC_COPY_STR.
1977  */
1978 I32
1979 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1980               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1981 /* strend: pointer to null at end of string */
1982 /* strbeg: real beginning of string */
1983 /* minend: end of match must be >=minend after stringarg. */
1984 /* data: May be used for some additional optimizations.
1985          Currently it's only used, with a U32 cast, for transmitting
1986          the ganch offset when doing a /g match. This will change */
1987 /* nosave: For optimizations.  (Stale: this signature has no such parameter.) */
1988 {
1989     dVAR;
1990     struct regexp *const prog = (struct regexp *)SvANY(rx);
1991     /*register*/ char *s;
1992     register regnode *c;
1993     /*register*/ char *startpos = stringarg;
1994     I32 minlen;         /* must match at least this many chars */
1995     I32 dontbother = 0; /* how many characters not to try at end */
1996     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1997     I32 scream_pos = -1;                /* Internal iterator of scream. */
1998     char *scream_olds = NULL;
1999     const bool utf8_target = cBOOL(DO_UTF8(sv));
2000     I32 multiline;
2001     RXi_GET_DECL(prog,progi);
2002     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2003     regexp_paren_pair *swap = NULL;
2004     GET_RE_DEBUG_FLAGS_DECL;
2005
2006     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2007     PERL_UNUSED_ARG(data); /* NOTE(review): data IS read below when RXf_GPOS_SEEN is set */
2008
2009     /* Be paranoid... */
2010     if (prog == NULL || startpos == NULL) {
2011         Perl_croak(aTHX_ "NULL regexp parameter");
2012         return 0;
2013     }
2014
2015     multiline = prog->extflags & RXf_PMf_MULTILINE;
2016     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2017
2018     RX_MATCH_UTF8_set(rx, utf8_target);
2019     DEBUG_EXECUTE_r(
2020         debug_start_match(rx, utf8_target, startpos, strend,
2021         "Matching");
2022     );
2023
2024     minlen = prog->minlen;
2025
         /* Quick rejection: not enough text left for the minimum match length
            (adjusted by a negative check_offset_min, if any). */
2026     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2027         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2028                               "String too short [regexec_flags]...\n"));
2029         goto phooey;
2030     }
2031
2032
2033     /* Check validity of program. */
2034     if (UCHARAT(progi->program) != REG_MAGIC) {
2035         Perl_croak(aTHX_ "corrupted regexp program");
2036     }
2037
2038     PL_reg_flags = 0;
2039     PL_reg_eval_set = 0;
2040     PL_reg_maxiter = 0;
2041
2042     if (RX_UTF8(rx))
2043         PL_reg_flags |= RF_utf8;
2044
2045     /* Mark beginning of line for ^ and lookbehind. */
2046     reginfo.bol = startpos; /* XXX not used ??? */
2047     PL_bostr  = strbeg;
2048     reginfo.sv = sv;
2049
2050     /* Mark end of line for $ (and such) */
2051     PL_regeol = strend;
2052
2053     /* see how far we have to get to not match where we matched before */
2054     reginfo.till = startpos+minend;
2055
2056     /* If there is a "must appear" string, look for it. */
2057     s = startpos;
2058
2059     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2060         MAGIC *mg;
2061         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2062             reginfo.ganch = startpos + prog->gofs;
2063             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2064               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2065         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2066                   && SvMAGIC(sv)
2067                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2068                   && mg->mg_len >= 0) {
2069             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2070             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2071                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2072
2073             if (prog->extflags & RXf_ANCH_GPOS) {
2074                 if (s > reginfo.ganch)
2075                     goto phooey;
2076                 s = reginfo.ganch - prog->gofs;
2077                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2078                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2079                 if (s < strbeg)
2080                     goto phooey;
2081             }
2082         }
2083         else if (data) {
2084             reginfo.ganch = strbeg + PTR2UV(data);
2085             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2086                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2087
2088         } else {                                /* pos() not defined */
2089             reginfo.ganch = strbeg;
2090             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2091                  "GPOS: reginfo.ganch = strbeg\n"));
2092         }
2093     }
2094     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2095         /* We have to be careful. If the previous successful match
2096            was from this regex we don't want a subsequent partially
2097            successful match to clobber the old results.
2098            So when we detect this possibility we add a swap buffer
2099            to the re, and switch the buffer each match. If we fail
2100            we switch it back, otherwise we leave it swapped.
2101         */
2102         swap = prog->offs;
2103         /* do we need a save destructor here for eval dies? */
2104         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2105     }
         /* Unless REXEC_CHECKED is set (presumably meaning the caller has
            already done this check -- confirm), let re_intuit_start() use the
            check substring to pick a start position or reject outright. */
2106     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2107         re_scream_pos_data d;
2108
2109         d.scream_olds = &scream_olds;
2110         d.scream_pos = &scream_pos;
2111         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2112         if (!s) {
2113             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2114             goto phooey;        /* not present */
2115         }
2116     }
2117
2118
2119
2120     /* Simplest case:  anchored match need be tried only once. */
2121     /*  [unless only anchor is BOL and multiline is set] */
2122     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2123         if (s == startpos && regtry(&reginfo, &startpos))
2124             goto got_it;
2125         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2126                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2127         {
2128             char *end;
2129
2130             if (minlen)
2131                 dontbother = minlen - 1;
2132             end = HOP3c(strend, -dontbother, strbeg) - 1;
2133             /* for multiline we only have to try after newlines */
2134             if (prog->check_substr || prog->check_utf8) {
2135                 /* because of the goto we can not easily reuse the macros for bifurcating the
2136                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2137                 if (utf8_target) {
                         /* s == startpos was already tried above, so jump past
                            the first regtry() of the loop */
2138                     if (s == startpos)
2139                         goto after_try_utf8;
2140                     while (1) {
2141                         if (regtry(&reginfo, &s)) {
2142                             goto got_it;
2143                         }
2144                       after_try_utf8:
2145                         if (s > end) {
2146                             goto phooey;
2147                         }
2148                         if (prog->extflags & RXf_USE_INTUIT) {
2149                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2150                             if (!s) {
2151                                 goto phooey;
2152                             }
2153                         }
2154                         else {
2155                             s += UTF8SKIP(s);
2156                         }
2157                     }
2158                 } /* end search for check string in unicode */
2159                 else {
2160                     if (s == startpos) {
2161                         goto after_try_latin;
2162                     }
2163                     while (1) {
2164                         if (regtry(&reginfo, &s)) {
2165                             goto got_it;
2166                         }
2167                       after_try_latin:
2168                         if (s > end) {
2169                             goto phooey;
2170                         }
2171                         if (prog->extflags & RXf_USE_INTUIT) {
2172                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2173                             if (!s) {
2174                                 goto phooey;
2175                             }
2176                         }
2177                         else {
2178                             s++;
2179                         }
2180                     }
2181                 } /* end search for check string in latin*/
2182             } /* end search for check string */
2183             else { /* search for newline */
2184                 if (s > startpos) {
2185                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2186                     s--;
2187                 }
2188                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2189                 while (s < end) {
2190                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2191                         if (regtry(&reginfo, &s))
2192                             goto got_it;
2193                     }
2194                 }
2195             } /* end search for newline */
2196         } /* end anchored/multiline check string search */
2197         goto phooey;
2198     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2199     {
2200         /* the warning about reginfo.ganch being used without initialization
2201            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2202            and we only enter this block when the same bit is set. */
2203         char *tmp_s = reginfo.ganch - prog->gofs;
2204
2205         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2206             goto got_it;
2207         goto phooey;
2208     }
2209
2210     /* Messy cases:  unanchored match. */
2211     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2212         /* we have /x+whatever/ */
2213         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2214         char ch;
2215 #ifdef DEBUGGING
2216         int did_match = 0;
2217 #endif
2218         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2219             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2220         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2221
             /* After a failed try at a run of ch, skip the rest of that run:
                the inner while loops below step over every following ch. */
2222         if (utf8_target) {
2223             REXEC_FBC_SCAN(
2224                 if (*s == ch) {
2225                     DEBUG_EXECUTE_r( did_match = 1 );
2226                     if (regtry(&reginfo, &s)) goto got_it;
2227                     s += UTF8SKIP(s);
2228                     while (s < strend && *s == ch)
2229                         s += UTF8SKIP(s);
2230                 }
2231             );
2232         }
2233         else {
2234             REXEC_FBC_SCAN(
2235                 if (*s == ch) {
2236                     DEBUG_EXECUTE_r( did_match = 1 );
2237                     if (regtry(&reginfo, &s)) goto got_it;
2238                     s++;
2239                     while (s < strend && *s == ch)
2240                         s++;
2241                 }
2242             );
2243         }
2244         DEBUG_EXECUTE_r(if (!did_match)
2245                 PerlIO_printf(Perl_debug_log,
2246                                   "Did not find anchored character...\n")
2247                );
2248     }
2249     else if (prog->anchored_substr != NULL
2250               || prog->anchored_utf8 != NULL
2251               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2252                   && prog->float_max_offset < strend - s)) {
2253         SV *must;
2254         I32 back_max;
2255         I32 back_min;
2256         char *last;
2257         char *last1;            /* Last position checked before */
2258 #ifdef DEBUGGING
2259         int did_match = 0;
2260 #endif
             /* Choose the substring to search for ("must") and the min/max
                distance from a match start at which it can occur. */
2261         if (prog->anchored_substr || prog->anchored_utf8) {
2262             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2263                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2264             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2265             back_max = back_min = prog->anchored_offset;
2266         } else {
2267             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2268                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2269             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2270             back_max = prog->float_max_offset;
2271             back_min = prog->float_min_offset;
2272         }
2273
2274
2275         if (must == &PL_sv_undef)
2276             /* could not downgrade utf8 check substring, so must fail */
2277             goto phooey;
2278
2279         if (back_min<0) {
2280             last = strend;
2281         } else {
2282             last = HOP3c(strend,        /* Cannot start after this */
2283                   -(I32)(CHR_SVLEN(must)
2284                          - (SvTAIL(must) != 0) + back_min), strbeg);
2285         }
2286         if (s > PL_bostr)
2287             last1 = HOPc(s, -1);
2288         else
2289             last1 = s - 1;      /* bogus */
2290
2291         /* XXXX check_substr already used to find "s", can optimize if
2292            check_substr==must. */
2293         scream_pos = -1;
2294         dontbother = end_shift;
2295         strend = HOPc(strend, -dontbother);
             /* For each occurrence of "must", try every start position from
                which it could lie at an allowed offset, never revisiting
                positions at or before last1. */
2296         while ( (s <= last) &&
2297                 ((flags & REXEC_SCREAM)
2298                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2299                                     end_shift, &scream_pos, 0))
2300                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2301                                   (unsigned char*)strend, must,
2302                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2303             /* we may be pointing at the wrong string */
2304             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2305                 s = strbeg + (s - SvPVX_const(sv));
2306             DEBUG_EXECUTE_r( did_match = 1 );
2307             if (HOPc(s, -back_max) > last1) {
2308                 last1 = HOPc(s, -back_min);
2309                 s = HOPc(s, -back_max);
2310             }
2311             else {
2312                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2313
2314                 last1 = HOPc(s, -back_min);
2315                 s = t;
2316             }
2317             if (utf8_target) {
2318                 while (s <= last1) {
2319                     if (regtry(&reginfo, &s))
2320                         goto got_it;
2321                     s += UTF8SKIP(s);
2322                 }
2323             }
2324             else {
2325                 while (s <= last1) {
2326                     if (regtry(&reginfo, &s))
2327                         goto got_it;
2328                     s++;
2329                 }
2330             }
2331         }
2332         DEBUG_EXECUTE_r(if (!did_match) {
2333             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2334                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2335             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2336                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2337                                ? "anchored" : "floating"),
2338                 quoted, RE_SV_TAIL(must));
2339         });
2340         goto phooey;
2341     }
2342     else if ( (c = progi->regstclass) ) {
2343         if (minlen) {
2344             const OPCODE op = OP(progi->regstclass);
2345             /* don't bother with what can't match */
2346             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2347                 strend = HOPc(strend, -(minlen - 1));
2348         }
2349         DEBUG_EXECUTE_r({
2350             SV * const prop = sv_newmortal();
2351             regprop(prog, prop, c);
2352             {
2353                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2354                     s,strend-s,60);
2355                 PerlIO_printf(Perl_debug_log,
2356                     "Matching stclass %.*s against %s (%d bytes)\n",
2357                     (int)SvCUR(prop), SvPVX_const(prop),
2358                      quoted, (int)(strend - s));
2359             }
2360         });
2361         if (find_byclass(prog, c, s, strend, &reginfo))
2362             goto got_it;
2363         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2364     }
2365     else {
2366         dontbother = 0;
2367         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2368             /* Trim the end. */
2369             char *last;
2370             SV* float_real;
2371
2372             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2373                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2374             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2375
2376             if (flags & REXEC_SCREAM) {
2377                 last = screaminstr(sv, float_real, s - strbeg,
2378                                    end_shift, &scream_pos, 1); /* last one */
2379                 if (!last)
2380                     last = scream_olds; /* Only one occurrence. */
2381                 /* we may be pointing at the wrong string */
2382                 else if (RXp_MATCH_COPIED(prog))
2383                     s = strbeg + (s - SvPVX_const(sv));
2384             }
2385             else {
2386                 STRLEN len;
2387                 const char * const little = SvPV_const(float_real, len);
2388
2389                 if (SvTAIL(float_real)) {
2390                     if (memEQ(strend - len + 1, little, len - 1))
2391                         last = strend - len + 1;
2392                     else if (!multiline)
2393                         last = memEQ(strend - len, little, len)
2394                             ? strend - len : NULL;
2395                     else
2396                         goto find_last;
2397                 } else {
2398                   find_last:
2399                     if (len)
2400                         last = rninstr(s, strend, little, little + len);
2401                     else
2402                         last = strend;  /* matching "$" */
2403                 }
2404             }
2405             if (last == NULL) {
2406                 DEBUG_EXECUTE_r(
2407                     PerlIO_printf(Perl_debug_log,
2408                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2409                         PL_colors[4], PL_colors[5]));
2410                 goto phooey; /* Should not happen! */
2411             }
2412             dontbother = strend - last + prog->float_min_offset;
2413         }
2414         if (minlen && (dontbother < minlen))
2415             dontbother = minlen - 1;
2416         strend -= dontbother;              /* this one's always in bytes! */
2417         /* We don't know much -- general case. */
2418         if (utf8_target) {
2419             for (;;) {
2420                 if (regtry(&reginfo, &s))
2421                     goto got_it;
2422                 if (s >= strend)
2423                     break;
2424                 s += UTF8SKIP(s);
2425             };
2426         }
2427         else {
2428             do {
2429                 if (regtry(&reginfo, &s))
2430                     goto got_it;
2431             } while (s++ < strend);
2432         }
2433     }
2434
2435     /* Failure. */
2436     goto phooey;
2437
2438 got_it:
2439     Safefree(swap); /* matched: keep the freshly filled prog->offs, drop the saved buffer */
2440     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2441
2442     if (PL_reg_eval_set)
2443         restore_pos(aTHX_ prog);
2444     if (RXp_PAREN_NAMES(prog))
2445         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2446
2447     /* make sure $`, $&, $', and $digit will work later */
2448     if ( !(flags & REXEC_NOT_FIRST) ) {
2449         RX_MATCH_COPY_FREE(rx);
2450         if (flags & REXEC_COPY_STR) {
2451             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2452 #ifdef PERL_OLD_COPY_ON_WRITE
2453             if ((SvIsCOW(sv)
2454                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2455                 if (DEBUG_C_TEST) {
2456                     PerlIO_printf(Perl_debug_log,
2457                                   "Copy on write: regexp capture, type %d\n",
2458                                   (int) SvTYPE(sv));
2459                 }
2460                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2461                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2462                 assert (SvPOKp(prog->saved_copy));
2463             } else
2464 #endif
2465             {
2466                 RX_MATCH_COPIED_on(rx);
2467                 s = savepvn(strbeg, i);
2468                 prog->subbeg = s;
2469             }
2470             prog->sublen = i;
2471         }
2472         else {
2473             prog->subbeg = strbeg;
2474             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2475         }
2476     }
2477
2478     return 1;
2479
2480 phooey:
2481     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2482                           PL_colors[4], PL_colors[5]));
2483     if (PL_reg_eval_set)
2484         restore_pos(aTHX_ prog);
2485     if (swap) {
2486         /* we failed :-( roll it back */
2487         Safefree(prog->offs);
2488         prog->offs = swap;
2489     }
2490
2491     return 0;
2492 }
2493
2494
2495 /*
2496  - regtry - try match at specific point
      *
      * Runs regmatch() on the program of reginfo->prog with the input
      * position set to *startpos.
      *
      * Before the attempt it clears reginfo->cutpoint, records the start
      * offset in prog->offs[0].start, zeroes lastparen/lastcloseparen, grows
      * PL_reg_start_tmp if it has no more than prog->nparens entries, and
      * marks the capture pairs as unset (-1).
      *
      * The first time it is called for a pattern with RXf_EVAL_SEEN while
      * PL_reg_eval_set is still false, it also performs one-off setup:
      * saves stack/tmps state, makes reginfo->sv the default SV, makes sure
      * that SV carries PERL_MAGIC_regex_global magic (remembering the old
      * mg_len in PL_reg_oldpos and registering restore_pos as a destructor),
      * installs PL_reg_curpm as PL_curpm, and points prog->subbeg/sublen at
      * the string being matched.
      *
      * Returns 1 on success, after storing the end offset in
      * PL_regoffs[0].end.  Returns 0 on failure, after unwinding to the
      * checkpoint taken before the attempt; in that case, if
      * reginfo->cutpoint was set during the attempt, *startpos is replaced
      * by it.
2497  */
2498 STATIC I32                      /* 0 failure, 1 success */
2499 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2500 {
2501     dVAR;
2502     CHECKPOINT lastcp;
2503     REGEXP *const rx = reginfo->prog;
2504     regexp *const prog = (struct regexp *)SvANY(rx);
2505     RXi_GET_DECL(prog,progi);
2506     GET_RE_DEBUG_FLAGS_DECL;
2507
2508     PERL_ARGS_ASSERT_REGTRY;
2509
2510     reginfo->cutpoint=NULL;
2511
         /* One-off setup, done only while PL_reg_eval_set is still false */
2512     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2513         MAGIC *mg;
2514
2515         PL_reg_eval_set = RS_init;
2516         DEBUG_EXECUTE_r(DEBUG_s(
2517             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2518                           (IV)(PL_stack_sp - PL_stack_base));
2519             ));
2520         SAVESTACK_CXPOS();
2521         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2522         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2523         SAVETMPS;
2524         /* Apparently this is not needed, judging by wantarray. */
2525         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2526            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2527
2528         if (reginfo->sv) {
2529             /* Make $_ available to executed code. */
2530             if (reginfo->sv != DEFSV) {
2531                 SAVE_DEFSV;
2532                 DEFSV_set(reginfo->sv);
2533             }
2534
                 /* Reuse existing regex_global magic on the SV, or attach a
                    fresh one whose mg_len of -1 marks "no position set" */
2535             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2536                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2537                 /* prepare for quick setting of pos */
2538 #ifdef PERL_OLD_COPY_ON_WRITE
2539                 if (SvIsCOW(reginfo->sv))
2540                     sv_force_normal_flags(reginfo->sv, 0);
2541 #endif
2542                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2543                                  &PL_vtbl_mglob, NULL, 0);
2544                 mg->mg_len = -1;
2545             }
2546             PL_reg_magic    = mg;
2547             PL_reg_oldpos   = mg->mg_len;
2548             SAVEDESTRUCTOR_X(restore_pos, prog);
2549         }
2550         if (!PL_reg_curpm) {
2551             Newxz(PL_reg_curpm, 1, PMOP);
2552 #ifdef USE_ITHREADS
2553             {
2554                 SV* const repointer = &PL_sv_undef;
2555                 /* this regexp is also owned by the new PL_reg_curpm, which
2556                    will try to free it.  */
2557                 av_push(PL_regex_padav, repointer);
2558                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2559                 PL_regex_pad = AvARRAY(PL_regex_padav);
2560             }
2561 #endif
2562         }
2563 #ifdef USE_ITHREADS
2564         /* It seems that non-ithreads works both with and without this code.
2565            So for efficiency reasons it seems best not to have the code
2566            compiled when it is not needed.  */
2567         /* This is safe against NULLs: */
2568         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2569         /* PL_reg_curpm owns a reference to this regexp.  */
2570         ReREFCNT_inc(rx);
2571 #endif
2572         PM_SETRE(PL_reg_curpm, rx);
2573         PL_reg_oldcurpm = PL_curpm;
2574         PL_curpm = PL_reg_curpm;
2575         if (RXp_MATCH_COPIED(prog)) {
2576             /*  Here is a serious problem: we cannot rewrite subbeg,
2577                 since it may be needed if this match fails.  Thus
2578                 $` inside (?{}) could fail... */
2579             PL_reg_oldsaved = prog->subbeg;
2580             PL_reg_oldsavedlen = prog->sublen;
2581 #ifdef PERL_OLD_COPY_ON_WRITE
2582             PL_nrs = prog->saved_copy;
2583 #endif
2584             RXp_MATCH_COPIED_off(prog);
2585         }
2586         else
2587             PL_reg_oldsaved = NULL;
2588         prog->subbeg = PL_bostr;
2589         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2590     }
2591     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2592     prog->offs[0].start = *startpos - PL_bostr;
2593     PL_reginput = *startpos;
2594     PL_reglastparen = &prog->lastparen;
2595     PL_reglastcloseparen = &prog->lastcloseparen;
2596     prog->lastparen = 0;
2597     prog->lastcloseparen = 0;
2598     PL_regsize = 0;
2599     PL_regoffs = prog->offs;
         /* Make sure PL_reg_start_tmp has more than nparens slots */
2600     if (PL_reg_start_tmpl <= prog->nparens) {
2601         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2602         if(PL_reg_start_tmp)
2603             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2604         else
2605             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2606     }
2607
2608     /* XXXX What this code is doing here?!!!  There should be no need
2609        to do this again and again, PL_reglastparen should take care of
2610        this!  --ilya*/
2611
2612     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2613      * Actually, the code in regcppop() (which Ilya may be meaning by
2614      * PL_reglastparen), is not needed at all by the test suite
2615      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2616      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2617      * Meanwhile, this code *is* needed for the
2618      * above-mentioned test suite tests to succeed.  The common theme
2619      * on those tests seems to be returning null fields from matches.
2620      * --jhi updated by dapm */
2621 #if 1
2622     if (prog->nparens) {
2623         regexp_paren_pair *pp = PL_regoffs;
2624         register I32 i;
             /* lastparen was zeroed just above and pp is advanced before each
                store, so this resets pairs 1..nparens and leaves pair 0 alone */
2625         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2626             ++pp;
2627             pp->start = -1;
2628             pp->end = -1;
2629         }
2630     }
2631 #endif
2632     REGCP_SET(lastcp);
2633     if (regmatch(reginfo, progi->program + 1)) {
2634         PL_regoffs[0].end = PL_reginput - PL_bostr;
2635         return 1;
2636     }
         /* Failed: hand any cutpoint recorded during the attempt back to the
            caller through *startpos, then unwind to the checkpoint */
2637     if (reginfo->cutpoint)
2638         *startpos= reginfo->cutpoint;
2639     REGCP_UNWIND(lastcp);
2640     return 0;
2641 }
2642
2643
2644 #define sayYES goto yes
2645 #define sayNO goto no
2646 #define sayNO_SILENT goto no_silent
     /* The three macros above are bare gotos; they rely on labels named
        yes, no and no_silent existing in the function that uses them. */
2647
2648 /* we don't use STMT_START/END here because it leads to
2649    "unreachable code" warnings, which are bogus, but distracting.
        Consequence: CACHEsayNO expands to two statements (an unbraced if
        followed by a goto), so it must not be used as the sole body of an
        unbraced if/else/loop.  It reads ST.cache_mask and ST.cache_offset
        and ORs the mask into PL_reg_poscache before jumping to 'no'. */
2650 #define CACHEsayNO \
2651     if (ST.cache_mask) \
2652        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2653     sayNO
2654
2655 /* this is used to determine how far from the left messages like
2656    'failed...' are printed. It should be set such that messages
2657    are inline with the regop output that created them.
2658 */
2659 #define REPORT_CODE_OFF 32
2660
2661
2662 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2663 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2664
     /* addresses of the first and last regmatch_state slots in slab s */
2665 #define SLAB_FIRST(s) (&(s)->states[0])
2666 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2667
2668 /* grab a new slab and return the first slot in it
      *
      * Moves PL_regmatch_slab on to the slab that follows it in the chain.
      * If there is no following slab yet, one is allocated with Newx() and
      * linked in after the current slab; a slab that is already chained is
      * simply reused.
      *
      * Returns the address of the first state slot (SLAB_FIRST) of the slab
      * that is now current.  PL_regmatch_state is not touched here; the
      * caller is expected to store the returned pointer itself.
      */
2669
2670 STATIC regmatch_state *
2671 S_push_slab(pTHX)
2672 {
2673 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2674     dMY_CXT;
2675 #endif
2676     regmatch_slab *s = PL_regmatch_slab->next;
2677     if (!s) {
             /* end of the chain: allocate a slab and append it */
2678         Newx(s, 1, regmatch_slab);
2679         s->prev = PL_regmatch_slab;
2680         s->next = NULL;
2681         PL_regmatch_slab->next = s;
2682     }
         /* the next slab (new or reused) becomes the current one */
2683     PL_regmatch_slab = s;
2684     return SLAB_FIRST(s);
2685 }
2686
2687
2688 /* push a new state then goto it
      *
      * Points 'scan' at 'node', records 'state' as the resume_state of the
      * current backtrack state (st), then jumps to the push_state label,
      * which must exist in the function using the macro.
      *
      * The expansion is three bare statements (already ending in ';'), not
      * a single block, so it must only be used where a statement sequence
      * is allowed - never as the unbraced body of an if/else/loop. */
2689
2690 #define PUSH_STATE_GOTO(state, node) \
2691     scan = node; \
2692     st->resume_state = state; \
2693     goto push_state;
2694
2695 /* push a new state with success backtracking, then goto it
      *
      * Same as PUSH_STATE_GOTO except that control is transferred to the
      * push_yes_state label.  The same caveat about the unbraced
      * multi-statement expansion applies. */
2696
2697 #define PUSH_YES_STATE_GOTO(state, node) \
2698     scan = node; \
2699     st->resume_state = state; \
2700     goto push_yes_state;
2701
2702
2703
2704 /*
2705
2706 regmatch() - main matching routine
2707
2708 This is basically one big switch statement in a loop. We execute an op,
2709 set 'next' to point to the next op, and continue. If we come to a point which
2710 we may need to backtrack to on failure such as (A|B|C), we push a
2711 backtrack state onto the backtrack stack. On failure, we pop the top
2712 state, and re-enter the loop at the state indicated. If there are no more
2713 states to pop, we return failure.
2714
2715 Sometimes we also need to backtrack on success; for example /A+/, where
2716 after successfully matching one A, we need to go back and try to
2717 match another one; similarly for lookahead assertions: if the assertion
2718 completes successfully, we backtrack to the state just before the assertion
2719 and then carry on.  In these cases, the pushed state is marked as
2720 'backtrack on success too'. This marking is in fact done by a chain of
2721 pointers, each pointing to the previous 'yes' state. On success, we pop to
2722 the nearest yes state, discarding any intermediate failure-only states.
2723 Sometimes a yes state is pushed just to force some cleanup code to be
2724 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2725 it to free the inner regex.
2726
2727 Note that failure backtracking rewinds the cursor position, while
2728 success backtracking leaves it alone.
2729
2730 A pattern is complete when the END op is executed, while a subpattern
2731 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2732 ops trigger the "pop to last yes state if any, otherwise return true"
2733 behaviour.
2734
2735 A common convention in this function is to use A and B to refer to the two
2736 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2737 the subpattern to be matched possibly multiple times, while B is the entire
2738 rest of the pattern. Variable and state names reflect this convention.
2739
2740 The states in the main switch are the union of ops and failure/success of
2741 substates associated with that op.  For example, IFMATCH is the op
2742 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2743 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2744 successfully matched A and IFMATCH_A_fail is a state saying that we have
2745 just failed to match A. Resume states always come in pairs. The backtrack
2746 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2747 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2748 on success or failure.
2749
2750 The struct that holds a backtracking state is actually a big union, with
2751 one variant for each major type of op. The variable st points to the
2752 top-most backtrack struct. To make the code clearer, within each
2753 block of code we #define ST to alias the relevant union.
2754
2755 Here's a concrete example of a (vastly oversimplified) IFMATCH
2756 implementation:
2757
2758     switch (state) {
2759     ....
2760
2761 #define ST st->u.ifmatch
2762
2763     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2764         ST.foo = ...; // some state we wish to save
2765         ...
2766         // push a yes backtrack state with a resume value of
2767         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2768         // first node of A:
2769         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2770         // NOTREACHED
2771
2772     case IFMATCH_A: // we have successfully executed A; now continue with B
2773         next = B;
2774         bar = ST.foo; // do something with the preserved value
2775         break;
2776
2777     case IFMATCH_A_fail: // A failed, so the assertion failed
2778         ...;   // do some housekeeping, then ...
2779         sayNO; // propagate the failure
2780
2781 #undef ST
2782
2783     ...
2784     }
2785
2786 For any old-timers reading this who are familiar with the old recursive
2787 approach, the code above is equivalent to:
2788
2789     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2790     {
2791         int foo = ...
2792         ...
2793         if (regmatch(A)) {
2794             next = B;
2795             bar = foo;
2796             break;
2797         }
2798         ...;   // do some housekeeping, then ...
2799         sayNO; // propagate the failure
2800     }
2801
2802 The topmost backtrack state, pointed to by st, is usually free. If you
2803 want to claim it, populate any ST.foo fields in it with values you wish to
2804 save, then do one of
2805
2806         PUSH_STATE_GOTO(resume_state, node);
2807         PUSH_YES_STATE_GOTO(resume_state, node);
2808
2809 which sets that backtrack state's resume value to 'resume_state', pushes a
2810 new free entry to the top of the backtrack stack, then goes to 'node'.
2811 On backtracking, the free slot is popped, and the saved state becomes the
2812 new free state. An ST.foo field in this new top state can be temporarily
2813 accessed to retrieve values, but once the main loop is re-entered, it
2814 becomes available for reuse.
2815
2816 Note that the depth of the backtrack stack constantly increases during the
2817 left-to-right execution of the pattern, rather than going up and down with
2818 the pattern nesting. For example the stack is at its maximum at Z at the
2819 end of the pattern, rather than at X in the following:
2820
2821     /(((X)+)+)+....(Y)+....Z/
2822
2823 The only exceptions to this are lookahead/behind assertions and the cut,
2824 (?>A), which pop all the backtrack states associated with A before
2825 continuing.
2826
2827 Backtrack state structs are allocated in slabs of about 4K in size.
2828 PL_regmatch_state and st always point to the currently active state,
2829 and PL_regmatch_slab points to the slab currently containing
2830 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2831 allocated, and is never freed until interpreter destruction. When the slab
2832 is full, a new one is allocated and chained to the end. At exit from
2833 regmatch(), slabs allocated since entry are freed.
2834
2835 */
2836
2837
     /* Debug helper: inside DEBUG_STATE_r, dump the current execution
      * position (DUMP_EXEC_POS) and then print one line made up of an indent
      * of depth*2 spaces, the text 'pp', and the PL_reg_name[] entry for the
      * resume_state of the current backtrack state.  The name is followed
      * by "[Y]", "[M]" or "[YM]" when st is the current yes_state and/or
      * mark_state.
      *
      * 'pp' must be a string literal, because it is pasted into the format
      * string.  The macro uses the locals locinput, scan, utf8_target,
      * depth, st, yes_state and mark_state of the calling function, and
      * already ends in ';'. */
2838 #define DEBUG_STATE_pp(pp)                                  \
2839     DEBUG_STATE_r({                                         \
2840         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2841         PerlIO_printf(Perl_debug_log,                       \
2842             "    %*s"pp" %s%s%s%s%s\n",                     \
2843             depth*2, "",                                    \
2844             PL_reg_name[st->resume_state],                     \
2845             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2846             ((st==yes_state) ? "Y" : ""),                   \
2847             ((st==mark_state) ? "M" : ""),                  \
2848             ((st==yes_state||st==mark_state) ? "]" : "")    \
2849         );                                                  \
2850     });
2851
2852
     /* Pointer difference x - prog as an int, or -1 when x is NULL.  'prog'
      * is taken from the scope in which the macro is used. */
2853 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2854
2855 #ifdef DEBUGGING
2856
     /* Debug output: announce a match attempt.
      *
      * Prints "<blurb> REx <pattern> against <string>" to Perl_debug_log,
      * wrapping "<blurb> REx" in PL_colors[4] / PL_colors[5].  If the
      * pattern and/or the target string is UTF-8, a second line of the form
      * "UTF-8 pattern and string..." says which of the two are.
      *
      *   prog        - the regexp; its precompiled text (RX_PRECOMP_const,
      *                 RX_PRELEN) is what gets displayed
      *   utf8_target - true if the target string is UTF-8
      *   start, end  - bounds of the target text to display
      *   blurb       - leading text of the message
      *
      * Only compiled when DEBUGGING is defined.
      */
2857 STATIC void
2858 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2859     const char *start, const char *end, const char *blurb)
2860 {
2861     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2862
2863     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2864
         /* set up the colour strings the first time they are needed */
2865     if (!PL_colorset)
2866             reginitcolors();
2867     {
             /* s0: displayable form of the pattern text.  The trailing 60 is
              * presumably a display-length limit - see RE_PV_QUOTED_DECL. */
2868         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2869             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2870
             /* s1: displayable form of the target text start..end */
2871         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2872             start, end - start, 60);
2873
2874         PerlIO_printf(Perl_debug_log,
2875             "%s%s REx%s %s against %s\n",
2876                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2877
             /* mention UTF-8 only when at least one side actually is UTF-8 */
2878         if (utf8_target||utf8_pat)
2879             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2880                 utf8_pat ? "pattern" : "",
2881                 utf8_pat && utf8_target ? " and " : "",
2882                 utf8_target ? "string" : ""
2883             );
2884     }
2885 }
2886
     /* Debug output: print the current match position together with a small
      * window of the target string around it.
      *
      * The output has the form
      *     <offset> <PREFIX0 PREFIX1> <TAIL>   |
      * where <offset> is locinput - loc_bostr, PREFIX0 is the displayed text
      * that lies before loc_reg_starttry, PREFIX1 is the text between
      * loc_reg_starttry and locinput, and TAIL is the text from locinput
      * onwards.  The literal "> <" separator is only emitted when no colours
      * are configured.  No newline is printed.
      *
      *   locinput         - current position in the target string
      *   scan             - current regnode (only its op is examined)
      *   loc_regeol       - end of the target string
      *   loc_bostr        - beginning of the target string
      *   loc_reg_starttry - where the current match attempt started
      *   utf8_target      - true if the target string is UTF-8
      *
      * Only compiled when DEBUGGING is defined.
      */
2887 STATIC void
2888 S_dump_exec_pos(pTHX_ const char *locinput,
2889                       const regnode *scan,
2890                       const char *loc_regeol,
2891                       const char *loc_bostr,
2892                       const char *loc_reg_starttry,
2893                       const bool utf8_target)
2894 {
2895     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2896     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
         /* l: number of bytes to show after locinput, at most taill for now */
2897     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2898     /* The part of the string before starttry has one color
2899        (pref0_len chars), between starttry and current
2900        position another one (pref_len - pref0_len chars),
2901        after the current position the third one.
2902        We assume that pref0_len <= pref_len, otherwise we
2903        decrease pref0_len.  */
         /* pref_len: number of bytes to show before locinput; it gets
          * whatever part of the 5 + taill budget the tail did not use */
2904     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2905         ? (5 + taill) - l : locinput - loc_bostr;
2906     int pref0_len;
2907
2908     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2909
         /* In UTF-8, widen the prefix until it starts on a character
          * boundary.  NOTE(review): this loop has no lower bound check
          * against loc_bostr; it appears to rely on the string being
          * well-formed UTF-8 - confirm. */
2910     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2911         pref_len++;
         /* part of the prefix that lies before loc_reg_starttry */
2912     pref0_len = pref_len  - (locinput - loc_reg_starttry);
         /* if the prefix turned out short, spend the spare width on the tail */
2913     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2914         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2915               ? (5 + taill) - pref_len : loc_regeol - locinput);
         /* don't end the tail in the middle of a UTF-8 character */
2916     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2917         l--;
         /* clamp pref0_len into the range 0..pref_len */
2918     if (pref0_len < 0)
2919         pref0_len = 0;
2920     if (pref0_len > pref_len)
2921         pref0_len = pref_len;
2922     {
             /* when the current op is CANY the text is displayed as
              * non-UTF-8 even if the target string is UTF-8 */
2923         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2924
             /* s0: prefix text before loc_reg_starttry (colours 4/5) */
2925         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2926             (locinput - pref_len),pref0_len, 60, 4, 5);
2927
             /* s1: prefix text from loc_reg_starttry to locinput (colours 2/3) */
2928         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2929                     (locinput - pref_len + pref0_len),
2930                     pref_len - pref0_len, 60, 2, 3);
2931
             /* s2: text from locinput onwards (colours 0/1) */
2932         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2933                     locinput, loc_regeol - locinput, 10, 0, 1);
2934
             /* pad with spaces so the '|' lines up when the three pieces
              * together are shorter than 19 */
2935         const STRLEN tlen=len0+len1+len2;
2936         PerlIO_printf(Perl_debug_log,
2937                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2938                     (IV)(locinput - loc_bostr),
2939                     len0, s0,
2940                     len1, s1,
2941                     (docolor ? "" : "> <"),
2942                     len2, s2,
2943                     (int)(tlen > 19 ? 0 :  19 - tlen),
2944                     "");
2945     }
2946 }
2947
2948 #endif
2949
2950 /* reg_check_named_buff_matched()
2951  * Checks to see if a named buffer has matched. The data array of
2952  * buffer numbers corresponding to the buffer is expected to reside
2953  * in the regexp->data->data array in the slot stored in the ARG() of
2954  * node involved. Note that this routine doesn't actually care about the
2955  * name, that information is not preserved from compilation to execution.
2956  * Returns the index of the leftmost defined buffer with the given name
2957  * or 0 if none of the buffers matched.
      *
      * The candidate buffer numbers are read as an I32 array from the PV of
      * that data SV, and the SV's IV gives the number of entries.  A buffer
      * counts as matched when its number does not exceed *PL_reglastparen
      * and its end offset in PL_regoffs is not -1.  Candidates are tested in
      * array order and the first one that qualifies is returned.
2958  */
2959 STATIC I32
2960 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2961 {
2962     I32 n;
2963     RXi_GET_DECL(rex,rexi);
         /* the SV holding the list of buffer numbers for this node */
2964     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2965     I32 *nums=(I32*)SvPVX(sv_dat);
2966
2967     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2968
2969     for ( n=0; n<SvIVX(sv_dat); n++ ) {
             /* the buffer must be within the range closed so far and must
              * have an end offset recorded */
2970         if ((I32)*PL_reglastparen >= nums[n] &&
2971             PL_regoffs[nums[n]].end != -1)
2972         {
2973             return nums[n];
2974         }
2975     }
         /* no candidate buffer has matched */
2976     return 0;
2977 }
2978
2979
2980 /* free all slabs above current one  - called during LEAVE_SCOPE
      *
      * Registered by S_regmatch() as a savestack destructor (via
      * SAVEDESTRUCTOR_X).  Every slab chained after PL_regmatch_slab is
      * released with Safefree(), and the current slab's next pointer is
      * reset to NULL.  The current slab itself is kept.
      *
      *   p - unused; present only to match the destructor signature
      */
2981
2982 STATIC void
2983 S_clear_backtrack_stack(pTHX_ void *p)
2984 {
2985     regmatch_slab *s = PL_regmatch_slab->next;
2986     PERL_UNUSED_ARG(p);
2987
         /* nothing chained after the current slab: nothing to free */
2988     if (!s)
2989         return;
         /* detach the tail first, then free it slab by slab */
2990     PL_regmatch_slab->next = NULL;
2991     while (s) {
2992         regmatch_slab * const osl = s;
             /* read the link before the slab is freed */
2993         s = s->next;
2994         Safefree(osl);
2995     }
2996 }
2997
2998
     /* Assign Re2 to Re1; when PL_reg_eval_set is true, Re2 is first also
      * handed to PM_SETRE for PL_reg_curpm.
      *
      * The expansion is an unbraced 'if' statement followed by a separate
      * assignment with no trailing semicolon, so it must be used as a full
      * statement inside a block, never as the unbraced body of an
      * if/else/loop.  Re2 is evaluated twice when PL_reg_eval_set is true,
      * so it should have no side effects. */
2999 #define SETREX(Re1,Re2) \
3000     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3001     Re1 = (Re2)
3002
3003 STATIC I32                      /* 0 failure, 1 success */
3004 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3005 {
3006 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3007     dMY_CXT;
3008 #endif
3009     dVAR;
3010     register const bool utf8_target = PL_reg_match_utf8;
3011     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3012     REGEXP *rex_sv = reginfo->prog;
3013     regexp *rex = (struct regexp *)SvANY(rex_sv);
3014     RXi_GET_DECL(rex,rexi);
3015     I32 oldsave;
3016     /* the current state. This is a cached copy of PL_regmatch_state */
3017     register regmatch_state *st;
3018     /* cache heavily used fields of st in registers */
3019     register regnode *scan;
3020     register regnode *next;
3021     register U32 n = 0; /* general value; init to avoid compiler warning */
3022     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3023     register char *locinput = PL_reginput;
3024     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3025
3026     bool result = 0;        /* return value of S_regmatch */
3027     int depth = 0;          /* depth of backtrack stack */
3028     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3029     const U32 max_nochange_depth =
3030         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3031         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3032     regmatch_state *yes_state = NULL; /* state to pop to on success of
3033                                                             subpattern */
3034     /* mark_state piggy backs on the yes_state logic so that when we unwind
3035        the stack on success we can update the mark_state as we go */
3036     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3037     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3038     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3039     U32 state_num;
3040     bool no_final = 0;      /* prevent failure from backtracking? */
3041     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3042     char *startpoint = PL_reginput;
3043     SV *popmark = NULL;     /* are we looking for a mark? */
3044     SV *sv_commit = NULL;   /* last mark name seen in failure */
3045     SV *sv_yes_mark = NULL; /* last mark name we have seen
3046                                during a successful match */
3047     U32 lastopen = 0;       /* last open we saw */
3048     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3049     SV* const oreplsv = GvSV(PL_replgv);
3050     /* these three flags are set by various ops to signal information to
3051      * the very next op. They have a useful lifetime of exactly one loop
3052      * iteration, and are not preserved or restored by state pushes/pops
3053      */
3054     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3055     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3056     int logical = 0;        /* the following EVAL is:
3057                                 0: (?{...})
3058                                 1: (?(?{...})X|Y)
3059                                 2: (??{...})
3060                                or the following IFMATCH/UNLESSM is:
3061                                 false: plain (?=foo)
3062                                 true:  used as a condition: (?(?=foo))
3063                             */
3064 #ifdef DEBUGGING
3065     GET_RE_DEBUG_FLAGS_DECL;
3066 #endif
3067
3068     PERL_ARGS_ASSERT_REGMATCH;
3069
3070     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3071             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3072     }));
3073     /* on first ever call to regmatch, allocate first slab */
3074     if (!PL_regmatch_slab) {
3075         Newx(PL_regmatch_slab, 1, regmatch_slab);
3076         PL_regmatch_slab->prev = NULL;
3077         PL_regmatch_slab->next = NULL;
3078         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3079     }
3080
3081     oldsave = PL_savestack_ix;
3082     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3083     SAVEVPTR(PL_regmatch_slab);
3084     SAVEVPTR(PL_regmatch_state);
3085
3086     /* grab next free state slot */
3087     st = ++PL_regmatch_state;
3088     if (st >  SLAB_LAST(PL_regmatch_slab))
3089         st = PL_regmatch_state = S_push_slab(aTHX);
3090
3091     /* Note that nextchr is a byte even in UTF */
3092     nextchr = UCHARAT(locinput);
3093     scan = prog;
3094     while (scan != NULL) {
3095
3096         DEBUG_EXECUTE_r( {
3097             SV * const prop = sv_newmortal();
3098             regnode *rnext=regnext(scan);
3099             DUMP_EXEC_POS( locinput, scan, utf8_target );
3100             regprop(rex, prop, scan);
3101
3102             PerlIO_printf(Perl_debug_log,
3103                     "%3"IVdf":%*s%s(%"IVdf")\n",
3104                     (IV)(scan - rexi->program), depth*2, "",
3105                     SvPVX_const(prop),
3106                     (PL_regkind[OP(scan)] == END || !rnext) ?
3107                         0 : (IV)(rnext - rexi->program));
3108         });
3109
3110         next = scan + NEXT_OFF(scan);
3111         if (next == scan)
3112             next = NULL;
3113         state_num = OP(scan);
3114
3115       reenter_switch:
3116
3117         assert(PL_reglastparen == &rex->lastparen);
3118         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3119         assert(PL_regoffs == rex->offs);
3120
3121         switch (state_num) {
3122         case BOL:
3123             if (locinput == PL_bostr)
3124             {
3125                 /* reginfo->till = reginfo->bol; */
3126                 break;
3127             }
3128             sayNO;
3129         case MBOL:
3130             if (locinput == PL_bostr ||
3131                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3132             {
3133                 break;
3134             }
3135             sayNO;
3136         case SBOL:
3137             if (locinput == PL_bostr)
3138                 break;
3139             sayNO;
3140         case GPOS:
3141             if (locinput == reginfo->ganch)
3142                 break;
3143             sayNO;
3144
3145         case KEEPS:
3146             /* update the startpoint */
3147             st->u.keeper.val = PL_regoffs[0].start;
3148             PL_reginput = locinput;
3149             PL_regoffs[0].start = locinput - PL_bostr;
3150             PUSH_STATE_GOTO(KEEPS_next, next);
3151             /*NOT-REACHED*/
3152         case KEEPS_next_fail:
3153             /* rollback the start point change */
3154             PL_regoffs[0].start = st->u.keeper.val;
3155             sayNO_SILENT;
3156             /*NOT-REACHED*/
3157         case EOL:
3158                 goto seol;
3159         case MEOL:
3160             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3161                 sayNO;
3162             break;
3163         case SEOL:
3164           seol:
3165             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3166                 sayNO;
3167             if (PL_regeol - locinput > 1)
3168                 sayNO;
3169             break;
3170         case EOS:
3171             if (PL_regeol != locinput)
3172                 sayNO;
3173             break;
3174         case SANY:
3175             if (!nextchr && locinput >= PL_regeol)
3176                 sayNO;
3177             if (utf8_target) {
3178                 locinput += PL_utf8skip[nextchr];
3179                 if (locinput > PL_regeol)
3180                     sayNO;
3181                 nextchr = UCHARAT(locinput);
3182             }
3183             else
3184                 nextchr = UCHARAT(++locinput);
3185             break;
3186         case CANY:
3187             if (!nextchr && locinput >= PL_regeol)
3188                 sayNO;
3189             nextchr = UCHARAT(++locinput);
3190             break;
3191         case REG_ANY:
3192             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3193                 sayNO;
3194             if (utf8_target) {
3195                 locinput += PL_utf8skip[nextchr];
3196                 if (locinput > PL_regeol)
3197                     sayNO;
3198                 nextchr = UCHARAT(locinput);
3199             }
3200             else
3201                 nextchr = UCHARAT(++locinput);
3202             break;
3203
3204 #undef  ST
3205 #define ST st->u.trie
3206         case TRIEC:
3207             /* In this case the charclass data is available inline so
3208                we can fail fast without a lot of extra overhead.
3209              */
3210             if (scan->flags == EXACT || !utf8_target) {
3211                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3212                     DEBUG_EXECUTE_r(
3213                         PerlIO_printf(Perl_debug_log,
3214                                   "%*s  %sfailed to match trie start class...%s\n",
3215                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3216                     );
3217                     sayNO_SILENT;
3218                     /* NOTREACHED */
3219                 }
3220             }
3221             /* FALL THROUGH */
3222         case TRIE:
3223             /* the basic plan of execution of the trie is:
3224              * At the beginning, run through all the states, and
3225              * find the longest-matching word. Also remember the position
3226              * of the shortest matching word. For example, this pattern:
3227              *    1  2 3 4    5
3228              *    ab|a|x|abcd|abc
3229              * when matched against the string "abcde", will generate
3230              * accept states for all words except 3, with the longest
3231              * matching word being 4, and the shortest being 1 (with
3232              * the position being after char 1 of the string).
3233              *
3234              * Then for each matching word, in word order (i.e. 1,2,4,5),
3235              * we run the remainder of the pattern; on each try setting
3236              * the current position to the character following the word,
3237              * returning to try the next word on failure.
3238              *
3239              * We avoid having to build a list of words at runtime by
3240              * using a compile-time structure, wordinfo[].prev, which
3241              * gives, for each word, the previous accepting word (if any).
3242              * In the case above it would contain the mappings 1->2, 2->0,
3243              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3244              * the longest word (4 above), a list of all words, by
3245              * following the list of prev pointers; this gives us the
3246              * unordered list 4,5,1,2. Then given the current word we have
3247              * just tried, we can go through the list and find the
3248              * next-biggest word to try (so if we just failed on word 2,
3249              * the next in the list is 4).
3250              *
3251              * Since at runtime we don't record the matching position in
3252              * the string for each word, we have to work that out for
3253              * each word we're about to process. The wordinfo table holds
3254              * the character length of each word; given that we recorded
3255              * at the start: the position of the shortest word and its
3256              * length in chars, we just need to move the pointer the
3257              * difference between the two char lengths. Depending on
3258              * Unicode status and folding, that's cheap or expensive.
3259              *
3260              * This algorithm is optimised for the case where there are only a
3261              * small number of accept states, i.e. 0,1, or maybe 2.
3262              * With lots of accept states, and having to try all of them,
3263              * it becomes quadratic on number of accept states to find all
3264              * the next words.
3265              */
3266
3267             {
3268                 /* what type of TRIE am I? (utf8 makes this contextual) */
3269                 DECL_TRIE_TYPE(scan);
3270
3271                 /* what trie are we using right now */
3272                 reg_trie_data * const trie
3273                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3274                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3275                 U32 state = trie->startstate;
3276
3277                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3278                     !TRIE_BITMAP_TEST(trie,*locinput)
3279                 ) {
3280                     if (trie->states[ state ].wordnum) {
3281                          DEBUG_EXECUTE_r(
3282                             PerlIO_printf(Perl_debug_log,
3283                                           "%*s  %smatched empty string...%s\n",
3284                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3285                         );
3286                         if (!trie->jump)
3287                             break;
3288                     } else {
3289                         DEBUG_EXECUTE_r(
3290                             PerlIO_printf(Perl_debug_log,
3291                                           "%*s  %sfailed to match trie start class...%s\n",
3292                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3293                         );
3294                         sayNO_SILENT;
3295                    }
3296                 }
3297
3298             {
3299                 U8 *uc = ( U8* )locinput;
3300
3301                 STRLEN len = 0;
3302                 STRLEN foldlen = 0;
3303                 U8 *uscan = (U8*)NULL;
3304                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3305                 U32 charcount = 0; /* how many input chars we have matched */
3306                 U32 accepted = 0; /* have we seen any accepting states? */
3307
3308                 ST.B = next;
3309                 ST.jump = trie->jump;
3310                 ST.me = scan;
3311                 ST.firstpos = NULL;
3312                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3313                 ST.nextword = 0;
3314
3315                 /* fully traverse the TRIE; note the position of the
3316                    shortest accept state and the wordnum of the longest
3317                    accept state */
3318
3319                 while ( state && uc <= (U8*)PL_regeol ) {
3320                     U32 base = trie->states[ state ].trans.base;
3321                     UV uvc = 0;
3322                     U16 charid = 0;
3323                     U16 wordnum;
3324                     wordnum = trie->states[ state ].wordnum;
3325
3326                     if (wordnum) { /* it's an accept state */
3327                         if (!accepted) {
3328                             accepted = 1;
3329                             /* record first match position */
3330                             if (ST.longfold) {
3331                                 ST.firstpos = (U8*)locinput;
3332                                 ST.firstchars = 0;
3333                             }
3334                             else {
3335                                 ST.firstpos = uc;
3336                                 ST.firstchars = charcount;
3337                             }
3338                         }
3339                         if (!ST.nextword || wordnum < ST.nextword)
3340                             ST.nextword = wordnum;
3341                         ST.topword = wordnum;
3342                     }
3343
3344                     DEBUG_TRIE_EXECUTE_r({
3345                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3346                                 PerlIO_printf( Perl_debug_log,
3347                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3348                                     2+depth * 2, "", PL_colors[4],
3349                                     (UV)state, (accepted ? 'Y' : 'N'));
3350                     });
3351
3352                     /* read a char and goto next state */
3353                     if ( base ) {
3354                         I32 offset;
3355                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3356                                              uscan, len, uvc, charid, foldlen,
3357                                              foldbuf, uniflags);
3358                         charcount++;
3359                         if (foldlen>0)
3360                             ST.longfold = TRUE;
3361                         if (charid &&
3362                              ( ((offset =
3363                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3364
3365                              && ((U32)offset < trie->lasttrans)
3366                              && trie->trans[offset].check == state)
3367                         {
3368                             state = trie->trans[offset].next;
3369                         }
3370                         else {
3371                             state = 0;
3372                         }
3373                         uc += len;
3374
3375                     }
3376                     else {
3377                         state = 0;
3378                     }
3379                     DEBUG_TRIE_EXECUTE_r(
3380                         PerlIO_printf( Perl_debug_log,
3381                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3382                             charid, uvc, (UV)state, PL_colors[5] );
3383                     );
3384                 }
3385                 if (!accepted)
3386                    sayNO;
3387
3388                 /* calculate total number of accept states */
3389                 {
3390                     U16 w = ST.topword;
3391                     accepted = 0;
3392                     while (w) {
3393                         w = trie->wordinfo[w].prev;
3394                         accepted++;
3395                     }
3396                     ST.accepted = accepted;
3397                 }
3398
3399                 DEBUG_EXECUTE_r(
3400                     PerlIO_printf( Perl_debug_log,
3401                         "%*s  %sgot %"IVdf" possible matches%s\n",
3402                         REPORT_CODE_OFF + depth * 2, "",
3403                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3404                 );
3405                 goto trie_first_try; /* jump into the fail handler */
3406             }}
3407             /* NOTREACHED */
3408
3409         case TRIE_next_fail: /* we failed - try next alternative */
3410             if ( ST.jump) {
3411                 REGCP_UNWIND(ST.cp);
3412                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3413                     PL_regoffs[n].end = -1;
3414                 *PL_reglastparen = n;
3415             }
3416             if (!--ST.accepted) {
3417                 DEBUG_EXECUTE_r({
3418                     PerlIO_printf( Perl_debug_log,
3419                         "%*s  %sTRIE failed...%s\n",
3420                         REPORT_CODE_OFF+depth*2, "",
3421                         PL_colors[4],
3422                         PL_colors[5] );
3423                 });
3424                 sayNO_SILENT;
3425             }
3426             {
3427                 /* Find next-highest word to process.  Note that this code
3428                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3429                 register U16 min = 0;
3430                 register U16 word;
3431                 register U16 const nextword = ST.nextword;
3432                 register reg_trie_wordinfo * const wordinfo
3433                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3434                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3435                     if (word > nextword && (!min || word < min))
3436                         min = word;
3437                 }
3438                 ST.nextword = min;
3439             }
3440
3441           trie_first_try:
3442             if (do_cutgroup) {
3443                 do_cutgroup = 0;
3444                 no_final = 0;
3445             }
3446
3447             if ( ST.jump) {
3448                 ST.lastparen = *PL_reglastparen;
3449                 REGCP_SET(ST.cp);
3450             }
3451
3452             /* find start char of end of current word */
3453             {
3454                 U32 chars; /* how many chars to skip */
3455                 U8 *uc = ST.firstpos;
3456                 reg_trie_data * const trie
3457                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3458
3459                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3460                             >=  ST.firstchars);
3461                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3462                             - ST.firstchars;
3463
3464                 if (ST.longfold) {
3465                     /* the hard option - fold each char in turn and find
3466                      * its folded length (which may be different) */
3467                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3468                     STRLEN foldlen;
3469                     STRLEN len;
3470                     UV uvc;
3471                     U8 *uscan;
3472
3473                     while (chars) {
3474                         if (utf8_target) {
3475                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3476                                                     uniflags);
3477                             uc += len;
3478                         }
3479                         else {
3480                             uvc = *uc;
3481                             uc++;
3482                         }
3483                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3484                         uscan = foldbuf;
3485                         while (foldlen) {
3486                             if (!--chars)
3487                                 break;
3488                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3489                                             uniflags);
3490                             uscan += len;
3491                             foldlen -= len;
3492                         }
3493                     }
3494                 }
3495                 else {
3496                     if (utf8_target)
3497                         while (chars--)
3498                             uc += UTF8SKIP(uc);
3499                     else
3500                         uc += chars;
3501                 }
3502                 PL_reginput = (char *)uc;
3503             }
3504
3505             scan = (ST.jump && ST.jump[ST.nextword])
3506                         ? ST.me + ST.jump[ST.nextword]
3507                         : ST.B;
3508
3509             DEBUG_EXECUTE_r({
3510                 PerlIO_printf( Perl_debug_log,
3511                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3512                     REPORT_CODE_OFF+depth*2, "",
3513                     PL_colors[4],
3514                     ST.nextword,
3515                     PL_colors[5]
3516                     );
3517             });
3518
3519             if (ST.accepted > 1 || has_cutgroup) {
3520                 PUSH_STATE_GOTO(TRIE_next, scan);
3521                 /* NOTREACHED */
3522             }
3523             /* only one choice left - just continue */
3524             DEBUG_EXECUTE_r({
3525                 AV *const trie_words
3526                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3527                 SV ** const tmp = av_fetch( trie_words,
3528                     ST.nextword-1, 0 );
3529                 SV *sv= tmp ? sv_newmortal() : NULL;
3530
3531                 PerlIO_printf( Perl_debug_log,
3532                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3533                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3534                     ST.nextword,
3535                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3536                             PL_colors[0], PL_colors[1],
3537                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3538                         )
3539                     : "not compiled under -Dr",
3540                     PL_colors[5] );
3541             });
3542
3543             locinput = PL_reginput;
3544             nextchr = UCHARAT(locinput);
3545             continue; /* execute rest of RE */
3546             /* NOTREACHED */
3547 #undef  ST
3548
3549         case EXACT: {
3550             char *s = STRING(scan);
3551             ln = STR_LEN(scan);
3552             if (utf8_target != UTF_PATTERN) {
3553                 /* The target and the pattern have differing utf8ness. */
3554                 char *l = locinput;
3555                 const char * const e = s + ln;
3556
3557                 if (utf8_target) {
3558                     /* The target is utf8, the pattern is not utf8. */
3559                     while (s < e) {
3560                         STRLEN ulen;
3561                         if (l >= PL_regeol)
3562                              sayNO;
3563                         if (NATIVE_TO_UNI(*(U8*)s) !=
3564                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3565                                             uniflags))
3566                              sayNO;
3567                         l += ulen;
3568                         s ++;
3569                     }
3570                 }
3571                 else {
3572                     /* The target is not utf8, the pattern is utf8. */
3573                     while (s < e) {
3574                         STRLEN ulen;
3575                         if (l >= PL_regeol)
3576                             sayNO;
3577                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3578                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3579                                            uniflags))
3580                             sayNO;
3581                         s += ulen;
3582                         l ++;
3583                     }
3584                 }
3585                 locinput = l;
3586                 nextchr = UCHARAT(locinput);
3587                 break;
3588             }
3589             /* The target and the pattern have the same utf8ness. */
3590             /* Inline the first character, for speed. */
3591             if (UCHARAT(s) != nextchr)
3592                 sayNO;
3593             if (PL_regeol - locinput < ln)
3594                 sayNO;
3595             if (ln > 1 && memNE(s, locinput, ln))
3596                 sayNO;
3597             locinput += ln;
3598             nextchr = UCHARAT(locinput);
3599             break;
3600             }
3601         case EXACTFL: {
3602             re_fold_t folder;
3603             const U8 * fold_array;
3604             const char * s;
3605             U32 fold_utf8_flags;
3606
3607             PL_reg_flags |= RF_tainted;
3608             folder = foldEQ_locale;
3609             fold_array = PL_fold_locale;
3610             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3611             goto do_exactf;
3612
3613         case EXACTFU:
3614             folder = foldEQ_latin1;
3615             fold_array = PL_fold_latin1;
3616             fold_utf8_flags = 0;
3617             goto do_exactf;
3618
3619         case EXACTFA:
3620             folder = foldEQ_latin1;
3621             fold_array = PL_fold_latin1;
3622             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3623             goto do_exactf;
3624
3625         case EXACTF:
3626             folder = foldEQ;
3627             fold_array = PL_fold;
3628             fold_utf8_flags = 0;
3629
3630           do_exactf:
3631             s = STRING(scan);
3632             ln = STR_LEN(scan);
3633
3634             if (utf8_target || UTF_PATTERN) {
3635               /* Either target or the pattern are utf8. */
3636                 const char * const l = locinput;
3637                 char *e = PL_regeol;
3638
3639                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3640                                l, &e, 0,  utf8_target, fold_utf8_flags)) {
3641                      /* One more case for the sharp s:
3642                       * pack("U0U*", 0xDF) =~ /ss/i,
3643                       * the 0xC3 0x9F are the UTF-8
3644                       * byte sequence for the U+00DF. */
3645
3646                      if (!(utf8_target &&
3647                            toLOWER(s[0]) == 's' &&
3648                            ln >= 2 &&
3649                            toLOWER(s[1]) == 's' &&
3650                            (U8)l[0] == 0xC3 &&
3651                            e - l >= 2 &&
3652                            (U8)l[1] == 0x9F))
3653                           sayNO;
3654                 }
3655                 locinput = e;
3656                 nextchr = UCHARAT(locinput);
3657                 break;
3658             }
3659
3660             /* Neither the target nor the pattern are utf8 */
3661             if (UCHARAT(s) != nextchr &&
3662                 UCHARAT(s) != fold_array[nextchr])
3663             {
3664                 sayNO;
3665             }
3666             if (PL_regeol - locinput < ln)
3667                 sayNO;
3668             if (ln > 1 && ! folder(s, locinput, ln))
3669                 sayNO;
3670             locinput += ln;
3671             nextchr = UCHARAT(locinput);
3672             break;
3673         }
3674
3675         /* XXX Could improve efficiency by separating these all out using a
3676          * macro or in-line function.  At that point regcomp.c would no longer
3677          * have to set the FLAGS fields of these */
3678         case BOUNDL:
3679         case NBOUNDL:
3680             PL_reg_flags |= RF_tainted;
3681             /* FALL THROUGH */
3682         case BOUND:
3683         case BOUNDU:
3684         case BOUNDA:
3685         case NBOUND:
3686         case NBOUNDU:
3687         case NBOUNDA:
3688             /* was last char in word? */
3689             if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3690                 if (locinput == PL_bostr)
3691                     ln = '\n';
3692                 else {
3693                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3694
3695                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3696                 }
3697                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3698                     ln = isALNUM_uni(ln);
3699                     LOAD_UTF8_CHARCLASS_ALNUM();
3700                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3701                 }
3702                 else {
3703                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3704                     n = isALNUM_LC_utf8((U8*)locinput);
3705                 }
3706             }
3707             else {
3708
3709                 /* Here the string isn't utf8, or is utf8 and only ascii
3710                  * characters are to match \w.  In the latter case looking at
3711                  * the byte just prior to the current one may be just the final
3712                  * byte of a multi-byte character.  This is ok.  There are two
3713                  * cases:
3714                  * 1) it is a single byte character, and then the test is doing
3715                  *      just what it's supposed to.
3716                  * 2) it is a multi-byte character, in which case the final
3717                  *      byte is never mistakable for ASCII, and so the test
3718                  *      will say it is not a word character, which is the
3719                  *      correct answer. */
3720                 ln = (locinput != PL_bostr) ?
3721                     UCHARAT(locinput - 1) : '\n';
3722                 switch (FLAGS(scan)) {
3723                     case REGEX_UNICODE_CHARSET:
3724                         ln = isWORDCHAR_L1(ln);
3725                         n = isWORDCHAR_L1(nextchr);
3726                         break;
3727                     case REGEX_LOCALE_CHARSET:
3728                         ln = isALNUM_LC(ln);
3729                         n = isALNUM_LC(nextchr);
3730                         break;
3731                     case REGEX_DEPENDS_CHARSET:
3732                         ln = isALNUM(ln);
3733                         n = isALNUM(nextchr);
3734                         break;
3735                     case REGEX_ASCII_RESTRICTED_CHARSET:
3736                         ln = isWORDCHAR_A(ln);
3737                         n = isWORDCHAR_A(nextchr);
3738                         break;
3739                     default:
3740                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3741                         break;
3742                 }
3743             }
3744             /* Note requires that all BOUNDs be lower than all NBOUNDs in
3745              * regcomp.sym */
3746             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3747                     sayNO;
3748             break;
3749         case ANYOFV:
3750         case ANYOF:
3751             if (utf8_target || state_num == ANYOFV) {
3752                 STRLEN inclasslen = PL_regeol - locinput;
3753                 if (locinput >= PL_regeol)
3754                     sayNO;
3755
3756                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3757                     sayNO;
3758                 locinput += inclasslen;
3759                 nextchr = UCHARAT(locinput);
3760                 break;
3761             }
3762             else {
3763                 if (nextchr < 0)
3764                     nextchr = UCHARAT(locinput);
3765                 if (!nextchr && locinput >= PL_regeol)
3766                     sayNO;
3767                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3768                     sayNO;
3769                 nextchr = UCHARAT(++locinput);
3770                 break;
3771             }
3772             break;
3773         /* Special char classes - The defines start on line 129 or so */
3774         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3775                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3776                   ALNUMU, NALNUMU, isWORDCHAR_L1,
3777                   ALNUMA, NALNUMA, isWORDCHAR_A,
3778                   alnum, "a");
3779
3780         CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3781                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3782                   SPACEU, NSPACEU, isSPACE_L1,
3783                   SPACEA, NSPACEA, isSPACE_A,
3784                   space, " ");
3785
3786         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3787                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3788                 DIGITA, NDIGITA, isDIGIT_A,
3789                 digit, "0");
3790
3791         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3792                        a Unicode extended Grapheme Cluster */
3793             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3794               extended Grapheme Cluster is:
3795
3796                CR LF
3797                | Prepend* Begin Extend*
3798                | .
3799
3800                Begin is (Hangul-syllable | ! Control)
3801                Extend is (Grapheme_Extend | Spacing_Mark)
3802                Control is [ GCB_Control CR LF ]
3803
3804                The discussion below shows how the code for CLUMP is derived
3805                from this regex.  Note that most of these concepts are from
3806                property values of the Grapheme Cluster Boundary (GCB) property.
3807                No code point can have multiple property values for a given
3808                property.  Thus a code point in Prepend can't be in Control, but
3809                it must be in !Control.  This is why Control above includes
3810                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3811                property separately, and so can't be in GCB_Control, even though
3812                they logically are controls.  Control is not the same as gc=cc,
3813                but includes format and other characters as well.
3814
3815                The Unicode definition of Hangul-syllable is:
3816                   (L+
3817                    | (L* ( ( V | LV ) V* | LVT ) T*)
3818                    | T+
3819                   )
3820                Each of these is a value for the GCB property, and hence must be
3821                disjoint, so the order they are tested is immaterial, so the
3822                above can safely be changed to
3823                    T+
3824                    | L+
3825                    | (L* ( LVT | ( V | LV ) V*) T*)
3826
3827                The last two terms can be combined like this:
3828                    L* ( L
3829                         | (( LVT | ( V | LV ) V*) T*))
3830
3831                And refactored into this:
3832                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3833
3834                That means that if we have seen any L's at all we can quit
3835                there, but if the next character is an LVT, a V or an LV we
3836                should keep going.
3837
3838                There is a subtlety with Prepend* which showed up in testing.
3839                Note that the Begin, and only the Begin is required in:
3840                 | Prepend* Begin Extend*
3841                Also, Begin contains '! Control'.  A Prepend must be a '!
3842                Control', which means it must be a Begin.  What it comes down to
3843                is that if we match Prepend* and then find no suitable Begin
3844                afterwards, that if we backtrack the last Prepend, that one will
3845                be a suitable Begin.
3846             */
3847
3848             if (locinput >= PL_regeol)
3849                 sayNO;
3850             if  (! utf8_target) {
3851
3852                 /* Match either CR LF  or '.', as all the other possibilities
3853                  * require utf8 */
3854                 locinput++;         /* Match the . or CR */
3855                 if (nextchr == '\r'
3856                     && locinput < PL_regeol
3857                     && UCHARAT(locinput) == '\n') locinput++;
3858             }
3859             else {
3860
3861                 /* Utf8: See if it is ( CR LF ); already know that locinput <
3862                  * PL_regeol, so locinput+1 is in bounds */
3863                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3864                     locinput += 2;
3865                 }
3866                 else {
3867                     /* In case have to backtrack to beginning, then match '.' */
3868                     char *starting = locinput;
3869
3870                     /* In case have to backtrack the last prepend */
3871                     char *previous_prepend = 0;
3872
3873                     LOAD_UTF8_CHARCLASS_GCB();
3874
3875                     /* Match (prepend)* */
3876                     while (locinput < PL_regeol
3877                            && swash_fetch(PL_utf8_X_prepend,
3878                                           (U8*)locinput, utf8_target))
3879                     {
3880                         previous_prepend = locinput;
3881                         locinput += UTF8SKIP(locinput);
3882                     }
3883
3884                     /* As noted above, if we matched a prepend character, but
3885                      * the next thing won't match, back off the last prepend we
3886                      * matched, as it is guaranteed to match the begin */
3887                     if (previous_prepend
3888                         && (locinput >=  PL_regeol
3889                             || ! swash_fetch(PL_utf8_X_begin,
3890                                              (U8*)locinput, utf8_target)))
3891                     {
3892                         locinput = previous_prepend;
3893                     }
3894
3895                     /* Note that here we know PL_regeol > locinput, as we
3896                      * tested that upon input to this switch case, and if we
3897                      * moved locinput forward, we tested the result just above
3898                      * and it either passed, or we backed off so that it will
3899                      * now pass */
3900                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3901
3902                         /* Here did not match the required 'Begin' in the
3903                          * second term.  So just match the very first
3904                          * character, the '.' of the final term of the regex */
3905                         locinput = starting + UTF8SKIP(starting);
3906                     } else {
3907
3908                         /* Here is the beginning of a character that can have
3909                          * an extender.  It is either a hangul syllable, or a
3910                          * non-control */
3911                         if (swash_fetch(PL_utf8_X_non_hangul,
3912                                         (U8*)locinput, utf8_target))
3913                         {
3914
3915                             /* Here not a Hangul syllable, must be a
3916                              * ('! Control') */
3917                             locinput += UTF8SKIP(locinput);
3918                         } else {
3919
3920                             /* Here is a Hangul syllable.  It can be composed
3921                              * of several individual characters.  One
3922                              * possibility is T+ */
3923                             if (swash_fetch(PL_utf8_X_T,
3924                                             (U8*)locinput, utf8_target))
3925                             {
3926                                 while (locinput < PL_regeol
3927                                         && swash_fetch(PL_utf8_X_T,
3928                                                         (U8*)locinput, utf8_target))
3929                                 {
3930                                     locinput += UTF8SKIP(locinput);
3931                                 }
3932                             } else {
3933
3934                                 /* Here, not T+, but is a Hangul.  That means
3935                                  * it is one of the others: L, LV, LVT or V,
3936                                  * and matches:
3937                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3938
3939                                 /* Match L*           */
3940                                 while (locinput < PL_regeol
3941                                         && swash_fetch(PL_utf8_X_L,
3942                                                         (U8*)locinput, utf8_target))
3943                                 {
3944                                     locinput += UTF8SKIP(locinput);
3945                                 }
3946
3947                                 /* Here, have exhausted L*.  If the next
3948                                  * character is not an LV, LVT nor V, it means
3949                                  * we had to have at least one L, so matches L+
3950                                  * in the original equation, we have a complete
3951                                  * hangul syllable.  Are done. */
3952
3953                                 if (locinput < PL_regeol
3954                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3955                                                     (U8*)locinput, utf8_target))
3956                                 {
3957
3958                                     /* Otherwise keep going.  Must be LV, LVT
3959                                      * or V.  See if LVT */
3960                                     if (swash_fetch(PL_utf8_X_LVT,
3961                                                     (U8*)locinput, utf8_target))
3962                                     {
3963                                         locinput += UTF8SKIP(locinput);
3964                                     } else {
3965
3966                                         /* Must be  V or LV.  Take it, then
3967                                          * match V*     */
3968                                         locinput += UTF8SKIP(locinput);
3969                                         while (locinput < PL_regeol
3970                                                 && swash_fetch(PL_utf8_X_V,
3971                                                          (U8*)locinput, utf8_target))
3972                                         {
3973                                             locinput += UTF8SKIP(locinput);
3974                                         }
3975                                     }
3976
3977                                     /* And any of LV, LVT, or V can be followed
3978                                      * by T*            */
3979                                     while (locinput < PL_regeol
3980                                            && swash_fetch(PL_utf8_X_T,
3981                                                            (U8*)locinput,
3982                                                            utf8_target))
3983                                     {
3984                                         locinput += UTF8SKIP(locinput);
3985                                     }
3986                                 }
3987                             }
3988                         }
3989
3990                         /* Match any extender */
3991                         while (locinput < PL_regeol
3992                                 && swash_fetch(PL_utf8_X_extend,
3993                                                 (U8*)locinput, utf8_target))
3994                         {
3995                             locinput += UTF8SKIP(locinput);
3996                         }
3997                     }
3998                 }
3999                 if (locinput > PL_regeol) sayNO;
4000             }
4001             nextchr = UCHARAT(locinput);
4002             break;
4003
4004         case NREFFL:
4005         {   /* The capture buffer cases.  The ones beginning with N for the
4006                named buffers just convert to the equivalent numbered and
4007                pretend they were called as the corresponding numbered buffer
4008                op.  */
4009             /* don't initialize these in the declaration, it makes C++
4010                unhappy */
4011             char *s;
4012             char type;
4013             re_fold_t folder;
4014             const U8 *fold_array;
4015             UV utf8_fold_flags;
4016
4017             PL_reg_flags |= RF_tainted;
4018             folder = foldEQ_locale;
4019             fold_array = PL_fold_locale;
4020             type = REFFL;
4021             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4022             goto do_nref;
4023
4024         case NREFFA:
4025             folder = foldEQ_latin1;
4026             fold_array = PL_fold_latin1;
4027             type = REFFA;
4028             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4029             goto do_nref;
4030
4031         case NREFFU:
4032             folder = foldEQ_latin1;
4033             fold_array = PL_fold_latin1;
4034             type = REFFU;
4035             utf8_fold_flags = 0;
4036             goto do_nref;
4037
4038         case NREFF:
4039             folder = foldEQ;
4040             fold_array = PL_fold;
4041             type = REFF;
4042             utf8_fold_flags = 0;
4043             goto do_nref;
4044
4045         case NREF:
4046             type = REF;
4047             folder = NULL;
4048             fold_array = NULL;
4049             utf8_fold_flags = 0;
4050           do_nref:
4051
4052             /* For the named back references, find the corresponding buffer
4053              * number */
4054             n = reg_check_named_buff_matched(rex,scan);
4055
4056             if ( ! n ) {
4057                 sayNO;
4058             }
4059             goto do_nref_ref_common;
4060
4061         case REFFL:
4062             PL_reg_flags |= RF_tainted;
4063             folder = foldEQ_locale;
4064             fold_array = PL_fold_locale;
4065             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4066             goto do_ref;
4067
4068         case REFFA:
4069             folder = foldEQ_latin1;
4070             fold_array = PL_fold_latin1;
4071             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4072             goto do_ref;
4073
4074         case REFFU:
4075             folder = foldEQ_latin1;
4076             fold_array = PL_fold_latin1;
4077             utf8_fold_flags = 0;
4078             goto do_ref;
4079
4080         case REFF:
4081             folder = foldEQ;
4082             fold_array = PL_fold;
4083             utf8_fold_flags = 0;
4084             goto do_ref;
4085
4086         case REF:
4087             folder = NULL;
4088             fold_array = NULL;
4089             utf8_fold_flags = 0;
4090
4091           do_ref:
4092             type = OP(scan);
4093             n = ARG(scan);  /* which paren pair */
4094
4095           do_nref_ref_common:
4096             ln = PL_regoffs[n].start;
4097             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4098             if (*PL_reglastparen < n || ln == -1)
4099                 sayNO;                  /* Do not match unless seen CLOSEn. */
4100             if (ln == PL_regoffs[n].end)
4101                 break;
4102
4103             s = PL_bostr + ln;
4104             if (type != REF     /* REF can do byte comparison */
4105                 && (utf8_target || type == REFFU))
4106             { /* XXX handle REFFL better */
4107                 char * limit = PL_regeol;
4108
4109                 /* This call case-insensitively compares the entire buffer
4110                  * at s with the current input starting at locinput, without
4111                  * going past the end given by PL_regeol.  On success it
4112                  * stores in limit the position just past the portion of the
4113                  * current input that was matched. */
4114                 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4115                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4116                 {
4117                     sayNO;
4118                 }
4119                 locinput = limit;
4120                 nextchr = UCHARAT(locinput);
4121                 break;
4122             }
4123
4124             /* Not utf8:  Inline the first character, for speed. */
4125             if (UCHARAT(s) != nextchr &&
4126                 (type == REF ||
4127                  UCHARAT(s) != fold_array[nextchr]))
4128                 sayNO;
4129             ln = PL_regoffs[n].end - ln;
4130             if (locinput + ln > PL_regeol)
4131                 sayNO;
4132             if (ln > 1 && (type == REF
4133                            ? memNE(s, locinput, ln)
4134                            : ! folder(s, locinput, ln)))
4135                 sayNO;
4136             locinput += ln;
4137             nextchr = UCHARAT(locinput);
4138             break;
4139         }
4140         case NOTHING:
4141         case TAIL:
4142             break;
4143         case BACK:
4144             break;
4145
4146 #undef  ST
4147 #define ST st->u.eval
4148         {
4149             SV *ret;
4150             REGEXP *re_sv;
4151             regexp *re;
4152             regexp_internal *rei;
4153             regnode *startpoint;
4154
4155         case GOSTART:
4156         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4157             if (cur_eval && cur_eval->locinput==locinput) {
4158                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4159                     Perl_croak(aTHX_ "Infinite recursion in regex");
4160                 if ( ++nochange_depth > max_nochange_depth )
4161                     Perl_croak(aTHX_
4162                         "Pattern subroutine nesting without pos change"
4163                         " exceeded limit in regex");
4164             } else {
4165                 nochange_depth = 0;
4166             }
4167             re_sv = rex_sv;
4168             re = rex;
4169             rei = rexi;
4170             (void)ReREFCNT_inc(rex_sv);
4171             if (OP(scan)==GOSUB) {
4172                 startpoint = scan + ARG2L(scan);
4173                 ST.close_paren = ARG(scan);
4174             } else {
4175                 startpoint = rei->program+1;
4176                 ST.close_paren = 0;
4177             }
4178             goto eval_recurse_doit;
4179             /* NOTREACHED */
4180         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4181             if (cur_eval && cur_eval->locinput==locinput) {
4182                 if ( ++nochange_depth > max_nochange_depth )
4183                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4184             } else {
4185                 nochange_depth = 0;
4186             }
4187             {
4188                 /* execute the code in the {...} */
4189                 dSP;
4190                 SV ** const before = SP;
4191                 OP_4tree * const oop = PL_op;
4192                 COP * const ocurcop = PL_curcop;
4193                 PAD *old_comppad;
4194                 char *saved_regeol = PL_regeol;
4195                 struct re_save_state saved_state;
4196
4197                 /* To not corrupt the existing regex state while executing the
4198                  * eval we would normally put it on the save stack, like with
4199                  * save_re_context. However, re-evals have a weird scoping so we
4200                  * can't just add ENTER/LEAVE here. With that, things like
4201                  *
4202                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4203                  *
4204                  * would break, as they expect the localisation to be unwound
4205                  * only when the re-engine backtracks through the bit that
4206                  * localised it.
4207                  *
4208                  * What we do instead is just saving the state in a local c
4209                  * variable.
4210                  */
4211                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4212
4213                 n = ARG(scan);
4214                 PL_op = (OP_4tree*)rexi->data->data[n];
4215                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4216                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4217                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4218                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4219
4220                 if (sv_yes_mark) {
4221                     SV *sv_mrk = get_sv("REGMARK", 1);
4222                     sv_setsv(sv_mrk, sv_yes_mark);
4223                 }
4224
4225                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4226                 SPAGAIN;
4227                 if (SP == before)
4228                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4229                 else {
4230                     ret = POPs;
4231                     PUTBACK;
4232                 }
4233
4234                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4235
4236                 PL_op = oop;
4237                 PAD_RESTORE_LOCAL(old_comppad);
4238                 PL_curcop = ocurcop;
4239                 PL_regeol = saved_regeol;
4240                 if (!logical) {
4241                     /* /(?{...})/ */
4242                     sv_setsv(save_scalar(PL_replgv), ret);
4243                     break;
4244                 }
4245             }
4246             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4247                 logical = 0;
4248                 {
4249                     /* extract RE object from returned value; compiling if
4250                      * necessary */
4251                     MAGIC *mg = NULL;
4252                     REGEXP *rx = NULL;
4253
4254                     if (SvROK(ret)) {
4255                         SV *const sv = SvRV(ret);
4256
4257                         if (SvTYPE(sv) == SVt_REGEXP) {
4258                             rx = (REGEXP*) sv;
4259                         } else if (SvSMAGICAL(sv)) {
4260                             mg = mg_find(sv, PERL_MAGIC_qr);
4261                             assert(mg);
4262                         }
4263                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4264                         rx = (REGEXP*) ret;
4265                     } else if (SvSMAGICAL(ret)) {
4266                         if (SvGMAGICAL(ret)) {
4267                             /* I don't believe that there is ever qr magic
4268                                here.  */
4269                             assert(!mg_find(ret, PERL_MAGIC_qr));
4270                             sv_unmagic(ret, PERL_MAGIC_qr);
4271                         }
4272                         else {
4273                             mg = mg_find(ret, PERL_MAGIC_qr);
4274                             /* testing suggests mg only ends up non-NULL for
4275                                scalars that were upgraded and compiled in the
4276                                else block below. In turn, this is only
4277                                triggered in the "postponed utf8 string" tests
4278                                in t/op/pat.t  */
4279                         }
4280                     }
4281
4282                     if (mg) {
4283                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4284                         assert(rx);
4285                     }
4286                     if (rx) {
4287                         rx = reg_temp_copy(NULL, rx);
4288                     }
4289                     else {
4290                         U32 pm_flags = 0;
4291                         const I32 osize = PL_regsize;
4292
4293                         if (DO_UTF8(ret)) {
4294                             assert (SvUTF8(ret));
4295                         } else if (SvUTF8(ret)) {
4296                             /* Not doing UTF-8, despite what the SV says. Is
4297                                this only if we're trapped in use 'bytes'?  */
4298                             /* Make a copy of the octet sequence, but without
4299                                the flag on, as the compiler now honours the
4300                                SvUTF8 flag on ret.  */
4301                             STRLEN len;
4302                             const char *const p = SvPV(ret, len);
4303                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4304                         }
4305                         rx = CALLREGCOMP(ret, pm_flags);
4306                         if (!(SvFLAGS(ret)
4307                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4308                                  | SVs_GMG))) {
4309                             /* This isn't a first class regexp. Instead, it's
4310                                caching a regexp onto an existing, Perl visible
4311                                scalar.  */
4312                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4313                         }
4314                         PL_regsize = osize;
4315                     }
4316                     re_sv = rx;
4317                     re = (struct regexp *)SvANY(rx);
4318                 }
4319                 RXp_MATCH_COPIED_off(re);
4320                 re->subbeg = rex->subbeg;
4321                 re->sublen = rex->sublen;
4322                 rei = RXi_GET(re);
4323                 DEBUG_EXECUTE_r(
4324                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4325                         "Matching embedded");
4326                 );
4327                 startpoint = rei->program + 1;
4328                 ST.close_paren = 0; /* only used for GOSUB */
4329                 /* borrowed from regtry */
4330                 if (PL_reg_start_tmpl <= re->nparens) {
4331                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4332                     if(PL_reg_start_tmp)
4333                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4334                     else
4335                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4336                 }
4337
4338         eval_recurse_doit: /* Share code with GOSUB below this line */
4339                 /* run the pattern returned from (??{...}) */
4340                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4341                 REGCP_SET(ST.lastcp);
4342
4343                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4344
4345                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4346                 PL_reglastparen = &re->lastparen;
4347                 PL_reglastcloseparen = &re->lastcloseparen;
4348                 re->lastparen = 0;
4349                 re->lastcloseparen = 0;
4350
4351                 PL_reginput = locinput;
4352                 PL_regsize = 0;
4353
4354                 /* XXXX This is too dramatic a measure... */
4355                 PL_reg_maxiter = 0;
4356
4357                 ST.toggle_reg_flags = PL_reg_flags;
4358                 if (RX_UTF8(re_sv))
4359                     PL_reg_flags |= RF_utf8;
4360                 else
4361                     PL_reg_flags &= ~RF_utf8;
4362                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4363
4364                 ST.prev_rex = rex_sv;
4365                 ST.prev_curlyx = cur_curlyx;
4366                 SETREX(rex_sv,re_sv);
4367                 rex = re;
4368                 rexi = rei;
4369                 cur_curlyx = NULL;
4370                 ST.B = next;
4371                 ST.prev_eval = cur_eval;
4372                 cur_eval = st;
4373                 /* now continue from first node in postponed RE */
4374                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4375                 /* NOTREACHED */
4376             }
4377             /* logical is 1,   /(?(?{...})X|Y)/ */
4378             sw = cBOOL(SvTRUE(ret));
4379             logical = 0;
4380             break;
4381         }
4382
4383         case EVAL_AB: /* cleanup after a successful (??{A})B */
4384             /* note: this is called twice; first after popping B, then A */
4385             PL_reg_flags ^= ST.toggle_reg_flags;
4386             ReREFCNT_dec(rex_sv);
4387             SETREX(rex_sv,ST.prev_rex);
4388             rex = (struct regexp *)SvANY(rex_sv);
4389             rexi = RXi_GET(rex);
4390             regcpblow(ST.cp);
4391             cur_eval = ST.prev_eval;
4392             cur_curlyx = ST.prev_curlyx;
4393
4394             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4395             PL_reglastparen = &rex->lastparen;
4396             PL_reglastcloseparen = &rex->lastcloseparen;
4397             /* also update PL_regoffs */
4398             PL_regoffs = rex->offs;
4399
4400             /* XXXX This is too dramatic a measure... */
4401             PL_reg_maxiter = 0;
4402             if ( nochange_depth )
4403                 nochange_depth--;
4404             sayYES;
4405
4406
4407         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4408             /* note: this is called twice; first after popping B, then A */
4409             PL_reg_flags ^= ST.toggle_reg_flags;
4410             ReREFCNT_dec(rex_sv);
4411             SETREX(rex_sv,ST.prev_rex);
4412             rex = (struct regexp *)SvANY(rex_sv);
4413             rexi = RXi_GET(rex);
4414             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4415             PL_reglastparen = &rex->lastparen;
4416             PL_reglastcloseparen = &rex->lastcloseparen;
4417
4418             PL_reginput = locinput;
4419             REGCP_UNWIND(ST.lastcp);
4420             regcppop(rex);
4421             cur_eval = ST.prev_eval;
4422             cur_curlyx = ST.prev_curlyx;
4423             /* XXXX This is too dramatic a measure... */
4424             PL_reg_maxiter = 0;
4425             if ( nochange_depth )
4426                 nochange_depth--;
4427             sayNO_SILENT;
4428 #undef ST
4429
4430         case OPEN:
4431             n = ARG(scan);  /* which paren pair */
4432             PL_reg_start_tmp[n] = locinput;
4433             if (n > PL_regsize)
4434                 PL_regsize = n;
4435             lastopen = n;
4436             break;
4437         case CLOSE:
4438             n = ARG(scan);  /* which paren pair */
4439             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4440             PL_regoffs[n].end = locinput - PL_bostr;
4441             /*if (n > PL_regsize)
4442                 PL_regsize = n;*/
4443             if (n > *PL_reglastparen)
4444                 *PL_reglastparen = n;
4445             *PL_reglastcloseparen = n;
4446             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4447                 goto fake_end;
4448             }
4449             break;
4450         case ACCEPT:
4451             if (ARG(scan)){
4452                 regnode *cursor;
4453                 for (cursor=scan;
4454                      cursor && OP(cursor)!=END;
4455                      cursor=regnext(cursor))
4456                 {
4457                     if ( OP(cursor)==CLOSE ){
4458                         n = ARG(cursor);
4459                         if ( n <= lastopen ) {
4460                             PL_regoffs[n].start
4461                                 = PL_reg_start_tmp[n] - PL_bostr;
4462                             PL_regoffs[n].end = locinput - PL_bostr;
4463                             /*if (n > PL_regsize)
4464                             PL_regsize = n;*/
4465                             if (n > *PL_reglastparen)
4466                                 *PL_reglastparen = n;
4467                             *PL_reglastcloseparen = n;
4468                             if ( n == ARG(scan) || (cur_eval &&
4469                                 cur_eval->u.eval.close_paren == n))
4470                                 break;
4471                         }
4472                     }
4473                 }
4474             }
4475             goto fake_end;
4476             /*NOTREACHED*/
4477         case GROUPP:
4478             n = ARG(scan);  /* which paren pair */
4479             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4480             break;
4481         case NGROUPP:
4482             /* reg_check_named_buff_matched returns 0 for no match */
4483             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4484             break;
4485         case INSUBP:
4486             n = ARG(scan);
4487             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4488             break;
4489         case DEFINEP:
4490             sw = 0;
4491             break;
4492         case IFTHEN:
4493             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4494             if (sw)
4495                 next = NEXTOPER(NEXTOPER(scan));
4496             else {
4497                 next = scan + ARG(scan);
4498                 if (OP(next) == IFTHEN) /* Fake one. */
4499                     next = NEXTOPER(NEXTOPER(next));
4500             }
4501             break;
4502         case LOGICAL:
4503             logical = scan->flags;
4504             break;
4505
4506 /*******************************************************************
4507
4508 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4509 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4510 STAR/PLUS/CURLY/CURLYN are used instead.)
4511
4512 A*B is compiled as <CURLYX><A><WHILEM><B>
4513
4514 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4515 state, which contains the current count, initialised to -1. It also sets
4516 cur_curlyx to point to this state, with any previous value saved in the
4517 state block.
4518
4519 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4520 since the pattern may possibly match zero times (i.e. it's a while {} loop
4521 rather than a do {} while loop).
4522
4523 Each entry to WHILEM represents a successful match of A. The count in the
4524 CURLYX block is incremented, another WHILEM state is pushed, and execution
4525 passes to A or B depending on greediness and the current count.
4526
4527 For example, if matching against the string a1a2a3b (where the aN are
4528 substrings that match /A/), then the match progresses as follows: (the
4529 pushed states are interspersed with the bits of strings matched so far):
4530
4531     <CURLYX cnt=-1>
4532     <CURLYX cnt=0><WHILEM>
4533     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4534     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4535     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4536     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4537
4538 (Contrast this with something like CURLYM, which maintains only a single
4539 backtrack state:
4540
4541     <CURLYM cnt=0> a1
4542     a1 <CURLYM cnt=1> a2
4543     a1 a2 <CURLYM cnt=2> a3
4544     a1 a2 a3 <CURLYM cnt=3> b
4545 )
4546
4547 Each WHILEM state block marks a point to backtrack to upon partial failure
4548 of A or B, and also contains some minor state data related to that
4549 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4550 overall state, such as the count, and pointers to the A and B ops.
4551
4552 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4553 must always point to the *current* CURLYX block, the rules are:
4554
4555 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4556 and set cur_curlyx to point to the new block.
4557
4558 When popping the CURLYX block after a successful or unsuccessful match,
4559 restore the previous cur_curlyx.
4560
4561 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4562 to the outer one saved in the CURLYX block.
4563
4564 When popping the WHILEM block after a successful or unsuccessful B match,
4565 restore the previous cur_curlyx.
4566
4567 Here's an example for the pattern (AI* BI)*BO
4568 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4569
4570 cur_
4571 curlyx backtrack stack
4572 ------ ---------------
4573 NULL
4574 CO     <CO prev=NULL> <WO>
4575 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4576 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4577 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4578
4579 At this point the pattern succeeds, and we work back down the stack to
4580 clean up, restoring as we go:
4581
4582 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4583 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4584 CO     <CO prev=NULL> <WO>
4585 NULL
4586
4587 *******************************************************************/
4588
4589 #define ST st->u.curlyx
4590
4591         case CURLYX:    /* start of /A*B/  (for complex A) */
4592         {
4593             /* No need to save/restore up to this paren */
4594             I32 parenfloor = scan->flags;
4595
4596             assert(next); /* keep Coverity happy */
4597             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4598                 next += ARG(next);
4599
4600             /* XXXX Probably it is better to teach regpush to support
4601                parenfloor > PL_regsize... */
4602             if (parenfloor > (I32)*PL_reglastparen)
4603                 parenfloor = *PL_reglastparen; /* Pessimization... */
4604
4605             ST.prev_curlyx= cur_curlyx;
4606             cur_curlyx = st;
4607             ST.cp = PL_savestack_ix;
4608
4609             /* these fields contain the state of the current curly.
4610              * they are accessed by subsequent WHILEMs */
4611             ST.parenfloor = parenfloor;
4612             ST.me = scan;
4613             ST.B = next;
4614             ST.minmod = minmod;
4615             minmod = 0;
4616             ST.count = -1;      /* this will be updated by WHILEM */
4617             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4618
4619             PL_reginput = locinput;
4620             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4621             /* NOTREACHED */
4622         }
4623
4624         case CURLYX_end: /* just finished matching all of A*B */
4625             cur_curlyx = ST.prev_curlyx;
4626             sayYES;
4627             /* NOTREACHED */
4628
4629         case CURLYX_end_fail: /* just failed to match all of A*B */
4630             regcpblow(ST.cp);
4631             cur_curlyx = ST.prev_curlyx;
4632             sayNO;
4633             /* NOTREACHED */
4634
4635
4636 #undef ST
4637 #define ST st->u.whilem
4638
4639         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4640         {
4641             /* see the discussion above about CURLYX/WHILEM */
4642             I32 n;
4643             int min = ARG1(cur_curlyx->u.curlyx.me);
4644             int max = ARG2(cur_curlyx->u.curlyx.me);
4645             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4646
4647             assert(cur_curlyx); /* keep Coverity happy */
4648             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4649             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4650             ST.cache_offset = 0;
4651             ST.cache_mask = 0;
4652
4653             PL_reginput = locinput;
4654
4655             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4656                   "%*s  whilem: matched %ld out of %d..%d\n",
4657                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4658             );
4659
4660             /* First just match a string of min A's. */
4661
4662             if (n < min) {
4663                 cur_curlyx->u.curlyx.lastloc = locinput;
4664                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4665                 /* NOTREACHED */
4666             }
4667
4668             /* If degenerate A matches "", assume A done. */
4669
4670             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4671                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4672                    "%*s  whilem: empty match detected, trying continuation...\n",
4673                    REPORT_CODE_OFF+depth*2, "")
4674                 );
4675                 goto do_whilem_B_max;
4676             }
4677
4678             /* super-linear cache processing */
4679
4680             if (scan->flags) {
4681
4682                 if (!PL_reg_maxiter) {
4683                     /* start the countdown: Postpone detection until we
4684                      * know the match is not *that* much linear. */
4685                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4686                     /* possible overflow for long strings and many CURLYX's */
4687                     if (PL_reg_maxiter < 0)
4688                         PL_reg_maxiter = I32_MAX;
4689                     PL_reg_leftiter = PL_reg_maxiter;
4690                 }
4691
4692                 if (PL_reg_leftiter-- == 0) {
4693                     /* initialise cache */
4694                     const I32 size = (PL_reg_maxiter + 7)/8;
4695                     if (PL_reg_poscache) {
4696                         if ((I32)PL_reg_poscache_size < size) {
4697                             Renew(PL_reg_poscache, size, char);
4698                             PL_reg_poscache_size = size;
4699                         }
4700                         Zero(PL_reg_poscache, size, char);
4701                     }
4702                     else {
4703                         PL_reg_poscache_size = size;
4704                         Newxz(PL_reg_poscache, size, char);
4705                     }
4706                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4707       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4708                               PL_colors[4], PL_colors[5])
4709                     );
4710                 }
4711
4712                 if (PL_reg_leftiter < 0) {
4713                     /* have we already failed at this position? */
4714                     I32 offset, mask;
4715                     offset  = (scan->flags & 0xf) - 1
4716                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4717                     mask    = 1 << (offset % 8);
4718                     offset /= 8;
4719                     if (PL_reg_poscache[offset] & mask) {
4720                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4721                             "%*s  whilem: (cache) already tried at this position...\n",
4722                             REPORT_CODE_OFF+depth*2, "")
4723                         );
4724                         sayNO; /* cache records failure */
4725                     }
4726                     ST.cache_offset = offset;
4727                     ST.cache_mask   = mask;
4728                 }
4729             }
4730
4731             /* Prefer B over A for minimal matching. */
4732
4733             if (cur_curlyx->u.curlyx.minmod) {
4734                 ST.save_curlyx = cur_curlyx;
4735                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4736                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4737                 REGCP_SET(ST.lastcp);
4738                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4739                 /* NOTREACHED */
4740             }
4741
4742             /* Prefer A over B for maximal matching. */
4743
4744             if (n < max) { /* More greed allowed? */
4745                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4746                 cur_curlyx->u.curlyx.lastloc = locinput;
4747                 REGCP_SET(ST.lastcp);
4748                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4749                 /* NOTREACHED */
4750             }
4751             goto do_whilem_B_max;
4752         }
4753         /* NOTREACHED */
4754
4755         case WHILEM_B_min: /* just matched B in a minimal match */
4756         case WHILEM_B_max: /* just matched B in a maximal match */
4757             cur_curlyx = ST.save_curlyx;
4758             sayYES;
4759             /* NOTREACHED */
4760
4761         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4762             cur_curlyx = ST.save_curlyx;
4763             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4764             cur_curlyx->u.curlyx.count--;
4765             CACHEsayNO;
4766             /* NOTREACHED */
4767
4768         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4769             REGCP_UNWIND(ST.lastcp);
4770             regcppop(rex);
4771             /* FALL THROUGH */
4772         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4773             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4774             cur_curlyx->u.curlyx.count--;
4775             CACHEsayNO;
4776             /* NOTREACHED */
4777
4778         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4779             REGCP_UNWIND(ST.lastcp);
4780             regcppop(rex);      /* Restore some previous $<digit>s? */
4781             PL_reginput = locinput;
4782             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4783                 "%*s  whilem: failed, trying continuation...\n",
4784                 REPORT_CODE_OFF+depth*2, "")
4785             );
4786           do_whilem_B_max:
4787             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4788                 && ckWARN(WARN_REGEXP)
4789                 && !(PL_reg_flags & RF_warned))
4790             {
4791                 PL_reg_flags |= RF_warned;
4792                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4793                      "Complex regular subexpression recursion",
4794                      REG_INFTY - 1);
4795             }
4796
4797             /* now try B */
4798             ST.save_curlyx = cur_curlyx;
4799             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4800             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4801             /* NOTREACHED */
4802
4803         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4804             cur_curlyx = ST.save_curlyx;
4805             REGCP_UNWIND(ST.lastcp);
4806             regcppop(rex);
4807
4808             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4809                 /* Maximum greed exceeded */
4810                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4811                     && ckWARN(WARN_REGEXP)
4812                     && !(PL_reg_flags & RF_warned))
4813                 {
4814                     PL_reg_flags |= RF_warned;
4815                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4816                         "%s limit (%d) exceeded",
4817                         "Complex regular subexpression recursion",
4818                         REG_INFTY - 1);
4819                 }
4820                 cur_curlyx->u.curlyx.count--;
4821                 CACHEsayNO;
4822             }
4823
4824             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4825                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4826             );
4827             /* Try grabbing another A and see if it helps. */
4828             PL_reginput = locinput;
4829             cur_curlyx->u.curlyx.lastloc = locinput;
4830             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4831             REGCP_SET(ST.lastcp);
4832             PUSH_STATE_GOTO(WHILEM_A_min,
4833                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4834             /* NOTREACHED */
4835
4836 #undef  ST
4837 #define ST st->u.branch
4838
4839         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4840             next = scan + ARG(scan);
4841             if (next == scan)
4842                 next = NULL;
4843             scan = NEXTOPER(scan);
4844             /* FALL THROUGH */
4845
4846         case BRANCH:        /*  /(...|A|...)/ */
4847             scan = NEXTOPER(scan); /* scan now points to inner node */
4848             ST.lastparen = *PL_reglastparen;
4849             ST.next_branch = next;
4850             REGCP_SET(ST.cp);
4851             PL_reginput = locinput;
4852
4853             /* Now go into the branch */
4854             if (has_cutgroup) {
4855                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4856             } else {
4857                 PUSH_STATE_GOTO(BRANCH_next, scan);
4858             }
4859             /* NOTREACHED */
4860         case CUTGROUP:
4861             PL_reginput = locinput;
4862             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4863                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4864             PUSH_STATE_GOTO(CUTGROUP_next,next);
4865             /* NOTREACHED */
4866         case CUTGROUP_next_fail:
4867             do_cutgroup = 1;
4868             no_final = 1;
4869             if (st->u.mark.mark_name)
4870                 sv_commit = st->u.mark.mark_name;
4871             sayNO;
4872             /* NOTREACHED */
4873         case BRANCH_next:
4874             sayYES;
4875             /* NOTREACHED */
4876         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4877             if (do_cutgroup) {
4878                 do_cutgroup = 0;
4879                 no_final = 0;
4880             }
4881             REGCP_UNWIND(ST.cp);
4882             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4883                 PL_regoffs[n].end = -1;
4884             *PL_reglastparen = n;
4885             /*dmq: *PL_reglastcloseparen = n; */
4886             scan = ST.next_branch;
4887             /* no more branches? */
4888             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4889                 DEBUG_EXECUTE_r({
4890                     PerlIO_printf( Perl_debug_log,
4891                         "%*s  %sBRANCH failed...%s\n",
4892                         REPORT_CODE_OFF+depth*2, "",
4893                         PL_colors[4],
4894                         PL_colors[5] );
4895                 });
4896                 sayNO_SILENT;
4897             }
4898             continue; /* execute next BRANCH[J] op */
4899             /* NOTREACHED */
4900
4901         case MINMOD:
4902             minmod = 1;
4903             break;
4904
4905 #undef  ST
4906 #define ST st->u.curlym
4907
4908         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4909
4910             /* This is an optimisation of CURLYX that enables us to push
4911              * only a single backtracking state, no matter how many matches
4912              * there are in {m,n}. It relies on the pattern being constant
4913              * length, with no parens to influence future backrefs
4914              */
4915
4916             ST.me = scan;
4917             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4918
4919             /* if paren positive, emulate an OPEN/CLOSE around A */
4920             if (ST.me->flags) {
4921                 U32 paren = ST.me->flags;
4922                 if (paren > PL_regsize)
4923                     PL_regsize = paren;
4924                 if (paren > *PL_reglastparen)
4925                     *PL_reglastparen = paren;
4926                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4927             }
4928             ST.A = scan;
4929             ST.B = next;
4930             ST.alen = 0;
4931             ST.count = 0;
4932             ST.minmod = minmod;
4933             minmod = 0;
4934             ST.c1 = CHRTEST_UNINIT;
4935             REGCP_SET(ST.cp);
4936
4937             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4938                 goto curlym_do_B;
4939
4940           curlym_do_A: /* execute the A in /A{m,n}B/  */
4941             PL_reginput = locinput;
4942             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4943             /* NOTREACHED */
4944
4945         case CURLYM_A: /* we've just matched an A */
4946             locinput = st->locinput;
4947             nextchr = UCHARAT(locinput);
4948
4949             ST.count++;
4950             /* after first match, determine A's length: u.curlym.alen */
4951             if (ST.count == 1) {
4952                 if (PL_reg_match_utf8) {
4953                     char *s = locinput;
4954                     while (s < PL_reginput) {
4955                         ST.alen++;
4956                         s += UTF8SKIP(s);
4957                     }
4958                 }
4959                 else {
4960                     ST.alen = PL_reginput - locinput;
4961                 }
4962                 if (ST.alen == 0)
4963                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4964             }
4965             DEBUG_EXECUTE_r(
4966                 PerlIO_printf(Perl_debug_log,
4967                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4968                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4969                           (IV) ST.count, (IV)ST.alen)
4970             );
4971
4972             locinput = PL_reginput;
4973
4974             if (cur_eval && cur_eval->u.eval.close_paren &&
4975                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4976                 goto fake_end;
4977
4978             {
4979                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4980                 if ( max == REG_INFTY || ST.count < max )
4981                     goto curlym_do_A; /* try to match another A */
4982             }
4983             goto curlym_do_B; /* try to match B */
4984
4985         case CURLYM_A_fail: /* just failed to match an A */
4986             REGCP_UNWIND(ST.cp);
4987
4988             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4989                 || (cur_eval && cur_eval->u.eval.close_paren &&
4990                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4991                 sayNO;
4992
4993           curlym_do_B: /* execute the B in /A{m,n}B/  */
4994             PL_reginput = locinput;
4995             if (ST.c1 == CHRTEST_UNINIT) {
4996                 /* calculate c1 and c2 for possible match of 1st char
4997                  * following curly */
4998                 ST.c1 = ST.c2 = CHRTEST_VOID;
4999                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5000                     regnode *text_node = ST.B;
5001                     if (! HAS_TEXT(text_node))
5002                         FIND_NEXT_IMPT(text_node);
5003                     /* this used to be
5004
5005                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5006
5007                         But the former is redundant in light of the latter.
5008
5009                         if this changes back then the macro for
5010                         IS_TEXT and friends need to change.
5011                      */
5012                     if (PL_regkind[OP(text_node)] == EXACT)
5013                     {
5014
5015                         ST.c1 = (U8)*STRING(text_node);
5016                         switch (OP(text_node)) {
5017                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5018                             case EXACTFA:
5019                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5020                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5021                             default: ST.c2 = ST.c1;
5022                         }
5023                     }
5024                 }
5025             }
5026
5027             DEBUG_EXECUTE_r(
5028                 PerlIO_printf(Perl_debug_log,
5029                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5030                     (int)(REPORT_CODE_OFF+(depth*2)),
5031                     "", (IV)ST.count)
5032                 );
5033             if (ST.c1 != CHRTEST_VOID
5034                     && UCHARAT(PL_reginput) != ST.c1
5035                     && UCHARAT(PL_reginput) != ST.c2)
5036             {
5037                 /* simulate B failing */
5038                 DEBUG_OPTIMISE_r(
5039                     PerlIO_printf(Perl_debug_log,
5040                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5041                         (int)(REPORT_CODE_OFF+(depth*2)),"",
5042                         (IV)ST.c1,(IV)ST.c2
5043                 ));
5044                 state_num = CURLYM_B_fail;
5045                 goto reenter_switch;
5046             }
5047
5048             if (ST.me->flags) {
5049                 /* mark current A as captured */
5050                 I32 paren = ST.me->flags;
5051                 if (ST.count) {
5052                     PL_regoffs[paren].start
5053                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5054                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
5055                     /*dmq: *PL_reglastcloseparen = paren; */
5056                 }
5057                 else
5058                     PL_regoffs[paren].end = -1;
5059                 if (cur_eval && cur_eval->u.eval.close_paren &&
5060                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5061                 {
5062                     if (ST.count)
5063                         goto fake_end;
5064                     else
5065                         sayNO;
5066                 }
5067             }
5068
5069             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5070             /* NOTREACHED */
5071
5072         case CURLYM_B_fail: /* just failed to match a B */
5073             REGCP_UNWIND(ST.cp);
5074             if (ST.minmod) {
5075                 I32 max = ARG2(ST.me);
5076                 if (max != REG_INFTY && ST.count == max)
5077                     sayNO;
5078                 goto curlym_do_A; /* try to match a further A */
5079             }
5080             /* backtrack one A */
5081             if (ST.count == ARG1(ST.me) /* min */)
5082                 sayNO;
5083             ST.count--;
5084             locinput = HOPc(locinput, -ST.alen);
5085             goto curlym_do_B; /* try to match B */
5086
5087 #undef ST
5088 #define ST st->u.curly
5089
5090 #define CURLY_SETPAREN(paren, success) \
5091     if (paren) { \
5092         if (success) { \
5093             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5094             PL_regoffs[paren].end = locinput - PL_bostr; \
5095             *PL_reglastcloseparen = paren; \
5096         } \
5097         else \
5098             PL_regoffs[paren].end = -1; \
5099     }
5100
5101         case STAR:              /*  /A*B/ where A is width 1 */
5102             ST.paren = 0;
5103             ST.min = 0;
5104             ST.max = REG_INFTY;
5105             scan = NEXTOPER(scan);
5106             goto repeat;
5107         case PLUS:              /*  /A+B/ where A is width 1 */
5108             ST.paren = 0;
5109             ST.min = 1;
5110             ST.max = REG_INFTY;
5111             scan = NEXTOPER(scan);
5112             goto repeat;
5113         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5114             ST.paren = scan->flags;     /* Which paren to set */
5115             if (ST.paren > PL_regsize)
5116                 PL_regsize = ST.paren;
5117             if (ST.paren > *PL_reglastparen)
5118                 *PL_reglastparen = ST.paren;
5119             ST.min = ARG1(scan);  /* min to match */
5120             ST.max = ARG2(scan);  /* max to match */
5121             if (cur_eval && cur_eval->u.eval.close_paren &&
5122                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5123                 ST.min=1;
5124                 ST.max=1;
5125             }
5126             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5127             goto repeat;
5128         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5129             ST.paren = 0;
5130             ST.min = ARG1(scan);  /* min to match */
5131             ST.max = ARG2(scan);  /* max to match */
5132             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5133           repeat:
5134             /*
5135             * Lookahead to avoid useless match attempts
5136             * when we know what character comes next.
5137             *
5138             * Used to only do .*x and .*?x, but now it allows
5139             * for )'s, ('s and (?{ ... })'s to be in the way
5140             * of the quantifier and the EXACT-like node.  -- japhy
5141             */
5142
5143             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5144                 sayNO;
5145             if (HAS_TEXT(next) || JUMPABLE(next)) {
5146                 U8 *s;
5147                 regnode *text_node = next;
5148
5149                 if (! HAS_TEXT(text_node))
5150                     FIND_NEXT_IMPT(text_node);
5151
5152                 if (! HAS_TEXT(text_node))
5153                     ST.c1 = ST.c2 = CHRTEST_VOID;
5154                 else {
5155                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5156                         ST.c1 = ST.c2 = CHRTEST_VOID;
5157                         goto assume_ok_easy;
5158                     }
5159                     else
5160                         s = (U8*)STRING(text_node);
5161
5162                     /*  Currently we only get here when
5163
5164                         PL_regkind[OP(text_node)] == EXACT
5165
5166                         if this changes back then the macro for IS_TEXT and
5167                         friends need to change. */
5168                     if (!UTF_PATTERN) {
5169                         ST.c1 = *s;
5170                         switch (OP(text_node)) {
5171                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5172                             case EXACTFA:
5173                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5174                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5175                             default: ST.c2 = ST.c1; break;
5176                         }
5177                     }
5178                     else { /* UTF_PATTERN */
5179                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5180                              STRLEN ulen1, ulen2;
5181                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5182                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5183
5184                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5185                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5186 #ifdef EBCDIC
5187                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5188                                                     ckWARN(WARN_UTF8) ?
5189                                                     0 : UTF8_ALLOW_ANY);
5190                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5191                                                     ckWARN(WARN_UTF8) ?
5192                                                     0 : UTF8_ALLOW_ANY);
5193 #else
5194                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5195                                                     uniflags);
5196                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5197                                                     uniflags);
5198 #endif
5199                         }
5200                         else {
5201                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5202                                                      uniflags);
5203                         }
5204                     }
5205                 }
5206             }
5207             else
5208                 ST.c1 = ST.c2 = CHRTEST_VOID;
5209         assume_ok_easy:
5210
5211             ST.A = scan;
5212             ST.B = next;
5213             PL_reginput = locinput;
5214             if (minmod) {
5215                 minmod = 0;
5216                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5217                     sayNO;
5218                 ST.count = ST.min;
5219                 locinput = PL_reginput;
5220                 REGCP_SET(ST.cp);
5221                 if (ST.c1 == CHRTEST_VOID)
5222                     goto curly_try_B_min;
5223
5224                 ST.oldloc = locinput;
5225
5226                 /* set ST.maxpos to the furthest point along the
5227                  * string that could possibly match */
5228                 if  (ST.max == REG_INFTY) {
5229                     ST.maxpos = PL_regeol - 1;
5230                     if (utf8_target)
5231                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5232                             ST.maxpos--;
5233                 }
5234                 else if (utf8_target) {
5235                     int m = ST.max - ST.min;
5236                     for (ST.maxpos = locinput;
5237                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5238                         ST.maxpos += UTF8SKIP(ST.maxpos);
5239                 }
5240                 else {
5241                     ST.maxpos = locinput + ST.max - ST.min;
5242                     if (ST.maxpos >= PL_regeol)
5243                         ST.maxpos = PL_regeol - 1;
5244                 }
5245                 goto curly_try_B_min_known;
5246
5247             }
5248             else {
5249                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5250                 locinput = PL_reginput;
5251                 if (ST.count < ST.min)
5252                     sayNO;
5253                 if ((ST.count > ST.min)
5254                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5255                 {
5256                     /* A{m,n} must come at the end of the string, there's
5257                      * no point in backing off ... */
5258                     ST.min = ST.count;
5259                     /* ...except that $ and \Z can match before *and* after
5260                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5261                        We may back off by one in this case. */
5262                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5263                         ST.min--;
5264                 }
5265                 REGCP_SET(ST.cp);
5266                 goto curly_try_B_max;
5267             }
5268             /* NOTREACHED */
5269
5270
5271         case CURLY_B_min_known_fail:
5272             /* failed to find B in a non-greedy match where c1,c2 valid */
5273             if (ST.paren && ST.count)
5274                 PL_regoffs[ST.paren].end = -1;
5275
5276             PL_reginput = locinput;     /* Could be reset... */
5277             REGCP_UNWIND(ST.cp);
5278             /* Couldn't or didn't -- move forward. */
5279             ST.oldloc = locinput;
5280             if (utf8_target)
5281                 locinput += UTF8SKIP(locinput);
5282             else
5283                 locinput++;
5284             ST.count++;
5285           curly_try_B_min_known:
5286              /* find the next place where 'B' could work, then call B */
5287             {
5288                 int n;
5289                 if (utf8_target) {
5290                     n = (ST.oldloc == locinput) ? 0 : 1;
5291                     if (ST.c1 == ST.c2) {
5292                         STRLEN len;
5293                         /* set n to utf8_distance(oldloc, locinput) */
5294                         while (locinput <= ST.maxpos &&
5295                                utf8n_to_uvchr((U8*)locinput,
5296                                               UTF8_MAXBYTES, &len,
5297                                               uniflags) != (UV)ST.c1) {
5298                             locinput += len;
5299                             n++;
5300                         }
5301                     }
5302                     else {
5303                         /* set n to utf8_distance(oldloc, locinput) */
5304                         while (locinput <= ST.maxpos) {
5305                             STRLEN len;
5306                             const UV c = utf8n_to_uvchr((U8*)locinput,
5307                                                   UTF8_MAXBYTES, &len,
5308                                                   uniflags);
5309                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5310                                 break;
5311                             locinput += len;
5312                             n++;
5313                         }
5314                     }
5315                 }
5316                 else {
5317                     if (ST.c1 == ST.c2) {
5318                         while (locinput <= ST.maxpos &&
5319                                UCHARAT(locinput) != ST.c1)
5320                             locinput++;
5321                     }
5322                     else {
5323                         while (locinput <= ST.maxpos
5324                                && UCHARAT(locinput) != ST.c1
5325                                && UCHARAT(locinput) != ST.c2)
5326                             locinput++;
5327                     }
5328                     n = locinput - ST.oldloc;
5329                 }
5330                 if (locinput > ST.maxpos)
5331                     sayNO;
5332                 /* PL_reginput == oldloc now */
5333                 if (n) {
5334                     ST.count += n;
5335                     if (regrepeat(rex, ST.A, n, depth) < n)
5336                         sayNO;
5337                 }
5338                 PL_reginput = locinput;
5339                 CURLY_SETPAREN(ST.paren, ST.count);
5340                 if (cur_eval && cur_eval->u.eval.close_paren &&
5341                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5342                     goto fake_end;
5343                 }
5344                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5345             }
5346             /* NOTREACHED */
5347
5348
5349         case CURLY_B_min_fail:
5350             /* failed to find B in a non-greedy match where c1,c2 invalid */
5351             if (ST.paren && ST.count)
5352                 PL_regoffs[ST.paren].end = -1;
5353
5354             REGCP_UNWIND(ST.cp);
5355             /* failed -- move forward one */
5356             PL_reginput = locinput;
5357             if (regrepeat(rex, ST.A, 1, depth)) {
5358                 ST.count++;
5359                 locinput = PL_reginput;
5360                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5361                         ST.count > 0)) /* count overflow ? */
5362                 {
5363                   curly_try_B_min:
5364                     CURLY_SETPAREN(ST.paren, ST.count);
5365                     if (cur_eval && cur_eval->u.eval.close_paren &&
5366                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5367                         goto fake_end;
5368                     }
5369                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5370                 }
5371             }
5372             sayNO;
5373             /* NOTREACHED */
5374
5375
5376         curly_try_B_max:
5377             /* a successful greedy match: now try to match B */
5378             if (cur_eval && cur_eval->u.eval.close_paren &&
5379                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5380                 goto fake_end;
5381             }
5382             {
5383                 UV c = 0;
5384                 if (ST.c1 != CHRTEST_VOID)
5385                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5386                                            UTF8_MAXBYTES, 0, uniflags)
5387                                 : (UV) UCHARAT(PL_reginput);
5388                 /* If it could work, try it. */
5389                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5390                     CURLY_SETPAREN(ST.paren, ST.count);
5391                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5392                     /* NOTREACHED */
5393                 }
5394             }
5395             /* FALL THROUGH */
5396         case CURLY_B_max_fail:
5397             /* failed to find B in a greedy match */
5398             if (ST.paren && ST.count)
5399                 PL_regoffs[ST.paren].end = -1;
5400
5401             REGCP_UNWIND(ST.cp);
5402             /*  back up. */
5403             if (--ST.count < ST.min)
5404                 sayNO;
5405             PL_reginput = locinput = HOPc(locinput, -1);
5406             goto curly_try_B_max;
5407
5408 #undef ST
5409
5410         case END:
5411             fake_end:
5412             if (cur_eval) {
5413                 /* we've just finished A in /(??{A})B/; now continue with B */
5414                 I32 tmpix;
5415                 st->u.eval.toggle_reg_flags
5416                             = cur_eval->u.eval.toggle_reg_flags;
5417                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5418
5419                 st->u.eval.prev_rex = rex_sv;           /* inner */
5420                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5421                 rex = (struct regexp *)SvANY(rex_sv);
5422                 rexi = RXi_GET(rex);
5423                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5424                 ReREFCNT_inc(rex_sv);
5425                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5426
5427                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5428                 PL_reglastparen = &rex->lastparen;
5429                 PL_reglastcloseparen = &rex->lastcloseparen;
5430
5431                 REGCP_SET(st->u.eval.lastcp);
5432                 PL_reginput = locinput;
5433
5434                 /* Restore parens of the outer rex without popping the
5435                  * savestack */
5436                 tmpix = PL_savestack_ix;
5437                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5438                 regcppop(rex);
5439                 PL_savestack_ix = tmpix;
5440
5441                 st->u.eval.prev_eval = cur_eval;
5442                 cur_eval = cur_eval->u.eval.prev_eval;
5443                 DEBUG_EXECUTE_r(
5444                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5445                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5446                 if ( nochange_depth )
5447                     nochange_depth--;
5448
5449                 PUSH_YES_STATE_GOTO(EVAL_AB,
5450                         st->u.eval.prev_eval->u.eval.B); /* match B */
5451             }
5452
5453             if (locinput < reginfo->till) {
5454                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5455                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5456                                       PL_colors[4],
5457                                       (long)(locinput - PL_reg_starttry),
5458                                       (long)(reginfo->till - PL_reg_starttry),
5459                                       PL_colors[5]));
5460
5461                 sayNO_SILENT;           /* Cannot match: too short. */
5462             }
5463             PL_reginput = locinput;     /* put where regtry can find it */
5464             sayYES;                     /* Success! */
5465
5466         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5467             DEBUG_EXECUTE_r(
5468             PerlIO_printf(Perl_debug_log,
5469                 "%*s  %ssubpattern success...%s\n",
5470                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5471             PL_reginput = locinput;     /* put where regtry can find it */
5472             sayYES;                     /* Success! */
5473
5474 #undef  ST
5475 #define ST st->u.ifmatch
5476
5477         case SUSPEND:   /* (?>A) */
5478             ST.wanted = 1;
5479             PL_reginput = locinput;
5480             goto do_ifmatch;
5481
5482         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5483             ST.wanted = 0;
5484             goto ifmatch_trivial_fail_test;
5485
5486         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5487             ST.wanted = 1;
5488           ifmatch_trivial_fail_test:
5489             if (scan->flags) {
5490                 char * const s = HOPBACKc(locinput, scan->flags);
5491                 if (!s) {
5492                     /* trivial fail */
5493                     if (logical) {
5494                         logical = 0;
5495                         sw = 1 - cBOOL(ST.wanted);
5496                     }
5497                     else if (ST.wanted)
5498                         sayNO;
5499                     next = scan + ARG(scan);
5500                     if (next == scan)
5501                         next = NULL;
5502                     break;
5503                 }
5504                 PL_reginput = s;
5505             }
5506             else
5507                 PL_reginput = locinput;
5508
5509           do_ifmatch:
5510             ST.me = scan;
5511             ST.logical = logical;
5512             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5513
5514             /* execute body of (?...A) */
5515             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5516             /* NOTREACHED */
5517
5518         case IFMATCH_A_fail: /* body of (?...A) failed */
5519             ST.wanted = !ST.wanted;
5520             /* FALL THROUGH */
5521
5522         case IFMATCH_A: /* body of (?...A) succeeded */
5523             if (ST.logical) {
5524                 sw = cBOOL(ST.wanted);
5525             }
5526             else if (!ST.wanted)
5527                 sayNO;
5528
5529             if (OP(ST.me) == SUSPEND)
5530                 locinput = PL_reginput;
5531             else {
5532                 locinput = PL_reginput = st->locinput;
5533                 nextchr = UCHARAT(locinput);
5534             }
5535             scan = ST.me + ARG(ST.me);
5536             if (scan == ST.me)
5537                 scan = NULL;
5538             continue; /* execute B */
5539
5540 #undef ST
5541
5542         case LONGJMP:
5543             next = scan + ARG(scan);
5544             if (next == scan)
5545                 next = NULL;
5546             break;
5547         case COMMIT:
5548             reginfo->cutpoint = PL_regeol;
5549             /* FALLTHROUGH */
5550         case PRUNE:
5551             PL_reginput = locinput;
5552             if (!scan->flags)
5553                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5554             PUSH_STATE_GOTO(COMMIT_next,next);
5555             /* NOTREACHED */
5556         case COMMIT_next_fail:
5557             no_final = 1;
5558             /* FALLTHROUGH */
5559         case OPFAIL:
5560             sayNO;
5561             /* NOTREACHED */
5562
5563 #define ST st->u.mark
5564         case MARKPOINT:
5565             ST.prev_mark = mark_state;
5566             ST.mark_name = sv_commit = sv_yes_mark
5567                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5568             mark_state = st;
5569             ST.mark_loc = PL_reginput = locinput;
5570             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5571             /* NOTREACHED */
5572         case MARKPOINT_next:
5573             mark_state = ST.prev_mark;
5574             sayYES;
5575             /* NOTREACHED */
5576         case MARKPOINT_next_fail:
5577             if (popmark && sv_eq(ST.mark_name,popmark))
5578             {
5579                 if (ST.mark_loc > startpoint)
5580                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5581                 popmark = NULL; /* we found our mark */
5582                 sv_commit = ST.mark_name;
5583
5584                 DEBUG_EXECUTE_r({
5585                         PerlIO_printf(Perl_debug_log,
5586                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5587                             REPORT_CODE_OFF+depth*2, "",
5588                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5589                 });
5590             }
5591             mark_state = ST.prev_mark;
5592             sv_yes_mark = mark_state ?
5593                 mark_state->u.mark.mark_name : NULL;
5594             sayNO;
5595             /* NOTREACHED */
5596         case SKIP:
5597             PL_reginput = locinput;
5598             if (scan->flags) {
5599                 /* (*SKIP) : if we fail we cut here*/
5600                 ST.mark_name = NULL;
5601                 ST.mark_loc = locinput;
5602                 PUSH_STATE_GOTO(SKIP_next,next);
5603             } else {
5604                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5605                    otherwise do nothing.  Meaning we need to scan
5606                  */
5607                 regmatch_state *cur = mark_state;
5608                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5609
5610                 while (cur) {
5611                     if ( sv_eq( cur->u.mark.mark_name,
5612                                 find ) )
5613                     {
5614                         ST.mark_name = find;
5615                         PUSH_STATE_GOTO( SKIP_next, next );
5616                     }
5617                     cur = cur->u.mark.prev_mark;
5618                 }
5619             }
5620             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5621             break;
5622         case SKIP_next_fail:
5623             if (ST.mark_name) {
5624                 /* (*CUT:NAME) - Set up to search for the name as we
5625                    collapse the stack*/
5626                 popmark = ST.mark_name;
5627             } else {
5628                 /* (*CUT) - No name, we cut here.*/
5629                 if (ST.mark_loc > startpoint)
5630                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5631                 /* but we set sv_commit to latest mark_name if there
5632                    is one so they can test to see how things lead to this
5633                    cut */
5634                 if (mark_state)
5635                     sv_commit=mark_state->u.mark.mark_name;
5636             }
5637             no_final = 1;
5638             sayNO;
5639             /* NOTREACHED */
5640 #undef ST
5641         case FOLDCHAR:
5642             n = ARG(scan);
5643             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5644                 locinput += ln;
5645             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5646                 sayNO;
5647             } else  {
5648                 U8 folded[UTF8_MAXBYTES_CASE+1];
5649                 STRLEN foldlen;
5650                 const char * const l = locinput;
5651                 char *e = PL_regeol;
5652                 to_uni_fold(n, folded, &foldlen);
5653
5654                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5655                                l, &e, 0,  utf8_target)) {
5656                         sayNO;
5657                 }
5658                 locinput = e;
5659             }
5660             nextchr = UCHARAT(locinput);
5661             break;
5662         case LNBREAK:
5663             if ((n=is_LNBREAK(locinput,utf8_target))) {
5664                 locinput += n;
5665                 nextchr = UCHARAT(locinput);
5666             } else
5667                 sayNO;
5668             break;
5669
5670 #define CASE_CLASS(nAmE)                              \
5671         case nAmE:                                    \
5672             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5673                 locinput += n;                        \
5674                 nextchr = UCHARAT(locinput);          \
5675             } else                                    \
5676                 sayNO;                                \
5677             break;                                    \
5678         case N##nAmE:                                 \
5679             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5680                 sayNO;                                \
5681             } else {                                  \
5682                 locinput += UTF8SKIP(locinput);       \
5683                 nextchr = UCHARAT(locinput);          \
5684             }                                         \
5685             break
5686
5687         CASE_CLASS(VERTWS);
5688         CASE_CLASS(HORIZWS);
5689 #undef CASE_CLASS
5690
5691         default:
5692             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5693                           PTR2UV(scan), OP(scan));
5694             Perl_croak(aTHX_ "regexp memory corruption");
5695
5696         } /* end switch */
5697
5698         /* switch break jumps here */
5699         scan = next; /* prepare to execute the next op and ... */
5700         continue;    /* ... jump back to the top, reusing st */
5701         /* NOTREACHED */
5702
5703       push_yes_state:
5704         /* push a state that backtracks on success */
5705         st->u.yes.prev_yes_state = yes_state;
5706         yes_state = st;
5707         /* FALL THROUGH */
5708       push_state:
5709         /* push a new regex state, then continue at scan  */
5710         {
5711             regmatch_state *newst;
5712
5713             DEBUG_STACK_r({
5714                 regmatch_state *cur = st;
5715                 regmatch_state *curyes = yes_state;
5716                 int curd = depth;
5717                 regmatch_slab *slab = PL_regmatch_slab;
5718                 for (;curd > -1;cur--,curd--) {
5719                     if (cur < SLAB_FIRST(slab)) {
5720                         slab = slab->prev;
5721                         cur = SLAB_LAST(slab);
5722                     }
5723                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5724                         REPORT_CODE_OFF + 2 + depth * 2,"",
5725                         curd, PL_reg_name[cur->resume_state],
5726                         (curyes == cur) ? "yes" : ""
5727                     );
5728                     if (curyes == cur)
5729                         curyes = cur->u.yes.prev_yes_state;
5730                 }
5731             } else
5732                 DEBUG_STATE_pp("push")
5733             );
5734             depth++;
5735             st->locinput = locinput;
5736             newst = st+1;
5737             if (newst >  SLAB_LAST(PL_regmatch_slab))
5738                 newst = S_push_slab(aTHX);
5739             PL_regmatch_state = newst;
5740
5741             locinput = PL_reginput;
5742             nextchr = UCHARAT(locinput);
5743             st = newst;
5744             continue;
5745             /* NOTREACHED */
5746         }
5747     }
5748
5749     /*
5750     * We get here only if there's trouble -- normally "case END" is
5751     * the terminating point.
5752     */
5753     Perl_croak(aTHX_ "corrupted regexp pointers");
5754     /*NOTREACHED*/
5755     sayNO;
5756
5757 yes:
5758     if (yes_state) {
5759         /* we have successfully completed a subexpression, but we must now
5760          * pop to the state marked by yes_state and continue from there */
5761         assert(st != yes_state);
5762 #ifdef DEBUGGING
5763         while (st != yes_state) {
5764             st--;
5765             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5766                 PL_regmatch_slab = PL_regmatch_slab->prev;
5767                 st = SLAB_LAST(PL_regmatch_slab);
5768             }
5769             DEBUG_STATE_r({
5770                 if (no_final) {
5771                     DEBUG_STATE_pp("pop (no final)");
5772                 } else {
5773                     DEBUG_STATE_pp("pop (yes)");
5774                 }
5775             });
5776             depth--;
5777         }
5778 #else
5779         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5780             || yes_state > SLAB_LAST(PL_regmatch_slab))
5781         {
5782             /* not in this slab, pop slab */
5783             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5784             PL_regmatch_slab = PL_regmatch_slab->prev;
5785             st = SLAB_LAST(PL_regmatch_slab);
5786         }
5787         depth -= (st - yes_state);
5788 #endif
5789         st = yes_state;
5790         yes_state = st->u.yes.prev_yes_state;
5791         PL_regmatch_state = st;
5792
5793         if (no_final) {
5794             locinput= st->locinput;
5795             nextchr = UCHARAT(locinput);
5796         }
5797         state_num = st->resume_state + no_final;
5798         goto reenter_switch;
5799     }
5800
5801     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5802                           PL_colors[4], PL_colors[5]));
5803
5804     if (PL_reg_eval_set) {
5805         /* each successfully executed (?{...}) block does the equivalent of
5806          *   local $^R = do {...}
5807          * When popping the save stack, all these locals would be undone;
5808          * bypass this by setting the outermost saved $^R to the latest
5809          * value */
5810         if (oreplsv != GvSV(PL_replgv))
5811             sv_setsv(oreplsv, GvSV(PL_replgv));
5812     }
5813     result = 1;
5814     goto final_exit;
5815
5816 no:
5817     DEBUG_EXECUTE_r(
5818         PerlIO_printf(Perl_debug_log,
5819             "%*s  %sfailed...%s\n",
5820             REPORT_CODE_OFF+depth*2, "",
5821             PL_colors[4], PL_colors[5])
5822         );
5823
5824 no_silent:
5825     if (no_final) {
5826         if (yes_state) {
5827             goto yes;
5828         } else {
5829             goto final_exit;
5830         }
5831     }
5832     if (depth) {
5833         /* there's a previous state to backtrack to */
5834         st--;
5835         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5836             PL_regmatch_slab = PL_regmatch_slab->prev;
5837             st = SLAB_LAST(PL_regmatch_slab);
5838         }
5839         PL_regmatch_state = st;
5840         locinput= st->locinput;
5841         nextchr = UCHARAT(locinput);
5842
5843         DEBUG_STATE_pp("pop");
5844         depth--;
5845         if (yes_state == st)
5846             yes_state = st->u.yes.prev_yes_state;
5847
5848         state_num = st->resume_state + 1; /* failure = success + 1 */
5849         goto reenter_switch;
5850     }
5851     result = 0;
5852
5853   final_exit:
5854     if (rex->intflags & PREGf_VERBARG_SEEN) {
5855         SV *sv_err = get_sv("REGERROR", 1);
5856         SV *sv_mrk = get_sv("REGMARK", 1);
5857         if (result) {
5858             sv_commit = &PL_sv_no;
5859             if (!sv_yes_mark)
5860                 sv_yes_mark = &PL_sv_yes;
5861         } else {
5862             if (!sv_commit)
5863                 sv_commit = &PL_sv_yes;
5864             sv_yes_mark = &PL_sv_no;
5865         }
5866         sv_setsv(sv_err, sv_commit);
5867         sv_setsv(sv_mrk, sv_yes_mark);
5868     }
5869
5870     /* clean up; in particular, free all slabs above current one */
5871     LEAVE_SCOPE(oldsave);
5872
5873     return result;
5874 }
5875
5876 /*
5877  - regrepeat - repeatedly match something simple, report how many
5878  *
5879  * Greedily matches node p from PL_reginput up to max times (REG_INFTY is
5880  * treated as I32_MAX), moves PL_reginput past the last match, returns the
5881  * count.  Things of byte length 1 are counted as scan - PL_reginput; where
5882  * a match can be wider (UTF-8 targets, LNBREAK) hardcount is kept instead.
5883  */
5884 STATIC I32
5885 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5886 {
5887     dVAR;
5888     register char *scan;
5889     register I32 c;
5890     register char *loceol = PL_regeol;
5891     register I32 hardcount = 0; /* explicit match count, when one is kept */
5892     register bool utf8_target = PL_reg_match_utf8;
5893     UV utf8_flags;
5894 #ifndef DEBUGGING
5895     PERL_UNUSED_ARG(depth);
5896 #endif
5897
5898     PERL_ARGS_ASSERT_REGREPEAT;
5899
5900     scan = PL_reginput;
5901     if (max == REG_INFTY)
5902         max = I32_MAX;
5903     else if (max < loceol - scan)
5904         loceol = scan + max; /* byte cap; multi-byte branches reset it */
5905     switch (OP(p)) {
5906     case REG_ANY:
5907         if (utf8_target) {
5908             loceol = PL_regeol;
5909             while (scan < loceol && hardcount < max && *scan != '\n') {
5910                 scan += UTF8SKIP(scan);
5911                 hardcount++;
5912             }
5913         } else {
5914             while (scan < loceol && *scan != '\n')
5915                 scan++;
5916         }
5917         break;
5918     case SANY:
5919         if (utf8_target) {
5920             loceol = PL_regeol;
5921             while (scan < loceol && hardcount < max) {
5922                 scan += UTF8SKIP(scan);
5923                 hardcount++;
5924             }
5925         }
5926         else
5927             scan = loceol;
5928         break;
5929     case CANY:
5930         scan = loceol;
5931         break;
5932     case EXACT:
5933         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5934          * means they match only characters in the string that can be expressed
5935          * as a single byte.  For non-utf8 strings, that means a simple match.
5936          * For utf8 strings, the character matched must be an invariant, or
5937          * downgradable to a single byte.  The pattern's utf8ness is
5938          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5939          * it is, it's an invariant */
5940
5941         c = (U8)*STRING(p);
5942         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5943
5944         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5945             while (scan < loceol && UCHARAT(scan) == c) {
5946                 scan++;
5947             }
5948         }
5949         else {
5950
5951             /* Here, the string is utf8, and the pattern char is different
5952              * in utf8 than not, so can't compare them directly.  Outside the
5953              * loop, find the two utf8 bytes that represent c, and then
5954              * look for those in sequence in the utf8 string */
5955             U8 high = UTF8_TWO_BYTE_HI(c);
5956             U8 low = UTF8_TWO_BYTE_LO(c);
5957             loceol = PL_regeol;
5958
5959             while (hardcount < max
5960                     && scan + 1 < loceol
5961                     && UCHARAT(scan) == high
5962                     && UCHARAT(scan + 1) == low)
5963             {
5964                 scan += 2;
5965                 hardcount++;
5966             }
5967         }
5968         break;
5969     case EXACTFA:
5970         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5971         goto do_exactf;
5972
5973     case EXACTFL:
5974         PL_reg_flags |= RF_tainted;
5975         utf8_flags = FOLDEQ_UTF8_LOCALE;
5976         goto do_exactf;
5977
5978     case EXACTF:
5979     case EXACTFU:
5980         utf8_flags = 0;
5981
5982         /* The comments for the EXACT case above apply as well to these fold
5983          * ones */
5984
5985     do_exactf:
5986         c = (U8)*STRING(p);
5987         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5988
5989         if (utf8_target) { /* Use full Unicode fold matching */
5990             char *tmpeol = loceol;
5991             while (hardcount < max
5992                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5993                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5994             {
5995                 scan = tmpeol;
5996                 tmpeol = loceol;
5997                 hardcount++;
5998             }
5999
6000             /* XXX Note that the above handles properly the German sharp s in
6001              * the pattern matching ss in the string.  But it doesn't handle
6002              * properly cases where the string contains say 'LIGATURE ff' and
6003              * the pattern is 'f+'.  This would require, say, a new function or
6004              * revised interface to foldEQ_utf8(), in which the maximum number
6005              * of characters to match could be passed and it would return how
6006              * many actually did.  This is just one of many cases where
6007              * multi-char folds don't work properly, and so the fix is being
6008              * deferred */
6009         }
6010         else {
6011             U8 folded;
6012
6013             /* Here, the string isn't utf8 and c is a single byte; and either
6014              * the pattern isn't utf8 or c is an invariant, so its utf8ness
6015              * doesn't affect c.  Can just do simple comparisons for exact or
6016              * fold matching. */
6017             switch (OP(p)) {
6018                 case EXACTF: folded = PL_fold[c]; break;
6019                 case EXACTFA:
6020                 case EXACTFU: folded = PL_fold_latin1[c]; break;
6021                 case EXACTFL: folded = PL_fold_locale[c]; break;
6022                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6023             }
6024             while (scan < loceol &&
6025                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6026             {
6027                 scan++;
6028             }
6029         }
6030         break;
6031     case ANYOFV:
6032     case ANYOF:
6033         if (utf8_target || OP(p) == ANYOFV) {
6034             STRLEN inclasslen;
6035             loceol = PL_regeol;
6036             inclasslen = loceol - scan;
6037             while (hardcount < max
6038                    && ((inclasslen = loceol - scan) > 0)
6039                    && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6040             {
6041                 scan += inclasslen;
6042                 hardcount++;
6043             }
6044         } else {
6045             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6046                 scan++;
6047         }
6048         break;
6049     case ALNUMU:
6050         if (utf8_target) {
6051     utf8_wordchar:
6052             loceol = PL_regeol;
6053             LOAD_UTF8_CHARCLASS_ALNUM();
6054             while (hardcount < max && scan < loceol &&
6055                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6056             {
6057                 scan += UTF8SKIP(scan);
6058                 hardcount++;
6059             }
6060         } else {
6061             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6062                 scan++;
6063             }
6064         }
6065         break;
6066     case ALNUM:
6067         if (utf8_target)
6068             goto utf8_wordchar;
6069         while (scan < loceol && isALNUM((U8) *scan)) {
6070             scan++;
6071         }
6072         break;
6073     case ALNUMA:
6074         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6075             scan++;
6076         }
6077         break;
6078     case ALNUML:
6079         PL_reg_flags |= RF_tainted;
6080         if (utf8_target) {
6081             loceol = PL_regeol;
6082             while (hardcount < max && scan < loceol &&
6083                    isALNUM_LC_utf8((U8*)scan)) {
6084                 scan += UTF8SKIP(scan);
6085                 hardcount++;
6086             }
6087         } else {
6088             while (scan < loceol && isALNUM_LC(*scan))
6089                 scan++;
6090         }
6091         break;
6092     case NALNUMU:
6093         if (utf8_target) {
6094
6095     utf8_Nwordchar:
6096
6097             loceol = PL_regeol;
6098             LOAD_UTF8_CHARCLASS_ALNUM();
6099             while (hardcount < max && scan < loceol &&
6100                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6101             {
6102                 scan += UTF8SKIP(scan);
6103                 hardcount++;
6104             }
6105         } else {
6106             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6107                 scan++;
6108             }
6109         }
6110         break;
6111     case NALNUM:
6112         if (utf8_target)
6113             goto utf8_Nwordchar;
6114         while (scan < loceol && ! isALNUM((U8) *scan)) {
6115             scan++;
6116         }
6117         break;
6118     case NALNUMA:
6119         if (utf8_target) {
6120             loceol = PL_regeol; /* max counts characters here, not bytes */
6121             while (hardcount < max && scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6122                 scan += UTF8SKIP(scan);
6123                 hardcount++;
6124             }
6125         } else {
6126             while (scan < loceol && ! isWORDCHAR_A((U8) *scan))
6127                 scan++;
6128         }
6129         break;
6130     case NALNUML:
6131         PL_reg_flags |= RF_tainted;
6132         if (utf8_target) {
6133             loceol = PL_regeol;
6134             while (hardcount < max && scan < loceol &&
6135                    !isALNUM_LC_utf8((U8*)scan)) {
6136                 scan += UTF8SKIP(scan);
6137                 hardcount++;
6138             }
6139         } else {
6140             while (scan < loceol && !isALNUM_LC(*scan))
6141                 scan++;
6142         }
6143         break;
6144     case SPACEU:
6145         if (utf8_target) {
6146
6147     utf8_space:
6148
6149             loceol = PL_regeol;
6150             LOAD_UTF8_CHARCLASS_SPACE();
6151             while (hardcount < max && scan < loceol &&
6152                    (*scan == ' ' ||
6153                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6154             {
6155                 scan += UTF8SKIP(scan);
6156                 hardcount++;
6157             }
6158             break;
6159         }
6160         else {
6161             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6162                 scan++;
6163             }
6164             break;
6165         }
6166     case SPACE:
6167         if (utf8_target)
6168             goto utf8_space;
6169
6170         while (scan < loceol && isSPACE((U8) *scan)) {
6171             scan++;
6172         }
6173         break;
6174     case SPACEA:
6175         while (scan < loceol && isSPACE_A((U8) *scan)) {
6176             scan++;
6177         }
6178         break;
6179     case SPACEL:
6180         PL_reg_flags |= RF_tainted;
6181         if (utf8_target) {
6182             loceol = PL_regeol;
6183             while (hardcount < max && scan < loceol &&
6184                    isSPACE_LC_utf8((U8*)scan)) {
6185                 scan += UTF8SKIP(scan);
6186                 hardcount++;
6187             }
6188         } else {
6189             while (scan < loceol && isSPACE_LC(*scan))
6190                 scan++;
6191         }
6192         break;
6193     case NSPACEU:
6194         if (utf8_target) {
6195
6196     utf8_Nspace:
6197
6198             loceol = PL_regeol;
6199             LOAD_UTF8_CHARCLASS_SPACE();
6200             while (hardcount < max && scan < loceol &&
6201                    ! (*scan == ' ' ||
6202                       swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6203             {
6204                 scan += UTF8SKIP(scan);
6205                 hardcount++;
6206             }
6207             break;
6208         }
6209         else {
6210             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6211                 scan++;
6212             }
6213         }
6214         break;
6215     case NSPACE:
6216         if (utf8_target)
6217             goto utf8_Nspace;
6218
6219         while (scan < loceol && ! isSPACE((U8) *scan)) {
6220             scan++;
6221         }
6222         break;
6223     case NSPACEA:
6224         if (utf8_target) {
6225             loceol = PL_regeol; /* max counts characters here, not bytes */
6226             while (hardcount < max && scan < loceol && ! isSPACE_A((U8) *scan)) {
6227                 scan += UTF8SKIP(scan);
6228                 hardcount++;
6229             }
6230         } else {
6231             while (scan < loceol && ! isSPACE_A((U8) *scan))
6232                 scan++;
6233         }
6234         break;
6235     case NSPACEL:
6236         PL_reg_flags |= RF_tainted;
6237         if (utf8_target) {
6238             loceol = PL_regeol;
6239             while (hardcount < max && scan < loceol &&
6240                    !isSPACE_LC_utf8((U8*)scan)) {
6241                 scan += UTF8SKIP(scan);
6242                 hardcount++;
6243             }
6244         } else {
6245             while (scan < loceol && !isSPACE_LC(*scan))
6246                 scan++;
6247         }
6248         break;
6249     case DIGIT:
6250         if (utf8_target) {
6251             loceol = PL_regeol;
6252             LOAD_UTF8_CHARCLASS_DIGIT();
6253             while (hardcount < max && scan < loceol &&
6254                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6255                 scan += UTF8SKIP(scan);
6256                 hardcount++;
6257             }
6258         } else {
6259             while (scan < loceol && isDIGIT(*scan))
6260                 scan++;
6261         }
6262         break;
6263     case DIGITA:
6264         while (scan < loceol && isDIGIT_A((U8) *scan)) {
6265             scan++;
6266         }
6267         break;
6268     case DIGITL:
6269         PL_reg_flags |= RF_tainted;
6270         if (utf8_target) {
6271             loceol = PL_regeol;
6272             while (hardcount < max && scan < loceol &&
6273                    isDIGIT_LC_utf8((U8*)scan)) {
6274                 scan += UTF8SKIP(scan);
6275                 hardcount++;
6276             }
6277         } else {
6278             while (scan < loceol && isDIGIT_LC(*scan))
6279                 scan++;
6280         }
6281         break;
6282     case NDIGIT:
6283         if (utf8_target) {
6284             loceol = PL_regeol;
6285             LOAD_UTF8_CHARCLASS_DIGIT();
6286             while (hardcount < max && scan < loceol &&
6287                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6288                 scan += UTF8SKIP(scan);
6289                 hardcount++;
6290             }
6291         } else {
6292             while (scan < loceol && !isDIGIT(*scan))
6293                 scan++;
6294         }
6295         break;
6296     case NDIGITA:
6297         if (utf8_target) {
6298             loceol = PL_regeol; /* max counts characters here, not bytes */
6299             while (hardcount < max && scan < loceol && ! isDIGIT_A((U8) *scan)) {
6300                 scan += UTF8SKIP(scan);
6301                 hardcount++;
6302             }
6303         } else {
6304             while (scan < loceol && ! isDIGIT_A((U8) *scan))
6305                 scan++;
6306         }
6307         break;
6308     case NDIGITL:
6309         PL_reg_flags |= RF_tainted;
6310         if (utf8_target) {
6311             loceol = PL_regeol;
6312             while (hardcount < max && scan < loceol &&
6313                    !isDIGIT_LC_utf8((U8*)scan)) {
6314                 scan += UTF8SKIP(scan);
6315                 hardcount++;
6316             }
6317         } else {
6318             while (scan < loceol && !isDIGIT_LC(*scan))
6319                 scan++;
6320         }
6321         break;
6322     case LNBREAK:
6323         if (utf8_target) {
6324             loceol = PL_regeol;
6325             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6326                 scan += c;
6327                 hardcount++;
6328             }
6329         } else {
6330             /*
6331               LNBREAK can match two latin chars, which is ok,
6332               because we have a null terminated string, but we
6333               have to use hardcount in this situation
6334             */
6335             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6336                 scan+=c;
6337                 hardcount++;
6338             }
6339         }
6340         break;
6341     case HORIZWS:
6342         if (utf8_target) {
6343             loceol = PL_regeol;
6344             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6345                 scan += c;
6346                 hardcount++;
6347             }
6348         } else {
6349             while (scan < loceol && is_HORIZWS_latin1(scan))
6350                 scan++;
6351         }
6352         break;
6353     case NHORIZWS:
6354         if (utf8_target) {
6355             loceol = PL_regeol;
6356             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6357                 scan += UTF8SKIP(scan);
6358                 hardcount++;
6359             }
6360         } else {
6361             while (scan < loceol && !is_HORIZWS_latin1(scan))
6362                 scan++;
6363
6364         }
6365         break;
6366     case VERTWS:
6367         if (utf8_target) {
6368             loceol = PL_regeol;
6369             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6370                 scan += c;
6371                 hardcount++;
6372             }
6373         } else {
6374             while (scan < loceol && is_VERTWS_latin1(scan))
6375                 scan++;
6376
6377         }
6378         break;
6379     case NVERTWS:
6380         if (utf8_target) {
6381             loceol = PL_regeol;
6382             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6383                 scan += UTF8SKIP(scan);
6384                 hardcount++;
6385             }
6386         } else {
6387             while (scan < loceol && !is_VERTWS_latin1(scan))
6388                 scan++;
6389
6390         }
6391         break;
6392
6393     default:            /* Called on something of 0 width. */
6394         break;          /* So match right here or not at all. */
6395     }
6396
6397     if (hardcount)
6398         c = hardcount;          /* matches were counted one at a time */
6399     else
6400         c = scan - PL_reginput; /* one byte per match: distance is the count */
6401     PL_reginput = scan;
6402
6403     DEBUG_r({
6404         GET_RE_DEBUG_FLAGS_DECL;
6405         DEBUG_EXECUTE_r({
6406             SV * const prop = sv_newmortal();
6407             regprop(prog, prop, p);
6408             PerlIO_printf(Perl_debug_log,
6409                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6410                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6411         });
6412     });
6413
6414     return(c);
6415 }
6416
6417
6418 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6419 /*
6420 - regclass_swash - prepare the utf8 swash
6421  *
6422  * Looks up the data slot numbered ARG(node).  If it is of type 's' it holds
6423  * a reference to an array: element 0 is handed back through listsvp,
6424  * element 1 is the swash when it is a reference, and element 2 is handed
6425  * back through altsvp when it is an AV.  When the swash is absent it is
6426  * created from element 0 with swash_init() and stored in element 1, but
6427  * only if doinit is true.  Returns the swash, or NULL if there is none.
6428  */
6429 SV *
6430 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6431 {
6432     dVAR;
6433     SV *swash = NULL;
6434     SV *list = NULL;
6435     SV *alternates = NULL;
6436     RXi_GET_DECL(prog,progi);
6437     const struct reg_data * const rdata = prog ? progi->data : NULL;
6438
6439     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6440
6441     if (rdata && rdata->count && rdata->what[ARG(node)] == 's') {
6442         SV * const ref = MUTABLE_SV(rdata->data[ARG(node)]);
6443         AV * const av = MUTABLE_AV(SvRV(ref));
6444         SV ** const elems = AvARRAY(av);
6445         /* See the end of regcomp.c:S_regclass() for
6446          * documentation of these array elements. */
6447         const bool swash_present = cBOOL(SvROK(elems[1]));
6448         const bool alt_present = cBOOL(SvTYPE(elems[2]) == SVt_PVAV);
6449
6450         list = elems[0];
6451         if (swash_present) {
6452             swash = elems[1];
6453         }
6454         else if (list && doinit) {
6455             /* build it on demand and store it in element 1 */
6456             swash = swash_init("utf8", "", list, 1, 0);
6457             (void)av_store(av, 1, swash);
6458         }
6459         if (alt_present)
6460             alternates = elems[2];
6461     }
6462
6463     if (listsvp)
6464         *listsvp = list;
6465     if (altsvp)
6466         *altsvp = alternates;
6467     return swash;
6468 }
6469 #endif
6470
6471 /*
6472  - reginclass - determine if a character falls into a character class
6473
6474   n is the ANYOF regnode
6475   p is the target string
6476   lenp is pointer to the maximum number of bytes of how far to go in p
6477     (This is assumed wthout checking to always be at least the current
6478     character's size)
6479   utf8_target tells whether p is in UTF-8.
6480
6481   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6482   from a successful match, the value it points to will be updated to how many
6483   bytes in p were matched.  If there was no match, the value is undefined,
6484   possibly changed from the input.
6485
6486   Note that this can be a synthetic start class, a combination of various
6487   nodes, so things you think might be mutually exclusive, such as locale,
6488   aren't.  It can match both locale and non-locale
6489
6490  */
6491
6492 STATIC bool
6493 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6494 {
6495     dVAR;
6496     const char flags = ANYOF_FLAGS(n);
6497     bool match = FALSE;
6498     UV c = *p;
6499     STRLEN c_len = 0;
6500     STRLEN maxlen;
6501
6502     PERL_ARGS_ASSERT_REGINCLASS;
6503
6504     /* If c is not already the code point, get it */
6505     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6506         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6507                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6508                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6509                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6510                  * UTF8_ALLOW_FFFF */
6511         if (c_len == (STRLEN)-1)
6512             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6513     }
6514     else {
6515         c_len = 1;
6516     }
6517
6518     /* Use passed in max length, or one character if none passed in or less
6519      * than one character.  And assume will match just one character.  This is
6520      * overwritten later if matched more. */
6521     if (lenp) {
6522         maxlen = (*lenp > c_len) ? *lenp : c_len;
6523         *lenp = c_len;
6524
6525     }
6526     else {
6527         maxlen = c_len;
6528     }
6529
6530     /* If this character is potentially in the bitmap, check it */
6531     if (c < 256) {
6532         if (ANYOF_BITMAP_TEST(n, c))
6533             match = TRUE;
6534         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6535                 && ! utf8_target
6536                 && ! isASCII(c))
6537         {
6538             match = TRUE;
6539         }
6540
6541         else if (flags & ANYOF_LOCALE) {
6542             PL_reg_flags |= RF_tainted;
6543
6544             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6545                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6546             {
6547                 match = TRUE;
6548             }
6549             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6550                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6551                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6552                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6553                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6554                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6555                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6556                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6557                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6558                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6559                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6560                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6561                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6562                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6563                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6564                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6565                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6566                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6567                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6568                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6569                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6570                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6571                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6572                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6573                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6574                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6575                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6576                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6577                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6578                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6579                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6580                      ) /* How's that for a conditional? */
6581             ) {
6582                 match = TRUE;
6583             }
6584         }
6585     }
6586
6587     /* If the bitmap didn't (or couldn't) match, and something outside the
6588      * bitmap could match, try that.  Locale nodes specifiy completely the
6589      * behavior of code points in the bit map (otherwise, a utf8 target would
6590      * cause them to be treated as Unicode and not locale), except in
6591      * the very unlikely event when this node is a synthetic start class, which
6592      * could be a combination of locale and non-locale nodes.  So allow locale
6593      * to match for the synthetic start class, which will give a false
6594      * positive that will be resolved when the match is done again as not part
6595      * of the synthetic start class */
6596     if (!match) {
6597         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6598             match = TRUE;       /* Everything above 255 matches */
6599         }
6600         else if (ANYOF_NONBITMAP(n)
6601                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6602                      || (utf8_target
6603                          && (c >=256
6604                              || (! (flags & ANYOF_LOCALE))
6605                              || (flags & ANYOF_IS_SYNTHETIC)))))
6606         {
6607             AV *av;
6608             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6609
6610             if (sw) {
6611                 U8 * utf8_p;
6612                 if (utf8_target) {
6613                     utf8_p = (U8 *) p;
6614                 } else {
6615
6616                     /* Not utf8.  Convert as much of the string as available up
6617                      * to the limit of how far the (single) character in the
6618                      * pattern can possibly match (no need to go further).  If
6619                      * the node is a straight ANYOF or not folding, it can't
6620                      * match more than one.  Otherwise, It can match up to how
6621                      * far a single char can fold to.  Since not utf8, each
6622                      * character is a single byte, so the max it can be in
6623                      * bytes is the same as the max it can be in characters */
6624                     STRLEN len = (OP(n) == ANYOF
6625                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6626                                   ? 1
6627                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6628                                     ? maxlen
6629                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6630                     utf8_p = bytes_to_utf8(p, &len);
6631                 }
6632
6633                 if (swash_fetch(sw, utf8_p, TRUE))
6634                     match = TRUE;
6635                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6636
6637                     /* Here, we need to test if the fold of the target string
6638                      * matches.  In the case of a multi-char fold that is
6639                      * caught by regcomp.c, it has stored all such folds into
6640                      * 'av'; we linearly check to see if any match the target
6641                      * string (folded).   We know that the originals were each
6642                      * one character, but we don't currently know how many
6643                      * characters/bytes each folded to, except we do know that
6644                      * there are small limits imposed by Unicode.  XXX A
6645                      * performance enhancement would be to have regcomp.c store
6646                      * the max number of chars/bytes that are in an av entry,
6647                      * as, say the 0th element.  Even better would be to have a
6648                      * hash of the few characters that can start a multi-char
6649                      * fold to the max number of chars of those folds.
6650                      *
6651                      * Further down, if there isn't a
6652                      * match in the av, we will check if there is another
6653                      * fold-type match.  For that, we also need the fold, but
6654                      * only the first character.  No sense in folding it twice,
6655                      * so we do it here, even if there isn't any multi-char
6656                      * fold, so we always fold at least the first character.
6657                      * If the node is a straight ANYOF node, or there is only
6658                      * one character available in the string, or if there isn't
6659                      * any av, that's all we have to fold.  In the case of a
6660                      * multi-char fold, we do have guarantees in Unicode that
6661                      * it can only expand up to so many characters and so many
6662                      * bytes.  We keep track so don't exceed either.
6663                      *
6664                      * If there is a match, we will need to advance (if lenp is
6665                      * specified) the match pointer in the target string.  But
6666                      * what we are comparing here isn't that string directly,
6667                      * but its fold, whose length may differ from the original.
6668                      * As we go along in constructing the fold, therefore, we
6669                      * create a map so that we know how many bytes in the
6670                      * source to advance given that we have matched a certain
6671                      * number of bytes in the fold.  This map is stored in
6672                      * 'map_fold_len_back'.  The first character in the fold
6673                      * has array element 1 contain the number of bytes in the
6674                      * source that folded to it; the 2nd is the cumulative
6675                      * number to match it; ... */
6676                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND+1] = { 0 };
6677                     U8 folded[UTF8_MAXBYTES_CASE+1];
6678                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6679                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6680
6681                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6682
6683                         /* Here, only need to fold the first char of the target
6684                          * string */
6685                         to_utf8_fold(utf8_p, folded, &foldlen);
6686                         foldlen_for_av = foldlen;
6687                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6688                     }
6689                     else {
6690
6691                         /* Here, need to fold more than the first char.  Do so
6692                          * up to the limits */
6693                         UV which_char = 0;
6694                         U8* source_ptr = utf8_p;    /* The source for the fold
6695                                                        is the regex target
6696                                                        string */
6697                         U8* folded_ptr = folded;
6698                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6699                                                        available byte in the
6700                                                        target string */
6701                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6702                                && source_ptr < e)
6703                         {
6704
6705                             /* Fold the next character */
6706                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6707                             STRLEN this_char_foldlen;
6708                             to_utf8_fold(source_ptr,
6709                                          this_char_folded,
6710                                          &this_char_foldlen);
6711
6712                             /* Bail if it would exceed the byte limit for
6713                              * folding a single char. */
6714                             if (this_char_foldlen + folded_ptr - folded >
6715                                                             UTF8_MAXBYTES_CASE)
6716                             {
6717                                 break;
6718                             }
6719
6720                             /* Save the first character's folded length, in
6721                              * case we have to use it later */
6722                             if (! foldlen) {
6723                                 foldlen = this_char_foldlen;
6724                             }
6725
6726                             /* Here, add the fold of this character */
6727                             Copy(this_char_folded,
6728                                  folded_ptr,
6729                                  this_char_foldlen,
6730                                  U8);
6731                             which_char++;
6732                             map_fold_len_back[which_char] =
6733                                 map_fold_len_back[which_char - 1]
6734                                 + UTF8SKIP(source_ptr);
6735                             folded_ptr += this_char_foldlen;
6736                             source_ptr += UTF8SKIP(source_ptr);
6737                         }
6738                         *folded_ptr = '\0';
6739                         foldlen_for_av = folded_ptr - folded;
6740                     }
6741
6742
6743                     /* Do the linear search to see if the fold is in the list
6744                      * of multi-char folds.  (Useless to look if won't be able
6745                      * to store that it is a multi-char fold in *lenp) */
6746                     if (lenp && av) {
6747                         I32 i;
6748                         for (i = 0; i <= av_len(av); i++) {
6749                             SV* const sv = *av_fetch(av, i, FALSE);
6750                             STRLEN len;
6751                             const char * const s = SvPV_const(sv, len);
6752                             if (len <= foldlen_for_av && memEQ(s,
6753                                                                (char*)folded,
6754                                                                len))
6755                             {
6756
6757                                 /* Advance the target string ptr to account for
6758                                  * this fold, but have to translate from the
6759                                  * folded length to the corresponding source
6760                                  * length.  The array is indexed by how many
6761                                  * characters in the match */
6762                                 *lenp = map_fold_len_back[
6763                                         utf8_length(folded, folded + len)];
6764                                 match = TRUE;
6765                                 break;
6766                             }
6767                         }
6768                     }
6769                 }
6770
6771                 /* If we allocated a string above, free it */
6772                 if (! utf8_target) Safefree(utf8_p);
6773             }
6774         }
6775     }
6776
6777     return (flags & ANYOF_INVERT) ? !match : match;
6778 }
6779
6780 STATIC U8 *
6781 S_reghop3(U8 *s, I32 off, const U8* lim)
6782 {
6783     dVAR;
6784
6785     PERL_ARGS_ASSERT_REGHOP3;
6786
6787     if (off >= 0) {
6788         while (off-- && s < lim) {
6789             /* XXX could check well-formedness here */
6790             s += UTF8SKIP(s);
6791         }
6792     }
6793     else {
6794         while (off++ && s > lim) {
6795             s--;
6796             if (UTF8_IS_CONTINUED(*s)) {
6797                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6798                     s--;
6799             }
6800             /* XXX could check well-formedness here */
6801         }
6802     }
6803     return s;
6804 }
6805
6806 #ifdef XXX_dmq
/* There are a bunch of places where we use two reghop3's that should
   be replaced with this routine.  But since that's not done yet,
   we ifdef it out - dmq
*/
6811 STATIC U8 *
6812 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6813 {
6814     dVAR;
6815
6816     PERL_ARGS_ASSERT_REGHOP4;
6817
6818     if (off >= 0) {
6819         while (off-- && s < rlim) {
6820             /* XXX could check well-formedness here */
6821             s += UTF8SKIP(s);
6822         }
6823     }
6824     else {
6825         while (off++ && s > llim) {
6826             s--;
6827             if (UTF8_IS_CONTINUED(*s)) {
6828                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6829                     s--;
6830             }
6831             /* XXX could check well-formedness here */
6832         }
6833     }
6834     return s;
6835 }
6836 #endif
6837
6838 STATIC U8 *
6839 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6840 {
6841     dVAR;
6842
6843     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6844
6845     if (off >= 0) {
6846         while (off-- && s < lim) {
6847             /* XXX could check well-formedness here */
6848             s += UTF8SKIP(s);
6849         }
6850         if (off >= 0)
6851             return NULL;
6852     }
6853     else {
6854         while (off++ && s > lim) {
6855             s--;
6856             if (UTF8_IS_CONTINUED(*s)) {
6857                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6858                     s--;
6859             }
6860             /* XXX could check well-formedness here */
6861         }
6862         if (off <= 0)
6863             return NULL;
6864     }
6865     return s;
6866 }
6867
6868 static void
6869 restore_pos(pTHX_ void *arg)
6870 {
6871     dVAR;
6872     regexp * const rex = (regexp *)arg;
6873     if (PL_reg_eval_set) {
6874         if (PL_reg_oldsaved) {
6875             rex->subbeg = PL_reg_oldsaved;
6876             rex->sublen = PL_reg_oldsavedlen;
6877 #ifdef PERL_OLD_COPY_ON_WRITE
6878             rex->saved_copy = PL_nrs;
6879 #endif
6880             RXp_MATCH_COPIED_on(rex);
6881         }
6882         PL_reg_magic->mg_len = PL_reg_oldpos;
6883         PL_reg_eval_set = 0;
6884         PL_curpm = PL_reg_oldcurpm;
6885     }
6886 }
6887
6888 STATIC void
6889 S_to_utf8_substr(pTHX_ register regexp *prog)
6890 {
6891     int i = 1;
6892
6893     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6894
6895     do {
6896         if (prog->substrs->data[i].substr
6897             && !prog->substrs->data[i].utf8_substr) {
6898             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6899             prog->substrs->data[i].utf8_substr = sv;
6900             sv_utf8_upgrade(sv);
6901             if (SvVALID(prog->substrs->data[i].substr)) {
6902                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6903                 if (flags & FBMcf_TAIL) {
6904                     /* Trim the trailing \n that fbm_compile added last
6905                        time.  */
6906                     SvCUR_set(sv, SvCUR(sv) - 1);
6907                     /* Whilst this makes the SV technically "invalid" (as its
6908                        buffer is no longer followed by "\0") when fbm_compile()
6909                        adds the "\n" back, a "\0" is restored.  */
6910                 }
6911                 fbm_compile(sv, flags);
6912             }
6913             if (prog->substrs->data[i].substr == prog->check_substr)
6914                 prog->check_utf8 = sv;
6915         }
6916     } while (i--);
6917 }
6918
6919 STATIC void
6920 S_to_byte_substr(pTHX_ register regexp *prog)
6921 {
6922     dVAR;
6923     int i = 1;
6924
6925     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6926
6927     do {
6928         if (prog->substrs->data[i].utf8_substr
6929             && !prog->substrs->data[i].substr) {
6930             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6931             if (sv_utf8_downgrade(sv, TRUE)) {
6932                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6933                     const U8 flags
6934                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6935                     if (flags & FBMcf_TAIL) {
6936                         /* Trim the trailing \n that fbm_compile added last
6937                            time.  */
6938                         SvCUR_set(sv, SvCUR(sv) - 1);
6939                     }
6940                     fbm_compile(sv, flags);
6941                 }
6942             } else {
6943                 SvREFCNT_dec(sv);
6944                 sv = &PL_sv_undef;
6945             }
6946             prog->substrs->data[i].substr = sv;
6947             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6948                 prog->check_substr = sv;
6949         }
6950     } while (i--);
6951 }
6952
6953 /*
6954  * Local variables:
6955  * c-indentation-style: bsd
6956  * c-basic-offset: 4
6957  * indent-tabs-mode: t
6958  * End:
6959  *
6960  * ex: set ts=8 sts=4 sw=4 noet:
6961  */