regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  98  * complications: i.e., if everything matchable is straight forward in the
  99  * bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182
 183 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 184         case NAMEL:                                                         \
 185             PL_reg_flags |= RF_tainted;                                     \
 186             /* FALL THROUGH */                                              \
 187         case NAME:                                                          \
 188             if (!nextchr)                                                   \
 189                 sayNO;                                                      \
 190             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 191                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 192                     bool ok;                                                \
 193                     ENTER;                                                  \
 194                     save_re_context();                                      \
 195                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 196                     assert(ok);                                             \
 197                     LEAVE;                                                  \
 198                 }                                                           \
 199                 if (!(OP(scan) == NAME                                      \
 200                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 201                     : LCFUNC_utf8((U8*)locinput)))                          \
 202                 {                                                           \
 203                     sayNO;                                                  \
 204                 }                                                           \
 205                 locinput += PL_utf8skip[nextchr];                           \
 206                 nextchr = UCHARAT(locinput);                                \
 207                 break;                                                      \
 208             }                                                               \
 209             /* Drops through to the macro that calls this one */
 210
 211 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 212     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 213             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 214                 sayNO;                                                      \
 215             nextchr = UCHARAT(++locinput);                                  \
 216             break
 217
 218 /* Almost identical to the above, but has a case for a node that matches chars
 219  * between 128 and 255 using Unicode (latin1) semantics. */
 220 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 221     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 222             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 223                 sayNO;                                                       \
 224             nextchr = UCHARAT(++locinput);                                   \
 225             break
 226
 227 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 228         case NAMEL:                                                          \
 229             PL_reg_flags |= RF_tainted;                                      \
 230             /* FALL THROUGH */                                               \
 231         case NAME :                                                          \
 232             if (!nextchr && locinput >= PL_regeol)                           \
 233                 sayNO;                                                       \
 234             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 235                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 236                     bool ok;                                                 \
 237                     ENTER;                                                   \
 238                     save_re_context();                                       \
 239                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 240                     assert(ok);                                              \
 241                     LEAVE;                                                   \
 242                 }                                                            \
 243                 if ((OP(scan) == NAME                                        \
 244                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 245                     : LCFUNC_utf8((U8*)locinput)))                           \
 246                 {                                                            \
 247                     sayNO;                                                   \
 248                 }                                                            \
 249                 locinput += PL_utf8skip[nextchr];                            \
 250                 nextchr = UCHARAT(locinput);                                 \
 251                 break;                                                       \
 252             }
 253
 254 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 255     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 256             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 257                 sayNO;                                                       \
 258             nextchr = UCHARAT(++locinput);                                   \
 259             break
 260
 261
 262 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 263     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 264             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 265                 sayNO;                                                       \
 266             nextchr = UCHARAT(++locinput);                                   \
 267             break
 268
 269
 270
 271 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 272
 273 /* for use after a quantifier and before an EXACT-like node -- japhy */
 274 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 275  *
 276  * NOTE that *nothing* that affects backtracking should be in here, specifically
 277  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 278  * node that is in between two EXACT like nodes when ascertaining what the required
 279  * "follow" character is. This should probably be moved to regex compile time
 280  * although it may be done at run time beause of the REF possibility - more
 281  * investigation required. -- demerphq
 282 */
 283 #define JUMPABLE(rn) (      \
 284     OP(rn) == OPEN ||       \
 285     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 286     OP(rn) == EVAL ||   \
 287     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 288     OP(rn) == PLUS || OP(rn) == MINMOD || \
 289     OP(rn) == KEEPS || \
 290     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 291 )
 292 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 293
 294 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 295
 296 #if 0
 297 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 298    we don't need this definition. */
 299 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 300 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 302
 303 #else
 304 /* ... so we use this as its faster. */
 305 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 306 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU )
 307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 309
 310 #endif
 311
 312 /*
 313   Search for mandatory following text node; for lookahead, the text must
 314   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 315 */
 316 #define FIND_NEXT_IMPT(rn) STMT_START { \
 317     while (JUMPABLE(rn)) { \
 318         const OPCODE type = OP(rn); \
 319         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 320             rn = NEXTOPER(NEXTOPER(rn)); \
 321         else if (type == PLUS) \
 322             rn = NEXTOPER(rn); \
 323         else if (type == IFMATCH) \
 324             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 325         else rn += NEXT_OFF(rn); \
 326     } \
 327 } STMT_END
 328
 329
 330 static void restore_pos(pTHX_ void *arg);
 331
 332 #define REGCP_PAREN_ELEMS 4
 333 #define REGCP_OTHER_ELEMS 5
 334 #define REGCP_FRAME_ELEMS 1
 335 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 336  * are needed for the regexp context stack bookkeeping. */
 337
 338 STATIC CHECKPOINT
 339 S_regcppush(pTHX_ I32 parenfloor)
 340 {
 341     dVAR;
 342     const int retval = PL_savestack_ix;
 343     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 344     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 345     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 346     int p;
 347     GET_RE_DEBUG_FLAGS_DECL;
 348
 349     if (paren_elems_to_push < 0)
 350         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 351
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     for (p = PL_regsize; p > parenfloor; p--) {
 360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 361         SSPUSHINT(PL_regoffs[p].end);
 362         SSPUSHINT(PL_regoffs[p].start);
 363         SSPUSHPTR(PL_reg_start_tmp[p]);
 364         SSPUSHINT(p);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 367                       (UV)p, (IV)PL_regoffs[p].start,
 368                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 369                       (IV)PL_regoffs[p].end
 370         ));
 371     }
 372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 373     SSPUSHPTR(PL_regoffs);
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(*PL_reglastparen);
 376     SSPUSHINT(*PL_reglastcloseparen);
 377     SSPUSHPTR(PL_reginput);
 378     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 379
 380     return retval;
 381 }
 382
 383 /* These are needed since we do not localize EVAL nodes: */
 384 #define REGCP_SET(cp)                                           \
 385     DEBUG_STATE_r(                                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 388                 (IV)PL_savestack_ix));                          \
 389     cp = PL_savestack_ix
 390
 391 #define REGCP_UNWIND(cp)                                        \
 392     DEBUG_STATE_r(                                              \
 393         if (cp != PL_savestack_ix)                              \
 394             PerlIO_printf(Perl_debug_log,                       \
 395                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 396                 (IV)(cp), (IV)PL_savestack_ix));                \
 397     regcpblow(cp)
 398
 399 STATIC char *
 400 S_regcppop(pTHX_ const regexp *rex)
 401 {
 402     dVAR;
 403     UV i;
 404     char *input;
 405     GET_RE_DEBUG_FLAGS_DECL;
 406
 407     PERL_ARGS_ASSERT_REGCPPOP;
 408
 409     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 410     i = SSPOPUV;
 411     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 412     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 413     input = (char *) SSPOPPTR;
 414     *PL_reglastcloseparen = SSPOPINT;
 415     *PL_reglastparen = SSPOPINT;
 416     PL_regsize = SSPOPINT;
 417     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 418
 419     i -= REGCP_OTHER_ELEMS;
 420     /* Now restore the parentheses context. */
 421     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 422         I32 tmps;
 423         U32 paren = (U32)SSPOPINT;
 424         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 425         PL_regoffs[paren].start = SSPOPINT;
 426         tmps = SSPOPINT;
 427         if (paren <= *PL_reglastparen)
 428             PL_regoffs[paren].end = tmps;
 429         DEBUG_BUFFERS_r(
 430             PerlIO_printf(Perl_debug_log,
 431                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 432                           (UV)paren, (IV)PL_regoffs[paren].start,
 433                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 434                           (IV)PL_regoffs[paren].end,
 435                           (paren > *PL_reglastparen ? "(no)" : ""));
 436         );
 437     }
 438     DEBUG_BUFFERS_r(
 439         if (*PL_reglastparen + 1 <= rex->nparens) {
 440             PerlIO_printf(Perl_debug_log,
 441                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 442                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 443         }
 444     );
 445 #if 1
 446     /* It would seem that the similar code in regtry()
 447      * already takes care of this, and in fact it is in
 448      * a better location to since this code can #if 0-ed out
 449      * but the code in regtry() is needed or otherwise tests
 450      * requiring null fields (pat.t#187 and split.t#{13,14}
 451      * (as of patchlevel 7877)  will fail.  Then again,
 452      * this code seems to be necessary or otherwise
 453      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 454      * --jhi updated by dapm */
 455     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 456         if (i > PL_regsize)
 457             PL_regoffs[i].start = -1;
 458         PL_regoffs[i].end = -1;
 459     }
 460 #endif
 461     return input;
 462 }
 463
 464 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 465
 466 /*
 467  * pregexec and friends
 468  */
 469
 470 #ifndef PERL_IN_XSUB_RE
 471 /*
 472  - pregexec - match a regexp against a string
 473  */
 474 I32
 475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 476          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 477 /* strend: pointer to null at end of string */
 478 /* strbeg: real beginning of string */
 479 /* minend: end of match must be >=minend after stringarg. */
 480 /* nosave: For optimizations. */
 481 {
 482     PERL_ARGS_ASSERT_PREGEXEC;
 483
 484     return
 485         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 486                       nosave ? 0 : REXEC_COPY_STR);
 487 }
 488 #endif
 489
 490 /*
 491  * Need to implement the following flags for reg_anch:
 492  *
 493  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 494  * USE_INTUIT_ML
 495  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 496  * INTUIT_AUTORITATIVE_ML
 497  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 498  * INTUIT_ONCE_ML
 499  *
 500  * Another flag for this function: SECOND_TIME (so that float substrs
 501  * with giant delta may be not rechecked).
 502  */
 503
 504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 505
 506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 507    Otherwise, only SvCUR(sv) is used to get strbeg. */
 508
 509 /* XXXX We assume that strpos is strbeg unless sv. */
 510
 511 /* XXXX Some places assume that there is a fixed substring.
 512         An update may be needed if optimizer marks as "INTUITable"
 513         RExen without fixed substrings.  Similarly, it is assumed that
 514         lengths of all the strings are no more than minlen, thus they
 515         cannot come from lookahead.
 516         (Or minlen should take into account lookahead.)
 517   NOTE: Some of this comment is not correct. minlen does now take account
 518   of lookahead/behind. Further research is required. -- demerphq
 519
 520 */
 521
 522 /* A failure to find a constant substring means that there is no need to make
 523    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 524    finding a substring too deep into the string means that less calls to
 525    regtry() should be needed.
 526
 527    REx compiler's optimizer found 4 possible hints:
 528         a) Anchored substring;
 529         b) Fixed substring;
 530         c) Whether we are anchored (beginning-of-line or \G);
 531         d) First node (of those at offset 0) which may distinguish positions;
 532    We use a)b)d) and multiline-part of c), and try to find a position in the
 533    string which does not contradict any of them.
 534  */
 535
 536 /* Most of decisions we do here should have been done at compile time.
 537    The nodes of the REx which we used for the search should have been
 538    deleted from the finite automaton. */
 539
 540 char *
 541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 542                      char *strend, const U32 flags, re_scream_pos_data *data)
 543 {
 544     dVAR;
 545     struct regexp *const prog = (struct regexp *)SvANY(rx);
 546     register I32 start_shift = 0;
 547     /* Should be nonnegative! */
 548     register I32 end_shift   = 0;
 549     register char *s;
 550     register SV *check;
 551     char *strbeg;
 552     char *t;
 553     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 554     I32 ml_anch;
 555     register char *other_last = NULL;   /* other substr checked before this */
 556     char *check_at = NULL;              /* check substr found at this pos */
 557     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 558     RXi_GET_DECL(prog,progi);
 559 #ifdef DEBUGGING
 560     const char * const i_strpos = strpos;
 561 #endif
 562     GET_RE_DEBUG_FLAGS_DECL;
 563
 564     PERL_ARGS_ASSERT_RE_INTUIT_START;
 565
 566     RX_MATCH_UTF8_set(rx,utf8_target);
 567
 568     if (RX_UTF8(rx)) {
 569         PL_reg_flags |= RF_utf8;
 570     }
 571     DEBUG_EXECUTE_r(
 572         debug_start_match(rx, utf8_target, strpos, strend,
 573             sv ? "Guessing start of match in sv for"
 574                : "Guessing start of match in string for");
 575               );
 576
 577     /* CHR_DIST() would be more correct here but it makes things slow. */
 578     if (prog->minlen > strend - strpos) {
 579         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 580                               "String too short... [re_intuit_start]\n"));
 581         goto fail;
 582     }
 583
 584     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 585     PL_regeol = strend;
 586     if (utf8_target) {
 587         if (!prog->check_utf8 && prog->check_substr)
 588             to_utf8_substr(prog);
 589         check = prog->check_utf8;
 590     } else {
 591         if (!prog->check_substr && prog->check_utf8)
 592             to_byte_substr(prog);
 593         check = prog->check_substr;
 594     }
 595     if (check == &PL_sv_undef) {
 596         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 597                 "Non-utf8 string cannot match utf8 check string\n"));
 598         goto fail;
 599     }
 600     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 601         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 602                      || ( (prog->extflags & RXf_ANCH_BOL)
 603                           && !multiline ) );    /* Check after \n? */
 604
 605         if (!ml_anch) {
 606           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 607                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 608                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 609                && sv && !SvROK(sv)
 610                && (strpos != strbeg)) {
 611               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 612               goto fail;
 613           }
 614           if (prog->check_offset_min == prog->check_offset_max &&
 615               !(prog->extflags & RXf_CANY_SEEN)) {
 616             /* Substring at constant offset from beg-of-str... */
 617             I32 slen;
 618
 619             s = HOP3c(strpos, prog->check_offset_min, strend);
 620
 621             if (SvTAIL(check)) {
 622                 slen = SvCUR(check);    /* >= 1 */
 623
 624                 if ( strend - s > slen || strend - s < slen - 1
 625                      || (strend - s == slen && strend[-1] != '\n')) {
 626                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 627                     goto fail_finish;
 628                 }
 629                 /* Now should match s[0..slen-2] */
 630                 slen--;
 631                 if (slen && (*SvPVX_const(check) != *s
 632                              || (slen > 1
 633                                  && memNE(SvPVX_const(check), s, slen)))) {
 634                   report_neq:
 635                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 636                     goto fail_finish;
 637                 }
 638             }
 639             else if (*SvPVX_const(check) != *s
 640                      || ((slen = SvCUR(check)) > 1
 641                          && memNE(SvPVX_const(check), s, slen)))
 642                 goto report_neq;
 643             check_at = s;
 644             goto success_at_start;
 645           }
 646         }
 647         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 648         s = strpos;
 649         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 650         end_shift = prog->check_end_shift;
 651
 652         if (!ml_anch) {
 653             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 654                                          - (SvTAIL(check) != 0);
 655             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 656
 657             if (end_shift < eshift)
 658                 end_shift = eshift;
 659         }
 660     }
 661     else {                              /* Can match at random position */
 662         ml_anch = 0;
 663         s = strpos;
 664         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 665         end_shift = prog->check_end_shift;
 666
 667         /* end shift should be non negative here */
 668     }
 669
 670 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 671     if (end_shift < 0)
 672         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 673                    (IV)end_shift, RX_PRECOMP(prog));
 674 #endif
 675
 676   restart:
 677     /* Find a possible match in the region s..strend by looking for
 678        the "check" substring in the region corrected by start/end_shift. */
 679
 680     {
 681         I32 srch_start_shift = start_shift;
 682         I32 srch_end_shift = end_shift;
 683         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 684             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 685             srch_start_shift = strbeg - s;
 686         }
 687     DEBUG_OPTIMISE_MORE_r({
 688         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 689             (IV)prog->check_offset_min,
 690             (IV)srch_start_shift,
 691             (IV)srch_end_shift,
 692             (IV)prog->check_end_shift);
 693     });
 694
 695     if (flags & REXEC_SCREAM) {
 696         I32 p = -1;                     /* Internal iterator of scream. */
 697         I32 * const pp = data ? data->scream_pos : &p;
 698
 699         if (PL_screamfirst[BmRARE(check)] >= 0
 700             || ( BmRARE(check) == '\n'
 701                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 702                  && SvTAIL(check) ))
 703             s = screaminstr(sv, check,
 704                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 705         else
 706             goto fail_finish;
 707         /* we may be pointing at the wrong string */
 708         if (s && RXp_MATCH_COPIED(prog))
 709             s = strbeg + (s - SvPVX_const(sv));
 710         if (data)
 711             *data->scream_olds = s;
 712     }
 713     else {
 714         U8* start_point;
 715         U8* end_point;
 716         if (prog->extflags & RXf_CANY_SEEN) {
 717             start_point= (U8*)(s + srch_start_shift);
 718             end_point= (U8*)(strend - srch_end_shift);
 719         } else {
 720             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 721             end_point= HOP3(strend, -srch_end_shift, strbeg);
 722         }
 723         DEBUG_OPTIMISE_MORE_r({
 724             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 725                 (int)(end_point - start_point),
 726                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 727                 start_point);
 728         });
 729
 730         s = fbm_instr( start_point, end_point,
 731                       check, multiline ? FBMrf_MULTILINE : 0);
 732     }
 733     }
 734     /* Update the count-of-usability, remove useless subpatterns,
 735         unshift s.  */
 736
 737     DEBUG_EXECUTE_r({
 738         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 739             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 740         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 741                           (s ? "Found" : "Did not find"),
 742             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 743                 ? "anchored" : "floating"),
 744             quoted,
 745             RE_SV_TAIL(check),
 746             (s ? " at offset " : "...\n") );
 747     });
 748
 749     if (!s)
 750         goto fail_finish;
 751     /* Finish the diagnostic message */
 752     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 753
 754     /* XXX dmq: first branch is for positive lookbehind...
 755        Our check string is offset from the beginning of the pattern.
 756        So we need to do any stclass tests offset forward from that
 757        point. I think. :-(
 758      */
 759
 760
 761
 762     check_at=s;
 763
 764
 765     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 766        Start with the other substr.
 767        XXXX no SCREAM optimization yet - and a very coarse implementation
 768        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 769                 *always* match.  Probably should be marked during compile...
 770        Probably it is right to do no SCREAM here...
 771      */
 772
 773     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 774                 : (prog->float_substr && prog->anchored_substr))
 775     {
 776         /* Take into account the "other" substring. */
 777         /* XXXX May be hopelessly wrong for UTF... */
 778         if (!other_last)
 779             other_last = strpos;
 780         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 781           do_other_anchored:
 782             {
 783                 char * const last = HOP3c(s, -start_shift, strbeg);
 784                 char *last1, *last2;
 785                 char * const saved_s = s;
 786                 SV* must;
 787
 788                 t = s - prog->check_offset_max;
 789                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 790                     && (!utf8_target
 791                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 792                             && t > strpos)))
 793                     NOOP;
 794                 else
 795                     t = strpos;
 796                 t = HOP3c(t, prog->anchored_offset, strend);
 797                 if (t < other_last)     /* These positions already checked */
 798                     t = other_last;
 799                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 800                 if (last < last1)
 801                     last1 = last;
 802                 /* XXXX It is not documented what units *_offsets are in.
 803                    We assume bytes, but this is clearly wrong.
 804                    Meaning this code needs to be carefully reviewed for errors.
 805                    dmq.
 806                   */
 807
 808                 /* On end-of-str: see comment below. */
 809                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 810                 if (must == &PL_sv_undef) {
 811                     s = (char*)NULL;
 812                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 813                 }
 814                 else
 815                     s = fbm_instr(
 816                         (unsigned char*)t,
 817                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 818                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 819                         must,
 820                         multiline ? FBMrf_MULTILINE : 0
 821                     );
 822                 DEBUG_EXECUTE_r({
 823                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 824                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 825                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 826                         (s ? "Found" : "Contradicts"),
 827                         quoted, RE_SV_TAIL(must));
 828                 });
 829
 830
 831                 if (!s) {
 832                     if (last1 >= last2) {
 833                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 834                                                 ", giving up...\n"));
 835                         goto fail_finish;
 836                     }
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 838                         ", trying floating at offset %ld...\n",
 839                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 840                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 841                     s = HOP3c(last, 1, strend);
 842                     goto restart;
 843                 }
 844                 else {
 845                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 846                           (long)(s - i_strpos)));
 847                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 848                     other_last = HOP3c(s, 1, strend);
 849                     s = saved_s;
 850                     if (t == strpos)
 851                         goto try_at_start;
 852                     goto try_at_offset;
 853                 }
 854             }
 855         }
 856         else {          /* Take into account the floating substring. */
 857             char *last, *last1;
 858             char * const saved_s = s;
 859             SV* must;
 860
 861             t = HOP3c(s, -start_shift, strbeg);
 862             last1 = last =
 863                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 864             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 865                 last = HOP3c(t, prog->float_max_offset, strend);
 866             s = HOP3c(t, prog->float_min_offset, strend);
 867             if (s < other_last)
 868                 s = other_last;
 869  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 870             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 871             /* fbm_instr() takes into account exact value of end-of-str
 872                if the check is SvTAIL(ed).  Since false positives are OK,
 873                and end-of-str is not later than strend we are OK. */
 874             if (must == &PL_sv_undef) {
 875                 s = (char*)NULL;
 876                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 877             }
 878             else
 879                 s = fbm_instr((unsigned char*)s,
 880                               (unsigned char*)last + SvCUR(must)
 881                                   - (SvTAIL(must)!=0),
 882                               must, multiline ? FBMrf_MULTILINE : 0);
 883             DEBUG_EXECUTE_r({
 884                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 885                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 886                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 887                     (s ? "Found" : "Contradicts"),
 888                     quoted, RE_SV_TAIL(must));
 889             });
 890             if (!s) {
 891                 if (last1 == last) {
 892                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 893                                             ", giving up...\n"));
 894                     goto fail_finish;
 895                 }
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 897                     ", trying anchored starting at offset %ld...\n",
 898                     (long)(saved_s + 1 - i_strpos)));
 899                 other_last = last;
 900                 s = HOP3c(t, 1, strend);
 901                 goto restart;
 902             }
 903             else {
 904                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 905                       (long)(s - i_strpos)));
 906                 other_last = s; /* Fix this later. --Hugo */
 907                 s = saved_s;
 908                 if (t == strpos)
 909                     goto try_at_start;
 910                 goto try_at_offset;
 911             }
 912         }
 913     }
 914
 915
 916     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 917
 918     DEBUG_OPTIMISE_MORE_r(
 919         PerlIO_printf(Perl_debug_log,
 920             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 921             (IV)prog->check_offset_min,
 922             (IV)prog->check_offset_max,
 923             (IV)(s-strpos),
 924             (IV)(t-strpos),
 925             (IV)(t-s),
 926             (IV)(strend-strpos)
 927         )
 928     );
 929
 930     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 931         && (!utf8_target
 932             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 933                  && t > strpos)))
 934     {
 935         /* Fixed substring is found far enough so that the match
 936            cannot start at strpos. */
 937       try_at_offset:
 938         if (ml_anch && t[-1] != '\n') {
 939             /* Eventually fbm_*() should handle this, but often
 940                anchored_offset is not 0, so this check will not be wasted. */
 941             /* XXXX In the code below we prefer to look for "^" even in
 942                presence of anchored substrings.  And we search even
 943                beyond the found float position.  These pessimizations
 944                are historical artefacts only.  */
 945           find_anchor:
 946             while (t < strend - prog->minlen) {
 947                 if (*t == '\n') {
 948                     if (t < check_at - prog->check_offset_min) {
 949                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 950                             /* Since we moved from the found position,
 951                                we definitely contradict the found anchored
 952                                substr.  Due to the above check we do not
 953                                contradict "check" substr.
 954                                Thus we can arrive here only if check substr
 955                                is float.  Redo checking for "other"=="fixed".
 956                              */
 957                             strpos = t + 1;
 958                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 959                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 960                             goto do_other_anchored;
 961                         }
 962                         /* We don't contradict the found floating substring. */
 963                         /* XXXX Why not check for STCLASS? */
 964                         s = t + 1;
 965                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 966                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 967                         goto set_useful;
 968                     }
 969                     /* Position contradicts check-string */
 970                     /* XXXX probably better to look for check-string
 971                        than for "\n", so one should lower the limit for t? */
 972                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 973                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 974                     other_last = strpos = s = t + 1;
 975                     goto restart;
 976                 }
 977                 t++;
 978             }
 979             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 980                         PL_colors[0], PL_colors[1]));
 981             goto fail_finish;
 982         }
 983         else {
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 985                         PL_colors[0], PL_colors[1]));
 986         }
 987         s = t;
 988       set_useful:
 989         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 990     }
 991     else {
 992         /* The found string does not prohibit matching at strpos,
 993            - no optimization of calling REx engine can be performed,
 994            unless it was an MBOL and we are not after MBOL,
 995            or a future STCLASS check will fail this. */
 996       try_at_start:
 997         /* Even in this situation we may use MBOL flag if strpos is offset
 998            wrt the start of the string. */
 999         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000             && (strpos != strbeg) && strpos[-1] != '\n'
1001             /* May be due to an implicit anchor of m{.*foo}  */
1002             && !(prog->intflags & PREGf_IMPLICIT))
1003         {
1004             t = strpos;
1005             goto find_anchor;
1006         }
1007         DEBUG_EXECUTE_r( if (ml_anch)
1008             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1010         );
1011       success_at_start:
1012         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1013             && (utf8_target ? (
1014                 prog->check_utf8                /* Could be deleted already */
1015                 && --BmUSEFUL(prog->check_utf8) < 0
1016                 && (prog->check_utf8 == prog->float_utf8)
1017             ) : (
1018                 prog->check_substr              /* Could be deleted already */
1019                 && --BmUSEFUL(prog->check_substr) < 0
1020                 && (prog->check_substr == prog->float_substr)
1021             )))
1022         {
1023             /* If flags & SOMETHING - do not do it many times on the same match */
1024             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025             /* XXX Does the destruction order has to change with utf8_target? */
1026             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1029             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1030             check = NULL;                       /* abort */
1031             s = strpos;
1032             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1033                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034             if (prog->intflags & PREGf_IMPLICIT)
1035                 prog->extflags &= ~RXf_ANCH_MBOL;
1036             /* XXXX This is a remnant of the old implementation.  It
1037                     looks wasteful, since now INTUIT can use many
1038                     other heuristics. */
1039             prog->extflags &= ~RXf_USE_INTUIT;
1040             /* XXXX What other flags might need to be cleared in this branch? */
1041         }
1042         else
1043             s = strpos;
1044     }
1045
1046     /* Last resort... */
1047     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048     /* trie stclasses are too expensive to use here, we are better off to
1049        leave it to regmatch itself */
1050     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051         /* minlen == 0 is possible if regstclass is \b or \B,
1052            and the fixed substr is ''$.
1053            Since minlen is already taken into account, s+1 is before strend;
1054            accidentally, minlen >= 1 guaranties no false positives at s + 1
1055            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1056            regstclass does not come from lookahead...  */
1057         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058            This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass().  */
1059         const U8* const str = (U8*)STRING(progi->regstclass);
1060         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1062                     : 1);
1063         char * endpos;
1064         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066         else if (prog->float_substr || prog->float_utf8)
1067             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1068         else
1069             endpos= strend;
1070
1071         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1073
1074         t = s;
1075         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1076         if (!s) {
1077 #ifdef DEBUGGING
1078             const char *what = NULL;
1079 #endif
1080             if (endpos == strend) {
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                                 "Could not match STCLASS...\n") );
1083                 goto fail;
1084             }
1085             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086                                    "This position contradicts STCLASS...\n") );
1087             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1088                 goto fail;
1089             /* Contradict one of substrings */
1090             if (prog->anchored_substr || prog->anchored_utf8) {
1091                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092                     DEBUG_EXECUTE_r( what = "anchored" );
1093                   hop_and_restart:
1094                     s = HOP3c(t, 1, strend);
1095                     if (s + start_shift + end_shift > strend) {
1096                         /* XXXX Should be taken into account earlier? */
1097                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                                "Could not match STCLASS...\n") );
1099                         goto fail;
1100                     }
1101                     if (!check)
1102                         goto giveup;
1103                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Looking for %s substr starting at offset %ld...\n",
1105                                  what, (long)(s + start_shift - i_strpos)) );
1106                     goto restart;
1107                 }
1108                 /* Have both, check_string is floating */
1109                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110                     goto retry_floating_check;
1111                 /* Recheck anchored substring, but not floating... */
1112                 s = check_at;
1113                 if (!check)
1114                     goto giveup;
1115                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                           "Looking for anchored substr starting at offset %ld...\n",
1117                           (long)(other_last - i_strpos)) );
1118                 goto do_other_anchored;
1119             }
1120             /* Another way we could have checked stclass at the
1121                current position only: */
1122             if (ml_anch) {
1123                 s = t = t + 1;
1124                 if (!check)
1125                     goto giveup;
1126                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                           "Looking for /%s^%s/m starting at offset %ld...\n",
1128                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1129                 goto try_at_offset;
1130             }
1131             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1132                 goto fail;
1133             /* Check is floating substring. */
1134           retry_floating_check:
1135             t = check_at - start_shift;
1136             DEBUG_EXECUTE_r( what = "floating" );
1137             goto hop_and_restart;
1138         }
1139         if (t != s) {
1140             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141                         "By STCLASS: moving %ld --> %ld\n",
1142                                   (long)(t - i_strpos), (long)(s - i_strpos))
1143                    );
1144         }
1145         else {
1146             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147                                   "Does not contradict STCLASS...\n");
1148                    );
1149         }
1150     }
1151   giveup:
1152     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1154                           PL_colors[5], (long)(s - i_strpos)) );
1155     return s;
1156
1157   fail_finish:                          /* Substring not found */
1158     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1159         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1160   fail:
1161     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162                           PL_colors[4], PL_colors[5]));
1163     return NULL;
1164 }
1165
1166 #define DECL_TRIE_TYPE(scan) \
1167     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1168                     trie_type = (scan->flags != EXACT) \
1169                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1170                               : (utf8_target ? trie_utf8 : trie_plain)
1171
1172 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1173 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1174     switch (trie_type) {                                                    \
1175     case trie_utf8_fold:                                                    \
1176         if ( foldlen>0 ) {                                                  \
1177             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1178             foldlen -= len;                                                 \
1179             uscan += len;                                                   \
1180             len=0;                                                          \
1181         } else {                                                            \
1182             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1183             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1184             foldlen -= UNISKIP( uvc );                                      \
1185             uscan = foldbuf + UNISKIP( uvc );                               \
1186         }                                                                   \
1187         break;                                                              \
1188     case trie_latin_utf8_fold:                                              \
1189         if ( foldlen>0 ) {                                                  \
1190             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1191             foldlen -= len;                                                 \
1192             uscan += len;                                                   \
1193             len=0;                                                          \
1194         } else {                                                            \
1195             len = 1;                                                        \
1196             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1197             foldlen -= UNISKIP( uvc );                                      \
1198             uscan = foldbuf + UNISKIP( uvc );                               \
1199         }                                                                   \
1200         break;                                                              \
1201     case trie_utf8:                                                         \
1202         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1203         break;                                                              \
1204     case trie_plain:                                                        \
1205         uvc = (UV)*uc;                                                      \
1206         len = 1;                                                            \
1207     }                                                                       \
1208     if (uvc < 256) {                                                        \
1209         charid = trie->charmap[ uvc ];                                      \
1210     }                                                                       \
1211     else {                                                                  \
1212         charid = 0;                                                         \
1213         if (widecharmap) {                                                  \
1214             SV** const svpp = hv_fetch(widecharmap,                         \
1215                         (char*)&uvc, sizeof(UV), 0);                        \
1216             if (svpp)                                                       \
1217                 charid = (U16)SvIV(*svpp);                                  \
1218         }                                                                   \
1219     }                                                                       \
1220 } STMT_END
1221
1222 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1223 {                                                      \
1224     char *my_strend= (char *)strend;                   \
1225     if ( (CoNd)                                        \
1226          && (ln == len ||                              \
1227              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1228                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1229          && (!reginfo || regtry(reginfo, &s)) )        \
1230         goto got_it;                                   \
1231     else {                                             \
1232          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1233          uvchr_to_utf8(tmpbuf, c);                     \
1234          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1235          if ( f != c                                   \
1236               && (f == c1 || f == c2)                  \
1237               && (ln == len ||                         \
1238                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1239                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1240               && (!reginfo || regtry(reginfo, &s)) )   \
1241               goto got_it;                             \
1242     }                                                  \
1243 }                                                      \
1244 s += len
1245
1246 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1247 STMT_START {                                              \
1248     re_fold_t folder;                                   \
1249     switch (OP(c)) {                                      \
1250         case EXACTFU: folder = foldEQ_latin1; break;      \
1251         case EXACTFL: folder = foldEQ_locale; break;      \
1252         case EXACTF:  folder = foldEQ; break;             \
1253         default:                                          \
1254             Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1255     }                                                     \
1256     while (s <= e) {                                      \
1257         if ( (CoNd)                                       \
1258              && (ln == 1 || folder(s, m, ln))             \
1259              && (!reginfo || regtry(reginfo, &s)) )       \
1260             goto got_it;                                  \
1261         s++;                                              \
1262     }                                                     \
1263 } STMT_END
1264
1265 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1266 STMT_START {                                          \
1267     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1268         CoDe                                          \
1269         s += uskip;                                   \
1270     }                                                 \
1271 } STMT_END
1272
1273 #define REXEC_FBC_SCAN(CoDe)                          \
1274 STMT_START {                                          \
1275     while (s < strend) {                              \
1276         CoDe                                          \
1277         s++;                                          \
1278     }                                                 \
1279 } STMT_END
1280
1281 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1282 REXEC_FBC_UTF8_SCAN(                                  \
1283     if (CoNd) {                                       \
1284         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1285             goto got_it;                              \
1286         else                                          \
1287             tmp = doevery;                            \
1288     }                                                 \
1289     else                                              \
1290         tmp = 1;                                      \
1291 )
1292
1293 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1294 REXEC_FBC_SCAN(                                       \
1295     if (CoNd) {                                       \
1296         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1297             goto got_it;                              \
1298         else                                          \
1299             tmp = doevery;                            \
1300     }                                                 \
1301     else                                              \
1302         tmp = 1;                                      \
1303 )
1304
1305 #define REXEC_FBC_TRYIT               \
1306 if ((!reginfo || regtry(reginfo, &s))) \
1307     goto got_it
1308
1309 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1310     if (utf8_target) {                                             \
1311         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1312     }                                                          \
1313     else {                                                     \
1314         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1315     }                                                          \
1316     break
1317
1318 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1319     if (utf8_target) {                                             \
1320         UtFpReLoAd;                                            \
1321         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1322     }                                                          \
1323     else {                                                     \
1324         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1325     }                                                          \
1326     break
1327
1328 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1329     PL_reg_flags |= RF_tainted;                                \
1330     if (utf8_target) {                                             \
1331         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1332     }                                                          \
1333     else {                                                     \
1334         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1335     }                                                          \
1336     break
1337
1338 #define DUMP_EXEC_POS(li,s,doutf8) \
1339     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1340
1341 /* We know what class REx starts with.  Try to find this position... */
1342 /* if reginfo is NULL, its a dryrun */
1343 /* annoyingly all the vars in this routine have different names from their counterparts
1344    in regmatch. /grrr */
1345
1346 STATIC char *
1347 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1348     const char *strend, regmatch_info *reginfo)
1349 {
1350         dVAR;
1351         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1352         char *m;
1353         STRLEN ln;
1354         STRLEN lnc;
1355         register STRLEN uskip;
1356         unsigned int c1;
1357         unsigned int c2;
1358         char *e;
1359         register I32 tmp = 1;   /* Scratch variable? */
1360         register const bool utf8_target = PL_reg_match_utf8;
1361         RXi_GET_DECL(prog,progi);
1362
1363         PERL_ARGS_ASSERT_FIND_BYCLASS;
1364
1365         /* We know what class it must start with. */
1366         switch (OP(c)) {
1367         case ANYOFV:
1368         case ANYOF:
1369             if (utf8_target || OP(c) == ANYOFV) {
1370                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1371                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1372                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1373                           REGINCLASS(prog, c, (U8*)s));
1374             }
1375             else {
1376                  while (s < strend) {
1377                       STRLEN skip = 1;
1378
1379                       if (REGINCLASS(prog, c, (U8*)s) ||
1380                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1381                            /* The assignment of 2 is intentional:
1382                             * for the folded sharp s, the skip is 2. */
1383                            (skip = SHARP_S_SKIP))) {
1384                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1385                                 goto got_it;
1386                            else
1387                                 tmp = doevery;
1388                       }
1389                       else
1390                            tmp = 1;
1391                       s += skip;
1392                  }
1393             }
1394             break;
1395         case CANY:
1396             REXEC_FBC_SCAN(
1397                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1398                     goto got_it;
1399                 else
1400                     tmp = doevery;
1401             );
1402             break;
1403         case EXACTFU:
1404         case EXACTF:
1405             m   = STRING(c);
1406             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1407             lnc = (I32) ln;     /* length to match in characters */
1408             if (UTF_PATTERN) {
1409                 STRLEN ulen1, ulen2;
1410                 U8 *sm = (U8 *) m;
1411                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1412                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1413                 /* used by commented-out code below */
1414                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1415
1416                 /* XXX: Since the node will be case folded at compile
1417                    time this logic is a little odd, although im not
1418                    sure that its actually wrong. --dmq */
1419
1420                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1421                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1422
1423                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1424                    codepoint of the first character in the converted
1425                    form, yet originally we did the extra step.
1426                    No tests fail by commenting this code out however
1427                    so Ive left it out. -- dmq.
1428
1429                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1430                                     0, uniflags);
1431                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1432                                     0, uniflags);
1433                 */
1434
1435                 lnc = 0;
1436                 while (sm < ((U8 *) m + ln)) {
1437                     lnc++;
1438                     sm += UTF8SKIP(sm);
1439                 }
1440             }
1441             else {
1442                 c1 = *(U8*)m;
1443                 if (utf8_target || OP(c) == EXACTFU) {
1444
1445                     /* Micro sign folds to GREEK SMALL LETTER MU;
1446                        LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1447                        c2 to the first 's' of the pair, and the code below will
1448                        look for others */
1449                     c2 = (c1 == MICRO_SIGN)
1450                         ? GREEK_SMALL_LETTER_MU
1451                         : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1452                            ? 's'
1453                            : PL_fold_latin1[c1];
1454                 } else c2 = PL_fold[c1];
1455             }
1456             goto do_exactf;
1457         case EXACTFL:
1458             m   = STRING(c);
1459             ln  = STR_LEN(c);
1460             lnc = (I32) ln;
1461             c1 = *(U8*)m;
1462             c2 = PL_fold_locale[c1];
1463           do_exactf:
1464             e = HOP3c(strend, -((I32)lnc), s);
1465
1466             if (!reginfo && e < s)
1467                 e = s;                  /* Due to minlen logic of intuit() */
1468
1469             /* The idea in the EXACTF* cases is to first find the
1470              * first character of the EXACTF* node and then, if
1471              * necessary, case-insensitively compare the full
1472              * text of the node.  The c1 and c2 are the first
1473              * characters (though in Unicode it gets a bit
1474              * more complicated because there are more cases
1475              * than just upper and lower: one needs to use
1476              * the so-called folding case for case-insensitive
1477              * matching (called "loose matching" in Unicode).
1478              * foldEQ_utf8() will do just that. */
1479
1480             if (utf8_target || UTF_PATTERN) {
1481                 UV c, f;
1482                 U8 tmpbuf [UTF8_MAXBYTES+1];
1483                 STRLEN len = 1;
1484                 STRLEN foldlen;
1485                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1486                 if (c1 == c2) {
1487                     /* Upper and lower of 1st char are equal -
1488                      * probably not a "letter". */
1489                     while (s <= e) {
1490                         if (utf8_target) {
1491                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1492                                            uniflags);
1493                         } else {
1494                             c = *((U8*)s);
1495                         }
1496                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1497                     }
1498                 }
1499                 else {
1500                     while (s <= e) {
1501                         if (utf8_target) {
1502                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1503                                            uniflags);
1504                         } else {
1505                             c = *((U8*)s);
1506                         }
1507
1508                         /* Handle some of the three Greek sigmas cases.
1509                          * Note that not all the possible combinations
1510                          * are handled here: some of them are handled
1511                          * by the standard folding rules, and some of
1512                          * them (the character class or ANYOF cases)
1513                          * are handled during compiletime in
1514                          * regexec.c:S_regclass(). */
1515                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1516                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1517                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1518
1519                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1520                     }
1521                 }
1522             }
1523             else {
1524                 /* Neither pattern nor string are UTF8 */
1525                 if (c1 == c2)
1526                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1527                 else
1528                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1529             }
1530             break;
1531         case BOUNDL:
1532             PL_reg_flags |= RF_tainted;
1533             /* FALL THROUGH */
1534         case BOUND:
1535             if (utf8_target) {
1536                 if (s == PL_bostr)
1537                     tmp = '\n';
1538                 else {
1539                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1540                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1541                 }
1542                 tmp = ((OP(c) == BOUND ?
1543                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1544                 LOAD_UTF8_CHARCLASS_ALNUM();
1545                 REXEC_FBC_UTF8_SCAN(
1546                     if (tmp == !(OP(c) == BOUND ?
1547                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1548                                  isALNUM_LC_utf8((U8*)s)))
1549                     {
1550                         tmp = !tmp;
1551                         REXEC_FBC_TRYIT;
1552                 }
1553                 );
1554             }
1555             else {  /* Not utf8 */
1556                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1557                 tmp = cBOOL((OP(c) == BOUNDL)
1558                             ? isALNUM_LC(tmp)
1559                             : (isWORDCHAR_L1(tmp)
1560                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1561                 REXEC_FBC_SCAN(
1562                     if (tmp ==
1563                         !((OP(c) == BOUNDL)
1564                           ? isALNUM_LC(*s)
1565                           : (isWORDCHAR_L1((U8) *s)
1566                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1567                     {
1568                         tmp = !tmp;
1569                         REXEC_FBC_TRYIT;
1570                 }
1571                 );
1572             }
1573             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1574                 goto got_it;
1575             break;
1576         case NBOUNDL:
1577             PL_reg_flags |= RF_tainted;
1578             /* FALL THROUGH */
1579         case NBOUND:
1580             if (utf8_target) {
1581                 if (s == PL_bostr)
1582                     tmp = '\n';
1583                 else {
1584                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1585                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1586                 }
1587                 tmp = ((OP(c) == NBOUND ?
1588                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1589                 LOAD_UTF8_CHARCLASS_ALNUM();
1590                 REXEC_FBC_UTF8_SCAN(
1591                     if (tmp == !(OP(c) == NBOUND ?
1592                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1593                                  isALNUM_LC_utf8((U8*)s)))
1594                         tmp = !tmp;
1595                     else REXEC_FBC_TRYIT;
1596                 );
1597             }
1598             else {
1599                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1600                 tmp = cBOOL((OP(c) == NBOUNDL)
1601                             ? isALNUM_LC(tmp)
1602                             : (isWORDCHAR_L1(tmp)
1603                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1604                 REXEC_FBC_SCAN(
1605                     if (tmp == ! cBOOL(
1606                             (OP(c) == NBOUNDL)
1607                             ? isALNUM_LC(*s)
1608                             : (isWORDCHAR_L1((U8) *s)
1609                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1610                     {
1611                         tmp = !tmp;
1612                     }
1613                     else REXEC_FBC_TRYIT;
1614                 );
1615             }
1616             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1617                 goto got_it;
1618             break;
1619         case ALNUM:
1620             REXEC_FBC_CSCAN_PRELOAD(
1621                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1622                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1623                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1624             );
1625         case ALNUML:
1626             REXEC_FBC_CSCAN_TAINT(
1627                 isALNUM_LC_utf8((U8*)s),
1628                 isALNUM_LC(*s)
1629             );
1630         case NALNUM:
1631             REXEC_FBC_CSCAN_PRELOAD(
1632                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1633                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1634                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1635             );
1636         case NALNUML:
1637             REXEC_FBC_CSCAN_TAINT(
1638                 !isALNUM_LC_utf8((U8*)s),
1639                 !isALNUM_LC(*s)
1640             );
1641         case SPACE:
1642             REXEC_FBC_CSCAN_PRELOAD(
1643                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1644                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1645                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1646             );
1647         case SPACEL:
1648             REXEC_FBC_CSCAN_TAINT(
1649                 isSPACE_LC_utf8((U8*)s),
1650                 isSPACE_LC(*s)
1651             );
1652         case NSPACE:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1655                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1656                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1657             );
1658         case NSPACEL:
1659             REXEC_FBC_CSCAN_TAINT(
1660                 !isSPACE_LC_utf8((U8*)s),
1661                 !isSPACE_LC(*s)
1662             );
1663         case DIGIT:
1664             REXEC_FBC_CSCAN_PRELOAD(
1665                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1666                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1667                 isDIGIT(*s)
1668             );
1669         case DIGITL:
1670             REXEC_FBC_CSCAN_TAINT(
1671                 isDIGIT_LC_utf8((U8*)s),
1672                 isDIGIT_LC(*s)
1673             );
1674         case NDIGIT:
1675             REXEC_FBC_CSCAN_PRELOAD(
1676                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1677                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1678                 !isDIGIT(*s)
1679             );
1680         case NDIGITL:
1681             REXEC_FBC_CSCAN_TAINT(
1682                 !isDIGIT_LC_utf8((U8*)s),
1683                 !isDIGIT_LC(*s)
1684             );
1685         case LNBREAK:
1686             REXEC_FBC_CSCAN(
1687                 is_LNBREAK_utf8(s),
1688                 is_LNBREAK_latin1(s)
1689             );
1690         case VERTWS:
1691             REXEC_FBC_CSCAN(
1692                 is_VERTWS_utf8(s),
1693                 is_VERTWS_latin1(s)
1694             );
1695         case NVERTWS:
1696             REXEC_FBC_CSCAN(
1697                 !is_VERTWS_utf8(s),
1698                 !is_VERTWS_latin1(s)
1699             );
1700         case HORIZWS:
1701             REXEC_FBC_CSCAN(
1702                 is_HORIZWS_utf8(s),
1703                 is_HORIZWS_latin1(s)
1704             );
1705         case NHORIZWS:
1706             REXEC_FBC_CSCAN(
1707                 !is_HORIZWS_utf8(s),
1708                 !is_HORIZWS_latin1(s)
1709             );
1710         case AHOCORASICKC:
1711         case AHOCORASICK:
1712             {
1713                 DECL_TRIE_TYPE(c);
1714                 /* what trie are we using right now */
1715                 reg_ac_data *aho
1716                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1717                 reg_trie_data *trie
1718                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1719                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1720
1721                 const char *last_start = strend - trie->minlen;
1722 #ifdef DEBUGGING
1723                 const char *real_start = s;
1724 #endif
1725                 STRLEN maxlen = trie->maxlen;
1726                 SV *sv_points;
1727                 U8 **points; /* map of where we were in the input string
1728                                 when reading a given char. For ASCII this
1729                                 is unnecessary overhead as the relationship
1730                                 is always 1:1, but for Unicode, especially
1731                                 case folded Unicode this is not true. */
1732                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1733                 U8 *bitmap=NULL;
1734
1735
1736                 GET_RE_DEBUG_FLAGS_DECL;
1737
1738                 /* We can't just allocate points here. We need to wrap it in
1739                  * an SV so it gets freed properly if there is a croak while
1740                  * running the match */
1741                 ENTER;
1742                 SAVETMPS;
1743                 sv_points=newSV(maxlen * sizeof(U8 *));
1744                 SvCUR_set(sv_points,
1745                     maxlen * sizeof(U8 *));
1746                 SvPOK_on(sv_points);
1747                 sv_2mortal(sv_points);
1748                 points=(U8**)SvPV_nolen(sv_points );
1749                 if ( trie_type != trie_utf8_fold
1750                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1751                 {
1752                     if (trie->bitmap)
1753                         bitmap=(U8*)trie->bitmap;
1754                     else
1755                         bitmap=(U8*)ANYOF_BITMAP(c);
1756                 }
1757                 /* this is the Aho-Corasick algorithm modified a touch
1758                    to include special handling for long "unknown char"
1759                    sequences. The basic idea being that we use AC as long
1760                    as we are dealing with a possible matching char, when
1761                    we encounter an unknown char (and we have not encountered
1762                    an accepting state) we scan forward until we find a legal
1763                    starting char.
1764                    AC matching is basically that of trie matching, except
1765                    that when we encounter a failing transition, we fall back
1766                    to the current states "fail state", and try the current char
1767                    again, a process we repeat until we reach the root state,
1768                    state 1, or a legal transition. If we fail on the root state
1769                    then we can either terminate if we have reached an accepting
1770                    state previously, or restart the entire process from the beginning
1771                    if we have not.
1772
1773                  */
1774                 while (s <= last_start) {
1775                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1776                     U8 *uc = (U8*)s;
1777                     U16 charid = 0;
1778                     U32 base = 1;
1779                     U32 state = 1;
1780                     UV uvc = 0;
1781                     STRLEN len = 0;
1782                     STRLEN foldlen = 0;
1783                     U8 *uscan = (U8*)NULL;
1784                     U8 *leftmost = NULL;
1785 #ifdef DEBUGGING
1786                     U32 accepted_word= 0;
1787 #endif
1788                     U32 pointpos = 0;
1789
1790                     while ( state && uc <= (U8*)strend ) {
1791                         int failed=0;
1792                         U32 word = aho->states[ state ].wordnum;
1793
1794                         if( state==1 ) {
1795                             if ( bitmap ) {
1796                                 DEBUG_TRIE_EXECUTE_r(
1797                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1798                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1799                                             (char *)uc, utf8_target );
1800                                         PerlIO_printf( Perl_debug_log,
1801                                             " Scanning for legal start char...\n");
1802                                     }
1803                                 );
1804                                 if (utf8_target) {
1805                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1806                                         uc += UTF8SKIP(uc);
1807                                     }
1808                                 } else {
1809                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1810                                         uc++;
1811                                     }
1812                                 }
1813                                 s= (char *)uc;
1814                             }
1815                             if (uc >(U8*)last_start) break;
1816                         }
1817
1818                         if ( word ) {
1819                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1820                             if (!leftmost || lpos < leftmost) {
1821                                 DEBUG_r(accepted_word=word);
1822                                 leftmost= lpos;
1823                             }
1824                             if (base==0) break;
1825
1826                         }
1827                         points[pointpos++ % maxlen]= uc;
1828                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1829                                              uscan, len, uvc, charid, foldlen,
1830                                              foldbuf, uniflags);
1831                         DEBUG_TRIE_EXECUTE_r({
1832                             dump_exec_pos( (char *)uc, c, strend, real_start,
1833                                 s,   utf8_target );
1834                             PerlIO_printf(Perl_debug_log,
1835                                 " Charid:%3u CP:%4"UVxf" ",
1836                                  charid, uvc);
1837                         });
1838
1839                         do {
1840 #ifdef DEBUGGING
1841                             word = aho->states[ state ].wordnum;
1842 #endif
1843                             base = aho->states[ state ].trans.base;
1844
1845                             DEBUG_TRIE_EXECUTE_r({
1846                                 if (failed)
1847                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1848                                         s,   utf8_target );
1849                                 PerlIO_printf( Perl_debug_log,
1850                                     "%sState: %4"UVxf", word=%"UVxf,
1851                                     failed ? " Fail transition to " : "",
1852                                     (UV)state, (UV)word);
1853                             });
1854                             if ( base ) {
1855                                 U32 tmp;
1856                                 I32 offset;
1857                                 if (charid &&
1858                                      ( ((offset = base + charid
1859                                         - 1 - trie->uniquecharcount)) >= 0)
1860                                      && ((U32)offset < trie->lasttrans)
1861                                      && trie->trans[offset].check == state
1862                                      && (tmp=trie->trans[offset].next))
1863                                 {
1864                                     DEBUG_TRIE_EXECUTE_r(
1865                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1866                                     state = tmp;
1867                                     break;
1868                                 }
1869                                 else {
1870                                     DEBUG_TRIE_EXECUTE_r(
1871                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1872                                     failed = 1;
1873                                     state = aho->fail[state];
1874                                 }
1875                             }
1876                             else {
1877                                 /* we must be accepting here */
1878                                 DEBUG_TRIE_EXECUTE_r(
1879                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1880                                 failed = 1;
1881                                 break;
1882                             }
1883                         } while(state);
1884                         uc += len;
1885                         if (failed) {
1886                             if (leftmost)
1887                                 break;
1888                             if (!state) state = 1;
1889                         }
1890                     }
1891                     if ( aho->states[ state ].wordnum ) {
1892                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1893                         if (!leftmost || lpos < leftmost) {
1894                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1895                             leftmost = lpos;
1896                         }
1897                     }
1898                     if (leftmost) {
1899                         s = (char*)leftmost;
1900                         DEBUG_TRIE_EXECUTE_r({
1901                             PerlIO_printf(
1902                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1903                                 (UV)accepted_word, (IV)(s - real_start)
1904                             );
1905                         });
1906                         if (!reginfo || regtry(reginfo, &s)) {
1907                             FREETMPS;
1908                             LEAVE;
1909                             goto got_it;
1910                         }
1911                         s = HOPc(s,1);
1912                         DEBUG_TRIE_EXECUTE_r({
1913                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1914                         });
1915                     } else {
1916                         DEBUG_TRIE_EXECUTE_r(
1917                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1918                         break;
1919                     }
1920                 }
1921                 FREETMPS;
1922                 LEAVE;
1923             }
1924             break;
1925         default:
1926             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1927             break;
1928         }
1929         return 0;
1930       got_it:
1931         return s;
1932 }
1933
1934
1935 /*
1936  - regexec_flags - match a regexp against a string
1937  */
1938 I32
1939 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1940               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1941 /* strend: pointer to null at end of string */
1942 /* strbeg: real beginning of string */
1943 /* minend: end of match must be >=minend after stringarg. */
1944 /* data: May be used for some additional optimizations.
1945          Currently its only used, with a U32 cast, for transmitting
1946          the ganch offset when doing a /g match. This will change */
1947 /* nosave: For optimizations. */
1948 {
1949     dVAR;
1950     struct regexp *const prog = (struct regexp *)SvANY(rx);
1951     /*register*/ char *s;
1952     register regnode *c;
1953     /*register*/ char *startpos = stringarg;
1954     I32 minlen;         /* must match at least this many chars */
1955     I32 dontbother = 0; /* how many characters not to try at end */
1956     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1957     I32 scream_pos = -1;                /* Internal iterator of scream. */
1958     char *scream_olds = NULL;
1959     const bool utf8_target = cBOOL(DO_UTF8(sv));
1960     I32 multiline;
1961     RXi_GET_DECL(prog,progi);
1962     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1963     regexp_paren_pair *swap = NULL;
1964     GET_RE_DEBUG_FLAGS_DECL;
1965
1966     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1967     PERL_UNUSED_ARG(data);
1968
1969     /* Be paranoid... */
1970     if (prog == NULL || startpos == NULL) {
1971         Perl_croak(aTHX_ "NULL regexp parameter");
1972         return 0;
1973     }
1974
1975     multiline = prog->extflags & RXf_PMf_MULTILINE;
1976     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1977
1978     RX_MATCH_UTF8_set(rx, utf8_target);
1979     DEBUG_EXECUTE_r(
1980         debug_start_match(rx, utf8_target, startpos, strend,
1981         "Matching");
1982     );
1983
1984     minlen = prog->minlen;
1985
1986     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1987         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1988                               "String too short [regexec_flags]...\n"));
1989         goto phooey;
1990     }
1991
1992
1993     /* Check validity of program. */
1994     if (UCHARAT(progi->program) != REG_MAGIC) {
1995         Perl_croak(aTHX_ "corrupted regexp program");
1996     }
1997
1998     PL_reg_flags = 0;
1999     PL_reg_eval_set = 0;
2000     PL_reg_maxiter = 0;
2001
2002     if (RX_UTF8(rx))
2003         PL_reg_flags |= RF_utf8;
2004
2005     /* Mark beginning of line for ^ and lookbehind. */
2006     reginfo.bol = startpos; /* XXX not used ??? */
2007     PL_bostr  = strbeg;
2008     reginfo.sv = sv;
2009
2010     /* Mark end of line for $ (and such) */
2011     PL_regeol = strend;
2012
2013     /* see how far we have to get to not match where we matched before */
2014     reginfo.till = startpos+minend;
2015
2016     /* If there is a "must appear" string, look for it. */
2017     s = startpos;
2018
2019     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2020         MAGIC *mg;
2021         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2022             reginfo.ganch = startpos + prog->gofs;
2023             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2024               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2025         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2026                   && SvMAGIC(sv)
2027                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2028                   && mg->mg_len >= 0) {
2029             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2030             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2031                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2032
2033             if (prog->extflags & RXf_ANCH_GPOS) {
2034                 if (s > reginfo.ganch)
2035                     goto phooey;
2036                 s = reginfo.ganch - prog->gofs;
2037                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2038                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2039                 if (s < strbeg)
2040                     goto phooey;
2041             }
2042         }
2043         else if (data) {
2044             reginfo.ganch = strbeg + PTR2UV(data);
2045             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2046                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2047
2048         } else {                                /* pos() not defined */
2049             reginfo.ganch = strbeg;
2050             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2051                  "GPOS: reginfo.ganch = strbeg\n"));
2052         }
2053     }
2054     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2055         /* We have to be careful. If the previous successful match
2056            was from this regex we don't want a subsequent partially
2057            successful match to clobber the old results.
2058            So when we detect this possibility we add a swap buffer
2059            to the re, and switch the buffer each match. If we fail
2060            we switch it back, otherwise we leave it swapped.
2061         */
2062         swap = prog->offs;
2063         /* do we need a save destructor here for eval dies? */
2064         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2065     }
2066     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2067         re_scream_pos_data d;
2068
2069         d.scream_olds = &scream_olds;
2070         d.scream_pos = &scream_pos;
2071         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2072         if (!s) {
2073             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2074             goto phooey;        /* not present */
2075         }
2076     }
2077
2078
2079
2080     /* Simplest case:  anchored match need be tried only once. */
2081     /*  [unless only anchor is BOL and multiline is set] */
2082     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2083         if (s == startpos && regtry(&reginfo, &startpos))
2084             goto got_it;
2085         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2086                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2087         {
2088             char *end;
2089
2090             if (minlen)
2091                 dontbother = minlen - 1;
2092             end = HOP3c(strend, -dontbother, strbeg) - 1;
2093             /* for multiline we only have to try after newlines */
2094             if (prog->check_substr || prog->check_utf8) {
2095                 /* because of the goto we can not easily reuse the macros for bifurcating the
2096                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2097                 if (utf8_target) {
2098                     if (s == startpos)
2099                         goto after_try_utf8;
2100                     while (1) {
2101                         if (regtry(&reginfo, &s)) {
2102                             goto got_it;
2103                         }
2104                       after_try_utf8:
2105                         if (s > end) {
2106                             goto phooey;
2107                         }
2108                         if (prog->extflags & RXf_USE_INTUIT) {
2109                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2110                             if (!s) {
2111                                 goto phooey;
2112                             }
2113                         }
2114                         else {
2115                             s += UTF8SKIP(s);
2116                         }
2117                     }
2118                 } /* end search for check string in unicode */
2119                 else {
2120                     if (s == startpos) {
2121                         goto after_try_latin;
2122                     }
2123                     while (1) {
2124                         if (regtry(&reginfo, &s)) {
2125                             goto got_it;
2126                         }
2127                       after_try_latin:
2128                         if (s > end) {
2129                             goto phooey;
2130                         }
2131                         if (prog->extflags & RXf_USE_INTUIT) {
2132                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2133                             if (!s) {
2134                                 goto phooey;
2135                             }
2136                         }
2137                         else {
2138                             s++;
2139                         }
2140                     }
2141                 } /* end search for check string in latin*/
2142             } /* end search for check string */
2143             else { /* search for newline */
2144                 if (s > startpos) {
2145                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2146                     s--;
2147                 }
2148                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2149                 while (s < end) {
2150                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2151                         if (regtry(&reginfo, &s))
2152                             goto got_it;
2153                     }
2154                 }
2155             } /* end search for newline */
2156         } /* end anchored/multiline check string search */
2157         goto phooey;
2158     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2159     {
2160         /* the warning about reginfo.ganch being used without initialization
2161            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2162            and we only enter this block when the same bit is set. */
2163         char *tmp_s = reginfo.ganch - prog->gofs;
2164
2165         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2166             goto got_it;
2167         goto phooey;
2168     }
2169
2170     /* Messy cases:  unanchored match. */
2171     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2172         /* we have /x+whatever/ */
2173         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2174         char ch;
2175 #ifdef DEBUGGING
2176         int did_match = 0;
2177 #endif
2178         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2179             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2180         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2181
2182         if (utf8_target) {
2183             REXEC_FBC_SCAN(
2184                 if (*s == ch) {
2185                     DEBUG_EXECUTE_r( did_match = 1 );
2186                     if (regtry(&reginfo, &s)) goto got_it;
2187                     s += UTF8SKIP(s);
2188                     while (s < strend && *s == ch)
2189                         s += UTF8SKIP(s);
2190                 }
2191             );
2192         }
2193         else {
2194             REXEC_FBC_SCAN(
2195                 if (*s == ch) {
2196                     DEBUG_EXECUTE_r( did_match = 1 );
2197                     if (regtry(&reginfo, &s)) goto got_it;
2198                     s++;
2199                     while (s < strend && *s == ch)
2200                         s++;
2201                 }
2202             );
2203         }
2204         DEBUG_EXECUTE_r(if (!did_match)
2205                 PerlIO_printf(Perl_debug_log,
2206                                   "Did not find anchored character...\n")
2207                );
2208     }
2209     else if (prog->anchored_substr != NULL
2210               || prog->anchored_utf8 != NULL
2211               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2212                   && prog->float_max_offset < strend - s)) {
2213         SV *must;
2214         I32 back_max;
2215         I32 back_min;
2216         char *last;
2217         char *last1;            /* Last position checked before */
2218 #ifdef DEBUGGING
2219         int did_match = 0;
2220 #endif
2221         if (prog->anchored_substr || prog->anchored_utf8) {
2222             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2223                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2224             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2225             back_max = back_min = prog->anchored_offset;
2226         } else {
2227             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2228                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2229             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2230             back_max = prog->float_max_offset;
2231             back_min = prog->float_min_offset;
2232         }
2233
2234
2235         if (must == &PL_sv_undef)
2236             /* could not downgrade utf8 check substring, so must fail */
2237             goto phooey;
2238
2239         if (back_min<0) {
2240             last = strend;
2241         } else {
2242             last = HOP3c(strend,        /* Cannot start after this */
2243                   -(I32)(CHR_SVLEN(must)
2244                          - (SvTAIL(must) != 0) + back_min), strbeg);
2245         }
2246         if (s > PL_bostr)
2247             last1 = HOPc(s, -1);
2248         else
2249             last1 = s - 1;      /* bogus */
2250
2251         /* XXXX check_substr already used to find "s", can optimize if
2252            check_substr==must. */
2253         scream_pos = -1;
2254         dontbother = end_shift;
2255         strend = HOPc(strend, -dontbother);
2256         while ( (s <= last) &&
2257                 ((flags & REXEC_SCREAM)
2258                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2259                                     end_shift, &scream_pos, 0))
2260                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2261                                   (unsigned char*)strend, must,
2262                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2263             /* we may be pointing at the wrong string */
2264             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2265                 s = strbeg + (s - SvPVX_const(sv));
2266             DEBUG_EXECUTE_r( did_match = 1 );
2267             if (HOPc(s, -back_max) > last1) {
2268                 last1 = HOPc(s, -back_min);
2269                 s = HOPc(s, -back_max);
2270             }
2271             else {
2272                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2273
2274                 last1 = HOPc(s, -back_min);
2275                 s = t;
2276             }
2277             if (utf8_target) {
2278                 while (s <= last1) {
2279                     if (regtry(&reginfo, &s))
2280                         goto got_it;
2281                     s += UTF8SKIP(s);
2282                 }
2283             }
2284             else {
2285                 while (s <= last1) {
2286                     if (regtry(&reginfo, &s))
2287                         goto got_it;
2288                     s++;
2289                 }
2290             }
2291         }
2292         DEBUG_EXECUTE_r(if (!did_match) {
2293             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2294                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2295             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2296                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2297                                ? "anchored" : "floating"),
2298                 quoted, RE_SV_TAIL(must));
2299         });
2300         goto phooey;
2301     }
2302     else if ( (c = progi->regstclass) ) {
2303         if (minlen) {
2304             const OPCODE op = OP(progi->regstclass);
2305             /* don't bother with what can't match */
2306             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2307                 strend = HOPc(strend, -(minlen - 1));
2308         }
2309         DEBUG_EXECUTE_r({
2310             SV * const prop = sv_newmortal();
2311             regprop(prog, prop, c);
2312             {
2313                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2314                     s,strend-s,60);
2315                 PerlIO_printf(Perl_debug_log,
2316                     "Matching stclass %.*s against %s (%d bytes)\n",
2317                     (int)SvCUR(prop), SvPVX_const(prop),
2318                      quoted, (int)(strend - s));
2319             }
2320         });
2321         if (find_byclass(prog, c, s, strend, &reginfo))
2322             goto got_it;
2323         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2324     }
2325     else {
2326         dontbother = 0;
2327         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2328             /* Trim the end. */
2329             char *last;
2330             SV* float_real;
2331
2332             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2333                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2334             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2335
2336             if (flags & REXEC_SCREAM) {
2337                 last = screaminstr(sv, float_real, s - strbeg,
2338                                    end_shift, &scream_pos, 1); /* last one */
2339                 if (!last)
2340                     last = scream_olds; /* Only one occurrence. */
2341                 /* we may be pointing at the wrong string */
2342                 else if (RXp_MATCH_COPIED(prog))
2343                     s = strbeg + (s - SvPVX_const(sv));
2344             }
2345             else {
2346                 STRLEN len;
2347                 const char * const little = SvPV_const(float_real, len);
2348
2349                 if (SvTAIL(float_real)) {
2350                     if (memEQ(strend - len + 1, little, len - 1))
2351                         last = strend - len + 1;
2352                     else if (!multiline)
2353                         last = memEQ(strend - len, little, len)
2354                             ? strend - len : NULL;
2355                     else
2356                         goto find_last;
2357                 } else {
2358                   find_last:
2359                     if (len)
2360                         last = rninstr(s, strend, little, little + len);
2361                     else
2362                         last = strend;  /* matching "$" */
2363                 }
2364             }
2365             if (last == NULL) {
2366                 DEBUG_EXECUTE_r(
2367                     PerlIO_printf(Perl_debug_log,
2368                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2369                         PL_colors[4], PL_colors[5]));
2370                 goto phooey; /* Should not happen! */
2371             }
2372             dontbother = strend - last + prog->float_min_offset;
2373         }
2374         if (minlen && (dontbother < minlen))
2375             dontbother = minlen - 1;
2376         strend -= dontbother;              /* this one's always in bytes! */
2377         /* We don't know much -- general case. */
2378         if (utf8_target) {
2379             for (;;) {
2380                 if (regtry(&reginfo, &s))
2381                     goto got_it;
2382                 if (s >= strend)
2383                     break;
2384                 s += UTF8SKIP(s);
2385             };
2386         }
2387         else {
2388             do {
2389                 if (regtry(&reginfo, &s))
2390                     goto got_it;
2391             } while (s++ < strend);
2392         }
2393     }
2394
2395     /* Failure. */
2396     goto phooey;
2397
2398 got_it:
2399     Safefree(swap);
2400     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2401
2402     if (PL_reg_eval_set)
2403         restore_pos(aTHX_ prog);
2404     if (RXp_PAREN_NAMES(prog))
2405         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2406
2407     /* make sure $`, $&, $', and $digit will work later */
2408     if ( !(flags & REXEC_NOT_FIRST) ) {
2409         RX_MATCH_COPY_FREE(rx);
2410         if (flags & REXEC_COPY_STR) {
2411             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2412 #ifdef PERL_OLD_COPY_ON_WRITE
2413             if ((SvIsCOW(sv)
2414                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2415                 if (DEBUG_C_TEST) {
2416                     PerlIO_printf(Perl_debug_log,
2417                                   "Copy on write: regexp capture, type %d\n",
2418                                   (int) SvTYPE(sv));
2419                 }
2420                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2421                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2422                 assert (SvPOKp(prog->saved_copy));
2423             } else
2424 #endif
2425             {
2426                 RX_MATCH_COPIED_on(rx);
2427                 s = savepvn(strbeg, i);
2428                 prog->subbeg = s;
2429             }
2430             prog->sublen = i;
2431         }
2432         else {
2433             prog->subbeg = strbeg;
2434             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2435         }
2436     }
2437
2438     return 1;
2439
2440 phooey:
2441     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2442                           PL_colors[4], PL_colors[5]));
2443     if (PL_reg_eval_set)
2444         restore_pos(aTHX_ prog);
2445     if (swap) {
2446         /* we failed :-( roll it back */
2447         Safefree(prog->offs);
2448         prog->offs = swap;
2449     }
2450
2451     return 0;
2452 }
2453
2454
2455 /*
2456  - regtry - try match at specific point
2457  */
2458 STATIC I32                      /* 0 failure, 1 success */
2459 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2460 {
2461     dVAR;
2462     CHECKPOINT lastcp;
2463     REGEXP *const rx = reginfo->prog;
2464     regexp *const prog = (struct regexp *)SvANY(rx);
2465     RXi_GET_DECL(prog,progi);
2466     GET_RE_DEBUG_FLAGS_DECL;
2467
2468     PERL_ARGS_ASSERT_REGTRY;
2469
2470     reginfo->cutpoint=NULL;
2471
2472     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2473         MAGIC *mg;
2474
2475         PL_reg_eval_set = RS_init;
2476         DEBUG_EXECUTE_r(DEBUG_s(
2477             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2478                           (IV)(PL_stack_sp - PL_stack_base));
2479             ));
2480         SAVESTACK_CXPOS();
2481         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2482         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2483         SAVETMPS;
2484         /* Apparently this is not needed, judging by wantarray. */
2485         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2486            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2487
2488         if (reginfo->sv) {
2489             /* Make $_ available to executed code. */
2490             if (reginfo->sv != DEFSV) {
2491                 SAVE_DEFSV;
2492                 DEFSV_set(reginfo->sv);
2493             }
2494
2495             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2496                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2497                 /* prepare for quick setting of pos */
2498 #ifdef PERL_OLD_COPY_ON_WRITE
2499                 if (SvIsCOW(reginfo->sv))
2500                     sv_force_normal_flags(reginfo->sv, 0);
2501 #endif
2502                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2503                                  &PL_vtbl_mglob, NULL, 0);
2504                 mg->mg_len = -1;
2505             }
2506             PL_reg_magic    = mg;
2507             PL_reg_oldpos   = mg->mg_len;
2508             SAVEDESTRUCTOR_X(restore_pos, prog);
2509         }
2510         if (!PL_reg_curpm) {
2511             Newxz(PL_reg_curpm, 1, PMOP);
2512 #ifdef USE_ITHREADS
2513             {
2514                 SV* const repointer = &PL_sv_undef;
2515                 /* this regexp is also owned by the new PL_reg_curpm, which
2516                    will try to free it.  */
2517                 av_push(PL_regex_padav, repointer);
2518                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2519                 PL_regex_pad = AvARRAY(PL_regex_padav);
2520             }
2521 #endif
2522         }
2523 #ifdef USE_ITHREADS
2524         /* It seems that non-ithreads works both with and without this code.
2525            So for efficiency reasons it seems best not to have the code
2526            compiled when it is not needed.  */
2527         /* This is safe against NULLs: */
2528         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2529         /* PM_reg_curpm owns a reference to this regexp.  */
2530         ReREFCNT_inc(rx);
2531 #endif
2532         PM_SETRE(PL_reg_curpm, rx);
2533         PL_reg_oldcurpm = PL_curpm;
2534         PL_curpm = PL_reg_curpm;
2535         if (RXp_MATCH_COPIED(prog)) {
2536             /*  Here is a serious problem: we cannot rewrite subbeg,
2537                 since it may be needed if this match fails.  Thus
2538                 $` inside (?{}) could fail... */
2539             PL_reg_oldsaved = prog->subbeg;
2540             PL_reg_oldsavedlen = prog->sublen;
2541 #ifdef PERL_OLD_COPY_ON_WRITE
2542             PL_nrs = prog->saved_copy;
2543 #endif
2544             RXp_MATCH_COPIED_off(prog);
2545         }
2546         else
2547             PL_reg_oldsaved = NULL;
2548         prog->subbeg = PL_bostr;
2549         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2550     }
2551     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2552     prog->offs[0].start = *startpos - PL_bostr;
2553     PL_reginput = *startpos;
2554     PL_reglastparen = &prog->lastparen;
2555     PL_reglastcloseparen = &prog->lastcloseparen;
2556     prog->lastparen = 0;
2557     prog->lastcloseparen = 0;
2558     PL_regsize = 0;
2559     PL_regoffs = prog->offs;
2560     if (PL_reg_start_tmpl <= prog->nparens) {
2561         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2562         if(PL_reg_start_tmp)
2563             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2564         else
2565             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2566     }
2567
2568     /* XXXX What this code is doing here?!!!  There should be no need
2569        to do this again and again, PL_reglastparen should take care of
2570        this!  --ilya*/
2571
2572     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2573      * Actually, the code in regcppop() (which Ilya may be meaning by
2574      * PL_reglastparen), is not needed at all by the test suite
2575      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2576      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2577      * Meanwhile, this code *is* needed for the
2578      * above-mentioned test suite tests to succeed.  The common theme
2579      * on those tests seems to be returning null fields from matches.
2580      * --jhi updated by dapm */
2581 #if 1
2582     if (prog->nparens) {
2583         regexp_paren_pair *pp = PL_regoffs;
2584         register I32 i;
2585         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2586             ++pp;
2587             pp->start = -1;
2588             pp->end = -1;
2589         }
2590     }
2591 #endif
2592     REGCP_SET(lastcp);
2593     if (regmatch(reginfo, progi->program + 1)) {
2594         PL_regoffs[0].end = PL_reginput - PL_bostr;
2595         return 1;
2596     }
2597     if (reginfo->cutpoint)
2598         *startpos= reginfo->cutpoint;
2599     REGCP_UNWIND(lastcp);
2600     return 0;
2601 }
2602
2603
2604 #define sayYES goto yes
2605 #define sayNO goto no
2606 #define sayNO_SILENT goto no_silent
2607
2608 /* we dont use STMT_START/END here because it leads to
2609    "unreachable code" warnings, which are bogus, but distracting. */
2610 #define CACHEsayNO \
2611     if (ST.cache_mask) \
2612        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2613     sayNO
2614
2615 /* this is used to determine how far from the left messages like
2616    'failed...' are printed. It should be set such that messages
2617    are inline with the regop output that created them.
2618 */
2619 #define REPORT_CODE_OFF 32
2620
2621
2622 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2623 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2624
2625 #define SLAB_FIRST(s) (&(s)->states[0])
2626 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2627
2628 /* grab a new slab and return the first slot in it */
2629
2630 STATIC regmatch_state *
2631 S_push_slab(pTHX)
2632 {
2633 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2634     dMY_CXT;
2635 #endif
2636     regmatch_slab *s = PL_regmatch_slab->next;
2637     if (!s) {
2638         Newx(s, 1, regmatch_slab);
2639         s->prev = PL_regmatch_slab;
2640         s->next = NULL;
2641         PL_regmatch_slab->next = s;
2642     }
2643     PL_regmatch_slab = s;
2644     return SLAB_FIRST(s);
2645 }
2646
2647
2648 /* push a new state then goto it */
2649
2650 #define PUSH_STATE_GOTO(state, node) \
2651     scan = node; \
2652     st->resume_state = state; \
2653     goto push_state;
2654
2655 /* push a new state with success backtracking, then goto it */
2656
2657 #define PUSH_YES_STATE_GOTO(state, node) \
2658     scan = node; \
2659     st->resume_state = state; \
2660     goto push_yes_state;
2661
2662
2663
2664 /*
2665
2666 regmatch() - main matching routine
2667
2668 This is basically one big switch statement in a loop. We execute an op,
2669 set 'next' to point the next op, and continue. If we come to a point which
2670 we may need to backtrack to on failure such as (A|B|C), we push a
2671 backtrack state onto the backtrack stack. On failure, we pop the top
2672 state, and re-enter the loop at the state indicated. If there are no more
2673 states to pop, we return failure.
2674
2675 Sometimes we also need to backtrack on success; for example /A+/, where
2676 after successfully matching one A, we need to go back and try to
2677 match another one; similarly for lookahead assertions: if the assertion
2678 completes successfully, we backtrack to the state just before the assertion
2679 and then carry on.  In these cases, the pushed state is marked as
2680 'backtrack on success too'. This marking is in fact done by a chain of
2681 pointers, each pointing to the previous 'yes' state. On success, we pop to
2682 the nearest yes state, discarding any intermediate failure-only states.
2683 Sometimes a yes state is pushed just to force some cleanup code to be
2684 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2685 it to free the inner regex.
2686
2687 Note that failure backtracking rewinds the cursor position, while
2688 success backtracking leaves it alone.
2689
2690 A pattern is complete when the END op is executed, while a subpattern
2691 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2692 ops trigger the "pop to last yes state if any, otherwise return true"
2693 behaviour.
2694
2695 A common convention in this function is to use A and B to refer to the two
2696 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2697 the subpattern to be matched possibly multiple times, while B is the entire
2698 rest of the pattern. Variable and state names reflect this convention.
2699
2700 The states in the main switch are the union of ops and failure/success of
2701 substates associated with with that op.  For example, IFMATCH is the op
2702 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2703 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2704 successfully matched A and IFMATCH_A_fail is a state saying that we have
2705 just failed to match A. Resume states always come in pairs. The backtrack
2706 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2707 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2708 on success or failure.
2709
2710 The struct that holds a backtracking state is actually a big union, with
2711 one variant for each major type of op. The variable st points to the
2712 top-most backtrack struct. To make the code clearer, within each
2713 block of code we #define ST to alias the relevant union.
2714
2715 Here's a concrete example of a (vastly oversimplified) IFMATCH
2716 implementation:
2717
2718     switch (state) {
2719     ....
2720
2721 #define ST st->u.ifmatch
2722
2723     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2724         ST.foo = ...; // some state we wish to save
2725         ...
2726         // push a yes backtrack state with a resume value of
2727         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2728         // first node of A:
2729         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2730         // NOTREACHED
2731
2732     case IFMATCH_A: // we have successfully executed A; now continue with B
2733         next = B;
2734         bar = ST.foo; // do something with the preserved value
2735         break;
2736
2737     case IFMATCH_A_fail: // A failed, so the assertion failed
2738         ...;   // do some housekeeping, then ...
2739         sayNO; // propagate the failure
2740
2741 #undef ST
2742
2743     ...
2744     }
2745
2746 For any old-timers reading this who are familiar with the old recursive
2747 approach, the code above is equivalent to:
2748
2749     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2750     {
2751         int foo = ...
2752         ...
2753         if (regmatch(A)) {
2754             next = B;
2755             bar = foo;
2756             break;
2757         }
2758         ...;   // do some housekeeping, then ...
2759         sayNO; // propagate the failure
2760     }
2761
2762 The topmost backtrack state, pointed to by st, is usually free. If you
2763 want to claim it, populate any ST.foo fields in it with values you wish to
2764 save, then do one of
2765
2766         PUSH_STATE_GOTO(resume_state, node);
2767         PUSH_YES_STATE_GOTO(resume_state, node);
2768
2769 which sets that backtrack state's resume value to 'resume_state', pushes a
2770 new free entry to the top of the backtrack stack, then goes to 'node'.
2771 On backtracking, the free slot is popped, and the saved state becomes the
2772 new free state. An ST.foo field in this new top state can be temporarily
2773 accessed to retrieve values, but once the main loop is re-entered, it
2774 becomes available for reuse.
2775
2776 Note that the depth of the backtrack stack constantly increases during the
2777 left-to-right execution of the pattern, rather than going up and down with
2778 the pattern nesting. For example the stack is at its maximum at Z at the
2779 end of the pattern, rather than at X in the following:
2780
2781     /(((X)+)+)+....(Y)+....Z/
2782
2783 The only exceptions to this are lookahead/behind assertions and the cut,
2784 (?>A), which pop all the backtrack states associated with A before
2785 continuing.
2786
2787 Backtrack state structs are allocated in slabs of about 4K in size.
2788 PL_regmatch_state and st always point to the currently active state,
2789 and PL_regmatch_slab points to the slab currently containing
2790 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2791 allocated, and is never freed until interpreter destruction. When the slab
2792 is full, a new one is allocated and chained to the end. At exit from
2793 regmatch(), slabs allocated since entry are freed.
2794
2795 */
2796
2797
2798 #define DEBUG_STATE_pp(pp)                                  \
2799     DEBUG_STATE_r({                                         \
2800         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2801         PerlIO_printf(Perl_debug_log,                       \
2802             "    %*s"pp" %s%s%s%s%s\n",                     \
2803             depth*2, "",                                    \
2804             PL_reg_name[st->resume_state],                     \
2805             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2806             ((st==yes_state) ? "Y" : ""),                   \
2807             ((st==mark_state) ? "M" : ""),                  \
2808             ((st==yes_state||st==mark_state) ? "]" : "")    \
2809         );                                                  \
2810     });
2811
2812
2813 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2814
2815 #ifdef DEBUGGING
2816
2817 STATIC void
2818 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2819     const char *start, const char *end, const char *blurb)
2820 {
2821     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2822
2823     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2824
2825     if (!PL_colorset)
2826             reginitcolors();
2827     {
2828         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2829             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2830
2831         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2832             start, end - start, 60);
2833
2834         PerlIO_printf(Perl_debug_log,
2835             "%s%s REx%s %s against %s\n",
2836                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2837
2838         if (utf8_target||utf8_pat)
2839             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2840                 utf8_pat ? "pattern" : "",
2841                 utf8_pat && utf8_target ? " and " : "",
2842                 utf8_target ? "string" : ""
2843             );
2844     }
2845 }
2846
2847 STATIC void
2848 S_dump_exec_pos(pTHX_ const char *locinput,
2849                       const regnode *scan,
2850                       const char *loc_regeol,
2851                       const char *loc_bostr,
2852                       const char *loc_reg_starttry,
2853                       const bool utf8_target)
2854 {
2855     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2856     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2857     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2858     /* The part of the string before starttry has one color
2859        (pref0_len chars), between starttry and current
2860        position another one (pref_len - pref0_len chars),
2861        after the current position the third one.
2862        We assume that pref0_len <= pref_len, otherwise we
2863        decrease pref0_len.  */
2864     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2865         ? (5 + taill) - l : locinput - loc_bostr;
2866     int pref0_len;
2867
2868     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2869
2870     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2871         pref_len++;
2872     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2873     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2874         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2875               ? (5 + taill) - pref_len : loc_regeol - locinput);
2876     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2877         l--;
2878     if (pref0_len < 0)
2879         pref0_len = 0;
2880     if (pref0_len > pref_len)
2881         pref0_len = pref_len;
2882     {
2883         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2884
2885         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2886             (locinput - pref_len),pref0_len, 60, 4, 5);
2887
2888         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2889                     (locinput - pref_len + pref0_len),
2890                     pref_len - pref0_len, 60, 2, 3);
2891
2892         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2893                     locinput, loc_regeol - locinput, 10, 0, 1);
2894
2895         const STRLEN tlen=len0+len1+len2;
2896         PerlIO_printf(Perl_debug_log,
2897                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2898                     (IV)(locinput - loc_bostr),
2899                     len0, s0,
2900                     len1, s1,
2901                     (docolor ? "" : "> <"),
2902                     len2, s2,
2903                     (int)(tlen > 19 ? 0 :  19 - tlen),
2904                     "");
2905     }
2906 }
2907
2908 #endif
2909
2910 /* reg_check_named_buff_matched()
2911  * Checks to see if a named buffer has matched. The data array of
2912  * buffer numbers corresponding to the buffer is expected to reside
2913  * in the regexp->data->data array in the slot stored in the ARG() of
2914  * node involved. Note that this routine doesn't actually care about the
2915  * name, that information is not preserved from compilation to execution.
2916  * Returns the index of the leftmost defined buffer with the given name
2917  * or 0 if non of the buffers matched.
2918  */
2919 STATIC I32
2920 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2921 {
2922     I32 n;
2923     RXi_GET_DECL(rex,rexi);
2924     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2925     I32 *nums=(I32*)SvPVX(sv_dat);
2926
2927     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2928
2929     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2930         if ((I32)*PL_reglastparen >= nums[n] &&
2931             PL_regoffs[nums[n]].end != -1)
2932         {
2933             return nums[n];
2934         }
2935     }
2936     return 0;
2937 }
2938
2939
2940 /* free all slabs above current one  - called during LEAVE_SCOPE */
2941
2942 STATIC void
2943 S_clear_backtrack_stack(pTHX_ void *p)
2944 {
2945     regmatch_slab *s = PL_regmatch_slab->next;
2946     PERL_UNUSED_ARG(p);
2947
2948     if (!s)
2949         return;
2950     PL_regmatch_slab->next = NULL;
2951     while (s) {
2952         regmatch_slab * const osl = s;
2953         s = s->next;
2954         Safefree(osl);
2955     }
2956 }
2957
2958
2959 #define SETREX(Re1,Re2) \
2960     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2961     Re1 = (Re2)
2962
2963 STATIC I32                      /* 0 failure, 1 success */
2964 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2965 {
2966 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2967     dMY_CXT;
2968 #endif
2969     dVAR;
2970     register const bool utf8_target = PL_reg_match_utf8;
2971     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2972     REGEXP *rex_sv = reginfo->prog;
2973     regexp *rex = (struct regexp *)SvANY(rex_sv);
2974     RXi_GET_DECL(rex,rexi);
2975     I32 oldsave;
2976     /* the current state. This is a cached copy of PL_regmatch_state */
2977     register regmatch_state *st;
2978     /* cache heavy used fields of st in registers */
2979     register regnode *scan;
2980     register regnode *next;
2981     register U32 n = 0; /* general value; init to avoid compiler warning */
2982     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2983     register char *locinput = PL_reginput;
2984     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2985
2986     bool result = 0;        /* return value of S_regmatch */
2987     int depth = 0;          /* depth of backtrack stack */
2988     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2989     const U32 max_nochange_depth =
2990         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2991         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2992     regmatch_state *yes_state = NULL; /* state to pop to on success of
2993                                                             subpattern */
2994     /* mark_state piggy backs on the yes_state logic so that when we unwind
2995        the stack on success we can update the mark_state as we go */
2996     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2997     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2998     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2999     U32 state_num;
3000     bool no_final = 0;      /* prevent failure from backtracking? */
3001     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3002     char *startpoint = PL_reginput;
3003     SV *popmark = NULL;     /* are we looking for a mark? */
3004     SV *sv_commit = NULL;   /* last mark name seen in failure */
3005     SV *sv_yes_mark = NULL; /* last mark name we have seen
3006                                during a successful match */
3007     U32 lastopen = 0;       /* last open we saw */
3008     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3009     SV* const oreplsv = GvSV(PL_replgv);
3010     /* these three flags are set by various ops to signal information to
3011      * the very next op. They have a useful lifetime of exactly one loop
3012      * iteration, and are not preserved or restored by state pushes/pops
3013      */
3014     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3015     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3016     int logical = 0;        /* the following EVAL is:
3017                                 0: (?{...})
3018                                 1: (?(?{...})X|Y)
3019                                 2: (??{...})
3020                                or the following IFMATCH/UNLESSM is:
3021                                 false: plain (?=foo)
3022                                 true:  used as a condition: (?(?=foo))
3023                             */
3024 #ifdef DEBUGGING
3025     GET_RE_DEBUG_FLAGS_DECL;
3026 #endif
3027
3028     PERL_ARGS_ASSERT_REGMATCH;
3029
3030     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3031             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3032     }));
3033     /* on first ever call to regmatch, allocate first slab */
3034     if (!PL_regmatch_slab) {
3035         Newx(PL_regmatch_slab, 1, regmatch_slab);
3036         PL_regmatch_slab->prev = NULL;
3037         PL_regmatch_slab->next = NULL;
3038         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3039     }
3040
3041     oldsave = PL_savestack_ix;
3042     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3043     SAVEVPTR(PL_regmatch_slab);
3044     SAVEVPTR(PL_regmatch_state);
3045
3046     /* grab next free state slot */
3047     st = ++PL_regmatch_state;
3048     if (st >  SLAB_LAST(PL_regmatch_slab))
3049         st = PL_regmatch_state = S_push_slab(aTHX);
3050
3051     /* Note that nextchr is a byte even in UTF */
3052     nextchr = UCHARAT(locinput);
3053     scan = prog;
3054     while (scan != NULL) {
3055
3056         DEBUG_EXECUTE_r( {
3057             SV * const prop = sv_newmortal();
3058             regnode *rnext=regnext(scan);
3059             DUMP_EXEC_POS( locinput, scan, utf8_target );
3060             regprop(rex, prop, scan);
3061
3062             PerlIO_printf(Perl_debug_log,
3063                     "%3"IVdf":%*s%s(%"IVdf")\n",
3064                     (IV)(scan - rexi->program), depth*2, "",
3065                     SvPVX_const(prop),
3066                     (PL_regkind[OP(scan)] == END || !rnext) ?
3067                         0 : (IV)(rnext - rexi->program));
3068         });
3069
3070         next = scan + NEXT_OFF(scan);
3071         if (next == scan)
3072             next = NULL;
3073         state_num = OP(scan);
3074
3075       reenter_switch:
3076
3077         assert(PL_reglastparen == &rex->lastparen);
3078         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3079         assert(PL_regoffs == rex->offs);
3080
3081         switch (state_num) {
3082         case BOL:
3083             if (locinput == PL_bostr)
3084             {
3085                 /* reginfo->till = reginfo->bol; */
3086                 break;
3087             }
3088             sayNO;
3089         case MBOL:
3090             if (locinput == PL_bostr ||
3091                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3092             {
3093                 break;
3094             }
3095             sayNO;
3096         case SBOL:
3097             if (locinput == PL_bostr)
3098                 break;
3099             sayNO;
3100         case GPOS:
3101             if (locinput == reginfo->ganch)
3102                 break;
3103             sayNO;
3104
3105         case KEEPS:
3106             /* update the startpoint */
3107             st->u.keeper.val = PL_regoffs[0].start;
3108             PL_reginput = locinput;
3109             PL_regoffs[0].start = locinput - PL_bostr;
3110             PUSH_STATE_GOTO(KEEPS_next, next);
3111             /*NOT-REACHED*/
3112         case KEEPS_next_fail:
3113             /* rollback the start point change */
3114             PL_regoffs[0].start = st->u.keeper.val;
3115             sayNO_SILENT;
3116             /*NOT-REACHED*/
3117         case EOL:
3118                 goto seol;
3119         case MEOL:
3120             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3121                 sayNO;
3122             break;
3123         case SEOL:
3124           seol:
3125             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3126                 sayNO;
3127             if (PL_regeol - locinput > 1)
3128                 sayNO;
3129             break;
3130         case EOS:
3131             if (PL_regeol != locinput)
3132                 sayNO;
3133             break;
3134         case SANY:
3135             if (!nextchr && locinput >= PL_regeol)
3136                 sayNO;
3137             if (utf8_target) {
3138                 locinput += PL_utf8skip[nextchr];
3139                 if (locinput > PL_regeol)
3140                     sayNO;
3141                 nextchr = UCHARAT(locinput);
3142             }
3143             else
3144                 nextchr = UCHARAT(++locinput);
3145             break;
3146         case CANY:
3147             if (!nextchr && locinput >= PL_regeol)
3148                 sayNO;
3149             nextchr = UCHARAT(++locinput);
3150             break;
3151         case REG_ANY:
3152             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3153                 sayNO;
3154             if (utf8_target) {
3155                 locinput += PL_utf8skip[nextchr];
3156                 if (locinput > PL_regeol)
3157                     sayNO;
3158                 nextchr = UCHARAT(locinput);
3159             }
3160             else
3161                 nextchr = UCHARAT(++locinput);
3162             break;
3163
3164 #undef  ST
3165 #define ST st->u.trie
3166         case TRIEC:
3167             /* In this case the charclass data is available inline so
3168                we can fail fast without a lot of extra overhead.
3169              */
3170             if (scan->flags == EXACT || !utf8_target) {
3171                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3172                     DEBUG_EXECUTE_r(
3173                         PerlIO_printf(Perl_debug_log,
3174                                   "%*s  %sfailed to match trie start class...%s\n",
3175                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3176                     );
3177                     sayNO_SILENT;
3178                     /* NOTREACHED */
3179                 }
3180             }
3181             /* FALL THROUGH */
3182         case TRIE:
3183             /* the basic plan of execution of the trie is:
3184              * At the beginning, run though all the states, and
3185              * find the longest-matching word. Also remember the position
3186              * of the shortest matching word. For example, this pattern:
3187              *    1  2 3 4    5
3188              *    ab|a|x|abcd|abc
3189              * when matched against the string "abcde", will generate
3190              * accept states for all words except 3, with the longest
3191              * matching word being 4, and the shortest being 1 (with
3192              * the position being after char 1 of the string).
3193              *
3194              * Then for each matching word, in word order (i.e. 1,2,4,5),
3195              * we run the remainder of the pattern; on each try setting
3196              * the current position to the character following the word,
3197              * returning to try the next word on failure.
3198              *
3199              * We avoid having to build a list of words at runtime by
3200              * using a compile-time structure, wordinfo[].prev, which
3201              * gives, for each word, the previous accepting word (if any).
3202              * In the case above it would contain the mappings 1->2, 2->0,
3203              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3204              * the longest word (4 above), a list of all words, by
3205              * following the list of prev pointers; this gives us the
3206              * unordered list 4,5,1,2. Then given the current word we have
3207              * just tried, we can go through the list and find the
3208              * next-biggest word to try (so if we just failed on word 2,
3209              * the next in the list is 4).
3210              *
3211              * Since at runtime we don't record the matching position in
3212              * the string for each word, we have to work that out for
3213              * each word we're about to process. The wordinfo table holds
3214              * the character length of each word; given that we recorded
3215              * at the start: the position of the shortest word and its
3216              * length in chars, we just need to move the pointer the
3217              * difference between the two char lengths. Depending on
3218              * Unicode status and folding, that's cheap or expensive.
3219              *
3220              * This algorithm is optimised for the case where are only a
3221              * small number of accept states, i.e. 0,1, or maybe 2.
3222              * With lots of accepts states, and having to try all of them,
3223              * it becomes quadratic on number of accept states to find all
3224              * the next words.
3225              */
3226
3227             {
3228                 /* what type of TRIE am I? (utf8 makes this contextual) */
3229                 DECL_TRIE_TYPE(scan);
3230
3231                 /* what trie are we using right now */
3232                 reg_trie_data * const trie
3233                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3234                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3235                 U32 state = trie->startstate;
3236
3237                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3238                     !TRIE_BITMAP_TEST(trie,*locinput)
3239                 ) {
3240                     if (trie->states[ state ].wordnum) {
3241                          DEBUG_EXECUTE_r(
3242                             PerlIO_printf(Perl_debug_log,
3243                                           "%*s  %smatched empty string...%s\n",
3244                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3245                         );
3246                         if (!trie->jump)
3247                             break;
3248                     } else {
3249                         DEBUG_EXECUTE_r(
3250                             PerlIO_printf(Perl_debug_log,
3251                                           "%*s  %sfailed to match trie start class...%s\n",
3252                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3253                         );
3254                         sayNO_SILENT;
3255                    }
3256                 }
3257
3258             {
3259                 U8 *uc = ( U8* )locinput;
3260
3261                 STRLEN len = 0;
3262                 STRLEN foldlen = 0;
3263                 U8 *uscan = (U8*)NULL;
3264                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3265                 U32 charcount = 0; /* how many input chars we have matched */
3266                 U32 accepted = 0; /* have we seen any accepting states? */
3267
3268                 ST.B = next;
3269                 ST.jump = trie->jump;
3270                 ST.me = scan;
3271                 ST.firstpos = NULL;
3272                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3273                 ST.nextword = 0;
3274
3275                 /* fully traverse the TRIE; note the position of the
3276                    shortest accept state and the wordnum of the longest
3277                    accept state */
3278
3279                 while ( state && uc <= (U8*)PL_regeol ) {
3280                     U32 base = trie->states[ state ].trans.base;
3281                     UV uvc = 0;
3282                     U16 charid = 0;
3283                     U16 wordnum;
3284                     wordnum = trie->states[ state ].wordnum;
3285
3286                     if (wordnum) { /* it's an accept state */
3287                         if (!accepted) {
3288                             accepted = 1;
3289                             /* record first match position */
3290                             if (ST.longfold) {
3291                                 ST.firstpos = (U8*)locinput;
3292                                 ST.firstchars = 0;
3293                             }
3294                             else {
3295                                 ST.firstpos = uc;
3296                                 ST.firstchars = charcount;
3297                             }
3298                         }
3299                         if (!ST.nextword || wordnum < ST.nextword)
3300                             ST.nextword = wordnum;
3301                         ST.topword = wordnum;
3302                     }
3303
3304                     DEBUG_TRIE_EXECUTE_r({
3305                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3306                                 PerlIO_printf( Perl_debug_log,
3307                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3308                                     2+depth * 2, "", PL_colors[4],
3309                                     (UV)state, (accepted ? 'Y' : 'N'));
3310                     });
3311
3312                     /* read a char and goto next state */
3313                     if ( base ) {
3314                         I32 offset;
3315                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3316                                              uscan, len, uvc, charid, foldlen,
3317                                              foldbuf, uniflags);
3318                         charcount++;
3319                         if (foldlen>0)
3320                             ST.longfold = TRUE;
3321                         if (charid &&
3322                              ( ((offset =
3323                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3324
3325                              && ((U32)offset < trie->lasttrans)
3326                              && trie->trans[offset].check == state)
3327                         {
3328                             state = trie->trans[offset].next;
3329                         }
3330                         else {
3331                             state = 0;
3332                         }
3333                         uc += len;
3334
3335                     }
3336                     else {
3337                         state = 0;
3338                     }
3339                     DEBUG_TRIE_EXECUTE_r(
3340                         PerlIO_printf( Perl_debug_log,
3341                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3342                             charid, uvc, (UV)state, PL_colors[5] );
3343                     );
3344                 }
3345                 if (!accepted)
3346                    sayNO;
3347
3348                 /* calculate total number of accept states */
3349                 {
3350                     U16 w = ST.topword;
3351                     accepted = 0;
3352                     while (w) {
3353                         w = trie->wordinfo[w].prev;
3354                         accepted++;
3355                     }
3356                     ST.accepted = accepted;
3357                 }
3358
3359                 DEBUG_EXECUTE_r(
3360                     PerlIO_printf( Perl_debug_log,
3361                         "%*s  %sgot %"IVdf" possible matches%s\n",
3362                         REPORT_CODE_OFF + depth * 2, "",
3363                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3364                 );
3365                 goto trie_first_try; /* jump into the fail handler */
3366             }}
3367             /* NOTREACHED */
3368
3369         case TRIE_next_fail: /* we failed - try next alternative */
3370             if ( ST.jump) {
3371                 REGCP_UNWIND(ST.cp);
3372                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3373                     PL_regoffs[n].end = -1;
3374                 *PL_reglastparen = n;
3375             }
3376             if (!--ST.accepted) {
3377                 DEBUG_EXECUTE_r({
3378                     PerlIO_printf( Perl_debug_log,
3379                         "%*s  %sTRIE failed...%s\n",
3380                         REPORT_CODE_OFF+depth*2, "",
3381                         PL_colors[4],
3382                         PL_colors[5] );
3383                 });
3384                 sayNO_SILENT;
3385             }
3386             {
3387                 /* Find next-highest word to process.  Note that this code
3388                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3389                 register U16 min = 0;
3390                 register U16 word;
3391                 register U16 const nextword = ST.nextword;
3392                 register reg_trie_wordinfo * const wordinfo
3393                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3394                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3395                     if (word > nextword && (!min || word < min))
3396                         min = word;
3397                 }
3398                 ST.nextword = min;
3399             }
3400
3401           trie_first_try:
3402             if (do_cutgroup) {
3403                 do_cutgroup = 0;
3404                 no_final = 0;
3405             }
3406
3407             if ( ST.jump) {
3408                 ST.lastparen = *PL_reglastparen;
3409                 REGCP_SET(ST.cp);
3410             }
3411
3412             /* find start char of end of current word */
3413             {
3414                 U32 chars; /* how many chars to skip */
3415                 U8 *uc = ST.firstpos;
3416                 reg_trie_data * const trie
3417                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3418
3419                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3420                             >=  ST.firstchars);
3421                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3422                             - ST.firstchars;
3423
3424                 if (ST.longfold) {
3425                     /* the hard option - fold each char in turn and find
3426                      * its folded length (which may be different */
3427                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3428                     STRLEN foldlen;
3429                     STRLEN len;
3430                     UV uvc;
3431                     U8 *uscan;
3432
3433                     while (chars) {
3434                         if (utf8_target) {
3435                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3436                                                     uniflags);
3437                             uc += len;
3438                         }
3439                         else {
3440                             uvc = *uc;
3441                             uc++;
3442                         }
3443                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3444                         uscan = foldbuf;
3445                         while (foldlen) {
3446                             if (!--chars)
3447                                 break;
3448                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3449                                             uniflags);
3450                             uscan += len;
3451                             foldlen -= len;
3452                         }
3453                     }
3454                 }
3455                 else {
3456                     if (utf8_target)
3457                         while (chars--)
3458                             uc += UTF8SKIP(uc);
3459                     else
3460                         uc += chars;
3461                 }
3462                 PL_reginput = (char *)uc;
3463             }
3464
3465             scan = (ST.jump && ST.jump[ST.nextword])
3466                         ? ST.me + ST.jump[ST.nextword]
3467                         : ST.B;
3468
3469             DEBUG_EXECUTE_r({
3470                 PerlIO_printf( Perl_debug_log,
3471                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3472                     REPORT_CODE_OFF+depth*2, "",
3473                     PL_colors[4],
3474                     ST.nextword,
3475                     PL_colors[5]
3476                     );
3477             });
3478
3479             if (ST.accepted > 1 || has_cutgroup) {
3480                 PUSH_STATE_GOTO(TRIE_next, scan);
3481                 /* NOTREACHED */
3482             }
3483             /* only one choice left - just continue */
3484             DEBUG_EXECUTE_r({
3485                 AV *const trie_words
3486                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3487                 SV ** const tmp = av_fetch( trie_words,
3488                     ST.nextword-1, 0 );
3489                 SV *sv= tmp ? sv_newmortal() : NULL;
3490
3491                 PerlIO_printf( Perl_debug_log,
3492                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3493                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3494                     ST.nextword,
3495                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3496                             PL_colors[0], PL_colors[1],
3497                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3498                         )
3499                     : "not compiled under -Dr",
3500                     PL_colors[5] );
3501             });
3502
3503             locinput = PL_reginput;
3504             nextchr = UCHARAT(locinput);
3505             continue; /* execute rest of RE */
3506             /* NOTREACHED */
3507 #undef  ST
3508
3509         case EXACT: {
3510             char *s = STRING(scan);
3511             ln = STR_LEN(scan);
3512             if (utf8_target != UTF_PATTERN) {
3513                 /* The target and the pattern have differing utf8ness. */
3514                 char *l = locinput;
3515                 const char * const e = s + ln;
3516
3517                 if (utf8_target) {
3518                     /* The target is utf8, the pattern is not utf8. */
3519                     while (s < e) {
3520                         STRLEN ulen;
3521                         if (l >= PL_regeol)
3522                              sayNO;
3523                         if (NATIVE_TO_UNI(*(U8*)s) !=
3524                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3525                                             uniflags))
3526                              sayNO;
3527                         l += ulen;
3528                         s ++;
3529                     }
3530                 }
3531                 else {
3532                     /* The target is not utf8, the pattern is utf8. */
3533                     while (s < e) {
3534                         STRLEN ulen;
3535                         if (l >= PL_regeol)
3536                             sayNO;
3537                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3538                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3539                                            uniflags))
3540                             sayNO;
3541                         s += ulen;
3542                         l ++;
3543                     }
3544                 }
3545                 locinput = l;
3546                 nextchr = UCHARAT(locinput);
3547                 break;
3548             }
3549             /* The target and the pattern have the same utf8ness. */
3550             /* Inline the first character, for speed. */
3551             if (UCHARAT(s) != nextchr)
3552                 sayNO;
3553             if (PL_regeol - locinput < ln)
3554                 sayNO;
3555             if (ln > 1 && memNE(s, locinput, ln))
3556                 sayNO;
3557             locinput += ln;
3558             nextchr = UCHARAT(locinput);
3559             break;
3560             }
3561         case EXACTFL: {
3562             re_fold_t folder;
3563             const U8 * fold_array;
3564             const char * s;
3565
3566             PL_reg_flags |= RF_tainted;
3567             folder = foldEQ_locale;
3568             fold_array = PL_fold_locale;
3569             goto do_exactf;
3570
3571         case EXACTFU:
3572             folder = foldEQ_latin1;
3573             fold_array = PL_fold_latin1;
3574             goto do_exactf;
3575
3576         case EXACTF:
3577             folder = foldEQ;
3578             fold_array = PL_fold;
3579
3580           do_exactf:
3581             s = STRING(scan);
3582             ln = STR_LEN(scan);
3583
3584             if (utf8_target || UTF_PATTERN) {
3585               /* Either target or the pattern are utf8. */
3586                 const char * const l = locinput;
3587                 char *e = PL_regeol;
3588
3589                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3590                                l, &e, 0,  utf8_target)) {
3591                      /* One more case for the sharp s:
3592                       * pack("U0U*", 0xDF) =~ /ss/i,
3593                       * the 0xC3 0x9F are the UTF-8
3594                       * byte sequence for the U+00DF. */
3595
3596                      if (!(utf8_target &&
3597                            toLOWER(s[0]) == 's' &&
3598                            ln >= 2 &&
3599                            toLOWER(s[1]) == 's' &&
3600                            (U8)l[0] == 0xC3 &&
3601                            e - l >= 2 &&
3602                            (U8)l[1] == 0x9F))
3603                           sayNO;
3604                 }
3605                 locinput = e;
3606                 nextchr = UCHARAT(locinput);
3607                 break;
3608             }
3609
3610             /* Neither the target and the pattern are utf8. */
3611
3612             /* Inline the first character, for speed. */
3613             if (UCHARAT(s) != nextchr &&
3614                 UCHARAT(s) != fold_array[nextchr])
3615             {
3616                 sayNO;
3617             }
3618             if (PL_regeol - locinput < ln)
3619                 sayNO;
3620             if (ln > 1 && ! folder(s, locinput, ln))
3621                 sayNO;
3622             locinput += ln;
3623             nextchr = UCHARAT(locinput);
3624             break;
3625         }
3626         case BOUNDL:
3627         case NBOUNDL:
3628             PL_reg_flags |= RF_tainted;
3629             /* FALL THROUGH */
3630         case BOUND:
3631         case NBOUND:
3632             /* was last char in word? */
3633             if (utf8_target) {
3634                 if (locinput == PL_bostr)
3635                     ln = '\n';
3636                 else {
3637                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3638
3639                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3640                 }
3641                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3642                     ln = isALNUM_uni(ln);
3643                     LOAD_UTF8_CHARCLASS_ALNUM();
3644                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3645                 }
3646                 else {
3647                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3648                     n = isALNUM_LC_utf8((U8*)locinput);
3649                 }
3650             }
3651             else {
3652                 ln = (locinput != PL_bostr) ?
3653                     UCHARAT(locinput - 1) : '\n';
3654                 if (FLAGS(scan) & USE_UNI) {
3655
3656                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3657                      * the flags to USE_UNI */
3658                     ln = isWORDCHAR_L1(ln);
3659                     n = isWORDCHAR_L1(nextchr);
3660                 }
3661                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3662                     ln = isALNUM(ln);
3663                     n = isALNUM(nextchr);
3664                 }
3665                 else {
3666                     ln = isALNUM_LC(ln);
3667                     n = isALNUM_LC(nextchr);
3668                 }
3669             }
3670             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3671                                     OP(scan) == BOUNDL))
3672                     sayNO;
3673             break;
3674         case ANYOFV:
3675         case ANYOF:
3676             if (utf8_target || state_num == ANYOFV) {
3677                 STRLEN inclasslen = PL_regeol - locinput;
3678                 if (locinput >= PL_regeol)
3679                     sayNO;
3680
3681                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3682                     goto anyof_fail;
3683                 locinput += inclasslen;
3684                 nextchr = UCHARAT(locinput);
3685                 break;
3686             }
3687             else {
3688                 if (nextchr < 0)
3689                     nextchr = UCHARAT(locinput);
3690                 if (!nextchr && locinput >= PL_regeol)
3691                     sayNO;
3692                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3693                     goto anyof_fail;
3694                 nextchr = UCHARAT(++locinput);
3695                 break;
3696             }
3697         anyof_fail:
3698             sayNO;
3699             break;
3700         /* Special char classes - The defines start on line 129 or so */
3701         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3702         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3703
3704         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3705         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3706
3707         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3708         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3709
3710         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3711                        a Unicode extended Grapheme Cluster */
3712             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3713               extended Grapheme Cluster is:
3714
3715                CR LF
3716                | Prepend* Begin Extend*
3717                | .
3718
3719                Begin is (Hangul-syllable | ! Control)
3720                Extend is (Grapheme_Extend | Spacing_Mark)
3721                Control is [ GCB_Control CR LF ]
3722
3723                The discussion below shows how the code for CLUMP is derived
3724                from this regex.  Note that most of these concepts are from
3725                property values of the Grapheme Cluster Boundary (GCB) property.
3726                No code point can have multiple property values for a given
3727                property.  Thus a code point in Prepend can't be in Control, but
3728                it must be in !Control.  This is why Control above includes
3729                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3730                property separately, and so can't be in GCB_Control, even though
3731                they logically are controls.  Control is not the same as gc=cc,
3732                but includes format and other characters as well.
3733
3734                The Unicode definition of Hangul-syllable is:
3735                    L+
3736                    | (L* ( ( V | LV ) V* | LVT ) T*)
3737                    | T+
3738                   )
3739                Each of these is a value for the GCB property, and hence must be
3740                disjoint, so the order they are tested is immaterial, so the
3741                above can safely be changed to
3742                    T+
3743                    | L+
3744                    | (L* ( LVT | ( V | LV ) V*) T*)
3745
3746                The last two terms can be combined like this:
3747                    L* ( L
3748                         | (( LVT | ( V | LV ) V*) T*))
3749
3750                And refactored into this:
3751                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3752
3753                That means that if we have seen any L's at all we can quit
3754                there, but if the next character is a LVT, a V or and LV we
3755                should keep going.
3756
3757                There is a subtlety with Prepend* which showed up in testing.
3758                Note that the Begin, and only the Begin is required in:
3759                 | Prepend* Begin Extend*
3760                Also, Begin contains '! Control'.  A Prepend must be a '!
3761                Control', which means it must be a Begin.  What it comes down to
3762                is that if we match Prepend* and then find no suitable Begin
3763                afterwards, that if we backtrack the last Prepend, that one will
3764                be a suitable Begin.
3765             */
3766
3767             if (locinput >= PL_regeol)
3768                 sayNO;
3769             if  (! utf8_target) {
3770
3771                 /* Match either CR LF  or '.', as all the other possibilities
3772                  * require utf8 */
3773                 locinput++;         /* Match the . or CR */
3774                 if (nextchr == '\r'
3775                     && locinput < PL_regeol
3776                     && UCHARAT(locinput) == '\n') locinput++;
3777             }
3778             else {
3779
3780                 /* Utf8: See if is ( CR LF ); already know that locinput <
3781                  * PL_regeol, so locinput+1 is in bounds */
3782                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3783                     locinput += 2;
3784                 }
3785                 else {
3786                     /* In case have to backtrack to beginning, then match '.' */
3787                     char *starting = locinput;
3788
3789                     /* In case have to backtrack the last prepend */
3790                     char *previous_prepend = 0;
3791
3792                     LOAD_UTF8_CHARCLASS_GCB();
3793
3794                     /* Match (prepend)* */
3795                     while (locinput < PL_regeol
3796                            && swash_fetch(PL_utf8_X_prepend,
3797                                           (U8*)locinput, utf8_target))
3798                     {
3799                         previous_prepend = locinput;
3800                         locinput += UTF8SKIP(locinput);
3801                     }
3802
3803                     /* As noted above, if we matched a prepend character, but
3804                      * the next thing won't match, back off the last prepend we
3805                      * matched, as it is guaranteed to match the begin */
3806                     if (previous_prepend
3807                         && (locinput >=  PL_regeol
3808                             || ! swash_fetch(PL_utf8_X_begin,
3809                                              (U8*)locinput, utf8_target)))
3810                     {
3811                         locinput = previous_prepend;
3812                     }
3813
3814                     /* Note that here we know PL_regeol > locinput, as we
3815                      * tested that upon input to this switch case, and if we
3816                      * moved locinput forward, we tested the result just above
3817                      * and it either passed, or we backed off so that it will
3818                      * now pass */
3819                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3820
3821                         /* Here did not match the required 'Begin' in the
3822                          * second term.  So just match the very first
3823                          * character, the '.' of the final term of the regex */
3824                         locinput = starting + UTF8SKIP(starting);
3825                     } else {
3826
3827                         /* Here is the beginning of a character that can have
3828                          * an extender.  It is either a hangul syllable, or a
3829                          * non-control */
3830                         if (swash_fetch(PL_utf8_X_non_hangul,
3831                                         (U8*)locinput, utf8_target))
3832                         {
3833
3834                             /* Here not a Hangul syllable, must be a
3835                              * ('!  * Control') */
3836                             locinput += UTF8SKIP(locinput);
3837                         } else {
3838
3839                             /* Here is a Hangul syllable.  It can be composed
3840                              * of several individual characters.  One
3841                              * possibility is T+ */
3842                             if (swash_fetch(PL_utf8_X_T,
3843                                             (U8*)locinput, utf8_target))
3844                             {
3845                                 while (locinput < PL_regeol
3846                                         && swash_fetch(PL_utf8_X_T,
3847                                                         (U8*)locinput, utf8_target))
3848                                 {
3849                                     locinput += UTF8SKIP(locinput);
3850                                 }
3851                             } else {
3852
3853                                 /* Here, not T+, but is a Hangul.  That means
3854                                  * it is one of the others: L, LV, LVT or V,
3855                                  * and matches:
3856                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3857
3858                                 /* Match L*           */
3859                                 while (locinput < PL_regeol
3860                                         && swash_fetch(PL_utf8_X_L,
3861                                                         (U8*)locinput, utf8_target))
3862                                 {
3863                                     locinput += UTF8SKIP(locinput);
3864                                 }
3865
3866                                 /* Here, have exhausted L*.  If the next
3867                                  * character is not an LV, LVT nor V, it means
3868                                  * we had to have at least one L, so matches L+
3869                                  * in the original equation, we have a complete
3870                                  * hangul syllable.  Are done. */
3871
3872                                 if (locinput < PL_regeol
3873                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3874                                                     (U8*)locinput, utf8_target))
3875                                 {
3876
3877                                     /* Otherwise keep going.  Must be LV, LVT
3878                                      * or V.  See if LVT */
3879                                     if (swash_fetch(PL_utf8_X_LVT,
3880                                                     (U8*)locinput, utf8_target))
3881                                     {
3882                                         locinput += UTF8SKIP(locinput);
3883                                     } else {
3884
3885                                         /* Must be  V or LV.  Take it, then
3886                                          * match V*     */
3887                                         locinput += UTF8SKIP(locinput);
3888                                         while (locinput < PL_regeol
3889                                                 && swash_fetch(PL_utf8_X_V,
3890                                                          (U8*)locinput, utf8_target))
3891                                         {
3892                                             locinput += UTF8SKIP(locinput);
3893                                         }
3894                                     }
3895
3896                                     /* And any of LV, LVT, or V can be followed
3897                                      * by T*            */
3898                                     while (locinput < PL_regeol
3899                                            && swash_fetch(PL_utf8_X_T,
3900                                                            (U8*)locinput,
3901                                                            utf8_target))
3902                                     {
3903                                         locinput += UTF8SKIP(locinput);
3904                                     }
3905                                 }
3906                             }
3907                         }
3908
3909                         /* Match any extender */
3910                         while (locinput < PL_regeol
3911                                 && swash_fetch(PL_utf8_X_extend,
3912                                                 (U8*)locinput, utf8_target))
3913                         {
3914                             locinput += UTF8SKIP(locinput);
3915                         }
3916                     }
3917                 }
3918                 if (locinput > PL_regeol) sayNO;
3919             }
3920             nextchr = UCHARAT(locinput);
3921             break;
3922
3923         case NREFFL:
3924         {   /* The capture buffer cases.  The ones beginning with N for the
3925                named buffers just convert to the equivalent numbered and
3926                pretend they were called as the corresponding numbered buffer
3927                op.  */
3928             /* don't initialize these, it makes C++ unhappy */
3929             char *s;
3930             char type;
3931             re_fold_t folder;
3932             const U8 *fold_array;
3933
3934             PL_reg_flags |= RF_tainted;
3935             folder = foldEQ_locale;
3936             fold_array = PL_fold_locale;
3937             type = REFFL;
3938             goto do_nref;
3939
3940         case NREFFU:
3941             folder = foldEQ_latin1;
3942             fold_array = PL_fold_latin1;
3943             type = REFFU;
3944             goto do_nref;
3945
3946         case NREFF:
3947             folder = foldEQ;
3948             fold_array = PL_fold;
3949             type = REFF;
3950             goto do_nref;
3951
3952         case NREF:
3953             type = REF;
3954             folder = NULL;
3955             fold_array = NULL;
3956           do_nref:
3957
3958             /* For the named back references, find the corresponding buffer
3959              * number */
3960             n = reg_check_named_buff_matched(rex,scan);
3961
3962             if ( ! n ) {
3963                 sayNO;
3964             }
3965             goto do_nref_ref_common;
3966
3967         case REFFL:
3968             PL_reg_flags |= RF_tainted;
3969             folder = foldEQ_locale;
3970             fold_array = PL_fold_locale;
3971             goto do_ref;
3972
3973         case REFFU:
3974             folder = foldEQ_latin1;
3975             fold_array = PL_fold_latin1;
3976             goto do_ref;
3977
3978         case REFF:
3979             folder = foldEQ;
3980             fold_array = PL_fold;
3981             goto do_ref;
3982
3983         case REF:
3984             folder = NULL;
3985             fold_array = NULL;
3986
3987           do_ref:
3988             type = OP(scan);
3989             n = ARG(scan);  /* which paren pair */
3990
3991           do_nref_ref_common:
3992             ln = PL_regoffs[n].start;
3993             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3994             if (*PL_reglastparen < n || ln == -1)
3995                 sayNO;                  /* Do not match unless seen CLOSEn. */
3996             if (ln == PL_regoffs[n].end)
3997                 break;
3998
3999             s = PL_bostr + ln;
4000             if (type != REF     /* REF can do byte comparison */
4001                 && (utf8_target
4002                     || (type == REFFU
4003                         && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
4004                             || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
4005             { /* XXX handle REFFL better */
4006                 char * limit = PL_regeol;
4007
4008                 /* This call case insensitively compares the entire buffer
4009                     * at s, with the current input starting at locinput, but
4010                     * not going off the end given by PL_regeol, and returns in
4011                     * limit upon success, how much of the current input was
4012                     * matched */
4013                 if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4014                                     locinput, &limit, 0, utf8_target))
4015                 {
4016                     sayNO;
4017                 }
4018                 locinput = limit;
4019                 nextchr = UCHARAT(locinput);
4020                 break;
4021             }
4022
4023             /* Not utf8:  Inline the first character, for speed. */
4024             if (UCHARAT(s) != nextchr &&
4025                 (type == REF ||
4026                  UCHARAT(s) != fold_array[nextchr]))
4027                 sayNO;
4028             ln = PL_regoffs[n].end - ln;
4029             if (locinput + ln > PL_regeol)
4030                 sayNO;
4031             if (ln > 1 && (type == REF
4032                            ? memNE(s, locinput, ln)
4033                            : ! folder(s, locinput, ln)))
4034                 sayNO;
4035             locinput += ln;
4036             nextchr = UCHARAT(locinput);
4037             break;
4038         }
4039         case NOTHING:
4040         case TAIL:
4041             break;
4042         case BACK:
4043             break;
4044
4045 #undef  ST
4046 #define ST st->u.eval
4047         {
4048             SV *ret;
4049             REGEXP *re_sv;
4050             regexp *re;
4051             regexp_internal *rei;
4052             regnode *startpoint;
4053
4054         case GOSTART:
4055         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4056             if (cur_eval && cur_eval->locinput==locinput) {
4057                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4058                     Perl_croak(aTHX_ "Infinite recursion in regex");
4059                 if ( ++nochange_depth > max_nochange_depth )
4060                     Perl_croak(aTHX_
4061                         "Pattern subroutine nesting without pos change"
4062                         " exceeded limit in regex");
4063             } else {
4064                 nochange_depth = 0;
4065             }
4066             re_sv = rex_sv;
4067             re = rex;
4068             rei = rexi;
4069             (void)ReREFCNT_inc(rex_sv);
4070             if (OP(scan)==GOSUB) {
4071                 startpoint = scan + ARG2L(scan);
4072                 ST.close_paren = ARG(scan);
4073             } else {
4074                 startpoint = rei->program+1;
4075                 ST.close_paren = 0;
4076             }
4077             goto eval_recurse_doit;
4078             /* NOTREACHED */
4079         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4080             if (cur_eval && cur_eval->locinput==locinput) {
4081                 if ( ++nochange_depth > max_nochange_depth )
4082                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4083             } else {
4084                 nochange_depth = 0;
4085             }
4086             {
4087                 /* execute the code in the {...} */
4088                 dSP;
4089                 SV ** const before = SP;
4090                 OP_4tree * const oop = PL_op;
4091                 COP * const ocurcop = PL_curcop;
4092                 PAD *old_comppad;
4093                 char *saved_regeol = PL_regeol;
4094                 struct re_save_state saved_state;
4095
4096                 /* To not corrupt the existing regex state while executing the
4097                  * eval we would normally put it on the save stack, like with
4098                  * save_re_context. However, re-evals have a weird scoping so we
4099                  * can't just add ENTER/LEAVE here. With that, things like
4100                  *
4101                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4102                  *
4103                  * would break, as they expect the localisation to be unwound
4104                  * only when the re-engine backtracks through the bit that
4105                  * localised it.
4106                  *
4107                  * What we do instead is just saving the state in a local c
4108                  * variable.
4109                  */
4110                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4111
4112                 n = ARG(scan);
4113                 PL_op = (OP_4tree*)rexi->data->data[n];
4114                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4115                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4116                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4117                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4118
4119                 if (sv_yes_mark) {
4120                     SV *sv_mrk = get_sv("REGMARK", 1);
4121                     sv_setsv(sv_mrk, sv_yes_mark);
4122                 }
4123
4124                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4125                 SPAGAIN;
4126                 if (SP == before)
4127                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4128                 else {
4129                     ret = POPs;
4130                     PUTBACK;
4131                 }
4132
4133                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4134
4135                 PL_op = oop;
4136                 PAD_RESTORE_LOCAL(old_comppad);
4137                 PL_curcop = ocurcop;
4138                 PL_regeol = saved_regeol;
4139                 if (!logical) {
4140                     /* /(?{...})/ */
4141                     sv_setsv(save_scalar(PL_replgv), ret);
4142                     break;
4143                 }
4144             }
4145             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4146                 logical = 0;
4147                 {
4148                     /* extract RE object from returned value; compiling if
4149                      * necessary */
4150                     MAGIC *mg = NULL;
4151                     REGEXP *rx = NULL;
4152
4153                     if (SvROK(ret)) {
4154                         SV *const sv = SvRV(ret);
4155
4156                         if (SvTYPE(sv) == SVt_REGEXP) {
4157                             rx = (REGEXP*) sv;
4158                         } else if (SvSMAGICAL(sv)) {
4159                             mg = mg_find(sv, PERL_MAGIC_qr);
4160                             assert(mg);
4161                         }
4162                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4163                         rx = (REGEXP*) ret;
4164                     } else if (SvSMAGICAL(ret)) {
4165                         if (SvGMAGICAL(ret)) {
4166                             /* I don't believe that there is ever qr magic
4167                                here.  */
4168                             assert(!mg_find(ret, PERL_MAGIC_qr));
4169                             sv_unmagic(ret, PERL_MAGIC_qr);
4170                         }
4171                         else {
4172                             mg = mg_find(ret, PERL_MAGIC_qr);
4173                             /* testing suggests mg only ends up non-NULL for
4174                                scalars who were upgraded and compiled in the
4175                                else block below. In turn, this is only
4176                                triggered in the "postponed utf8 string" tests
4177                                in t/op/pat.t  */
4178                         }
4179                     }
4180
4181                     if (mg) {
4182                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4183                         assert(rx);
4184                     }
4185                     if (rx) {
4186                         rx = reg_temp_copy(NULL, rx);
4187                     }
4188                     else {
4189                         U32 pm_flags = 0;
4190                         const I32 osize = PL_regsize;
4191
4192                         if (DO_UTF8(ret)) {
4193                             assert (SvUTF8(ret));
4194                         } else if (SvUTF8(ret)) {
4195                             /* Not doing UTF-8, despite what the SV says. Is
4196                                this only if we're trapped in use 'bytes'?  */
4197                             /* Make a copy of the octet sequence, but without
4198                                the flag on, as the compiler now honours the
4199                                SvUTF8 flag on ret.  */
4200                             STRLEN len;
4201                             const char *const p = SvPV(ret, len);
4202                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4203                         }
4204                         rx = CALLREGCOMP(ret, pm_flags);
4205                         if (!(SvFLAGS(ret)
4206                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4207                                  | SVs_GMG))) {
4208                             /* This isn't a first class regexp. Instead, it's
4209                                caching a regexp onto an existing, Perl visible
4210                                scalar.  */
4211                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4212                         }
4213                         PL_regsize = osize;
4214                     }
4215                     re_sv = rx;
4216                     re = (struct regexp *)SvANY(rx);
4217                 }
4218                 RXp_MATCH_COPIED_off(re);
4219                 re->subbeg = rex->subbeg;
4220                 re->sublen = rex->sublen;
4221                 rei = RXi_GET(re);
4222                 DEBUG_EXECUTE_r(
4223                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4224                         "Matching embedded");
4225                 );
4226                 startpoint = rei->program + 1;
4227                 ST.close_paren = 0; /* only used for GOSUB */
4228                 /* borrowed from regtry */
4229                 if (PL_reg_start_tmpl <= re->nparens) {
4230                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4231                     if(PL_reg_start_tmp)
4232                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4233                     else
4234                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4235                 }
4236
4237         eval_recurse_doit: /* Share code with GOSUB below this line */
4238                 /* run the pattern returned from (??{...}) */
4239                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4240                 REGCP_SET(ST.lastcp);
4241
4242                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4243
4244                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4245                 PL_reglastparen = &re->lastparen;
4246                 PL_reglastcloseparen = &re->lastcloseparen;
4247                 re->lastparen = 0;
4248                 re->lastcloseparen = 0;
4249
4250                 PL_reginput = locinput;
4251                 PL_regsize = 0;
4252
4253                 /* XXXX This is too dramatic a measure... */
4254                 PL_reg_maxiter = 0;
4255
4256                 ST.toggle_reg_flags = PL_reg_flags;
4257                 if (RX_UTF8(re_sv))
4258                     PL_reg_flags |= RF_utf8;
4259                 else
4260                     PL_reg_flags &= ~RF_utf8;
4261                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4262
4263                 ST.prev_rex = rex_sv;
4264                 ST.prev_curlyx = cur_curlyx;
4265                 SETREX(rex_sv,re_sv);
4266                 rex = re;
4267                 rexi = rei;
4268                 cur_curlyx = NULL;
4269                 ST.B = next;
4270                 ST.prev_eval = cur_eval;
4271                 cur_eval = st;
4272                 /* now continue from first node in postoned RE */
4273                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4274                 /* NOTREACHED */
4275             }
4276             /* logical is 1,   /(?(?{...})X|Y)/ */
4277             sw = cBOOL(SvTRUE(ret));
4278             logical = 0;
4279             break;
4280         }
4281
4282         case EVAL_AB: /* cleanup after a successful (??{A})B */
4283             /* note: this is called twice; first after popping B, then A */
4284             PL_reg_flags ^= ST.toggle_reg_flags;
4285             ReREFCNT_dec(rex_sv);
4286             SETREX(rex_sv,ST.prev_rex);
4287             rex = (struct regexp *)SvANY(rex_sv);
4288             rexi = RXi_GET(rex);
4289             regcpblow(ST.cp);
4290             cur_eval = ST.prev_eval;
4291             cur_curlyx = ST.prev_curlyx;
4292
4293             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4294             PL_reglastparen = &rex->lastparen;
4295             PL_reglastcloseparen = &rex->lastcloseparen;
4296             /* also update PL_regoffs */
4297             PL_regoffs = rex->offs;
4298
4299             /* XXXX This is too dramatic a measure... */
4300             PL_reg_maxiter = 0;
4301             if ( nochange_depth )
4302                 nochange_depth--;
4303             sayYES;
4304
4305
4306         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4307             /* note: this is called twice; first after popping B, then A */
4308             PL_reg_flags ^= ST.toggle_reg_flags;
4309             ReREFCNT_dec(rex_sv);
4310             SETREX(rex_sv,ST.prev_rex);
4311             rex = (struct regexp *)SvANY(rex_sv);
4312             rexi = RXi_GET(rex);
4313             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4314             PL_reglastparen = &rex->lastparen;
4315             PL_reglastcloseparen = &rex->lastcloseparen;
4316
4317             PL_reginput = locinput;
4318             REGCP_UNWIND(ST.lastcp);
4319             regcppop(rex);
4320             cur_eval = ST.prev_eval;
4321             cur_curlyx = ST.prev_curlyx;
4322             /* XXXX This is too dramatic a measure... */
4323             PL_reg_maxiter = 0;
4324             if ( nochange_depth )
4325                 nochange_depth--;
4326             sayNO_SILENT;
4327 #undef ST
4328
4329         case OPEN:
4330             n = ARG(scan);  /* which paren pair */
4331             PL_reg_start_tmp[n] = locinput;
4332             if (n > PL_regsize)
4333                 PL_regsize = n;
4334             lastopen = n;
4335             break;
4336         case CLOSE:
4337             n = ARG(scan);  /* which paren pair */
4338             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4339             PL_regoffs[n].end = locinput - PL_bostr;
4340             /*if (n > PL_regsize)
4341                 PL_regsize = n;*/
4342             if (n > *PL_reglastparen)
4343                 *PL_reglastparen = n;
4344             *PL_reglastcloseparen = n;
4345             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4346                 goto fake_end;
4347             }
4348             break;
4349         case ACCEPT:
4350             if (ARG(scan)){
4351                 regnode *cursor;
4352                 for (cursor=scan;
4353                      cursor && OP(cursor)!=END;
4354                      cursor=regnext(cursor))
4355                 {
4356                     if ( OP(cursor)==CLOSE ){
4357                         n = ARG(cursor);
4358                         if ( n <= lastopen ) {
4359                             PL_regoffs[n].start
4360                                 = PL_reg_start_tmp[n] - PL_bostr;
4361                             PL_regoffs[n].end = locinput - PL_bostr;
4362                             /*if (n > PL_regsize)
4363                             PL_regsize = n;*/
4364                             if (n > *PL_reglastparen)
4365                                 *PL_reglastparen = n;
4366                             *PL_reglastcloseparen = n;
4367                             if ( n == ARG(scan) || (cur_eval &&
4368                                 cur_eval->u.eval.close_paren == n))
4369                                 break;
4370                         }
4371                     }
4372                 }
4373             }
4374             goto fake_end;
4375             /*NOTREACHED*/
4376         case GROUPP:
4377             n = ARG(scan);  /* which paren pair */
4378             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4379             break;
4380         case NGROUPP:
4381             /* reg_check_named_buff_matched returns 0 for no match */
4382             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4383             break;
4384         case INSUBP:
4385             n = ARG(scan);
4386             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4387             break;
4388         case DEFINEP:
4389             sw = 0;
4390             break;
4391         case IFTHEN:
4392             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4393             if (sw)
4394                 next = NEXTOPER(NEXTOPER(scan));
4395             else {
4396                 next = scan + ARG(scan);
4397                 if (OP(next) == IFTHEN) /* Fake one. */
4398                     next = NEXTOPER(NEXTOPER(next));
4399             }
4400             break;
4401         case LOGICAL:
4402             logical = scan->flags;
4403             break;
4404
4405 /*******************************************************************
4406
4407 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4408 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4409 STAR/PLUS/CURLY/CURLYN are used instead.)
4410
4411 A*B is compiled as <CURLYX><A><WHILEM><B>
4412
4413 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4414 state, which contains the current count, initialised to -1. It also sets
4415 cur_curlyx to point to this state, with any previous value saved in the
4416 state block.
4417
4418 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4419 since the pattern may possibly match zero times (i.e. it's a while {} loop
4420 rather than a do {} while loop).
4421
4422 Each entry to WHILEM represents a successful match of A. The count in the
4423 CURLYX block is incremented, another WHILEM state is pushed, and execution
4424 passes to A or B depending on greediness and the current count.
4425
4426 For example, if matching against the string a1a2a3b (where the aN are
4427 substrings that match /A/), then the match progresses as follows: (the
4428 pushed states are interspersed with the bits of strings matched so far):
4429
4430     <CURLYX cnt=-1>
4431     <CURLYX cnt=0><WHILEM>
4432     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4433     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4434     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4435     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4436
4437 (Contrast this with something like CURLYM, which maintains only a single
4438 backtrack state:
4439
4440     <CURLYM cnt=0> a1
4441     a1 <CURLYM cnt=1> a2
4442     a1 a2 <CURLYM cnt=2> a3
4443     a1 a2 a3 <CURLYM cnt=3> b
4444 )
4445
4446 Each WHILEM state block marks a point to backtrack to upon partial failure
4447 of A or B, and also contains some minor state data related to that
4448 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4449 overall state, such as the count, and pointers to the A and B ops.
4450
4451 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4452 must always point to the *current* CURLYX block, the rules are:
4453
4454 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4455 and set cur_curlyx to point the new block.
4456
4457 When popping the CURLYX block after a successful or unsuccessful match,
4458 restore the previous cur_curlyx.
4459
4460 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4461 to the outer one saved in the CURLYX block.
4462
4463 When popping the WHILEM block after a successful or unsuccessful B match,
4464 restore the previous cur_curlyx.
4465
4466 Here's an example for the pattern (AI* BI)*BO
4467 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4468
4469 cur_
4470 curlyx backtrack stack
4471 ------ ---------------
4472 NULL
4473 CO     <CO prev=NULL> <WO>
4474 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4475 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4476 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4477
4478 At this point the pattern succeeds, and we work back down the stack to
4479 clean up, restoring as we go:
4480
4481 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4482 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4483 CO     <CO prev=NULL> <WO>
4484 NULL
4485
4486 *******************************************************************/
4487
4488 #define ST st->u.curlyx
4489
4490         case CURLYX:    /* start of /A*B/  (for complex A) */
4491         {
4492             /* No need to save/restore up to this paren */
4493             I32 parenfloor = scan->flags;
4494
4495             assert(next); /* keep Coverity happy */
4496             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4497                 next += ARG(next);
4498
4499             /* XXXX Probably it is better to teach regpush to support
4500                parenfloor > PL_regsize... */
4501             if (parenfloor > (I32)*PL_reglastparen)
4502                 parenfloor = *PL_reglastparen; /* Pessimization... */
4503
4504             ST.prev_curlyx= cur_curlyx;
4505             cur_curlyx = st;
4506             ST.cp = PL_savestack_ix;
4507
4508             /* these fields contain the state of the current curly.
4509              * they are accessed by subsequent WHILEMs */
4510             ST.parenfloor = parenfloor;
4511             ST.me = scan;
4512             ST.B = next;
4513             ST.minmod = minmod;
4514             minmod = 0;
4515             ST.count = -1;      /* this will be updated by WHILEM */
4516             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4517
4518             PL_reginput = locinput;
4519             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4520             /* NOTREACHED */
4521         }
4522
4523         case CURLYX_end: /* just finished matching all of A*B */
4524             cur_curlyx = ST.prev_curlyx;
4525             sayYES;
4526             /* NOTREACHED */
4527
4528         case CURLYX_end_fail: /* just failed to match all of A*B */
4529             regcpblow(ST.cp);
4530             cur_curlyx = ST.prev_curlyx;
4531             sayNO;
4532             /* NOTREACHED */
4533
4534
4535 #undef ST
4536 #define ST st->u.whilem
4537
4538         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4539         {
4540             /* see the discussion above about CURLYX/WHILEM */
4541             I32 n;
4542             int min = ARG1(cur_curlyx->u.curlyx.me);
4543             int max = ARG2(cur_curlyx->u.curlyx.me);
4544             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4545
4546             assert(cur_curlyx); /* keep Coverity happy */
4547             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4548             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4549             ST.cache_offset = 0;
4550             ST.cache_mask = 0;
4551
4552             PL_reginput = locinput;
4553
4554             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4555                   "%*s  whilem: matched %ld out of %d..%d\n",
4556                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4557             );
4558
4559             /* First just match a string of min A's. */
4560
4561             if (n < min) {
4562                 cur_curlyx->u.curlyx.lastloc = locinput;
4563                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4564                 /* NOTREACHED */
4565             }
4566
4567             /* If degenerate A matches "", assume A done. */
4568
4569             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4570                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4571                    "%*s  whilem: empty match detected, trying continuation...\n",
4572                    REPORT_CODE_OFF+depth*2, "")
4573                 );
4574                 goto do_whilem_B_max;
4575             }
4576
4577             /* super-linear cache processing */
4578
4579             if (scan->flags) {
4580
4581                 if (!PL_reg_maxiter) {
4582                     /* start the countdown: Postpone detection until we
4583                      * know the match is not *that* much linear. */
4584                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4585                     /* possible overflow for long strings and many CURLYX's */
4586                     if (PL_reg_maxiter < 0)
4587                         PL_reg_maxiter = I32_MAX;
4588                     PL_reg_leftiter = PL_reg_maxiter;
4589                 }
4590
4591                 if (PL_reg_leftiter-- == 0) {
4592                     /* initialise cache */
4593                     const I32 size = (PL_reg_maxiter + 7)/8;
4594                     if (PL_reg_poscache) {
4595                         if ((I32)PL_reg_poscache_size < size) {
4596                             Renew(PL_reg_poscache, size, char);
4597                             PL_reg_poscache_size = size;
4598                         }
4599                         Zero(PL_reg_poscache, size, char);
4600                     }
4601                     else {
4602                         PL_reg_poscache_size = size;
4603                         Newxz(PL_reg_poscache, size, char);
4604                     }
4605                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4606       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4607                               PL_colors[4], PL_colors[5])
4608                     );
4609                 }
4610
4611                 if (PL_reg_leftiter < 0) {
4612                     /* have we already failed at this position? */
4613                     I32 offset, mask;
4614                     offset  = (scan->flags & 0xf) - 1
4615                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4616                     mask    = 1 << (offset % 8);
4617                     offset /= 8;
4618                     if (PL_reg_poscache[offset] & mask) {
4619                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4620                             "%*s  whilem: (cache) already tried at this position...\n",
4621                             REPORT_CODE_OFF+depth*2, "")
4622                         );
4623                         sayNO; /* cache records failure */
4624                     }
4625                     ST.cache_offset = offset;
4626                     ST.cache_mask   = mask;
4627                 }
4628             }
4629
4630             /* Prefer B over A for minimal matching. */
4631
4632             if (cur_curlyx->u.curlyx.minmod) {
4633                 ST.save_curlyx = cur_curlyx;
4634                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4635                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4636                 REGCP_SET(ST.lastcp);
4637                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4638                 /* NOTREACHED */
4639             }
4640
4641             /* Prefer A over B for maximal matching. */
4642
4643             if (n < max) { /* More greed allowed? */
4644                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4645                 cur_curlyx->u.curlyx.lastloc = locinput;
4646                 REGCP_SET(ST.lastcp);
4647                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4648                 /* NOTREACHED */
4649             }
4650             goto do_whilem_B_max;
4651         }
4652         /* NOTREACHED */
4653
4654         case WHILEM_B_min: /* just matched B in a minimal match */
4655         case WHILEM_B_max: /* just matched B in a maximal match */
4656             cur_curlyx = ST.save_curlyx;
4657             sayYES;
4658             /* NOTREACHED */
4659
4660         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4661             cur_curlyx = ST.save_curlyx;
4662             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4663             cur_curlyx->u.curlyx.count--;
4664             CACHEsayNO;
4665             /* NOTREACHED */
4666
4667         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4668             REGCP_UNWIND(ST.lastcp);
4669             regcppop(rex);
4670             /* FALL THROUGH */
4671         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4672             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4673             cur_curlyx->u.curlyx.count--;
4674             CACHEsayNO;
4675             /* NOTREACHED */
4676
4677         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4678             REGCP_UNWIND(ST.lastcp);
4679             regcppop(rex);      /* Restore some previous $<digit>s? */
4680             PL_reginput = locinput;
4681             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4682                 "%*s  whilem: failed, trying continuation...\n",
4683                 REPORT_CODE_OFF+depth*2, "")
4684             );
4685           do_whilem_B_max:
4686             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4687                 && ckWARN(WARN_REGEXP)
4688                 && !(PL_reg_flags & RF_warned))
4689             {
4690                 PL_reg_flags |= RF_warned;
4691                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4692                      "Complex regular subexpression recursion",
4693                      REG_INFTY - 1);
4694             }
4695
4696             /* now try B */
4697             ST.save_curlyx = cur_curlyx;
4698             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4699             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4700             /* NOTREACHED */
4701
4702         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4703             cur_curlyx = ST.save_curlyx;
4704             REGCP_UNWIND(ST.lastcp);
4705             regcppop(rex);
4706
4707             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4708                 /* Maximum greed exceeded */
4709                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4710                     && ckWARN(WARN_REGEXP)
4711                     && !(PL_reg_flags & RF_warned))
4712                 {
4713                     PL_reg_flags |= RF_warned;
4714                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4715                         "%s limit (%d) exceeded",
4716                         "Complex regular subexpression recursion",
4717                         REG_INFTY - 1);
4718                 }
4719                 cur_curlyx->u.curlyx.count--;
4720                 CACHEsayNO;
4721             }
4722
4723             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4724                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4725             );
4726             /* Try grabbing another A and see if it helps. */
4727             PL_reginput = locinput;
4728             cur_curlyx->u.curlyx.lastloc = locinput;
4729             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4730             REGCP_SET(ST.lastcp);
4731             PUSH_STATE_GOTO(WHILEM_A_min,
4732                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4733             /* NOTREACHED */
4734
4735 #undef  ST
4736 #define ST st->u.branch
4737
4738         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4739             next = scan + ARG(scan);
4740             if (next == scan)
4741                 next = NULL;
4742             scan = NEXTOPER(scan);
4743             /* FALL THROUGH */
4744
4745         case BRANCH:        /*  /(...|A|...)/ */
4746             scan = NEXTOPER(scan); /* scan now points to inner node */
4747             ST.lastparen = *PL_reglastparen;
4748             ST.next_branch = next;
4749             REGCP_SET(ST.cp);
4750             PL_reginput = locinput;
4751
4752             /* Now go into the branch */
4753             if (has_cutgroup) {
4754                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4755             } else {
4756                 PUSH_STATE_GOTO(BRANCH_next, scan);
4757             }
4758             /* NOTREACHED */
4759         case CUTGROUP:
4760             PL_reginput = locinput;
4761             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4762                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4763             PUSH_STATE_GOTO(CUTGROUP_next,next);
4764             /* NOTREACHED */
4765         case CUTGROUP_next_fail:
4766             do_cutgroup = 1;
4767             no_final = 1;
4768             if (st->u.mark.mark_name)
4769                 sv_commit = st->u.mark.mark_name;
4770             sayNO;
4771             /* NOTREACHED */
4772         case BRANCH_next:
4773             sayYES;
4774             /* NOTREACHED */
4775         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4776             if (do_cutgroup) {
4777                 do_cutgroup = 0;
4778                 no_final = 0;
4779             }
4780             REGCP_UNWIND(ST.cp);
4781             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4782                 PL_regoffs[n].end = -1;
4783             *PL_reglastparen = n;
4784             /*dmq: *PL_reglastcloseparen = n; */
4785             scan = ST.next_branch;
4786             /* no more branches? */
4787             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4788                 DEBUG_EXECUTE_r({
4789                     PerlIO_printf( Perl_debug_log,
4790                         "%*s  %sBRANCH failed...%s\n",
4791                         REPORT_CODE_OFF+depth*2, "",
4792                         PL_colors[4],
4793                         PL_colors[5] );
4794                 });
4795                 sayNO_SILENT;
4796             }
4797             continue; /* execute next BRANCH[J] op */
4798             /* NOTREACHED */
4799
4800         case MINMOD:
4801             minmod = 1;
4802             break;
4803
4804 #undef  ST
4805 #define ST st->u.curlym
4806
4807         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4808
4809             /* This is an optimisation of CURLYX that enables us to push
4810              * only a single backtracking state, no matter how many matches
4811              * there are in {m,n}. It relies on the pattern being constant
4812              * length, with no parens to influence future backrefs
4813              */
4814
4815             ST.me = scan;
4816             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4817
4818             /* if paren positive, emulate an OPEN/CLOSE around A */
4819             if (ST.me->flags) {
4820                 U32 paren = ST.me->flags;
4821                 if (paren > PL_regsize)
4822                     PL_regsize = paren;
4823                 if (paren > *PL_reglastparen)
4824                     *PL_reglastparen = paren;
4825                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4826             }
4827             ST.A = scan;
4828             ST.B = next;
4829             ST.alen = 0;
4830             ST.count = 0;
4831             ST.minmod = minmod;
4832             minmod = 0;
4833             ST.c1 = CHRTEST_UNINIT;
4834             REGCP_SET(ST.cp);
4835
4836             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4837                 goto curlym_do_B;
4838
4839           curlym_do_A: /* execute the A in /A{m,n}B/  */
4840             PL_reginput = locinput;
4841             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4842             /* NOTREACHED */
4843
4844         case CURLYM_A: /* we've just matched an A */
4845             locinput = st->locinput;
4846             nextchr = UCHARAT(locinput);
4847
4848             ST.count++;
4849             /* after first match, determine A's length: u.curlym.alen */
4850             if (ST.count == 1) {
4851                 if (PL_reg_match_utf8) {
4852                     char *s = locinput;
4853                     while (s < PL_reginput) {
4854                         ST.alen++;
4855                         s += UTF8SKIP(s);
4856                     }
4857                 }
4858                 else {
4859                     ST.alen = PL_reginput - locinput;
4860                 }
4861                 if (ST.alen == 0)
4862                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4863             }
4864             DEBUG_EXECUTE_r(
4865                 PerlIO_printf(Perl_debug_log,
4866                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4867                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4868                           (IV) ST.count, (IV)ST.alen)
4869             );
4870
4871             locinput = PL_reginput;
4872
4873             if (cur_eval && cur_eval->u.eval.close_paren &&
4874                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4875                 goto fake_end;
4876
4877             {
4878                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4879                 if ( max == REG_INFTY || ST.count < max )
4880                     goto curlym_do_A; /* try to match another A */
4881             }
4882             goto curlym_do_B; /* try to match B */
4883
4884         case CURLYM_A_fail: /* just failed to match an A */
4885             REGCP_UNWIND(ST.cp);
4886
4887             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4888                 || (cur_eval && cur_eval->u.eval.close_paren &&
4889                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4890                 sayNO;
4891
4892           curlym_do_B: /* execute the B in /A{m,n}B/  */
4893             PL_reginput = locinput;
4894             if (ST.c1 == CHRTEST_UNINIT) {
4895                 /* calculate c1 and c2 for possible match of 1st char
4896                  * following curly */
4897                 ST.c1 = ST.c2 = CHRTEST_VOID;
4898                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4899                     regnode *text_node = ST.B;
4900                     if (! HAS_TEXT(text_node))
4901                         FIND_NEXT_IMPT(text_node);
4902                     /* this used to be
4903
4904                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4905
4906                         But the former is redundant in light of the latter.
4907
4908                         if this changes back then the macro for
4909                         IS_TEXT and friends need to change.
4910                      */
4911                     if (PL_regkind[OP(text_node)] == EXACT)
4912                     {
4913
4914                         ST.c1 = (U8)*STRING(text_node);
4915                         switch (OP(text_node)) {
4916                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
4917                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
4918                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
4919                             default: ST.c2 = ST.c1;
4920                         }
4921                     }
4922                 }
4923             }
4924
4925             DEBUG_EXECUTE_r(
4926                 PerlIO_printf(Perl_debug_log,
4927                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4928                     (int)(REPORT_CODE_OFF+(depth*2)),
4929                     "", (IV)ST.count)
4930                 );
4931             if (ST.c1 != CHRTEST_VOID
4932                     && UCHARAT(PL_reginput) != ST.c1
4933                     && UCHARAT(PL_reginput) != ST.c2)
4934             {
4935                 /* simulate B failing */
4936                 DEBUG_OPTIMISE_r(
4937                     PerlIO_printf(Perl_debug_log,
4938                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4939                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4940                         (IV)ST.c1,(IV)ST.c2
4941                 ));
4942                 state_num = CURLYM_B_fail;
4943                 goto reenter_switch;
4944             }
4945
4946             if (ST.me->flags) {
4947                 /* mark current A as captured */
4948                 I32 paren = ST.me->flags;
4949                 if (ST.count) {
4950                     PL_regoffs[paren].start
4951                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4952                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4953                     /*dmq: *PL_reglastcloseparen = paren; */
4954                 }
4955                 else
4956                     PL_regoffs[paren].end = -1;
4957                 if (cur_eval && cur_eval->u.eval.close_paren &&
4958                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4959                 {
4960                     if (ST.count)
4961                         goto fake_end;
4962                     else
4963                         sayNO;
4964                 }
4965             }
4966
4967             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4968             /* NOTREACHED */
4969
4970         case CURLYM_B_fail: /* just failed to match a B */
4971             REGCP_UNWIND(ST.cp);
4972             if (ST.minmod) {
4973                 I32 max = ARG2(ST.me);
4974                 if (max != REG_INFTY && ST.count == max)
4975                     sayNO;
4976                 goto curlym_do_A; /* try to match a further A */
4977             }
4978             /* backtrack one A */
4979             if (ST.count == ARG1(ST.me) /* min */)
4980                 sayNO;
4981             ST.count--;
4982             locinput = HOPc(locinput, -ST.alen);
4983             goto curlym_do_B; /* try to match B */
4984
4985 #undef ST
4986 #define ST st->u.curly
4987
4988 #define CURLY_SETPAREN(paren, success) \
4989     if (paren) { \
4990         if (success) { \
4991             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4992             PL_regoffs[paren].end = locinput - PL_bostr; \
4993             *PL_reglastcloseparen = paren; \
4994         } \
4995         else \
4996             PL_regoffs[paren].end = -1; \
4997     }
4998
4999         case STAR:              /*  /A*B/ where A is width 1 */
5000             ST.paren = 0;
5001             ST.min = 0;
5002             ST.max = REG_INFTY;
5003             scan = NEXTOPER(scan);
5004             goto repeat;
5005         case PLUS:              /*  /A+B/ where A is width 1 */
5006             ST.paren = 0;
5007             ST.min = 1;
5008             ST.max = REG_INFTY;
5009             scan = NEXTOPER(scan);
5010             goto repeat;
5011         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5012             ST.paren = scan->flags;     /* Which paren to set */
5013             if (ST.paren > PL_regsize)
5014                 PL_regsize = ST.paren;
5015             if (ST.paren > *PL_reglastparen)
5016                 *PL_reglastparen = ST.paren;
5017             ST.min = ARG1(scan);  /* min to match */
5018             ST.max = ARG2(scan);  /* max to match */
5019             if (cur_eval && cur_eval->u.eval.close_paren &&
5020                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5021                 ST.min=1;
5022                 ST.max=1;
5023             }
5024             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5025             goto repeat;
5026         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5027             ST.paren = 0;
5028             ST.min = ARG1(scan);  /* min to match */
5029             ST.max = ARG2(scan);  /* max to match */
5030             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5031           repeat:
5032             /*
5033             * Lookahead to avoid useless match attempts
5034             * when we know what character comes next.
5035             *
5036             * Used to only do .*x and .*?x, but now it allows
5037             * for )'s, ('s and (?{ ... })'s to be in the way
5038             * of the quantifier and the EXACT-like node.  -- japhy
5039             */
5040
5041             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5042                 sayNO;
5043             if (HAS_TEXT(next) || JUMPABLE(next)) {
5044                 U8 *s;
5045                 regnode *text_node = next;
5046
5047                 if (! HAS_TEXT(text_node))
5048                     FIND_NEXT_IMPT(text_node);
5049
5050                 if (! HAS_TEXT(text_node))
5051                     ST.c1 = ST.c2 = CHRTEST_VOID;
5052                 else {
5053                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5054                         ST.c1 = ST.c2 = CHRTEST_VOID;
5055                         goto assume_ok_easy;
5056                     }
5057                     else
5058                         s = (U8*)STRING(text_node);
5059
5060                     /*  Currently we only get here when
5061
5062                         PL_rekind[OP(text_node)] == EXACT
5063
5064                         if this changes back then the macro for IS_TEXT and
5065                         friends need to change. */
5066                     if (!UTF_PATTERN) {
5067                         ST.c1 = *s;
5068                         switch (OP(text_node)) {
5069                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5070                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5071                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5072                             default: ST.c2 = ST.c1; break;
5073                         }
5074                     }
5075                     else { /* UTF_PATTERN */
5076                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5077                              STRLEN ulen1, ulen2;
5078                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5079                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5080
5081                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5082                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5083 #ifdef EBCDIC
5084                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5085                                                     ckWARN(WARN_UTF8) ?
5086                                                     0 : UTF8_ALLOW_ANY);
5087                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5088                                                     ckWARN(WARN_UTF8) ?
5089                                                     0 : UTF8_ALLOW_ANY);
5090 #else
5091                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5092                                                     uniflags);
5093                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5094                                                     uniflags);
5095 #endif
5096                         }
5097                         else {
5098                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5099                                                      uniflags);
5100                         }
5101                     }
5102                 }
5103             }
5104             else
5105                 ST.c1 = ST.c2 = CHRTEST_VOID;
5106         assume_ok_easy:
5107
5108             ST.A = scan;
5109             ST.B = next;
5110             PL_reginput = locinput;
5111             if (minmod) {
5112                 minmod = 0;
5113                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5114                     sayNO;
5115                 ST.count = ST.min;
5116                 locinput = PL_reginput;
5117                 REGCP_SET(ST.cp);
5118                 if (ST.c1 == CHRTEST_VOID)
5119                     goto curly_try_B_min;
5120
5121                 ST.oldloc = locinput;
5122
5123                 /* set ST.maxpos to the furthest point along the
5124                  * string that could possibly match */
5125                 if  (ST.max == REG_INFTY) {
5126                     ST.maxpos = PL_regeol - 1;
5127                     if (utf8_target)
5128                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5129                             ST.maxpos--;
5130                 }
5131                 else if (utf8_target) {
5132                     int m = ST.max - ST.min;
5133                     for (ST.maxpos = locinput;
5134                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5135                         ST.maxpos += UTF8SKIP(ST.maxpos);
5136                 }
5137                 else {
5138                     ST.maxpos = locinput + ST.max - ST.min;
5139                     if (ST.maxpos >= PL_regeol)
5140                         ST.maxpos = PL_regeol - 1;
5141                 }
5142                 goto curly_try_B_min_known;
5143
5144             }
5145             else {
5146                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5147                 locinput = PL_reginput;
5148                 if (ST.count < ST.min)
5149                     sayNO;
5150                 if ((ST.count > ST.min)
5151                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5152                 {
5153                     /* A{m,n} must come at the end of the string, there's
5154                      * no point in backing off ... */
5155                     ST.min = ST.count;
5156                     /* ...except that $ and \Z can match before *and* after
5157                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5158                        We may back off by one in this case. */
5159                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5160                         ST.min--;
5161                 }
5162                 REGCP_SET(ST.cp);
5163                 goto curly_try_B_max;
5164             }
5165             /* NOTREACHED */
5166
5167
5168         case CURLY_B_min_known_fail:
5169             /* failed to find B in a non-greedy match where c1,c2 valid */
5170             if (ST.paren && ST.count)
5171                 PL_regoffs[ST.paren].end = -1;
5172
5173             PL_reginput = locinput;     /* Could be reset... */
5174             REGCP_UNWIND(ST.cp);
5175             /* Couldn't or didn't -- move forward. */
5176             ST.oldloc = locinput;
5177             if (utf8_target)
5178                 locinput += UTF8SKIP(locinput);
5179             else
5180                 locinput++;
5181             ST.count++;
5182           curly_try_B_min_known:
5183              /* find the next place where 'B' could work, then call B */
5184             {
5185                 int n;
5186                 if (utf8_target) {
5187                     n = (ST.oldloc == locinput) ? 0 : 1;
5188                     if (ST.c1 == ST.c2) {
5189                         STRLEN len;
5190                         /* set n to utf8_distance(oldloc, locinput) */
5191                         while (locinput <= ST.maxpos &&
5192                                utf8n_to_uvchr((U8*)locinput,
5193                                               UTF8_MAXBYTES, &len,
5194                                               uniflags) != (UV)ST.c1) {
5195                             locinput += len;
5196                             n++;
5197                         }
5198                     }
5199                     else {
5200                         /* set n to utf8_distance(oldloc, locinput) */
5201                         while (locinput <= ST.maxpos) {
5202                             STRLEN len;
5203                             const UV c = utf8n_to_uvchr((U8*)locinput,
5204                                                   UTF8_MAXBYTES, &len,
5205                                                   uniflags);
5206                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5207                                 break;
5208                             locinput += len;
5209                             n++;
5210                         }
5211                     }
5212                 }
5213                 else {
5214                     if (ST.c1 == ST.c2) {
5215                         while (locinput <= ST.maxpos &&
5216                                UCHARAT(locinput) != ST.c1)
5217                             locinput++;
5218                     }
5219                     else {
5220                         while (locinput <= ST.maxpos
5221                                && UCHARAT(locinput) != ST.c1
5222                                && UCHARAT(locinput) != ST.c2)
5223                             locinput++;
5224                     }
5225                     n = locinput - ST.oldloc;
5226                 }
5227                 if (locinput > ST.maxpos)
5228                     sayNO;
5229                 /* PL_reginput == oldloc now */
5230                 if (n) {
5231                     ST.count += n;
5232                     if (regrepeat(rex, ST.A, n, depth) < n)
5233                         sayNO;
5234                 }
5235                 PL_reginput = locinput;
5236                 CURLY_SETPAREN(ST.paren, ST.count);
5237                 if (cur_eval && cur_eval->u.eval.close_paren &&
5238                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5239                     goto fake_end;
5240                 }
5241                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5242             }
5243             /* NOTREACHED */
5244
5245
5246         case CURLY_B_min_fail:
5247             /* failed to find B in a non-greedy match where c1,c2 invalid */
5248             if (ST.paren && ST.count)
5249                 PL_regoffs[ST.paren].end = -1;
5250
5251             REGCP_UNWIND(ST.cp);
5252             /* failed -- move forward one */
5253             PL_reginput = locinput;
5254             if (regrepeat(rex, ST.A, 1, depth)) {
5255                 ST.count++;
5256                 locinput = PL_reginput;
5257                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5258                         ST.count > 0)) /* count overflow ? */
5259                 {
5260                   curly_try_B_min:
5261                     CURLY_SETPAREN(ST.paren, ST.count);
5262                     if (cur_eval && cur_eval->u.eval.close_paren &&
5263                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5264                         goto fake_end;
5265                     }
5266                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5267                 }
5268             }
5269             sayNO;
5270             /* NOTREACHED */
5271
5272
5273         curly_try_B_max:
5274             /* a successful greedy match: now try to match B */
5275             if (cur_eval && cur_eval->u.eval.close_paren &&
5276                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5277                 goto fake_end;
5278             }
5279             {
5280                 UV c = 0;
5281                 if (ST.c1 != CHRTEST_VOID)
5282                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5283                                            UTF8_MAXBYTES, 0, uniflags)
5284                                 : (UV) UCHARAT(PL_reginput);
5285                 /* If it could work, try it. */
5286                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5287                     CURLY_SETPAREN(ST.paren, ST.count);
5288                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5289                     /* NOTREACHED */
5290                 }
5291             }
5292             /* FALL THROUGH */
5293         case CURLY_B_max_fail:
5294             /* failed to find B in a greedy match */
5295             if (ST.paren && ST.count)
5296                 PL_regoffs[ST.paren].end = -1;
5297
5298             REGCP_UNWIND(ST.cp);
5299             /*  back up. */
5300             if (--ST.count < ST.min)
5301                 sayNO;
5302             PL_reginput = locinput = HOPc(locinput, -1);
5303             goto curly_try_B_max;
5304
5305 #undef ST
5306
5307         case END:
5308             fake_end:
5309             if (cur_eval) {
5310                 /* we've just finished A in /(??{A})B/; now continue with B */
5311                 I32 tmpix;
5312                 st->u.eval.toggle_reg_flags
5313                             = cur_eval->u.eval.toggle_reg_flags;
5314                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5315
5316                 st->u.eval.prev_rex = rex_sv;           /* inner */
5317                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5318                 rex = (struct regexp *)SvANY(rex_sv);
5319                 rexi = RXi_GET(rex);
5320                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5321                 ReREFCNT_inc(rex_sv);
5322                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5323
5324                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5325                 PL_reglastparen = &rex->lastparen;
5326                 PL_reglastcloseparen = &rex->lastcloseparen;
5327
5328                 REGCP_SET(st->u.eval.lastcp);
5329                 PL_reginput = locinput;
5330
5331                 /* Restore parens of the outer rex without popping the
5332                  * savestack */
5333                 tmpix = PL_savestack_ix;
5334                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5335                 regcppop(rex);
5336                 PL_savestack_ix = tmpix;
5337
5338                 st->u.eval.prev_eval = cur_eval;
5339                 cur_eval = cur_eval->u.eval.prev_eval;
5340                 DEBUG_EXECUTE_r(
5341                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5342                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5343                 if ( nochange_depth )
5344                     nochange_depth--;
5345
5346                 PUSH_YES_STATE_GOTO(EVAL_AB,
5347                         st->u.eval.prev_eval->u.eval.B); /* match B */
5348             }
5349
5350             if (locinput < reginfo->till) {
5351                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5352                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5353                                       PL_colors[4],
5354                                       (long)(locinput - PL_reg_starttry),
5355                                       (long)(reginfo->till - PL_reg_starttry),
5356                                       PL_colors[5]));
5357
5358                 sayNO_SILENT;           /* Cannot match: too short. */
5359             }
5360             PL_reginput = locinput;     /* put where regtry can find it */
5361             sayYES;                     /* Success! */
5362
5363         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5364             DEBUG_EXECUTE_r(
5365             PerlIO_printf(Perl_debug_log,
5366                 "%*s  %ssubpattern success...%s\n",
5367                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5368             PL_reginput = locinput;     /* put where regtry can find it */
5369             sayYES;                     /* Success! */
5370
5371 #undef  ST
5372 #define ST st->u.ifmatch
5373
5374         case SUSPEND:   /* (?>A) */
5375             ST.wanted = 1;
5376             PL_reginput = locinput;
5377             goto do_ifmatch;
5378
5379         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5380             ST.wanted = 0;
5381             goto ifmatch_trivial_fail_test;
5382
5383         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5384             ST.wanted = 1;
5385           ifmatch_trivial_fail_test:
5386             if (scan->flags) {
5387                 char * const s = HOPBACKc(locinput, scan->flags);
5388                 if (!s) {
5389                     /* trivial fail */
5390                     if (logical) {
5391                         logical = 0;
5392                         sw = 1 - cBOOL(ST.wanted);
5393                     }
5394                     else if (ST.wanted)
5395                         sayNO;
5396                     next = scan + ARG(scan);
5397                     if (next == scan)
5398                         next = NULL;
5399                     break;
5400                 }
5401                 PL_reginput = s;
5402             }
5403             else
5404                 PL_reginput = locinput;
5405
5406           do_ifmatch:
5407             ST.me = scan;
5408             ST.logical = logical;
5409             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5410
5411             /* execute body of (?...A) */
5412             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5413             /* NOTREACHED */
5414
5415         case IFMATCH_A_fail: /* body of (?...A) failed */
5416             ST.wanted = !ST.wanted;
5417             /* FALL THROUGH */
5418
5419         case IFMATCH_A: /* body of (?...A) succeeded */
5420             if (ST.logical) {
5421                 sw = cBOOL(ST.wanted);
5422             }
5423             else if (!ST.wanted)
5424                 sayNO;
5425
5426             if (OP(ST.me) == SUSPEND)
5427                 locinput = PL_reginput;
5428             else {
5429                 locinput = PL_reginput = st->locinput;
5430                 nextchr = UCHARAT(locinput);
5431             }
5432             scan = ST.me + ARG(ST.me);
5433             if (scan == ST.me)
5434                 scan = NULL;
5435             continue; /* execute B */
5436
5437 #undef ST
5438
5439         case LONGJMP:
5440             next = scan + ARG(scan);
5441             if (next == scan)
5442                 next = NULL;
5443             break;
5444         case COMMIT:
5445             reginfo->cutpoint = PL_regeol;
5446             /* FALLTHROUGH */
5447         case PRUNE:
5448             PL_reginput = locinput;
5449             if (!scan->flags)
5450                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5451             PUSH_STATE_GOTO(COMMIT_next,next);
5452             /* NOTREACHED */
5453         case COMMIT_next_fail:
5454             no_final = 1;
5455             /* FALLTHROUGH */
5456         case OPFAIL:
5457             sayNO;
5458             /* NOTREACHED */
5459
5460 #define ST st->u.mark
5461         case MARKPOINT:
5462             ST.prev_mark = mark_state;
5463             ST.mark_name = sv_commit = sv_yes_mark
5464                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5465             mark_state = st;
5466             ST.mark_loc = PL_reginput = locinput;
5467             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5468             /* NOTREACHED */
5469         case MARKPOINT_next:
5470             mark_state = ST.prev_mark;
5471             sayYES;
5472             /* NOTREACHED */
5473         case MARKPOINT_next_fail:
5474             if (popmark && sv_eq(ST.mark_name,popmark))
5475             {
5476                 if (ST.mark_loc > startpoint)
5477                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5478                 popmark = NULL; /* we found our mark */
5479                 sv_commit = ST.mark_name;
5480
5481                 DEBUG_EXECUTE_r({
5482                         PerlIO_printf(Perl_debug_log,
5483                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5484                             REPORT_CODE_OFF+depth*2, "",
5485                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5486                 });
5487             }
5488             mark_state = ST.prev_mark;
5489             sv_yes_mark = mark_state ?
5490                 mark_state->u.mark.mark_name : NULL;
5491             sayNO;
5492             /* NOTREACHED */
5493         case SKIP:
5494             PL_reginput = locinput;
5495             if (scan->flags) {
5496                 /* (*SKIP) : if we fail we cut here*/
5497                 ST.mark_name = NULL;
5498                 ST.mark_loc = locinput;
5499                 PUSH_STATE_GOTO(SKIP_next,next);
5500             } else {
5501                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5502                    otherwise do nothing.  Meaning we need to scan
5503                  */
5504                 regmatch_state *cur = mark_state;
5505                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5506
5507                 while (cur) {
5508                     if ( sv_eq( cur->u.mark.mark_name,
5509                                 find ) )
5510                     {
5511                         ST.mark_name = find;
5512                         PUSH_STATE_GOTO( SKIP_next, next );
5513                     }
5514                     cur = cur->u.mark.prev_mark;
5515                 }
5516             }
5517             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5518             break;
5519         case SKIP_next_fail:
5520             if (ST.mark_name) {
5521                 /* (*CUT:NAME) - Set up to search for the name as we
5522                    collapse the stack*/
5523                 popmark = ST.mark_name;
5524             } else {
5525                 /* (*CUT) - No name, we cut here.*/
5526                 if (ST.mark_loc > startpoint)
5527                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5528                 /* but we set sv_commit to latest mark_name if there
5529                    is one so they can test to see how things lead to this
5530                    cut */
5531                 if (mark_state)
5532                     sv_commit=mark_state->u.mark.mark_name;
5533             }
5534             no_final = 1;
5535             sayNO;
5536             /* NOTREACHED */
5537 #undef ST
5538         case FOLDCHAR:
5539             n = ARG(scan);
5540             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5541                 locinput += ln;
5542             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5543                 sayNO;
5544             } else  {
5545                 U8 folded[UTF8_MAXBYTES_CASE+1];
5546                 STRLEN foldlen;
5547                 const char * const l = locinput;
5548                 char *e = PL_regeol;
5549                 to_uni_fold(n, folded, &foldlen);
5550
5551                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5552                                l, &e, 0,  utf8_target)) {
5553                         sayNO;
5554                 }
5555                 locinput = e;
5556             }
5557             nextchr = UCHARAT(locinput);
5558             break;
5559         case LNBREAK:
5560             if ((n=is_LNBREAK(locinput,utf8_target))) {
5561                 locinput += n;
5562                 nextchr = UCHARAT(locinput);
5563             } else
5564                 sayNO;
5565             break;
5566
5567 #define CASE_CLASS(nAmE)                              \
5568         case nAmE:                                    \
5569             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5570                 locinput += n;                        \
5571                 nextchr = UCHARAT(locinput);          \
5572             } else                                    \
5573                 sayNO;                                \
5574             break;                                    \
5575         case N##nAmE:                                 \
5576             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5577                 sayNO;                                \
5578             } else {                                  \
5579                 locinput += UTF8SKIP(locinput);       \
5580                 nextchr = UCHARAT(locinput);          \
5581             }                                         \
5582             break
5583
5584         CASE_CLASS(VERTWS);
5585         CASE_CLASS(HORIZWS);
5586 #undef CASE_CLASS
5587
5588         default:
5589             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5590                           PTR2UV(scan), OP(scan));
5591             Perl_croak(aTHX_ "regexp memory corruption");
5592
5593         } /* end switch */
5594
5595         /* switch break jumps here */
5596         scan = next; /* prepare to execute the next op and ... */
5597         continue;    /* ... jump back to the top, reusing st */
5598         /* NOTREACHED */
5599
5600       push_yes_state:
5601         /* push a state that backtracks on success */
5602         st->u.yes.prev_yes_state = yes_state;
5603         yes_state = st;
5604         /* FALL THROUGH */
5605       push_state:
5606         /* push a new regex state, then continue at scan  */
5607         {
5608             regmatch_state *newst;
5609
5610             DEBUG_STACK_r({
5611                 regmatch_state *cur = st;
5612                 regmatch_state *curyes = yes_state;
5613                 int curd = depth;
5614                 regmatch_slab *slab = PL_regmatch_slab;
5615                 for (;curd > -1;cur--,curd--) {
5616                     if (cur < SLAB_FIRST(slab)) {
5617                         slab = slab->prev;
5618                         cur = SLAB_LAST(slab);
5619                     }
5620                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5621                         REPORT_CODE_OFF + 2 + depth * 2,"",
5622                         curd, PL_reg_name[cur->resume_state],
5623                         (curyes == cur) ? "yes" : ""
5624                     );
5625                     if (curyes == cur)
5626                         curyes = cur->u.yes.prev_yes_state;
5627                 }
5628             } else
5629                 DEBUG_STATE_pp("push")
5630             );
5631             depth++;
5632             st->locinput = locinput;
5633             newst = st+1;
5634             if (newst >  SLAB_LAST(PL_regmatch_slab))
5635                 newst = S_push_slab(aTHX);
5636             PL_regmatch_state = newst;
5637
5638             locinput = PL_reginput;
5639             nextchr = UCHARAT(locinput);
5640             st = newst;
5641             continue;
5642             /* NOTREACHED */
5643         }
5644     }
5645
5646     /*
5647     * We get here only if there's trouble -- normally "case END" is
5648     * the terminating point.
5649     */
5650     Perl_croak(aTHX_ "corrupted regexp pointers");
5651     /*NOTREACHED*/
5652     sayNO;
5653
5654 yes:
5655     if (yes_state) {
5656         /* we have successfully completed a subexpression, but we must now
5657          * pop to the state marked by yes_state and continue from there */
5658         assert(st != yes_state);
5659 #ifdef DEBUGGING
5660         while (st != yes_state) {
5661             st--;
5662             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5663                 PL_regmatch_slab = PL_regmatch_slab->prev;
5664                 st = SLAB_LAST(PL_regmatch_slab);
5665             }
5666             DEBUG_STATE_r({
5667                 if (no_final) {
5668                     DEBUG_STATE_pp("pop (no final)");
5669                 } else {
5670                     DEBUG_STATE_pp("pop (yes)");
5671                 }
5672             });
5673             depth--;
5674         }
5675 #else
5676         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5677             || yes_state > SLAB_LAST(PL_regmatch_slab))
5678         {
5679             /* not in this slab, pop slab */
5680             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5681             PL_regmatch_slab = PL_regmatch_slab->prev;
5682             st = SLAB_LAST(PL_regmatch_slab);
5683         }
5684         depth -= (st - yes_state);
5685 #endif
5686         st = yes_state;
5687         yes_state = st->u.yes.prev_yes_state;
5688         PL_regmatch_state = st;
5689
5690         if (no_final) {
5691             locinput= st->locinput;
5692             nextchr = UCHARAT(locinput);
5693         }
5694         state_num = st->resume_state + no_final;
5695         goto reenter_switch;
5696     }
5697
5698     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5699                           PL_colors[4], PL_colors[5]));
5700
5701     if (PL_reg_eval_set) {
5702         /* each successfully executed (?{...}) block does the equivalent of
5703          *   local $^R = do {...}
5704          * When popping the save stack, all these locals would be undone;
5705          * bypass this by setting the outermost saved $^R to the latest
5706          * value */
5707         if (oreplsv != GvSV(PL_replgv))
5708             sv_setsv(oreplsv, GvSV(PL_replgv));
5709     }
5710     result = 1;
5711     goto final_exit;
5712
5713 no:
5714     DEBUG_EXECUTE_r(
5715         PerlIO_printf(Perl_debug_log,
5716             "%*s  %sfailed...%s\n",
5717             REPORT_CODE_OFF+depth*2, "",
5718             PL_colors[4], PL_colors[5])
5719         );
5720
5721 no_silent:
5722     if (no_final) {
5723         if (yes_state) {
5724             goto yes;
5725         } else {
5726             goto final_exit;
5727         }
5728     }
5729     if (depth) {
5730         /* there's a previous state to backtrack to */
5731         st--;
5732         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5733             PL_regmatch_slab = PL_regmatch_slab->prev;
5734             st = SLAB_LAST(PL_regmatch_slab);
5735         }
5736         PL_regmatch_state = st;
5737         locinput= st->locinput;
5738         nextchr = UCHARAT(locinput);
5739
5740         DEBUG_STATE_pp("pop");
5741         depth--;
5742         if (yes_state == st)
5743             yes_state = st->u.yes.prev_yes_state;
5744
5745         state_num = st->resume_state + 1; /* failure = success + 1 */
5746         goto reenter_switch;
5747     }
5748     result = 0;
5749
5750   final_exit:
5751     if (rex->intflags & PREGf_VERBARG_SEEN) {
5752         SV *sv_err = get_sv("REGERROR", 1);
5753         SV *sv_mrk = get_sv("REGMARK", 1);
5754         if (result) {
5755             sv_commit = &PL_sv_no;
5756             if (!sv_yes_mark)
5757                 sv_yes_mark = &PL_sv_yes;
5758         } else {
5759             if (!sv_commit)
5760                 sv_commit = &PL_sv_yes;
5761             sv_yes_mark = &PL_sv_no;
5762         }
5763         sv_setsv(sv_err, sv_commit);
5764         sv_setsv(sv_mrk, sv_yes_mark);
5765     }
5766
5767     /* clean up; in particular, free all slabs above current one */
5768     LEAVE_SCOPE(oldsave);
5769
5770     return result;
5771 }
5772
5773 /*
5774  - regrepeat - repeatedly match something simple, report how many
5775  */
5776 /*
5777  * [This routine now assumes that it will only match on things of length 1.
5778  * That was true before, but now we assume scan - reginput is the count,
5779  * rather than incrementing count on every character.  [Er, except utf8.]]
5780  */
5781 STATIC I32
5782 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5783 {
5784     dVAR;
5785     register char *scan;
5786     register I32 c;
5787     register char *loceol = PL_regeol;
5788     register I32 hardcount = 0;
5789     register bool utf8_target = PL_reg_match_utf8;
5790 #ifndef DEBUGGING
5791     PERL_UNUSED_ARG(depth);
5792 #endif
5793
5794     PERL_ARGS_ASSERT_REGREPEAT;
5795
5796     scan = PL_reginput;
5797     if (max == REG_INFTY)
5798         max = I32_MAX;
5799     else if (max < loceol - scan)
5800         loceol = scan + max;
5801     switch (OP(p)) {
5802     case REG_ANY:
5803         if (utf8_target) {
5804             loceol = PL_regeol;
5805             while (scan < loceol && hardcount < max && *scan != '\n') {
5806                 scan += UTF8SKIP(scan);
5807                 hardcount++;
5808             }
5809         } else {
5810             while (scan < loceol && *scan != '\n')
5811                 scan++;
5812         }
5813         break;
5814     case SANY:
5815         if (utf8_target) {
5816             loceol = PL_regeol;
5817             while (scan < loceol && hardcount < max) {
5818                 scan += UTF8SKIP(scan);
5819                 hardcount++;
5820             }
5821         }
5822         else
5823             scan = loceol;
5824         break;
5825     case CANY:
5826         scan = loceol;
5827         break;
5828     case EXACT:
5829         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5830          * means they match only characters in the string that can be expressed
5831          * as a single byte.  For non-utf8 strings, that means a simple match.
5832          * For utf8 strings, the character matched must be an invariant, or
5833          * downgradable to a single byte.  The pattern's utf8ness is
5834          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5835          * it is, it's an invariant */
5836
5837         c = (U8)*STRING(p);
5838         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5839
5840         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5841             while (scan < loceol && UCHARAT(scan) == c) {
5842                 scan++;
5843             }
5844         }
5845         else {
5846
5847             /* Here, the string is utf8, and the pattern char is different
5848              * in utf8 than not, so can't compare them directly.  Outside the
5849              * loop, find find the two utf8 bytes that represent c, and then
5850              * look for those in sequence in the utf8 string */
5851             U8 high = UTF8_TWO_BYTE_HI(c);
5852             U8 low = UTF8_TWO_BYTE_LO(c);
5853             loceol = PL_regeol;
5854
5855             while (hardcount < max
5856                     && scan + 1 < loceol
5857                     && UCHARAT(scan) == high
5858                     && UCHARAT(scan + 1) == low)
5859             {
5860                 scan += 2;
5861                 hardcount++;
5862             }
5863         }
5864         break;
5865     case EXACTFL:
5866         PL_reg_flags |= RF_tainted;
5867         /* FALL THROUGH */
5868     case EXACTF:
5869     case EXACTFU:
5870
5871         /* The comments for the EXACT case above apply as well to these fold
5872          * ones */
5873
5874         c = (U8)*STRING(p);
5875         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5876
5877         if (utf8_target) { /* Use full Unicode fold matching */
5878
5879             /* For the EXACTFL case, It doesn't really make sense to compare
5880              * locale and utf8, but it is best we can do.  The documents warn
5881              * against mixing them */
5882
5883             char *tmpeol = loceol;
5884             while (hardcount < max
5885                     && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5886                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
5887             {
5888                 scan = tmpeol;
5889                 tmpeol = loceol;
5890                 hardcount++;
5891             }
5892
5893             /* XXX Note that the above handles properly the German sharp s in
5894              * the pattern matching ss in the string.  But it doesn't handle
5895              * properly cases where the string contains say 'LIGATURE ff' and
5896              * the pattern is 'f+'.  This would require, say, a new function or
5897              * revised interface to foldEQ_utf8(), in which the maximum number
5898              * of characters to match could be passed and it would return how
5899              * many actually did.  This is just one of many cases where
5900              * multi-char folds don't work properly, and so the fix is being
5901              * deferred */
5902         }
5903         else {
5904             U8 folded;
5905
5906             /* Here, the string isn't utf8 and c is a single byte; and either
5907              * the pattern isn't utf8 or c is an invariant, so its utf8ness
5908              * doesn't affect c.  Can just do simple comparisons for exact or
5909              * fold matching. */
5910             switch (OP(p)) {
5911                 case EXACTF: folded = PL_fold[c]; break;
5912                 case EXACTFU: folded = PL_fold_latin1[c]; break;
5913                 case EXACTFL: folded = PL_fold_locale[c]; break;
5914                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5915             }
5916             while (scan < loceol &&
5917                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
5918             {
5919                 scan++;
5920             }
5921         }
5922         break;
5923     case ANYOFV:
5924     case ANYOF:
5925         if (utf8_target) {
5926             loceol = PL_regeol;
5927             while (hardcount < max && scan < loceol &&
5928                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5929                 scan += UTF8SKIP(scan);
5930                 hardcount++;
5931             }
5932         } else {
5933             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5934                 scan++;
5935         }
5936         break;
5937     case ALNUM:
5938         if (utf8_target) {
5939             loceol = PL_regeol;
5940             LOAD_UTF8_CHARCLASS_ALNUM();
5941             while (hardcount < max && scan < loceol &&
5942                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5943             {
5944                 scan += UTF8SKIP(scan);
5945                 hardcount++;
5946             }
5947         } else if (FLAGS(p) & USE_UNI) {
5948             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5949                 scan++;
5950             }
5951         } else {
5952             while (scan < loceol && isALNUM((U8) *scan)) {
5953                 scan++;
5954             }
5955         }
5956         break;
5957     case ALNUML:
5958         PL_reg_flags |= RF_tainted;
5959         if (utf8_target) {
5960             loceol = PL_regeol;
5961             while (hardcount < max && scan < loceol &&
5962                    isALNUM_LC_utf8((U8*)scan)) {
5963                 scan += UTF8SKIP(scan);
5964                 hardcount++;
5965             }
5966         } else {
5967             while (scan < loceol && isALNUM_LC(*scan))
5968                 scan++;
5969         }
5970         break;
5971     case NALNUM:
5972         if (utf8_target) {
5973             loceol = PL_regeol;
5974             LOAD_UTF8_CHARCLASS_ALNUM();
5975             while (hardcount < max && scan < loceol &&
5976                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5977             {
5978                 scan += UTF8SKIP(scan);
5979                 hardcount++;
5980             }
5981         } else if (FLAGS(p) & USE_UNI) {
5982             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5983                 scan++;
5984             }
5985         } else {
5986             while (scan < loceol && ! isALNUM((U8) *scan)) {
5987                 scan++;
5988             }
5989         }
5990         break;
5991     case NALNUML:
5992         PL_reg_flags |= RF_tainted;
5993         if (utf8_target) {
5994             loceol = PL_regeol;
5995             while (hardcount < max && scan < loceol &&
5996                    !isALNUM_LC_utf8((U8*)scan)) {
5997                 scan += UTF8SKIP(scan);
5998                 hardcount++;
5999             }
6000         } else {
6001             while (scan < loceol && !isALNUM_LC(*scan))
6002                 scan++;
6003         }
6004         break;
6005     case SPACE:
6006         if (utf8_target) {
6007             loceol = PL_regeol;
6008             LOAD_UTF8_CHARCLASS_SPACE();
6009             while (hardcount < max && scan < loceol &&
6010                    (*scan == ' ' ||
6011                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6012             {
6013                 scan += UTF8SKIP(scan);
6014                 hardcount++;
6015             }
6016         } else if (FLAGS(p) & USE_UNI) {
6017             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6018                 scan++;
6019             }
6020         } else {
6021             while (scan < loceol && isSPACE((U8) *scan))
6022                 scan++;
6023         }
6024         break;
6025     case SPACEL:
6026         PL_reg_flags |= RF_tainted;
6027         if (utf8_target) {
6028             loceol = PL_regeol;
6029             while (hardcount < max && scan < loceol &&
6030                    isSPACE_LC_utf8((U8*)scan)) {
6031                 scan += UTF8SKIP(scan);
6032                 hardcount++;
6033             }
6034         } else {
6035             while (scan < loceol && isSPACE_LC(*scan))
6036                 scan++;
6037         }
6038         break;
6039     case NSPACE:
6040         if (utf8_target) {
6041             loceol = PL_regeol;
6042             LOAD_UTF8_CHARCLASS_SPACE();
6043             while (hardcount < max && scan < loceol &&
6044                    !(*scan == ' ' ||
6045                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6046             {
6047                 scan += UTF8SKIP(scan);
6048                 hardcount++;
6049             }
6050         } else if (FLAGS(p) & USE_UNI) {
6051             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6052                 scan++;
6053             }
6054         } else {
6055             while (scan < loceol && ! isSPACE((U8) *scan)) {
6056                 scan++;
6057             }
6058         }
6059         break;
6060     case NSPACEL:
6061         PL_reg_flags |= RF_tainted;
6062         if (utf8_target) {
6063             loceol = PL_regeol;
6064             while (hardcount < max && scan < loceol &&
6065                    !isSPACE_LC_utf8((U8*)scan)) {
6066                 scan += UTF8SKIP(scan);
6067                 hardcount++;
6068             }
6069         } else {
6070             while (scan < loceol && !isSPACE_LC(*scan))
6071                 scan++;
6072         }
6073         break;
6074     case DIGIT:
6075         if (utf8_target) {
6076             loceol = PL_regeol;
6077             LOAD_UTF8_CHARCLASS_DIGIT();
6078             while (hardcount < max && scan < loceol &&
6079                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6080                 scan += UTF8SKIP(scan);
6081                 hardcount++;
6082             }
6083         } else {
6084             while (scan < loceol && isDIGIT(*scan))
6085                 scan++;
6086         }
6087         break;
6088     case DIGITL:
6089         PL_reg_flags |= RF_tainted;
6090         if (utf8_target) {
6091             loceol = PL_regeol;
6092             while (hardcount < max && scan < loceol &&
6093                    isDIGIT_LC_utf8((U8*)scan)) {
6094                 scan += UTF8SKIP(scan);
6095                 hardcount++;
6096             }
6097         } else {
6098             while (scan < loceol && isDIGIT_LC(*scan))
6099                 scan++;
6100         }
6101         break;
6102     case NDIGIT:
6103         if (utf8_target) {
6104             loceol = PL_regeol;
6105             LOAD_UTF8_CHARCLASS_DIGIT();
6106             while (hardcount < max && scan < loceol &&
6107                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6108                 scan += UTF8SKIP(scan);
6109                 hardcount++;
6110             }
6111         } else {
6112             while (scan < loceol && !isDIGIT(*scan))
6113                 scan++;
6114         }
6115     case NDIGITL:
6116         PL_reg_flags |= RF_tainted;
6117         if (utf8_target) {
6118             loceol = PL_regeol;
6119             while (hardcount < max && scan < loceol &&
6120                    !isDIGIT_LC_utf8((U8*)scan)) {
6121                 scan += UTF8SKIP(scan);
6122                 hardcount++;
6123             }
6124         } else {
6125             while (scan < loceol && !isDIGIT_LC(*scan))
6126                 scan++;
6127         }
6128         break;
6129     case LNBREAK:
6130         if (utf8_target) {
6131             loceol = PL_regeol;
6132             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6133                 scan += c;
6134                 hardcount++;
6135             }
6136         } else {
6137             /*
6138               LNBREAK can match two latin chars, which is ok,
6139               because we have a null terminated string, but we
6140               have to use hardcount in this situation
6141             */
6142             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6143                 scan+=c;
6144                 hardcount++;
6145             }
6146         }
6147         break;
6148     case HORIZWS:
6149         if (utf8_target) {
6150             loceol = PL_regeol;
6151             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6152                 scan += c;
6153                 hardcount++;
6154             }
6155         } else {
6156             while (scan < loceol && is_HORIZWS_latin1(scan))
6157                 scan++;
6158         }
6159         break;
6160     case NHORIZWS:
6161         if (utf8_target) {
6162             loceol = PL_regeol;
6163             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6164                 scan += UTF8SKIP(scan);
6165                 hardcount++;
6166             }
6167         } else {
6168             while (scan < loceol && !is_HORIZWS_latin1(scan))
6169                 scan++;
6170
6171         }
6172         break;
6173     case VERTWS:
6174         if (utf8_target) {
6175             loceol = PL_regeol;
6176             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6177                 scan += c;
6178                 hardcount++;
6179             }
6180         } else {
6181             while (scan < loceol && is_VERTWS_latin1(scan))
6182                 scan++;
6183
6184         }
6185         break;
6186     case NVERTWS:
6187         if (utf8_target) {
6188             loceol = PL_regeol;
6189             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6190                 scan += UTF8SKIP(scan);
6191                 hardcount++;
6192             }
6193         } else {
6194             while (scan < loceol && !is_VERTWS_latin1(scan))
6195                 scan++;
6196
6197         }
6198         break;
6199
6200     default:            /* Called on something of 0 width. */
6201         break;          /* So match right here or not at all. */
6202     }
6203
6204     if (hardcount)
6205         c = hardcount;
6206     else
6207         c = scan - PL_reginput;
6208     PL_reginput = scan;
6209
6210     DEBUG_r({
6211         GET_RE_DEBUG_FLAGS_DECL;
6212         DEBUG_EXECUTE_r({
6213             SV * const prop = sv_newmortal();
6214             regprop(prog, prop, p);
6215             PerlIO_printf(Perl_debug_log,
6216                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6217                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6218         });
6219     });
6220
6221     return(c);
6222 }
6223
6224
6225 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6226 /*
6227 - regclass_swash - prepare the utf8 swash
6228 */
6229
6230 SV *
6231 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6232 {
6233     dVAR;
6234     SV *sw  = NULL;
6235     SV *si  = NULL;
6236     SV *alt = NULL;
6237     RXi_GET_DECL(prog,progi);
6238     const struct reg_data * const data = prog ? progi->data : NULL;
6239
6240     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6241
6242     if (data && data->count) {
6243         const U32 n = ARG(node);
6244
6245         if (data->what[n] == 's') {
6246             SV * const rv = MUTABLE_SV(data->data[n]);
6247             AV * const av = MUTABLE_AV(SvRV(rv));
6248             SV **const ary = AvARRAY(av);
6249             SV **a, **b;
6250
6251             /* See the end of regcomp.c:S_regclass() for
6252              * documentation of these array elements. */
6253
6254             si = *ary;
6255             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6256             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6257
6258             if (a)
6259                 sw = *a;
6260             else if (si && doinit) {
6261                 sw = swash_init("utf8", "", si, 1, 0);
6262                 (void)av_store(av, 1, sw);
6263             }
6264             if (b)
6265                 alt = *b;
6266         }
6267     }
6268
6269     if (listsvp)
6270         *listsvp = si;
6271     if (altsvp)
6272         *altsvp  = alt;
6273
6274     return sw;
6275 }
6276 #endif
6277
6278 /*
6279  - reginclass - determine if a character falls into a character class
6280
6281   n is the ANYOF regnode
6282   p is the target string
6283   lenp is pointer to the maximum number of bytes of how far to go in p
6284     (This is assumed wthout checking to always be at least the current
6285     character's size)
6286   utf8_target tells whether p is in UTF-8.
6287
6288   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6289   from a successful match, the value it points to will be updated to how many
6290   bytes in p were matched.  If there was no match, the value is undefined,
6291   possibly changed from the input.
6292
6293   Note that this can be a synthetic start class, a combination of various
6294   nodes, so things you think might be mutually exclusive, such as locale,
6295   aren't.  It can match both locale and non-locale
6296
6297  */
6298
6299 STATIC bool
6300 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6301 {
6302     dVAR;
6303     const char flags = ANYOF_FLAGS(n);
6304     bool match = FALSE;
6305     UV c = *p;
6306     STRLEN c_len = 0;
6307     STRLEN maxlen;
6308
6309     PERL_ARGS_ASSERT_REGINCLASS;
6310
6311     /* If c is not already the code point, get it */
6312     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6313         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6314                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6315                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6316                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6317                  * UTF8_ALLOW_FFFF */
6318         if (c_len == (STRLEN)-1)
6319             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6320     }
6321     else {
6322         c_len = 1;
6323     }
6324
6325     /* Use passed in max length, or one character if none passed in or less
6326      * than one character.  And assume will match just one character.  This is
6327      * overwritten later if matched more. */
6328     if (lenp) {
6329         maxlen = (*lenp > c_len) ? *lenp : c_len;
6330         *lenp = c_len;
6331
6332     }
6333     else {
6334         maxlen = c_len;
6335     }
6336
6337     /* If this character is potentially in the bitmap, check it */
6338     if (c < 256) {
6339         if (ANYOF_BITMAP_TEST(n, c))
6340             match = TRUE;
6341
6342         else if (flags & ANYOF_LOCALE) {
6343             PL_reg_flags |= RF_tainted;
6344
6345             if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6346                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6347             {
6348                 match = TRUE;
6349             }
6350             else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6351                      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6352                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6353                       (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6354                       (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6355                       (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6356                       (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6357                       (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6358                       (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6359                       (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6360                       (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6361                       (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6362                       (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6363                       (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6364                       (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6365                       (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6366                       (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6367                       (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6368                       (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6369                       (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6370                       (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6371                       (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6372                       (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6373                       (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6374                       (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6375                       (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6376                       (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6377                       (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6378                       (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6379                       (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6380                       (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6381                      ) /* How's that for a conditional? */
6382             ) {
6383                 match = TRUE;
6384             }
6385         }
6386     }
6387
6388     /* If the bitmap didn't (or couldn't) match, and something outside the
6389      * bitmap could match, try that */
6390     if (!match) {
6391         if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
6392             if (c >= 256
6393                 || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a
6394                                                           non-Latin1 fold
6395                                                           should match */
6396                     && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
6397             {
6398                 match = TRUE;
6399             }
6400         }
6401         if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6402                        || (utf8_target && flags & ANYOF_UTF8)))
6403         {
6404             AV *av;
6405             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6406
6407             if (sw) {
6408                 U8 * utf8_p;
6409                 if (utf8_target) {
6410                     utf8_p = (U8 *) p;
6411                 } else {
6412
6413                     /* Not utf8.  Convert as much of the string as available up
6414                      * to the limit of how far the (single) character in the
6415                      * pattern can possibly match (no need to go further).  If
6416                      * the node is a straight ANYOF or not folding, it can't
6417                      * match more than one.  Otherwise, It can match up to how
6418                      * far a single char can fold to.  Since not utf8, each
6419                      * character is a single byte, so the max it can be in
6420                      * bytes is the same as the max it can be in characters */
6421                     STRLEN len = (OP(n) == ANYOF
6422                                   || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6423                                   ? 1
6424                                   : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6425                                     ? maxlen
6426                                     : UTF8_MAX_FOLD_CHAR_EXPAND;
6427                     utf8_p = bytes_to_utf8(p, &len);
6428                 }
6429
6430                 if (swash_fetch(sw, utf8_p, 1)) /* See if in the swash */
6431                     match = TRUE;
6432                 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6433
6434                     /* Here, we need to test if the fold of the target string
6435                      * matches.  In the case of a multi-char fold that is
6436                      * caught by regcomp.c, it has stored all such folds into
6437                      * 'av'; we linearly check to see if any match the target
6438                      * string (folded).   We know that the originals were each
6439                      * one character, but we don't currently know how many
6440                      * characters/bytes each folded to, except we do know that
6441                      * there are small limits imposed by Unicode.  XXX A
6442                      * performance enhancement would be to have regcomp.c store
6443                      * the max number of chars/bytes that are in an av entry,
6444                      * as, say the 0th element.  Further down, if there isn't a
6445                      * match in the av, we will check if there is another
6446                      * fold-type match.  For that, we also need the fold, but
6447                      * only the first character.  No sense in folding it twice,
6448                      * so we do it here, even if there isn't any multi-char
6449                      * fold, so we always fold at least the first character.
6450                      * If the node is a straight ANYOF node, or there is only
6451                      * one character available in the string, or if there isn't
6452                      * any av, that's all we have to fold.  In the case of a
6453                      * multi-char fold, we do have guarantees in Unicode that
6454                      * it can only expand up to so many characters and so many
6455                      * bytes.  We keep track so don't exceed either.
6456                      *
6457                      * If there is a match, we will need to advance (if lenp is
6458                      * specified) the match pointer in the target string.  But
6459                      * what we are comparing here isn't that string directly,
6460                      * but its fold, whose length may differ from the original.
6461                      * As we go along in constructing the fold, therefore, we
6462                      * create a map so that we know how many bytes in the
6463                      * source to advance given that we have matched a certain
6464                      * number of bytes in the fold.  This map is stored in
6465                      * 'map_fold_len_back'.  The first character in the fold
6466                      * has array element 1 contain the number of bytes in the
6467                      * source that folded to it; the 2nd is the cumulative
6468                      * number to match it; ... */
6469                     U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 };
6470                     U8 folded[UTF8_MAXBYTES_CASE+1];
6471                     STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6472                     STRLEN foldlen_for_av; /* num bytes in fold of all chars */
6473
6474                     if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6475
6476                         /* Here, only need to fold the first char of the target
6477                          * string */
6478                         to_utf8_fold(utf8_p, folded, &foldlen);
6479                         foldlen_for_av = foldlen;
6480                         map_fold_len_back[1] = UTF8SKIP(utf8_p);
6481                     }
6482                     else {
6483
6484                         /* Here, need to fold more than the first char.  Do so
6485                          * up to the limits */
6486                         UV which_char = 0;
6487                         U8* source_ptr = utf8_p;    /* The source for the fold
6488                                                        is the regex target
6489                                                        string */
6490                         U8* folded_ptr = folded;
6491                         U8* e = utf8_p + maxlen;    /* Can't go beyond last
6492                                                        available byte in the
6493                                                        target string */
6494                         while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND
6495                                && source_ptr < e)
6496                         {
6497
6498                             /* Fold the next character */
6499                             U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6500                             STRLEN this_char_foldlen;
6501                             to_utf8_fold(source_ptr,
6502                                          this_char_folded,
6503                                          &this_char_foldlen);
6504
6505                             /* Bail if it would exceed the byte limit for
6506                              * folding a single char. */
6507                             if (this_char_foldlen + folded_ptr - folded >
6508                                                             UTF8_MAXBYTES_CASE)
6509                             {
6510                                 break;
6511                             }
6512
6513                             /* Save the first character's folded length, in
6514                              * case we have to use it later */
6515                             if (! foldlen) {
6516                                 foldlen = this_char_foldlen;
6517                             }
6518
6519                             /* Here, add the fold of this character */
6520                             Copy(this_char_folded,
6521                                  folded_ptr,
6522                                  this_char_foldlen,
6523                                  U8);
6524                             which_char++;
6525                             map_fold_len_back[which_char] =
6526                                 map_fold_len_back[which_char - 1]
6527                                 + UTF8SKIP(source_ptr);
6528                             folded_ptr += this_char_foldlen;
6529                             source_ptr += UTF8SKIP(source_ptr);
6530                         }
6531                         *folded_ptr = '\0';
6532                         foldlen_for_av = folded_ptr - folded;
6533                     }
6534
6535
6536                     /* Do the linear search to see if the fold is in the list
6537                      * of multi-char folds.  (Useless to look if won't be able
6538                      * to store that it is a multi-char fold in *lenp) */
6539                     if (lenp && av) {
6540                         I32 i;
6541                         for (i = 0; i <= av_len(av); i++) {
6542                             SV* const sv = *av_fetch(av, i, FALSE);
6543                             STRLEN len;
6544                             const char * const s = SvPV_const(sv, len);
6545                             if (len <= foldlen_for_av && memEQ(s,
6546                                                                (char*)folded,
6547                                                                len))
6548                             {
6549
6550                                 /* Advance the target string ptr to account for
6551                                  * this fold, but have to translate from the
6552                                  * folded length to the corresponding source
6553                                  * length.  The array is indexed by how many
6554                                  * characters in the match */
6555                                 *lenp = map_fold_len_back[
6556                                         utf8_length(folded, folded + len)];
6557                                 match = TRUE;
6558                                 break;
6559                             }
6560                         }
6561                     }
6562                     if (!match) { /* See if the folded version matches */
6563                         SV** listp;
6564
6565                         /* Consider "k" =~ /[K]/i.  The line above would have
6566                          * just folded the 'k' to itself, and that isn't going
6567                          * to match 'K'.  So we look through the closure of
6568                          * everything that folds to 'k'.  That will find the
6569                          * 'K'.  Initialize the list, if necessary */
6570                         if (! PL_utf8_foldclosures) {
6571
6572                             /* If the folds haven't been read in, call a fold
6573                              * function to force that */
6574                             if (! PL_utf8_tofold) {
6575                                 U8 dummy[UTF8_MAXBYTES+1];
6576                                 STRLEN dummy_len;
6577                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6578                             }
6579                             PL_utf8_foldclosures =
6580                                   _swash_inversion_hash(PL_utf8_tofold);
6581                         }
6582
6583                         /* The data structure is a hash with the keys every
6584                          * character that is folded to, like 'k', and the
6585                          * values each an array of everything that folds to its
6586                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6587                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6588                                       (char *) folded, foldlen, FALSE)))
6589                         {
6590                             AV* list = (AV*) *listp;
6591                             IV i;
6592                             for (i = 0; i <= av_len(list); i++) {
6593                                 SV** try_p = av_fetch(list, i, FALSE);
6594                                 char* try_c;
6595                                 if (try_p == NULL) {
6596                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6597                                 }
6598                                 /* Don't have to worry about embedded nulls
6599                                  * since NULL isn't folded or foldable */
6600                                 try_c = SvPVX(*try_p);
6601
6602                                 /* The fold in a few cases  of an above Latin1
6603                                  * char is in the Latin1 range, and hence may
6604                                  * be in the bitmap */
6605                                 if (UTF8_IS_INVARIANT(*try_c)
6606                                     && ANYOF_BITMAP_TEST(n,
6607                                                     UNI_TO_NATIVE(*try_c)))
6608                                 {
6609                                     match = TRUE;
6610                                     break;
6611                                 }
6612                                 else if
6613                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6614                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6615                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6616                                                                     try_c[1]))))
6617                                 {
6618                                    /* Since the fold comes from internally
6619                                     * generated data, we can safely assume it
6620                                     * is valid utf8 in the test above */
6621                                     match = TRUE;
6622                                     break;
6623                                 } else if (swash_fetch(sw, (U8*) try_c, 1)) {
6624                                     match = TRUE;
6625                                     break;
6626                                 }
6627                             }
6628                         }
6629                     }
6630                 }
6631
6632                 /* If we allocated a string above, free it */
6633                 if (! utf8_target) Safefree(utf8_p);
6634             }
6635         }
6636     }
6637
6638     return (flags & ANYOF_INVERT) ? !match : match;
6639 }
6640
6641 STATIC U8 *
6642 S_reghop3(U8 *s, I32 off, const U8* lim)
6643 {
6644     dVAR;
6645
6646     PERL_ARGS_ASSERT_REGHOP3;
6647
6648     if (off >= 0) {
6649         while (off-- && s < lim) {
6650             /* XXX could check well-formedness here */
6651             s += UTF8SKIP(s);
6652         }
6653     }
6654     else {
6655         while (off++ && s > lim) {
6656             s--;
6657             if (UTF8_IS_CONTINUED(*s)) {
6658                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6659                     s--;
6660             }
6661             /* XXX could check well-formedness here */
6662         }
6663     }
6664     return s;
6665 }
6666
6667 #ifdef XXX_dmq
6668 /* there are a bunch of places where we use two reghop3's that should
6669    be replaced with this routine. but since thats not done yet
6670    we ifdef it out - dmq
6671 */
6672 STATIC U8 *
6673 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6674 {
6675     dVAR;
6676
6677     PERL_ARGS_ASSERT_REGHOP4;
6678
6679     if (off >= 0) {
6680         while (off-- && s < rlim) {
6681             /* XXX could check well-formedness here */
6682             s += UTF8SKIP(s);
6683         }
6684     }
6685     else {
6686         while (off++ && s > llim) {
6687             s--;
6688             if (UTF8_IS_CONTINUED(*s)) {
6689                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6690                     s--;
6691             }
6692             /* XXX could check well-formedness here */
6693         }
6694     }
6695     return s;
6696 }
6697 #endif
6698
6699 STATIC U8 *
6700 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6701 {
6702     dVAR;
6703
6704     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6705
6706     if (off >= 0) {
6707         while (off-- && s < lim) {
6708             /* XXX could check well-formedness here */
6709             s += UTF8SKIP(s);
6710         }
6711         if (off >= 0)
6712             return NULL;
6713     }
6714     else {
6715         while (off++ && s > lim) {
6716             s--;
6717             if (UTF8_IS_CONTINUED(*s)) {
6718                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6719                     s--;
6720             }
6721             /* XXX could check well-formedness here */
6722         }
6723         if (off <= 0)
6724             return NULL;
6725     }
6726     return s;
6727 }
6728
6729 static void
6730 restore_pos(pTHX_ void *arg)
6731 {
6732     dVAR;
6733     regexp * const rex = (regexp *)arg;
6734     if (PL_reg_eval_set) {
6735         if (PL_reg_oldsaved) {
6736             rex->subbeg = PL_reg_oldsaved;
6737             rex->sublen = PL_reg_oldsavedlen;
6738 #ifdef PERL_OLD_COPY_ON_WRITE
6739             rex->saved_copy = PL_nrs;
6740 #endif
6741             RXp_MATCH_COPIED_on(rex);
6742         }
6743         PL_reg_magic->mg_len = PL_reg_oldpos;
6744         PL_reg_eval_set = 0;
6745         PL_curpm = PL_reg_oldcurpm;
6746     }
6747 }
6748
6749 STATIC void
6750 S_to_utf8_substr(pTHX_ register regexp *prog)
6751 {
6752     int i = 1;
6753
6754     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6755
6756     do {
6757         if (prog->substrs->data[i].substr
6758             && !prog->substrs->data[i].utf8_substr) {
6759             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6760             prog->substrs->data[i].utf8_substr = sv;
6761             sv_utf8_upgrade(sv);
6762             if (SvVALID(prog->substrs->data[i].substr)) {
6763                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6764                 if (flags & FBMcf_TAIL) {
6765                     /* Trim the trailing \n that fbm_compile added last
6766                        time.  */
6767                     SvCUR_set(sv, SvCUR(sv) - 1);
6768                     /* Whilst this makes the SV technically "invalid" (as its
6769                        buffer is no longer followed by "\0") when fbm_compile()
6770                        adds the "\n" back, a "\0" is restored.  */
6771                 }
6772                 fbm_compile(sv, flags);
6773             }
6774             if (prog->substrs->data[i].substr == prog->check_substr)
6775                 prog->check_utf8 = sv;
6776         }
6777     } while (i--);
6778 }
6779
6780 STATIC void
6781 S_to_byte_substr(pTHX_ register regexp *prog)
6782 {
6783     dVAR;
6784     int i = 1;
6785
6786     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6787
6788     do {
6789         if (prog->substrs->data[i].utf8_substr
6790             && !prog->substrs->data[i].substr) {
6791             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6792             if (sv_utf8_downgrade(sv, TRUE)) {
6793                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6794                     const U8 flags
6795                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6796                     if (flags & FBMcf_TAIL) {
6797                         /* Trim the trailing \n that fbm_compile added last
6798                            time.  */
6799                         SvCUR_set(sv, SvCUR(sv) - 1);
6800                     }
6801                     fbm_compile(sv, flags);
6802                 }
6803             } else {
6804                 SvREFCNT_dec(sv);
6805                 sv = &PL_sv_undef;
6806             }
6807             prog->substrs->data[i].substr = sv;
6808             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6809                 prog->check_substr = sv;
6810         }
6811     } while (i--);
6812 }
6813
6814 /*
6815  * Local variables:
6816  * c-indentation-style: bsd
6817  * c-basic-offset: 4
6818  * indent-tabs-mode: t
6819  * End:
6820  *
6821  * ex: set ts=8 sts=4 sw=4 noet:
6822  */